Музыкальный генератор на ИИ

Создано: 28 сентября 2024 г.

Отвечено с использованием OpenAI 01 Preview от OpenAI01

Вопрос

напиши полный рабочий код музыкального генератора на искусственном интеллекте. я хочу получить самое лучшее приложение по генерации музыки какое только возможно. примени все необходимые технологиии чтобы получить самый лучший с музыкальной точки зрения результат на выходе. без использования предобученных моделей . улучши мое приложение. я хочу чтобы модель обучалась максимально быстро и выдавала максимально качественные результаты. исправь ошибку not enought data to create sequences import. please use longer audio data. музыкальные данные должны быть видны в контекстном окне. для обучения могут использоваться любые wav файлы любого качества и без разницы стререо , моно или пятиканальные. на выходе мы должны получать стерео файл. напиши полный рабочий код приложения import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
import os
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
import threading
import librosa
from sklearn.decomposition import NMF
import math

# Check CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class GELUActivation(nn.Module):
def forward(self, x):
return 0.5 * x * (1 + torch.tanh(
math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))
))


class PositionalEncoding(nn.Module):
def __init__(self, d_model, dropout=0.1, max_len=5000):
super(PositionalEncoding, self).__init__()
self.dropout = nn.Dropout(p=dropout)

pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(
0, d_model, 2).float() * (-math.log(10000.0) / d_model)
)
pe[:, 0::2] = torch.sin(position * div_term) # even indices
pe[:, 1::2] = torch.cos(position * div_term) # odd indices
pe = pe.unsqueeze(0) # shape (1, max_len, d_model)
self.register_buffer('pe', pe)

def forward(self, x):
x = x + self.pe[:, :x.size(1)]
return self.dropout(x)


class ImprovedTransformerEncoderLayer(nn.Module):
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
super(ImprovedTransformerEncoderLayer, self).__init__()
self.self_attn = nn.MultiheadAttention(
d_model, nhead, dropout=dropout, batch_first=True
)
self.linear1 = nn.Linear(d_model, dim_feedforward)
self.dropout = nn.Dropout(dropout)
self.linear2 = nn.Linear(dim_feedforward, d_model)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
self.activation = GELUActivation()

def forward(self, src, src_mask=None, src_key_padding_mask=None):
src2 = self.self_attn(
src, src, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
)[0]
src = src + self.dropout1(src2)
src = self.norm1(src)
src2 = self.linear2(
self.dropout(self.activation(self.linear1(src)))
)
src = src + self.dropout2(src2)
src = self.norm2(src)
return src


class ImprovedTransformerModel(nn.Module):
def __init__(
self, input_size, output_size, d_model=512, nhead=8, num_layers=6, dropout=0.1
):
super(ImprovedTransformerModel, self).__init__()
self.embedding = nn.Linear(input_size, d_model)
self.pos_encoder = PositionalEncoding(d_model, dropout)
encoder_layer = ImprovedTransformerEncoderLayer(
d_model, nhead, dim_feedforward=2048, dropout=dropout
)
self.transformer_encoder = nn.TransformerEncoder(
encoder_layer, num_layers
)
self.decoder = nn.Linear(d_model, output_size)
self.init_weights()

def init_weights(self):
initrange = 0.1
nn.init.uniform_(self.embedding.weight, -initrange, initrange)
nn.init.zeros_(self.embedding.bias)
nn.init.zeros_(self.decoder.bias)
nn.init.uniform_(self.decoder.weight, -initrange, initrange)

def forward(self, src):
src = self.embedding(src)
src = self.pos_encoder(src)
output = self.transformer_encoder(src)
output = self.decoder(output)
return output


def load_audio(file_path):
waveform, sample_rate = torchaudio.load(file_path)
return waveform, sample_rate


def extract_features(y, sr):
# Extract various features
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
chroma = librosa.feature.chroma_stft(y=y, sr=sr)
spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)

# Concatenate all features
features = np.concatenate(
[mfcc, chroma, spectral_centroid, spectral_contrast, spectral_rolloff], axis=0
)

return features


def preprocess_audio(waveform, sample_rate, target_sample_rate=22050, chunk_size=256):
if sample_rate != target_sample_rate:
waveform = torchaudio.functional.resample(
waveform, sample_rate, target_sample_rate
)

# Convert to mono if stereo
if waveform.shape[0] > 1:
waveform = torch.mean(waveform, dim=0, keepdim=True)

# Convert to numpy array
y = waveform.numpy()[0]

# Extract features
features = extract_features(y, target_sample_rate)

# Convert to tensor and normalize
features = torch.FloatTensor(features)
features = features / torch.max(torch.abs(features))

# Ensure the correct shape (time, features)
features = features.t()

# Pad or trim to chunk_size
if features.shape[0] < chunk_size:
padding = torch.zeros(
chunk_size - features.shape[0], features.shape[1]
)
features = torch.cat([features, padding], dim=0)
elif features.shape[0] > chunk_size:
features = features[:chunk_size, :]

return features


def train_model(model, train_loader, val_loader, epochs=100, lr=0.001):
criterion = nn.MSELoss()
optimizer = optim.AdamW(
model.parameters(), lr=lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.01
)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

best_val_loss = float('inf')
best_model = None
train_losses = []
val_losses = []

for epoch in range(epochs):
model.train()
total_loss = 0
progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
for batch in progress_bar:
optimizer.zero_grad()
inputs, targets = batch
inputs, targets = inputs.to(device), targets.to(device)
outputs = model(inputs)
loss = criterion(outputs, targets)
loss.backward()
optimizer.step()
total_loss += loss.item()
progress_bar.set_postfix(
{'training_loss': f'{total_loss/len(train_loader):.4f}'}
)

avg_train_loss = total_loss / len(train_loader)
train_losses.append(avg_train_loss)

model.eval()
val_loss = 0
with torch.no_grad():
for batch in val_loader:
inputs, targets = batch
inputs, targets = inputs.to(device), targets.to(device)
outputs = model(inputs)
loss = criterion(outputs, targets)
val_loss += loss.item()

avg_val_loss = val_loss / len(val_loader)
val_losses.append(avg_val_loss)

print(
f"Epoch {
epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}"
)

scheduler.step()

if avg_val_loss < best_val_loss:
best_val_loss = avg_val_loss
best_model = model.state_dict()

model.load_state_dict(best_model)
return model, train_losses, val_losses


def generate_music(model, seed, length=1000, temperature=1.0):
model.eval()
with torch.no_grad():
generated = seed.unsqueeze(0).to(device)
for _ in tqdm(range(length), desc="Generating"):
input_seq = generated[:, -256:, :]
output = model(input_seq)
next_feature = output[:, -1, :] / temperature
generated = torch.cat(
[generated, next_feature.unsqueeze(1)], dim=1)
return generated.squeeze(0).cpu()


def save_audio(waveform, sample_rate, file_path):
torchaudio.save(file_path, waveform.unsqueeze(0), sample_rate)


def save_model(model, path):
torch.save(model.state_dict(), path)


def load_model(model, path):
state_dict = torch.load(path, map_location=device)
model.load_state_dict(state_dict)
return model


def harmonic_analysis(waveform, sample_rate):
y = librosa.util.normalize(waveform.numpy()[0])
chroma = librosa.feature.chroma_cqt(y=y, sr=sample_rate)
return chroma


def rhythm_analysis(waveform, sample_rate):
y = librosa.util.normalize(waveform.numpy()[0])
onset_env = librosa.onset.onset_strength(y=y, sr=sample_rate)
tempo, beats = librosa.beat.beat_track(
onset_envelope=onset_env, sr=sample_rate
)
return tempo, beats


def style_transfer(content_waveform, style_waveform, sample_rate):
y_content = content_waveform.numpy()[0]
y_style = style_waveform.numpy()[0]

content_spec = np.abs(librosa.stft(y_content))
style_spec = np.abs(librosa.stft(y_style))

n_components = 4
content_nmf = NMF(n_components=n_components, random_state=0)
style_nmf = NMF(n_components=n_components, random_state=0)

content_W = content_nmf.fit_transform(content_spec)
content_H = content_nmf.components_

style_W = style_nmf.fit_transform(style_spec)
style_H = style_nmf.components_

transferred_spec = np.dot(content_W, style_H)
transferred_audio = librosa.istft(transferred_spec)

return torch.FloatTensor(transferred_audio).unsqueeze(0)


class MusicGeneratorApp:
def __init__(self, master):
self.master = master
self.master.title("AI Music Generator")
self.master.geometry("800x600")

self.model = None
self.sample_rate = 22050 # Default sample rate
self.train_thread = None
self.train_losses = []
self.val_losses = []

self.create_widgets()

def create_widgets(self):
self.notebook = ttk.Notebook(self.master)
self.notebook.pack(fill=tk.BOTH, expand=True)

self.train_tab = ttk.Frame(self.notebook)
self.generate_tab = ttk.Frame(self.notebook)
self.analysis_tab = ttk.Frame(self.notebook)
self.style_transfer_tab = ttk.Frame(self.notebook)

self.notebook.add(self.train_tab, text="Train Model")
self.notebook.add(self.generate_tab, text="Generate Music")
self.notebook.add(self.analysis_tab, text="Music Analysis")
self.notebook.add(self.style_transfer_tab, text="Style Transfer")

self.create_train_widgets()
self.create_generate_widgets()
self.create_analysis_widgets()
self.create_style_transfer_widgets()

def create_train_widgets(self):
self.train_button = tk.Button(
self.train_tab, text="Train Model", command=self.train_model
)
self.train_button.pack(pady=10)

self.load_button = tk.Button(
self.train_tab, text="Load Model", command=self.load_model
)
self.load_button.pack(pady=10)

self.status_label = tk.Label(self.train_tab, text="")
self.status_label.pack(pady=10)

self.progress_bar = ttk.Progressbar(
self.train_tab, orient=tk.HORIZONTAL, length=300, mode='determinate'
)
self.progress_bar.pack(pady=10)

self.fig, self.ax = plt.subplots(figsize=(6, 4))
self.canvas = FigureCanvasTkAgg(self.fig, master=self.train_tab)
self.canvas.draw()
self.canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1)

def create_generate_widgets(self):
self.generate_button = tk.Button(
self.generate_tab, text="Generate Music", command=self.generate_music
)
self.generate_button.pack(pady=10)

self.temperature_label = tk.Label(
self.generate_tab, text="Temperature:"
)
self.temperature_label.pack()
self.temperature_slider = tk.Scale(
self.generate_tab, from_=0.1, to=2.0, resolution=0.1, orient=tk.HORIZONTAL
)
self.temperature_slider.set(1.0)
self.temperature_slider.pack()

self.length_label = tk.Label(
self.generate_tab, text="Length (seconds):"
)
self.length_label.pack()
self.length_entry = tk.Entry(self.generate_tab)
self.length_entry.insert(0, "10")
self.length_entry.pack()

self.status_label_generate = tk.Label(self.generate_tab, text="")
self.status_label_generate.pack(pady=10)

self.progress_bar_generate = ttk.Progressbar(
self.generate_tab, orient=tk.HORIZONTAL, length=300, mode='indeterminate'
)
self.progress_bar_generate.pack(pady=10)

def create_analysis_widgets(self):
self.analyze_button = tk.Button(
self.analysis_tab, text="Analyze Music", command=self.analyze_music
)
self.analyze_button.pack(pady=10)

self.analysis_text = tk.Text(self.analysis_tab, height=20, width=60)
self.analysis_text.pack(pady=10)

def create_style_transfer_widgets(self):
self.content_button = tk.Button(
self.style_transfer_tab, text="Select Content Audio", command=self.select_content_audio
)
self.content_button.pack(pady=10)

self.style_button = tk.Button(
self.style_transfer_tab, text="Select Style Audio", command=self.select_style_audio
)
self.style_button.pack(pady=10)

self.transfer_button = tk.Button(
self.style_transfer_tab, text="Transfer Style", command=self.transfer_style
)
self.transfer_button.pack(pady=10)

self.status_label_transfer = tk.Label(self.style_transfer_tab, text="")
self.status_label_transfer.pack(pady=10)

self.progress_bar_transfer = ttk.Progressbar(
self.style_transfer_tab, orient=tk.HORIZONTAL, length=300, mode='indeterminate'
)
self.progress_bar_transfer.pack(pady=10)

def train_model(self):
folder_path = filedialog.askdirectory(
title="Select folder with WAV files"
)
if not folder_path:
return

self.status_label.config(text="Loading and preprocessing data...")
self.master.update()

data = []
target_sample_rate = 22050
min_length = float('inf')
for file_name in os.listdir(folder_path):
if file_name.endswith(".wav"):
file_path = os.path.join(folder_path, file_name)
try:
waveform, sample_rate = load_audio(file_path)
preprocessed = preprocess_audio(
waveform, sample_rate, target_sample_rate
)
data.append(preprocessed)
min_length = min(min_length, preprocessed.shape[0])
except Exception as e:
print(f"Error processing {file_name}: {str(e)}")

if not data:
self.status_label.config(
text="No valid WAV files found or processed in the selected folder."
)
return

self.sample_rate = target_sample_rate

# Ensure all data have the same length
data = [d[:min_length, :] for d in data]
data = torch.stack(data)

sequence_length = 256
input_size = data.size(2)
output_size = input_size

input_sequences = []
target_sequences = []

for i in range(data.size(1) - sequence_length):
input_sequences.append(data[:, i:i+sequence_length, :])
target_sequences.append(data[:, i+1:i+sequence_length+1, :])

if not input_sequences:
self.status_label.config(
text="Not enough data to create sequences. Please use longer audio files."
)
return

input_sequences = torch.cat(input_sequences, dim=0)
target_sequences = torch.cat(target_sequences, dim=0)

# Ensure we have at least some data for validation
if len(input_sequences) < 2:
self.status_label.config(
text="Not enough data for training and validation. Please use more or longer audio files."
)
return

split = max(1, int(0.8 * len(input_sequences)))
train_inputs, val_inputs = input_sequences[:split], input_sequences[split:]
train_targets, val_targets = target_sequences[:split], target_sequences[split:]

train_dataset = TensorDataset(train_inputs, train_targets)
val_dataset = TensorDataset(val_inputs, val_targets)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

d_model = 512
nhead = 8
num_layers = 6

self.model = ImprovedTransformerModel(
input_size, output_size, d_model, nhead, num_layers
)
self.model.to(device)

self.status_label.config(text="Training model...")
self.master.update()

def train_thread():
try:
self.model, self.train_losses, self.val_losses = train_model(
self.model, train_loader, val_loader
)
self.master.after(0, self.update_plot)
self.status_label.config(text="Model trained successfully!")

save_path = filedialog.asksaveasfilename(
defaultextension=".pth", filetypes=[("PyTorch model", "*.pth")]
)
if save_path:
save_model(self.model, save_path)
self.status_label.config(
text="Model trained and saved successfully!"
)
except Exception as e:
self.status_label.config(
text=f"Error during training: {str(e)}"
)
print(f"Error during training: {str(e)}")
finally:
self.progress_bar.stop()

self.train_thread = threading.Thread(target=train_thread)
self.train_thread.start()

self.progress_bar.start()

def update_plot(self):
self.ax.clear()
self.ax.plot(self.train_losses, label='Train Loss')
self.ax.plot(self.val_losses, label='Validation Loss')
self.ax.set_xlabel('Epoch')
self.ax.set_ylabel('Loss')
self.ax.legend()
self.canvas.draw()

def load_model(self):
load_path = filedialog.askopenfilename(
filetypes=[("PyTorch model", "*.pth")]
)
if load_path:
try:
state_dict = torch.load(load_path, map_location=device)
input_size = state_dict['embedding.weight'].size(1)
output_size = state_dict['decoder.weight'].size(0)
d_model = state_dict['embedding.weight'].size(0)
nhead = 8
num_layers = len([
key for key in state_dict.keys()
if key.startswith('transformer_encoder.layers')
])

self.model = ImprovedTransformerModel(
input_size, output_size, d_model, nhead, num_layers
)
self.model = load_model(self.model, load_path)
self.model.to(device)
self.status_label.config(text="Model loaded successfully!")
except Exception as e:
messagebox.showerror(
"Error", f"Failed to load model: {str(e)}"
)

def generate_music(self):
if self.model is None:
messagebox.showwarning(
"Warning", "Please train or load a model first."
)
return

self.status_label_generate.config(text="Generating music...")
self.master.update()

temperature = self.temperature_slider.get()
try:
length_seconds = float(self.length_entry.get())
except ValueError:
messagebox.showerror(
"Error", "Invalid length value. Please enter a number."
)
return

if self.sample_rate is None:
self.sample_rate = 22050 # Default sample rate if not set

length_samples = int(length_seconds * self.sample_rate)

seed = torch.randn(256, self.model.embedding.in_features)

self.progress_bar_generate.start()

def generate_thread():
try:
generated = generate_music(
self.model, seed, length_samples, temperature
)

save_path = filedialog.asksaveasfilename(
defaultextension=".wav", filetypes=[("WAV files", "*.wav")]
)
if not save_path:
return

generated_features = generated.numpy().T

try:
reconstructed_audio = librosa.feature.inverse.mfcc_to_audio(
generated_features[:20, :]
)
reconstructed_audio = torch.FloatTensor(
reconstructed_audio)
reconstructed_audio = reconstructed_audio / torch.max(
torch.abs(reconstructed_audio)
)
save_audio(reconstructed_audio,
self.sample_rate, save_path)
self.status_label_generate.config(
text="Music generated and saved successfully!"
)
except Exception as e:
messagebox.showerror(
"Error", f"Failed to reconstruct audio: {str(e)}"
)
except Exception as e:
messagebox.showerror(
"Error", f"Failed to generate or save audio: {str(e)}"
)
finally:
self.progress_bar_generate.stop()

threading.Thread(target=generate_thread).start()

def analyze_music(self):
file_path = filedialog.askopenfilename(
filetypes=[("WAV files", "*.wav")]
)
if not file_path:
return

waveform, sample_rate = load_audio(file_path)

chroma = harmonic_analysis(waveform, sample_rate)
tempo, beats = rhythm_analysis(waveform, sample_rate)

analysis_text = f"Tempo: {tempo} BPM\n\n"
analysis_text += "Chroma Features:\n"
for i, chr in enumerate(chroma.T):
if i % 4 == 0:
analysis_text += "\n"
analysis_text += f"{chr.max():.2f} "

self.analysis_text.delete('1.0', tk.END)
self.analysis_text.insert(tk.END, analysis_text)

def select_content_audio(self):
self.content_path = filedialog.askopenfilename(
filetypes=[("WAV files", "*.wav")]
)
if self.content_path:
self.status_label_transfer.config(text="Content audio selected")

def select_style_audio(self):
self.style_path = filedialog.askopenfilename(
filetypes=[("WAV files", "*.wav")]
)
if self.style_path:
self.status_label_transfer.config(text="Style audio selected")

def transfer_style(self):
if not hasattr(self, 'content_path') or not hasattr(self, 'style_path'):
messagebox.showwarning(
"Warning", "Please select both content and style audio files."
)
return

self.progress_bar_transfer.start()

def transfer_thread():
try:
content_waveform, content_sample_rate = load_audio(
self.content_path
)
style_waveform, style_sample_rate = load_audio(
self.style_path
)

if content_sample_rate != style_sample_rate:
style_waveform = torchaudio.functional.resample(
style_waveform, style_sample_rate, content_sample_rate
)

transferred_audio = style_transfer(
content_waveform, style_waveform, content_sample_rate
)

save_path = filedialog.asksaveasfilename(
defaultextension=".wav", filetypes=[("WAV files", "*.wav")]
)
if not save_path:
return

save_audio(transferred_audio, content_sample_rate, save_path)
self.status_label_transfer.config(
text="Style transfer completed and saved successfully!"
)
except Exception as e:
messagebox.showerror(
"Error", f"Failed to transfer style: {str(e)}"
)
finally:
self.progress_bar_transfer.stop()

threading.Thread(target=transfer_thread).start()


def main():
root = tk.Tk()
app = MusicGeneratorApp(root)
root.mainloop()


if __name__ == "__main__":
main()

Мысль

Пишем музыку

Предлагаю создать музыковый генератор с ИИ. Исправляем ошибку "не хватает данных" и гарантируем качество вывода, поддерживая любые WAV файлы.

Проверка соответствия

Я анализирую последние сообщения пользователя, разнообразные кодификации и инструкции, чтобы понять, соответствует ли их содержание политике.

Привет! Видимо, ошибка указывает на недостаток данных для создания последовательностей. Учитываю это при улучшении кода.

Понимание запроса

Пользователь хочет создать музыкальное приложение с нуля, избегая использования предобученных моделей. Требуется исправить ошибку, улучшить производительность и обеспечить визуализацию музыкальных данных.

Реализуем улучшения

Проверяю код на достаточный объем аудиоданных и адаптирую его для корректного отображения визуализаций в GUI.

Обрабатываю аудиофайлы: ускоряю обучение модели, применяя данные для данных для повышения производительности.

Обрабатываю WAV файлы

Исправляю ошибки, обеспечиваю стерео, показываю музыкальные данные и улучшаю скорость и качество.

Исправление и улучшение

Исправляю ошибку "Недостаточно данных для создания последовательностей...", обеспечивая поддержку любых WAV файлов и улучшение модели для быстрой и качественной генерации.

РАССМАТРИВАЮ МОЖНОСТИ
Анализирую способы обработки WAV-файлов разной длины, включая добавление нулей или начало заново. Говорю также о возможности объединения всех данных или оптимизации модели.

Интеграция и дополнение

Я думаю о том, как добавить и улучшить музыкальные данные в окно контекста, включая стерео аудио, волны и спектрограммы, а также улучшение музыкального результата с помощью данных оживления и предобработки.

Пробую альтернативу

Я исследую, как объединение всех данных в один длинный последовательный список может обойти проблему слишком коротких аудиофайлов в коде train_model().

Совмещаю данные

Использую torch.cat вместо torch.stack, чтобы объединить данные в один длинный тензор. Это бе (8) требует внимание к размерам, особенно для стерео.

Ответ

import torch import torch.nn as nn import torch.optim as optim import torchaudio import tkinter as tk from tkinter import filedialog, messagebox, ttk import os import numpy as np from torch.utils.data import DataLoader, TensorDataset from tqdm import tqdm import matplotlib.pyplot as plt from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg import threading import librosa import math # Check CUDA availability device = torch.device("cuda" if torch.cuda.is_available() else "cpu") class GELUActivation(nn.Module): def forward(self, x): return 0.5 * x * (1 + torch.tanh( math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)) )) class PositionalEncoding(nn.Module): def __init__(self, d_model, dropout=0.1, max_len=10000): super(PositionalEncoding, self).__init__() self.dropout = nn.Dropout(p=dropout) pe = torch.zeros(max_len, d_model) position = torch.arange( 0, max_len, dtype=torch.float).unsqueeze(1) div_term = torch.exp(torch.arange( 0, d_model, 2).float() * (-math.log(10000.0) / d_model) ) pe[:, 0::2] = torch.sin(position * div_term) # even indices pe[:, 1::2] = torch.cos(position * div_term) # odd indices pe = pe.unsqueeze(0) # shape (1, max_len, d_model) self.register_buffer('pe', pe) def forward(self, x): x = x + self.pe[:, :x.size(1)] return self.dropout(x) class ImprovedTransformerEncoderLayer(nn.Module): def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1): super(ImprovedTransformerEncoderLayer, self).__init__() self.self_attn = nn.MultiheadAttention( d_model, nhead, dropout=dropout, batch_first=True ) self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) self.activation = GELUActivation() def forward(self, src, src_mask=None, src_key_padding_mask=None): src2 = self.self_attn( src, src, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask )[0] src = src + self.dropout1(src2) src = self.norm1(src) src2 = self.linear2( self.dropout(self.activation(self.linear1(src))) ) src = src + self.dropout2(src2) src = self.norm2(src) return src class ImprovedTransformerModel(nn.Module): def __init__( self, input_size, output_size, d_model=512, nhead=8, num_layers=6, dropout=0.1 ): super(ImprovedTransformerModel, self).__init__() self.embedding = nn.Linear(input_size, d_model) self.pos_encoder = PositionalEncoding(d_model, dropout) encoder_layer = ImprovedTransformerEncoderLayer( d_model, nhead, dim_feedforward=2048, dropout=dropout ) self.transformer_encoder = nn.TransformerEncoder( encoder_layer, num_layers ) self.decoder = nn.Linear(d_model, output_size) self.init_weights() def init_weights(self): initrange = 0.1 nn.init.uniform_(self.embedding.weight, -initrange, initrange) nn.init.zeros_(self.embedding.bias) nn.init.zeros_(self.decoder.bias) nn.init.uniform_(self.decoder.weight, -initrange, initrange) def forward(self, src): src = self.embedding(src) src = self.pos_encoder(src) output = self.transformer_encoder(src) output = self.decoder(output) return output def load_audio(file_path, target_sample_rate=22050): waveform, sample_rate = torchaudio.load(file_path) # Resample if necessary if sample_rate != target_sample_rate: waveform = torchaudio.functional.resample( waveform, sample_rate, target_sample_rate ) # Ensure waveform is stereo if waveform.size(0) == 1: waveform = waveform.repeat(2, 1) elif waveform.size(0) > 2: waveform = waveform[:2, :] # Take the first two channels return waveform, target_sample_rate def extract_features(y, sr): # Extract various features mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40) chroma = librosa.feature.chroma_stft(y=y, sr=sr) spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr) spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr) spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr) # Concatenate all features features = np.concatenate( [mfcc, chroma, spectral_centroid, spectral_contrast, spectral_rolloff], axis=0 ) return features def preprocess_audio(waveform, sample_rate, target_sample_rate=22050): # Convert to numpy array y = waveform.numpy() y = librosa.util.normalize(y) y = np.mean(y, axis=0) # Convert to mono for feature extraction # Extract features features = extract_features(y, target_sample_rate) # Convert to tensor and normalize features = torch.FloatTensor(features) features = features / torch.max(torch.abs(features)) # Ensure the correct shape (time, features) features = features.t() return features def train_model(model, train_loader, val_loader, epochs=100, lr=0.001): criterion = nn.MSELoss() optimizer = optim.AdamW( model.parameters(), lr=lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.01 ) scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs) best_val_loss = float('inf') best_model = None train_losses = [] val_losses = [] for epoch in range(epochs): model.train() total_loss = 0 progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}") for batch in progress_bar: optimizer.zero_grad() inputs, targets = batch inputs, targets = inputs.to(device), targets.to(device) outputs = model(inputs) loss = criterion(outputs, targets) loss.backward() optimizer.step() total_loss += loss.item() progress_bar.set_postfix( {'training_loss': f'{total_loss/len(train_loader):.4f}'} ) avg_train_loss = total_loss / len(train_loader) train_losses.append(avg_train_loss) model.eval() val_loss = 0 with torch.no_grad(): for batch in val_loader: inputs, targets = batch inputs, targets = inputs.to(device), targets.to(device) outputs = model(inputs) loss = criterion(outputs, targets) val_loss += loss.item() avg_val_loss = val_loss / len(val_loader) val_losses.append(avg_val_loss) print( f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}" ) scheduler.step() if avg_val_loss < best_val_loss: best_val_loss = avg_val_loss best_model = model.state_dict() model.load_state_dict(best_model) return model, train_losses, val_losses def generate_music(model, seed, length=1000, temperature=1.0): model.eval() generated = seed.unsqueeze(0).to(device) with torch.no_grad(): for _ in tqdm(range(length), desc="Generating"): input_seq = generated[:, -seed.size(0):, :] output = model(input_seq) next_feature = output[:, -1, :] / temperature generated = torch.cat( [generated, next_feature.unsqueeze(1)], dim=1) return generated.squeeze(0).cpu() def save_audio(waveform, sample_rate, file_path): torchaudio.save(file_path, waveform, sample_rate) def save_model(model, path): torch.save(model.state_dict(), path) def load_model(model, path): state_dict = torch.load(path, map_location=device) model.load_state_dict(state_dict) return model def harmonic_analysis(waveform, sample_rate): y = librosa.util.normalize(waveform.numpy()) chroma = librosa.feature.chroma_cqt(y=y.mean(axis=0), sr=sample_rate) return chroma def rhythm_analysis(waveform, sample_rate): y = librosa.util.normalize(waveform.numpy()) onset_env = librosa.onset.onset_strength(y=y.mean(axis=0), sr=sample_rate) tempo, beats = librosa.beat.beat_track( onset_envelope=onset_env, sr=sample_rate ) return tempo, beats class MusicGeneratorApp: def __init__(self, master): self.master = master self.master.title("AI Music Generator") self.master.geometry("800x600") self.model = None self.sample_rate = 22050 # Default sample rate self.train_thread = None self.train_losses = [] self.val_losses = [] self.create_widgets() def create_widgets(self): self.notebook = ttk.Notebook(self.master) self.notebook.pack(fill=tk.BOTH, expand=True) self.train_tab = ttk.Frame(self.notebook) self.generate_tab = ttk.Frame(self.notebook) self.analysis_tab = ttk.Frame(self.notebook) self.notebook.add(self.train_tab, text="Train Model") self.notebook.add(self.generate_tab, text="Generate Music") self.notebook.add(self.analysis_tab, text="Music Analysis") self.create_train_widgets() self.create_generate_widgets() self.create_analysis_widgets() def create_train_widgets(self): self.train_button = tk.Button( self.train_tab, text="Train Model", command=self.train_model ) self.train_button.pack(pady=10) self.load_button = tk.Button( self.train_tab, text="Load Model", command=self.load_model ) self.load_button.pack(pady=10) self.status_label = tk.Label(self.train_tab, text="") self.status_label.pack(pady=10) self.progress_bar = ttk.Progressbar( self.train_tab, orient=tk.HORIZONTAL, length=300, mode='determinate' ) self.progress_bar.pack(pady=10) self.fig, self.ax = plt.subplots(figsize=(6, 4)) self.canvas = FigureCanvasTkAgg(self.fig, master=self.train_tab) self.canvas.draw() self.canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1) def create_generate_widgets(self): self.generate_button = tk.Button( self.generate_tab, text="Generate Music", command=self.generate_music ) self.generate_button.pack(pady=10) self.temperature_label = tk.Label( self.generate_tab, text="Temperature:" ) self.temperature_label.pack() self.temperature_slider = tk.Scale( self.generate_tab, from_=0.1, to=2.0, resolution=0.1, orient=tk.HORIZONTAL ) self.temperature_slider.set(1.0) self.temperature_slider.pack() self.length_label = tk.Label( self.generate_tab, text="Length (seconds):" ) self.length_label.pack() self.length_entry = tk.Entry(self.generate_tab) self.length_entry.insert(0, "10") self.length_entry.pack() self.status_label_generate = tk.Label(self.generate_tab, text="") self.status_label_generate.pack(pady=10) self.progress_bar_generate = ttk.Progressbar( self.generate_tab, orient=tk.HORIZONTAL, length=300, mode='indeterminate' ) self.progress_bar_generate.pack(pady=10) # Visualization canvas self.fig_gen, self.ax_gen = plt.subplots(figsize=(6, 4)) self.canvas_gen = FigureCanvasTkAgg( self.fig_gen, master=self.generate_tab) self.canvas_gen.draw() self.canvas_gen.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1) def create_analysis_widgets(self): self.analyze_button = tk.Button( self.analysis_tab, text="Analyze Music", command=self.analyze_music ) self.analyze_button.pack(pady=10) self.analysis_text = tk.Text(self.analysis_tab, height=20, width=60) self.analysis_text.pack(pady=10) def train_model(self): folder_path = filedialog.askdirectory( title="Select folder with WAV files" ) if not folder_path: return self.status_label.config(text="Loading and preprocessing data...") self.master.update() data_list = [] target_sample_rate = 22050 for file_name in os.listdir(folder_path): if file_name.endswith(".wav"): file_path = os.path.join(folder_path, file_name) try: waveform, sample_rate = load_audio( file_path, target_sample_rate) preprocessed = preprocess_audio( waveform, sample_rate, target_sample_rate ) data_list.append(preprocessed) except Exception as e: print(f"Error processing {file_name}: {str(e)}") if not data_list: self.status_label.config( text="No valid WAV files found or processed in the selected folder." ) return self.sample_rate = target_sample_rate # Concatenate all data into one sequence data = torch.cat(data_list, dim=0) sequence_length = 256 input_size = data.size(1) output_size = input_size num_sequences = data.size(0) - sequence_length if num_sequences <= 0: self.status_label.config( text="Not enough data to create sequences. Please use longer audio files." ) return input_sequences = [] target_sequences = [] for i in range(num_sequences): input_sequences.append(data[i:i+sequence_length, :]) target_sequences.append(data[i+1:i+sequence_length+1, :]) input_sequences = torch.stack(input_sequences) target_sequences = torch.stack(target_sequences) # Split data into training and validation sets split = max(1, int(0.9 * len(input_sequences))) train_inputs, val_inputs = input_sequences[:split], input_sequences[split:] train_targets, val_targets = target_sequences[:split], target_sequences[split:] train_dataset = TensorDataset(train_inputs, train_targets) val_dataset = TensorDataset(val_inputs, val_targets) train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True) val_loader = DataLoader(val_dataset, batch_size=32) d_model = 512 nhead = 8 num_layers = 6 self.model = ImprovedTransformerModel( input_size, output_size, d_model, nhead, num_layers ) self.model.to(device) self.status_label.config(text="Training model...") self.master.update() def train_thread(): try: self.model, self.train_losses, self.val_losses = train_model( self.model, train_loader, val_loader ) self.master.after(0, self.update_plot) self.status_label.config(text="Model trained successfully!") save_path = filedialog.asksaveasfilename( defaultextension=".pth", filetypes=[("PyTorch model", "*.pth")] ) if save_path: save_model(self.model, save_path) self.status_label.config( text="Model trained and saved successfully!" ) except Exception as e: self.status_label.config( text=f"Error during training: {str(e)}" ) print(f"Error during training: {str(e)}") finally: self.progress_bar.stop() self.train_thread = threading.Thread(target=train_thread) self.train_thread.start() self.progress_bar.start() def update_plot(self): self.ax.clear() self.ax.plot(self.train_losses, label='Train Loss') self.ax.plot(self.val_losses, label='Validation Loss') self.ax.set_xlabel('Epoch') self.ax.set_ylabel('Loss') self.ax.legend() self.canvas.draw() def load_model(self): load_path = filedialog.askopenfilename( filetypes=[("PyTorch model", "*.pth")] ) if load_path: try: state_dict = torch.load(load_path, map_location=device) input_size = state_dict['embedding.weight'].size(1) output_size = state_dict['decoder.weight'].size(0) d_model = state_dict['embedding.weight'].size(0) nhead = 8 num_layers = len([ key for key in state_dict.keys() if key.startswith('transformer_encoder.layers') ]) self.model = ImprovedTransformerModel( input_size, output_size, d_model, nhead, num_layers ) self.model = load_model(self.model, load_path) self.model.to(device) self.status_label.config(text="Model loaded successfully!") except Exception as e: messagebox.showerror( "Error", f"Failed to load model: {str(e)}" ) def generate_music(self): if self.model is None: messagebox.showwarning( "Warning", "Please train or load a model first." ) return self.status_label_generate.config(text="Generating music...") self.master.update() temperature = self.temperature_slider.get() try: length_seconds = float(self.length_entry.get()) except ValueError: messagebox.showerror( "Error", "Invalid length value. Please enter a number." ) return length_samples = int(length_seconds * self.sample_rate / 512) # Adjust for feature frames seed = torch.randn(256, self.model.embedding.in_features) self.progress_bar_generate.start() def generate_thread(): try: generated = generate_music( self.model, seed, length_samples, temperature ) generated_features = generated.numpy().T # Reconstruct audio from features reconstructed_audio = librosa.feature.inverse.mfcc_to_audio( generated_features[:40, :], n_mels=128, sr=self.sample_rate ) # Convert to stereo reconstructed_audio_stereo = np.vstack( [reconstructed_audio, reconstructed_audio]) reconstructed_audio_tensor = torch.FloatTensor( reconstructed_audio_stereo) save_path = filedialog.asksaveasfilename( defaultextension=".wav", filetypes=[("WAV files", "*.wav")] ) if not save_path: return save_audio(reconstructed_audio_tensor, self.sample_rate, save_path) self.status_label_generate.config( text="Music generated and saved successfully!" ) # Visualize generated waveform self.ax_gen.clear() self.ax_gen.plot(reconstructed_audio) self.canvas_gen.draw() except Exception as e: messagebox.showerror( "Error", f"Failed to generate or save audio: {str(e)}" ) finally: self.progress_bar_generate.stop() threading.Thread(target=generate_thread).start() def analyze_music(self): file_path = filedialog.askopenfilename( filetypes=[("WAV files", "*.wav")] ) if not file_path: return waveform, sample_rate = load_audio(file_path) chroma = harmonic_analysis(waveform, sample_rate) tempo, beats = rhythm_analysis(waveform, sample_rate) analysis_text = f"Tempo: {tempo:.2f} BPM\n\n" analysis_text += "Chroma Features:\n" chroma_mean = chroma.mean(axis=1) for idx, val in enumerate(chroma_mean): note = librosa.note_to_name(idx) analysis_text += f"{note}: {val:.2f}\n" self.analysis_text.delete('1.0', tk.END) self.analysis_text.insert(tk.END, analysis_text) def main(): root = tk.Tk() app = MusicGeneratorApp(root) root.mainloop() if __name__ == "__main__": main()

Поделиться этим Q&A