Skip to content

Commit

Permalink
support single speaker & fix loading old model
Browse files Browse the repository at this point in the history
  • Loading branch information
wind4000 committed Nov 10, 2022
1 parent 71881d6 commit f6d9de5
Show file tree
Hide file tree
Showing 5 changed files with 469 additions and 33 deletions.
147 changes: 146 additions & 1 deletion data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,151 @@
from utils import load_wav_to_torch, load_filepaths_and_text
from text import text_to_sequence, cleaned_text_to_sequence


class TextAudioLoader(torch.utils.data.Dataset):
    """Single-speaker dataset of (text, spectrogram, waveform, emotion) tuples.

    1) loads audio, text pairs
    2) normalizes text and converts them to sequences of integers
    3) computes spectrograms from audio files.
    """
    def __init__(self, audiopaths_and_text, hparams):
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.win_length = hparams.win_length
        # NOTE: the original assigned self.sampling_rate a second time here;
        # the redundant duplicate assignment has been removed.

        # When True, the filelist already contains cleaned/phonemized text.
        self.cleaned_text = getattr(hparams, "cleaned_text", False)

        self.add_blank = hparams.add_blank
        self.min_text_len = getattr(hparams, "min_text_len", 1)
        self.max_text_len = getattr(hparams, "max_text_len", 190)

        # Fixed seed so every process/rank shuffles the filelist identically.
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)
        self._filter()

    def _filter(self):
        """Drop entries whose text length is out of range and store
        approximate spectrogram lengths for bucketing.
        """
        # Store spectrogram lengths for Bucketing
        # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
        # spec_length = wav_length // hop_length
        audiopaths_and_text_new = []
        lengths = []
        for audiopath, text in self.audiopaths_and_text:
            if self.min_text_len <= len(text) <= self.max_text_len:
                audiopaths_and_text_new.append([audiopath, text])
                lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
        self.audiopaths_and_text = audiopaths_and_text_new
        self.lengths = lengths

    def get_audio_text_pair(self, audiopath_and_text):
        """Return (text_ids, spectrogram, waveform, emotion_embedding)."""
        # separate filename and text
        audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
        text = self.get_text(text)
        spec, wav = self.get_audio(audiopath)
        # Pre-extracted emotion embedding stored next to the wav file.
        emo = torch.FloatTensor(np.load(audiopath + ".emo.npy"))
        return (text, spec, wav, emo)

    def get_audio(self, filename):
        """Load a wav, normalize it, and return (spectrogram, waveform).

        The spectrogram is cached on disk as ``<name>.spec.pt`` so it is
        only computed once per file.

        Raises:
            ValueError: if the file's sample rate differs from hparams.
        """
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            # Bug fix: the original format string had three placeholders but
            # only two arguments, so it raised IndexError instead of the
            # intended ValueError. Include the filename for easier debugging.
            raise ValueError("{}: {} SR doesn't match target {} SR".format(
                filename, sampling_rate, self.sampling_rate))
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        spec_filename = filename.replace(".wav", ".spec.pt")
        if os.path.exists(spec_filename):
            spec = torch.load(spec_filename)
        else:
            spec = spectrogram_torch(audio_norm, self.filter_length,
                self.sampling_rate, self.hop_length, self.win_length,
                center=False)
            spec = torch.squeeze(spec, 0)
            torch.save(spec, spec_filename)
        return spec, audio_norm

    def get_text(self, text):
        """Convert a text string into a LongTensor of symbol ids."""
        if self.cleaned_text:
            text_norm = cleaned_text_to_sequence(text)
        else:
            text_norm = text_to_sequence(text, self.text_cleaners)
        if self.add_blank:
            # Interleave blank (0) tokens between symbols for alignment.
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm

    def __getitem__(self, index):
        return self.get_audio_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)


class TextAudioCollate():
    """Zero-pads model inputs and targets for the single-speaker loader."""
    def __init__(self, return_ids=False):
        self.return_ids = return_ids

    def __call__(self, batch):
        """Collate a training batch from normalized text and audio.

        PARAMS
        ------
        batch: list of (text_normalized, spec_normalized, wav_normalized, emo)
        """
        # Sort examples by spectrogram length, descending (required by
        # downstream packed/sorted processing).
        _, ids_sorted_decreasing = torch.sort(
            torch.LongTensor([x[1].size(1) for x in batch]),
            dim=0, descending=True)

        max_text_len = max([len(x[0]) for x in batch])
        max_spec_len = max([x[1].size(1) for x in batch])
        max_wav_len = max([x[2].size(1) for x in batch])

        text_lengths = torch.LongTensor(len(batch))
        spec_lengths = torch.LongTensor(len(batch))
        wav_lengths = torch.LongTensor(len(batch))

        text_padded = torch.LongTensor(len(batch), max_text_len)
        spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
        wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
        # Emotion-embedding dimension is fixed at 1024 by the extractor.
        emo = torch.FloatTensor(len(batch), 1024)

        text_padded.zero_()
        spec_padded.zero_()
        wav_padded.zero_()
        emo.zero_()

        for i in range(len(ids_sorted_decreasing)):
            row = batch[ids_sorted_decreasing[i]]

            text = row[0]
            text_padded[i, :text.size(0)] = text
            text_lengths[i] = text.size(0)

            spec = row[1]
            spec_padded[i, :, :spec.size(1)] = spec
            spec_lengths[i] = spec.size(1)

            wav = row[2]
            wav_padded[i, :, :wav.size(1)] = wav
            wav_lengths[i] = wav.size(1)

            # Bug fix: the original did `emo[:] = row[3]`, which overwrote
            # EVERY batch row with the current example's embedding on each
            # iteration, leaving all examples with the last row's emotion.
            emo[i, :] = row[3]

        if self.return_ids:
            # NOTE(review): this path mirrors the original signature and
            # does not include `emo` — callers unpack exactly 7 values.
            return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, ids_sorted_decreasing
        return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, emo


"""Multi speaker version"""
class TextAudioSpeakerLoader(torch.utils.data.Dataset):
"""
Expand Down Expand Up @@ -159,7 +304,7 @@ def __call__(self, batch):

if self.return_ids:
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid,emo
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, emo


class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
Expand Down
2 changes: 1 addition & 1 deletion emotion_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,6 @@ def preprocess_one(path):
print(filelist,"----start emotion extract-------")
with open(filelist) as f:
for idx, line in enumerate(f.readlines()):
path, _, _ = line.strip().split("|")
path = line.strip().split("|")[0]
preprocess_one(path)
print(idx, path)
Loading

0 comments on commit f6d9de5

Please sign in to comment.