text2speech.py
import os
import sys
import io
import torch
import time
import numpy as np
from collections import OrderedDict
import librosa
import librosa.display
from TTS.synthesis import *
from TTS.models.tacotron import Tacotron
from TTS.layers import *
from TTS.utils.data import *
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config
from TTS.utils.text import text_to_sequence
import spacy
from pydub import AudioSegment
from pydub.playback import play
class tts_class:
    def __init__(self):
        # Paths to the pre-trained Tacotron checkpoint and its config.
        ROOT_PATH = 'TTS/tts_model/'
        MODEL_PATH = os.path.join(ROOT_PATH, 'best_model.pth.tar')
        CONFIG_PATH = os.path.join(ROOT_PATH, 'config.json')
        OUT_FOLDER = os.path.join(ROOT_PATH, 'test')
        self.CONFIG = load_config(CONFIG_PATH)
        self.use_cuda = True  # set to False to run on CPU

        # Build the Tacotron model from the config hyperparameters.
        self.model = Tacotron(self.CONFIG.embedding_size,
                              self.CONFIG.num_freq,
                              self.CONFIG.num_mels,
                              self.CONFIG.r)

        # The audio processor handles spectrogram-to-waveform conversion and wav I/O.
        self.ap = AudioProcessor(self.CONFIG.sample_rate, self.CONFIG.num_mels,
                                 self.CONFIG.min_level_db, self.CONFIG.frame_shift_ms,
                                 self.CONFIG.frame_length_ms, self.CONFIG.ref_level_db,
                                 self.CONFIG.num_freq, self.CONFIG.power,
                                 self.CONFIG.preemphasis, 60)

        # Load the checkpoint weights (map them to CPU when CUDA is disabled).
        if self.use_cuda:
            cp = torch.load(MODEL_PATH)
        else:
            cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)
        self.model.load_state_dict(cp['model'])
        if self.use_cuda:
            self.model.cuda()
        self.model.eval()
        self.model.decoder.max_decoder_steps = 500

        # spaCy is used only for sentence segmentation.
        # Note: newer spaCy releases expect spacy.load("en_core_web_sm") instead of "en".
        self.nlp = spacy.load("en")

    def process(self, text):
        """Synthesize `text` sentence by sentence and return the generated wav file names."""
        self.model.decoder.max_decoder_steps = 500
        wavefiles = self.text2audio(text, self.model, self.CONFIG, self.use_cuda, self.ap)
        return wavefiles

    def tts(self, model, text, CONFIG, use_cuda, ap, wavefile, figures=True):
        """Run the model on a single sentence and write the waveform to `wavefile`."""
        waveform, alignment, spectrogram, stop_tokens = create_speech(
            model, text, CONFIG, use_cuda, ap)
        self.ap.save_wav(waveform, wavefile)

    def text2audio(self, text, model, CONFIG, use_cuda, ap):
        """Split `text` into sentences and synthesize each one into its own wav file."""
        wavefiles = []
        base_name = "gen_{}.wav"
        doc = self.nlp(text)
        for i, sent in enumerate(doc.sents):
            sentence = sent.text.strip()
            wavefile = base_name.format(i)
            self.tts(model, sentence, CONFIG, use_cuda, ap, wavefile)
            wavefiles.append(wavefile)
        return wavefiles

    def play(self, wavefiles):
        """Concatenate the generated wav files, play the result, then delete the files."""
        voice = AudioSegment.empty()
        for wavefile in wavefiles:
            voice += AudioSegment.from_wav(wavefile)
        play(voice)
        for w in wavefiles:
            os.remove(w)
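

# Minimal usage sketch (not part of the original file): it assumes the Tacotron
# checkpoint and config.json are present under TTS/tts_model/ and that pydub can
# reach an audio output device. The sentence text below is illustrative only.
if __name__ == "__main__":
    engine = tts_class()
    wavs = engine.process("Hello there. This is a short synthesis test.")
    engine.play(wavs)  # plays the concatenated audio, then removes the wav files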