You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
23 lines
1.0 KiB
Python
23 lines
1.0 KiB
Python
3 years ago
|
import datetime as dt
|
||
|
import soundfile as sf
|
||
|
|
||
|
from nemo.collections.tts.models.base import SpectrogramGenerator, Vocoder
|
||
|
|
||
|
# End-to-end text-to-speech demo: text -> tokens -> spectrogram -> waveform -> WAV.

# Sample rate passed to soundfile when writing the result.
# NOTE(review): presumably the rate the pretrained models were trained at -- confirm.
SAMPLE_RATE = 22050
# Destination WAV file for the synthesized speech.
OUTPUT_PATH = "squeeze2.wav"

# Download and load the pretrained Tacotron 2 spectrogram generator.
spec_gen = SpectrogramGenerator.from_pretrained("tts_en_tacotron2")
# Download and load the pretrained WaveGlow vocoder.
# To compare against SqueezeWave, load "tts_squeezewave" here instead.
vocoder = Vocoder.from_pretrained("tts_waveglow_88m")

# Timestamps are printed around each stage so encoding and write time can be
# read off the console output.
print("starting at {}".format(dt.datetime.now()))
# Spectrogram generators first parse the raw string into a tokenized form.
parsed = spec_gen.parse("How will this squeeze model sound?")
# The tokens are then turned into a spectrogram.
spectrogram = spec_gen.generate_spectrogram(tokens=parsed)
# Finally, the vocoder converts the spectrogram to audio.
audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
print("Finished encoding {}".format(dt.datetime.now()))

# Save the audio to disk at OUTPUT_PATH.
# detach() guards against the tensor still tracking gradients, in which case
# Tensor.numpy() would raise; it is a no-op otherwise.
# The transpose hands soundfile the array with its two axes swapped --
# presumably (batch, samples) -> (frames, channels); TODO confirm.
sf.write(OUTPUT_PATH, audio.to('cpu').detach().numpy().T, SAMPLE_RATE)
print("Finished write at {}".format(dt.datetime.now()))
|