"""Synthesize speech from a sentence with pretrained NeMo TTS models.

Pipeline: raw text -> tokens -> spectrogram (Tacotron 2) -> audio (WaveGlow).
The resulting audio is written to disk twice: once through torchaudio and
once through soundfile.
"""

import soundfile as sf
import torchaudio as ta
from nemo.collections.tts.models.base import SpectrogramGenerator, Vocoder

# Sample rate passed to both writers when saving the generated audio.
SAMPLE_RATE = 22050

# Download and load the pretrained Tacotron 2 model.
spec_gen = SpectrogramGenerator.from_pretrained("tts_en_tacotron2")
# Download and load the pretrained WaveGlow model.
vocoder = Vocoder.from_pretrained("tts_waveglow_88m")

# All spectrogram generators start by parsing raw strings to a tokenized
# version of the string.
parsed = spec_gen.parse(
    "You can type your sentence here to get nemo to produce speech."
)
# They then take the tokenized string and produce a spectrogram.
spectrogram = spec_gen.generate_spectrogram(tokens=parsed)
# Finally, a vocoder converts the spectrogram to audio.
audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)

# Detach from any autograd graph and move to host memory before writing;
# both are no-ops when the tensor is already a plain CPU tensor.
# NOTE(review): assumes `audio` is a 2-D (batch, time) tensor -- confirm
# against the installed NeMo version.
waveform = audio.detach().cpu()

# torchaudio takes the tensor directly and treats the first dim as channels.
ta.save('attempt1.wav', waveform, SAMPLE_RATE)

# torchaudio.save does not accept NumPy arrays, so the NumPy copy is written
# with soundfile instead. soundfile expects (frames, channels), hence the
# transpose, which keeps the layout consistent with the torchaudio file.
sf.write('attemp2.wav', waveform.numpy().T, SAMPLE_RATE)