#!/usr/bin/env python3
"""Stream synthesized speech over TCP (tts_tcp_stream.py).

Reads lines of text from a named pipe, converts each non-empty line to audio
with a pretrained spectrogram generator and vocoder, normalizes the samples to
16-bit integers, and sends them to a remote host over a TCP connection.
"""
import datetime as dt
import os
import socket
import time

import numpy as np
from pytorch_lightning.core.saving import convert

import simpleaudio as sa


import soundfile as sf

from nemo.collections.tts.models.base import SpectrogramGenerator, Vocoder

FIFO_PATH = os.environ.get("FIFO_PATH", "tts_fifo_file")

# Default destination for the synthesized audio.
DEFAULT_HOST = 'rospi.runcible.io'
DEFAULT_PORT = 9000

# How long to wait before polling the pipe again once it reports end-of-file.
EOF_POLL_SECONDS = 0.1


def send_file(audio_data, host=DEFAULT_HOST, port=DEFAULT_PORT):
    """Send audio to ``host:port`` over a new TCP connection.

    Args:
        audio_data: The payload to send. Objects exposing ``tobytes()``
            (e.g. NumPy arrays) are sent as their raw buffer in the array's
            own dtype and native byte order; binary file objects are sent
            with ``socket.sendfile``; anything else is passed to ``sendall``
            and must therefore be bytes-like.
        host: Destination host name or address.
        port: Destination TCP port.

    Raises:
        OSError: If connecting or sending fails.
    """
    # The context manager closes the socket even when the send fails, so a
    # long-running process does not leak one descriptor per utterance.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as conn:
        conn.connect((host, port))
        if hasattr(audio_data, "read"):
            # socket.sendfile only works with real binary file objects.
            conn.sendfile(audio_data)
        elif hasattr(audio_data, "tobytes"):
            conn.sendall(audio_data.tobytes())
        else:
            conn.sendall(audio_data)


def create_fifo(path):
    """Create a named pipe at ``path``, tolerating one that already exists.

    Failures are reported on stdout rather than raised, so the caller goes on
    to open ``path`` regardless.

    Args:
        path: Filesystem location of the FIFO.
    """
    try:
        os.mkfifo(path)
    except FileExistsError:
        print("File {} already exists".format(path))
    except OSError as e:
        # Report the real cause instead of claiming the file already exists.
        print("Could not create FIFO {}: {}".format(path, e))


def _to_int16_pcm(samples):
    """Scale ``samples`` so the peak reaches the int16 limit and cast to int16."""
    peak = np.max(np.abs(samples)) if samples.size else 0
    # Silent or empty audio has no peak to scale by; skipping the division
    # avoids producing inf/NaN samples.
    if peak > 0:
        samples = samples * (32767 / peak)
    return samples.astype(np.int16)


def main():
    """Load the TTS models, then synthesize and send every line read from the FIFO."""
    # load models
    spec_gen = SpectrogramGenerator.from_pretrained("tts_en_tacotron2")
    # Alternative vocoder: Vocoder.from_pretrained("tts_waveglow_88m")
    vocoder = Vocoder.from_pretrained("tts_squeezewave")

    create_fifo(FIFO_PATH)

    print("Pipe text to {}".format(FIFO_PATH))
    with open(FIFO_PATH, 'r') as fifo_file:
        while True:
            line = fifo_file.readline()
            if not line:
                # readline() returns '' at end-of-file (e.g. once the writer
                # has closed the pipe); back off instead of spinning.
                time.sleep(EOF_POLL_SECONDS)
                continue

            text = line.strip()
            if not text:
                continue

            parsed = spec_gen.parse(text)
            spectrogram = spec_gen.generate_spectrogram(tokens=parsed)
            audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
            samples = audio.to('cpu').numpy().T
            # normalize to 16-bit range and send to the remote host
            send_file(_to_int16_pcm(samples))


if __name__ == "__main__":
    main()