"""Gradio demo: answer a prompt with GPT-4 and speak the reply in a cloned voice.

A reference recording (uploaded or taken from the microphone) is turned into
a speaker embedding, GPT-4 produces a short reply to the user's prompt, and
SpeechT5 plus the HiFi-GAN vocoder synthesise that reply conditioned on the
embedding.

The OpenAI key is read from the ``OPENAI_API_KEY`` environment variable; it
must never be committed to source control.
"""
import gc
import os

import gradio as gr
import librosa
import numpy as np
import openai as ai
import scipy.signal as sps
import torch
from scipy.io import wavfile
from speechbrain.pretrained import EncoderClassifier
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Sample rate (Hz) this script emits for generated speech and to which the
# reference recording is resampled before the speaker encoder sees it.
TARGET_SAMPLE_RATE = 16000

checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# SECURITY: the key used to be hard-coded here. Any key that was ever
# committed should be revoked; supply a fresh one through the environment.
ai.api_key = os.environ.get("OPENAI_API_KEY")

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)


def create_speaker_embedding(waveform):
    """Return an L2-normalised speaker embedding for *waveform*.

    :param waveform: audio samples (array-like); they are cast to float32
        before being handed to the speaker encoder, because integer PCM would
        otherwise be passed through as an integer tensor.
    :returns: the squeezed embedding as a NumPy array on the CPU.
    """
    samples = torch.as_tensor(np.asarray(waveform), dtype=torch.float32)
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(samples)
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings


def _to_mono_float32(audio_data):
    """Convert raw samples to a 1-D float32 array, scaling integer PCM to [-1, 1]."""
    samples = np.asarray(audio_data)
    if np.issubdtype(samples.dtype, np.integer):
        full_scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # assumes a (samples, channels) layout — TODO confirm against the UI
        samples = samples.mean(axis=1)
    return samples


def prepare_data(temp_text, audio_prompt):
    """Build the speaker-embedding tensor for a reference recording.

    :param temp_text: transcript of the reference recording. It does not
        influence the returned embedding and is kept only so existing callers
        keep working.
    :param audio_prompt: ``(sample_rate, samples)`` tuple as produced by the
        audio inputs of this app.
    :returns: a tensor holding the speaker embedding with a leading batch
        dimension added.
    :raises ValueError: if the recording contains no samples.
    """
    rate, audio_data = audio_prompt
    samples = _to_mono_float32(audio_data)
    if samples.size == 0:
        raise ValueError("The reference audio is empty.")

    # The recording arrives at whatever rate the file or microphone used;
    # bring it to the rate the rest of the pipeline works at.
    rate = int(rate)
    if rate != TARGET_SAMPLE_RATE:
        samples = sps.resample_poly(samples, TARGET_SAMPLE_RATE, rate)
        samples = samples.astype(np.float32)

    embedding = create_speaker_embedding(samples)
    return torch.tensor(embedding).unsqueeze(0)


def generate_gpt4_response(user_text, print_output=False):
    """Query OpenAI GPT-4 with *user_text* and return the reply text.

    A suffix asking for a two-sentence answer is appended to the prompt.

    :param user_text: the user's text to query for.
    :type user_text: str
    :param print_output: whether or not to print the raw output JSON.
    :type print_output: bool
    :returns: the message content of the first completion choice.
    """
    # The leading space keeps the suffix from fusing with the user's last word.
    message = [
        {"role": "user", "content": user_text + " in just 2 very small sentences"}
    ]
    completions = ai.ChatCompletion.create(
        model="gpt-4",
        messages=message,
        max_tokens=250,
    )
    if print_output:
        print(completions)
    # Return the first choice's text
    return completions["choices"][0]["message"]["content"]


def predict(temp_text, temp_audio, record_audio_prompt, prompt_text):
    """Generate a GPT-4 reply to *prompt_text* and speak it in the reference voice.

    :param temp_text: transcript of the reference recording.
    :param temp_audio: uploaded reference audio, or ``None``.
    :param record_audio_prompt: microphone reference audio, used only when
        *temp_audio* is ``None``.
    :param prompt_text: the prompt sent to GPT-4.
    :returns: ``(reply_text, (sample_rate, int16_samples))``.
    :raises gr.Error: if neither reference audio input was provided.
    """
    audio_prompt = temp_audio if temp_audio is not None else record_audio_prompt
    if audio_prompt is None:
        raise gr.Error("Please upload or record a reference voice sample first.")

    # Validate and embed the audio before the (billed) network call so bad
    # input fails fast.
    embeddings = prepare_data(temp_text, audio_prompt)
    text = generate_gpt4_response(prompt_text)

    inputs = processor(text=text, return_tensors="pt")
    with torch.no_grad():
        spectrogram = model.generate_speech(inputs["input_ids"], embeddings)
        waveform = vocoder(spectrogram)

    # Clip before scaling so out-of-range samples saturate instead of
    # wrapping around in the int16 conversion.
    pcm = np.clip(waveform.cpu().numpy(), -1.0, 1.0)
    pcm = (pcm * 32767).astype(np.int16)

    gc.collect()
    return text, (TARGET_SAMPLE_RATE, pcm)


app = gr.Blocks()
with app:
    with gr.Row():
        with gr.Column():
            temp_text = gr.Text(label="Template Text")
            temp_audio = gr.Audio(label="Template Speech", type="numpy")
            record_audio_prompt = gr.Audio(
                label='recorded audio prompt', source='microphone', type="numpy"
            )
            prompt_text = gr.Text(label="Input Text")
        with gr.Column():
            text = gr.Textbox(label="Message")
            speech = gr.Audio(label="Generated Speech", type="numpy")
            btn = gr.Button("Generate!")
    btn.click(
        predict,
        inputs=[temp_text, temp_audio, record_audio_prompt, prompt_text],
        outputs=[text, speech],
    )

app.launch()