Spaces:

CAMB-AI
/

mars5_space

Sleeping

App Files Files Community

arnavmehta7 committed on Jun 17, 2024

Commit

3a0629c

verified ·

1 Parent(s): e1be390

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -19

app.py CHANGED Viewed

@@ -4,17 +4,40 @@ import torch
 import librosa
 from pathlib import Path
 import tempfile, torchaudio
 # Load the MARS5 model
 mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
-# Default reference audio and transcript
-# default_audio_path = "example.wav"
-# default_transcript = "We actually haven't managed to meet demand."
 # Function to process the text and audio input and generate the synthesized output
 def synthesize(text, audio_file, transcript):
     # Load the reference audio
     wav, sr = librosa.load(audio_file, sr=mars5.sr, mono=True)
     wav = torch.from_numpy(wav)
@@ -29,21 +52,52 @@ def synthesize(text, audio_file, transcript):
     # Save the synthesized audio to a temporary file
     output_path = Path(tempfile.mktemp(suffix=".wav"))
     torchaudio.save(output_path, wav_out.unsqueeze(0), mars5.sr)
     return str(output_path)
-# Create the Gradio interface
-interface = gr.Interface(
-    fn=synthesize,
-    inputs=[
-        gr.Textbox(label="Text to synthesize"),
-        gr.Audio(label="Audio file to clone from", type="filepath"),
-        gr.Textbox(label="Uploaded audio file transcript"),
-    ],
-    outputs=gr.Audio(label="Synthesized Audio"),
-    title="MARS5 TTS Demo",
-    description="Enter text and upload an audio file to clone the voice and generate synthesized speech using MARS5 TTS."
-)
-# Launch the Gradio app
-interface.launch()

 import librosa
 from pathlib import Path
 import tempfile, torchaudio
+# from faster_whisper import WhisperModel
+from transformers import pipeline
+from uuid import uuid4
 # Load the MARS5 model
 mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
+# asr_model = WhisperModel("small", device="cpu", compute_type="int8")
+asr_model = pipeline(
+    "automatic-speech-recognition",
+    model="openai/whisper-medium",
+    chunk_length_s=30,
+    device=torch.device("cuda"),
+)
+def transcribe_file(f: str) -> str:
+    predictions = asr_model(f, return_timestamps=True)["chunks"]
+    print(f">>>>>.  predictions: {predictions}")
+    return " ".join([prediction["text"] for prediction in predictions])
 # Function to process the text and audio input and generate the synthesized output
 def synthesize(text, audio_file, transcript):
+    audio_file = Path(audio_file)
+    temp_file = f"{uuid4()}.{audio_file.suffix}"
+    # copying the audio_file
+    with open(audio_file, 'rb') as src, open(temp_file, 'wb') as dst:
+        dst.write(src.read())
+    audio_file = temp_file
+    print(f">>>>> synthesizing! audio_file: {audio_file}")
+    if not transcript:
+        transcript = transcribe_file(audio_file)
     # Load the reference audio
     wav, sr = librosa.load(audio_file, sr=mars5.sr, mono=True)
     wav = torch.from_numpy(wav)
     # Save the synthesized audio to a temporary file
     output_path = Path(tempfile.mktemp(suffix=".wav"))
     torchaudio.save(output_path, wav_out.unsqueeze(0), mars5.sr)
     return str(output_path)
+defaults = {
+    'temperature': 0.8,
+    'top_k': -1,
+    'top_p': 0.2,
+    'typical_p': 1.0,
+    'freq_penalty': 2.6,
+    'presence_penalty': 0.4,
+    'rep_penalty_window': 100,
+    'max_prompt_phones': 360,
+    'deep_clone': True,
+    'nar_guidance_w': 3
+}
+with gr.Blocks() as demo:
+    gr.Markdown("## MARS5 TTS Demo\nEnter text and upload an audio file to clone the voice and generate synthesized speech using MARS5 TTS.")
+    text = gr.Textbox(label="Text to synthesize")
+    audio_file = gr.Audio(label="Audio file to clone from", type="filepath")
+    generate_btn = gr.Button(label="Generate Synthesized Audio")
+    with gr.Accordion("Advanced Settings", open=False):
+        gr.Markdown("additional inference settings\nWARNING: changing these incorrectly may degrade quality.")
+        prompt_text = gr.Textbox(label="Transcript of voice reference")
+        temperature = gr.Slider(minimum=0.01, maximum=3, step=0.01, label="temperature", value=defaults['temperature'])
+        top_k = gr.Slider(minimum=-1, maximum=2000, step=1, label="top_k", value=defaults['top_k'])
+        top_p = gr.Slider(minimum=0.01, maximum=1.0, step=0.01, label="top_p", value=defaults['top_p'])
+        typical_p = gr.Slider(minimum=0.01, maximum=1, step=0.01, label="typical_p", value=defaults['typical_p'])
+        freq_penalty = gr.Slider(minimum=0, maximum=5, step=0.05, label="freq_penalty", value=defaults['freq_penalty'])
+        presence_penalty = gr.Slider(minimum=0, maximum=5, step=0.05, label="presence_penalty", value=defaults['presence_penalty'])
+        rep_penalty_window = gr.Slider(minimum=1, maximum=500, step=1, label="rep_penalty_window", value=defaults['rep_penalty_window'])
+        nar_guidance_w = gr.Slider(minimum=1, maximum=8, step=0.1, label="nar_guidance_w", value=defaults['nar_guidance_w'])
+        meta_n = gr.Slider(minimum=1, maximum=10, step=1, label="meta_n", value=2, interactive=False)
+        deep_clone = gr.Checkbox(value=defaults['deep_clone'], label='deep_clone')
+        dummy = gr.Number(label='Example number', visible=False)
+    output = gr.Audio(label="Synthesized Audio", type="filepath")
+    def on_click(text, audio_file, prompt_text):
+        print(f">>>> transcript: {prompt_text}; audio_file = {audio_file}")
+        of = synthesize(text, audio_file, prompt_text)
+        print(f">>>> output file: {of}")
+        return of
+    generate_btn.click(on_click, inputs=[text, audio_file, prompt_text], outputs=[output])
+demo.launch(share=False)