Add voice selector
app.py CHANGED
@@ -104,11 +104,14 @@ def generate_response(messages, model, tokenizer):
 
 @gpu_decorator
 def infer(
-    ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15, speed=0.8, show_info=gr.Info
+    ref_voice, ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15, speed=0.8, show_info=gr.Info
 ):
     # ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
-
-
+    if ref_voice:
+        ref_audio = Path(f"./ref_{str(ref_voice).lower()}.wav").read_bytes()
+        ref_text = Path(f"./ref_{str(ref_voice).lower()}.txt").read_text()
+    else:
+        ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
 
     if model == "F5-TTS":
         ema_model = F5TTS_ema_model
@@ -165,7 +168,8 @@ with gr.Blocks() as app_credits:
     """)
 with gr.Blocks() as app_tts:
     gr.Markdown("# Batched TTS")
-
+    ref_voice = gr.Radio(["Female", "Male"], label="Voice", info="Reference voice for inference")
+    ref_audio_input = gr.Audio(label="Reference Audio", type="filepath", visible=False, value=Path("./ref_female.wav"))
     gen_text_input = gr.Textbox(label="Text to Generate", lines=10, value=Path("./sample.txt").read_text())
     generate_btn = gr.Button("Synthesize", variant="primary")
     with gr.Accordion("Advanced Settings", open=False):
@@ -174,7 +178,7 @@ with gr.Blocks() as app_tts:
             info="Leave blank to automatically transcribe the reference audio. If you enter text it will override automatic transcription.",
             lines=2,
             visible=False,
-            value=Path("./
+            value=Path("./ref_female.txt").read_text()
         )
         remove_silence = gr.Checkbox(
             label="Remove Silences",
@@ -203,6 +207,7 @@ with gr.Blocks() as app_tts:
 
     @gpu_decorator
     def basic_tts(
+        ref_voice,
         ref_audio_input,
         ref_text_input,
         gen_text_input,
@@ -211,6 +216,7 @@ with gr.Blocks() as app_tts:
         speed_slider,
     ):
         audio_out, spectrogram_path, ref_text_out = infer(
+            ref_voice,
            ref_audio_input,
            ref_text_input,
            gen_text_input,
@@ -224,6 +230,7 @@ with gr.Blocks() as app_tts:
    generate_btn.click(
        basic_tts,
        inputs=[
+            ref_voice,
            ref_audio_input,
            ref_text_input,
            gen_text_input,
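
For context, the pattern this commit introduces is a gr.Radio voice selector whose value is mapped to bundled reference files. Below is a minimal, self-contained sketch of that wiring; the pick_reference helper and the demo Blocks are illustrative only (they are not part of app.py), and it assumes ref_female.wav / ref_male.wav with matching .txt transcripts sit next to the script, as the diff implies.

# Illustrative sketch only -- not part of app.py. Assumes ref_female.wav,
# ref_male.wav and matching .txt transcripts exist next to this script.
from pathlib import Path

import gradio as gr


def pick_reference(ref_voice):
    # Map the selected voice ("Female"/"Male") to its reference clip and transcript.
    stem = f"./ref_{ref_voice.lower()}"
    ref_audio_path = f"{stem}.wav"                # filepath consumed by the Audio component
    ref_text = Path(f"{stem}.txt").read_text()    # transcript of the reference clip
    return ref_audio_path, ref_text


with gr.Blocks() as demo:
    ref_voice = gr.Radio(["Female", "Male"], label="Voice", value="Female")
    ref_audio_preview = gr.Audio(label="Reference Audio", type="filepath")
    ref_text_preview = gr.Textbox(label="Reference Text")
    ref_voice.change(pick_reference, inputs=ref_voice, outputs=[ref_audio_preview, ref_text_preview])

demo.launch()

Keeping the voice-to-file mapping in one helper like this also makes the new if ref_voice: branch in infer easy to exercise without running the full Gradio app.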