barreloflube committed
Commit • acde4c3
1 Parent(s): 84df228

Refactor app.py to add audio tab and update gradio UI

Files changed:
- app.py +5 -2
- config.py +4 -0
- requirements.txt +2 -0
- tabs/audios/events.py +65 -0
- tabs/audios/load_models.py +17 -0
- tabs/audios/ui.py +49 -0
- tabs/images/load_models.py +1 -0
app.py CHANGED
@@ -2,6 +2,8 @@ import gradio as gr
 
 from config import css
 from tabs.images.ui import image_tab
+from tabs.audios.ui import audio_tab
+
 
 with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
     # Header
@@ -16,13 +18,14 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
     with gr.Tabs():
         with gr.Tab(label="🖼️ Image"):
             image_tab()
-
-
+        with gr.Tab(label="🎵 Audio"):
+            audio_tab()
         # with gr.Tab(label="🎥 Video"):
         #     video_tab()
         # with gr.Tab(label="📄 Text"):
         #     text_tab()
 
+
 demo.launch(
     share=False,
     debug=True,
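The layout keeps each tab's UI in its own package behind a builder function that app.py calls inside a gr.Tab context. Extending the app with another tab follows the same pattern, as the commented-out stubs suggest; a minimal sketch (tabs.videos.ui and video_tab are hypothetical, mirroring those stubs):

```python
# Sketch of the per-tab registration pattern app.py follows.
# `tabs.videos.ui` / `video_tab` are hypothetical, mirroring the
# commented-out stubs above; they are not part of this commit.
import gradio as gr

from tabs.videos.ui import video_tab

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.Tab(label="🎥 Video"):
            video_tab()  # builds the tab's components inside this context
```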
config.py CHANGED
@@ -73,3 +73,7 @@ class Config:
             "compute_type": torch.bfloat16,
         }
     ]
+
+
+    # Audios
+    AUDIOS_MODELS = [{"repo_id": "fal/AuraSR-v2"}]
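AUDIOS_MODELS mirrors the repo_id-keyed registry already used for the image models, though nothing in the new audio modules reads it yet (they import Config but never reference the list). A sketch of how such an entry might eventually be consumed, assuming AUDIOS_MODELS is a Config class attribute as the hunk context suggests:

```python
# Sketch only: this commit defines AUDIOS_MODELS but does not consume it.
from config import Config

for entry in Config.AUDIOS_MODELS:
    repo_id = entry["repo_id"]  # e.g. "fal/AuraSR-v2"
    print(f"would download/load {repo_id} here")
```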
requirements.txt CHANGED
@@ -22,3 +22,5 @@ git+https://github.com/mantrakp04/BasicSR-fix.git
 git+https://github.com/TencentARC/GFPGAN.git
 git+https://github.com/xinntao/Real-ESRGAN.git
 aura_sr
+deepfilternet
+styletts2
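The two new dependencies back the audio tab: deepfilternet supplies the df.enhance API (init_df, enhance, load_audio, save_audio) used for denoising, and styletts2 supplies the StyleTTS2 class used for voice-cloned TTS. A quick smoke test (a sketch; assumes both packages installed as pinned above, using only the calls this commit itself makes):

```python
# Sketch: confirm the new audio dependencies import and initialize.
from df.enhance import init_df   # from the deepfilternet package
from styletts2 import tts        # from the styletts2 package

df_model, df_state, _ = init_df()  # loads default DeepFilterNet weights
print("DeepFilterNet sample rate:", df_state.sr())

styletts2_model = tts.StyleTTS2()  # loads StyleTTS2 checkpoints
```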
tabs/audios/events.py ADDED
@@ -0,0 +1,65 @@
+import os
+import gc
+import tempfile
+from uuid import uuid4
+
+import spaces
+import gradio as gr
+import numpy as np
+from df.enhance import enhance, load_audio, save_audio
+
+from config import Config
+from .load_models import *
+
+
+# Helper functions
+def create_temp_file():
+    return tempfile.NamedTemporaryFile(delete=False)
+
+
+@spaces.GPU(duration=10)
+def clear_audio(audio: np.ndarray):
+    # Save the audio file
+    audio_file = create_temp_file()
+    np.save(audio_file.name, audio)
+
+    # Load the audio file
+    audio, _ = load_audio(audio_file.name, sr=df_state.sr())
+    enhanced = enhance(df_model, df_state, audio)
+
+    # Save the enhanced audio file
+    save_audio(audio_file.name, enhanced, df_state.sr())
+
+    return gr.update(  # speaker_audio, output_audio
+        value=audio_file.name,
+    )
+
+
+@spaces.GPU(duration=20)
+def gen_audio(
+    text,
+    language,
+    speaker_audio: np.ndarray,
+    tts_alpha,
+    tts_beta,
+    tts_diffusion_steps,
+    tts_embedding_scale,
+):
+    # Save the speaker audio file
+    speaker_audio_file = create_temp_file()
+    np.save(speaker_audio_file.name, speaker_audio)
+
+    # Generate the audio
+    output = styletts2_model.inference(
+        text=text,
+        target_voice_path=speaker_audio_file.name,
+        output_wav_file=create_temp_file().name,
+        alpha=float(tts_alpha),
+        beta=float(tts_beta),
+        diffusion_steps=int(tts_diffusion_steps),
+        embedding_scale=int(tts_embedding_scale),
+    )
+
+    return gr.update(  # output_audio
+        value=output,
+    )
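Despite its name, clear_audio runs a DeepFilterNet enhancement pass over the uploaded clip and returns the enhanced file as the component's new value. One caveat worth flagging: np.save appends a .npy suffix when the target path lacks one, so the path handed to load_audio is not the file np.save actually wrote. For reference, the library's own round-trip is simply (a sketch; "in.wav"/"out.wav" are hypothetical paths, not part of this commit):

```python
# Sketch: the standard DeepFilterNet round-trip that clear_audio builds on.
from df.enhance import init_df, enhance, load_audio, save_audio

model, df_state, _ = init_df()                     # default model + state
audio, _ = load_audio("in.wav", sr=df_state.sr())  # resample to model rate
enhanced = enhance(model, df_state, audio)         # denoise
save_audio("out.wav", enhanced, df_state.sr())     # write the result
```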
tabs/audios/load_models.py ADDED
@@ -0,0 +1,17 @@
+import torch
+from df.enhance import init_df
+from styletts2 import tts
+
+from config import Config
+
+
+def init_sys():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    df_model, df_state, _ = init_df()
+
+    styletts2_model = tts.StyleTTS2()
+
+    return device, df_model, df_state, styletts2_model
+
+device, df_model, df_state, styletts2_model = init_sys()
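init_sys() runs at import time, so the DeepFilterNet and StyleTTS2 weights load once when tabs.audios is first imported; events.py then picks up device, df_model, df_state, and styletts2_model through its wildcard import. If startup cost ever became a concern, a lazy variant could defer loading to first use (a sketch, not part of the commit):

```python
# Sketch: lazy alternative to import-time model loading (hypothetical).
_models = None

def get_models():
    global _models
    if _models is None:       # only the first caller pays the load cost
        _models = init_sys()  # reuses init_sys() defined in this module
    return _models
```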
tabs/audios/ui.py ADDED
@@ -0,0 +1,49 @@
+import gradio as gr
+
+from config import Config
+from .events import *
+
+
+def audio_tab():
+    with gr.Row():
+        with gr.Column():
+            with gr.Group():
+                with gr.Group():
+                    text = gr.Textbox(lines=5, label="Enter text")
+                    language = gr.Dropdown(
+                        label="Language",
+                        choices=["en"],
+                        value="en",
+                    )
+
+                with gr.Accordion('Voice Clone', open=True):
+                    speaker_audio = gr.Audio(label="Upload Audio", type='numpy')
+                    clear_speaker_audio = gr.Button(label="Clear Audio")
+
+        with gr.Column():
+            output_audio = gr.Audio(label="Output Audio", interactive=False, show_download_button=True)
+            clear_output_audio = gr.Button(label="Clear Audio")
+            generate_audio = gr.Button(label="Generate Audio")
+
+            with gr.Accordion('Advance Settings', open=True):
+                settings = [
+                    ('Alpha', 'tts_alpha', 'float', 0.0, 1.0, 0.3, 0.1,),
+                    ('Beta', 'tts_beta', 'float', 0.0, 1.0, 0.7, 0.1,),
+                    ('Diffusion Steps', 'tts_diffusion_steps', 'int', 1, 100, 10, 1,),
+                    ('Embedding Scale', 'tts_embedding_scale', 'int', 0, 10, 1, 1,),
+                ]
+                for label, key, type_, min_, max_, value, step in settings:
+                    globals()[key] = gr.Slider(label=label, minimum=min_, maximum=max_, value=value, step=step)
+
+
+    # Events
+    # Clear Audio
+    clear_speaker_audio.click(clear_audio, speaker_audio, speaker_audio)
+    clear_output_audio.click(clear_audio, output_audio, output_audio)
+
+    # Generate Audio
+    generate_audio.click(
+        gen_audio,
+        [text, language, speaker_audio, tts_alpha, tts_beta, tts_diffusion_steps, tts_embedding_scale],  # type: ignore
+        [output_audio]
+    )
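The slider loop installs each component through globals()[key], which is what lets the event wiring below it refer to tts_alpha and friends by name (and why that line carries a # type: ignore). An equivalent, more explicit formulation would keep the sliders in a dict (a sketch, not part of the commit):

```python
# Sketch: dict-based alternative to the globals()[key] registration.
sliders = {}
for label, key, type_, min_, max_, value, step in settings:
    sliders[key] = gr.Slider(label=label, minimum=min_,
                             maximum=max_, value=value, step=step)

generate_audio.click(
    gen_audio,
    [text, language, speaker_audio,
     sliders["tts_alpha"], sliders["tts_beta"],
     sliders["tts_diffusion_steps"], sliders["tts_embedding_scale"]],
    [output_audio],
)
```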
tabs/images/load_models.py CHANGED
@@ -10,6 +10,7 @@ from diffusers.schedulers import *
 
 from config import Config
 
+
 def init_sys():
     device = "cuda" if torch.cuda.is_available() else "cpu"
 