import gradio as gr
import spaces  # provides the @spaces.GPU decorator used below (Hugging Face Spaces)
from Soundwave.run_inference import load_model, inference


device = 'cuda'  # the demo assumes a CUDA GPU is available; there is no CPU fallback

# load_model returns the Soundwave model, its audio feature processor, and the
# text tokenizer; model.cuda() makes sure every weight ends up on the GPU.
model, audio_processor, tokenizer = load_model("FreedomIntelligence/Soundwave", device)
model.cuda()

# `duration` caps the ZeroGPU allocation in seconds; progress tracking is
# requested via the function's `progress` parameter, Gradio's standard
# mechanism for track_tqdm, rather than via the decorator.
@spaces.GPU(duration=40)
def process_audio_text(text, audio, progress=gr.Progress(track_tqdm=True)):
    # `audio` is the path of the uploaded file (the Audio component below uses type="filepath")
    audio_path = audio
    print(audio_path)
    if audio_path is None:
        return "Please upload or record an audio clip first."
    # Note: this system prompt is defined here but not passed to `inference` below.
    system = "You are a helpful language and speech assistant. You are able to understand the speech content that the user provides, and assist the user with a variety of tasks using natural language."
    # Fall back to a default instruction when the text box is empty or whitespace-only.
    if not text or not text.strip():
        text = "Please transcribe the following audio and then answer based on the audio's transcription."
    response = inference(model, audio_processor, tokenizer, text, audio_path, device)
    return response
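# A minimal sketch of exercising the handler directly, outside the UI
# (assumes the bundled show_case clips are present); uncomment to debug:
#
#   print(process_audio_text("What does the person say?", "./show_case/p225_002.wav"))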

examples = [  
    ["Can you turn my English into German?", "./show_case/common_voice_en_19664034.mp3"],  # En-De
    ["Can you identify the initial word that connects to 'currency_name' in this audio clip?", "./show_case/audio-1434542201-headset.wav"],  # ER
    ["What do you think the speaker's message is intended to be in this audio?", "./show_case/audio-1434542201-headset.wav"],  # IC
    ["What does the person say?", "./show_case/p225_002.wav"],  # DFake
    # ["Assess whether this speech's pronunciation is Real or Fake.", "./show_case/Real.wav"],  # DFake
    ["Assess whether this speech's pronunciation is Real or Fake.", "./show_case/Fake.wav"],  # DFake
    ["What emotional weight does the speaker's tone carry?\nPick one answer from A, B, C, and D.\nA: fear\nB: sadness\nC: joy\nD: neutral", "./show_case/SER(emotion)_example.wav"],  #SER(emotion)
    # ["Assess whether this speech's pronunciation is Real or Fake.", "./show_case/SVD_14154_file31512.mp3.wav_16k.wav_norm.wav_mono.wav_silence.wav"],  # SVD  
    ["Choose the most suitable answer from options A, B, C, and D to respond the question in next line, you may only choose A or B or C or D.\nThe number of speakers delivering this speech is what?\nA. 4\nB. 2\nC.1\nD. 3", "./show_case/SNV_example.wav"],  #SNV
    ["Identify the language of the conversation you just heard.","./show_case/SLR_example.wav"], #SLR
    ["tell the gender of the speaker in this audio.","./show_case/SGR_018.wav"], #SGR
    ["What's the sound we're hearing in this audio from?","./show_case/Sound_Vocal_example.wav"], #Sound_vocal
    ["What is your best guess at the setting of this sound clip?","./show_case/Scene_example.wav"], #Sound_cochl
    ["Choose the most suitable answer from options A, B, C, and D to respond the question in next line, Please think step by step and you may only choose A or B or C or D.\nRecognize the segment where 'project' is spoken by the speaker.\nA. [5.28, 5.39]\nB. [0.92, 1.39]\nC. [4.75, 5.28]\nD. [3.86, 4.23]","./show_case/SG_audio_1.wav"], #SG
    ["What type of business does the first person's son have?","./show_case/SFT_Fisher_example.wav"] #SFT_Fisher
]  
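# Optional sanity check (a sketch, not part of the original demo): warn about
# example clips missing from ./show_case so gr.Examples does not list dead entries.
import os

missing = [path for _, path in examples if not os.path.exists(path)]
if missing:
    print(f"Warning: missing example audio files: {missing}")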

with gr.Blocks() as demo:
    gr.Markdown("""
        <h1 style='text-align: center; color: #014377;'>🔊 Soundwave Demo</h1>
        <p style='text-align: center;'>Upload an audio file and provide an instruction for the AI to process.</p>
    """)

    with gr.Row():
        with gr.Column(scale=1):
            audio_input = gr.Audio(label="🎤 Upload Audio", type="filepath", value="./show_case/p225_002.wav")
        with gr.Column(scale=1):
            text_input = gr.Textbox(label="📝 Enter text instruction", value="What does the person say?", lines=5)
            submit_button = gr.Button("🚀 Process Audio", size="lg")
    
    with gr.Row():
        output_text = gr.Textbox(label="📜 Model output", lines=5, interactive=False)
    
    # Wire the button straight to the GPU handler; no wrapper is needed, and a
    # direct reference lets Gradio inject the `progress` tracker.
    submit_button.click(fn=process_audio_text, inputs=[text_input, audio_input], outputs=output_text)
    
    gr.Examples(examples, inputs=[text_input, audio_input])

if __name__ == "__main__":
    demo.launch()
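    # Note: gr.Progress requires queueing; recent Gradio versions enable the
    # queue by default, but on older releases call demo.queue() before launch.
    # To expose the app beyond localhost, launch() accepts the usual options,
    # e.g. demo.launch(server_name="0.0.0.0", server_port=7860).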