File size: 3,841 Bytes
254ac8d
 
 
cac3bb3
 
 
 
 
 
 
254ac8d
ed0b2d9
 
85e039a
cac3bb3
 
 
254ac8d
 
 
 
 
 
2f6b046
254ac8d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cac3bb3
 
 
 
 
 
d0a19ad
cac3bb3
d0a19ad
85e039a
 
cac3bb3
 
 
 
85e039a
cac3bb3
 
 
 
 
 
 
 
 
d0a19ad
 
 
cac3bb3
d0a19ad
cac3bb3
 
 
 
 
85e039a
d0a19ad
 
 
cac3bb3
 
 
 
 
 
 
 
 
 
 
 
 
d0a19ad
85e039a
254ac8d
cac3bb3
254ac8d
cac3bb3
254ac8d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import torch
import gradio as gr
from transformers import pipeline
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
from datetime import datetime
import time
import psutil
from mtranslate import translate


MODEL_NAME = "cahya/whisper-medium-id" #this always needs to stay in line 8 :D sorry for the hackiness
# NOTE(review): the comment above pins this assignment to a fixed line number,
# presumably because some external tooling rewrites it by position -- confirm
# that, and the expected line, before adding or removing lines above it.

# Language code used to build the forced decoder prompt further down.
lang = "id"
# Strings rendered by the UI: page heading, sub-heading, project link, badge URL.
title = "Indonesian Whisperer"
description = "Cross Language Speech to Speech using OpenAI Whisper and Coqui TTS"
info = "more info at [indonesian Whisperer](https://github.com/cahya-wirawan/indonesian-whisperer)"
badge = "https://img.shields.io/badge/Powered%20by-Indonesian%20Whisperer-red"

# Device handed to the pipeline: index 0 when torch reports CUDA is available,
# otherwise the string "cpu".
device = 0 if torch.cuda.is_available() else "cpu"

# Speech-recognition pipeline, built once at import time and shared by every
# transcribe() call.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Force the decoder prompt to the configured language and the "transcribe" task.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")

def transcribe(microphone, file_upload):
    """Run speech recognition on the microphone recording or the uploaded file.

    Args:
        microphone: Path of the microphone recording, or None.
        file_upload: Path of the uploaded audio file, or None.

    Returns:
        The recognized text. When both inputs are given the microphone
        recording is used and the text is prefixed with a warning line. When
        neither is given an error message is returned instead.
    """
    has_recording = microphone is not None
    has_upload = file_upload is not None

    # Nothing to transcribe: report it without touching the model.
    if not (has_recording or has_upload):
        return "ERROR: You have to either use the microphone or upload an audio file"

    notice = ""
    if has_recording and has_upload:
        notice = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # The microphone recording takes precedence over the upload.
    audio_path = microphone if has_recording else file_upload
    recognized = pipe(audio_path)
    return notice + recognized["text"]


# Language codes listed by the Coqui TTS plugin; they become the choices of the
# "Target Language" radio group.
LANGUAGES = [*CoquiTTS.langs.keys()]
# Radio option that is preselected when the page loads.
default_lang = "en"

# One synthesizer instance, created at import time and reused by every tts() call.
coquiTTS = CoquiTTS()


def tts(language: str, audio_microphone: str, audio_file: str):
    """Transcribe the given speech, translate it, and synthesize the translation.

    Args:
        language: Target language code; passed to the translator and to the
            TTS engine as the speaker language.
        audio_microphone: Path of the microphone recording, or None.
        audio_file: Path of the uploaded audio file, or None.

    Returns:
        A ``(transcription, translation, wav_path)`` tuple. When neither audio
        input is given the tuple is ``(error message, "", None)`` and neither
        translation nor synthesis is attempted.
    """
    print(f"### {datetime.now()} TTS", language, audio_file)

    if audio_microphone is None and audio_file is None:
        # Without this guard the error text produced for missing input would be
        # sent to the translator and read aloud as if it were the user's speech.
        return (
            "ERROR: You have to either use the microphone or upload an audio file",
            "",
            None,
        )

    transcription = transcribe(audio_microphone, audio_file)
    print(f"### {datetime.now()} transcribed:", transcription)
    # The recognizer is pinned to Indonesian, hence the fixed "id" source language.
    translation = translate(transcription, language, "id")

    # delete=False: the WAV file has to outlive this call so the UI can serve it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(translation, fp, speaker={"language": language})
        print(f"### {datetime.now()} fp.name:", fp.name)
    # Return only after the handle is closed so the audio is fully written.
    return transcription, translation, fp.name


# Page layout: inputs in the left column, results in the right column, then the
# project link, a memory snapshot and the badge.
with gr.Blocks() as blocks:
    gr.Markdown(
        "<h1 style='text-align: center; margin-bottom: 1rem'>"
        + title
        + "</h1>"
    )
    gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            audio_microphone = gr.Audio(label="Microphone", source="microphone", type="filepath", optional=True)
            audio_upload = gr.Audio(label="Upload", source="upload", type="filepath", optional=True)
            # Startup debug output, kept from the original.
            print("upload:", audio_upload)
            radio = gr.Radio(
                label="Target Language",
                choices=LANGUAGES,
                value=default_lang,
            )
            with gr.Row():
                submit = gr.Button("Submit", variant="primary")
        with gr.Column():
            text_source = gr.Textbox(label="Source Language")
            text_target = gr.Textbox(label="Target Language")
            audio = gr.Audio(label="Target Audio", interactive=False)

    gr.Markdown(info)

    # Memory figures are read once while the page is built; they are not
    # refreshed afterwards. Bound to its own name so the `info` constant above
    # is no longer overwritten.
    memory = psutil.virtual_memory()
    system_status = f"""
            *Memory: {memory.total/(1024*1024*1024):.2f}GB, used: {memory.percent}%, available: {memory.available/(1024*1024*1024):.2f}GB*
            """
    gr.Markdown(system_status)
    gr.Markdown(
        "<center>"
        + f'<img src={badge} alt="visitors badge"/>'
        + "</center>"
    )

    # actions
    submit.click(
        tts,
        [radio, audio_microphone, audio_upload],
        [text_source, text_target, audio],
    )
    # NOTE(review): no output component is wired to this handler, so the sample
    # sentence it returns appears to be discarded -- confirm whether a target
    # textbox was intended or the handler can be dropped.
    radio.change(lambda code: CoquiTTS.langs[code]["sentence"], radio)

blocks.launch()