File size: 5,124 Bytes
254ac8d
 
 
cac3bb3
 
 
 
 
 
206fd6a
 
cac3bb3
86c3abc
ed0b2d9
85e039a
e9872f9
eedab3c
cac3bb3
254ac8d
de64dc1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254ac8d
 
 
 
 
2f6b046
254ac8d
 
 
 
 
de64dc1
254ac8d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cac3bb3
 
 
 
 
 
d0a19ad
2400ec6
86c3abc
cac3bb3
d0a19ad
85e039a
 
cac3bb3
 
 
86c3abc
 
 
206fd6a
86c3abc
34fd6ff
86c3abc
206fd6a
86c3abc
cac3bb3
86c3abc
cac3bb3
 
 
 
 
 
 
86c3abc
 
d0a19ad
 
de64dc1
86c3abc
cac3bb3
cef8aa0
 
85e039a
d0a19ad
 
 
86c3abc
 
cef8aa0
cac3bb3
 
86c3abc
 
cac3bb3
 
 
 
87e6564
86c3abc
254ac8d
 
cac3bb3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import torch
import gradio as gr
from transformers import pipeline
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
from datetime import datetime
import time
import psutil
from mtranslate import translate
from gpuinfo import GPUInfo


MODEL_NAME = "cahya/whisper-medium-id"  # this always needs to stay in line 8 :D sorry for the hackiness
# NOTE(review): the comment above implies some external tooling locates the
# MODEL_NAME assignment by line number, so avoid inserting lines above it.
# TODO confirm which line that tooling actually expects.

# Language code handed to the tokenizer when the forced decoder prompt is built.
lang = "id"
# Page heading rendered at the top of the UI.
title = "Indonesian Whisperer"
# Short Markdown blurb rendered directly below the heading.
description = "Cross Language Speech to Speech (Indonesian/English to 25 other languages) using OpenAI Whisper and Coqui TTS"
# Markdown footer; the model link is written out literally rather than derived from MODEL_NAME.
info = "This application uses [Indonesian Whisperer Medium](https://huggingface.co/cahya/whisper-medium-id) model"
# Image URL used as the <img src> of the footer badge.
badge = "https://img.shields.io/badge/Powered%20by-Indonesian%20Whisperer-red"

# Target languages offered in the UI: display name -> language code.
# Insertion order is the order in which the dropdown lists the choices.
languages = {
    "English": "en",
    "German": "de",
    "Spanish": "es",
    "French": "fr",
    "Portuguese": "pt",
    "Polish": "pl",
    "Dutch": "nl",
    "Swedish": "sv",
    "Italian": "it",
    "Finnish": "fi",
    "Ukrainian": "uk",
    "Greek": "el",
    "Czech": "cs",
    "Romanian": "ro",
    "Danish": "da",
    "Hungarian": "hu",
    "Croatian": "hr",
    "Bulgarian": "bg",
    "Lithuanian": "lt",
    "Slovak": "sk",
    "Latvian": "lv",
    "Slovenian": "sl",
    "Estonian": "et",
    "Maltese": "mt",
}

# Use device index 0 when CUDA is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Speech-recognition pipeline, created once at import time and reused by
# every request; audio is processed in 30-second chunks.
pipe = pipeline(task="automatic-speech-recognition", model=MODEL_NAME, chunk_length_s=30, device=device)

# Pin the decoder prompt so the model always transcribes in `lang`.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
    language=lang,
    task="transcribe",
)


def transcribe(microphone, file_upload):
    """Run the module-level ASR pipeline on one of the two audio inputs.

    Args:
        microphone: Microphone recording, or ``None`` when absent.
        file_upload: Uploaded audio, or ``None`` when absent.

    Returns:
        The recognized text. When both inputs are present the microphone
        recording is used and a one-line warning precedes the text. When
        neither is present an ``"ERROR: ..."`` message is returned instead
        (no exception is raised).
    """
    has_microphone = microphone is not None
    has_upload = file_upload is not None

    # Nothing to transcribe: report it as text and skip the pipeline.
    if not (has_microphone or has_upload):
        return "ERROR: You have to either use the microphone or upload an audio file"

    notice = ""
    if has_microphone and has_upload:
        notice = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # The microphone recording takes precedence over the upload.
    if has_microphone:
        chosen_input = microphone
    else:
        chosen_input = file_upload

    result = pipe(chosen_input)
    return notice + result["text"]


# Language codes known to the Coqui TTS plugin.
# NOTE(review): neither LANGUAGES nor default_lang is referenced elsewhere in
# the visible code; kept because other modules may import them.
LANGUAGES = [*CoquiTTS.langs.keys()]
default_lang = "en"

# Single synthesizer instance, created at import time and shared by all requests.
coquiTTS = CoquiTTS()


def _system_info(elapsed: float) -> str:
    """Build the Markdown status snippet shown under the results.

    Reports host memory (via ``psutil``), the processing time ``elapsed`` in
    seconds and, when ``GPUInfo.gpu_usage()`` returns any entries, the
    utilization and memory of the first GPU it lists.
    """
    memory = psutil.virtual_memory()
    gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
    if gpu_utilization and gpu_memory:
        gpu_line = f"*GPU Utilization: {gpu_utilization[0]}%, GPU Memory: {gpu_memory[0]}MiB.*"
    else:
        # Indexing [0] on an empty list would raise IndexError. The app
        # explicitly supports a CPU fallback, so a missing GPU must not
        # turn a successful request into a failure.
        gpu_line = "*GPU: not available.*"
    gib = 1024 * 1024 * 1024
    return f"""
        *Memory: {memory.total / gib:.2f}GB, used: {memory.percent}%, available: {memory.available / gib:.2f}GB.* 
        *Processing time: {elapsed:.5} seconds*
        {gpu_line}
        """


def tts(language: str, audio_microphone: str, audio_file: str):
    """Transcribe the audio input, translate the text and synthesize it.

    Args:
        language: Display name of the target language; must be a key of the
            module-level ``languages`` mapping.
        audio_microphone: Microphone recording, or ``None`` when not used.
        audio_file: Uploaded audio, or ``None`` when not used.

    Returns:
        A 4-tuple ``(transcription, translation, audio_path, system_info)``.
        ``transcription`` is exactly what ``transcribe`` returned (including
        its warning or error message, if any). ``audio_path`` is the name of
        a ``.wav`` temporary file holding the synthesized speech, or ``None``
        when no audio input was given. ``system_info`` is a Markdown string.

    Raises:
        KeyError: If ``language`` is not a key of ``languages``.
    """
    language = languages[language]
    time_start = time.time()
    print(f"### {datetime.now()} TTS", language, audio_file)
    transcription = transcribe(audio_microphone, audio_file)
    print(f"### {datetime.now()} transcribed:", transcription)

    if audio_microphone is None and audio_file is None:
        # transcribe() reports missing input as a message, not as speech:
        # show it to the user but do not translate or synthesize it.
        return transcription, "", None, _system_info(time.time() - time_start)

    if audio_microphone is not None and audio_file is not None:
        # With both inputs, transcribe() puts a one-line warning (terminated
        # by "\n") in front of the text. Keep it for display only; it must
        # not be translated and spoken.
        _, _, speech_text = transcription.partition("\n")
    else:
        speech_text = transcription

    translation = translate(speech_text, language, "id")
    # delete=False: the file has to outlive this function so the UI can
    # serve it by name.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(translation, fp, speaker={"language": language})
        audio_path = fp.name

    system_info = _system_info(time.time() - time_start)
    print(f"### {datetime.now()} fp.name:", audio_path)
    return transcription, translation, audio_path, system_info


# Page layout: inputs in the left column, results in the right column,
# followed by the footer and the submit wiring.
with gr.Blocks() as blocks:
    gr.Markdown(f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>")
    gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            # Both inputs are optional; tts() receives None for the unused one.
            audio_microphone = gr.Audio(
                label="Microphone",
                source="microphone",
                type="filepath",
                optional=True,
            )
            audio_upload = gr.Audio(
                label="Upload",
                source="upload",
                type="filepath",
                optional=True,
            )
            language = gr.Dropdown(list(languages), label="Target Language", value="English")
            with gr.Row():
                submit = gr.Button("Submit", variant="primary")
            examples = gr.Examples(
                examples=["data/Jokowi - 2022.mp3", "data/Soekarno - 1963.mp3", "data/JFK.mp3"],
                label="Examples",
                inputs=[audio_upload],
            )
        with gr.Column():
            text_source = gr.Textbox(label="Source Language")
            text_target = gr.Textbox(label="Target Language")
            audio = gr.Audio(label="Target Audio", interactive=False)
            # Memory snapshot taken once while the page is built; tts()
            # replaces this text after each request.
            memory = psutil.virtual_memory()
            system_info = gr.Markdown(
                f"*Memory: {memory.total / 1024 ** 3:.2f}GB, used: {memory.percent}%, "
                f"available: {memory.available / 1024 ** 3:.2f}GB*"
            )

    gr.Markdown(info)
    gr.Markdown(
        '<center><a href="https://github.com/cahya-wirawan/indonesian-whisperer">'
        f'<img src={badge} alt="visitors badge"/></a></center>'
    )

    # Run the full speech-to-speech chain when the button is pressed.
    submit.click(
        fn=tts,
        inputs=[language, audio_microphone, audio_upload],
        outputs=[text_source, text_target, audio, system_info],
    )

blocks.launch()