Spaces:
Runtime error
Runtime error
Move to Speech to Speech
Browse files- README.md +3 -4
- app.py +63 -59
- packages.txt +2 -0
- requirements.txt +3 -1
README.md
CHANGED
@@ -1,15 +1,14 @@
|
|
1 |
---
|
2 |
title: Indonesian Whisperer
|
3 |
emoji: 🇮🇩
|
4 |
-
colorFrom:
|
5 |
colorTo: red
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 3.
|
8 |
app_file: app.py
|
9 |
-
pinned:
|
10 |
tags:
|
11 |
- whisper-event
|
12 |
-
duplicated_from: whisper-event/whisper-demo
|
13 |
---
|
14 |
|
15 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
title: Indonesian Whisperer
|
3 |
emoji: 🇮🇩
|
4 |
+
colorFrom: purple
|
5 |
colorTo: red
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 3.15.0
|
8 |
app_file: app.py
|
9 |
+
pinned: true
|
10 |
tags:
|
11 |
- whisper-event
|
|
|
12 |
---
|
13 |
|
14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
@@ -1,12 +1,20 @@
|
|
1 |
import torch
|
2 |
-
|
3 |
import gradio as gr
|
4 |
-
import pytube as pt
|
5 |
from transformers import pipeline
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
MODEL_NAME = "cahya/whisper-medium-id" #this always needs to stay in line 8 :D sorry for the hackiness
|
9 |
lang = "id"
|
|
|
|
|
|
|
|
|
10 |
|
11 |
device = 0 if torch.cuda.is_available() else "cpu"
|
12 |
|
@@ -37,62 +45,58 @@ def transcribe(microphone, file_upload):
|
|
37 |
return warn_output + text
|
38 |
|
39 |
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
)
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
def yt_transcribe(yt_url):
    """Transcribe the audio track of a YouTube video.

    Downloads the first audio-only stream of ``yt_url`` to ``audio.mp3`` in
    the working directory, runs the module-level ``pipe`` on that file, and
    returns a tuple of ``(html_embed, text)`` where ``html_embed`` comes from
    ``_return_yt_html_embed(yt_url)``.
    """
    video = pt.YouTube(yt_url)
    embed_html = _return_yt_html_embed(yt_url)

    # Pick the first audio-only stream and save it under a fixed file name.
    audio_streams = video.streams.filter(only_audio=True)
    audio_streams[0].download(filename="audio.mp3")

    transcription = pipe("audio.mp3")
    return embed_html, transcription["text"]
|
58 |
-
|
59 |
-
|
60 |
-
demo = gr.Blocks()
|
61 |
-
|
62 |
-
mf_transcribe = gr.Interface(
|
63 |
-
fn=transcribe,
|
64 |
-
inputs=[
|
65 |
-
gr.inputs.Audio(source="microphone", type="filepath", optional=True),
|
66 |
-
gr.inputs.Audio(source="upload", type="filepath", optional=True),
|
67 |
-
],
|
68 |
-
outputs="text",
|
69 |
-
layout="horizontal",
|
70 |
-
theme="huggingface",
|
71 |
-
title="Whisper Demo: Transcribe Audio",
|
72 |
-
description=(
|
73 |
-
"Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the the fine-tuned"
|
74 |
-
f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
|
75 |
-
" of arbitrary length."
|
76 |
-
),
|
77 |
-
allow_flagging="never",
|
78 |
-
)
|
79 |
-
|
80 |
-
yt_transcribe = gr.Interface(
|
81 |
-
fn=yt_transcribe,
|
82 |
-
inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
|
83 |
-
outputs=["html", "text"],
|
84 |
-
layout="horizontal",
|
85 |
-
theme="huggingface",
|
86 |
-
title="Whisper Demo: Transcribe YouTube",
|
87 |
-
description=(
|
88 |
-
"Transcribe long-form YouTube videos with the click of a button! Demo uses the the fine-tuned checkpoint:"
|
89 |
-
f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
|
90 |
-
" arbitrary length."
|
91 |
-
),
|
92 |
-
allow_flagging="never",
|
93 |
-
)
|
94 |
|
95 |
-
|
96 |
-
gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
|
97 |
|
98 |
-
demo.launch(enable_queue=True)
|
|
|
1 |
import torch
|
|
|
2 |
import gradio as gr
|
|
|
3 |
from transformers import pipeline
|
4 |
+
import tempfile
|
5 |
+
from neon_tts_plugin_coqui import CoquiTTS
|
6 |
+
from datetime import datetime
|
7 |
+
import time
|
8 |
+
import psutil
|
9 |
+
from mtranslate import translate
|
10 |
+
|
11 |
|
12 |
# Checkpoint used for transcription.
# NOTE(review): the inline comment below says this assignment must stay on a
# fixed line of app.py; presumably external tooling rewrites it by line
# number -- confirm before moving it.
MODEL_NAME = "cahya/whisper-medium-id" #this always needs to stay in line 8 :D sorry for the hackiness
lang = "id"

# Static texts shown in the Gradio page.
title = "indonesian Whisperer"
description = (
    "Cross Language Speech to Speech using OpenAI Whisper and Coqui TTS"
)
info = (
    "more info at "
    "[indonesian Whisperer](https://github.com/cahya-wirawan/indonesian-whisperer)"
)
badge = (
    "https://img.shields.io/badge/"
    "Powered%20by-Indonesian%20Whisperer-red"
)

# Use the first CUDA device when one is available, otherwise the CPU.
device = "cpu" if not torch.cuda.is_available() else 0
|
20 |
|
|
|
45 |
return warn_output + text
|
46 |
|
47 |
|
48 |
+
# Language codes offered in the UI, taken from the keys of CoquiTTS.langs.
LANGUAGES = [*CoquiTTS.langs.keys()]

# Language preselected in the "Language" radio group.
default_lang = "en"

# Single TTS engine instance, created once at import time and reused by tts().
coquiTTS = CoquiTTS()
|
52 |
+
|
53 |
+
|
54 |
+
def tts(language: str, audio_file: str):
    """Turn an uploaded recording into synthesized speech in ``language``.

    The recording is passed to ``transcribe`` (as the file-upload argument,
    with no microphone input), the resulting text is handed to ``translate``
    with ``language`` as the target and ``"id"`` as the third argument
    (presumably the source language -- confirm against mtranslate), and the
    translation is synthesized with the module-level ``coquiTTS`` engine.

    Args:
        language: Target language code; also passed to the TTS engine as the
            speaker language.
        audio_file: Path of the uploaded audio file.

    Returns:
        Path of a ``.wav`` temporary file holding the synthesized audio. The
        file is created with ``delete=False``, so it is not removed here.
    """
    print(f"### {datetime.now()} TTS", language, audio_file)
    source_text = transcribe(None, audio_file)
    print(f"### {datetime.now()} transcribed:", source_text)
    target_text = translate(source_text, language, "id")

    wav_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    with wav_file:
        # The engine writes straight into the still-open temporary file.
        coquiTTS.get_tts(target_text, wav_file, speaker={"language": language})
        print(f"### {datetime.now()} fp.name:", wav_file.name)
    return wav_file.name
|
64 |
+
|
65 |
+
|
66 |
+
# Build the Gradio page: heading, description, an upload + language picker
# column next to the output player, a memory summary and a badge; then wire
# the Submit button to tts() and start the app.
with gr.Blocks() as blocks:
    gr.Markdown(
        "<h1 style='text-align: center; margin-bottom: 1rem'>"
        + title
        + "</h1>"
    )
    gr.Markdown(description)
    with gr.Row():  # equal_height=False
        with gr.Column():  # variant="panel"
            upload = gr.Audio(label="Upload", source="upload", type="filepath", optional=True)
            radio = gr.Radio(
                label="Language",
                choices=LANGUAGES,
                value=default_lang,
            )
            with gr.Row():  # mobile_collapse=False
                submit = gr.Button("Submit", variant="primary")
        audio = gr.Audio(label="Output", interactive=False)

    gr.Markdown(info)

    # Memory figures are read once while the page is being built, so the
    # text below is a start-up snapshot rather than a live value.
    memory = psutil.virtual_memory()
    # Bound to its own name only; the module-level `info` text is left intact.
    system_status = f"""
*Memory: {memory.total/(1024*1024*1024):.2f}GB, used: {memory.percent}%, available: {memory.available/(1024*1024*1024):.2f}GB*
"""
    gr.Markdown(system_status)
    gr.Markdown(
        "<center>"
        + f'<img src={badge} alt="visitors badge"/>'
        + "</center>"
    )

    # actions
    submit.click(
        tts,
        [radio, upload],
        [audio],
    )
    # NOTE: no `radio.change` listener is registered. This layout has no text
    # component to receive a per-language sample sentence, and a listener
    # without outputs would only compute a value that is never displayed.

blocks.launch()
|
|
|
102 |
|
|
packages.txt
CHANGED
@@ -1 +1,3 @@
|
|
1 |
ffmpeg
|
|
|
|
|
|
1 |
ffmpeg
|
2 |
+
libsndfile1
|
3 |
+
espeak-ng
|
requirements.txt
CHANGED
@@ -1,3 +1,5 @@
|
|
1 |
git+https://github.com/huggingface/transformers
|
2 |
torch
|
3 |
-
|
|
|
|
|
|
1 |
git+https://github.com/huggingface/transformers
|
2 |
torch
|
3 |
+
neon-tts-plugin-coqui==0.6.0
|
4 |
+
psutil
|
5 |
+
mtranslate
|