Afrinetwork7 committed on
Commit
09ab406
1 Parent(s): 384d281

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -93
app.py CHANGED
@@ -1,95 +1,57 @@
1
- import gradio as gr
2
- import librosa
3
- from asr import transcribe, ASR_EXAMPLES, ASR_LANGUAGES, ASR_NOTE
4
- from tts import synthesize, TTS_EXAMPLES, TTS_LANGUAGES
5
- from lid import identify, LID_EXAMPLES
6
-
7
-
8
-
9
- mms_transcribe = gr.Interface(
10
- fn=transcribe,
11
- inputs=[
12
- gr.Audio(),
13
- gr.Dropdown(
14
- [f"{k} ({v})" for k, v in ASR_LANGUAGES.items()],
15
- label="Language",
16
- value="eng English",
17
- ),
18
- # gr.Checkbox(label="Use Language Model (if available)", default=True),
19
- ],
20
- outputs="text",
21
- examples=ASR_EXAMPLES,
22
- title="Speech-to-text",
23
- description=(
24
- "Transcribe audio from a microphone or input file in your desired language."
25
- ),
26
- article=ASR_NOTE,
27
- allow_flagging="never",
28
- )
29
-
30
- mms_synthesize = gr.Interface(
31
- fn=synthesize,
32
- inputs=[
33
- gr.Text(label="Input text"),
34
- gr.Dropdown(
35
- [f"{k} ({v})" for k, v in TTS_LANGUAGES.items()],
36
- label="Language",
37
- value="eng English",
38
- ),
39
- gr.Slider(minimum=0.1, maximum=4.0, value=1.0, step=0.1, label="Speed"),
40
- ],
41
- outputs=[
42
- gr.Audio(label="Generated Audio", type="numpy"),
43
- gr.Text(label="Filtered text after removing OOVs"),
44
- ],
45
- examples=TTS_EXAMPLES,
46
- title="Text-to-speech",
47
- description=("Generate audio in your desired language from input text."),
48
- allow_flagging="never",
49
- )
50
-
51
- mms_identify = gr.Interface(
52
- fn=identify,
53
- inputs=[
54
- gr.Audio(),
55
- ],
56
- outputs=gr.Label(num_top_classes=10),
57
- examples=LID_EXAMPLES,
58
- title="Language Identification",
59
- description=("Identity the language of input audio."),
60
- allow_flagging="never",
61
- )
62
-
63
- tabbed_interface = gr.TabbedInterface(
64
- [mms_transcribe, mms_synthesize, mms_identify],
65
- ["Speech-to-text", "Text-to-speech", "Language Identification"],
66
- )
67
-
68
- with gr.Blocks() as demo:
69
- gr.Markdown(
70
- "<p align='center' style='font-size: 20px;'>MMS: Scaling Speech Technology to 1000+ languages demo. See our <a href='https://ai.facebook.com/blog/multilingual-model-speech-recognition/'>blog post</a> and <a href='https://arxiv.org/abs/2305.13516'>paper</a>.</p>"
71
- )
72
- gr.HTML(
73
- """<center>Click on the appropriate tab to explore Speech-to-text (ASR), Text-to-speech (TTS) and Language identification (LID) demos. </center>"""
74
- )
75
- gr.HTML(
76
- """<center>You can also finetune MMS models on your data using the recipes provides here - <a href='https://huggingface.co/blog/mms_adapters'>ASR</a> <a href='https://github.com/ylacombe/finetune-hf-vits'>TTS</a> </center>"""
77
- )
78
- gr.HTML(
79
- """<center><a href="https://huggingface.co/spaces/facebook/MMS?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"><img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for more control and no queue.</center>"""
80
- )
81
-
82
- tabbed_interface.render()
83
- gr.HTML(
84
- """
85
- <div class="footer" style="text-align:center">
86
- <p>
87
- Model by <a href="https://ai.facebook.com" style="text-decoration: underline;" target="_blank">Meta AI</a> - Gradio Demo by 🤗 Hugging Face
88
- </p>
89
- </div>
90
- """
91
  )
92
 
93
- if __name__ == "__main__":
94
- demo.queue()
95
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile, File, Form
2
+ from fastapi.responses import JSONResponse, FileResponse
3
+ import uvicorn
4
+ from pydantic import BaseModel
5
+ import numpy as np
6
+ import io
7
+ import soundfile as sf
8
+
9
+ from asr import transcribe, ASR_LANGUAGES
10
+ from tts import synthesize, TTS_LANGUAGES
11
+ from lid import identify
12
+
13
+ app = FastAPI(title="MMS: Scaling Speech Technology to 1000+ languages")
14
+
15
class TTSRequest(BaseModel):
    """JSON request body accepted by the ``/synthesize`` endpoint."""

    # Text to convert to speech; forwarded unchanged to ``synthesize``.
    text: str
    # Language selector; forwarded unchanged to ``synthesize``.
    language: str
    # Speed value forwarded to ``synthesize``. Optional so that clients can
    # omit it; 1.0 is assumed to mean "normal speed" -- TODO confirm against
    # the ``synthesize`` implementation.
    speed: float = 1.0
19
+
20
@app.post("/transcribe")
async def transcribe_audio(audio: UploadFile = File(...), language: str = Form(...)):
    """Transcribe an uploaded audio file.

    Args:
        audio: Uploaded audio file; decoded in memory with ``soundfile``.
        language: Language selector, forwarded unchanged to ``transcribe``.

    Returns:
        A JSON object of the form ``{"transcription": <result>}``.

    Raises:
        HTTPException: With status 400 when the upload cannot be decoded.
    """
    # Imported locally so this handler stays self-contained and does not
    # depend on the module-level import list being edited.
    from fastapi import HTTPException

    contents = await audio.read()
    try:
        audio_array, _sample_rate = sf.read(io.BytesIO(contents))
    except RuntimeError as err:
        # soundfile reports unreadable/unsupported input through RuntimeError
        # subclasses; that is a client error, not a server failure.
        raise HTTPException(
            status_code=400,
            detail="Could not decode the uploaded audio file.",
        ) from err

    # NOTE(review): the decoded sample rate is discarded, so ``transcribe``
    # cannot know the rate of ``audio_array`` -- confirm it accepts a bare
    # sample array at whatever rate the model expects.
    result = transcribe(audio_array, language)
    return JSONResponse(content={"transcription": result})
27
+
28
@app.post("/synthesize")
async def synthesize_speech(request: TTSRequest):
    """Synthesize speech for the requested text and return it as a WAV file.

    Args:
        request: Text, language and speed; all forwarded to ``synthesize``.

    Returns:
        An ``audio/wav`` response carrying the encoded audio as an attachment
        named ``synthesized_audio.wav``.
    """
    # Imported locally so this handler stays self-contained and does not
    # depend on the module-level import list being edited.
    from fastapi.responses import Response

    # The second return value is not used by this endpoint.
    audio, _filtered_text = synthesize(request.text, request.language, request.speed)

    # NOTE(review): ``synthesize`` may return Gradio-style
    # ``(sample_rate, samples)`` audio -- confirm. When it does, honour the
    # rate it reports; otherwise keep the previous fixed 22050 Hz.
    if isinstance(audio, tuple) and len(audio) == 2:
        sample_rate, samples = audio
    else:
        sample_rate, samples = 22050, audio

    # Encode the samples as WAV entirely in memory.
    buffer = io.BytesIO()
    sf.write(buffer, samples, int(sample_rate), format='wav')

    # FileResponse serves a file from a filesystem path and cannot send an
    # in-memory buffer, so the encoded bytes are returned directly instead.
    return Response(
        content=buffer.getvalue(),
        media_type="audio/wav",
        headers={"Content-Disposition": "attachment; filename=synthesized_audio.wav"},
    )
42
 
43
@app.post("/identify")
async def identify_language(audio: UploadFile = File(...)):
    """Run language identification on an uploaded audio file.

    Args:
        audio: Uploaded audio file; decoded in memory with ``soundfile``.

    Returns:
        A JSON object of the form ``{"language_identification": <result>}``.

    Raises:
        HTTPException: With status 400 when the upload cannot be decoded.
    """
    # Imported locally so this handler stays self-contained and does not
    # depend on the module-level import list being edited.
    from fastapi import HTTPException

    contents = await audio.read()
    try:
        audio_array, _sample_rate = sf.read(io.BytesIO(contents))
    except RuntimeError as err:
        # soundfile reports unreadable/unsupported input through RuntimeError
        # subclasses; that is a client error, not a server failure.
        raise HTTPException(
            status_code=400,
            detail="Could not decode the uploaded audio file.",
        ) from err

    # NOTE(review): the decoded sample rate is discarded -- confirm that
    # ``identify`` accepts a bare sample array at the rate the model expects.
    result = identify(audio_array)
    return JSONResponse(content={"language_identification": result})
50
+
51
@app.get("/asr_languages")
async def get_asr_languages():
    """Return the ``ASR_LANGUAGES`` table imported from ``asr`` as JSON."""
    payload = ASR_LANGUAGES
    return JSONResponse(content=payload)
54
+
55
@app.get("/tts_languages")
async def get_tts_languages():
    """Return the ``TTS_LANGUAGES`` table imported from ``tts`` as JSON."""
    payload = TTS_LANGUAGES
    return JSONResponse(content=payload)