j-tobias committed · Commit e9daf29
Parent: e1e27eb

added model

Files changed:
- app.py (+54, -14)
- requirements.txt (+4, -0)
app.py
CHANGED
@@ -23,14 +23,14 @@ client = InferenceClient(
     token=hf_token)


-def chat(audio, chat:list):
-    [7 removed lines not shown in this view]
+def chat(audio, chat:list, asr_model:str):
+    if asr_model == "openai/whisper-large-v2":
+        transcription = transcribe_whisper_large_v2(audio)
+    elif asr_model == "openai/whisper-tiny.en":
+        transcription = transcribe_whisper_tiny_en(audio)
+    else:
+        raise ValueError(f"No Model found with the given choice: {asr_model}")
+
     chat.append({'role':'user','content':transcription})
     response = client.chat_completion(
         messages=chat,
@@ -40,7 +40,7 @@ def chat(audio, chat:list):
     chat.append({'role':'assistant','content':response})
     return chat

-def transcribe(audio):
+def transcribe_whisper_large_v2(audio):
     sr, audio = audio
     audio = audio.astype(np.float32)
     if len(audio.shape) > 2 and audio.shape[1] > 1:
@@ -52,8 +52,44 @@ def transcribe(audio):
     transcription = processor.tokenizer.normalize(transcription[0])
     return transcription

+def transcribe_whisper_tiny_en(audio):
+    sr, audio = audio
+    audio = audio.astype(np.float32)
+    if len(audio.shape) > 2 and audio.shape[1] > 1:
+        audio = np.mean(audio, axis=1)
+    audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+    input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
+    predicted_ids = model.generate(input_features)
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+    transcription = processor.tokenizer.normalize(transcription[0])
+    return transcription
+
+
+def load_model(asr_model_choice:str):
+    global processor
+    global model
+    global model_flag
+
+    if asr_model_choice == "openai/whisper-large-v2":
+        processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
+        model.config.forced_decoder_ids = None
+        model_flag = "openai/whisper-large-v2"
+    elif asr_model_choice == "openai/whisper-tiny.en":
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+        model_flag = "openai/whisper-tiny.en"
+
+    print("Model Loaded: ", model_flag)
+
 with gr.Blocks() as app:

+    gr.Markdown("# VoiceBot")
+    gr.Markdown("Welcome to VoiceBot, here is how it works:")
+    gr.Markdown("This Bot can only be interacted with through your voice. Press record and say something; after you stop the recording, your audio will be processed directly. You have the option to choose between different models. The model you choose influences how well the Bot understands what you have said. Better performance also comes with a longer waiting time.")
+    gr.Markdown("Have fun playing around!")
+    gr.Markdown("If you have any wishes for models or an idea, feel free to let me know!")
+
     chatbot = gr.Chatbot(
         value=[{
             'role':'System',
@@ -71,12 +107,16 @@ with gr.Blocks() as app:
             scale=8
         )

-    [4 removed lines not shown in this view]
+    with gr.Accordion(label="Settings", open=False):
+
+        asr_model_choice = gr.Radio(
+            label="Select ASR Model",
+            choices=["openai/whisper-large-v2","openai/whisper-tiny.en"],
+            value="openai/whisper-tiny.en"
+        )
+        asr_model_choice.change(load_model, asr_model_choice)

     # Event listener for when the audio recording stops
-    audio_input.stop_recording(fn=chat, inputs=[audio_input, chatbot], outputs=chatbot)
+    audio_input.stop_recording(fn=chat, inputs=[audio_input, chatbot, asr_model_choice], outputs=chatbot)

     app.launch()
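For context, here is a standalone sketch of the whisper-tiny.en transcription path this commit adds, runnable outside Gradio. The model and processor identifiers and the decode steps mirror the diff; the input path "sample.wav" and the mono/16 kHz loading shortcut are illustrative assumptions, not part of the commit.

# Standalone sketch of the whisper-tiny.en path added in this commit.
# Assumption: "sample.wav" is a placeholder input file, not part of the commit.
import librosa
from transformers import WhisperForConditionalGeneration, WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")

def transcribe(path: str) -> str:
    # librosa.load resamples to 16 kHz and downmixes to mono, covering the
    # astype/np.mean/librosa.resample steps the app applies to raw Gradio audio
    audio, _ = librosa.load(path, sr=16000, mono=True)
    input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return processor.tokenizer.normalize(text)

print(transcribe("sample.wav"))

The settings radio is wired through asr_model_choice.change(load_model, asr_model_choice), so changing the selection swaps the module-level processor/model globals before the next recording is transcribed. A reduced sketch of that event pattern follows, with a stub loader (an assumption) standing in for the real Whisper reload:

# Reduced sketch of the Radio -> change -> load_model wiring; the stub
# loader only records the selection instead of reloading Whisper weights.
import gradio as gr

model_flag = "openai/whisper-tiny.en"

def load_model(asr_model_choice: str):
    # The real load_model also swaps the processor/model globals here.
    global model_flag
    model_flag = asr_model_choice
    print("Model Loaded: ", model_flag)

with gr.Blocks() as demo:
    asr_model_choice = gr.Radio(
        label="Select ASR Model",
        choices=["openai/whisper-large-v2", "openai/whisper-tiny.en"],
        value="openai/whisper-tiny.en",
    )
    # .change runs the handler with the radio's current value whenever it changes
    asr_model_choice.change(load_model, asr_model_choice)

demo.launch()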
requirements.txt
CHANGED
@@ -33,7 +33,9 @@ markdown-it-py==3.0.0
 MarkupSafe==2.1.5
 matplotlib==3.9.2
 mdurl==0.1.2
+mpmath==1.3.0
 msgpack==1.0.8
+networkx==3.3
 numba==0.60.0
 numpy==2.0.1
 orjson==3.10.7
@@ -66,9 +68,11 @@ sniffio==1.3.1
 soundfile==0.12.1
 soxr==0.4.0
 starlette==0.38.2
+sympy==1.13.2
 threadpoolctl==3.5.0
 tokenizers==0.19.1
 tomlkit==0.12.0
+torch==2.4.0
 tqdm==4.66.5
 transformers==4.44.0
 typer==0.12.3
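The four new pins line up with adding torch itself plus its transitive dependencies: torch 2.x pulls in sympy and networkx, and sympy pulls in mpmath. A quick way to confirm an environment matches the new pins; the version list below is copied from the diff, while the checker script itself is illustrative:

# Verify the commit's new pins against the active environment (Python 3.8+).
from importlib.metadata import PackageNotFoundError, version

pins = {"mpmath": "1.3.0", "networkx": "3.3", "sympy": "1.13.2", "torch": "2.4.0"}
for name, expected in pins.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = "missing"
    marker = "ok" if installed == expected else f"want {expected}"
    print(f"{name}: {installed} ({marker})")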