j-tobias committed on
Commit e9daf29 • 1 Parent(s): e1e27eb

added model

Files changed (2):
  1. app.py +54 -14
  2. requirements.txt +4 -0
app.py CHANGED
@@ -23,14 +23,14 @@ client = InferenceClient(
     token=hf_token)
 
 
-
-
-processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
-model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
-model.config.forced_decoder_ids = None
-
-def chat(audio, chat:list):
-    transcription = transcribe(audio)
+def chat(audio, chat:list, asr_model:str):
+    if asr_model == "openai/whisper-large-v2":
+        transcription = transcribe_whisper_large_v2(audio)
+    elif asr_model == "openai/whisper-tiny.en":
+        transcription = transcribe_whisper_tiny_en(audio)
+    else:
+        raise ValueError(f"No Model found with the given choice: {asr_model}")
+
     chat.append({'role':'user','content':transcription})
     response = client.chat_completion(
         messages=chat,
@@ -40,7 +40,7 @@ def chat(audio, chat:list):
     chat.append({'role':'assistant','content':response})
     return chat
 
-def transcribe(audio):
+def transcribe_whisper_large_v2(audio):
     sr, audio = audio
     audio = audio.astype(np.float32)
     if len(audio.shape) > 2 and audio.shape[1] > 1:
@@ -52,8 +52,44 @@ def transcribe(audio):
     transcription = processor.tokenizer.normalize(transcription[0])
     return transcription
 
+def transcribe_whisper_tiny_en(audio):
+    sr, audio = audio
+    audio = audio.astype(np.float32)
+    if len(audio.shape) > 2 and audio.shape[1] > 1:
+        audio = np.mean(audio, axis=1)
+    audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+    input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
+    predicted_ids = model.generate(input_features)
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+    transcription = processor.tokenizer.normalize(transcription[0])
+    return transcription
+
+
+def load_model(asr_model_choice:str):
+    global processor
+    global model
+    global model_flag
+
+    if asr_model_choice == "openai/whisper-large-v2":
+        processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
+        model.config.forced_decoder_ids = None
+        model_flag = "openai/whisper-large-v2"
+    elif asr_model_choice == "openai/whisper-tiny.en":
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+        model_flag = "openai/whisper-tiny.en"
+
+    print("Model Loaded: ", model_flag)
+
 with gr.Blocks() as app:
 
+    gr.Markdown("# VoiceBot")
+    gr.Markdown("Welcome to VoiceBot 👋, here is how it works")
+    gr.Markdown("This Bot can only be interacted with through your voice. Press record and say something; after you stop the recording, your audio is processed directly. You can choose between different models. The model you pick influences how well the Bot understands what you said, and better performance also comes with a longer wait. 😕")
+    gr.Markdown("Have fun playing around 🎉")
+    gr.Markdown("If you have any wishes for models or an idea, feel free to let me know 🙌")
+
     chatbot = gr.Chatbot(
         value=[{
             'role':'System',
@@ -71,12 +107,16 @@ with gr.Blocks() as app:
         scale=8
     )
 
-    # mode_option = gr.Radio(
-    #     choices=["online", "local"],
-    #     scale=1
-    # )
+    with gr.Accordion(label="Settings", open=False):
+
+        asr_model_choice = gr.Radio(
+            label="Select ASR Model",
+            choices=["openai/whisper-large-v2","openai/whisper-tiny.en"],
+            value="openai/whisper-tiny.en"
+        )
+        asr_model_choice.change(load_model, asr_model_choice)
 
     # Event listener for when the audio recording stops
-    audio_input.stop_recording(fn=chat, inputs=[audio_input, chatbot], outputs=chatbot)
+    audio_input.stop_recording(fn=chat, inputs=[audio_input, chatbot, asr_model_choice], outputs=chatbot)
 
     app.launch()
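For context, the pattern this commit introduces — a gr.Radio in a Settings accordion whose change event reloads the Whisper weights into module-level globals, with a stop_recording listener feeding the recording into chat() — can be sketched standalone roughly as below. This is a minimal sketch of the same wiring, not the app itself: the InferenceClient chat completion is replaced by an echo, and chatbot/audio component arguments beyond the essentials are assumptions.

import gradio as gr
import librosa
import numpy as np
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Module-level globals, mutated by load_model() as in the commit.
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")

def load_model(asr_model_choice: str):
    # Swap the global processor/model pair when the radio selection changes.
    global processor, model
    processor = WhisperProcessor.from_pretrained(asr_model_choice)
    model = WhisperForConditionalGeneration.from_pretrained(asr_model_choice)

def transcribe(audio):
    # Gradio delivers audio as (sample_rate, np.ndarray); Whisper wants 16 kHz mono float32.
    sr, data = audio
    data = data.astype(np.float32)
    if data.ndim > 1:
        data = np.mean(data, axis=1)  # downmix stereo to mono
    data = librosa.resample(data, orig_sr=sr, target_sr=16000)
    features = processor(data, sampling_rate=16000, return_tensors="pt").input_features
    ids = model.generate(features)
    return processor.batch_decode(ids, skip_special_tokens=True)[0]

def chat(audio, history: list):
    # The real app forwards the transcription to an LLM; here we just echo it back.
    history.append({'role': 'user', 'content': transcribe(audio)})
    return history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")          # role/content dicts, as in the diff
    audio_input = gr.Audio(sources=["microphone"])
    choice = gr.Radio(
        choices=["openai/whisper-large-v2", "openai/whisper-tiny.en"],
        value="openai/whisper-tiny.en",
        label="Select ASR Model",
    )
    choice.change(load_model, choice)
    audio_input.stop_recording(fn=chat, inputs=[audio_input, chatbot], outputs=chatbot)

demo.launch()

One caveat worth noting about the design: because processor/model are globals swapped in place, a recording that arrives while load_model is still downloading weights can transcribe with a mismatched pair; the diff sidesteps the worst of it by also passing the radio value into chat() and dispatching on the model name there.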
requirements.txt CHANGED
@@ -33,7 +33,9 @@ markdown-it-py==3.0.0
 MarkupSafe==2.1.5
 matplotlib==3.9.2
 mdurl==0.1.2
+mpmath==1.3.0
 msgpack==1.0.8
+networkx==3.3
 numba==0.60.0
 numpy==2.0.1
 orjson==3.10.7
@@ -66,9 +68,11 @@ sniffio==1.3.1
 soundfile==0.12.1
 soxr==0.4.0
 starlette==0.38.2
+sympy==1.13.2
 threadpoolctl==3.5.0
 tokenizers==0.19.1
 tomlkit==0.12.0
+torch==2.4.0
 tqdm==4.66.5
 transformers==4.44.0
 typer==0.12.3
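The four new pins line up with the local Whisper path added above: torch itself, plus sympy, networkx, and mpmath, which appear to be torch's pure-Python transitive dependencies. A quick sanity check that the pinned stack imports and can run a local forward pass (a sketch, assuming the pins from this requirements.txt are installed):

import numpy as np
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

print(torch.__version__)  # expect 2.4.0 per the pin above

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")

audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
ids = model.generate(features)
print(processor.batch_decode(ids, skip_special_tokens=True))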