IliaLarchenko committed
Commit 21514b1 · 1 Parent(s): 405ab44

An ugly solution for the transcription delay problem

Files changed (3):
  1. api/audio.py +27 -4
  2. ui/coding.py +38 -6
  3. ui/instructions.py +1 -1
api/audio.py CHANGED
@@ -92,27 +92,50 @@ class STTManager:
         return np.array([], dtype=np.int16), audio_buffer
 
     def transcribe_audio(self, audio: np.ndarray, text: str = "") -> str:
+        """
+        Convert speech to text from a full audio segment.
+
+        :param audio: Numpy array containing audio data.
+        :param text: Previously transcribed text to continue from.
+        :return: Transcribed text.
+        """
+
         if len(audio) < 500:
             return text
         else:
             transcript = self.transcribe_numpy_array(audio, context=text)
             return text + " " + transcript
 
-    def transcribe_and_add_to_chat(self, audio: np.ndarray, chat: List[List[Optional[str]]]) -> List[List[Optional[str]]]:
+    def add_to_chat(self, text: str, chat: List[List[Optional[str]]], editable_chat: bool = True) -> List[List[Optional[str]]]:
         """
-        Transcribe audio and add the transcription to the chat history.
+        Add a text message to the chat history.
 
-        :param audio: Numpy array containing audio data.
+        :param text: Text message to add.
         :param chat: List of chat messages.
         :return: Updated chat history.
         """
+        if not editable_chat or len(text) == 0:
+            return chat
+
         if len(chat) == 0 or chat[-1][0] is None:
             chat.append(["", None])
 
-        chat[-1][0] = self.transcribe_audio(audio, chat[-1][0])
+        chat[-1][0] = text
 
         return chat
 
+    def transcribe_and_add_to_chat(self, audio: np.ndarray, chat: List[List[Optional[str]]]) -> List[List[Optional[str]]]:
+        """
+        Transcribe audio and add the transcription to the chat history.
+
+        :param audio: Numpy array containing audio data.
+        :param chat: List of chat messages.
+        :return: Updated chat history.
+        """
+        text = self.transcribe_audio(audio)
+        chat = self.add_to_chat(text, chat)
+        return chat
+
     def transcribe_numpy_array(self, audio: np.ndarray, context: Optional[str] = None) -> str:
         """
         Convert speech to text from a full audio segment.
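
The refactor above splits the old transcribe_and_add_to_chat into two independent steps: transcribe_audio accumulates the transcript text, while add_to_chat writes the accumulated text into the last open chat turn and refuses to touch the chat once editable_chat is False (i.e. once recording has stopped). Below is a minimal, self-contained sketch of the chat-update half, written as a standalone function rather than an STTManager method; the sample transcripts are invented for illustration:

from typing import List, Optional

def add_to_chat(text: str, chat: List[List[Optional[str]]], editable_chat: bool = True) -> List[List[Optional[str]]]:
    # Ignore empty transcripts, and stop editing once the chat is frozen.
    if not editable_chat or len(text) == 0:
        return chat
    # Open a new user turn if there is none, or if the last one was already answered.
    if len(chat) == 0 or chat[-1][0] is None:
        chat.append(["", None])
    # Overwrite the pending user turn with the full accumulated transcript.
    chat[-1][0] = text
    return chat

chat: List[List[Optional[str]]] = []
chat = add_to_chat("I would start", chat)                  # first partial transcript
chat = add_to_chat("I would start with a hash map", chat)  # longer transcript replaces it
print(chat)  # [['I would start with a hash map', None]]
print(add_to_chat("too late", chat, editable_chat=False))  # frozen: unchanged

Because each call overwrites chat[-1][0] with the full text so far, re-running the update on every streamed chunk is idempotent: the chat always shows the latest complete transcript instead of duplicated fragments.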
ui/coding.py CHANGED
@@ -251,21 +251,53 @@ def get_problem_solving_ui(llm: LLMManager, tts: TTSManager, stt: STTManager, de
         fn=llm.end_interview, inputs=[description, chat_history, interview_type_select], outputs=[feedback]
     )
 
-    # TODO: add a counter for audio chunks to use for better delay handling
-    audio_counter = 0
+    is_recording = gr.State(False)
+    audio_input.start_recording(fn=lambda: True, outputs=[is_recording])
+
+    hidden_text = gr.State("")
+    is_transcribing = gr.State(False)
     audio_input.stream(
         stt.process_audio_chunk,
         inputs=[audio_input, audio_buffer],
         outputs=[audio_buffer, audio_to_transcribe],
         show_progress="hidden",
-    ).success(fn=stt.transcribe_and_add_to_chat, inputs=[audio_to_transcribe, chat], outputs=[chat], show_progress="hidden")
+    ).success(fn=lambda: True, outputs=[is_transcribing]).success(
+        fn=stt.transcribe_audio, inputs=[audio_to_transcribe, hidden_text], outputs=[hidden_text], show_progress="full"
+    ).success(
+        fn=stt.add_to_chat, inputs=[hidden_text, chat, is_recording], outputs=[chat], show_progress="full"
+    ).success(
+        fn=lambda: False, outputs=[is_transcribing]
+    )
 
-    # TODO: find a way to remove a delay
-    audio_input.stop_recording(fn=lambda: time.sleep(2)).success(
+    # Ugly but works; the code needs cleanup and a better way to handle this logic.
+    # Main problem: we need to wait until the last chunk of audio is transcribed before sending the request,
+    # but at the same time I don't want a fixed delay by default.
+    # I didn't find a native Gradio way to handle this, so I used a workaround.
+    # There should be a better way to handle this, but I haven't found it yet.
+    # The chain below waits 0.5 seconds at a time, up to 8 times, until the audio is transcribed.
+    audio_input.stop_recording(fn=lambda x: time.sleep(int(x) / 2), inputs=[is_transcribing]).success(
+        fn=lambda x: time.sleep(int(x) / 2), inputs=[is_transcribing]
+    ).success(fn=lambda x: time.sleep(int(x) / 2), inputs=[is_transcribing]).success(
+        fn=lambda x: time.sleep(int(x) / 2), inputs=[is_transcribing]
+    ).success(
+        fn=lambda x: time.sleep(int(x) / 2), inputs=[is_transcribing]
+    ).success(
+        fn=lambda x: time.sleep(int(x) / 2), inputs=[is_transcribing]
+    ).success(
+        fn=lambda x: time.sleep(int(x) / 2), inputs=[is_transcribing]
+    ).success(
+        fn=lambda x: time.sleep(int(x) / 2), inputs=[is_transcribing]
+    ).success(
+        fn=lambda: False, outputs=[is_recording]
+    ).success(
         fn=send_request_partial,
         inputs=[code, previous_code, chat_history, chat],
         outputs=[chat_history, chat, previous_code, audio_output],
-    ).success(fn=lambda: np.array([], dtype=np.int16), outputs=[audio_buffer])
+    ).success(
+        fn=lambda: np.array([], dtype=np.int16), outputs=[audio_buffer]
+    ).success(
+        fn=lambda: "", outputs=[hidden_text]
+    )
 
     interview_type_select.change(
         fn=lambda x: gr.update(choices=topic_lists[x], value=np.random.choice(topic_lists[x])),
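
The eight chained sleeps rely on a small trick: each lambda receives the current value of the is_transcribing state, and int(True) / 2 == 0.5 while int(False) / 2 == 0.0, so every step waits half a second only while transcription is still running, giving up to 4 seconds of adaptive delay with no fixed cost once transcription is done. A sketch of the same polling idea as a single plain function is below; the callable flag is a stand-in for the gr.State and the names are invented for illustration:

import time

def wait_for_transcription(still_transcribing, attempts: int = 8, interval: float = 0.5) -> None:
    # Equivalent of the eight chained time.sleep(int(x) / 2) steps:
    # poll up to `attempts` times, sleeping only while transcription runs.
    for _ in range(attempts):
        if not still_transcribing():
            return            # finished: add no extra delay
        time.sleep(interval)  # still busy: wait 0.5 s and re-check

# Stand-in for the is_transcribing gr.State flipped by the .success() chain.
state = {"busy": False}
wait_for_transcription(lambda: state["busy"])  # idle, so this returns immediately

A plain loop like this cannot simply be dropped into the Gradio event chain, since a single handler receives a one-time snapshot of the state, whereas each .success() step re-reads is_transcribing when it fires; that re-reading is what lets the chain react as soon as transcription finishes, and is presumably why the commit unrolls the loop into eight steps.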
ui/instructions.py CHANGED
@@ -33,7 +33,7 @@ This section is where the interaction happens:
 - **Code/Solution Area**: On the left side, you will find a space to write your solution. For coding problems you can use any language, although syntax highlighting is only available for Python and SQL.
 - **Communication Area**: On the right, this area includes:
   - **Chat History**: Displays the entire dialogue history, showing messages from both you and the AI interviewer. Your recognized speech will be shown here before being sent to the AI.
-  - **Audio Record Button**: Use this button to record your responses. Press to start recording, speak your thoughts, and press stop to send your audio. Your message will be sent to the chat, along with a snapshot of your code or any notes from the solution text area.
+  - **Audio Record Button**: Use this button to record your responses. Press to start recording, speak your thoughts, wait until everything you said has been transcribed, and then press stop to send your audio. Your message will be sent to the chat, along with a snapshot of your code or any notes from the solution text area.
 
 Engage with the AI as you would with a real interviewer. Provide concise responses and frequent updates rather than long monologues. Your interactions, including any commentary on your code, will be recorded and the AI's responses will be read aloud and displayed in the chat. Follow the AI's instructions and respond to any follow-up questions as they arise.