Commit 21514b1
Parent(s): 405ab44

An ugly solution for the transcription delay problem

Files changed:
- api/audio.py +27 -4
- ui/coding.py +38 -6
- ui/instructions.py +1 -1
api/audio.py CHANGED
@@ -92,27 +92,50 @@ class STTManager:
         return np.array([], dtype=np.int16), audio_buffer
 
     def transcribe_audio(self, audio: np.ndarray, text: str = "") -> str:
+        """
+        Convert speech to text from a full audio segment.
+
+        :param audio: Numpy array containing audio data.
+        :param text: Text message to add.
+        :return: Transcribed text.
+        """
         if len(audio) < 500:
             return text
         else:
             transcript = self.transcribe_numpy_array(audio, context=text)
             return text + " " + transcript
 
-    def …
+    def add_to_chat(self, text: str, chat: List[List[Optional[str]]], editable_chat: bool = True) -> List[List[Optional[str]]]:
         """
-        …
+        Add a text message to the chat history.
 
-        :param …
+        :param text: Text message to add.
         :param chat: List of chat messages.
         :return: Updated chat history.
         """
+        if not editable_chat or len(text) == 0:
+            return chat
+
         if len(chat) == 0 or chat[-1][0] is None:
             chat.append(["", None])
 
-        chat[-1][0] = …
+        chat[-1][0] = text
 
         return chat
 
+    def transcribe_and_add_to_chat(self, audio: np.ndarray, chat: List[List[Optional[str]]]) -> List[List[Optional[str]]]:
+        """
+        Transcribe audio and add the transcription to the chat history.
+
+        :param audio: Numpy array containing audio data.
+        :param chat: List of chat messages.
+        :return: Updated chat history.
+        """
+        text = self.transcribe_audio(audio)
+        chat = self.add_to_chat(text, chat)
+        return chat
+
     def transcribe_numpy_array(self, audio: np.ndarray, context: Optional[str] = None) -> str:
         """
         Convert speech to text from a full audio segment.
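Read in isolation, the new chat-history contract is easy to miss in the diff: add_to_chat overwrites the pending user message instead of appending a new one, and the new editable_chat flag lets the UI freeze the chat once recording has stopped. Below is a minimal standalone sketch (not part of the commit) of that committed logic, with the method pulled out of STTManager so it can run on its own:

    from typing import List, Optional

    def add_to_chat(text: str, chat: List[List[Optional[str]]], editable_chat: bool = True) -> List[List[Optional[str]]]:
        # Same branching as the committed method, minus `self`.
        if not editable_chat or len(text) == 0:
            return chat  # chat is frozen, or there is nothing to add
        if len(chat) == 0 or chat[-1][0] is None:
            chat.append(["", None])  # open a fresh [user, assistant] pair
        chat[-1][0] = text  # overwrite the pending user slot with the full transcript
        return chat

    chat: List[List[Optional[str]]] = []
    chat = add_to_chat("Hello", chat)        # [["Hello", None]]
    chat = add_to_chat("Hello world", chat)  # same slot updated, not a second message
    chat = add_to_chat("ignored", chat, editable_chat=False)  # frozen: returned unchanged
    print(chat)  # [['Hello world', None]]

Overwriting rather than appending is what makes the streamed transcription look live: each partial transcript replaces the previous one inside the same chat bubble.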
ui/coding.py CHANGED
@@ -251,21 +251,53 @@ def get_problem_solving_ui(llm: LLMManager, tts: TTSManager, stt: STTManager, de
         fn=llm.end_interview, inputs=[description, chat_history, interview_type_select], outputs=[feedback]
     )
 
-    …
-    …
+    is_recording = gr.State(False)
+    audio_input.start_recording(fn=lambda: True, outputs=[is_recording])
+
+    hidden_text = gr.State("")
+    is_transcribing = gr.State(False)
     audio_input.stream(
         stt.process_audio_chunk,
         inputs=[audio_input, audio_buffer],
         outputs=[audio_buffer, audio_to_transcribe],
         show_progress="hidden",
-    ).success(fn=…
+    ).success(fn=lambda: True, outputs=[is_transcribing]).success(
+        fn=stt.transcribe_audio, inputs=[audio_to_transcribe, hidden_text], outputs=[hidden_text], show_progress="full"
+    ).success(
+        fn=stt.add_to_chat, inputs=[hidden_text, chat, is_recording], outputs=[chat], show_progress="full"
+    ).success(
+        fn=lambda: False, outputs=[is_transcribing]
+    )
 
-    # …
-    …
+    # Ugly but works, need to clean up the code and find a better way to handle the logic
+    # Main problem - we need to wait until the last chunk of audio is transcribed before sending the request
+    # The same time I don't want to have a fixed delay by default
+    # I didn't find a native way of gradio to handle this, so I used a workaround
+    # There should be a better way to handle this, but I didn't find it yet
+    # The solution below keeps waiting 0.5 second up to 8 times until the audio is transcribed
+    audio_input.stop_recording(fn=lambda x: time.sleep(int(x) / 2), inputs=[is_transcribing]).success(
+        fn=lambda x: time.sleep(int(x) / 2), inputs=[is_transcribing]
+    ).success(fn=lambda x: time.sleep(int(x) / 2), inputs=[is_transcribing]).success(
+        fn=lambda x: time.sleep(int(x) / 2), inputs=[is_transcribing]
+    ).success(
+        fn=lambda x: time.sleep(int(x) / 2), inputs=[is_transcribing]
+    ).success(
+        fn=lambda x: time.sleep(int(x) / 2), inputs=[is_transcribing]
+    ).success(
+        fn=lambda x: time.sleep(int(x) / 2), inputs=[is_transcribing]
+    ).success(
+        fn=lambda x: time.sleep(int(x) / 2), inputs=[is_transcribing]
+    ).success(
+        fn=lambda: False, outputs=[is_recording]
+    ).success(
         fn=send_request_partial,
         inputs=[code, previous_code, chat_history, chat],
         outputs=[chat_history, chat, previous_code, audio_output],
-    ).success(…
+    ).success(
+        fn=lambda: np.array([], dtype=np.int16), outputs=[audio_buffer]
+    ).success(
+        fn=lambda: "", outputs=[hidden_text]
+    )
 
     interview_type_select.change(
         fn=lambda x: gr.update(choices=topic_lists[x], value=np.random.choice(topic_lists[x])),
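The waiting trick above leans on int(bool) / 2: each chained handler sleeps 0.5 s while is_transcribing is still True and 0 s once it has been flipped to False, so the stop-recording chain waits at most 8 × 0.5 = 4 s but falls through almost immediately when transcription finishes early. Here is a minimal standalone sketch (not from the commit) of the same polling pattern, with a hypothetical plain dict and a threading.Timer standing in for the gr.State flag and the transcription chain that clears it:

    import threading
    import time

    # Hypothetical stand-in for the is_transcribing gr.State; in the real UI
    # the transcription chain flips it to False when the last chunk is handled.
    state = {"is_transcribing": True}
    threading.Timer(1.2, lambda: state.update(is_transcribing=False)).start()

    start = time.time()
    for _ in range(8):  # the commit chains eight .success() handlers instead of looping
        # int(True) / 2 == 0.5 and int(False) / 2 == 0.0, so steps after the
        # flag flips cost nothing and the total wait is capped at 4 seconds.
        time.sleep(int(state["is_transcribing"]) / 2)
    print(f"waited {time.time() - start:.1f} s")  # ~1.5 s here, not the full 4 s

A Gradio event handler cannot loop over live state like this, which is presumably why the commit unrolls the loop into eight chained .success() calls: each call re-reads is_transcribing from state at the moment it fires.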
ui/instructions.py CHANGED
@@ -33,7 +33,7 @@ This section is where the interaction happens:
 - **Code/Solution Area**: On the left side, you will find a space to write your solution. For codding problem you can use any language, although syntax highlighting is only available for Python and SQL.
 - **Communication Area**: On the right, this area includes:
   - **Chat History**: Displays the entire dialogue history, showing messages from both you and the AI interviewer. Your recognized speech will be shown here before being sent to the AI.
-  - **Audio Record Button**: Use this button to record your responses. Press to start recording, speak your thoughts, and press stop to send your audio. Your message will be sent to the chat, along with a snapshot of your code or any notes from solution text area."
+  - **Audio Record Button**: Use this button to record your responses. Press to start recording, speak your thoughts, and press stop to send your audio. Wait until everything you said is transcribed and then click stop. Your message will be sent to the chat, along with a snapshot of your code or any notes from solution text area."
 
 Engage with the AI as you would with a real interviewer. Provide concise responses and frequent updates rather than long monologues. Your interactions, including any commentary on your code, will be recorded and the AI's responses will be read aloud and displayed in the chat. Follow the AI's instructions and respond to any follow-up questions as they arise.
 