Spaces:
Sleeping
Sleeping
aldan.creo
committed on
Commit
•
94f0c4f
1
Parent(s):
a85fcba
Partial implementation of audio windows
Browse files
app.py
CHANGED
@@ -13,6 +13,9 @@ transcriber = pipeline(
|
|
13 |
)
|
14 |
|
15 |
|
|
|
|
|
|
|
16 |
def transcribe_live(state, words_list, new_chunk):
|
17 |
try:
|
18 |
words_to_check_for = [word.strip().lower() for word in words_list.split(",")]
|
@@ -21,15 +24,10 @@ def transcribe_live(state, words_list, new_chunk):
|
|
21 |
words_to_check_for = []
|
22 |
|
23 |
stream = state.get("stream", None)
|
24 |
-
previous_transcription = state.get("full_transcription", "")
|
25 |
-
previous_counts_of_words = state.get(
|
26 |
-
"counts_of_words", {word: 0 for word in words_to_check_for}
|
27 |
-
)
|
28 |
|
29 |
if new_chunk is None:
|
30 |
gr.Info("You can start transcribing by clicking on the Record button")
|
31 |
-
|
32 |
-
return state, previous_counts_of_words, previous_transcription
|
33 |
|
34 |
sr, y = new_chunk
|
35 |
|
@@ -45,15 +43,79 @@ def transcribe_live(state, words_list, new_chunk):
|
|
45 |
else:
|
46 |
stream = y
|
47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
try:
|
49 |
-
new_transcription = transcriber(
|
50 |
-
|
|
|
51 |
except Exception as e:
|
52 |
gr.Error(f"Transcription failed. Error: {e}")
|
53 |
print(f"Transcription failed. Error: {e}")
|
54 |
-
return state,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
full_transcription_text_lower = full_transcription_text.lower()
|
59 |
|
@@ -85,7 +147,7 @@ def transcribe_live(state, words_list, new_chunk):
|
|
85 |
|
86 |
new_state = {
|
87 |
"stream": stream,
|
88 |
-
"
|
89 |
"counts_of_words": new_counts_of_words,
|
90 |
"highlighted_transcription": new_highlighted_transcription,
|
91 |
}
|
@@ -94,6 +156,7 @@ def transcribe_live(state, words_list, new_chunk):
|
|
94 |
new_state,
|
95 |
new_counts_of_words,
|
96 |
full_transcription_text,
|
|
|
97 |
new_highlighted_transcription,
|
98 |
)
|
99 |
|
@@ -119,12 +182,16 @@ with gr.Blocks() as demo:
|
|
119 |
"""
|
120 |
)
|
121 |
|
122 |
-
filler_words = gr.Textbox(
|
|
|
|
|
|
|
|
|
123 |
recording = gr.Audio(streaming=True, label="Recording")
|
124 |
|
125 |
word_counts = gr.JSON(label="Filler words count", value={})
|
126 |
-
# word_counts = gr.BarPlot(label="Filler words count", value={})
|
127 |
transcription = gr.Textbox(label="Transcription", value="", visible=False)
|
|
|
128 |
|
129 |
highlighted_transcription = gr.HighlightedText(
|
130 |
label="Transcription",
|
@@ -138,8 +205,8 @@ with gr.Blocks() as demo:
|
|
138 |
recording.stream(
|
139 |
transcribe_live,
|
140 |
inputs=[state, filler_words, recording],
|
141 |
-
outputs=[state, word_counts, transcription, highlighted_transcription],
|
142 |
-
stream_every=
|
143 |
time_limit=-1,
|
144 |
)
|
145 |
|
|
|
13 |
)
|
14 |
|
15 |
|
16 |
+
MAX_AUDIO_DURATION = 5
|
17 |
+
|
18 |
+
|
19 |
def transcribe_live(state, words_list, new_chunk):
|
20 |
try:
|
21 |
words_to_check_for = [word.strip().lower() for word in words_list.split(",")]
|
|
|
24 |
words_to_check_for = []
|
25 |
|
26 |
stream = state.get("stream", None)
|
|
|
|
|
|
|
|
|
27 |
|
28 |
if new_chunk is None:
|
29 |
gr.Info("You can start transcribing by clicking on the Record button")
|
30 |
+
return state, {}, ""
|
|
|
31 |
|
32 |
sr, y = new_chunk
|
33 |
|
|
|
43 |
else:
|
44 |
stream = y
|
45 |
|
46 |
+
duration_of_the_stream = len(stream) / sr
|
47 |
+
print(f"Duration of the stream: {duration_of_the_stream}")
|
48 |
+
|
49 |
+
# Only consider the last MAX_AUDIO_DURATION seconds of the stream
|
50 |
+
if duration_of_the_stream > MAX_AUDIO_DURATION:
|
51 |
+
potentially_shorted_stream = stream[-sr * MAX_AUDIO_DURATION :]
|
52 |
+
else:
|
53 |
+
potentially_shorted_stream = stream
|
54 |
+
|
55 |
+
start_of_the_stream = duration_of_the_stream - (
|
56 |
+
len(potentially_shorted_stream) / sr
|
57 |
+
)
|
58 |
+
|
59 |
try:
|
60 |
+
new_transcription = transcriber(
|
61 |
+
{"sampling_rate": sr, "raw": potentially_shorted_stream}
|
62 |
+
)
|
63 |
except Exception as e:
|
64 |
gr.Error(f"Transcription failed. Error: {e}")
|
65 |
print(f"Transcription failed. Error: {e}")
|
66 |
+
return state, {}, ""
|
67 |
+
|
68 |
+
# We get something like: {'chunks': [{'timestamp': (0.0, 10.0), 'text': " I'm going to go."}]}
|
69 |
+
new_chunks = new_transcription["chunks"]
|
70 |
+
|
71 |
+
# Add the start time of the transcribed window to every chunk's timestamps so that they are relative to the whole stream
|
72 |
+
new_chunks_remapped = [
|
73 |
+
{
|
74 |
+
"timestamp": (
|
75 |
+
chunk["timestamp"][0] + start_of_the_stream,
|
76 |
+
chunk["timestamp"][1] + start_of_the_stream,
|
77 |
+
),
|
78 |
+
"text": chunk["text"],
|
79 |
+
}
|
80 |
+
for chunk in new_chunks
|
81 |
+
]
|
82 |
|
83 |
+
print(new_chunks_remapped)
|
84 |
+
|
85 |
+
# Remove the chunks at the start and the end of the window, as they are usually not accurate (cut off); the cutoff factors below are currently 0.0 and 1.0 rather than 25% / 75%
|
86 |
+
# Don't trim the start if the stream is shorter than MAX_AUDIO_DURATION seconds
|
87 |
+
if duration_of_the_stream < MAX_AUDIO_DURATION:
|
88 |
+
start_time_cutoff = start_of_the_stream
|
89 |
+
else:
|
90 |
+
start_time_cutoff = start_of_the_stream + 0.0 * MAX_AUDIO_DURATION
|
91 |
+
end_time_cutoff = start_of_the_stream + 1.0 * MAX_AUDIO_DURATION
|
92 |
+
|
93 |
+
print(f"Start time cutoff: {start_time_cutoff}")
|
94 |
+
print(f"End time cutoff: {end_time_cutoff}")
|
95 |
+
print(f"Start of the stream: {start_of_the_stream}")
|
96 |
+
print(f"Before filtering: {new_chunks_remapped}")
|
97 |
+
|
98 |
+
new_chunks_remapped = [
|
99 |
+
chunk
|
100 |
+
for chunk in new_chunks_remapped
|
101 |
+
if chunk["timestamp"][0] >= start_time_cutoff
|
102 |
+
and chunk["timestamp"][1] <= end_time_cutoff
|
103 |
+
]
|
104 |
+
|
105 |
+
print(f"After filtering: {new_chunks_remapped}")
|
106 |
+
|
107 |
+
# Merge the new transcription with the previous transcription.
|
108 |
+
# Take the texts from the previous transcription up to the time when the new transcription starts
|
109 |
+
|
110 |
+
previous_chunks = state.get("transcription_chunks", [])
|
111 |
+
|
112 |
+
merged_chunks = [
|
113 |
+
chunk for chunk in previous_chunks if chunk["timestamp"][1] < start_time_cutoff
|
114 |
+
] + new_chunks_remapped
|
115 |
+
|
116 |
+
full_transcription_text = reduce(
|
117 |
+
lambda x, y: x + " " + y["text"], merged_chunks, ""
|
118 |
+
)
|
119 |
|
120 |
full_transcription_text_lower = full_transcription_text.lower()
|
121 |
|
|
|
147 |
|
148 |
new_state = {
|
149 |
"stream": stream,
|
150 |
+
"transcription_chunks": merged_chunks,
|
151 |
"counts_of_words": new_counts_of_words,
|
152 |
"highlighted_transcription": new_highlighted_transcription,
|
153 |
}
|
|
|
156 |
new_state,
|
157 |
new_counts_of_words,
|
158 |
full_transcription_text,
|
159 |
+
merged_chunks,
|
160 |
new_highlighted_transcription,
|
161 |
)
|
162 |
|
|
|
182 |
"""
|
183 |
)
|
184 |
|
185 |
+
filler_words = gr.Textbox(
|
186 |
+
label="List of filer words",
|
187 |
+
value="like, so, you know",
|
188 |
+
info="Enter a comma-separated list of words to check for",
|
189 |
+
)
|
190 |
recording = gr.Audio(streaming=True, label="Recording")
|
191 |
|
192 |
word_counts = gr.JSON(label="Filler words count", value={})
|
|
|
193 |
transcription = gr.Textbox(label="Transcription", value="", visible=False)
|
194 |
+
chunks = gr.JSON(label="Chunks", value=[], visible=False)
|
195 |
|
196 |
highlighted_transcription = gr.HighlightedText(
|
197 |
label="Transcription",
|
|
|
205 |
recording.stream(
|
206 |
transcribe_live,
|
207 |
inputs=[state, filler_words, recording],
|
208 |
+
outputs=[state, word_counts, transcription, chunks, highlighted_transcription],
|
209 |
+
stream_every=MAX_AUDIO_DURATION,
|
210 |
time_limit=-1,
|
211 |
)
|
212 |
|