aldan.creo committed on
Commit 94f0c4f
1 Parent(s): a85fcba

Partial implementation of audio windows

Files changed (1)
  1. app.py +82 -15
app.py CHANGED
@@ -13,6 +13,9 @@ transcriber = pipeline(
 )
 
 
+MAX_AUDIO_DURATION = 5
+
+
 def transcribe_live(state, words_list, new_chunk):
     try:
         words_to_check_for = [word.strip().lower() for word in words_list.split(",")]
@@ -21,15 +24,10 @@ def transcribe_live(state, words_list, new_chunk):
         words_to_check_for = []
 
     stream = state.get("stream", None)
-    previous_transcription = state.get("full_transcription", "")
-    previous_counts_of_words = state.get(
-        "counts_of_words", {word: 0 for word in words_to_check_for}
-    )
 
     if new_chunk is None:
         gr.Info("You can start transcribing by clicking on the Record button")
-        print("new chunk is None")
-        return state, previous_counts_of_words, previous_transcription
+        return state, {}, ""
 
     sr, y = new_chunk
 
@@ -45,15 +43,79 @@ def transcribe_live(state, words_list, new_chunk):
     else:
         stream = y
 
+    duration_of_the_stream = len(stream) / sr
+    print(f"Duration of the stream: {duration_of_the_stream}")
+
+    # Only consider the last MAX_AUDIO_DURATION seconds of the stream
+    if duration_of_the_stream > MAX_AUDIO_DURATION:
+        potentially_shorted_stream = stream[-sr * MAX_AUDIO_DURATION :]
+    else:
+        potentially_shorted_stream = stream
+
+    start_of_the_stream = duration_of_the_stream - (
+        len(potentially_shorted_stream) / sr
+    )
+
     try:
-        new_transcription = transcriber({"sampling_rate": sr, "raw": stream})
-        print(f"new transcription: {new_transcription}")
+        new_transcription = transcriber(
+            {"sampling_rate": sr, "raw": potentially_shorted_stream}
+        )
     except Exception as e:
         gr.Error(f"Transcription failed. Error: {e}")
         print(f"Transcription failed. Error: {e}")
-        return state, previous_counts_of_words, previous_transcription
+        return state, {}, ""
+
+    # We get something like: 'chunks': [{'timestamp': (0.0, 10.0), 'text': " I'm going to go."}]
+    new_chunks = new_transcription["chunks"]
+
+    # Add the start time of the window to every chunk so that timestamps refer to the full stream
+    new_chunks_remapped = [
+        {
+            "timestamp": (
+                chunk["timestamp"][0] + start_of_the_stream,
+                chunk["timestamp"][1] + start_of_the_stream,
+            ),
+            "text": chunk["text"],
+        }
+        for chunk in new_chunks
+    ]
 
-    full_transcription_text = new_transcription["text"]
+    print(new_chunks_remapped)
+
+    # Trim chunks at the edges of the window, as they are usually cut off and not accurate
+    # Don't trim the start while the stream is shorter than MAX_AUDIO_DURATION
+    if duration_of_the_stream < MAX_AUDIO_DURATION:
+        start_time_cutoff = start_of_the_stream
+    else:
+        start_time_cutoff = start_of_the_stream + 0.0 * MAX_AUDIO_DURATION
+    end_time_cutoff = start_of_the_stream + 1.0 * MAX_AUDIO_DURATION
+
+    print(f"Start time cutoff: {start_time_cutoff}")
+    print(f"End time cutoff: {end_time_cutoff}")
+    print(f"Start of the stream: {start_of_the_stream}")
+    print(f"Before filtering: {new_chunks_remapped}")
+
+    new_chunks_remapped = [
+        chunk
+        for chunk in new_chunks_remapped
+        if chunk["timestamp"][0] >= start_time_cutoff
+        and chunk["timestamp"][1] <= end_time_cutoff
+    ]
+
+    print(f"After filtering: {new_chunks_remapped}")
+
+    # Merge the new transcription with the previous transcription:
+    # keep the previous chunks that end before the new window starts
+
+    previous_chunks = state.get("transcription_chunks", [])
+
+    merged_chunks = [
+        chunk for chunk in previous_chunks if chunk["timestamp"][1] < start_time_cutoff
+    ] + new_chunks_remapped
+
+    full_transcription_text = reduce(
+        lambda x, y: x + " " + y["text"], merged_chunks, ""
+    )
 
     full_transcription_text_lower = full_transcription_text.lower()
 
@@ -85,7 +147,7 @@ def transcribe_live(state, words_list, new_chunk):
 
     new_state = {
         "stream": stream,
-        "full_transcription": full_transcription_text,
+        "transcription_chunks": merged_chunks,
         "counts_of_words": new_counts_of_words,
         "highlighted_transcription": new_highlighted_transcription,
     }
@@ -94,6 +156,7 @@ def transcribe_live(state, words_list, new_chunk):
         new_state,
         new_counts_of_words,
         full_transcription_text,
+        merged_chunks,
         new_highlighted_transcription,
     )
 
@@ -119,12 +182,16 @@ with gr.Blocks() as demo:
         """
     )
 
-    filler_words = gr.Textbox(label="List of filler words", value="like, so, you know", info="Enter a comma-separated list of words to check for")
+    filler_words = gr.Textbox(
+        label="List of filler words",
+        value="like, so, you know",
+        info="Enter a comma-separated list of words to check for",
+    )
     recording = gr.Audio(streaming=True, label="Recording")
 
    word_counts = gr.JSON(label="Filler words count", value={})
-    # word_counts = gr.BarPlot(label="Filler words count", value={})
     transcription = gr.Textbox(label="Transcription", value="", visible=False)
+    chunks = gr.JSON(label="Chunks", value=[], visible=False)
 
     highlighted_transcription = gr.HighlightedText(
         label="Transcription",
@@ -138,8 +205,8 @@ def transcribe_live(state, words_list, new_chunk):
     recording.stream(
         transcribe_live,
         inputs=[state, filler_words, recording],
-        outputs=[state, word_counts, transcription, highlighted_transcription],
-        stream_every=5,
+        outputs=[state, word_counts, transcription, chunks, highlighted_transcription],
+        stream_every=MAX_AUDIO_DURATION,
        time_limit=-1,
     )
 
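
The windowing approach this commit starts to implement can be summarized as: transcribe only the last MAX_AUDIO_DURATION seconds of audio, shift the resulting chunk timestamps back onto the full stream, and splice them onto the chunks kept from earlier windows. Below is a minimal standalone sketch of that merge step; the helper name merge_windowed_chunks and the sample chunks are illustrative and are not part of app.py.

from functools import reduce

MAX_AUDIO_DURATION = 5  # seconds of audio kept in each transcription window


def merge_windowed_chunks(previous_chunks, window_chunks, window_start):
    # Shift window-relative timestamps onto the full stream's timeline
    remapped = [
        {
            "timestamp": (
                chunk["timestamp"][0] + window_start,
                chunk["timestamp"][1] + window_start,
            ),
            "text": chunk["text"],
        }
        for chunk in window_chunks
    ]
    # Keep earlier chunks that end before the new window starts, then append the new ones
    kept = [c for c in previous_chunks if c["timestamp"][1] < window_start]
    return kept + remapped


if __name__ == "__main__":
    previous = [{"timestamp": (0.0, 4.5), "text": " Hello everyone,"}]
    # Chunks for the latest window, with timestamps relative to the window start
    window = [
        {"timestamp": (0.0, 2.0), "text": " so, like,"},
        {"timestamp": (2.0, 4.8), "text": " let's get started."},
    ]
    merged = merge_windowed_chunks(previous, window, window_start=MAX_AUDIO_DURATION)
    full_text = reduce(lambda acc, c: acc + " " + c["text"], merged, "")
    print(full_text.strip())  # prints the merged transcript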