Spaces:

acmc
/

grammASRian

Sleeping

App Files Files Community

aldan.creo committed 24 days ago

Commit

94f0c4f

•

1 Parent(s): a85fcba

Partial implementation of audio windows

Browse files

Files changed (1) hide show

app.py +82 -15

app.py CHANGED Viewed

@@ -13,6 +13,9 @@ transcriber = pipeline(
 )
 def transcribe_live(state, words_list, new_chunk):
     try:
         words_to_check_for = [word.strip().lower() for word in words_list.split(",")]
@@ -21,15 +24,10 @@ def transcribe_live(state, words_list, new_chunk):
         words_to_check_for = []
     stream = state.get("stream", None)
-    previous_transcription = state.get("full_transcription", "")
-    previous_counts_of_words = state.get(
-        "counts_of_words", {word: 0 for word in words_to_check_for}
-    )
     if new_chunk is None:
         gr.Info("You can start transcribing by clicking on the Record button")
-        print("new chunk is None")
-        return state, previous_counts_of_words, previous_transcription
     sr, y = new_chunk
@@ -45,15 +43,79 @@ def transcribe_live(state, words_list, new_chunk):
     else:
         stream = y
     try:
-        new_transcription = transcriber({"sampling_rate": sr, "raw": stream})
-        print(f"new transcription: {new_transcription}")
     except Exception as e:
         gr.Error(f"Transcription failed. Error: {e}")
         print(f"Transcription failed. Error: {e}")
-        return state, previous_counts_of_words, previous_transcription
-    full_transcription_text = new_transcription["text"]
     full_transcription_text_lower = full_transcription_text.lower()
@@ -85,7 +147,7 @@ def transcribe_live(state, words_list, new_chunk):
     new_state = {
         "stream": stream,
-        "full_transcription": full_transcription_text,
         "counts_of_words": new_counts_of_words,
         "highlighted_transcription": new_highlighted_transcription,
     }
@@ -94,6 +156,7 @@ def transcribe_live(state, words_list, new_chunk):
         new_state,
         new_counts_of_words,
         full_transcription_text,
         new_highlighted_transcription,
     )
@@ -119,12 +182,16 @@ with gr.Blocks() as demo:
         """
     )
-    filler_words = gr.Textbox(label="List of filer words", value="like, so, you know", info="Enter a comma-separated list of words to check for")
     recording = gr.Audio(streaming=True, label="Recording")
     word_counts = gr.JSON(label="Filler words count", value={})
-    # word_counts = gr.BarPlot(label="Filler words count", value={})
     transcription = gr.Textbox(label="Transcription", value="", visible=False)
     highlighted_transcription = gr.HighlightedText(
         label="Transcription",
@@ -138,8 +205,8 @@ with gr.Blocks() as demo:
     recording.stream(
         transcribe_live,
         inputs=[state, filler_words, recording],
-        outputs=[state, word_counts, transcription, highlighted_transcription],
-        stream_every=5,
         time_limit=-1,
     )

 )
+MAX_AUDIO_DURATION = 5
 def transcribe_live(state, words_list, new_chunk):
     try:
         words_to_check_for = [word.strip().lower() for word in words_list.split(",")]
         words_to_check_for = []
     stream = state.get("stream", None)
     if new_chunk is None:
         gr.Info("You can start transcribing by clicking on the Record button")
+        return state, {}, ""
     sr, y = new_chunk
     else:
         stream = y
+    duration_of_the_stream = len(stream) / sr
+    print(f"Duration of the stream: {duration_of_the_stream}")
+    # Only consider the last MAX_AUDIO_DURATION seconds of the stream
+    if duration_of_the_stream > MAX_AUDIO_DURATION:
+        potentially_shorted_stream = stream[-sr * MAX_AUDIO_DURATION :]
+    else:
+        potentially_shorted_stream = stream
+    start_of_the_stream = duration_of_the_stream - (
+        len(potentially_shorted_stream) / sr
+    )
     try:
+        new_transcription = transcriber(
+            {"sampling_rate": sr, "raw": potentially_shorted_stream}
+        )
     except Exception as e:
         gr.Error(f"Transcription failed. Error: {e}")
         print(f"Transcription failed. Error: {e}")
+        return state, {}, ""
+    # We get something like: {'chunks': [{'timestamp': (0.0, 10.0), 'text': " I'm going to go."}]}
+    new_chunks = new_transcription["chunks"]
+    # Add the start time of the transcribed window to every chunk timestamp so that we get the real time within the whole stream
+    new_chunks_remapped = [
+        {
+            "timestamp": (
+                chunk["timestamp"][0] + start_of_the_stream,
+                chunk["timestamp"][1] + start_of_the_stream,
+            ),
+            "text": chunk["text"],
+        }
+        for chunk in new_chunks
+    ]
+    print(new_chunks_remapped)
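+    # Keep only the chunks that fall inside [start_time_cutoff, end_time_cutoff]; chunks at the edges
+    # of the window are usually not accurate (cut off). The intent is to remove the first 25% and the
+    # last 25% of the window, but the margins are currently 0.0 and 1.0 of MAX_AUDIO_DURATION, so
+    # effectively no edge trimming happens yet.
+    # The start margin is not applied while the stream is shorter than MAX_AUDIO_DURATION seconds.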
+    if duration_of_the_stream < MAX_AUDIO_DURATION:
+        start_time_cutoff = start_of_the_stream
+    else:
+        start_time_cutoff = start_of_the_stream + 0.0 * MAX_AUDIO_DURATION
+    end_time_cutoff = start_of_the_stream + 1.0 * MAX_AUDIO_DURATION
+    print(f"Start time cutoff: {start_time_cutoff}")
+    print(f"End time cutoff: {end_time_cutoff}")
+    print(f"Start of the stream: {start_of_the_stream}")
+    print(f"Before filtering: {new_chunks_remapped}")
+    new_chunks_remapped = [
+        chunk
+        for chunk in new_chunks_remapped
+        if chunk["timestamp"][0] >= start_time_cutoff
+        and chunk["timestamp"][1] <= end_time_cutoff
+    ]
+    print(f"After filtering: {new_chunks_remapped}")
+    # Merge the new transcription with the previous transcription.
+    # Take the texts from the previous transcription up to the time when the new transcription starts
+    previous_chunks = state.get("transcription_chunks", [])
+    merged_chunks = [
+        chunk for chunk in previous_chunks if chunk["timestamp"][1] < start_time_cutoff
+    ] + new_chunks_remapped
+    full_transcription_text = reduce(
+        lambda x, y: x + " " + y["text"], merged_chunks, ""
+    )
     full_transcription_text_lower = full_transcription_text.lower()
     new_state = {
         "stream": stream,
+        "transcription_chunks": merged_chunks,
         "counts_of_words": new_counts_of_words,
         "highlighted_transcription": new_highlighted_transcription,
     }
         new_state,
         new_counts_of_words,
         full_transcription_text,
+        merged_chunks,
         new_highlighted_transcription,
     )
         """
     )
+    filler_words = gr.Textbox(
+        label="List of filer words",
+        value="like, so, you know",
+        info="Enter a comma-separated list of words to check for",
+    )
     recording = gr.Audio(streaming=True, label="Recording")
     word_counts = gr.JSON(label="Filler words count", value={})
     transcription = gr.Textbox(label="Transcription", value="", visible=False)
+    chunks = gr.JSON(label="Chunks", value=[], visible=False)
     highlighted_transcription = gr.HighlightedText(
         label="Transcription",
     recording.stream(
         transcribe_live,
         inputs=[state, filler_words, recording],
+        outputs=[state, word_counts, transcription, chunks, highlighted_transcription],
+        stream_every=MAX_AUDIO_DURATION,
         time_limit=-1,
     )