whisper-large-v3-1-1

Runtime error

App Files Files Community

danielwm994 committed on Oct 16, 2024

Commit

be67183

verified ·

1 Parent(s): c7af682

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -17

app.py CHANGED Viewed

@@ -25,6 +25,8 @@ pipe = pipeline(
 @spaces.GPU
 def transcribe(inputs, task):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
@@ -33,27 +35,35 @@ def transcribe(inputs, task):
     text = result["text"]
     timestamps = result["chunks"]
-    # Initialize an empty list to store processed text with more natural breaks
-    processed_text = []
     timestamp_str = ""
     for chunk in timestamps:
-        # For each chunk, ensure text ends at a period, question mark, or exclamation mark
         chunk_text = chunk["text"]
-        # Adjust chunk_text to end with a more natural boundary if needed (e.g., sentence end)
-        if not chunk_text.endswith(('.', '!', '?')):
-            # You could modify this part to adjust as needed, for example, by adding a period
-            chunk_text += "..."
-        # Add the text with timestamps
-        processed_text.append(chunk_text)
-        timestamp_str += f"[{chunk['timestamp']}] {chunk_text}\n"
-    # Join all the processed text into a single string with logical sentence boundaries
-    full_text = " ".join(processed_text)
-    return full_text, timestamp_str

 @spaces.GPU
+import re
 def transcribe(inputs, task):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
     text = result["text"]
     timestamps = result["chunks"]
+    # List of text segments split at proper sentence boundaries
+    final_text = []
     timestamp_str = ""
+    current_chunk = []
+    current_timestamp = None
     for chunk in timestamps:
+        # Text of the current chunk
         chunk_text = chunk["text"]
+        chunk_timestamp = chunk["timestamp"]
+        # Check whether the chunk text ends with a period, exclamation mark, or question mark
+        if re.search(r'[.!?]$', chunk_text):
+            current_chunk.append(chunk_text)
+            final_text.append(" ".join(current_chunk))
+            timestamp_str += f"[{chunk_timestamp}] " + " ".join(current_chunk) + "\n"
+            current_chunk = []
+        else:
+            # The text is not finished yet, so accumulate it in the current chunk
+            current_chunk.append(chunk_text)
+    # If unfinished chunks remain (e.g. the last piece of text does not end with punctuation)
+    if current_chunk:
+        final_text.append(" ".join(current_chunk))
+        timestamp_str += f"[{chunk_timestamp}] " + " ".join(current_chunk) + "\n"
+    return " ".join(final_text), timestamp_str