danielwm994 committed on
Commit
be67183
·
verified ·
1 Parent(s): c7af682

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -17
app.py CHANGED
@@ -25,6 +25,8 @@ pipe = pipeline(
25
 
26
 
27
  @spaces.GPU
 
 
28
  def transcribe(inputs, task):
29
  if inputs is None:
30
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
@@ -33,27 +35,35 @@ def transcribe(inputs, task):
33
  text = result["text"]
34
  timestamps = result["chunks"]
35
 
36
- # Initialize an empty list to store processed text with more natural breaks
37
- processed_text = []
38
  timestamp_str = ""
39
 
 
 
 
40
  for chunk in timestamps:
41
- # For each chunk, ensure text ends at a period, question mark, or exclamation mark
42
  chunk_text = chunk["text"]
43
-
44
- # Adjust chunk_text to end with a more natural boundary if needed (e.g., sentence end)
45
- if not chunk_text.endswith(('.', '!', '?')):
46
- # You could modify this part to adjust as needed, for example, by adding a period
47
- chunk_text += "..."
48
-
49
- # Add the text with timestamps
50
- processed_text.append(chunk_text)
51
- timestamp_str += f"[{chunk['timestamp']}] {chunk_text}\n"
52
-
53
- # Join all the processed text into a single string with logical sentence boundaries
54
- full_text = " ".join(processed_text)
55
-
56
- return full_text, timestamp_str
 
 
 
 
 
57
 
58
 
59
 
 
25
 
26
 
27
  @spaces.GPU
28
+ import re
29
+
30
  def transcribe(inputs, task):
31
  if inputs is None:
32
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
 
35
  text = result["text"]
36
  timestamps = result["chunks"]
37
 
38
+ # Список для хранения текстов с правильными разделителями
39
+ final_text = []
40
  timestamp_str = ""
41
 
42
+ current_chunk = []
43
+ current_timestamp = None
44
+
45
  for chunk in timestamps:
46
+ # Текст текущего чанка
47
  chunk_text = chunk["text"]
48
+ chunk_timestamp = chunk["timestamp"]
49
+
50
+ # Проверим, не заканчивается ли текст на точке, восклицательном или вопросительном знаке
51
+ if re.search(r'[.!?]$', chunk_text):
52
+ current_chunk.append(chunk_text)
53
+ final_text.append(" ".join(current_chunk))
54
+ timestamp_str += f"[{chunk_timestamp}] " + " ".join(current_chunk) + "\n"
55
+ current_chunk = []
56
+ else:
57
+ # Если текст не завершен, собираем его в текущем чанке
58
+ current_chunk.append(chunk_text)
59
+
60
+ # Если есть незавершенные чанки (например, последний кусок текста не заканчивается на пунктуацию)
61
+ if current_chunk:
62
+ final_text.append(" ".join(current_chunk))
63
+ timestamp_str += f"[{chunk_timestamp}] " + " ".join(current_chunk) + "\n"
64
+
65
+ return " ".join(final_text), timestamp_str
66
+
67
 
68
 
69