Spaces:

jerrypan7
/

sspeech_translation

Sleeping

App Files Files Community

jerrypan7 committed on Oct 7, 2024

Commit

542e87b

verified ·

1 Parent(s): 000d398

Update app.py

Browse files

def split_text_with_punctuation(text): also split any segment longer than 50 words

Files changed (1) hide show

app.py +29 -3

app.py CHANGED Viewed

@@ -98,7 +98,7 @@ def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -
 punctuation_marks = r'([\.!?！？。])'
 def split_text_with_punctuation(text):
     # Split the text using the punctuation marks, keeping the punctuation marks
     split_text = re.split(punctuation_marks, text)
@@ -111,8 +111,34 @@ def split_text_with_punctuation(text):
         combined_segments.append(split_text[-1])
     return combined_segments
 def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64):
     print(input_text)
     one_vllm_input = f"<|im_start|>system\nYou are a translation expert.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant"

 punctuation_marks = r'([\.!?！？。])'
+"""
 def split_text_with_punctuation(text):
     # Split the text using the punctuation marks, keeping the punctuation marks
     split_text = re.split(punctuation_marks, text)
         combined_segments.append(split_text[-1])
     return combined_segments
+"""
+def split_text_with_punctuation(text, max_words=50):
+    """Split *text* into sentence-like segments of at most *max_words* words.
+
+    The text is first cut after every character matched by the module-level
+    ``punctuation_marks`` pattern, and each mark stays attached to the text
+    that precedes it. Any resulting segment with more than *max_words*
+    whitespace-separated words is then broken into consecutive chunks of at
+    most *max_words* words.
+
+    Notes:
+        * Words are counted with ``str.split()``, i.e. by whitespace. Text
+          written without spaces counts as a single word and is therefore
+          never subdivided by the word limit.
+        * Chunks produced from an over-long segment are re-joined with single
+          spaces, so the original spacing inside them is normalised. Segments
+          within the limit are returned unchanged.
+
+    Args:
+        text: The string to split.
+        max_words: Maximum number of words per returned segment. Defaults to
+            50. Must be at least 1.
+
+    Returns:
+        A list of non-blank segments in their original order.
+
+    Raises:
+        ValueError: If *max_words* is smaller than 1.
+    """
+    if max_words < 1:
+        # A step of 0 would raise inside range() and a negative step would
+        # silently drop text, so reject both up front with a clear message.
+        raise ValueError("max_words must be at least 1")
+    # re.split keeps the text matched by a capturing group, so the pieces
+    # alternate: text, mark, text, mark, ..., trailing text.
+    pieces = re.split(punctuation_marks, text)
+    # Re-attach every punctuation mark to the text that precedes it.
+    sentences = [body + mark for body, mark in zip(pieces[0::2], pieces[1::2])]
+    # An odd number of pieces means the last one has no mark following it.
+    if len(pieces) % 2 != 0 and pieces[-1]:
+        sentences.append(pieces[-1])
+    final_segments = []
+    for sentence in sentences:
+        words = sentence.split()
+        if len(words) > max_words:
+            # Break the over-long sentence into chunks of at most max_words.
+            final_segments.extend(
+                ' '.join(words[start:start + max_words])
+                for start in range(0, len(words), max_words)
+            )
+        else:
+            final_segments.append(sentence)
+    # Drop empty and whitespace-only segments; they carry nothing to process.
+    return [segment for segment in final_segments if segment.strip()]
 def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64):
     print(input_text)
     one_vllm_input = f"<|im_start|>system\nYou are a translation expert.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant"