Spaces:
Sleeping
Sleeping
Update app.py
Browse files
Commit message: split_text_with_punctuation(text) — cap each segment at a maximum of 50 words
app.py
CHANGED
@@ -98,7 +98,7 @@ def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -
|
|
98 |
|
99 |
|
100 |
punctuation_marks = r'([\.!?!?。])'
|
101 |
-
|
102 |
def split_text_with_punctuation(text):
|
103 |
# Split the text using the punctuation marks, keeping the punctuation marks
|
104 |
split_text = re.split(punctuation_marks, text)
|
@@ -111,8 +111,34 @@ def split_text_with_punctuation(text):
|
|
111 |
combined_segments.append(split_text[-1])
|
112 |
|
113 |
return combined_segments
|
114 |
-
|
115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64):
|
117 |
print(input_text)
|
118 |
one_vllm_input = f"<|im_start|>system\nYou are a translation expert.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant"
|
|
|
98 |
|
99 |
|
100 |
# Sentence-ending punctuation (ASCII and fullwidth forms), wrapped in a
# capturing group so re.split keeps each mark in the result list.
punctuation_marks = r'([\.!?!?。])'


def split_text_with_punctuation(text):
    """Split *text* into sentence-like segments of at most 50 words each.

    The text is first cut on sentence-ending punctuation, with each mark
    re-attached to the segment it terminates. Any segment that still exceeds
    50 words is then chunked into pieces of no more than 50 words. Empty
    segments are dropped from the result.
    """
    pieces = re.split(punctuation_marks, text)

    # With a capturing group, re.split alternates [text, mark, text, mark, ...];
    # stitch each punctuation mark back onto the text that precedes it.
    sentences = [pieces[k] + pieces[k + 1] for k in range(0, len(pieces) - 1, 2)]

    # An odd length means trailing text with no punctuation after it —
    # keep it as its own segment (if non-empty).
    if len(pieces) % 2 != 0 and pieces[-1]:
        sentences.append(pieces[-1])

    # Enforce the 50-word ceiling by chunking oversized sentences.
    final_segments = []
    for sentence in sentences:
        tokens = sentence.split()  # whitespace-delimited words
        if len(tokens) > 50:
            final_segments.extend(
                ' '.join(tokens[j:j + 50]) for j in range(0, len(tokens), 50)
            )
        else:
            final_segments.append(sentence)

    # Filter out any empty strings left over from the splitting.
    return [segment for segment in final_segments if segment]
|
142 |
def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64):
|
143 |
print(input_text)
|
144 |
one_vllm_input = f"<|im_start|>system\nYou are a translation expert.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant"
|