jerrypan7 committed on
Commit
542e87b
·
verified ·
1 Parent(s): 000d398

Update app.py

Browse files

def split_text_with_punctuation(text): split any segment longer than 50 words into chunks of at most 50 words

Files changed (1) hide show
  1. app.py +29 -3
app.py CHANGED
@@ -98,7 +98,7 @@ def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -
98
 
99
 
100
  punctuation_marks = r'([\.!?!?。])'
101
-
102
  def split_text_with_punctuation(text):
103
  # Split the text using the punctuation marks, keeping the punctuation marks
104
  split_text = re.split(punctuation_marks, text)
@@ -111,8 +111,34 @@ def split_text_with_punctuation(text):
111
  combined_segments.append(split_text[-1])
112
 
113
  return combined_segments
114
-
115
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64):
117
  print(input_text)
118
  one_vllm_input = f"<|im_start|>system\nYou are a translation expert.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant"
 
98
 
99
 
100
  punctuation_marks = r'([\.!?!?。])'
101
+ """
102
  def split_text_with_punctuation(text):
103
  # Split the text using the punctuation marks, keeping the punctuation marks
104
  split_text = re.split(punctuation_marks, text)
 
111
  combined_segments.append(split_text[-1])
112
 
113
  return combined_segments
114
+ """
115
def split_text_with_punctuation(text, max_words=50):
    """Split *text* into sentence-like segments of at most *max_words* words.

    The text is cut after every match of the module-level
    ``punctuation_marks`` pattern, and each mark stays attached to the
    segment it terminates. A segment with more than *max_words*
    whitespace-delimited words is re-chunked; its internal whitespace is
    collapsed to single spaces in the process. Text without whitespace
    counts as a single word, so it is never re-chunked.

    Args:
        text: The string to segment.
        max_words: Upper bound on words per returned segment; must be >= 1.

    Returns:
        A list of non-blank segments in their original order.

    Raises:
        ValueError: If *max_words* is smaller than 1.
    """
    if max_words < 1:
        # A zero step would crash range() and a negative one would silently
        # drop every long segment, so reject both up front.
        raise ValueError("max_words must be at least 1")

    # The pattern contains a capturing group, so re.split() also returns the
    # marks: [text, mark, text, mark, ..., trailing text].
    pieces = re.split(punctuation_marks, text)

    # Re-attach each mark to the text that precedes it.
    segments = [body + mark for body, mark in zip(pieces[0::2], pieces[1::2])]

    # An odd piece count means there is trailing text with no closing mark.
    if len(pieces) % 2 and pieces[-1]:
        segments.append(pieces[-1])

    final_segments = []
    for segment in segments:
        words = segment.split()
        if len(words) > max_words:
            final_segments.extend(
                " ".join(words[start:start + max_words])
                for start in range(0, len(words), max_words)
            )
        elif words:
            # Skips empty AND whitespace-only segments (e.g. a trailing
            # newline after the last mark), which carry nothing to process.
            final_segments.append(segment)

    return final_segments
141
+
142
  def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64):
143
  print(input_text)
144
  one_vllm_input = f"<|im_start|>system\nYou are a translation expert.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant"