hivecorp committed on
Commit
4902504
·
verified ·
1 Parent(s): 9e1c7c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -80
app.py CHANGED
@@ -44,110 +44,142 @@ class TextProcessor:
44
  def __init__(self, words_per_line: int, lines_per_segment: int):
45
  self.words_per_line = words_per_line
46
  self.lines_per_segment = lines_per_segment
47
- self.min_segment_words = 3 # Minimum words per segment
48
- self.break_patterns = {
49
- 'strong': r'[.!?]+',
50
- 'medium': r'[,;:]',
51
- 'weak': r'[\s]+'
 
 
 
 
 
 
 
52
  }
53
 
54
- def find_best_break_point(self, words: List[str], max_words: int) -> int:
55
- """Find the best point to break the text based on punctuation and length"""
56
- if len(words) <= max_words:
57
- return len(words)
58
-
59
- # Look for strong breaks first
60
- for i in range(max_words, max(max_words - 5, self.min_segment_words), -1):
61
- if i < len(words) and any(words[i-1].endswith(p) for p in '.!?'):
62
- return i
63
 
64
- # Look for medium breaks
65
- for i in range(max_words, max(max_words - 5, self.min_segment_words), -1):
66
- if i < len(words) and any(words[i-1].endswith(p) for p in ',;:'):
67
- return i
68
 
69
- # Look for natural phrase breaks (prepositions, conjunctions, etc.)
70
- break_words = {'the', 'a', 'an', 'and', 'but', 'or', 'in', 'on', 'at', 'to', 'for'}
71
- for i in range(max_words, max(max_words - 5, self.min_segment_words), -1):
72
- if i < len(words) and words[i].lower() in break_words:
73
- return i
74
 
75
- # Default to max_words if no better break found
76
- return max_words
77
 
78
- def optimize_segment_distribution(self, lines: List[str]) -> List[List[str]]:
79
- """Optimize the distribution of lines into segments"""
80
- if not lines:
81
- return []
82
-
83
- segments = []
84
- current_segment = []
85
 
86
- for line in lines:
87
- words_in_line = len(line.split())
88
 
89
- if (len(current_segment) == self.lines_per_segment - 1 and
90
- words_in_line <= self.min_segment_words):
91
- # If last line would be too short, adjust previous segmentation
92
- if segments:
93
- # Borrow from previous segment if possible
94
- prev_segment = segments[-1]
95
- if len(prev_segment) > 1:
96
- current_segment.insert(0, prev_segment.pop())
97
- if not prev_segment: # Remove empty segments
98
- segments.pop()
99
 
100
- current_segment.append(line)
 
 
 
101
 
102
- if len(current_segment) >= self.lines_per_segment:
103
- segments.append(current_segment)
104
- current_segment = []
105
-
106
- # Handle remaining lines
107
- if current_segment:
108
- if len(current_segment) == 1 and segments:
109
- # Merge single line with previous segment if it's short
110
- words_in_last = len(current_segment[0].split())
111
- if words_in_last <= self.min_segment_words:
112
- segments[-1].extend(current_segment)
113
- else:
114
- segments.append(current_segment)
115
- else:
116
- segments.append(current_segment)
117
 
118
- return segments
119
 
120
  def split_into_segments(self, text: str) -> List[Segment]:
121
- # Clean and normalize text
122
  text = re.sub(r'\s+', ' ', text.strip())
123
  text = re.sub(r'([.!?,;:])\s*', r'\1 ', text)
 
124
 
 
 
 
 
125
  words = text.split()
126
- lines = []
127
- current_words = []
128
 
129
  i = 0
130
  while i < len(words):
131
- # Find optimal break point for current line
132
- break_point = self.find_best_break_point(
133
- words[i:],
134
- self.words_per_line
135
- )
 
 
 
 
 
 
 
 
 
136
 
137
- line_words = words[i:i + break_point]
138
- lines.append(' '.join(line_words))
139
- i += break_point
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
- # Optimize segments
142
- optimized_segments = self.optimize_segment_distribution(lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
- # Create final segments
145
- segments = []
146
- for idx, segment_lines in enumerate(optimized_segments, 1):
147
- segment_text = '\n'.join(segment_lines)
148
- segments.append(Segment(id=idx, text=segment_text))
149
 
150
- return segments
151
 
152
  async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
153
  """Process a single segment and calculate its timing"""
 
44
  def __init__(self, words_per_line: int, lines_per_segment: int):
45
  self.words_per_line = words_per_line
46
  self.lines_per_segment = lines_per_segment
47
+ self.min_segment_words = 3
48
+ self.max_segment_words = words_per_line * lines_per_segment * 1.5 # Allow 50% more for natural breaks
49
+ self.punctuation_weights = {
50
+ '.': 1.0, # Strong break
51
+ '!': 1.0,
52
+ '?': 1.0,
53
+ ';': 0.8, # Medium-strong break
54
+ ':': 0.7,
55
+ ',': 0.5, # Medium break
56
+ '-': 0.3, # Weak break
57
+ '(': 0.2,
58
+ ')': 0.2
59
  }
60
 
61
def analyze_sentence_complexity(self, text: str) -> float:
    """Score how complex ``text`` is, as a multiplier starting at 1.0.

    The score grows by 20% when the text has more than twice
    ``words_per_line`` words, and further with punctuation density
    (number of characters from ``punctuation_weights`` per word).

    Args:
        text: The text to analyse.

    Returns:
        A multiplier >= 1.0. Empty or whitespace-only text yields 1.0.
    """
    words = text.split()
    if not words:
        # No words means no density to measure; avoid dividing by zero.
        return 1.0

    complexity = 1.0

    # Adjust for sentence length
    if len(words) > self.words_per_line * 2:
        complexity *= 1.2

    # Adjust for punctuation density
    punct_count = sum(text.count(p) for p in self.punctuation_weights)
    complexity *= (1 + (punct_count / len(words)) * 0.5)

    return complexity
 
75
 
76
def find_natural_breaks(self, text: str) -> List[Tuple[int, float]]:
    """Find natural break points in ``text`` together with their weights.

    Every returned index ``i`` means "break after word ``i``", so the
    chunk ends with that word and the next chunk starts at ``i + 1``.

    A word gets the highest weight among these rules:
      * it ends with a character from ``punctuation_weights``;
      * the following word is a phrase starter ("however", ...): 0.6;
      * the following word is a conjunction ("and", "but", ...) located
        past ``min_segment_words``: 0.4.

    Args:
        text: Whitespace-separated text to inspect.

    Returns:
        ``(word_index, weight)`` pairs in ascending index order, only for
        words whose weight is greater than zero.
    """
    phrase_starters = frozenset({
        'however', 'therefore', 'moreover', 'furthermore',
        'meanwhile', 'although', 'because',
    })
    conjunctions = frozenset({'and', 'but', 'or', 'nor', 'for', 'yet', 'so'})
    # Punctuation that may cling to a word ("However," / "(and").
    clingy = '.,;:!?()"\'-'

    words = text.split()
    last_index = len(words) - 1
    breaks = []

    for i, word in enumerate(words):
        weight = 0.0

        # Check for punctuation
        for punct, punct_weight in self.punctuation_weights.items():
            if word.endswith(punct):
                weight = max(weight, punct_weight)

        if i < last_index:
            upcoming = words[i + 1].lower().strip(clingy)

            # Break before a phrase starter.
            if upcoming in phrase_starters:
                weight = max(weight, 0.6)

            # Break before a conjunction so it opens the next chunk
            # instead of dangling at the end of this one. The position
            # guard applies to the conjunction itself.
            if i + 1 > self.min_segment_words and upcoming in conjunctions:
                weight = max(weight, 0.4)

        if weight > 0:
            breaks.append((i, weight))

    return breaks
104
 
105
  def split_into_segments(self, text: str) -> List[Segment]:
106
+ # Normalize text and add proper spacing around punctuation
107
  text = re.sub(r'\s+', ' ', text.strip())
108
  text = re.sub(r'([.!?,;:])\s*', r'\1 ', text)
109
+ text = re.sub(r'\s+([.!?,;:])', r'\1', text)
110
 
111
+ # First, split into major segments by strong punctuation
112
+ segments = []
113
+ current_segment = []
114
+ current_text = ""
115
  words = text.split()
 
 
116
 
117
  i = 0
118
  while i < len(words):
119
+ complexity = self.analyze_sentence_complexity(' '.join(words[i:i + self.words_per_line * 2]))
120
+ breaks = self.find_natural_breaks(' '.join(words[i:i + int(self.max_segment_words * complexity)]))
121
+
122
+ # Find best break point
123
+ best_break = None
124
+ best_weight = 0
125
+
126
+ for break_idx, weight in breaks:
127
+ actual_idx = i + break_idx
128
+ if (actual_idx - i >= self.min_segment_words and
129
+ actual_idx - i <= self.max_segment_words):
130
+ if weight > best_weight:
131
+ best_break = break_idx
132
+ best_weight = weight
133
 
134
+ if best_break is None:
135
+ # If no good break found, use maximum length
136
+ best_break = min(self.words_per_line * self.lines_per_segment, len(words) - i)
137
+
138
+ # Create segment
139
+ segment_words = words[i:i + best_break + 1]
140
+ segment_text = ' '.join(segment_words)
141
+
142
+ # Split segment into lines
143
+ lines = self.split_into_lines(segment_text)
144
+ final_segment_text = '\n'.join(lines)
145
+
146
+ segments.append(Segment(
147
+ id=len(segments) + 1,
148
+ text=final_segment_text
149
+ ))
150
+
151
+ i += best_break + 1
152
+
153
+ return segments
154
+
155
def split_into_lines(self, text: str) -> List[str]:
    """Split segment text into lines at natural points.

    A line is closed right after a word when any of these holds:
      * the line has reached ``words_per_line`` words;
      * the word ends with ``.``, ``!`` or ``?``;
      * the line holds at least 70% of ``words_per_line`` words and the
        word ends with ``,``, ``;`` or ``:``.

    Args:
        text: Whitespace-separated text of one segment.

    Returns:
        The lines in order; an empty list for empty text.
    """
    sentence_enders = ('.', '!', '?')
    clause_enders = (',', ';', ':')
    # A clause break is only taken once the line is reasonably full.
    soft_limit = self.words_per_line * 0.7

    lines = []
    current_line = []

    for word in text.split():
        current_line.append(word)
        filled = len(current_line)

        # Check for natural line breaks
        is_break = (
            filled >= self.words_per_line
            or word.endswith(sentence_enders)
            or (filled >= soft_limit and word.endswith(clause_enders))
        )

        if is_break:
            lines.append(' '.join(current_line))
            current_line = []

    # Keep whatever is left as the final, shorter line.
    if current_line:
        lines.append(' '.join(current_line))

    return lines
183
 
184
  async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
185
  """Process a single segment and calculate its timing"""