insta-maker-2

Sleeping

App Files Files Community

hivecorp committed on Mar 19

Commit

4902504

verified ·

1 Parent(s): 9e1c7c4

Update app.py

Browse files

Files changed (1) hide show

app.py +112 -80

app.py CHANGED Viewed

@@ -44,110 +44,142 @@ class TextProcessor:
     def __init__(self, words_per_line: int, lines_per_segment: int):
         self.words_per_line = words_per_line
         self.lines_per_segment = lines_per_segment
-        self.min_segment_words = 3  # Minimum words per segment
-        self.break_patterns = {
-            'strong': r'[.!?]+',
-            'medium': r'[,;:]',
-            'weak': r'[\s]+'
         }
-    def find_best_break_point(self, words: List[str], max_words: int) -> int:
-        """Find the best point to break the text based on punctuation and length"""
-        if len(words) <= max_words:
-            return len(words)
-        # Look for strong breaks first
-        for i in range(max_words, max(max_words - 5, self.min_segment_words), -1):
-            if i < len(words) and any(words[i-1].endswith(p) for p in '.!?'):
-                return i
-        # Look for medium breaks
-        for i in range(max_words, max(max_words - 5, self.min_segment_words), -1):
-            if i < len(words) and any(words[i-1].endswith(p) for p in ',;:'):
-                return i
-        # Look for natural phrase breaks (prepositions, conjunctions, etc.)
-        break_words = {'the', 'a', 'an', 'and', 'but', 'or', 'in', 'on', 'at', 'to', 'for'}
-        for i in range(max_words, max(max_words - 5, self.min_segment_words), -1):
-            if i < len(words) and words[i].lower() in break_words:
-                return i
-        # Default to max_words if no better break found
-        return max_words
-    def optimize_segment_distribution(self, lines: List[str]) -> List[List[str]]:
-        """Optimize the distribution of lines into segments"""
-        if not lines:
-            return []
-        segments = []
-        current_segment = []
-        for line in lines:
-            words_in_line = len(line.split())
-            if (len(current_segment) == self.lines_per_segment - 1 and
-                words_in_line <= self.min_segment_words):
-                # If last line would be too short, adjust previous segmentation
-                if segments:
-                    # Borrow from previous segment if possible
-                    prev_segment = segments[-1]
-                    if len(prev_segment) > 1:
-                        current_segment.insert(0, prev_segment.pop())
-                    if not prev_segment:  # Remove empty segments
-                        segments.pop()
-            current_segment.append(line)
-            if len(current_segment) >= self.lines_per_segment:
-                segments.append(current_segment)
-                current_segment = []
-        # Handle remaining lines
-        if current_segment:
-            if len(current_segment) == 1 and segments:
-                # Merge single line with previous segment if it's short
-                words_in_last = len(current_segment[0].split())
-                if words_in_last <= self.min_segment_words:
-                    segments[-1].extend(current_segment)
-                else:
-                    segments.append(current_segment)
-            else:
-                segments.append(current_segment)
-        return segments
     def split_into_segments(self, text: str) -> List[Segment]:
-        # Clean and normalize text
         text = re.sub(r'\s+', ' ', text.strip())
         text = re.sub(r'([.!?,;:])\s*', r'\1 ', text)
         words = text.split()
-        lines = []
-        current_words = []
         i = 0
         while i < len(words):
-            # Find optimal break point for current line
-            break_point = self.find_best_break_point(
-                words[i:],
-                self.words_per_line
-            )
-            line_words = words[i:i + break_point]
-            lines.append(' '.join(line_words))
-            i += break_point
-        # Optimize segments
-        optimized_segments = self.optimize_segment_distribution(lines)
-        # Create final segments
-        segments = []
-        for idx, segment_lines in enumerate(optimized_segments, 1):
-            segment_text = '\n'.join(segment_lines)
-            segments.append(Segment(id=idx, text=segment_text))
-        return segments
 async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
     """Process a single segment and calculate its timing"""

     def __init__(self, words_per_line: int, lines_per_segment: int):
         self.words_per_line = words_per_line
         self.lines_per_segment = lines_per_segment
+        self.min_segment_words = 3
+        self.max_segment_words = words_per_line * lines_per_segment * 1.5  # Allow 50% more for natural breaks
+        self.punctuation_weights = {
+            '.': 1.0,  # Strong break
+            '!': 1.0,
+            '?': 1.0,
+            ';': 0.8,  # Medium-strong break
+            ':': 0.7,
+            ',': 0.5,  # Medium break
+            '-': 0.3,  # Weak break
+            '(': 0.2,
+            ')': 0.2
         }
+    def analyze_sentence_complexity(self, text: str) -> float:
+        """Analyze sentence complexity to determine optimal segment length"""
+        words = text.split()
+        complexity = 1.0
+        # Adjust for sentence length
+        if len(words) > self.words_per_line * 2:
+            complexity *= 1.2
+        # Adjust for punctuation density
+        punct_count = sum(text.count(p) for p in self.punctuation_weights.keys())
+        complexity *= (1 + (punct_count / len(words)) * 0.5)
+        return complexity
+    def find_natural_breaks(self, text: str) -> List[Tuple[int, float]]:
+        """Find natural break points with their weights"""
+        breaks = []
+        words = text.split()
+        for i, word in enumerate(words):
+            weight = 0
+            # Check for punctuation
+            for punct, punct_weight in self.punctuation_weights.items():
+                if word.endswith(punct):
+                    weight = max(weight, punct_weight)
+            # Check for natural phrase boundaries
+            phrase_starters = {'however', 'therefore', 'moreover', 'furthermore', 'meanwhile', 'although', 'because'}
+            if i < len(words) - 1 and words[i+1].lower() in phrase_starters:
+                weight = max(weight, 0.6)
+            # Check for conjunctions at natural points
+            if i > self.min_segment_words:
+                conjunctions = {'and', 'but', 'or', 'nor', 'for', 'yet', 'so'}
+                if word.lower() in conjunctions:
+                    weight = max(weight, 0.4)
+            if weight > 0:
+                breaks.append((i, weight))
+        return breaks
     def split_into_segments(self, text: str) -> List[Segment]:
+        # Normalize text and add proper spacing around punctuation
         text = re.sub(r'\s+', ' ', text.strip())
         text = re.sub(r'([.!?,;:])\s*', r'\1 ', text)
+        text = re.sub(r'\s+([.!?,;:])', r'\1', text)
+        # First, split into major segments by strong punctuation
+        segments = []
+        current_segment = []
+        current_text = ""
         words = text.split()
         i = 0
         while i < len(words):
+            complexity = self.analyze_sentence_complexity(' '.join(words[i:i + self.words_per_line * 2]))
+            breaks = self.find_natural_breaks(' '.join(words[i:i + int(self.max_segment_words * complexity)]))
+            # Find best break point
+            best_break = None
+            best_weight = 0
+            for break_idx, weight in breaks:
+                actual_idx = i + break_idx
+                if (actual_idx - i >= self.min_segment_words and
+                    actual_idx - i <= self.max_segment_words):
+                    if weight > best_weight:
+                        best_break = break_idx
+                        best_weight = weight
+            if best_break is None:
+                # If no good break found, use maximum length
+                best_break = min(self.words_per_line * self.lines_per_segment, len(words) - i)
+            # Create segment
+            segment_words = words[i:i + best_break + 1]
+            segment_text = ' '.join(segment_words)
+            # Split segment into lines
+            lines = self.split_into_lines(segment_text)
+            final_segment_text = '\n'.join(lines)
+            segments.append(Segment(
+                id=len(segments) + 1,
+                text=final_segment_text
+            ))
+            i += best_break + 1
+        return segments
+    def split_into_lines(self, text: str) -> List[str]:
+        """Split segment text into natural lines"""
+        words = text.split()
+        lines = []
+        current_line = []
+        word_count = 0
+        for word in words:
+            current_line.append(word)
+            word_count += 1
+            # Check for natural line breaks
+            is_break = (
+                word_count >= self.words_per_line or
+                any(word.endswith(p) for p in '.!?') or
+                (word_count >= self.words_per_line * 0.7 and
+                 any(word.endswith(p) for p in ',;:'))
+            )
+            if is_break:
+                lines.append(' '.join(current_line))
+                current_line = []
+                word_count = 0
+        if current_line:
+            lines.append(' '.join(current_line))
+        return lines
 async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
     """Process a single segment and calculate its timing"""