awacke1 committed on
Commit
bc2a176
•
1 Parent(s): cd34468

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -12
app.py CHANGED
@@ -91,8 +91,28 @@ FILE_EMOJIS = {
91
  "mp3": "🎡",
92
  }
93
 
94
- # 5. High-Information Content Extraction
95
- def get_high_info_terms(text: str) -> list:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  stop_words = set([
97
  'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
98
  'by', 'from', 'up', 'about', 'into', 'over', 'after', 'is', 'are', 'was', 'were',
@@ -100,7 +120,9 @@ def get_high_info_terms(text: str) -> list:
100
  'should', 'could', 'might', 'must', 'shall', 'can', 'may', 'this', 'that', 'these',
101
  'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what', 'which', 'who',
102
  'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
103
- 'other', 'some', 'such', 'than', 'too', 'very', 'just', 'there'
 
 
104
  ])
105
 
106
  key_phrases = [
@@ -113,23 +135,48 @@ def get_high_info_terms(text: str) -> list:
113
  'research paper', 'scientific study', 'empirical analysis'
114
  ]
115
 
116
- preserved_phrases = []
 
117
  lower_text = text.lower()
 
 
 
 
 
 
 
 
118
  for phrase in key_phrases:
119
- if phrase in lower_text:
120
  preserved_phrases.append(phrase)
121
  text = text.replace(phrase, '')
122
 
123
- words = re.findall(r'\b\w+(?:-\w+)*\b', text)
 
 
 
 
 
 
 
 
 
 
 
 
124
  high_info_words = [
125
- word.lower() for word in words
126
  if len(word) > 3
127
  and word.lower() not in stop_words
128
  and not word.isdigit()
129
  and any(c.isalpha() for c in word)
130
  ]
131
 
132
- all_terms = preserved_phrases + high_info_words
 
 
 
 
133
  seen = set()
134
  unique_terms = []
135
  for term in all_terms:
@@ -137,22 +184,50 @@ def get_high_info_terms(text: str) -> list:
137
  seen.add(term)
138
  unique_terms.append(term)
139
 
140
- max_terms = 5
141
  return unique_terms[:max_terms]
142
 
143
- # 6. Filename Generation
144
  def generate_filename(content, file_type="md"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  prefix = datetime.now().strftime("%y%m_%H%M") + "_"
146
- info_terms = get_high_info_terms(content)
 
 
 
 
147
  name_text = '_'.join(term.replace(' ', '-') for term in info_terms) if info_terms else 'file'
148
 
149
- max_length = 100
 
150
  if len(name_text) > max_length:
151
  name_text = name_text[:max_length]
152
 
153
  filename = f"{prefix}{name_text}.{file_type}"
154
  return filename
155
 
 
 
156
  # 7. Audio Processing
157
  def clean_for_speech(text: str) -> str:
158
  text = text.replace("\n", " ")
 
91
  "mp3": "🎡",
92
  }
93
 
94
+ def get_high_info_terms(text: str, prioritize_start=True) -> list:
95
+ """🧠 #1 - The Neural Network for Filenames (but way simpler and probably underpaid)
96
+
97
+ Scans text like a caffeinated librarian on a mission, hunting for words that actually
98
+ mean something. Filters out boring words like 'the' and 'and' (sorry old friends),
99
+ while preserving the good stuff like 'quantum' and 'neural' (party time! 🎉).
100
+
101
+ Think of it as a bouncer for your filenames - if a word isn't cool enough,
102
+ it's not getting in. But key phrases? VIP access, baby! 🎭
103
+
104
+ Args:
105
+ text (str): The text to strip mine for linguistic gold
106
+ prioritize_start (bool): If True, returns up to 8 terms instead of 5; terms from the start of the text are listed first either way
107
+ (default behavior because we're not monsters)
108
+
109
+ Returns:
110
+ list: The VIP list of words that made the cut. Maximum of 8 terms if we're
111
+ prioritizing the start (because YOLO), 5 otherwise (because sanity).
112
+
113
+ Warning: May occasionally let through a word that sounds smart but is actually
114
+ just showing off. We're working on its ego. 🎭
115
+ """
116
  stop_words = set([
117
  'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
118
  'by', 'from', 'up', 'about', 'into', 'over', 'after', 'is', 'are', 'was', 'were',
 
120
  'should', 'could', 'might', 'must', 'shall', 'can', 'may', 'this', 'that', 'these',
121
  'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what', 'which', 'who',
122
  'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
123
+ 'other', 'some', 'such', 'than', 'too', 'very', 'just', 'there', 'please', 'tell',
124
+ 'explain', 'show', 'give', 'write', 'provide', 'need', 'want', 'would', 'could',
125
+ 'lets', 'let', 'try', 'use', 'make', 'help'
126
  ])
127
 
128
  key_phrases = [
 
135
  'research paper', 'scientific study', 'empirical analysis'
136
  ]
137
 
138
+ # First check for key phrases at the start of the text
139
+ start_phrases = []
140
  lower_text = text.lower()
141
+ text_start = lower_text[:100] # Look at first 100 chars for starting phrases
142
+ for phrase in key_phrases:
143
+ if text_start.startswith(phrase) or text_start.find(f" {phrase}") >= 0:
144
+ start_phrases.append(phrase)
145
+ text = text.replace(phrase, '')
146
+
147
+ # Then check for key phrases in the rest of the text
148
+ preserved_phrases = []
149
  for phrase in key_phrases:
150
+ if phrase in lower_text and phrase not in start_phrases:
151
  preserved_phrases.append(phrase)
152
  text = text.replace(phrase, '')
153
 
154
+ # Get the first ~50 words to analyze the start more carefully
155
+ start_words = text.split()[:50]
156
+ words_with_pos = []
157
+ for pos, word in enumerate(start_words):
158
+ word = re.sub(r'[^\w\s-]', '', word.lower())
159
+ if (len(word) > 3 and
160
+ word not in stop_words and
161
+ not word.isdigit() and
162
+ any(c.isalpha() for c in word)):
163
+ words_with_pos.append((pos, word))
164
+
165
+ # Get remaining high-info words from the rest of the text
166
+ remaining_words = re.findall(r'\b\w+(?:-\w+)*\b', text[100:])
167
  high_info_words = [
168
+ word.lower() for word in remaining_words
169
  if len(word) > 3
170
  and word.lower() not in stop_words
171
  and not word.isdigit()
172
  and any(c.isalpha() for c in word)
173
  ]
174
 
175
+ # Combine terms prioritizing start content
176
+ start_terms = [word for _, word in sorted(words_with_pos, key=lambda x: x[0])][:3]
177
+ all_terms = (start_phrases + start_terms + preserved_phrases + high_info_words)
178
+
179
+ # Remove duplicates while preserving order
180
  seen = set()
181
  unique_terms = []
182
  for term in all_terms:
 
184
  seen.add(term)
185
  unique_terms.append(term)
186
 
187
+ max_terms = 8 if prioritize_start else 5
188
  return unique_terms[:max_terms]
189
 
 
190
  def generate_filename(content, file_type="md"):
191
+ """🎯 #2 - The File Naming Sommelier (pairs well with frustrated developers)
192
+
193
+ Takes your content and turns it into a filename that's actually readable by humans!
194
+ A revolutionary concept, we know. Combines timestamps with meaningful words,
195
+ because '20231218_quantum_research' beats 'asdfg123.md' any day of the week.
196
+
197
+ Think of it as your personal file naming barista - takes your raw content beans
198
+ and turns them into a smooth, well-crafted filename. No foam art though, sorry! β˜•
199
+
200
+ Args:
201
+ content (str): Your beautiful text that needs a home(name)
202
+ file_type (str): The file extension (defaults to "md" because we're markdown
203
+ hipsters at heart)
204
+
205
+ Returns:
206
+ str: A filename that won't make you question your life choices when you see it
207
+ in 6 months. Limited to 120 chars because we're not writing a novel here.
208
+
209
+ Pro Tip: If your filename ends up being just 'file.md', either your content was
210
+ empty or we've failed spectacularly. Please file a bug report or just
211
+ laugh it off. πŸŽͺ
212
+ """
213
  prefix = datetime.now().strftime("%y%m_%H%M") + "_"
214
+
215
+ # Get high-info terms with start prioritization
216
+ info_terms = get_high_info_terms(content, prioritize_start=True)
217
+
218
+ # Create filename with terms
219
  name_text = '_'.join(term.replace(' ', '-') for term in info_terms) if info_terms else 'file'
220
 
221
+ # Ensure reasonable length
222
+ max_length = 120 # Increased to allow more meaningful content
223
  if len(name_text) > max_length:
224
  name_text = name_text[:max_length]
225
 
226
  filename = f"{prefix}{name_text}.{file_type}"
227
  return filename
228
 
229
+
230
+
231
  # 7. Audio Processing
232
  def clean_for_speech(text: str) -> str:
233
  text = text.replace("\n", " ")