Update app.py
app.py CHANGED

@@ -91,8 +91,28 @@ FILE_EMOJIS = {
     "mp3": "🎵",
 }
 
-def get_high_info_terms(text: str) -> list:
-
+def get_high_info_terms(text: str, prioritize_start=True) -> list:
+    """🧠 #1 - The Neural Network for Filenames (but way simpler and probably underpaid)
+
+    Scans text like a caffeinated librarian on a mission, hunting for words that actually
+    mean something. Filters out boring words like 'the' and 'and' (sorry old friends),
+    while preserving the good stuff like 'quantum' and 'neural' (party time! 🎉).
+
+    Think of it as a bouncer for your filenames - if a word isn't cool enough,
+    it's not getting in. But key phrases? VIP access, baby! 🎭
+
+    Args:
+        text (str): The text to strip mine for linguistic gold
+        prioritize_start (bool): If True, treats the start like the cool kids' table
+            (default behavior because we're not monsters)
+
+    Returns:
+        list: The VIP list of words that made the cut. Maximum of 8 terms if we're
+            prioritizing the start (because YOLO), 5 otherwise (because sanity).
+
+    Warning: May occasionally let through a word that sounds smart but is actually
+        just showing off. We're working on its ego. 🎓
+    """
     stop_words = set([
         'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
         'by', 'from', 'up', 'about', 'into', 'over', 'after', 'is', 'are', 'was', 'were',
@@ -100,7 +120,9 @@ def get_high_info_terms(text: str) -> list:
         'should', 'could', 'might', 'must', 'shall', 'can', 'may', 'this', 'that', 'these',
         'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what', 'which', 'who',
         'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
-        'other', 'some', 'such', 'than', 'too', 'very', 'just', 'there'
+        'other', 'some', 'such', 'than', 'too', 'very', 'just', 'there', 'please', 'tell',
+        'explain', 'show', 'give', 'write', 'provide', 'need', 'want', 'would', 'could',
+        'lets', 'let', 'try', 'use', 'make', 'help'
     ])
 
     key_phrases = [
@@ -113,23 +135,48 @@ def get_high_info_terms(text: str) -> list:
         'research paper', 'scientific study', 'empirical analysis'
     ]
 
-
+    # First check for key phrases at the start of the text
+    start_phrases = []
     lower_text = text.lower()
+    text_start = lower_text[:100]  # Look at first 100 chars for starting phrases
+    for phrase in key_phrases:
+        if text_start.startswith(phrase) or text_start.find(f" {phrase}") >= 0:
+            start_phrases.append(phrase)
+            text = text.replace(phrase, '')
+
+    # Then check for key phrases in the rest of the text
+    preserved_phrases = []
     for phrase in key_phrases:
-        if phrase in lower_text:
+        if phrase in lower_text and phrase not in start_phrases:
             preserved_phrases.append(phrase)
             text = text.replace(phrase, '')
 
-    words
+    # Get the first ~50 words to analyze the start more carefully
+    start_words = text.split()[:50]
+    words_with_pos = []
+    for pos, word in enumerate(start_words):
+        word = re.sub(r'[^\w\s-]', '', word.lower())
+        if (len(word) > 3 and
+            word not in stop_words and
+            not word.isdigit() and
+            any(c.isalpha() for c in word)):
+            words_with_pos.append((pos, word))
+
+    # Get remaining high-info words from the rest of the text
+    remaining_words = re.findall(r'\b\w+(?:-\w+)*\b', text[100:])
     high_info_words = [
-        word.lower() for word in
+        word.lower() for word in remaining_words
         if len(word) > 3
         and word.lower() not in stop_words
        and not word.isdigit()
        and any(c.isalpha() for c in word)
     ]
 
-
+    # Combine terms prioritizing start content
+    start_terms = [word for _, word in sorted(words_with_pos, key=lambda x: x[0])][:3]
+    all_terms = (start_phrases + start_terms + preserved_phrases + high_info_words)
+
+    # Remove duplicates while preserving order
     seen = set()
     unique_terms = []
     for term in all_terms:
@@ -137,22 +184,50 @@ def get_high_info_terms(text: str) -> list:
             seen.add(term)
             unique_terms.append(term)
 
-    max_terms = 5
+    max_terms = 8 if prioritize_start else 5
     return unique_terms[:max_terms]
 
-# 6. Filename Generation
 def generate_filename(content, file_type="md"):
+    """🎯 #2 - The File Naming Sommelier (pairs well with frustrated developers)
+
+    Takes your content and turns it into a filename that's actually readable by humans!
+    A revolutionary concept, we know. Combines timestamps with meaningful words,
+    because '20231218_quantum_research' beats 'asdfg123.md' any day of the week.
+
+    Think of it as your personal file naming barista - takes your raw content beans
+    and turns them into a smooth, well-crafted filename. No foam art though, sorry! ☕
+
+    Args:
+        content (str): Your beautiful text that needs a home(name)
+        file_type (str): The file extension (defaults to "md" because we're markdown
+            hipsters at heart)
+
+    Returns:
+        str: A filename that won't make you question your life choices when you see it
+            in 6 months. Limited to 120 chars because we're not writing a novel here.
+
+    Pro Tip: If your filename ends up being just 'file.md', either your content was
+        empty or we've failed spectacularly. Please file a bug report or just
+        laugh it off. 💪
+    """
     prefix = datetime.now().strftime("%y%m_%H%M") + "_"
-
+
+    # Get high-info terms with start prioritization
+    info_terms = get_high_info_terms(content, prioritize_start=True)
+
+    # Create filename with terms
     name_text = '_'.join(term.replace(' ', '-') for term in info_terms) if info_terms else 'file'
 
-
+    # Ensure reasonable length
+    max_length = 120  # Increased to allow more meaningful content
     if len(name_text) > max_length:
         name_text = name_text[:max_length]
 
     filename = f"{prefix}{name_text}.{file_type}"
     return filename
 
+
+
 # 7. Audio Processing
 def clean_for_speech(text: str) -> str:
     text = text.replace("\n", " ")