Update app.py
app.py CHANGED
@@ -29,7 +29,8 @@ from functions import (
     preprocess_text,
     generate_variants,
     contains_excluded_keywords,
-    extract_terms
+    extract_terms,
+    remove_excluded_from_list
 )
 
 
@@ -355,9 +356,6 @@ if sentiment_btn:
     # plt.axis("off")
     # plt.show()
 
-
-
-
 if keyword_extraction_btn:
     df = st.session_state.get("df")
     user_query = st.session_state.get("user_query")
@@ -372,40 +370,52 @@ if keyword_extraction_btn:
     target_col = "Detail_Keyword"
     details_list = df['Detail'].tolist()
 
-    #
+    # 1. Run keyword extraction in parallel for each detail
     with ThreadPoolExecutor() as executor:
         extracted_results = list(
-            executor.map(
+            executor.map(
+                lambda detail: keyword_extractor(preprocess_text(detail)) if detail else None,
+                details_list
+            )
         )
-    # No need to assign back to the DataFrame if you don't want to use it further.
-    # However, for filtering purposes we use the list 'extracted_results'.
 
-    # Generate excluded keywords from the user query
+    # 2. Generate excluded keywords from the user query
     excluded_keywords = generate_variants(user_query)
+
+    # 3. Partially remove excluded keywords from each extracted result
+    cleaned_results = []
+    for result in extracted_results:
+        if result is not None:
+            # If the result is a NumPy array, convert it to a Python list
+            if isinstance(result, np.ndarray):
+                result = result.tolist()
+            # Remove only the matching items, not the entire list
+            cleaned = remove_excluded_from_list(result, excluded_keywords)
+            cleaned_results.append(cleaned)
+        else:
+            cleaned_results.append(None)
 
-    #
-    filtered_results = [
-        result for result in extracted_results
-        if result is not None and not contains_excluded_keywords(result, excluded_keywords=excluded_keywords)
-    ]
-
-    # Parallelize extraction of terms from each extracted result
+    # 4. Extract terms from each cleaned result in parallel
     with ThreadPoolExecutor() as executor:
-        terms_list = list(
+        terms_list = list(
+            executor.map(lambda res: extract_terms(res), cleaned_results)
+        )
 
-    # Flatten the list of lists into a single list of terms
+    # 5. Flatten the list of lists into a single list of terms
     terms = [term for sublist in terms_list for term in sublist]
 
-    # Count the frequency of each term
+    # 6. Count the frequency of each term
     freq = Counter(terms)
 
     with st.spinner("Drawing Keywords Diagram..."):
-        # Generate a WordCloud
-
-
-
-
-
-
-
-
+        # 7. Generate and display a WordCloud if there are any terms
+        if freq:
+            wc = WordCloud(width=800, height=400, background_color="white")
+            wc.generate_from_frequencies(freq)
+
+            plt.figure(figsize=(10, 5))
+            plt.imshow(wc, interpolation="bilinear")
+            plt.axis("off")
+            plt.show()
+        else:
+            st.write("No keywords to display in the WordCloud.")