kusa04 committed on
Commit
749f31d
·
verified ·
1 Parent(s): 0c136fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -28
app.py CHANGED
@@ -29,7 +29,8 @@ from functions import (
29
  preprocess_text,
30
  generate_variants,
31
  contains_excluded_keywords,
32
- extract_terms
 
33
  )
34
 
35
 
@@ -355,9 +356,6 @@ if sentiment_btn:
355
  # plt.axis("off")
356
  # plt.show()
357
 
358
-
359
-
360
-
361
  if keyword_extraction_btn:
362
  df = st.session_state.get("df")
363
  user_query = st.session_state.get("user_query")
@@ -372,40 +370,52 @@ if keyword_extraction_btn:
372
  target_col = "Detail_Keyword"
373
  details_list = df['Detail'].tolist()
374
 
375
- # Use ThreadPoolExecutor to process keyword extraction in parallel for each detail
376
  with ThreadPoolExecutor() as executor:
377
  extracted_results = list(
378
- executor.map(lambda detail: keyword_extractor(preprocess_text(detail)) if detail else None, details_list)
 
 
 
379
  )
380
- # No need to assign back to the DataFrame if you don't want to use it further.
381
- # However, for filtering purposes we use the list 'extracted_results'.
382
 
383
- # Generate excluded keywords from the user query
384
  excluded_keywords = generate_variants(user_query)
 
 
 
 
 
 
 
 
 
 
 
 
 
385
 
386
- # Filter out items that contain any excluded keywords
387
- filtered_results = [
388
- result for result in extracted_results
389
- if result is not None and not contains_excluded_keywords(result, excluded_keywords=excluded_keywords)
390
- ]
391
-
392
- # Parallelize extraction of terms from each extracted result
393
  with ThreadPoolExecutor() as executor:
394
- terms_list = list(executor.map(lambda res: extract_terms(res), filtered_results))
 
 
395
 
396
- # Flatten the list of lists into a single list of terms
397
  terms = [term for sublist in terms_list for term in sublist]
398
 
399
- # Count the frequency of each term
400
  freq = Counter(terms)
401
 
402
  with st.spinner("Drawing Keywords Diagram..."):
403
- # Generate a WordCloud from the frequency dictionary
404
- wc = WordCloud(width=800, height=400, background_color="white")
405
- wc.generate_from_frequencies(freq)
406
-
407
- # Display the WordCloud using Matplotlib
408
- plt.figure(figsize=(10, 5))
409
- plt.imshow(wc, interpolation="bilinear")
410
- plt.axis("off")
411
- plt.show()
 
 
 
29
  preprocess_text,
30
  generate_variants,
31
  contains_excluded_keywords,
32
+ extract_terms,
33
+ remove_excluded_from_list
34
  )
35
 
36
 
 
356
  # plt.axis("off")
357
  # plt.show()
358
 
 
 
 
359
  if keyword_extraction_btn:
360
  df = st.session_state.get("df")
361
  user_query = st.session_state.get("user_query")
 
370
  target_col = "Detail_Keyword"
371
  details_list = df['Detail'].tolist()
372
 
373
+ # 1. Run keyword extraction in parallel for each detail
374
  with ThreadPoolExecutor() as executor:
375
  extracted_results = list(
376
+ executor.map(
377
+ lambda detail: keyword_extractor(preprocess_text(detail)) if detail else None,
378
+ details_list
379
+ )
380
  )
 
 
381
 
382
+ # 2. Generate excluded keywords from the user query
383
  excluded_keywords = generate_variants(user_query)
384
+
385
+ # 3. Partially remove excluded keywords from each extracted result
386
+ cleaned_results = []
387
+ for result in extracted_results:
388
+ if result is not None:
389
+ # If the result is a NumPy array, convert it to a Python list
390
+ if isinstance(result, np.ndarray):
391
+ result = result.tolist()
392
+ # Remove only the matching items, not the entire list
393
+ cleaned = remove_excluded_from_list(result, excluded_keywords)
394
+ cleaned_results.append(cleaned)
395
+ else:
396
+ cleaned_results.append(None)
397
 
398
+ # 4. Extract terms from each cleaned result in parallel
 
 
 
 
 
 
399
  with ThreadPoolExecutor() as executor:
400
+ terms_list = list(
401
+ executor.map(lambda res: extract_terms(res), cleaned_results)
402
+ )
403
 
404
+ # 5. Flatten the list of lists into a single list of terms
405
  terms = [term for sublist in terms_list for term in sublist]
406
 
407
+ # 6. Count the frequency of each term
408
  freq = Counter(terms)
409
 
410
  with st.spinner("Drawing Keywords Diagram..."):
411
+ # 7. Generate and display a WordCloud if there are any terms
412
+ if freq:
413
+ wc = WordCloud(width=800, height=400, background_color="white")
414
+ wc.generate_from_frequencies(freq)
415
+
416
+ plt.figure(figsize=(10, 5))
417
+ plt.imshow(wc, interpolation="bilinear")
418
+ plt.axis("off")
419
+ plt.show()
420
+ else:
421
+ st.write("No keywords to display in the WordCloud.")