kusa04 committed on
Commit
749f31d
·
verified ·
1 Parent(s): 0c136fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -28
app.py CHANGED
@@ -29,7 +29,8 @@ from functions import (
29
  preprocess_text,
30
  generate_variants,
31
  contains_excluded_keywords,
32
- extract_terms
 
33
  )
34
 
35
 
@@ -355,9 +356,6 @@ if sentiment_btn:
355
  # plt.axis("off")
356
  # plt.show()
357
 
358
-
359
-
360
-
361
  if keyword_extraction_btn:
362
  df = st.session_state.get("df")
363
  user_query = st.session_state.get("user_query")
@@ -372,40 +370,52 @@ if keyword_extraction_btn:
372
  target_col = "Detail_Keyword"
373
  details_list = df['Detail'].tolist()
374
 
375
- # Use ThreadPoolExecutor to process keyword extraction in parallel for each detail
376
  with ThreadPoolExecutor() as executor:
377
  extracted_results = list(
378
- executor.map(lambda detail: keyword_extractor(preprocess_text(detail)) if detail else None, details_list)
 
 
 
379
  )
380
- # No need to assign back to the DataFrame if you don't want to use it further.
381
- # However, for filtering purposes we use the list 'extracted_results'.
382
 
383
- # Generate excluded keywords from the user query
384
  excluded_keywords = generate_variants(user_query)
 
 
 
 
 
 
 
 
 
 
 
 
 
385
 
386
- # Filter out items that contain any excluded keywords
387
- filtered_results = [
388
- result for result in extracted_results
389
- if result is not None and not contains_excluded_keywords(result, excluded_keywords=excluded_keywords)
390
- ]
391
-
392
- # Parallelize extraction of terms from each extracted result
393
  with ThreadPoolExecutor() as executor:
394
- terms_list = list(executor.map(lambda res: extract_terms(res), filtered_results))
 
 
395
 
396
- # Flatten the list of lists into a single list of terms
397
  terms = [term for sublist in terms_list for term in sublist]
398
 
399
- # Count the frequency of each term
400
  freq = Counter(terms)
401
 
402
  with st.spinner("Drawing Keywords Diagram..."):
403
- # Generate a WordCloud from the frequency dictionary
404
- wc = WordCloud(width=800, height=400, background_color="white")
405
- wc.generate_from_frequencies(freq)
406
-
407
- # Display the WordCloud using Matplotlib
408
- plt.figure(figsize=(10, 5))
409
- plt.imshow(wc, interpolation="bilinear")
410
- plt.axis("off")
411
- plt.show()
 
 
 
29
  preprocess_text,
30
  generate_variants,
31
  contains_excluded_keywords,
32
+ extract_terms,
33
+ remove_excluded_from_list
34
  )
35
 
36
 
 
356
  # plt.axis("off")
357
  # plt.show()
358
 
 
 
 
359
  if keyword_extraction_btn:
360
  df = st.session_state.get("df")
361
  user_query = st.session_state.get("user_query")
 
370
  target_col = "Detail_Keyword"
371
  details_list = df['Detail'].tolist()
372
 
373
+ # 1. Run keyword extraction in parallel for each detail
374
  with ThreadPoolExecutor() as executor:
375
  extracted_results = list(
376
+ executor.map(
377
+ lambda detail: keyword_extractor(preprocess_text(detail)) if detail else None,
378
+ details_list
379
+ )
380
  )
 
 
381
 
382
+ # 2. Generate excluded keywords from the user query
383
  excluded_keywords = generate_variants(user_query)
384
+
385
+ # 3. Partially remove excluded keywords from each extracted result
386
+ cleaned_results = []
387
+ for result in extracted_results:
388
+ if result is not None:
389
+ # If the result is a NumPy array, convert it to a Python list
390
+ if isinstance(result, np.ndarray):
391
+ result = result.tolist()
392
+ # Remove only the matching items, not the entire list
393
+ cleaned = remove_excluded_from_list(result, excluded_keywords)
394
+ cleaned_results.append(cleaned)
395
+ else:
396
+ cleaned_results.append(None)
397
 
398
+ # 4. Extract terms from each cleaned result in parallel
 
 
 
 
 
 
399
  with ThreadPoolExecutor() as executor:
400
+ terms_list = list(
401
+ executor.map(lambda res: extract_terms(res), cleaned_results)
402
+ )
403
 
404
+ # 5. Flatten the list of lists into a single list of terms
405
  terms = [term for sublist in terms_list for term in sublist]
406
 
407
+ # 6. Count the frequency of each term
408
  freq = Counter(terms)
409
 
410
  with st.spinner("Drawing Keywords Diagram..."):
411
+ # 7. Generate and display a WordCloud if there are any terms
412
+ if freq:
413
+ wc = WordCloud(width=800, height=400, background_color="white")
414
+ wc.generate_from_frequencies(freq)
415
+
416
+ plt.figure(figsize=(10, 5))
417
+ plt.imshow(wc, interpolation="bilinear")
418
+ plt.axis("off")
419
+ plt.show()
420
+ else:
421
+ st.write("No keywords to display in the WordCloud.")