Spaces:

seanpedrickcase
/

topic_modelling

Running

App Files Community

Sean-Case committed on Feb 16, 2024

Commit

d80c8f5

1 Parent(s): e1c1f68

Minor cleaning, csv formatting changes

Browse files

Files changed (2) hide show

funcs/clean_funcs.py +2 -16
funcs/topic_core_funcs.py +16 -11

funcs/clean_funcs.py CHANGED Viewed

@@ -8,32 +8,18 @@ custom_words = []
 my_stop_words = custom_words
 # #### Some of my cleaning functions
-email_start_pattern_regex = r'.*(?i)importance:|.*(?i)subject:'
-email_end_pattern_regex = r'(?i)kind regards.*|(?i)many thanks.*|(?i)sincerely.*'
 html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
 email_pattern_regex = r'\S*@\S*\s?'
 num_pattern_regex = r'[0-9]+'
 nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
 postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
-warning_pattern_regex = r'(?i)caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.'
-egress_pattern_regex = r'(?i)has been securely delivered by egress switch and was securely decoded on'
-nbsp_pattern_regex = r'&nbsp;'
 multiple_spaces_regex = r'\s{2,}'
-# Pre-compiling the regular expressions for efficiency (not actually used)
-# email_start_pattern = re.compile(email_start_pattern_regex)
-# email_end_pattern = re.compile(email_end_pattern_regex)
-# html_pattern = re.compile(html_pattern_regex)
-# email_pattern = re.compile(email_end_pattern_regex)
-# num_pattern = re.compile(num_pattern_regex)
-# nums_two_more_regex_pattern = re.compile(nums_two_more_regex)
-# postcode_pattern = re.compile(postcode_pattern_regex)
-# warning_pattern = re.compile(warning_pattern_regex)
-# nbsp_pattern = re.compile(nbsp_pattern_regex)
 def initial_clean(texts, custom_regex, progress=gr.Progress()):
     texts = pl.Series(texts).str.strip_chars()
     text = texts.str.replace_all(html_pattern_regex, ' ')
     text = text.str.replace_all(email_pattern_regex, ' ')
     text = text.str.replace_all(nums_two_more_regex, ' ')
     text = text.str.replace_all(postcode_pattern_regex, ' ')

 my_stop_words = custom_words
 # #### Some of my cleaning functions
 html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
+html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
 email_pattern_regex = r'\S*@\S*\s?'
 num_pattern_regex = r'[0-9]+'
 nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
 postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
 multiple_spaces_regex = r'\s{2,}'
 def initial_clean(texts, custom_regex, progress=gr.Progress()):
     texts = pl.Series(texts).str.strip_chars()
     text = texts.str.replace_all(html_pattern_regex, ' ')
+    text = text.str.replace_all(html_start_pattern_end_dots_regex, ' ')
     text = text.str.replace_all(email_pattern_regex, ' ')
     text = text.str.replace_all(nums_two_more_regex, ' ')
     text = text.str.replace_all(postcode_pattern_regex, ' ')

funcs/topic_core_funcs.py CHANGED Viewed

@@ -494,19 +494,24 @@ def visualise_topics(topic_model, data, data_file_name_no_ext, low_resource_mode
         hierarchical_topics = hierarchical_topics_custom(topic_model, docs)
-        # Print topic tree
-        tree = topic_model.get_topic_tree(hierarchical_topics, tight_layout = True)
-        tree_name = data_file_name_no_ext + '_' + 'vis_hierarchy_tree_' + today_rev + '.txt'
-        with open(tree_name, "w") as file:
-            # Write the string to the file
-            file.write(tree)
-        output_list.append(tree_name)
         # Save new hierarchical topic model to file
-        hierarchical_topics_name = data_file_name_no_ext + '_' + 'vis_hierarchy_topics_distz_' + today_rev + '.csv'
-        hierarchical_topics.to_csv(hierarchical_topics_name)
         output_list.append(hierarchical_topics_name)
@@ -516,12 +521,12 @@ def visualise_topics(topic_model, data, data_file_name_no_ext, low_resource_mode
         # Write hierarchical topics levels to df
         hierarchy_df_name = data_file_name_no_ext + '_' + 'hierarchy_topics_df_' + today_rev + '.csv'
-        hierarchy_df.to_csv(hierarchy_df_name)
         output_list.append(hierarchy_df_name)
         # Write hierarchical topics names to df
         hierarchy_topic_names_name = data_file_name_no_ext + '_' + 'hierarchy_topics_names_' + today_rev + '.csv'
-        hierarchy_topic_names.to_csv(hierarchy_topic_names_name)
         output_list.append(hierarchy_topic_names_name)
         #except:

         hierarchical_topics = hierarchical_topics_custom(topic_model, docs)
+        # Print topic tree - may get encoding errors, so doing try except
+        try:
+            tree = topic_model.get_topic_tree(hierarchical_topics, tight_layout = True)
+            tree_name = data_file_name_no_ext + '_' + 'vis_hierarchy_tree_' + today_rev + '.txt'
+            with open(tree_name, "w") as file:
+                # Write the string to the file
+                file.write(tree)
+            output_list.append(tree_name)
+        except Exception as error:
+            print("An exception occurred when making topic tree document, skipped:", error)
         # Save new hierarchical topic model to file
+        hierarchical_topics_name = data_file_name_no_ext + '_' + 'vis_hierarchy_topics_dist_' + today_rev + '.csv'
+        hierarchical_topics.to_csv(hierarchical_topics_name, index = None)
         output_list.append(hierarchical_topics_name)
         # Write hierarchical topics levels to df
         hierarchy_df_name = data_file_name_no_ext + '_' + 'hierarchy_topics_df_' + today_rev + '.csv'
+        hierarchy_df.to_csv(hierarchy_df_name, index = None)
         output_list.append(hierarchy_df_name)
         # Write hierarchical topics names to df
         hierarchy_topic_names_name = data_file_name_no_ext + '_' + 'hierarchy_topics_names_' + today_rev + '.csv'
+        hierarchy_topic_names.to_csv(hierarchy_topic_names_name, index = None)
         output_list.append(hierarchy_topic_names_name)
         #except: