Sean-Case committed on
Commit
d80c8f5
·
1 Parent(s): e1c1f68

Minor cleaning, csv formatting changes

Browse files
Files changed (2) hide show
  1. funcs/clean_funcs.py +2 -16
  2. funcs/topic_core_funcs.py +16 -11
funcs/clean_funcs.py CHANGED
@@ -8,32 +8,18 @@ custom_words = []
8
  my_stop_words = custom_words
9
 
10
  # #### Some of my cleaning functions
11
- email_start_pattern_regex = r'.*(?i)importance:|.*(?i)subject:'
12
- email_end_pattern_regex = r'(?i)kind regards.*|(?i)many thanks.*|(?i)sincerely.*'
13
  html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
 
14
  email_pattern_regex = r'\S*@\S*\s?'
15
  num_pattern_regex = r'[0-9]+'
16
  nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
17
  postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
18
- warning_pattern_regex = r'(?i)caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.'
19
- egress_pattern_regex = r'(?i)has been securely delivered by egress switch and was securely decoded on'
20
- nbsp_pattern_regex = r'&nbsp;'
21
  multiple_spaces_regex = r'\s{2,}'
22
 
23
- # Pre-compiling the regular expressions for efficiency (not actually used)
24
- # email_start_pattern = re.compile(email_start_pattern_regex)
25
- # email_end_pattern = re.compile(email_end_pattern_regex)
26
- # html_pattern = re.compile(html_pattern_regex)
27
- # email_pattern = re.compile(email_end_pattern_regex)
28
- # num_pattern = re.compile(num_pattern_regex)
29
- # nums_two_more_regex_pattern = re.compile(nums_two_more_regex)
30
- # postcode_pattern = re.compile(postcode_pattern_regex)
31
- # warning_pattern = re.compile(warning_pattern_regex)
32
- # nbsp_pattern = re.compile(nbsp_pattern_regex)
33
-
34
  def initial_clean(texts, custom_regex, progress=gr.Progress()):
35
  texts = pl.Series(texts).str.strip_chars()
36
  text = texts.str.replace_all(html_pattern_regex, ' ')
 
37
  text = text.str.replace_all(email_pattern_regex, ' ')
38
  text = text.str.replace_all(nums_two_more_regex, ' ')
39
  text = text.str.replace_all(postcode_pattern_regex, ' ')
 
8
  my_stop_words = custom_words
9
 
10
  # #### Some of my cleaning functions
 
 
11
  html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
12
+ html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
13
  email_pattern_regex = r'\S*@\S*\s?'
14
  num_pattern_regex = r'[0-9]+'
15
  nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
16
  postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
 
 
 
17
  multiple_spaces_regex = r'\s{2,}'
18
 
 
 
 
 
 
 
 
 
 
 
 
19
  def initial_clean(texts, custom_regex, progress=gr.Progress()):
20
  texts = pl.Series(texts).str.strip_chars()
21
  text = texts.str.replace_all(html_pattern_regex, ' ')
22
+ text = text.str.replace_all(html_start_pattern_end_dots_regex, ' ')
23
  text = text.str.replace_all(email_pattern_regex, ' ')
24
  text = text.str.replace_all(nums_two_more_regex, ' ')
25
  text = text.str.replace_all(postcode_pattern_regex, ' ')
funcs/topic_core_funcs.py CHANGED
@@ -494,19 +494,24 @@ def visualise_topics(topic_model, data, data_file_name_no_ext, low_resource_mode
494
 
495
  hierarchical_topics = hierarchical_topics_custom(topic_model, docs)
496
 
497
- # Print topic tree
498
- tree = topic_model.get_topic_tree(hierarchical_topics, tight_layout = True)
499
- tree_name = data_file_name_no_ext + '_' + 'vis_hierarchy_tree_' + today_rev + '.txt'
 
 
 
 
 
500
 
501
- with open(tree_name, "w") as file:
502
- # Write the string to the file
503
- file.write(tree)
 
504
 
505
- output_list.append(tree_name)
506
 
507
  # Save new hierarchical topic model to file
508
- hierarchical_topics_name = data_file_name_no_ext + '_' + 'vis_hierarchy_topics_distz_' + today_rev + '.csv'
509
- hierarchical_topics.to_csv(hierarchical_topics_name)
510
  output_list.append(hierarchical_topics_name)
511
 
512
 
@@ -516,12 +521,12 @@ def visualise_topics(topic_model, data, data_file_name_no_ext, low_resource_mode
516
 
517
  # Write hierarchical topics levels to df
518
  hierarchy_df_name = data_file_name_no_ext + '_' + 'hierarchy_topics_df_' + today_rev + '.csv'
519
- hierarchy_df.to_csv(hierarchy_df_name)
520
  output_list.append(hierarchy_df_name)
521
 
522
  # Write hierarchical topics names to df
523
  hierarchy_topic_names_name = data_file_name_no_ext + '_' + 'hierarchy_topics_names_' + today_rev + '.csv'
524
- hierarchy_topic_names.to_csv(hierarchy_topic_names_name)
525
  output_list.append(hierarchy_topic_names_name)
526
 
527
  #except:
 
494
 
495
  hierarchical_topics = hierarchical_topics_custom(topic_model, docs)
496
 
497
+ # Print topic tree - may get encoding errors, so doing try except
498
+ try:
499
+ tree = topic_model.get_topic_tree(hierarchical_topics, tight_layout = True)
500
+ tree_name = data_file_name_no_ext + '_' + 'vis_hierarchy_tree_' + today_rev + '.txt'
501
+
502
+ with open(tree_name, "w") as file:
503
+ # Write the string to the file
504
+ file.write(tree)
505
 
506
+ output_list.append(tree_name)
507
+
508
+ except Exception as error:
509
+ print("An exception occurred when making topic tree document, skipped:", error)
510
 
 
511
 
512
  # Save new hierarchical topic model to file
513
+ hierarchical_topics_name = data_file_name_no_ext + '_' + 'vis_hierarchy_topics_dist_' + today_rev + '.csv'
514
+ hierarchical_topics.to_csv(hierarchical_topics_name, index = None)
515
  output_list.append(hierarchical_topics_name)
516
 
517
 
 
521
 
522
  # Write hierarchical topics levels to df
523
  hierarchy_df_name = data_file_name_no_ext + '_' + 'hierarchy_topics_df_' + today_rev + '.csv'
524
+ hierarchy_df.to_csv(hierarchy_df_name, index = None)
525
  output_list.append(hierarchy_df_name)
526
 
527
  # Write hierarchical topics names to df
528
  hierarchy_topic_names_name = data_file_name_no_ext + '_' + 'hierarchy_topics_names_' + today_rev + '.csv'
529
+ hierarchy_topic_names.to_csv(hierarchy_topic_names_name, index = None)
530
  output_list.append(hierarchy_topic_names_name)
531
 
532
  #except: