Spaces:
Running
Running
Sean-Case
committed on
Commit
·
d80c8f5
1
Parent(s):
e1c1f68
Minor cleaning, csv formatting changes
Browse files
- funcs/clean_funcs.py +2 -16
- funcs/topic_core_funcs.py +16 -11
funcs/clean_funcs.py
CHANGED
@@ -8,32 +8,18 @@ custom_words = []
|
|
8 |
my_stop_words = custom_words
|
9 |
|
10 |
# #### Some of my cleaning functions
|
11 |
-
email_start_pattern_regex = r'.*(?i)importance:|.*(?i)subject:'
|
12 |
-
email_end_pattern_regex = r'(?i)kind regards.*|(?i)many thanks.*|(?i)sincerely.*'
|
13 |
html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0| '
|
|
|
14 |
email_pattern_regex = r'\S*@\S*\s?'
|
15 |
num_pattern_regex = r'[0-9]+'
|
16 |
nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
|
17 |
postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
|
18 |
-
warning_pattern_regex = r'(?i)caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.'
|
19 |
-
egress_pattern_regex = r'(?i)has been securely delivered by egress switch and was securely decoded on'
|
20 |
-
nbsp_pattern_regex = r' '
|
21 |
multiple_spaces_regex = r'\s{2,}'
|
22 |
|
23 |
-
# Pre-compiling the regular expressions for efficiency (not actually used)
|
24 |
-
# email_start_pattern = re.compile(email_start_pattern_regex)
|
25 |
-
# email_end_pattern = re.compile(email_end_pattern_regex)
|
26 |
-
# html_pattern = re.compile(html_pattern_regex)
|
27 |
-
# email_pattern = re.compile(email_end_pattern_regex)
|
28 |
-
# num_pattern = re.compile(num_pattern_regex)
|
29 |
-
# nums_two_more_regex_pattern = re.compile(nums_two_more_regex)
|
30 |
-
# postcode_pattern = re.compile(postcode_pattern_regex)
|
31 |
-
# warning_pattern = re.compile(warning_pattern_regex)
|
32 |
-
# nbsp_pattern = re.compile(nbsp_pattern_regex)
|
33 |
-
|
34 |
def initial_clean(texts, custom_regex, progress=gr.Progress()):
|
35 |
texts = pl.Series(texts).str.strip_chars()
|
36 |
text = texts.str.replace_all(html_pattern_regex, ' ')
|
|
|
37 |
text = text.str.replace_all(email_pattern_regex, ' ')
|
38 |
text = text.str.replace_all(nums_two_more_regex, ' ')
|
39 |
text = text.str.replace_all(postcode_pattern_regex, ' ')
|
|
|
8 |
my_stop_words = custom_words
|
9 |
|
10 |
# #### Some of my cleaning functions
|
|
|
|
|
11 |
html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0| '
|
12 |
+
html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
|
13 |
email_pattern_regex = r'\S*@\S*\s?'
|
14 |
num_pattern_regex = r'[0-9]+'
|
15 |
nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
|
16 |
postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
|
|
|
|
|
|
|
17 |
multiple_spaces_regex = r'\s{2,}'
|
18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
def initial_clean(texts, custom_regex, progress=gr.Progress()):
|
20 |
texts = pl.Series(texts).str.strip_chars()
|
21 |
text = texts.str.replace_all(html_pattern_regex, ' ')
|
22 |
+
text = text.str.replace_all(html_start_pattern_end_dots_regex, ' ')
|
23 |
text = text.str.replace_all(email_pattern_regex, ' ')
|
24 |
text = text.str.replace_all(nums_two_more_regex, ' ')
|
25 |
text = text.str.replace_all(postcode_pattern_regex, ' ')
|
funcs/topic_core_funcs.py
CHANGED
@@ -494,19 +494,24 @@ def visualise_topics(topic_model, data, data_file_name_no_ext, low_resource_mode
|
|
494 |
|
495 |
hierarchical_topics = hierarchical_topics_custom(topic_model, docs)
|
496 |
|
497 |
-
# Print topic tree
|
498 |
-
|
499 |
-
|
|
|
|
|
|
|
|
|
|
|
500 |
|
501 |
-
|
502 |
-
|
503 |
-
|
|
|
504 |
|
505 |
-
output_list.append(tree_name)
|
506 |
|
507 |
# Save new hierarchical topic model to file
|
508 |
-
hierarchical_topics_name = data_file_name_no_ext + '_' + '
|
509 |
-
hierarchical_topics.to_csv(hierarchical_topics_name)
|
510 |
output_list.append(hierarchical_topics_name)
|
511 |
|
512 |
|
@@ -516,12 +521,12 @@ def visualise_topics(topic_model, data, data_file_name_no_ext, low_resource_mode
|
|
516 |
|
517 |
# Write hierarchical topics levels to df
|
518 |
hierarchy_df_name = data_file_name_no_ext + '_' + 'hierarchy_topics_df_' + today_rev + '.csv'
|
519 |
-
hierarchy_df.to_csv(hierarchy_df_name)
|
520 |
output_list.append(hierarchy_df_name)
|
521 |
|
522 |
# Write hierarchical topics names to df
|
523 |
hierarchy_topic_names_name = data_file_name_no_ext + '_' + 'hierarchy_topics_names_' + today_rev + '.csv'
|
524 |
-
hierarchy_topic_names.to_csv(hierarchy_topic_names_name)
|
525 |
output_list.append(hierarchy_topic_names_name)
|
526 |
|
527 |
#except:
|
|
|
494 |
|
495 |
hierarchical_topics = hierarchical_topics_custom(topic_model, docs)
|
496 |
|
497 |
+
# Print topic tree - may get encoding errors, so doing try except
|
498 |
+
try:
|
499 |
+
tree = topic_model.get_topic_tree(hierarchical_topics, tight_layout = True)
|
500 |
+
tree_name = data_file_name_no_ext + '_' + 'vis_hierarchy_tree_' + today_rev + '.txt'
|
501 |
+
|
502 |
+
with open(tree_name, "w") as file:
|
503 |
+
# Write the string to the file
|
504 |
+
file.write(tree)
|
505 |
|
506 |
+
output_list.append(tree_name)
|
507 |
+
|
508 |
+
except Exception as error:
|
509 |
+
print("An exception occurred when making topic tree document, skipped:", error)
|
510 |
|
|
|
511 |
|
512 |
# Save new hierarchical topic model to file
|
513 |
+
hierarchical_topics_name = data_file_name_no_ext + '_' + 'vis_hierarchy_topics_dist_' + today_rev + '.csv'
|
514 |
+
hierarchical_topics.to_csv(hierarchical_topics_name, index = None)
|
515 |
output_list.append(hierarchical_topics_name)
|
516 |
|
517 |
|
|
|
521 |
|
522 |
# Write hierarchical topics levels to df
|
523 |
hierarchy_df_name = data_file_name_no_ext + '_' + 'hierarchy_topics_df_' + today_rev + '.csv'
|
524 |
+
hierarchy_df.to_csv(hierarchy_df_name, index = None)
|
525 |
output_list.append(hierarchy_df_name)
|
526 |
|
527 |
# Write hierarchical topics names to df
|
528 |
hierarchy_topic_names_name = data_file_name_no_ext + '_' + 'hierarchy_topics_names_' + today_rev + '.csv'
|
529 |
+
hierarchy_topic_names.to_csv(hierarchy_topic_names_name, index = None)
|
530 |
output_list.append(hierarchy_topic_names_name)
|
531 |
|
532 |
#except:
|