File size: 5,711 Bytes
a9c2120
 
 
 
ceb8617
a9c2120
 
 
99d6fba
a9c2120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ceb8617
99d6fba
ceb8617
 
 
 
 
 
 
 
 
a9c2120
ceb8617
 
 
 
 
 
 
 
a9c2120
99d6fba
a9c2120
99d6fba
 
 
a9c2120
99d6fba
 
 
 
a9c2120
99d6fba
 
 
a9c2120
99d6fba
 
 
 
 
 
a9c2120
99d6fba
 
 
 
 
a9c2120
99d6fba
a9c2120
 
99d6fba
 
 
 
 
 
a9c2120
99d6fba
 
 
 
 
ceb8617
a9c2120
ceb8617
a9c2120
ceb8617
 
 
 
 
 
 
 
a9c2120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99d6fba
a9c2120
 
 
 
 
99d6fba
a9c2120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# ## Some functions to clean text

import re
import string
import polars as pl

# Add calendar months onto stop words
import calendar
#from tqdm import tqdm
import gradio as gr

# Custom stop words. `my_stop_words` is deliberately the SAME list object as
# `custom_words`, so anything appended to one is visible through the other.
custom_words = []
my_stop_words = custom_words

# Lower-cased calendar month names. `calendar.month_name` starts with an
# empty placeholder entry at index 0, which is dropped here.
cal_month = [month.lower() for month in calendar.month_name if month]

# Treat month names as stop words.
custom_words.extend(cal_month)


# #### Some of my cleaning functions
# Raw regex sources. They are kept as plain strings as well as compiled
# patterns because `initial_clean` passes the strings straight to polars.
# NOTE(review): most patterns are written in lower case while the postcode
# pattern is upper case — presumably they run at different stages of the
# pipeline (before/after lower-casing); confirm against callers.
email_start_pattern_regex = r'.*importance:|.*subject:'
email_end_pattern_regex = r'kind regards.*|many thanks.*|sincerely.*'
html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
email_pattern_regex = r'\S*@\S*\s?'
num_pattern_regex = r'[0-9]+'
postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
# NOTE(review): the '.' characters in the warning text are unescaped, so they
# match any character rather than only a literal full stop.
warning_pattern_regex = r'caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.'
nbsp_pattern_regex = r'&nbsp;'

# Pre-compiling the regular expressions for efficiency
email_start_pattern = re.compile(email_start_pattern_regex)
email_end_pattern = re.compile(email_end_pattern_regex)
html_pattern = re.compile(html_pattern_regex)
# Compiled from the email-address regex. This was previously compiled from
# `email_end_pattern_regex`, which made it a duplicate of `email_end_pattern`
# that never matched addresses.
email_pattern = re.compile(email_pattern_regex)
num_pattern = re.compile(num_pattern_regex)
postcode_pattern = re.compile(postcode_pattern_regex)
warning_pattern = re.compile(warning_pattern_regex)
nbsp_pattern = re.compile(nbsp_pattern_regex)

# def stem_sentence(sentence):

#     words = sentence.split()
#     stemmed_words = [stemmer.stem(word).lower().rstrip("'") for word in words]
#     return stemmed_words

# def stem_sentences(sentences, progress=gr.Progress()):
#         """Stem each sentence in a list of sentences."""
#         stemmed_sentences = [stem_sentence(sentence) for sentence in progress.tqdm(sentences)]
#         return stemmed_sentences

# def get_lemma_text(text):
#     # Tokenize the input string into words
#     tokens = word_tokenize(text)
    
#     lemmas = []
#     for word in tokens:
#         if len(word) > 3:
#             lemma = wn.morphy(word)
#         else:
#             lemma = None
        
#         if lemma is None:
#             lemmas.append(word)
#         else:
#             lemmas.append(lemma)
#     return lemmas

# def get_lemma_tokens(tokens):
    # Tokenize the input string into words
    
    # lemmas = []
    # for word in tokens:
    #     if len(word) > 3:
    #         lemma = wn.morphy(word)
    #     else:
    #         lemma = None
        
    #     if lemma is None:
    #         lemmas.append(word)
    #     else:
    #         lemmas.append(lemma)
    # return lemmas

def initial_clean(texts, progress=gr.Progress()):
    """Strip email headers, sign-offs, HTML markup and email addresses.

    Args:
        texts: Values accepted by ``pl.Series`` (the string namespace is used,
            so they are expected to be strings).
        progress: Gradio progress tracker. It is not used inside this
            function; presumably kept so the Gradio UI can pass one in.

    Returns:
        A plain Python list with the cleaned texts, in input order.
    """
    cleaned = pl.Series(texts)

    # Each regex is applied to the output of the previous one, in this order,
    # replacing every match with the empty string.
    removal_patterns = (
        email_start_pattern_regex,
        email_end_pattern_regex,
        html_pattern_regex,
        email_pattern_regex,
    )
    for pattern in removal_patterns:
        cleaned = cleaned.str.replace_all(pattern, '')

    return cleaned.to_list()

def remove_hyphens(text_text):
    """Replace hyphens that join two word characters with a single space.

    Every intra-word hyphen is replaced, however many parts the compound has
    (``"a-b-c-d"`` -> ``"a b c d"``), and no extra whitespace is introduced
    (``"well-known"`` -> ``"well known"``). Hyphens that are not surrounded
    by word characters on both sides (leading, trailing or doubled hyphens)
    are left untouched.

    Args:
        text_text: The string to process.

    Returns:
        The string with intra-word hyphens replaced by spaces.
    """
    # Zero-width look-arounds keep the neighbouring characters out of the
    # match, so consecutive hyphenated parts are all handled in one pass.
    return re.sub(r'(?<=\w)-(?=\w)', ' ', text_text)


def remove_characters_after_tokenization(tokens):
    """Strip ASCII punctuation from each token and drop tokens left empty.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A lazy ``filter`` iterator over the non-empty stripped tokens. It can
        be consumed only once; wrap it in ``list()`` if it must be reused.
    """
    # Translation table that deletes every character in string.punctuation.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    # The stripped tokens are built eagerly; only the empty-token filtering
    # is deferred to iteration time.
    stripped_tokens = [tok.translate(punctuation_remover) for tok in tokens]
    return filter(None, stripped_tokens)

def convert_to_lowercase(tokens):
    """Lower-case the purely alphabetic tokens and discard all others.

    Despite the name, this also filters: any token for which
    ``str.isalpha()`` is false (digits, punctuation, mixed or empty strings)
    is dropped from the result.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A list of lower-cased alphabetic tokens, in input order.
    """
    lowered = []
    for tok in tokens:
        if not tok.isalpha():
            continue
        lowered.append(tok.lower())
    return lowered

def remove_short_tokens(tokens, min_length=4):
    """Keep only tokens that are at least ``min_length`` characters long.

    Args:
        tokens: Iterable of tokens supporting ``len()``.
        min_length: Minimum length a token needs in order to be kept. The
            default of 4 reproduces the previous fixed behaviour (tokens of
            three characters or fewer are removed).

    Returns:
        A list of the surviving tokens, in input order.
    """
    return [token for token in tokens if len(token) >= min_length]


def remove_dups_text(data_samples_ready, data_samples_clean, data_samples):
    """Remove every text that occurs more than once, across three lists.

    Duplicates are detected in ``data_samples_ready`` only. ALL occurrences of
    a duplicated value are removed (the first instance is not kept), and the
    same positions are removed from the other two lists.

    Side effect: the three input lists are modified in place by the index
    deletions. The blank filtering afterwards builds new lists, so the
    returned lists are not the same objects as the inputs.

    Args:
        data_samples_ready: Mutable list of hashable values used for duplicate
            detection.
        data_samples_clean: Mutable list indexed in parallel with
            ``data_samples_ready``.
        data_samples: Mutable list indexed in parallel with
            ``data_samples_ready``.

    Returns:
        A 4-tuple ``(data_samples_ready, data_samples_clean, flat_list_dups,
        data_samples)``. ``flat_list_dups`` holds the removed indices, which
        refer to positions in the lists as they were passed in, grouped by
        duplicated value in order of first appearance.

    Raises:
        IndexError: If ``data_samples_clean`` or ``data_samples`` is too short
            to contain one of the duplicate positions.
    """
    # Record every position at which each distinct text occurs.
    positions_by_text = {}
    for position, text in enumerate(data_samples_ready):
        positions_by_text.setdefault(text, []).append(position)

    # Flatten the positions of all values seen more than once.
    flat_list_dups = [
        position
        for positions in positions_by_text.values()
        if len(positions) > 1
        for position in positions
    ]

    # Delete from the highest index downwards so that earlier deletions do
    # not shift the positions still waiting to be removed.
    for index in sorted(flat_list_dups, reverse=True):
        del data_samples_ready[index]
        del data_samples_clean[index]
        del data_samples[index]

    # Remove blanks.
    # NOTE(review): each list is filtered independently, so an entry that is
    # blank in only one list leaves the three lists with different lengths
    # and no longer aligned by index — confirm callers do not rely on that.
    data_samples_ready = [i for i in data_samples_ready if i]
    data_samples_clean = [i for i in data_samples_clean if i]
    data_samples = [i for i in data_samples if i]

    return data_samples_ready, data_samples_clean, flat_list_dups, data_samples