Sean-Case committed
Commit a9c2120 · 1 Parent(s): 589448e

Initial commit

.gitignore ADDED
@@ -0,0 +1,16 @@
+ *.csv
+ *.pyc
+ *.cpython-311.pyc
+ *.cpython-310.pyc
+ *.bat
+ *.json
+ *.xlsx
+ *.parquet
+ *.json
+ *.bat
+ *.pkl
+ *.spec
+ *.ipynb
+ build/*
+ dist/*
+ __pycache__/*
README.md CHANGED
@@ -1,2 +1,13 @@
- # data_text_search
- Search for key terms in tabular text data
+ ---
+ title: Data text search
+ emoji: 🚀
+ colorFrom: green
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 3.50.0
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ ---
+
+ Adaptation of fast_bm25 (https://github.com/Inspirateur/Fast-BM25) to search over your data.
README_additions.md ADDED
@@ -0,0 +1 @@
+ Adaptation of fast_bm25 (https://github.com/Inspirateur/Fast-BM25) to your data.
data_text_search.py ADDED
@@ -0,0 +1,347 @@
+ from search_funcs.fast_bm25 import BM25
+ from search_funcs.clean_funcs import initial_clean, get_lemma_tokens #, stem_sentence
+ from nltk import word_tokenize
+
+ import gradio as gr
+ import pandas as pd
+ import os
+
+ def prepare_input_data(in_file, text_column, clean="No", progress=gr.Progress()):
+
+     filename = in_file.name
+     # Import data
+
+     df = read_file(filename)
+
+     #df = pd.read_parquet(file_in.name)
+     df_list = list(df[text_column].astype(str))
+     #df_list = df
+
+     if clean == "Yes":
+         df_list_clean = initial_clean(df_list)
+
+         # Save to file if you have cleaned the data
+         out_file_name = save_prepared_data(in_file, df_list_clean, df, text_column)
+
+         #corpus = [word_tokenize(doc.lower()) for doc in df_list_clean]
+         corpus = [word_tokenize(doc.lower()) for doc in progress.tqdm(df_list_clean, desc = "Tokenising text", unit = "rows")]
+
+     else:
+         #corpus = [word_tokenize(doc.lower()) for doc in df_list]
+         corpus = [word_tokenize(doc.lower()) for doc in progress.tqdm(df_list, desc = "Tokenising text", unit = "rows")]
+         out_file_name = None
+
+     print("Finished data clean")
+
+     if len(df_list) >= 20:
+         message = "Data loaded"
+     else:
+         message = "Data loaded. Warning: dataset may be too short to get consistent search results."
+
+     return corpus, message, df, out_file_name
+
+ def get_file_path_end(file_path):
+     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
+     basename = os.path.basename(file_path)
+
+     # Then, split the basename and its extension and return only the basename without the extension
+     filename_without_extension, _ = os.path.splitext(basename)
+
+     print(filename_without_extension)
+
+     return filename_without_extension
+
+ def save_prepared_data(in_file, prepared_text_list, in_df, in_column):
+
+     # Check if the list and the dataframe have the same length
+     if len(prepared_text_list) != len(in_df):
+         raise ValueError("The length of 'prepared_text_list' and 'in_df' must match.")
+
+     file_end = ".parquet"
+
+     file_name = get_file_path_end(in_file.name) + "_cleaned" + file_end
+
+     prepared_text_df = pd.DataFrame(data={in_column + "_cleaned":prepared_text_list})
+
+     # Drop original column from input file to reduce file size
+     in_df = in_df.drop(in_column, axis = 1)
+
+     prepared_df = pd.concat([in_df, prepared_text_df], axis = 1)
+
+     if file_end == ".csv":
+         prepared_df.to_csv(file_name)
+     elif file_end == ".parquet":
+         prepared_df.to_parquet(file_name)
+     else: file_name = None
+
+     return file_name
+
+ def prepare_bm25(corpus, k1=1.5, b = 0.75, alpha=-5):
+     #bm25.save("saved_df_bm25")
+     #bm25 = BM25.load(re.sub(r'\.pkl$', '', file_in.name))
+
+     print("Preparing BM25 corpus")
+
+     global bm25
+     bm25 = BM25(corpus, k1=k1, b=b, alpha=alpha)
+
+     message = "Search parameters loaded."
+
+     print(message)
+
+     return message
+
+ def convert_query_to_tokens(free_text_query, clean="No"):
+     '''
+     Split open text query into tokens and then lemmatise to get the core of the word
+     '''
+
+     if clean=="Yes":
+         split_query = word_tokenize(free_text_query.lower())
+         out_query = get_lemma_tokens(split_query)
+         #out_query = stem_sentence(free_text_query)
+     else:
+         split_query = word_tokenize(free_text_query.lower())
+         out_query = split_query
+
+     return out_query
+
+ def bm25_search(free_text_query, in_no_search_results, original_data, text_column, clean = "No", in_join_file = None, in_join_column = "", search_df_join_column = ""):
+
+     # Prepare query
+     if (clean == "Yes") | (text_column.endswith("_cleaned")):
+         token_query = convert_query_to_tokens(free_text_query, clean="Yes")
+     else:
+         token_query = convert_query_to_tokens(free_text_query, clean="No")
+
+     print(token_query)
+
+     # Perform search
+     print("Searching")
+
+     results_index, results_text, results_scores = bm25.extract_documents_and_scores(token_query, bm25.corpus, n=in_no_search_results) #bm25.corpus #original_data[text_column]
+     if not results_index:
+         return "No search results found", None, token_query
+
+     print("Search complete")
+
+     # Prepare results and export
+     joined_texts = [' '.join(inner_list) for inner_list in results_text]
+     results_df = pd.DataFrame(data={"index": results_index,
+                                     "search_text": joined_texts,
+                                     "search_score_abs": results_scores})
+     results_df['search_score_abs'] = abs(round(results_df['search_score_abs'], 2))
+     results_df_out = results_df[['index', 'search_text', 'search_score_abs']].merge(original_data, left_on="index", right_index=True, how="left")#.drop("index", axis=1)
+
+     # Join on additional files
+     if in_join_file:
+         join_filename = in_join_file.name
+
+         # Import data
+         join_df = read_file(join_filename)
+         join_df[in_join_column] = join_df[in_join_column].astype(str).str.replace("\.0$","", regex=True)
+         results_df_out[search_df_join_column] = results_df_out[search_df_join_column].astype(str).str.replace("\.0$","", regex=True)
+
+         results_df_out = results_df_out.merge(join_df, left_on=search_df_join_column, right_on=in_join_column, how="left").drop(in_join_column, axis=1)
+
+     # Reorder results by score
+     results_df_out = results_df_out.sort_values('search_score_abs', ascending=False)
+
+     # Out file
+     results_df_name = "search_result.csv"
+     results_df_out.to_csv(results_df_name, index= None)
+     results_first_text = results_df_out[text_column].iloc[0]
+
+     print("Returning results")
+
+     return results_first_text, results_df_name, token_query
+
+ def detect_file_type(filename):
+     """Detect the file type based on its extension."""
+     if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
+         return 'csv'
+     elif filename.endswith('.xlsx'):
+         return 'xlsx'
+     elif filename.endswith('.parquet'):
+         return 'parquet'
+     else:
+         raise ValueError("Unsupported file type.")
+
+ def read_file(filename):
+     """Read the file based on its detected type."""
+     file_type = detect_file_type(filename)
+
+     if file_type == 'csv':
+         return pd.read_csv(filename, low_memory=False).reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
+     elif file_type == 'xlsx':
+         return pd.read_excel(filename).reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
+     elif file_type == 'parquet':
+         return pd.read_parquet(filename).reset_index().drop(["index", "Unnamed: 0"], axis=1, errors="ignore")
+
+ def put_columns_in_df(in_file, in_column):
+     '''
+     When file is loaded, update the column dropdown choices and change 'clean data' dropdown option to 'no'.
+     '''
+
+     new_choices = []
+     concat_choices = []
+
+     df = read_file(in_file.name)
+     new_choices = list(df.columns)
+
+     print(new_choices)
+
+     concat_choices.extend(new_choices)
+
+     return gr.Dropdown(choices=concat_choices), gr.Dropdown(value="No", choices = ["Yes", "No"]),\
+            gr.Dropdown(choices=concat_choices)
+
+ def put_columns_in_join_df(in_file, in_column):
+     '''
+     When file is loaded, update the column dropdown choices and change 'clean data' dropdown option to 'no'.
+     '''
+
+     print("in_column")
+
+     new_choices = []
+     concat_choices = []
+
+     df = read_file(in_file.name)
+     new_choices = list(df.columns)
+
+     print(new_choices)
+
+     concat_choices.extend(new_choices)
+
+     return gr.Dropdown(choices=concat_choices)
+
+ def dummy_function(gradio_component):
+     """
+     A dummy function that exists just so that dropdown updates work correctly.
+     """
+     return None
+
+ def display_info(info_component):
+     gr.Info(info_component)
+ # %%
+ # ## Gradio app - BM25 search
+ block = gr.Blocks(theme = gr.themes.Base())
+
+ with block:
+
+     corpus_state = gr.State()
+     data_state = gr.State(pd.DataFrame())
+
+     in_k1_info = gr.State("""k1: Constant used for influencing the term frequency saturation. After saturation is reached, additional
+         presence for the term adds a significantly less additional score. According to [1]_, experiments suggest
+         that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
+         the type of documents or queries. Information taken from https://github.com/Inspirateur/Fast-BM25""")
+     in_b_info = gr.State("""b: Constant used for influencing the effects of different document lengths relative to average document length.
+         When b is bigger, lengthier documents (compared to average) have more impact on its effect. According to
+         [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
+         depends on factors such as the type of documents or queries. Information taken from https://github.com/Inspirateur/Fast-BM25""")
+     in_alpha_info = gr.State("""alpha: IDF cutoff, terms with a lower idf score than alpha will be dropped. A higher alpha will lower the accuracy of BM25 but increase performance. Information taken from https://github.com/Inspirateur/Fast-BM25""")
+     in_no_search_info = gr.State("""Search results number: Maximum number of search results that will be returned. Bear in mind that if the alpha value is greater than the minimum, common words will be removed from the dataset, and so the number of search results returned may be lower than this value.""")
+     in_clean_info = gr.State("""Clean text: Clean the input text and search query. The function will try to remove email components and tags, and then will 'stem' the words. I.e. it will remove the endings of words (e.g. smashed becomes smash) so that the search engine is looking for the common 'core' of words between the query and dataset.""")
+
+     gr.Markdown(
+     """
+     # Fast text search
+     Enter a text query below to search through a text data column and find relevant entries. Your data should contain at least 20 entries for the search to return results.
+     """)
+
+     with gr.Tab(label="Search your data"):
+         with gr.Accordion(label = "Load in data", open=True):
+             in_corpus = gr.File(label="Upload your search data here")
+             with gr.Row():
+                 in_column = gr.Dropdown(label="Enter the name of the text column in the data file to search")
+
+             load_data_button = gr.Button(value="Load data")
+
+             with gr.Row():
+                 load_finished_message = gr.Textbox(label="Load progress", scale = 2)
+
+         with gr.Accordion(label = "Search data", open=True):
+             with gr.Row():
+                 in_query = gr.Textbox(label="Enter your search term")
+                 mod_query = gr.Textbox(label="Cleaned search term (the terms that are passed to the search engine)")
+
+             search_button = gr.Button(value="Search text")
+
+             with gr.Row():
+                 output_single_text = gr.Textbox(label="Top result")
+                 output_file = gr.File(label="File output")
+
+     with gr.Tab(label="Advanced options"):
+         with gr.Accordion(label="Data load / save options", open = False):
+             #with gr.Row():
+             in_clean_data = gr.Dropdown(label = "Clean text during load (remove tags, stem words). This will take some time!", value="No", choices=["Yes", "No"])
+             #save_clean_data_button = gr.Button(value = "Save loaded data to file", scale = 1)
+         with gr.Accordion(label="Search options", open = False):
+             with gr.Row():
+                 in_k1 = gr.Slider(label = "k1 value", value = 1.5, minimum = 0.1, maximum = 5, step = 0.1, scale = 3)
+                 in_k1_button = gr.Button(value = "k1 value info", scale = 1)
+             with gr.Row():
+                 in_b = gr.Slider(label = "b value", value = 0.75, minimum = 0.1, maximum = 5, step = 0.05, scale = 3)
+                 in_b_button = gr.Button(value = "b value info", scale = 1)
+             with gr.Row():
+                 in_alpha = gr.Slider(label = "alpha value / IDF cutoff", value = -5, minimum = -5, maximum = 10, step = 1, scale = 3)
+                 in_alpha_button = gr.Button(value = "alpha value info", scale = 1)
+             with gr.Row():
+                 in_no_search_results = gr.Slider(label="Maximum number of search results to return", value = 100000, minimum=10, maximum=100000, step=10, scale = 3)
+                 in_no_search_results_button = gr.Button(value = "Search results number info", scale = 1)
+             with gr.Row():
+                 in_search_param_button = gr.Button(value="Load search parameters (Need to click this if you changed anything above)")
+         with gr.Accordion(label = "Join on additional dataframes to results", open = False):
+             in_join_file = gr.File(label="Upload your data to join here")
+             in_join_column = gr.Dropdown(label="Column to join in new data frame")
+             search_df_join_column = gr.Dropdown(label="Column to join in search data frame")
+
+     in_search_param_button.click(fn=prepare_bm25, inputs=[corpus_state, in_k1, in_b, in_alpha], outputs=[load_finished_message])
+
+     # ---
+     in_k1_button.click(display_info, inputs=in_k1_info)
+     in_b_button.click(display_info, inputs=in_b_info)
+     in_alpha_button.click(display_info, inputs=in_alpha_info)
+     in_no_search_results_button.click(display_info, inputs=in_no_search_info)
+
+     in_corpus.upload(put_columns_in_df, inputs=[in_corpus, in_column], outputs=[in_column, in_clean_data, search_df_join_column])
+     in_join_file.upload(put_columns_in_join_df, inputs=[in_join_file, in_join_column], outputs=[in_join_column])
+
+     # Load in the data
+     load_data_button.click(fn=prepare_input_data, inputs=[in_corpus, in_column, in_clean_data], outputs=[corpus_state, load_finished_message, data_state, output_file]).\
+         then(fn=prepare_bm25, inputs=[corpus_state, in_k1, in_b, in_alpha], outputs=[load_finished_message]).\
+         then(fn=put_columns_in_df, inputs=[in_corpus, in_column], outputs=[in_column, in_clean_data, search_df_join_column])
+
+     #save_clean_data_button.click(fn=save_prepared_data, inputs=[in_corpus, corpus_state, data_state, in_column], outputs=[output_file])
+
+     # Search functions on click or enter
+     search_button.click(fn=bm25_search, inputs=[in_query, in_no_search_results, data_state, in_column, in_clean_data, in_join_file, in_join_column, search_df_join_column],
+                         outputs=[output_single_text, output_file, mod_query], api_name="search")
+
+     in_query.submit(fn=bm25_search, inputs=[in_query, in_no_search_results, data_state, in_column, in_clean_data, in_join_file, in_join_column, search_df_join_column],
+                     outputs=[output_single_text, output_file, mod_query])
+
+     # Dummy functions just to get dropdowns to work correctly with Gradio 3.50
+     in_column.change(dummy_function, in_column, None)
+     search_df_join_column.change(dummy_function, search_df_join_column, None)
+     in_join_column.change(dummy_function, in_join_column, None)
+
+ block.queue().launch(debug=True)
+
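For context on the search flow above: `bm25_search` builds a small results dataframe from the indices, texts and scores returned by `extract_documents_and_scores`, merges it back onto the uploaded data by row index, and then optionally left-joins a second uploaded file on a chosen key column. The following is a minimal, self-contained sketch of that merge logic (not part of the commit; the toy data, column names and values are invented for illustration):

```python
# Illustrative sketch of the merge/join step in bm25_search (toy data only)
import pandas as pd

# Pretend this is the uploaded search data (three rows)
original_data = pd.DataFrame({"case_note": ["dog bites man", "man bites dog", "quiet day"],
                              "case_id": ["101", "102", "103"]})

# Pretend BM25 matched rows 1 and 0 with these scores
results_df = pd.DataFrame({"index": [1, 0],
                           "search_text": ["man bites dog", "dog bites man"],
                           "search_score_abs": [2.31, 1.87]})

# Merge the hits back onto the original rows by row index, as bm25_search does
results_df_out = results_df.merge(original_data, left_on="index", right_index=True, how="left")

# Optional join of a second table on a key column (the in_join_file path in the app)
join_df = pd.DataFrame({"case_id": ["101", "102"], "team": ["North", "South"]})
results_df_out = results_df_out.merge(join_df, on="case_id", how="left")

print(results_df_out.sort_values("search_score_abs", ascending=False))
```

The real function additionally strips trailing ".0" from the join keys and writes the sorted results to search_result.csv for download.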
hook-gradio.py ADDED
@@ -0,0 +1,9 @@
+ from PyInstaller.utils.hooks import collect_data_files
+
+ hiddenimports = [
+     'gradio',
+     # Add any other submodules that PyInstaller doesn't detect
+ ]
+
+ # Use collect_data_files to find data files. Replace 'gradio' with the correct package name if it's different.
+ datas = collect_data_files('gradio')
how_to_create_exe_dist.txt ADDED
@@ -0,0 +1,21 @@
+ 1. Create a minimal conda environment to run the app, e.g. 'conda create --name new_env'
+
+ 2. Activate the environment: 'conda activate new_env'
+
+ 3. cd to this folder. Install packages from requirements.txt using 'pip install -r requirements.txt'
+
+ 4. In file explorer, navigate to the miniconda/envs/new_env/Lib/site-packages/gradio_client/ folder
+
+ 5. Copy types.json from the gradio_client folder to the folder containing the data_text_search.py file
+
+ 6. pip install pyinstaller
+
+ 7. In the command line, cd to this folder. Then run: 'python -m PyInstaller --additional-hooks-dir=. --hidden-import pyarrow.vendored.version --add-data="types.json;gradio_client" --clean --onefile --name DataSearchApp data_text_search.py'
+
+ 8. A 'dist' folder will be created with the executable inside, along with all dependencies ('dist\data_text_search').
+
+ 9. In file explorer, navigate to the miniconda/envs/new_env/Lib/site-packages/gradio/ folder. Copy the entire folder. Paste it into the new distributable subfolder 'dist\data_text_search\_internal'
+
+ 10. In 'dist\data_text_search', try double-clicking on the .exe file. After a short delay, the command prompt should report the IP address of the app that is now running. Copy the IP address, but do not close this window.
+
+ 11. In an internet browser, navigate to the indicated IP address. The app should now be running in your browser window.
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ pandas
+ nltk
+ pyarrow
+ openpyxl
+ gradio==3.50.0
search_funcs/__init__.py ADDED
File without changes
search_funcs/clean_funcs.py ADDED
@@ -0,0 +1,350 @@
+ # ## Some functions to clean text
+
+ # ### Some other suggested cleaning approaches
+ #
+ # #### From here: https://shravan-kuchkula.github.io/topic-modeling/#interactive-plot-showing-results-of-k-means-clustering-lda-topic-modeling-and-sentiment-analysis
+ #
+ # - remove_hyphens
+ # - tokenize_text
+ # - remove_special_characters
+ # - convert to lower case
+ # - remove stopwords
+ # - lemmatize the token
+ # - remove short tokens
+ # - keep only words in wordnet
+ # - I ADDED ON - creating custom stopwords list
+
+ # +
+ # Create a custom stop words list
+ import nltk
+ import re
+ import string
+ from nltk.stem import WordNetLemmatizer
+ from nltk.stem import PorterStemmer
+ from nltk.corpus import wordnet as wn
+ from nltk import word_tokenize
+
+ # Add calendar months onto stop words
+ import calendar
+ from tqdm import tqdm
+ import gradio as gr
+
+ stemmer = PorterStemmer()
+
+ nltk.download('stopwords')
+ nltk.download('wordnet')
+
+ #nltk.download('words')
+ #nltk.download('names')
+
+ #nltk.corpus.words.words('en')
+
+ #from sklearn.feature_extraction import text
+ # Adding common names to stopwords
+
+ all_names = [x.lower() for x in list(nltk.corpus.names.words())]
+
+ # Adding custom words to the stopwords
+ custom_words = []
+ my_stop_words = custom_words
+
+ cal_month = (list(calendar.month_name))
+ cal_month = [x.lower() for x in cal_month]
+
+ # Remove blanks
+ cal_month = [i for i in cal_month if i]
+ #print(cal_month)
+ custom_words.extend(cal_month)
+
+ #my_stop_words = frozenset(text.ENGLISH_STOP_WORDS.union(custom_words).union(all_names))
+ #custom_stopwords = my_stop_words
+ # -
+
+ # #### Some of my cleaning functions
+ '''
+ # +
+ # Remove all html elements from the text. Inspired by this: https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
+
+ def remove_email_start(text):
+     cleanr = re.compile('.*importance:|.*subject:')
+     cleantext = re.sub(cleanr, '', text)
+     return cleantext
+
+ def remove_email_end(text):
+     cleanr = re.compile('kind regards.*|many thanks.*|sincerely.*')
+     cleantext = re.sub(cleanr, '', text)
+     return cleantext
+
+ def cleanhtml(text):
+     cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0')
+     cleantext = re.sub(cleanr, '', text)
+     return cleantext
+
+ ## The above doesn't work when there is no > at the end of the string to match the initial <. Trying this: <[^>]+> but needs work: https://stackoverflow.com/questions/2013124/regex-matching-up-to-the-first-occurrence-of-a-character
+
+ # Remove all email addresses and numbers from the text
+
+ def cleanemail(text):
+     cleanr = re.compile('\S*@\S*\s?|\xa0')
+     cleantext = re.sub(cleanr, '', text)
+     return cleantext
+
+ def cleannum(text):
+     cleanr = re.compile(r'[0-9]+')
+     cleantext = re.sub(cleanr, '', text)
+     return cleantext
+
+ def cleanpostcode(text):
+     cleanr = re.compile(r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)')
+     cleantext = re.sub(cleanr, '', text)
+     return cleantext
+
+ def cleanwarning(text):
+     cleanr = re.compile('caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.')
+     cleantext = re.sub(cleanr, '', text)
+     return cleantext
+
+ # -
+
+ def initial_clean(texts):
+     clean_texts = []
+     for text in texts:
+         text = remove_email_start(text)
+         text = remove_email_end(text)
+         text = cleanpostcode(text)
+         text = remove_hyphens(text)
+         text = cleanhtml(text)
+         text = cleanemail(text)
+         #text = cleannum(text)
+         clean_texts.append(text)
+     return clean_texts
+ '''
+ # Pre-compiling the regular expressions for efficiency
+ email_start_pattern = re.compile('.*importance:|.*subject:')
+ email_end_pattern = re.compile('kind regards.*|many thanks.*|sincerely.*')
+ html_pattern = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0')
+ email_pattern = re.compile('\S*@\S*\s?')
+ num_pattern = re.compile(r'[0-9]+')
+ postcode_pattern = re.compile(r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)')
+ warning_pattern = re.compile('caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.')
+ nbsp_pattern = re.compile(r'&nbsp;')
+
+ def stem_sentence(sentence):
+
+     words = sentence.split()
+     stemmed_words = [stemmer.stem(word).lower().rstrip("'") for word in words]
+     return stemmed_words
+
+ def stem_sentences(sentences, progress=gr.Progress()):
+     """Stem each sentence in a list of sentences."""
+     stemmed_sentences = [stem_sentence(sentence) for sentence in progress.tqdm(sentences)]
+     return stemmed_sentences
+
+ def get_lemma_text(text):
+     # Tokenize the input string into words
+     tokens = word_tokenize(text)
+
+     lemmas = []
+     for word in tokens:
+         if len(word) > 3:
+             lemma = wn.morphy(word)
+         else:
+             lemma = None
+
+         if lemma is None:
+             lemmas.append(word)
+         else:
+             lemmas.append(lemma)
+     return lemmas
+
+ def get_lemma_tokens(tokens):
+     # Tokenize the input string into words
+
+     lemmas = []
+     for word in tokens:
+         if len(word) > 3:
+             lemma = wn.morphy(word)
+         else:
+             lemma = None
+
+         if lemma is None:
+             lemmas.append(word)
+         else:
+             lemmas.append(lemma)
+     return lemmas
+
+ def initial_clean(texts , progress=gr.Progress()):
+     clean_texts = []
+
+     i = 1
+     #progress(0, desc="Cleaning texts")
+     for text in progress.tqdm(texts, desc = "Cleaning data", unit = "rows"):
+         #print("Cleaning row: ", i)
+         text = re.sub(email_start_pattern, '', text)
+         text = re.sub(email_end_pattern, '', text)
+         text = re.sub(postcode_pattern, '', text)
+         text = remove_hyphens(text)
+         text = re.sub(html_pattern, '', text)
+         text = re.sub(email_pattern, '', text)
+         text = re.sub(nbsp_pattern, '', text)
+         #text = re.sub(warning_pattern, '', text)
+         #text = stem_sentence(text)
+         text = get_lemma_text(text)
+         text = ' '.join(text)
+         # Uncomment the next line if you want to remove numbers as well
+         # text = re.sub(num_pattern, '', text)
+         clean_texts.append(text)
+
+         i += 1
+     return clean_texts
+
+ # Sample execution
+ #sample_texts = [
+ #    "Hello, this is a test email. kind regards, John",
+ #    "<div>Email content here</div> many thanks, Jane",
+ #    "caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.",
+
+ #    "Address: 1234 Elm St, AB12 3CD"
+ #]
+
+ #initial_clean(sample_texts)
+
+ # +
+
+ all_names = [x.lower() for x in list(nltk.corpus.names.words())]
+
+ def remove_hyphens(text_text):
+     return re.sub(r'(\w+)-(\w+)-?(\w)?', r'\1 \2 \3', text_text)
+
+ # tokenize text
+ def tokenize_text(text_text):
+     TOKEN_PATTERN = r'\s+'
+     regex_wt = nltk.RegexpTokenizer(pattern=TOKEN_PATTERN, gaps=True)
+     word_tokens = regex_wt.tokenize(text_text)
+     return word_tokens
+
+ def remove_characters_after_tokenization(tokens):
+     pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
+     filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
+     return filtered_tokens
+
+ def convert_to_lowercase(tokens):
+     return [token.lower() for token in tokens if token.isalpha()]
+
+ def remove_stopwords(tokens, custom_stopwords):
+     stopword_list = nltk.corpus.stopwords.words('english')
+     stopword_list += my_stop_words
+     filtered_tokens = [token for token in tokens if token not in stopword_list]
+     return filtered_tokens
+
+ def remove_names(tokens):
+     stopword_list = list(nltk.corpus.names.words())
+     stopword_list = [x.lower() for x in stopword_list]
+     filtered_tokens = [token for token in tokens if token not in stopword_list]
+     return filtered_tokens
+
+ def remove_short_tokens(tokens):
+     return [token for token in tokens if len(token) > 3]
+
+ def keep_only_words_in_wordnet(tokens):
+     return [token for token in tokens if wn.synsets(token)]
+
+ def apply_lemmatize(tokens, wnl=WordNetLemmatizer()):
+
+     def lem_word(word):
+
+         if len(word) > 3: out_word = wnl.lemmatize(word)
+         else: out_word = word
+
+         return out_word
+
+     return [lem_word(token) for token in tokens]
+
+ # +
+ ### Do the cleaning
+
+ def cleanTexttexts(texts):
+     clean_texts = []
+     for text in texts:
+         #text = remove_email_start(text)
+         #text = remove_email_end(text)
+         text = remove_hyphens(text)
+         text = cleanhtml(text)
+         text = cleanemail(text)
+         text = cleanpostcode(text)
+         text = cleannum(text)
+         #text = cleanwarning(text)
+         text_i = tokenize_text(text)
+         text_i = remove_characters_after_tokenization(text_i)
+         #text_i = remove_names(text_i)
+         text_i = convert_to_lowercase(text_i)
+         #text_i = remove_stopwords(text_i, my_stop_words)
+         text_i = get_lemma(text_i)
+         #text_i = remove_short_tokens(text_i)
+         text_i = keep_only_words_in_wordnet(text_i)
+
+         text_i = apply_lemmatize(text_i)
+         clean_texts.append(text_i)
+     return clean_texts
+
+ # -
+
+ def remove_dups_text(data_samples_ready, data_samples_clean, data_samples):
+     # Identify duplicates in the data: https://stackoverflow.com/questions/44191465/efficiently-identify-duplicates-in-large-list-500-000
+     # Only identifies the second duplicate
+
+     seen = set()
+     dupes = []
+
+     for i, doi in enumerate(data_samples_ready):
+         if doi not in seen:
+             seen.add(doi)
+         else:
+             dupes.append(i)
+     #data_samples_ready[dupes[0:]]
+
+     # To see a specific duplicated value you know the position of
+     #matching = [s for s in data_samples_ready if data_samples_ready[83] in s]
+     #matching
+
+     # Remove duplicates only (keep first instance)
+     #data_samples_ready = list( dict.fromkeys(data_samples_ready) ) # This way would keep one version of the duplicates
+
+     ### Remove all duplicates including original instance
+
+     # Identify ALL duplicates including initial values
+     # https://stackoverflow.com/questions/11236006/identify-duplicate-values-in-a-list-in-python
+
+     from collections import defaultdict
+     D = defaultdict(list)
+     for i,item in enumerate(data_samples_ready):
+         D[item].append(i)
+     D = {k:v for k,v in D.items() if len(v)>1}
+
+     # https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists
+     L = list(D.values())
+     flat_list_dups = [item for sublist in L for item in sublist]
+
+     # https://stackoverflow.com/questions/11303225/how-to-remove-multiple-indexes-from-a-list-at-the-same-time
+     for index in sorted(flat_list_dups, reverse=True):
+         del data_samples_ready[index]
+         del data_samples_clean[index]
+         del data_samples[index]
+
+     # Remove blanks
+     data_samples_ready = [i for i in data_samples_ready if i]
+     data_samples_clean = [i for i in data_samples_clean if i]
+     data_samples = [i for i in data_samples if i]
+
+     return data_samples_ready, data_samples_clean, flat_list_dups, data_samples
+
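To see what a few of the pre-compiled patterns used by `initial_clean` above actually do, here is a standalone sketch (not part of the commit; the sample string and the expected output comment are invented for illustration, and only a subset of the patterns is applied):

```python
# Standalone sketch of a few of the cleaning patterns defined in clean_funcs.py
import re

email_start_pattern = re.compile('.*importance:|.*subject:')
email_end_pattern = re.compile('kind regards.*|many thanks.*|sincerely.*')
html_pattern = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0')
email_pattern = re.compile('\S*@\S*\s?')

text = "subject: meeting <b>notes</b> from jane.doe@example.com kind regards, Jane"
text = re.sub(email_start_pattern, '', text)   # drops everything up to 'subject:'
text = re.sub(email_end_pattern, '', text)     # drops the sign-off onwards
text = re.sub(html_pattern, '', text)          # strips HTML tags and entities
text = re.sub(email_pattern, '', text)         # removes the email address
print(text.strip())                            # -> "meeting notes from"
```

The full `initial_clean` additionally removes postcodes and hyphens, lemmatises the remaining words with `get_lemma_text`, and rejoins them into a single string per row.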
search_funcs/fast_bm25.py ADDED
@@ -0,0 +1,198 @@
+ import collections
+ import heapq
+ import math
+ import pickle
+ import sys
+ from numpy import inf
+ import gradio as gr
+
+ PARAM_K1 = 1.5
+ PARAM_B = 0.75
+ IDF_CUTOFF = -inf
+
+ # Built off https://github.com/Inspirateur/Fast-BM25
+
+ class BM25:
+     """Fast Implementation of Best Matching 25 ranking function.
+
+     Attributes
+     ----------
+     t2d : <token: <doc, freq>>
+         Dictionary with terms frequencies for each document in `corpus`.
+     idf: <token, idf score>
+         Pre computed IDF score for every term.
+     doc_len : list of int
+         List of document lengths.
+     avgdl : float
+         Average length of document in `corpus`.
+     """
+     def __init__(self, corpus, k1=PARAM_K1, b=PARAM_B, alpha=IDF_CUTOFF):
+         """
+         Parameters
+         ----------
+         corpus : list of list of str
+             Given corpus.
+         k1 : float
+             Constant used for influencing the term frequency saturation. After saturation is reached, additional
+             presence for the term adds a significantly less additional score. According to [1]_, experiments suggest
+             that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
+             the type of documents or queries.
+         b : float
+             Constant used for influencing the effects of different document lengths relative to average document length.
+             When b is bigger, lengthier documents (compared to average) have more impact on its effect. According to
+             [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
+             depends on factors such as the type of documents or queries.
+         alpha: float
+             IDF cutoff, terms with a lower idf score than alpha will be dropped. A higher alpha will lower the accuracy
+             of BM25 but increase performance
+         """
+         self.k1 = k1
+         self.b = b
+         self.alpha = alpha
+         self.corpus = corpus
+
+         self.avgdl = 0
+         self.t2d = {}
+         self.idf = {}
+         self.doc_len = []
+         if corpus:
+             self._initialize(corpus)
+
+     @property
+     def corpus_size(self):
+         return len(self.doc_len)
+
+     def _initialize(self, corpus, progress=gr.Progress()):
+         """Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies."""
+         i = 0
+         for document in progress.tqdm(corpus, desc = "Preparing search index", unit = "rows"):
+             self.doc_len.append(len(document))
+
+             for word in document:
+                 if word not in self.t2d:
+                     self.t2d[word] = {}
+                 if i not in self.t2d[word]:
+                     self.t2d[word][i] = 0
+                 self.t2d[word][i] += 1
+             i += 1
+
+         self.avgdl = sum(self.doc_len)/len(self.doc_len)
+         to_delete = []
+         for word, docs in self.t2d.items():
+             idf = math.log(self.corpus_size - len(docs) + 0.5) - math.log(len(docs) + 0.5)
+             # only store the idf score if it's above the threshold
+             if idf > self.alpha:
+                 self.idf[word] = idf
+             else:
+                 to_delete.append(word)
+         print(f"Dropping {len(to_delete)} terms")
+         for word in to_delete:
+             del self.t2d[word]
+
+         if len(self.idf) == 0:
+             print("Alpha value too high - all words removed from dataset.")
+             self.average_idf = 0
+
+         else:
+             self.average_idf = sum(self.idf.values())/len(self.idf)
+
+         if self.average_idf < 0:
+             print(
+                 f'Average inverse document frequency is less than zero. Your corpus of {self.corpus_size} documents'
+                 ' is either too small or it does not originate from natural text. BM25 may produce'
+                 ' unintuitive results.',
+                 file=sys.stderr
+             )
+
+     def get_top_n(self, query, documents, n=5):
+         """
+         Retrieve the top n documents for the query.
+
+         Parameters
+         ----------
+         query: list of str
+             The tokenized query
+         documents: list
+             The documents to return from
+         n: int
+             The number of documents to return
+
+         Returns
+         -------
+         list
+             The top n documents
+         """
+         assert self.corpus_size == len(documents), "The documents given don't match the index corpus!"
+         scores = collections.defaultdict(float)
+         for token in query:
+             if token in self.t2d:
+                 for index, freq in self.t2d[token].items():
+                     denom_cst = self.k1 * (1 - self.b + self.b * self.doc_len[index] / self.avgdl)
+                     scores[index] += self.idf[token]*freq*(self.k1 + 1)/(freq + denom_cst)
+
+         return [documents[i] for i in heapq.nlargest(n, scores.keys(), key=scores.__getitem__)]
+
+     def get_top_n_with_score(self, query, documents, n=5):
+         """
+         Retrieve the top n documents for the query along with their scores.
+
+         Parameters
+         ----------
+         query: list of str
+             The tokenized query
+         documents: list
+             The documents to return from
+         n: int
+             The number of documents to return
+
+         Returns
+         -------
+         list
+             The top n documents along with their scores and row indices in the format (index, document, score)
+         """
+         assert self.corpus_size == len(documents), "The documents given don't match the index corpus!"
+         scores = collections.defaultdict(float)
+         for token in query:
+             if token in self.t2d:
+                 for index, freq in self.t2d[token].items():
+                     denom_cst = self.k1 * (1 - self.b + self.b * self.doc_len[index] / self.avgdl)
+                     scores[index] += self.idf[token] * freq * (self.k1 + 1) / (freq + denom_cst)
+
+         top_n_indices = heapq.nlargest(n, scores.keys(), key=scores.__getitem__)
+         return [(i, documents[i], scores[i]) for i in top_n_indices]
+
+     def extract_documents_and_scores(self, query, documents, n=5):
+         """
+         Extract top n documents and their scores into separate lists.
+
+         Parameters
+         ----------
+         query: list of str
+             The tokenized query
+         documents: list
+             The documents to return from
+         n: int
+             The number of documents to return
+
+         Returns
+         -------
+         tuple: (list, list)
+             The first list contains the top n documents and the second list contains their scores.
+         """
+         results = self.get_top_n_with_score(query, documents, n)
+         try:
+             indices, docs, scores = zip(*results)
+         except:
+             print("No search results returned")
+             return [], [], []
+         return list(indices), docs, list(scores)
+
+     def save(self, filename):
+         with open(f"{filename}.pkl", "wb") as fsave:
+             pickle.dump(self, fsave, protocol=pickle.HIGHEST_PROTOCOL)
+
+     @staticmethod
+     def load(filename):
+         with open(f"{filename}.pkl", "rb") as fsave:
+             return pickle.load(fsave)
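The scoring in `get_top_n` / `get_top_n_with_score` above is standard BM25: per term, idf(t) = ln(N - df + 0.5) - ln(df + 0.5), and each matching document gains idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avgdl)). A small self-contained sketch of the same arithmetic on a toy corpus (illustrative only, not part of the commit; the toy documents and query are invented) might look like this:

```python
# Toy re-derivation of the BM25 arithmetic used by the BM25 class above
import math
import collections

corpus = [["dog", "bites", "man"], ["man", "bites", "dog", "hard"], ["quiet", "day"]]
k1, b = 1.5, 0.75
N = len(corpus)
doc_len = [len(d) for d in corpus]
avgdl = sum(doc_len) / N

# Document frequency of each term
df = collections.Counter(term for doc in corpus for term in set(doc))

def idf(term):
    # Same formula as _initialize: log(N - df + 0.5) - log(df + 0.5)
    return math.log(N - df[term] + 0.5) - math.log(df[term] + 0.5)

def score(query, doc_index):
    doc = corpus[doc_index]
    s = 0.0
    for term in query:
        tf = doc.count(term)
        if tf == 0:
            continue
        denom = tf + k1 * (1 - b + b * doc_len[doc_index] / avgdl)
        s += idf(term) * tf * (k1 + 1) / denom
    return s

query = ["dog", "bites"]
for i in range(N):
    print(i, round(score(query, i), 3))
```

On a corpus this tiny the idf values come out negative, which is one reason the Gradio app reports `abs()` of the score and why `_initialize` warns when the average idf is below zero.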
search_funcs/ingest_text.py ADDED
@@ -0,0 +1,33 @@
+ # %%
+ import pandas as pd
+ import csv
+
+ # %%
+ # Define your file paths
+ file_dir = "../"
+ extracted_file_path = file_dir + "2022_08_case_notes.txt"
+ parquet_file_path = file_dir + "2022_08_case_notes.parquet"
+
+ # %%
+ # Read the TXT file using the csv module and convert to DataFrame
+ csv.field_size_limit(1000000) # set to a higher value
+
+ data_list = []
+ with open(extracted_file_path, mode='r', encoding='iso-8859-1') as file:
+     csv_reader = csv.reader(file, delimiter=',') # Change the delimiter if needed
+     for row in csv_reader:
+         data_list.append(row)
+
+ # Filter rows that have the same number of columns as the header
+ header = data_list[0]
+ filtered_data = [row for row in data_list if len(row) == len(header)]
+
+ # Convert list of rows to DataFrame
+ casenotes = pd.DataFrame(filtered_data[1:], columns=header) # Assuming first row is header
+
+ print(casenotes.head()) # Display the first few rows of the DataFrame
+
+ # %%
+ casenotes.to_parquet(parquet_file_path)
+