Sonnyjim committed on
Commit
5d87c3c
1 Parent(s): ffe5eb2

Split LLM representation, visualisation, and outlier reduction out of the main function. Added hierarchical visualisation and logging.

.gitignore CHANGED
@@ -8,6 +8,7 @@
 *.safetensors
 *.json
 *.html
+*.log
 .ipynb_checkpoints/*
 old_code/*
 model/*
app.py CHANGED
@@ -1,3 +1,4 @@
+import os
 import gradio as gr
 from datetime import datetime
 import pandas as pd
@@ -48,39 +49,88 @@ from funcs.helper_functions import dummy_function, put_columns_in_df, read_file,
 #from funcs.representation_model import representation_model
 from funcs.embeddings import make_or_load_embeddings
 
+# Log terminal output: https://github.com/gradio-app/gradio/issues/2362
+
+import sys
+
+class Logger:
+    def __init__(self, filename):
+        self.terminal = sys.stdout
+        self.log = open(filename, "w")
+
+    def write(self, message):
+        self.terminal.write(message)
+        self.log.write(message)
+
+    def flush(self):
+        self.terminal.flush()
+        self.log.flush()
+
+    def isatty(self):
+        return False
+
+sys.stdout = Logger("output.log")
+
+def read_logs():
+    sys.stdout.flush()
+    with open("output.log", "r") as f:
+        return f.read()
 
 # Load embeddings
-#embedding_model_name = "BAAI/bge-small-en-v1.5"
-#embedding_model = SentenceTransformer(embedding_model_name)
 
 # Pinning a Jina revision for security purposes: https://www.baseten.co/blog/pinning-ml-model-revisions-for-compatibility-and-security/
 # Save Jina model locally as described here: https://huggingface.co/jinaai/jina-embeddings-v2-base-en/discussions/29
 embeddings_name = "jinaai/jina-embeddings-v2-small-en"
-local_embeddings_location = "model/jina/"
+# local_embeddings_location = "model/jina/"
 revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
 
-if low_resource_mode == "No":
-    try:
-        embedding_model = AutoModel.from_pretrained(local_embeddings_location, revision = revision_choice, trust_remote_code=True, local_files_only=True, device_map="auto")
-    except:
-        embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto")
-
-    tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-small-en")
-
-    embedding_model_pipe = pipeline("feature-extraction", model=embedding_model, tokenizer=tokenizer)
-
-elif low_resource_mode == "Yes":
-    embedding_model_pipe = make_pipeline(
-        TfidfVectorizer(),
-        TruncatedSVD(2) # 100 # set to 2 to be compatible with zero shot topics - can't be higher than number of topics
-    )
-
-# Model used for representing topics
-hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF' # 'second-state/stablelm-2-zephyr-1.6b-GGUF'
-hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
-
-def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers, embeddings_out, progress=gr.Progress()):
+# Model used for representing topics
+hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF' # 'second-state/stablelm-2-zephyr-1.6b-GGUF'
+hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
+
+def save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model, progress=gr.Progress()):
+    topic_dets = topic_model.get_topic_info()
+
+    if topic_dets.shape[0] == 1:
+        topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+        topic_dets.to_csv(topic_det_output_name)
+        output_list.append(topic_det_output_name)
+
+        return output_list, "No topics found, original file returned"
+
+    progress(0.8, desc= "Saving output")
+
+    topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+    topic_dets.to_csv(topic_det_output_name)
+    output_list.append(topic_det_output_name)
+
+    doc_det_output_name = "doc_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+    doc_dets = topic_model.get_document_info(docs)[["Document", "Topic", "Name", "Representative_document"]] # "Probability",
+    doc_dets.to_csv(doc_det_output_name)
+    output_list.append(doc_det_output_name)
+
+    topics_text_out_str = str(topic_dets["Name"])
+    output_text = "Topics: " + topics_text_out_str
+
+    # Save topic model to file
+    if save_topic_model == "Yes":
+        topic_model_save_name_pkl = "output_model/" + data_file_name_no_ext + "_topics_" + today_rev + ".pkl"# + ".safetensors"
+        topic_model_save_name_zip = topic_model_save_name_pkl + ".zip"
+
+        # Clear folder before replacing files
+        delete_files_in_folder(topic_model_save_name_pkl)
+
+        topic_model.save(topic_model_save_name_pkl, serialization='pickle', save_embedding_model=False, save_ctfidf=False)
+
+        # Zip file example
+        #zip_folder(topic_model_save_name_pkl, topic_model_save_name_zip)
+        output_list.append(topic_model_save_name_pkl)
+
+    return output_list, output_text
+
+def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, save_topic_model, embeddings_out, progress=gr.Progress()):
 
     progress(0, desc= "Loading data")
 
@@ -92,7 +142,7 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     all_tic = time.perf_counter()
 
     output_list = []
-    file_list = [string.name for string in in_file]
+    file_list = [string.name for string in in_files]
 
     data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
     data_file_name = data_file_names[0]
@@ -106,39 +156,39 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     in_label_list_first = in_colnames_list_first
 
     # Make sure format of input series is good
-    in_files[in_colnames_list_first] = in_files[in_colnames_list_first].fillna('').astype(str)
-    in_files[in_label_list_first] = in_files[in_label_list_first].fillna('').astype(str)
+    data[in_colnames_list_first] = data[in_colnames_list_first].fillna('').astype(str)
+    data[in_label_list_first] = data[in_label_list_first].fillna('').astype(str)
+    label_list = list(data[in_label_list_first])
 
     if anonymise_drop == "Yes":
         progress(0.1, desc= "Anonymising data")
         anon_tic = time.perf_counter()
-        time_out = f"Creating visualisation took {all_toc - vis_tic:0.1f} seconds"
-        in_files_anon_col, anonymisation_success = anon.anonymise_script(in_files, in_colnames_list_first, anon_strat="replace")
-        in_files[in_colnames_list_first] = in_files_anon_col[in_colnames_list_first]
-        anonymise_data_name = "anonymised_data.csv"
-        in_files.to_csv(anonymise_data_name)
+
+        data_anon_col, anonymisation_success = anon.anonymise_script(data, in_colnames_list_first, anon_strat="replace")
+        data[in_colnames_list_first] = data_anon_col[in_colnames_list_first]
+        anonymise_data_name = data_file_name_no_ext + "_anonymised_" + today_rev + ".csv"
+        data.to_csv(anonymise_data_name)
        output_list.append(anonymise_data_name)
 
        anon_toc = time.perf_counter()
        time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
 
-    docs = list(in_files[in_colnames_list_first].str.lower())
-    label_list = list(in_files[in_label_list_first])
+    docs = list(data[in_colnames_list_first].str.lower())
 
     # Check if embeddings are being loaded in
-    ## Load in pre-embedded file if exists
-    file_list = [string.name for string in in_file]
+    progress(0.2, desc= "Loading/creating embeddings")
 
     print("Low resource mode: ", low_resource_mode)
 
     if low_resource_mode == "No":
        print("Using high resource Jina transformer model")
        try:
-            embedding_model = AutoModel.from_pretrained(local_embeddings_location, revision = revision_choice, trust_remote_code=True, local_files_only=True, device_map="auto")
+            embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto")
        except:
-            embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto")
+            embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto", use_auth_token=os.environ["HF_TOKEN"])
 
-        tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-small-en")
+        tokenizer = AutoTokenizer.from_pretrained(embeddings_name)
 
        embedding_model_pipe = pipeline("feature-extraction", model=embedding_model, tokenizer=tokenizer)
 
@@ -156,30 +206,16 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
 
     umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
 
-    progress(0.2, desc= "Loading/creating embeddings")
-
-    embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embeddings_out, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
+    embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, embeddings_out, embedding_model, embeddings_super_compress, low_resource_mode)
 
     vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
-
-    from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
-    from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
-
 
     progress(0.3, desc= "Embeddings loaded. Creating BERTopic model")
 
     if not candidate_topics:
 
-        # Generate representation model here if topics won't be changed later
-        # if reduce_outliers == "No":
-        #     topic_model = BERTopic( embedding_model=embedding_model_pipe,
-        #                             vectorizer_model=vectoriser_model,
-        #                             umap_model=umap_model,
-        #                             min_topic_size = min_docs_slider,
-        #                             nr_topics = max_topics_slider,
-        #                             representation_model=representation_model,
-        #                             verbose = True)
-
         topic_model = BERTopic( embedding_model=embedding_model_pipe,
                                 vectorizer_model=vectoriser_model,
                                 umap_model=umap_model,
@@ -196,146 +232,214 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
             error_message = "Zero shot topic modelling currently not compatible with low-resource embeddings. Please change this option to 'No' on the options tab and retry."
             print(error_message)
 
-            return error_message, output_list, None
+            return error_message, output_list, None, embeddings_out, data_file_name_no_ext, None, docs, label_list
 
         zero_shot_topics = read_file(candidate_topics.name)
         zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
 
-        # Generate representation model here if topics won't be changed later
-        # if reduce_outliers == "No":
-        #     topic_model = BERTopic( embedding_model=embedding_model_pipe,
-        #                             vectorizer_model=vectoriser_model,
-        #                             umap_model=umap_model,
-        #                             min_topic_size = min_docs_slider,
-        #                             nr_topics = max_topics_slider,
-        #                             zeroshot_topic_list = zero_shot_topics_lower,
-        #                             zeroshot_min_similarity = 0.5,#0.7,
-        #                             representation_model=representation_model,
-        #                             verbose = True)
-        # else:
         topic_model = BERTopic( embedding_model=embedding_model_pipe,
                                 vectorizer_model=vectoriser_model,
                                 umap_model=umap_model,
                                 min_topic_size = min_docs_slider,
                                 nr_topics = max_topics_slider,
                                 zeroshot_topic_list = zero_shot_topics_lower,
-                                zeroshot_min_similarity = 0.5,#0.7,
+                                zeroshot_min_similarity = 0.6, # 0.7
                                 verbose = True)
 
     topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
 
     if not topics_text:
-        return "No topics found.", data_file_name, None
+        return "No topics found.", data_file_name, None, embeddings_out, data_file_name_no_ext, topic_model, docs, label_list
 
     else:
         print("Topic model created.")
 
-    progress(0.5, desc= "Loading in representation model")
-    print("Create LLM topic labels:", create_llm_topic_labels)
-    representation_model = create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode)
+    # Outputs
+    output_list, output_text = save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model)
+
+    # If you want to save your embedding files
+    if return_intermediate_files == "Yes":
+        print("Saving embeddings to file")
+        if low_resource_mode == "Yes":
+            embeddings_file_name = data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
+        else:
+            if embeddings_super_compress == "No":
+                embeddings_file_name = data_file_name_no_ext + '_' + 'jina_embeddings.npz'
+            else:
+                embeddings_file_name = data_file_name_no_ext + '_' + 'jina_embeddings_compress.npz'
+
+        np.savez_compressed(embeddings_file_name, embeddings_out)
+
+        output_list.append(embeddings_file_name)
+
+    all_toc = time.perf_counter()
+    time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
+    print(time_out)
+
+    return output_text, output_list, None, embeddings_out, data_file_name_no_ext, topic_model, docs, label_list
+
+def reduce_outliers(topic_model, docs, embeddings_out, data_file_name_no_ext, low_resource_mode, save_topic_model, progress=gr.Progress()):
+    #from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
+    from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
+
+    output_list = []
+
+    all_tic = time.perf_counter()
+
+    vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
+
+    topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
+
+    #progress(0.2, desc= "Loading in representation model")
+    #print("Create LLM topic labels:", create_llm_topic_labels)
+    #representation_model = create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode)
 
     # Reduce outliers if required, then update representation
-    if reduce_outliers == "Yes":
-        progress(0.6, desc= "Reducing outliers then creating topic representations")
-        print("Reducing outliers.")
-        # Calculate the c-TF-IDF representation for each outlier document and find the best matching c-TF-IDF topic representation using cosine similarity.
-        topics_text = topic_model.reduce_outliers(docs, topics_text, strategy="embeddings")
-        # Then, update the topics to the ones that considered the new data
-        print("Finished reducing outliers.")
-
-    progress(0.6, desc= "Creating topic representations")
+    progress(0.2, desc= "Reducing outliers")
+    print("Reducing outliers.")
+    # Calculate the c-TF-IDF representation for each outlier document and find the best matching c-TF-IDF topic representation using cosine similarity.
+    topics_text = topic_model.reduce_outliers(docs, topics_text, strategy="embeddings")
+    # Then, update the topics to the ones that considered the new data
+
+    print("Finished reducing outliers.")
+
+    progress(0.5, desc= "Creating topic representations")
+    print("Create LLM topic labels:", "No")
+    representation_model = create_representation_model("No", llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode)
     topic_model.update_topics(docs, topics=topics_text, vectorizer_model=vectoriser_model, representation_model=representation_model)
 
     topic_dets = topic_model.get_topic_info()
 
-    if topic_dets.shape[0] == 1:
-        topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
-        topic_dets.to_csv(topic_det_output_name)
-        output_list.append(topic_det_output_name)
-
-        return "No topics found, original file returned", output_list, None, embeddings_out
+    # Replace original labels with LLM labels
+    if "Phi" in topic_model.get_topic_info().columns:
+        llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["Phi"].values()]
+        topic_model.set_topic_labels(llm_labels)
+    else:
+        topic_model.set_topic_labels(list(topic_dets["Name"]))
+
+    # Outputs
+    output_list, output_text = save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model)
+
+    all_toc = time.perf_counter()
+    time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
+    print(time_out)
+
+    return output_text, output_list, embeddings_out
+
+def represent_topics(topic_model, docs, embeddings_out, data_file_name_no_ext, low_resource_mode, save_topic_model, progress=gr.Progress()):
+    #from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
+    from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
+
+    output_list = []
+
+    all_tic = time.perf_counter()
+
+    vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
+
+    topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
+
+    topic_dets = topic_model.get_topic_info()
+
+    progress(0.2, desc= "Creating topic representations")
+    print("Create LLM topic labels:", "Yes")
+    representation_model = create_representation_model("Yes", llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode)
+
+    topic_model.update_topics(docs, topics=topics_text, vectorizer_model=vectoriser_model, representation_model=representation_model)
 
     # Replace original labels with LLM labels
     if "Phi" in topic_model.get_topic_info().columns:
         llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["Phi"].values()]
         topic_model.set_topic_labels(llm_labels)
+
+        with open('llm_topic_list.txt', 'w') as file:
+            for item in llm_labels:
+                file.write(f"{item}\n")
+        output_list.append('llm_topic_list.txt')
     else:
         topic_model.set_topic_labels(list(topic_dets["Name"]))
 
-    # Outputs
-    progress(0.8, desc= "Saving output")
-
-    topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
-    topic_dets.to_csv(topic_det_output_name)
-    output_list.append(topic_det_output_name)
-
-    doc_det_output_name = "doc_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
-    doc_dets = topic_model.get_document_info(docs)[["Document", "Topic", "Name", "Representative_document"]] # "Probability",
-    doc_dets.to_csv(doc_det_output_name)
-    output_list.append(doc_det_output_name)
-
-    topics_text_out_str = str(topic_dets["Name"])
-    output_text = "Topics: " + topics_text_out_str
-
-    # Save topic model to file
-    if save_topic_model == "Yes":
-        topic_model_save_name_folder = "output_model/" + data_file_name_no_ext + "_topics_" + today_rev# + ".safetensors"
-        topic_model_save_name_zip = topic_model_save_name_folder + ".zip"
-
-        # Clear folder before replacing files
-        delete_files_in_folder(topic_model_save_name_folder)
-
-        topic_model.save(topic_model_save_name_folder, serialization='pytorch', save_embedding_model=True, save_ctfidf=False)
-
-        # Zip file example
-
-        zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
-        output_list.append(topic_model_save_name_zip)
-
-    # If you want to save your embedding files
-    if return_intermediate_files == "Yes":
-        print("Saving embeddings to file")
-        if low_resource_mode == "Yes":
-            embeddings_file_name = data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
-        else:
-            if embeddings_super_compress == "No":
-                embeddings_file_name = data_file_name_no_ext + '_' + 'ai_embeddings.npz'
-            else:
-                embeddings_file_name = data_file_name_no_ext + '_' + 'ai_embedding_compress.npz'
-
-        np.savez_compressed(embeddings_file_name, embeddings_out)
-
-        output_list.append(embeddings_file_name)
-
-    if visualise_topics == "Yes":
-        from funcs.bertopic_vis_documents import visualize_documents_custom
-        progress(0.9, desc= "Creating visualisation (this can take a while)")
-        # Visualise the topics:
-        vis_tic = time.perf_counter()
-        print("Creating visualisation")
-        topics_vis = visualize_documents_custom(topic_model, docs, hover_labels = label_list, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
+    # Outputs
+    output_list, output_text = save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model)
+
+    all_toc = time.perf_counter()
+    time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
+    print(time_out)
+
+    return output_text, output_list, embeddings_out
+
+def visualise_topics(topic_model, docs, data_file_name_no_ext, low_resource_mode, embeddings_out, label_list, sample_prop, visualisation_type_radio, progress=gr.Progress()):
+    output_list = []
+    vis_tic = time.perf_counter()
+
+    from funcs.bertopic_vis_documents import visualize_documents_custom
+
+    topic_dets = topic_model.get_topic_info()
+
+    # Replace original labels with LLM labels
+    if "Phi" in topic_model.get_topic_info().columns:
+        llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["Phi"].values()]
+        topic_model.set_topic_labels(llm_labels)
+    else:
+        topic_model.set_topic_labels(list(topic_dets["Name"]))
+
+    # Pre-reduce embeddings for visualisation purposes
+    if low_resource_mode == "No":
+        reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=random_seed).fit_transform(embeddings_out)
+    else:
+        reduced_embeddings = TruncatedSVD(2, random_state=random_seed).fit_transform(embeddings_out)
+
+    progress(0.5, desc= "Creating visualisation (this can take a while)")
+    # Visualise the topics: "Topic document graph" or "Hierarchical view"
+    print("Creating visualisation")
+
+    if visualisation_type_radio == "Topic document graph":
+        topics_vis = visualize_documents_custom(topic_model, docs, hover_labels = label_list, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True, sample = sample_prop)
+
         topics_vis_name = data_file_name_no_ext + '_' + 'visualisation_' + today_rev + '.html'
         topics_vis.write_html(topics_vis_name)
         output_list.append(topics_vis_name)
 
-        all_toc = time.perf_counter()
-        time_out = f"Creating visualisation took {all_toc - vis_tic:0.1f} seconds"
-        print(time_out)
-
-        time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
-        print(time_out)
-
-        return output_text, output_list, topics_vis, embeddings_out
+    elif visualisation_type_radio == "Hierarchical view":
+        hierarchical_topics = topic_model.hierarchical_topics(docs)
+        topics_vis = topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings, sample = sample_prop)
+        topics_vis_2 = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
+
+        topics_vis_name = data_file_name_no_ext + '_' + 'vis_hierarchy_topic_doc_' + today_rev + '.html'
+        topics_vis.write_html(topics_vis_name)
+        output_list.append(topics_vis_name)
+
+        topics_vis_2_name = data_file_name_no_ext + '_' + 'vis_hierarchy_' + today_rev + '.html'
+        topics_vis_2.write_html(topics_vis_2_name)
+        output_list.append(topics_vis_2_name)
 
     all_toc = time.perf_counter()
-    time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
+    time_out = f"Creating visualisation took {all_toc - vis_tic:0.1f} seconds"
     print(time_out)
 
-    return output_text, output_list, None, embeddings_out
-
-# ## Gradio app - extract topics
+    return time_out, output_list, topics_vis
+
+def save_as_pytorch_model(topic_model, docs, data_file_name_no_ext, progress=gr.Progress()):
+    output_list = []
+
+    topic_model_save_name_folder = "output_model/" + data_file_name_no_ext + "_topics_" + today_rev# + ".safetensors"
+    topic_model_save_name_zip = topic_model_save_name_folder + ".zip"
+
+    # Clear folder before replacing files
+    delete_files_in_folder(topic_model_save_name_folder)
+
+    topic_model.save(topic_model_save_name_folder, serialization='pytorch', save_embedding_model=True, save_ctfidf=False)
+
+    # Zip file example
+    zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
+    output_list.append(topic_model_save_name_zip)
+
+# Gradio app
 
 block = gr.Blocks(theme = gr.themes.Base())
 
@@ -343,6 +447,10 @@ with block:
 
     data_state = gr.State(pd.DataFrame())
     embeddings_state = gr.State(np.array([]))
+    topic_model_state = gr.State()
+    docs_state = gr.State()
+    data_file_name_no_ext_state = gr.State()
+    label_list_state = gr.State()
 
     gr.Markdown(
     """
@@ -370,30 +478,54 @@
     topics_btn = gr.Button("Extract topics")
 
     with gr.Row():
-        output_single_text = gr.Textbox(label="Output example (first example in dataset)")
+        output_single_text = gr.Textbox(label="Output topics")
        output_file = gr.File(label="Output file")
 
+    with gr.Accordion("Post processing options.", open = True):
+        with gr.Row():
+            reduce_outliers_btn = gr.Button("Reduce outliers")
+            represent_llm_btn = gr.Button("Generate topic labels with LLMs")
+
+    logs = gr.Textbox(label="Processing logs.")
+
+    with gr.Tab("Visualise"):
+        plot_btn = gr.Button("Visualise topic model")
+        sample_slide = gr.Slider(minimum = 0.01, maximum = 1, value = 0.1, step = 0.01, label = "Proportion of data points to show on output visualisation.")
+        visualisation_type_radio = gr.Radio(choices=["Topic document graph", "Hierarchical view"])
+        out_plot_file = gr.File(label="Output plots to file", file_count="multiple")
        plot = gr.Plot(label="Visualise your topics here. Go to the 'Options' tab to enable.")
 
     with gr.Tab("Options"):
        with gr.Accordion("Data load and processing options", open = True):
            with gr.Row():
                anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
-                return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="Yes", choices=["Yes", "No"])
                embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
+                #create_llm_topic_labels = gr.Dropdown(label = "Create topic labels based on LLMs.", value="No", choices=["Yes", "No"])
            with gr.Row():
                low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings and processing.", value="No", choices=["Yes", "No"])
-                create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
-                reduce_outliers = gr.Dropdown(label = "Reduce outliers by selecting closest topic.", value="No", choices=["Yes", "No"])
-            with gr.Row():
+                return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="Yes", choices=["Yes", "No"])
                save_topic_model = gr.Dropdown(label = "Save topic model to file.", value="Yes", choices=["Yes", "No"])
-                visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value="No", choices=["Yes", "No"])
 
     # Update column names dropdown when file uploaded
-    in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state, embeddings_state])
+    in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state, embeddings_state, output_single_text, topic_model_state])
     in_colnames.change(dummy_function, in_colnames, None)
 
-    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers, embeddings_state], outputs=[output_single_text, output_file, plot, embeddings_state], api_name="topics")
+    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state], outputs=[output_single_text, output_file, plot, embeddings_state, data_file_name_no_ext_state, topic_model_state, docs_state, label_list_state], api_name="topics")
+
+    reduce_outliers_btn.click(fn=reduce_outliers, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, low_resource_mode_opt, save_topic_model], outputs=[output_single_text, output_file, embeddings_state], api_name="reduce_outliers")
+
+    represent_llm_btn.click(fn=represent_topics, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, low_resource_mode_opt, save_topic_model], outputs=[output_single_text, output_file, embeddings_state], api_name="represent_llm")
+
+    plot_btn.click(fn=visualise_topics, inputs=[topic_model_state, docs_state, data_file_name_no_ext_state, low_resource_mode_opt, embeddings_state, label_list_state, sample_slide, visualisation_type_radio], outputs=[output_single_text, out_plot_file, plot], api_name="plot")
+
+    block.load(read_logs, None, logs, every=5)
 
 block.queue().launch(debug=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)
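The core of this commit is splitting the monolithic extract_topics callback into separate post-processing steps (reduce_outliers, represent_topics, visualise_topics) that hand the fitted model, documents and embeddings to one another through gr.State components. A minimal runnable sketch of that state-passing pattern, using hypothetical fit/post_process stand-ins rather than this app's real functions:

import gradio as gr

def fit(text):
    # Stand-in for extract_topics: returns a status string plus a "model"
    # object that Gradio stashes in the State component
    model = {"docs": text.split()}
    return f"Fitted on {len(model['docs'])} tokens", model

def post_process(model):
    # Stand-in for reduce_outliers/represent_topics: reads the stashed model
    if model is None:
        return "Run the fit step first."
    return f"Post-processed {len(model['docs'])} tokens"

with gr.Blocks() as demo:
    model_state = gr.State()  # survives between button clicks, per session
    inp = gr.Textbox(label="Input")
    out = gr.Textbox(label="Status")
    fit_btn = gr.Button("Fit")
    post_btn = gr.Button("Post-process")
    fit_btn.click(fit, inputs=[inp], outputs=[out, model_state])
    post_btn.click(post_process, inputs=[model_state], outputs=[out])

demo.queue().launch()

Each browser session gets its own copy of each State value, which is why a fitted BERTopic model can safely sit there between button clicks.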
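The reduce_outliers and visualise_topics steps above wrap standard BERTopic calls. A self-contained sketch of the same flow on a public corpus (the dataset choice and output filename are illustrative, not part of this repo):

from sklearn.datasets import fetch_20newsgroups
from bertopic import BERTopic

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes")).data[:1000]

topic_model = BERTopic(verbose=True)
topics, probs = topic_model.fit_transform(docs)

# Outlier reduction: reassign documents in topic -1 to their closest topic,
# then refresh the topic representations, as reduce_outliers() does above
new_topics = topic_model.reduce_outliers(docs, topics, strategy="embeddings")
topic_model.update_topics(docs, topics=new_topics)

# Hierarchical view, as in the "Hierarchical view" branch of visualise_topics()
hierarchical_topics = topic_model.hierarchical_topics(docs)
fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
fig.write_html("hierarchy.html")

reduce_outliers with strategy="embeddings" moves each outlier document to the topic it sits closest to in embedding space, which is why update_topics must be called afterwards to rebuild the c-TF-IDF representations over the new assignments.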
funcs/bertopic_vis_documents.py CHANGED
@@ -94,8 +94,8 @@ def visualize_documents_custom(topic_model,
 
     # Add <br> tags to hover labels to get them to appear on multiple lines
     def wrap_by_word(s, n):
-        '''returns a string where \\n is inserted between every n words'''
-        a = s.split()
+        '''returns a string up to 300 words where \\n is inserted between every n words'''
+        a = s.split()[:300]
         ret = ''
         for i in range(0, len(a), n):
             ret += ' '.join(a[i:i+n]) + '<br>'
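A quick usage check of the patched helper; the hunk ends before the function's return, so the trailing return ret is an assumption:

def wrap_by_word(s, n):
    '''returns a string up to 300 words where <br> is inserted between every n words'''
    a = s.split()[:300]  # cap hover labels at 300 words so Plotly tooltips stay manageable
    ret = ''
    for i in range(0, len(a), n):
        ret += ' '.join(a[i:i+n]) + '<br>'
    return ret

print(wrap_by_word("one two three four five", 2))
# -> "one two<br>three four<br>five<br>"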
funcs/embeddings.py CHANGED
@@ -13,7 +13,7 @@ if cuda.is_available():
 else:
     torch_device = "cpu"
 
-def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embeddings_out, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt, reduce_embeddings="Yes"):
+def make_or_load_embeddings(docs, file_list, embeddings_out, embedding_model, embeddings_super_compress, low_resource_mode_opt):
 
     # If no embeddings found, make or load in
     if embeddings_out.size == 0:
@@ -65,16 +65,9 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embeddings_o
             embeddings_out = np.round(embeddings_out, 3)
             embeddings_out *= 100
 
+        return embeddings_out, None
+
     else:
         print("Found pre-loaded embeddings.")
 
-    # Pre-reduce embeddings for visualisation purposes
-    if reduce_embeddings == "Yes":
-        if low_resource_mode_opt == "No":
-            reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=random_seed).fit_transform(embeddings_out)
-            return embeddings_out, reduced_embeddings
-        else:
-            reduced_embeddings = TruncatedSVD(2, random_state=random_seed).fit_transform(embeddings_out)
-            return embeddings_out, reduced_embeddings
-
-    return embeddings_out, None
+        return embeddings_out, None
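The "super compress" option seen in app.py rounds embeddings to three decimal places and scales by 100 before writing them with np.savez_compressed, trading a little precision for a much smaller file. A round-trip sketch (the divide-by-100 on load is an assumption; the loading code is not shown in this hunk):

import numpy as np

embeddings = np.random.rand(1000, 512).astype(np.float32)

# As in make_or_load_embeddings: round to 3 dp, then scale by 100
compressed = np.round(embeddings, 3) * 100

np.savez_compressed("example_embeddings.npz", compressed)

# Unnamed arrays are stored under "arr_0"; undo the scaling on load
loaded = np.load("example_embeddings.npz")["arr_0"] / 100
print(np.abs(loaded - embeddings).max())  # error bounded by the 3 dp rounding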
 
 
 
 
 
 
 
 
 
funcs/helper_functions.py CHANGED
@@ -18,6 +18,8 @@ def detect_file_type(filename):
         return 'parquet'
     elif filename.endswith('.pkl.gz'):
         return 'pkl.gz'
+    elif filename.endswith('.pkl'):
+        return 'pkl'
     else:
         raise ValueError("Unsupported file type.")
 
@@ -37,6 +39,8 @@ def read_file(filename):
         with gzip.open(filename, 'rb') as file:
             file = pickle.load(file)
         #file = pd.read_pickle(filename)
+    elif file_type == 'pkl':
+        with open(filename, 'rb') as file:
+            file = pickle.load(file)
 
     print("File load complete")
 
@@ -44,28 +48,37 @@ def read_file(filename):
 
 def put_columns_in_df(in_file, in_bm25_column):
     '''
-    When file is loaded, update the column dropdown choices and change 'clean data' dropdown option to 'no'.
+    When file is loaded, update the column dropdown choices and write to relevant data states.
     '''
-
-    file_list = [string.name for string in in_file]
-
-    data_file_names = [string.lower() for string in file_list if "npz" not in string.lower()]
-    data_file_name = data_file_names[0]
-
     new_choices = []
     concat_choices = []
-
-    df = read_file(data_file_name)
 
-    new_choices = list(df.columns)
+    file_list = [string.name for string in in_file]
+
+    data_file_names = [string.lower() for string in file_list if "npz" not in string.lower() and "pkl" not in string.lower()]
+    if data_file_names:
+        data_file_name = data_file_names[0]
+        df = read_file(data_file_name)
 
-    concat_choices.extend(new_choices)
+        new_choices = list(df.columns)
+        concat_choices.extend(new_choices)
+        output_text = "Data file loaded."
+    else:
+        error = "No data file provided."
+        print(error)
+        output_text = error
+        df = pd.DataFrame() # keep the data state valid when only a model file is uploaded
+
+    model_file_names = [string.lower() for string in file_list if "pkl" in string.lower()]
+    if model_file_names:
+        model_file_name = model_file_names[0]
+        topic_model = read_file(model_file_name)
+        output_text = "BERTopic model loaded in."
+
+        return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, np.array([]), output_text, topic_model
 
     #The np.array([]) at the end is for clearing the embedding state when a new file is loaded
-    return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, np.array([])
+    return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, np.array([]), output_text, None
 
 def get_file_path_end(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
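A round-trip check for the new 'pkl' branch, assuming funcs.helper_functions is importable from the repo root (the file name and the pickled stand-in object are illustrative):

import pickle
import pandas as pd

from funcs.helper_functions import detect_file_type, read_file

# Pickle a stand-in object the way the app saves a BERTopic model
df = pd.DataFrame({"text": ["a", "b"]})
with open("model.pkl", "wb") as f:
    pickle.dump(df, f)

assert detect_file_type("model.pkl") == "pkl"
print(type(read_file("model.pkl")))  # <class 'pandas.core.frame.DataFrame'>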