Spaces:

seanpedrickcase
/

topic_modelling

Running

App Files Files Community

Sonnyjim committed on Jan 26, 2024

Commit

5d87c3c

1 Parent(s): ffe5eb2

Split off LLM representation, visualisation, and reduce outliers from main function. Added hierarchical visualisation and logs

Browse files

Files changed (5) hide show

.gitignore +1 -0
app.py +276 -144
funcs/bertopic_vis_documents.py +2 -2
funcs/embeddings.py +4 -11
funcs/helper_functions.py +27 -14

.gitignore CHANGED Viewed

@@ -8,6 +8,7 @@
 *.safetensors
 *.json
 *.html
 .ipynb_checkpoints/*
 old_code/*
 model/*

 *.safetensors
 *.json
 *.html
+*.log
 .ipynb_checkpoints/*
 old_code/*
 model/*

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import gradio as gr
 from datetime import datetime
 import pandas as pd
@@ -48,39 +49,88 @@ from funcs.helper_functions import dummy_function, put_columns_in_df, read_file,
 #from funcs.representation_model import representation_model
 from funcs.embeddings import make_or_load_embeddings
 # Load embeddings
-#embedding_model_name = "BAAI/bge-small-en-v1.5"
-#embedding_model = SentenceTransformer(embedding_model_name)
 # Pinning a Jina revision for security purposes: https://www.baseten.co/blog/pinning-ml-model-revisions-for-compatibility-and-security/
 # Save Jina model locally as described here: https://huggingface.co/jinaai/jina-embeddings-v2-base-en/discussions/29
 embeddings_name = "jinaai/jina-embeddings-v2-small-en"
-local_embeddings_location = "model/jina/"
 revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
-if low_resource_mode == "No":
-    try:
-        embedding_model = AutoModel.from_pretrained(local_embeddings_location, revision = revision_choice, trust_remote_code=True,local_files_only=True, device_map="auto")
-    except:
-        embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto")
-    tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-small-en")
-    embedding_model_pipe = pipeline("feature-extraction", model=embedding_model, tokenizer=tokenizer)
-elif low_resource_mode == "Yes":
-    embedding_model_pipe = make_pipeline(
-                TfidfVectorizer(),
-                TruncatedSVD(2) # 100 # set to 2 to be compatible with zero shot topics - can't be higher than number of topics
-                )
-# Model used for representing topics
-hf_model_name =  'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF' # 'second-state/stablelm-2-zephyr-1.6b-GGUF'
-hf_model_file =   'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
-def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers, embeddings_out, progress=gr.Progress()):
     progress(0, desc= "Loading data")
@@ -92,7 +142,7 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     all_tic = time.perf_counter()
     output_list = []
-    file_list = [string.name for string in in_file]
     data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
     data_file_name = data_file_names[0]
@@ -106,39 +156,39 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
         in_label_list_first = in_colnames_list_first
     # Make sure format of input series is good
-    in_files[in_colnames_list_first] = in_files[in_colnames_list_first].fillna('').astype(str)
-    in_files[in_label_list_first] = in_files[in_label_list_first].fillna('').astype(str)
     if anonymise_drop == "Yes":
         progress(0.1, desc= "Anonymising data")
         anon_tic = time.perf_counter()
-        time_out = f"Creating visualisation took {all_toc - vis_tic:0.1f} seconds"
-        in_files_anon_col, anonymisation_success = anon.anonymise_script(in_files, in_colnames_list_first, anon_strat="replace")
-        in_files[in_colnames_list_first] = in_files_anon_col[in_colnames_list_first]
-        anonymise_data_name = "anonymised_data.csv"
-        in_files.to_csv(anonymise_data_name)
         output_list.append(anonymise_data_name)
         anon_toc = time.perf_counter()
         time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
-    docs = list(in_files[in_colnames_list_first].str.lower())
-    label_list = list(in_files[in_label_list_first])
-    # Check if embeddings are being loaded in
-    ## Load in pre-embedded file if exists
-    file_list = [string.name for string in in_file]
     print("Low resource mode: ", low_resource_mode)
     if low_resource_mode == "No":
         print("Using high resource Jina transformer model")
         try:
-            embedding_model = AutoModel.from_pretrained(local_embeddings_location, revision = revision_choice, trust_remote_code=True,local_files_only=True, device_map="auto")
         except:
-            embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto")
-        tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-small-en")
         embedding_model_pipe = pipeline("feature-extraction", model=embedding_model, tokenizer=tokenizer)
@@ -156,30 +206,16 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
         umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
-    progress(0.2, desc= "Loading/creating embeddings")
-    embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embeddings_out, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
     vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
-    from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
-    from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
     progress(0.3, desc= "Embeddings loaded. Creating BERTopic model")
     if not candidate_topics:
-        # Generate representation model here if topics won't be changed later
-        # if reduce_outliers == "No":
-        #     topic_model = BERTopic( embedding_model=embedding_model_pipe,
-        #                             vectorizer_model=vectoriser_model,
-        #                             umap_model=umap_model,
-        #                             min_topic_size = min_docs_slider,
-        #                             nr_topics = max_topics_slider,
-        #                             representation_model=representation_model,
-        #                             verbose = True)
         topic_model = BERTopic( embedding_model=embedding_model_pipe,
                                 vectorizer_model=vectoriser_model,
                                 umap_model=umap_model,
@@ -196,146 +232,214 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
             error_message = "Zero shot topic modelling currently not compatible with low-resource embeddings. Please change this option to 'No' on the options tab and retry."
             print(error_message)
-            return error_message, output_list, None
         zero_shot_topics = read_file(candidate_topics.name)
         zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
-        # Generate representation model here if topics won't be changed later
-        # if reduce_outliers == "No":
-        #     topic_model = BERTopic( embedding_model=embedding_model_pipe,
-        #                             vectorizer_model=vectoriser_model,
-        #                             umap_model=umap_model,
-        #                             min_topic_size = min_docs_slider,
-        #                             nr_topics = max_topics_slider,
-        #                             zeroshot_topic_list = zero_shot_topics_lower,
-        #                             zeroshot_min_similarity = 0.5,#0.7,
-        #                             representation_model=representation_model,
-        #                             verbose = True)
-        # else:
         topic_model = BERTopic( embedding_model=embedding_model_pipe,
                                 vectorizer_model=vectoriser_model,
                                 umap_model=umap_model,
                                 min_topic_size = min_docs_slider,
                                 nr_topics = max_topics_slider,
                                 zeroshot_topic_list = zero_shot_topics_lower,
-                                zeroshot_min_similarity = 0.5,#0.7,
                                 verbose = True)
         topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
     if not topics_text:
-        return "No topics found.", data_file_name, None
     else:
         print("Topic model created.")
-    progress(0.5, desc= "Loading in representation model")
-    print("Create LLM topic labels:", create_llm_topic_labels)
-    representation_model = create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode)
     # Reduce outliers if required, then update representation
-    if reduce_outliers == "Yes":
-        progress(0.6, desc= "Reducing outliers then creating topic representations")
-        print("Reducing outliers.")
-        # Calculate the c-TF-IDF representation for each outlier document and find the best matching c-TF-IDF topic representation using cosine similarity.
-        topics_text = topic_model.reduce_outliers(docs, topics_text, strategy="embeddings")
-        # Then, update the topics to the ones that considered the new data
-        print("Finished reducing outliers.")
-    progress(0.6, desc= "Creating topic representations")
     topic_model.update_topics(docs, topics=topics_text, vectorizer_model=vectoriser_model, representation_model=representation_model)
     topic_dets = topic_model.get_topic_info()
-    if topic_dets.shape[0] == 1:
-        topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
-        topic_dets.to_csv(topic_det_output_name)
-        output_list.append(topic_det_output_name)
-        return "No topics found, original file returned", output_list, None, embeddings_out
     # Replace original labels with LLM labels
     if "Phi" in topic_model.get_topic_info().columns:
         llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["Phi"].values()]
         topic_model.set_topic_labels(llm_labels)
     else:
         topic_model.set_topic_labels(list(topic_dets["Name"]))
-    # Outputs
-    progress(0.8, desc= "Saving output")
-    topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
-    topic_dets.to_csv(topic_det_output_name)
-    output_list.append(topic_det_output_name)
-    doc_det_output_name = "doc_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
-    doc_dets = topic_model.get_document_info(docs)[["Document",	"Topic", "Name", "Representative_document"]] # "Probability",
-    doc_dets.to_csv(doc_det_output_name)
-    output_list.append(doc_det_output_name)
-    topics_text_out_str = str(topic_dets["Name"])
-    output_text = "Topics: " + topics_text_out_str
-    # Save topic model to file
-    if save_topic_model == "Yes":
-        topic_model_save_name_folder = "output_model/" + data_file_name_no_ext + "_topics_" + today_rev# + ".safetensors"
-        topic_model_save_name_zip = topic_model_save_name_folder + ".zip"
-        # Clear folder before replacing files
-        delete_files_in_folder(topic_model_save_name_folder)
-        topic_model.save(topic_model_save_name_folder, serialization='pytorch', save_embedding_model=True, save_ctfidf=False)
-        # Zip file example
-        zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
-        output_list.append(topic_model_save_name_zip)
-     # If you want to save your embedding files
-    if return_intermediate_files == "Yes":
-        print("Saving embeddings to file")
-        if low_resource_mode == "Yes":
-            embeddings_file_name = data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
-        else:
-            if embeddings_super_compress == "No":
-                embeddings_file_name = data_file_name_no_ext + '_' + 'ai_embeddings.npz'
-            else:
-                embeddings_file_name = data_file_name_no_ext + '_' + 'ai_embedding_compress.npz'
-        np.savez_compressed(embeddings_file_name, embeddings_out)
-        output_list.append(embeddings_file_name)
-    if visualise_topics == "Yes":
-        from funcs.bertopic_vis_documents import visualize_documents_custom
-        progress(0.9, desc= "Creating visualisation (this can take a while)")
-        # Visualise the topics:
-        vis_tic = time.perf_counter()
-        print("Creating visualisation")
-        topics_vis = visualize_documents_custom(topic_model, docs, hover_labels = label_list, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
         topics_vis_name = data_file_name_no_ext + '_' + 'visualisation_' + today_rev + '.html'
         topics_vis.write_html(topics_vis_name)
         output_list.append(topics_vis_name)
-        all_toc = time.perf_counter()
-        time_out = f"Creating visualisation took {all_toc - vis_tic:0.1f} seconds"
-        print(time_out)
-        time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
-        print(time_out)
-        return output_text, output_list, topics_vis, embeddings_out
     all_toc = time.perf_counter()
-    time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
     print(time_out)
-    return output_text, output_list, None, embeddings_out
-# ## Gradio app - extract topics
 block = gr.Blocks(theme = gr.themes.Base())
@@ -343,6 +447,10 @@ with block:
     data_state = gr.State(pd.DataFrame())
     embeddings_state = gr.State(np.array([]))
     gr.Markdown(
     """
@@ -370,30 +478,54 @@ with block:
             topics_btn = gr.Button("Extract topics")
         with gr.Row():
-            output_single_text = gr.Textbox(label="Output example (first example in dataset)")
             output_file = gr.File(label="Output file")
         plot = gr.Plot(label="Visualise your topics here. Go to the 'Options' tab to enable.")
     with gr.Tab("Options"):
         with gr.Accordion("Data load and processing options", open = True):
             with gr.Row():
                 anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
-                return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="Yes", choices=["Yes", "No"])
                 embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
             with gr.Row():
                 low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings and processing.", value="No", choices=["Yes", "No"])
-                create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
-                reduce_outliers = gr.Dropdown(label = "Reduce outliers by selecting closest topic.", value="No", choices=["Yes", "No"])
-            with gr.Row():
                 save_topic_model = gr.Dropdown(label = "Save topic model to file.", value="Yes", choices=["Yes", "No"])
-                visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value="No", choices=["Yes", "No"])
     # Update column names dropdown when file uploaded
-    in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state, embeddings_state])
     in_colnames.change(dummy_function, in_colnames, None)
-    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers, embeddings_state], outputs=[output_single_text, output_file, plot, embeddings_state], api_name="topics")
 block.queue().launch(debug=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)

+import os
 import gradio as gr
 from datetime import datetime
 import pandas as pd
 #from funcs.representation_model import representation_model
 from funcs.embeddings import make_or_load_embeddings
+# Log terminal output: https://github.com/gradio-app/gradio/issues/2362
+import sys
+class Logger:
+    # File-like object that duplicates everything written to it: each message
+    # goes to the stream that was sys.stdout at construction time and to a log file.
+    def __init__(self, filename):
+        # Capture whatever sys.stdout currently refers to as the "terminal" stream.
+        self.terminal = sys.stdout
+        # Opened in "w" mode, so an existing file of this name is truncated.
+        # The handle is kept open; nothing in this class closes it.
+        self.log = open(filename, "w")
+    def write(self, message):
+        # Forward the message to the terminal stream first, then to the log file.
+        self.terminal.write(message)
+        self.log.write(message)
+    def flush(self):
+        # Flush both destinations.
+        self.terminal.flush()
+        self.log.flush()
+    def isatty(self):
+        # Always report that this stream is not an interactive terminal.
+        return False
+# Module-level side effect: from this point on, everything written to sys.stdout
+# (e.g. via print) goes through Logger and is also written to output.log.
+sys.stdout = Logger("output.log")
+def read_logs():
+    # Return the entire current contents of output.log as one string.
+    # sys.stdout is flushed first so that output written so far is on disk
+    # before the file is read back.
+    sys.stdout.flush()
+    with open("output.log", "r") as f:
+        return f.read()
 # Load embeddings
 # Pinning a Jina revision for security purposes: https://www.baseten.co/blog/pinning-ml-model-revisions-for-compatibility-and-security/
 # Save Jina model locally as described here: https://huggingface.co/jinaai/jina-embeddings-v2-base-en/discussions/29
 embeddings_name = "jinaai/jina-embeddings-v2-small-en"
+# local_embeddings_location = "model/jina/"
 revision_choice = "b811f03af3d4d7ea72a7c25c802b21fc675a5d99"
+# Model used for representing topics
+# Both names are passed to create_representation_model() by reduce_outliers and represent_topics.
+# The trailing comments on each line hold alternative repo / file names, kept in matching order.
+hf_model_name =  'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF' # 'second-state/stablelm-2-zephyr-1.6b-GGUF'
+hf_model_file =   'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
+def save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model, progress=gr.Progress()):
+        # Write the topic-level and document-level tables of a fitted topic model to CSV,
+        # optionally pickle the model, and report the created file names.
+        #
+        # topic_model: object providing get_topic_info(), get_document_info() and save().
+        # data_file_name_no_ext: string embedded in every output file name.
+        # output_list: list of output file names; appended to in place and also returned.
+        # docs: documents passed to topic_model.get_document_info().
+        # save_topic_model: the model is pickled only when this equals "Yes".
+        # progress: Gradio progress callback.
+        #
+        # Returns (output_list, message). The message is either a summary beginning
+        # "Topics: " or the "No topics found" text from the early return.
+        # today_rev and delete_files_in_folder are not defined in this block; they are
+        # expected to exist at module level.
+        topic_dets = topic_model.get_topic_info()
+        # A single-row topic table is treated as "no topics found": only the topic
+        # details CSV is written before returning early.
+        # (presumably that one row is the outlier topic - TODO confirm)
+        if topic_dets.shape[0] == 1:
+            topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+            topic_dets.to_csv(topic_det_output_name)
+            output_list.append(topic_det_output_name)
+            return output_list, "No topics found, original file returned"
+        progress(0.8, desc= "Saving output")
+        # Topic-level table.
+        topic_det_output_name = "topic_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+        topic_dets.to_csv(topic_det_output_name)
+        output_list.append(topic_det_output_name)
+        # Document-level table, restricted to four columns ("Probability" is left out).
+        doc_det_output_name = "doc_details_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+        doc_dets = topic_model.get_document_info(docs)[["Document",	"Topic", "Name", "Representative_document"]] # "Probability",
+        doc_dets.to_csv(doc_det_output_name)
+        output_list.append(doc_det_output_name)
+        # The text summary is the string form of the whole "Name" column.
+        topics_text_out_str = str(topic_dets["Name"])
+        output_text = "Topics: " + topics_text_out_str
+        # Save topic model to file
+        if save_topic_model == "Yes":
+            topic_model_save_name_pkl = "output_model/" + data_file_name_no_ext + "_topics_" + today_rev + ".pkl"# + ".safetensors"
+            # NOTE(review): the zip name is computed but unused because the zip_folder call below is commented out.
+            topic_model_save_name_zip = topic_model_save_name_pkl + ".zip"
+            # Clear folder before replacing files
+            # NOTE(review): the argument is the .pkl file path, not a folder - confirm delete_files_in_folder handles that.
+            delete_files_in_folder(topic_model_save_name_pkl)
+            # Pickle serialisation, without the embedding model and without c-TF-IDF.
+            topic_model.save(topic_model_save_name_pkl, serialization='pickle', save_embedding_model=False, save_ctfidf=False)
+            # Zip file example
+            #zip_folder(topic_model_save_name_pkl, topic_model_save_name_zip)
+            output_list.append(topic_model_save_name_pkl)
+        return output_list, output_text
+def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, save_topic_model, embeddings_out, progress=gr.Progress()):
     progress(0, desc= "Loading data")
     all_tic = time.perf_counter()
     output_list = []
+    file_list = [string.name for string in in_files]
     data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower() and "gz" not in string.lower()]
     data_file_name = data_file_names[0]
         in_label_list_first = in_colnames_list_first
     # Make sure format of input series is good
+    data[in_colnames_list_first] = data[in_colnames_list_first].fillna('').astype(str)
+    data[in_label_list_first] = data[in_label_list_first].fillna('').astype(str)
+    label_list = list(data[in_label_list_first])
     if anonymise_drop == "Yes":
         progress(0.1, desc= "Anonymising data")
         anon_tic = time.perf_counter()
+        data_anon_col, anonymisation_success = anon.anonymise_script(data, in_colnames_list_first, anon_strat="replace")
+        data[in_colnames_list_first] = data_anon_col[in_colnames_list_first]
+        anonymise_data_name = data_file_name_no_ext + "_anonymised_" + today_rev +  ".csv"
+        data.to_csv(anonymise_data_name)
         output_list.append(anonymise_data_name)
         anon_toc = time.perf_counter()
         time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
+    docs = list(data[in_colnames_list_first].str.lower())
+    # Check if embeddings are being loaded in
+    progress(0.2, desc= "Loading/creating embeddings")
     print("Low resource mode: ", low_resource_mode)
     if low_resource_mode == "No":
         print("Using high resource Jina transformer model")
         try:
+            embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True,device_map="auto")
         except:
+            embedding_model = AutoModel.from_pretrained(embeddings_name, revision = revision_choice, trust_remote_code=True, device_map="auto", use_auth_token=os.environ["HF_TOKEN"])
+        tokenizer = AutoTokenizer.from_pretrained(embeddings_name)
         embedding_model_pipe = pipeline("feature-extraction", model=embedding_model, tokenizer=tokenizer)
         umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
+    embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, embeddings_out, embedding_model, embeddings_super_compress, low_resource_mode)
     vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
     progress(0.3, desc= "Embeddings loaded. Creating BERTopic model")
     if not candidate_topics:
         topic_model = BERTopic( embedding_model=embedding_model_pipe,
                                 vectorizer_model=vectoriser_model,
                                 umap_model=umap_model,
             error_message = "Zero shot topic modelling currently not compatible with low-resource embeddings. Please change this option to 'No' on the options tab and retry."
             print(error_message)
+            return error_message, output_list, None, embeddings_out, data_file_name_no_ext, None, docs, label_list
         zero_shot_topics = read_file(candidate_topics.name)
         zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
         topic_model = BERTopic( embedding_model=embedding_model_pipe,
                                 vectorizer_model=vectoriser_model,
                                 umap_model=umap_model,
                                 min_topic_size = min_docs_slider,
                                 nr_topics = max_topics_slider,
                                 zeroshot_topic_list = zero_shot_topics_lower,
+                                zeroshot_min_similarity = 0.6, # 0.7
                                 verbose = True)
         topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
     if not topics_text:
+        return "No topics found.", data_file_name, None, embeddings_out, data_file_name_no_ext, topic_model, docs, label_list
     else:
         print("Topic model created.")
+    # Outputs
+    output_list, output_text = save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model)
+     # If you want to save your embedding files
+    if return_intermediate_files == "Yes":
+        print("Saving embeddings to file")
+        if low_resource_mode == "Yes":
+            embeddings_file_name = data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
+        else:
+            if embeddings_super_compress == "No":
+                embeddings_file_name = data_file_name_no_ext + '_' + 'jina_embeddings.npz'
+            else:
+                embeddings_file_name = data_file_name_no_ext + '_' + 'jina_embeddings_compress.npz'
+        np.savez_compressed(embeddings_file_name, embeddings_out)
+        output_list.append(embeddings_file_name)
+    all_toc = time.perf_counter()
+    time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
+    print(time_out)
+    return output_text, output_list, None, embeddings_out, data_file_name_no_ext, topic_model, docs, label_list
+def reduce_outliers(topic_model, docs, embeddings_out, data_file_name_no_ext, low_resource_mode, create_llm_topic_labels, save_topic_model, progress=gr.Progress()):
+    # Reassign outlier documents to topics, rebuild the topic representations without
+    # LLM labels, then write the outputs through save_topic_outputs().
+    #
+    # Returns (output_text, output_list, embeddings_out); embeddings_out is passed
+    # through unchanged.
+    # NOTE(review): create_llm_topic_labels is only referenced in the commented-out lines;
+    # the literal "No" is what is passed to create_representation_model below.
+    #from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
+    from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
+    output_list = []
+    all_tic = time.perf_counter()
+    # Same vectoriser settings as the one built in extract_topics.
+    vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
+    # NOTE(review): this calls fit_transform on the model passed in, i.e. it fits again
+    # on docs/embeddings_out to obtain topic assignments - confirm a refit is intended.
+    # probs is not used afterwards.
+    topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
+    #progress(0.2, desc= "Loading in representation model")
+    #print("Create LLM topic labels:", create_llm_topic_labels)
+    #representation_model = create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode)
     # Reduce outliers if required, then update representation
+    progress(0.2, desc= "Reducing outliers")
+    print("Reducing outliers.")
+    # NOTE(review): the next comment describes c-TF-IDF matching, but the call passes strategy="embeddings".
+    # Calculate the c-TF-IDF representation for each outlier document and find the best matching c-TF-IDF topic representation using cosine similarity.
+    topics_text = topic_model.reduce_outliers(docs, topics_text, strategy="embeddings")
+    # Then, update the topics to the ones that considered the new data
+    print("Finished reducing outliers.")
+    progress(0.5, desc= "Creating topic representations")
+    print("Create LLM topic labels:", "No")
+    representation_model = create_representation_model("No", llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode)
     topic_model.update_topics(docs, topics=topics_text, vectorizer_model=vectoriser_model, representation_model=representation_model)
     topic_dets = topic_model.get_topic_info()
+    # Replace original labels with LLM labels
+    # When a "Phi" column exists, each label is the first line of the first entry of that
+    # topic's "Phi" representation; otherwise the existing "Name" column is used.
+    if "Phi" in topic_model.get_topic_info().columns:
+        llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["Phi"].values()]
+        topic_model.set_topic_labels(llm_labels)
+    else:
+        topic_model.set_topic_labels(list(topic_dets["Name"]))
+    # Outputs
+    output_list, output_text = save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model)
+    all_toc = time.perf_counter()
+    time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
+    print(time_out)
+    return output_text, output_list, embeddings_out
+def represent_topics(topic_model, docs, embeddings_out, data_file_name_no_ext, low_resource_mode, save_topic_model, progress=gr.Progress()):
+    # Rebuild the topic representations with LLM labelling requested ("Yes"), apply the
+    # resulting labels to the model, then write the outputs through save_topic_outputs().
+    #
+    # Returns (output_text, output_list, embeddings_out); embeddings_out is passed
+    # through unchanged.
+    #from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
+    from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
+    output_list = []
+    all_tic = time.perf_counter()
+    vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
+    # NOTE(review): fit_transform fits the passed-in model again to obtain topic
+    # assignments - confirm a refit is intended. probs is not used afterwards.
+    topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
+    # Topic table taken before update_topics(); used for the fallback labels below.
+    topic_dets = topic_model.get_topic_info()
+    progress(0.2, desc= "Creating topic representations")
+    print("Create LLM topic labels:", "Yes")
+    representation_model = create_representation_model("Yes", llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode)
+    topic_model.update_topics(docs, topics=topics_text, vectorizer_model=vectoriser_model, representation_model=representation_model)
     # Replace original labels with LLM labels
     if "Phi" in topic_model.get_topic_info().columns:
         llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["Phi"].values()]
         topic_model.set_topic_labels(llm_labels)
+        # Also write the labels, one per line, to llm_topic_list.txt in the working
+        # directory and report that file as an output.
+        with open('llm_topic_list.txt', 'w') as file:
+            for item in llm_labels:
+                file.write(f"{item}\n")
+        output_list.append('llm_topic_list.txt')
     else:
         topic_model.set_topic_labels(list(topic_dets["Name"]))
+    # Outputs
+    output_list, output_text = save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model)
+    all_toc = time.perf_counter()
+    time_out = f"All processes took {all_toc - all_tic:0.1f} seconds"
+    print(time_out)
+    return output_text, output_list, embeddings_out
+def visualise_topics(topic_model, docs, data_file_name_no_ext, low_resource_mode,  embeddings_out, label_list, sample_prop, visualisation_type_radio, progress=gr.Progress()):
+    # Build an HTML visualisation of the topic model and write it to disk.
+    #
+    # visualisation_type_radio selects the output:
+    #   "Topic document graph" -> one document-level plot;
+    #   "Hierarchical view"    -> a hierarchical document plot plus a hierarchy plot.
+    # sample_prop is forwarded as the `sample` argument of the document plots.
+    #
+    # Returns (time_out, output_list, topics_vis, embeddings_out): the timing message,
+    # the HTML file names written, the first figure, and embeddings_out unchanged.
+    output_list = []
+    vis_tic = time.perf_counter()
+    from funcs.bertopic_vis_documents import visualize_documents_custom
+    topic_dets = topic_model.get_topic_info()
+    # Replace original labels with LLM labels
+    if "Phi" in topic_model.get_topic_info().columns:
+        llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["Phi"].values()]
+        topic_model.set_topic_labels(llm_labels)
+    else:
+        topic_model.set_topic_labels(list(topic_dets["Name"]))
+    # Pre-reduce embeddings for visualisation purposes
+    # Both paths reduce to 2 components: UMAP when low_resource_mode is "No",
+    # TruncatedSVD for any other value.
+    if low_resource_mode == "No":
+        reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=random_seed).fit_transform(embeddings_out)
+    else:
+        reduced_embeddings = TruncatedSVD(2, random_state=random_seed).fit_transform(embeddings_out)
+    progress(0.5, desc= "Creating visualisation (this can take a while)")
+    # Visualise the topics:
+    print("Creating visualisation")
+    # "Topic document graph", "Hierarchical view"
+    # NOTE(review): topics_vis is assigned only inside these two branches; any other
+    # visualisation_type_radio value reaches the return statement with it unbound.
+    if visualisation_type_radio == "Topic document graph":
+        topics_vis = visualize_documents_custom(topic_model, docs, hover_labels = label_list, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True, sample = sample_prop)
         topics_vis_name = data_file_name_no_ext + '_' + 'visualisation_' + today_rev + '.html'
         topics_vis.write_html(topics_vis_name)
         output_list.append(topics_vis_name)
+    elif visualisation_type_radio == "Hierarchical view":
+        hierarchical_topics = topic_model.hierarchical_topics(docs)
+        topics_vis = topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings, sample = sample_prop)
+        topics_vis_2 = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
+        # Both figures are written to HTML, but only topics_vis is returned to the caller.
+        topics_vis_name = data_file_name_no_ext + '_' + 'vis_hierarchy_topic_doc_' + today_rev + '.html'
+        topics_vis.write_html(topics_vis_name)
+        output_list.append(topics_vis_name)
+        topics_vis_2_name = data_file_name_no_ext + '_' + 'vis_hierarchy_' + today_rev + '.html'
+        topics_vis_2.write_html(topics_vis_2_name)
+        output_list.append(topics_vis_2_name)
     all_toc = time.perf_counter()
+    time_out = f"Creating visualisation took {all_toc - vis_tic:0.1f} seconds"
     print(time_out)
+    return time_out, output_list, topics_vis, embeddings_out
+def save_as_pytorch_model(topic_model, docs, data_file_name_no_ext , progress=gr.Progress()):
+    output_list = []
+    topic_model_save_name_folder = "output_model/" + data_file_name_no_ext + "_topics_" + today_rev# + ".safetensors"
+    topic_model_save_name_zip = topic_model_save_name_folder + ".zip"
+    # Clear folder before replacing files
+    delete_files_in_folder(topic_model_save_name_folder)
+    topic_model.save(topic_model_save_name_folder, serialization='pytorch', save_embedding_model=True, save_ctfidf=False)
+    # Zip file example
+    zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
+    output_list.append(topic_model_save_name_zip)
+# Gradio app
 block = gr.Blocks(theme = gr.themes.Base())
     data_state = gr.State(pd.DataFrame())
     embeddings_state = gr.State(np.array([]))
+    topic_model_state = gr.State()
+    docs_state = gr.State()
+    data_file_name_no_ext_state = gr.State()
+    label_list_state = gr.State()
     gr.Markdown(
     """
             topics_btn = gr.Button("Extract topics")
         with gr.Row():
+            output_single_text = gr.Textbox(label="Output topics")
             output_file = gr.File(label="Output file")
+        with gr.Accordion("Post processing options.", open = True):
+            with gr.Row():
+                reduce_outliers_btn = gr.Button("Reduce outliers")
+                represent_llm_btn = gr.Button("Generate topic labels with LLMs")
+        logs = gr.Textbox(label="Processing logs.")
+    with gr.Tab("Visualise"):
+        plot_btn = gr.Button("Visualise topic model")
+        sample_slide = gr.Slider(minimum = 0.01, maximum = 1, value = 0.1, step = 0.01, label = "Proportion of data points to show on output visualisation.")
+        visualisation_type_radio = gr.Radio(choices=["Topic document graph", "Hierarchical view"])
+        out_plot_file = gr.File(label="Output plots to file", file_count="multiple")
         plot = gr.Plot(label="Visualise your topics here. Go to the 'Options' tab to enable.")
     with gr.Tab("Options"):
         with gr.Accordion("Data load and processing options", open = True):
             with gr.Row():
                 anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
                 embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
+                #create_llm_topic_labels = gr.Dropdown(label = "Create topic labels based on LLMs.", value="No", choices=["Yes", "No"])
             with gr.Row():
                 low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings and processing.", value="No", choices=["Yes", "No"])
+                return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="Yes", choices=["Yes", "No"])
                 save_topic_model = gr.Dropdown(label = "Save topic model to file.", value="Yes", choices=["Yes", "No"])
     # Update column names dropdown when file uploaded
+    in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state, embeddings_state, output_single_text, topic_model_state])
     in_colnames.change(dummy_function, in_colnames, None)
+    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state], outputs=[output_single_text, output_file, plot, embeddings_state, data_file_name_no_ext_state, topic_model_state, docs_state, label_list_state], api_name="topics")
+    reduce_outliers_btn.click(fn=reduce_outliers, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, low_resource_mode_opt], outputs=[output_single_text, output_file, embeddings_state], api_name="reduce_outliers")
+    represent_llm_btn.click(fn=represent_topics, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, low_resource_mode_opt], outputs=[output_single_text, output_file, embeddings_state], api_name="represent_llm")
+    plot_btn.click(fn=visualise_topics, inputs=[topic_model_state, docs_state, data_file_name_no_ext_state, low_resource_mode_opt, embeddings_state, label_list_state, sample_slide, visualisation_type_radio], outputs=[output_single_text, out_plot_file, plot], api_name="plot")
+    block.load(read_logs, None, logs, every=5)
 block.queue().launch(debug=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)

funcs/bertopic_vis_documents.py CHANGED Viewed

@@ -94,8 +94,8 @@ def visualize_documents_custom(topic_model,
     # Add <br> tags to hover labels to get them to appear on multiple lines
     def wrap_by_word(s, n):
-        '''returns a string where \\n is inserted between every n words'''
-        a = s.split()
         ret = ''
         for i in range(0, len(a), n):
             ret += ' '.join(a[i:i+n]) + '<br>'

     # Add <br> tags to hover labels to get them to appear on multiple lines
     def wrap_by_word(s, n):
+        '''returns a string of at most the first 300 words of s, with a <br> tag appended after every n words'''
+        a = s.split()[:300]
         ret = ''
         for i in range(0, len(a), n):
             ret += ' '.join(a[i:i+n]) + '<br>'

funcs/embeddings.py CHANGED Viewed

@@ -13,7 +13,7 @@ if cuda.is_available():
 else:
     torch_device =  "cpu"
-def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embeddings_out, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode_opt, reduce_embeddings="Yes"):
     # If no embeddings found, make or load in
     if embeddings_out.size == 0:
@@ -65,16 +65,9 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embeddings_o
                 embeddings_out = np.round(embeddings_out, 3)
                 embeddings_out *= 100
     else:
         print("Found pre-loaded embeddings.")
-    # Pre-reduce embeddings for visualisation purposes
-    if reduce_embeddings == "Yes":
-        if low_resource_mode_opt == "No":
-            reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=random_seed).fit_transform(embeddings_out)
-            return embeddings_out, reduced_embeddings
-        else:
-            reduced_embeddings = TruncatedSVD(2, random_state=random_seed).fit_transform(embeddings_out)
-            return embeddings_out, reduced_embeddings
-    return embeddings_out, None

 else:
     torch_device =  "cpu"
+def make_or_load_embeddings(docs, file_list, embeddings_out, embedding_model, embeddings_super_compress, low_resource_mode_opt):
     # If no embeddings found, make or load in
     if embeddings_out.size == 0:
                 embeddings_out = np.round(embeddings_out, 3)
                 embeddings_out *= 100
+        return embeddings_out, None
     else:
         print("Found pre-loaded embeddings.")
+        return embeddings_out, None

funcs/helper_functions.py CHANGED Viewed

@@ -18,6 +18,8 @@ def detect_file_type(filename):
         return 'parquet'
     elif filename.endswith('.pkl.gz'):
         return 'pkl.gz'
     else:
         raise ValueError("Unsupported file type.")
@@ -37,6 +39,8 @@ def read_file(filename):
         with gzip.open(filename, 'rb') as file:
             file = pickle.load(file)
             #file = pd.read_pickle(filename)
     print("File load complete")
@@ -44,28 +48,37 @@ def read_file(filename):
 def put_columns_in_df(in_file, in_bm25_column):
     '''
-    When file is loaded, update the column dropdown choices and change 'clean data' dropdown option to 'no'.
     '''
-    file_list = [string.name for string in in_file]
-    data_file_names = [string.lower() for string in file_list if "npz" not in string.lower()]
-    data_file_name = data_file_names[0]
     new_choices = []
     concat_choices = []
-    df = read_file(data_file_name)
-    new_choices = list(df.columns)
-    concat_choices.extend(new_choices)
     #The np.array([]) at the end is for clearing the embedding state when a new file is loaded
-    return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, np.array([])
 def get_file_path_end(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")

         return 'parquet'
     elif filename.endswith('.pkl.gz'):
         return 'pkl.gz'
+    elif filename.endswith('.pkl'):
+        return 'pkl'
     else:
         raise ValueError("Unsupported file type.")
         with gzip.open(filename, 'rb') as file:
             file = pickle.load(file)
             #file = pd.read_pickle(filename)
+    elif file_type == 'pkl':
+        file = pickle.load(file)
     print("File load complete")
 def put_columns_in_df(in_file, in_bm25_column):
     '''
+    When file is loaded, update the column dropdown choices and write to relevant data states.
     '''
     new_choices = []
     concat_choices = []
+    file_list = [string.name for string in in_file]
+    data_file_names = [string.lower() for string in file_list if "npz" not in string.lower() and "pkl" not in string.lower()]
+    if data_file_names:
+        data_file_name = data_file_names[0]
+        df = read_file(data_file_name)
+        new_choices = list(df.columns)
+        concat_choices.extend(new_choices)
+        output_text = "Data file loaded."
+    else:
+        error = "No data file provided."
+        print(error)
+        output_text = error
+    model_file_names = [string.lower() for string in file_list if "pkl" in string.lower()]
+    if model_file_names:
+        model_file_name = model_file_names[0]
+        topic_model = read_file(model_file_name)
+        output_text = "Bertopic model loaded in"
+        return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, np.array([]), output_text, topic_model
     #The np.array([]) at the end is for clearing the embedding state when a new file is loaded
+    return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, np.array([]), output_text, None
 def get_file_path_end(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")