import os

import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import spacy
import umap
from sklearn.cluster import OPTICS
from transformers import BertTokenizer, TFBertModel

# configuration params
pio.templates.default = "plotly_dark"

# setting up the text in the page
TITLE = "# BERTopic - Topic detection on text"
DESCRIPTION = r"""
Apply BERTopic to a given dataset and extract the most relevant topics.
""" EXAMPLES = [ ["data/ecomm500.csv"], ] ARTICLE = r"""
Done by Dr. Gabriel Lopez
This program follows the BERTopic philosophy, but uses its own implementation.
For more, please visit: My Page
Info about the BERTopic model can be found here
""" def load_data(fileobj): """Load dataset (keep only 500 rows for efficiency)""" data = pd.read_csv(fileobj, on_bad_lines='skip', nrows=500) assert "text" in data.columns, "The data must have a column named 'text'" return data[['text']] def run_nlp_processing(data): """As reference for standard NLP processing""" import os # NLP processing docs = [] nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"]) for doc in nlp.pipe(data["text"].values, n_process=os.cpu_count() - 1): lemmas = [] for token in doc: if token.is_punct or token.is_stop: continue lemmas.append(token.lemma_.lower()) docs.append(" ".join(lemmas)) # Make new column data = data.assign(text=docs) return data def run_bert_tokenization(data): """Show the action of the WordPiece alogorithm""" # load BERT model (for embeddings) checkpoint = "bert-base-uncased" tokenizer = BertTokenizer.from_pretrained(checkpoint) model = TFBertModel.from_pretrained(checkpoint) # Run BERT tokenizing + encoding descr_processed_tokenized = tokenizer( list(data["text"]), return_tensors="tf", truncation=True, padding=True, max_length=128, ) data = data.assign(text_tokenized=descr_processed_tokenized) return data def run_bertopic(data): """ " End-to-end BERTopic model""" # load BERT model (for embeddings) checkpoint = "bert-base-uncased" tokenizer = BertTokenizer.from_pretrained(checkpoint) model = TFBertModel.from_pretrained(checkpoint) # Run BERT tokenizing + encoding descr_processed_tokenized = tokenizer( list(data["text"]), return_tensors="tf", truncation=True, padding=True, max_length=128, ) output_bert = model(descr_processed_tokenized) # Get sentence embeddings from BERTs word embeddings mean_vect = [] for vect in output_bert.last_hidden_state: mean_vect.append(np.mean(vect, axis=0)) data = data.assign(descr_vect=mean_vect) # Use UMAP to lower the dimensionality of the embedding to 3D - [stack makes array(array()) --> array2d] descr_vect_3d = umap.UMAP(n_components=3).fit_transform( np.stack(data["descr_vect"].values) ) data["descr_vect_2d"] = list(descr_vect_3d) # Use BERT's + UMAP vector embeddings for clustering using OPTICS clustering = OPTICS(min_samples=50).fit(np.stack(data["descr_vect_2d"].values)) data["cluster_label"] = clustering.labels_ # Plot the 3D embedding fig_bertopic = plot_bertopic(descr_vect_3d, data) # Extract topic wordclouds return fig_bertopic def plot_bertopic(descr_vect_3d, data): """ " Show the topic clusters over an 3d embedding space""" import plotly.express as px fig = px.scatter_3d( x=descr_vect_3d[:, 0], y=descr_vect_3d[:, 1], z=descr_vect_3d[:, 2], color=data["cluster_label"], ) return fig # gradio interface blocks = gr.Blocks() with blocks: # physical elements session_state = gr.State([]) gr.Markdown(TITLE) gr.Markdown(DESCRIPTION) with gr.Row(): with gr.Column(): gr.Markdown( "## Load the data (must be a csv file with a column named 'text')" ) in_file = gr.File() gr.Markdown("## Inspect the data") in_data = gr.Dataframe() submit_button = gr.Button("Run BERTopic!") gr.Examples(inputs=in_file, examples=EXAMPLES) with gr.Column(): gr.Markdown("## BERTopic Flow") gr.Markdown( "Text -> Word-Piece Tokenization -> BERT-embedding -> UMAP -> HDBSCAN -> Topic" ) gr.Markdown("## Processed Text") out_dataset = gr.Dataframe() gr.Markdown("## Embedding + Projection + Clustering") embedding_plot = gr.Plot(label="BERTopic projections") gr.Markdown("## Extracted Topics") topics_text = gr.Textbox(label="Topics", lines=50) gr.Markdown(ARTICLE) # event listeners in_file = in_file.upload(inputs=in_file, 
# gradio interface
blocks = gr.Blocks()
with blocks:
    # physical elements
    session_state = gr.State([])
    gr.Markdown(TITLE)
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                "## Load the data (must be a csv file with a column named 'text')"
            )
            in_file = gr.File()
            gr.Markdown("## Inspect the data")
            in_data = gr.Dataframe()
            submit_button = gr.Button("Run BERTopic!")
            gr.Examples(inputs=in_file, examples=EXAMPLES)
        with gr.Column():
            gr.Markdown("## BERTopic Flow")
            gr.Markdown(
                "Text -> Word-Piece Tokenization -> BERT-embedding -> UMAP -> OPTICS -> Topic"
            )
            gr.Markdown("## Processed Text")
            out_dataset = gr.Dataframe()
            gr.Markdown("## Embedding + Projection + Clustering")
            embedding_plot = gr.Plot(label="BERTopic projections")
            gr.Markdown("## Extracted Topics")
            topics_text = gr.Textbox(label="Topics", lines=50)
    gr.Markdown(ARTICLE)
    # event listeners: upload -> data preview, click -> tokenized preview,
    # dataframe change -> embedding + clustering plot
    in_file.upload(inputs=in_file, outputs=in_data, fn=load_data)
    submit_button.click(inputs=in_data, outputs=out_dataset, fn=run_bert_tokenization)
    out_dataset.change(inputs=out_dataset, outputs=embedding_plot, fn=run_bertopic)

blocks.launch()
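# A minimal headless usage sketch (assumes the example file `data/ecomm500.csv`
# exists and has a "text" column); comment out `blocks.launch()` above and run:
#
#   data = load_data("data/ecomm500.csv")
#   fig = run_bertopic(data)
#   fig.show()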