import pandas as pd
import numpy as np
import spacy
import gradio as gr
import umap
from sklearn.cluster import OPTICS
from transformers import BertTokenizer, TFBertModel
import plotly.io as pio
# Configuration: make "plotly_dark" the default template for Plotly figures.
pio.templates.default = "plotly_dark"
# Static text shown on the page.
# TITLE has to be a triple-quoted string: its content spans several lines, and
# a single-quoted literal broken across lines is a SyntaxError.
# NOTE(review): the title and the "My Page" / "here" references in ARTICLE look
# like they lost their original markup (heading / link targets) -- confirm and
# restore if so. Only the visible text is kept here.
TITLE = """
BERTopic - For topics detection on text
"""
DESCRIPTION = r"""Apply BERTopic to a given dataset and extract the most relevant topics.
"""
# Example inputs offered in the UI: one row per example, one entry per input.
EXAMPLES = [
    ["data/ecomm500.csv"],
]
ARTICLE = r"""
Done by dr. Gabriel Lopez
This program follows the BERTopic philosophy, but actually has its own implementation.
For more please visit: My Page
For info about the BERTopic model can be found here
"""
def load_data(fileobj, max_rows=500):
    """Load a CSV dataset and keep only its ``text`` column.

    Args:
        fileobj: Anything ``pandas.read_csv`` accepts as its first argument.
        max_rows: Maximum number of data rows to read. Defaults to 500 to
            keep the later processing steps cheap.

    Returns:
        A DataFrame holding the single column ``text``.

    Raises:
        ValueError: If the CSV has no column named ``text``.
    """
    # Malformed lines are skipped instead of aborting the whole load.
    data = pd.read_csv(fileobj, on_bad_lines="skip", nrows=max_rows)
    # Raise explicitly instead of using `assert`: assertions are removed when
    # Python runs with -O, which would silently disable this validation.
    if "text" not in data.columns:
        raise ValueError("The data must have a column named 'text'")
    return data[["text"]]
def run_nlp_processing(data):
    """Lemmatize the ``text`` column, dropping punctuation and stop words.

    Kept as a reference for a standard NLP preprocessing step.

    Args:
        data: DataFrame with a ``text`` column of strings.

    Returns:
        A new DataFrame whose ``text`` column holds, for every row, the
        lower-cased lemmas of the kept tokens joined by single spaces.
    """
    import os

    # Leave one core free, but never request fewer than one worker:
    # os.cpu_count() may return None, and "count - 1" is 0 on a single core.
    n_workers = max(1, (os.cpu_count() or 1) - 1)

    # NOTE(review): the tagger is disabled although lemmas are read below;
    # depending on the spaCy version the lemmatizer may rely on tagger
    # output -- confirm the lemmas are what is expected.
    nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])

    docs = [
        " ".join(
            token.lemma_.lower()
            for token in doc
            if not (token.is_punct or token.is_stop)
        )
        for doc in nlp.pipe(data["text"].values, n_process=n_workers)
    ]
    # Replace the text column on a copy; the caller's frame is left untouched.
    return data.assign(text=docs)
def run_bert_tokenization(data):
    """Show the action of the WordPiece algorithm on the ``text`` column.

    Args:
        data: DataFrame with a ``text`` column of strings.

    Returns:
        A new DataFrame with an extra ``text_tokenized`` column that holds,
        for every row, the tokens produced by the BERT tokenizer for that
        text, joined by single spaces.
    """
    checkpoint = "bert-base-uncased"
    # Only the tokenizer is needed here. The full BERT model used to be
    # loaded as well and was never used.
    tokenizer = BertTokenizer.from_pretrained(checkpoint)

    # No tensors and no padding: the result is only displayed, so plain
    # per-row lists of token ids are all that is required.
    encoded = tokenizer(list(data["text"]), truncation=True, max_length=128)

    # Store one value per row. The encoding object itself is a mapping of
    # whole-batch arrays and cannot serve as a per-row column.
    word_pieces = [
        " ".join(tokenizer.convert_ids_to_tokens(ids))
        for ids in encoded["input_ids"]
    ]
    return data.assign(text_tokenized=word_pieces)
def run_bertopic(data):
    """Run the end-to-end BERTopic-style pipeline and return its plot.

    Steps: BERT token embeddings -> mean-pooled sentence vectors -> UMAP
    projection to 3 dimensions -> OPTICS clustering -> 3D scatter plot.

    Args:
        data: DataFrame with a ``text`` column of strings.

    Returns:
        The figure built by ``plot_bertopic`` from the 3D projection and the
        cluster labels.
    """
    # Load BERT model (for embeddings)
    checkpoint = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    model = TFBertModel.from_pretrained(checkpoint)

    # Run BERT tokenizing + encoding
    encoded = tokenizer(
        list(data["text"]),
        return_tensors="tf",
        truncation=True,
        padding=True,
        max_length=128,
    )
    output_bert = model(encoded)

    # Sentence embedding = mean of the token embeddings. Texts are padded to
    # a common length, so weight by the attention mask; otherwise the padding
    # positions would be averaged in and distort the vectors of short texts.
    hidden = output_bert.last_hidden_state.numpy()
    mask = encoded["attention_mask"].numpy().astype(hidden.dtype)[:, :, np.newaxis]
    sentence_vects = (hidden * mask).sum(axis=1) / mask.sum(axis=1)

    # Use UMAP to lower the dimensionality of the embedding to 3D
    descr_vect_3d = umap.UMAP(n_components=3).fit_transform(sentence_vects)

    # Cluster the projected vectors with OPTICS. min_samples may not exceed
    # the number of samples, so clamp it for inputs with fewer than 50 rows.
    min_samples = min(50, len(data))
    clustering = OPTICS(min_samples=min_samples).fit(descr_vect_3d)
    data = data.assign(cluster_label=clustering.labels_)

    # Plot the 3D embedding
    fig_bertopic = plot_bertopic(descr_vect_3d, data)
    # TODO: extract topic wordclouds
    return fig_bertopic
def plot_bertopic(descr_vect_3d, data):
    """Show the topic clusters over a 3D embedding space.

    Args:
        descr_vect_3d: 2-D array indexed as ``[:, 0]``, ``[:, 1]`` and
            ``[:, 2]`` to obtain the x, y and z coordinates.
        data: Object whose ``"cluster_label"`` entry supplies the point colors.

    Returns:
        The figure returned by ``plotly.express.scatter_3d``.
    """
    import plotly.express as px

    # One coordinate array per plot axis, taken column by column.
    axes = {
        "x": descr_vect_3d[:, 0],
        "y": descr_vect_3d[:, 1],
        "z": descr_vect_3d[:, 2],
    }
    return px.scatter_3d(color=data["cluster_label"], **axes)
# Gradio interface: page layout first, then the event wiring, then launch.
blocks = gr.Blocks()
with blocks:
    # physical elements
    session_state = gr.State([])
    gr.Markdown(TITLE)
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        # Left column: data loading and inspection
        with gr.Column():
            gr.Markdown(
                "## Load the data (must be a csv file with a column named 'text')"
            )
            in_file = gr.File()
            gr.Markdown("## Inspect the data")
            in_data = gr.Dataframe()
            submit_button = gr.Button("Run BERTopic!")
            gr.Examples(inputs=in_file, examples=EXAMPLES)
        # Right column: pipeline outputs
        with gr.Column():
            gr.Markdown("## BERTopic Flow")
            gr.Markdown(
                "Text -> Word-Piece Tokenization -> BERT-embedding -> UMAP -> HDBSCAN -> Topic"
            )
            gr.Markdown("## Processed Text")
            out_dataset = gr.Dataframe()
            gr.Markdown("## Embedding + Projection + Clustering")
            embedding_plot = gr.Plot(label="BERTopic projections")
            gr.Markdown("## Extracted Topics")
            topics_text = gr.Textbox(label="Topics", lines=50)
    gr.Markdown(ARTICLE)

    # event listeners
    # The result of upload() is deliberately not assigned back to `in_file`,
    # so the name keeps referring to the File component.
    in_file.upload(inputs=in_file, outputs=in_data, fn=load_data)
    # Without a click listener the button did nothing; run the pipeline on
    # the loaded table and show the resulting figure.
    submit_button.click(inputs=in_data, outputs=embedding_plot, fn=run_bertopic)
    # TODO: nothing fills `out_dataset` (tokenization preview) or
    # `topics_text` (extracted topics) yet.

blocks.launch()