import pandas as pd
import plotly.express as px
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import string
import re #regex library
import umap
import hdbscan
import plotly.graph_objects as go
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
# tokenizer loader from Hugging Face transformers (NLTK's word_tokenize is not imported here)
from transformers import AutoTokenizer
from script.plotting import visualize_barchart
def load_stopwords():
    """Build the stop-word list used when filtering tokens.

    The first column of ``assets/stopwordbahasa.csv`` (read without a header
    row) is extended with a hand-picked list of extra words and with every
    character of ``string.punctuation``.

    Returns:
        list: CSV entries first, then the extra words, then the punctuation
        characters.
    """
    extra_words = [
        "ga", "iya", "dg", 'dengan', 'ia', 'bahwa', 'oleh', "sy", "kl",
        "gak", "ah", "apa", "kok", "mau", "yg", "pak", "bapak", "ibu",
        "krn", "nya", "ya",
    ]
    csv_frame = pd.read_csv("assets/stopwordbahasa.csv", header=None)
    csv_words = csv_frame[0].tolist()
    return [*csv_words, *extra_words, *string.punctuation]
def tokenisasi(df):
    """Tokenize ``df.content`` and keep only the informative tokens.

    Each text is split with the ``'indobert'`` tokenizer. A token is kept
    only if it does not start with ``##``, is not a stop word, and is longer
    than four characters.

    Args:
        df: DataFrame whose ``content`` column holds the texts to tokenize.

    Returns:
        The result of ``df.content.apply``: one list of kept tokens per row.
    """
    # A set gives O(1) membership tests; the original scanned the whole
    # stop-word list once for every token.
    stopwords = set(load_stopwords())
    tokenizer = AutoTokenizer.from_pretrained('indobert')

    def keep(token):
        return (
            not token.startswith('##')
            and token not in stopwords
            and len(token) > 4
        )

    return df.content.apply(
        lambda text: [token for token in tokenizer.tokenize(text) if keep(token)]
    )
def get_wordcloud(df,kelas_sentiment):
    """Generate a word cloud from the texts of one sentiment class.

    Rows whose ``sentiment`` equals ``kelas_sentiment`` are tokenized with
    ``tokenisasi``; all kept tokens are joined into a single space-separated
    string and passed to ``WordCloud.generate``.

    Args:
        df: DataFrame with ``sentiment`` and ``content`` columns.
        kelas_sentiment: One of ``'positif'``, ``'negatif'`` or ``'netral'``;
            it also selects the colormap.

    Returns:
        The object returned by ``WordCloud(...).generate(text)``.
    """
    subset = df[df.sentiment == kelas_sentiment]
    corpus = ' '.join(' '.join(words) for words in tokenisasi(subset))

    # The colormap lookup happens after tokenization, as in the original call.
    palette = {'positif': 'Greens', 'negatif': 'OrRd', 'netral': 'GnBu'}[kelas_sentiment]
    cloud = WordCloud(
        width=800,
        height=800,
        background_color='black',
        min_font_size=10,
        colormap=palette,
    )
    return cloud.generate(corpus)
def plot_text(df,kelas,embedding_model):
    """Embed, project and cluster the texts of one sentiment class, then plot them.

    Rows whose ``sentiment`` equals ``kelas`` are selected. Their ``content``
    is encoded with ``embedding_model``, reduced with UMAP, clustered with
    HDBSCAN, and drawn as a Plotly scatter plot coloured by cluster label.

    Args:
        df: DataFrame with ``sentiment`` and ``content`` columns.
        kelas: Sentiment value to keep.
        embedding_model: Object with an ``encode`` method taking a list of
            strings (presumably a sentence-transformers model; confirm
            against the caller).

    Returns:
        tuple: ``(content, embeddings, fig)``: the selected ``content``
        column, the value returned by ``embedding_model.encode``, and the
        scatter figure.
    """
    df = df[df.sentiment == kelas]
    n_docs = df.shape[0]

    # Encode the text column only; ``df.values`` would hand whole rows
    # (sentiment label included) to the encoder.
    data = embedding_model.encode(df["content"].tolist())

    umap_model = umap.UMAP(n_neighbors=min(n_docs, 5), random_state=42)
    umap_data = umap_model.fit_transform(data)

    # HDBSCAN rejects a min_cluster_size below 2, which sqrt(n) - 1 yields
    # for small classes, so clamp it.
    min_cluster_size = max(2, round(n_docs ** 0.5 - 1))
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=3)
    # ``labels_`` only exists after fitting; the UMAP projection is clustered.
    clusterer.fit(umap_data)

    # HDBSCAN marks noise points with -1; show those as "outlier".
    labels = [
        "outlier" if cluster_id == -1 else 'cluster ' + str(cluster_id)
        for cluster_id in clusterer.labels_
    ]

    # Wrap long texts and use <br> so Plotly renders the line breaks.
    text = df["content"].str.wrap(50).apply(lambda x: x.replace('\n', '<br>'))

    fig = px.scatter(x=umap_data[:, 0], y=umap_data[:, 1], color=labels, text=text)
    # Fully transparent text colour: the point text is not drawn on the plot.
    fig.update_traces(textfont_color='rgba(0,0,0,0)', marker_size=8)
    fig.update_layout(margin=dict(l=40, r=5, t=45, b=40))
    # Light grey axis lines, no zero lines, no vertical grid.
    fig.update_xaxes(showgrid=False, zeroline=False, linecolor='rgb(200,200,200)')
    fig.update_yaxes(zeroline=False, linecolor='rgb(200,200,200)')
    return df["content"], data, fig
def topic_modelling(df,embed_df):
    """Fit a BERTopic model on pre-embedded texts and chart the topic keywords.

    Args:
        df: pandas Series of document strings.
        embed_df: Precomputed embeddings handed to ``BERTopic.fit_transform``
            together with the documents (presumably one row per document, in
            the same order; confirm against the caller).

    Returns:
        tuple: ``(fig, topic_model)``: the figure produced by
        ``visualize_barchart`` and the fitted ``BERTopic`` instance.
    """
    # Words of three characters or fewer are dropped before topic extraction.
    data = df.apply(
        lambda text: ' '.join(word for word in text.split() if len(word) > 3)
    )
    stopwords = load_stopwords()

    # NOTE(review): documents left empty by the filter above are still passed
    # on; dropping them would also require dropping their rows in embed_df.
    topic_model = BERTopic(
        # cluster model
        hdbscan_model=hdbscan.HDBSCAN(min_cluster_size=5, prediction_data=True),
        # keep stop words out of the per-topic keyword lists
        vectorizer_model=CountVectorizer(stop_words=stopwords),
    )
    # BERTopic documents its input as a list of strings.
    topic_model.fit_transform(data.tolist(), embed_df)

    topic_labels = topic_model.generate_topic_labels(
        topic_prefix=False,
        separator=", ",
    )
    # Register the generated labels on the model so they are not discarded.
    topic_model.set_topic_labels(topic_labels)

    fig = visualize_barchart(topic_model)
    return fig, topic_model