Spaces:

dafqi
/

indo_twitter_sentiment_app

Sleeping

File size: 3,890 Bytes

cbf9eb3

import pandas as pd
import plotly.express as px
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import string 
import re #regex library
#umap 
import umap
import hdbscan
import plotly.graph_objects as go
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# import word_tokenize from NLTK
from transformers import AutoTokenizer
from script.plotting import visualize_barchart

def load_stopwords():
    stopwords = pd.read_csv("assets/stopwordbahasa.csv", header=None)
    stopwords = stopwords[0].tolist()
    more_stopword = ["ga","iya","dg",'dengan', 'ia','bahwa','oleh',"sy","kl","gak","ah","apa","kok","mau","yg","pak","bapak","ibu","krn","nya","ya"]
    stopwords = stopwords + more_stopword + list(string.punctuation)
    return stopwords

def tokenisasi(df):
    stopwords = load_stopwords()
    tokenizer = AutoTokenizer.from_pretrained('indobert')
    tokens = df.content.apply(lambda x: tokenizer.tokenize(x))
    tokens = tokens.apply(lambda x: [x for x in x if (not x.startswith('##') and x not in stopwords and len(x) > 4)])
    return tokens

def get_wordcloud(df,kelas_sentiment):
    cmap_dict = {'positif': 'Greens', 'negatif': 'OrRd', 'netral': 'GnBu'}
    tokens = tokenisasi(df[df.sentiment == kelas_sentiment])
    tokens = tokens.apply(lambda x: ' '.join(x))
    text = ' '.join(tokens)
    wordcloud = WordCloud(width = 800, height = 800,
                    background_color ='black',
                    min_font_size = 10,
                    colormap = cmap_dict[kelas_sentiment]).generate(text)
    return wordcloud

def plot_text(df,kelas,embedding_model):
    df = df[df.sentiment == kelas]
    data = embedding_model.encode(df.values.tolist())
    umap_model = umap.UMAP(n_neighbors=min(df.shape[0],5),random_state = 42) 
    umap_data = umap_model.fit_transform(data)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=round((df.shape[0])**(0.5)-1),min_samples=3)
    clusterer.fit(umap_data)

    labels = ['cluster ' + str(i) for i in clusterer.labels_]
    # replace cluster -1 with outlier 
    labels = ["outlier" if i == "cluster -1" else i for i in labels ]
    text = df["content"].str.wrap(50).apply(lambda x: x.replace('\n', '<br>'))
    
    fig = px.scatter(x=umap_data[:,0], y=umap_data[:,1],color = clusterer.labels_)
    # remove legend
    fig = px.scatter(x=umap_data[:,0], y=umap_data[:,1],color = labels,text = text)
    #set text color 
    fig.update_traces(textfont_color='rgba(0,0,0,0)',marker_size = 8)
    # set background color
    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')
    # set margin 
    fig.update_layout(margin=dict(l=40, r=5, t=45, b=40))
    # set axis color to grey
    fig.update_xaxes(showgrid=False, zeroline=False, linecolor='rgb(200,200,200)')
    fig.update_yaxes( zeroline=False, linecolor='rgb(200,200,200)')
    # set font sans-serif
    fig.update_layout(font_family="sans-serif")
    # remove legend
    fig.update_layout(showlegend=False)

    # set legend title to cluster
    return df["content"],data,fig

def topic_modelling(df,embed_df):
    data = df.apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
    stopwords = load_stopwords()
    # remove empty data 
    topic_model = BERTopic(
        calculate_probabilities=True,
        # cluster model 
        hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=5,prediction_data=True),
        vectorizer_model=CountVectorizer(stop_words=stopwords),
        language="indonesian",
    )
    topics, probs = topic_model.fit_transform(data,embed_df)
    topic_labels = topic_model.generate_topic_labels(
        topic_prefix = False,
        separator = ", ",
    )
    topic_model.set_topic_labels(topic_labels)
    fig = visualize_barchart(topic_model)
    # set title to Kata Kunci tiap Topic 
    # fig.update_layout(title_text="Topic yang sering muncul")
    return fig,topic_model