import streamlit as st
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import requests
import pandas as pd
import altair as alt
import html
from collections import OrderedDict
from nltk.tokenize import sent_tokenize
import trafilatura

import nltk
nltk.download('punkt')  # Punkt sentence-tokenizer data used by sent_tokenize
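# NLTK 3.9+ renamed the Punkt resource to 'punkt_tab'; fetching it as well
# keeps sent_tokenize working on newer installs (on older versions this is a
# harmless failed download, not an exception).
nltk.download('punkt_tab')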

# Load the fine-tuned sentiment model and its tokenizer from the Hugging Face Hub.
model_name = 'dejanseo/sentiment'
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
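
# Optional: Streamlit reruns the whole script on every interaction, so a cached
# loader avoids reloading the weights each rerun. A minimal sketch, assuming
# Streamlit >= 1.18 (where st.cache_resource was introduced); illustrative only
# and not wired into the module-level load above.
@st.cache_resource
def load_model_and_tokenizer(name=model_name):
    return (AutoModelForSequenceClassification.from_pretrained(name),
            AutoTokenizer.from_pretrained(name))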

sentiment_labels = {
    0: "very positive",
    1: "positive",
    2: "somewhat positive",
    3: "neutral",
    4: "somewhat negative",
    5: "negative",
    6: "very negative"
}

background_colors = {
    "very positive": "rgba(0, 255, 0, 0.5)",
    "positive": "rgba(0, 255, 0, 0.3)",
    "somewhat positive": "rgba(0, 255, 0, 0.1)",
    "neutral": "rgba(128, 128, 128, 0.1)",
    "somewhat negative": "rgba(255, 0, 0, 0.1)",
    "negative": "rgba(255, 0, 0, 0.3)",
    "very negative": "rgba(255, 0, 0, 0.5)"
}

def get_text_from_url(url):
    # Fetch the page and pull out its main article text with trafilatura.
    # Both fetch_url and extract can return None, so normalize to "".
    downloaded = trafilatura.fetch_url(url)
    if downloaded:
        return trafilatura.extract(downloaded) or ""
    return ""
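
# A hedged fallback sketch using the already-imported requests library: some
# servers reject trafilatura's default fetcher, so a plain GET with a
# browser-like User-Agent is worth a second attempt. The helper name is
# illustrative and is not used by the app flow below.
def get_text_from_url_with_fallback(url):
    text = get_text_from_url(url)
    if text:
        return text
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        response.raise_for_status()
        return trafilatura.extract(response.text) or ""
    except requests.RequestException:
        return ""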

def classify_text(text, max_length):
    # Tokenize with truncation to the model's limit, run a no-grad forward
    # pass, and return the softmax distribution over the seven classes.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()
    return scores

def classify_long_text(text):
    # Chunk on token IDs so each chunk actually respects the model's token
    # limit; slicing the raw string by characters would only approximate it.
    max_length = tokenizer.model_max_length
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    stride = max_length - 2  # leave room for the special tokens re-added per chunk
    chunks = [tokenizer.decode(token_ids[i:i + stride]) for i in range(0, len(token_ids), stride)]

    aggregate_scores = [0.0] * len(sentiment_labels)
    chunk_scores_list = []
    for chunk in chunks:
        chunk_scores = classify_text(chunk, max_length)
        chunk_scores_list.append(chunk_scores)
        aggregate_scores = [x + y for x, y in zip(aggregate_scores, chunk_scores)]

    # Unweighted mean of the per-chunk distributions (guard against empty input).
    if chunks:
        aggregate_scores = [x / len(chunks) for x in aggregate_scores]
    return aggregate_scores, chunk_scores_list, chunks
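
# A hedged variant: the plain mean above gives a short trailing chunk the same
# weight as full-length ones. Weighting each chunk by its share of the text is
# one alternative; this helper is illustrative and not called by the app.
def aggregate_weighted(chunk_scores_list, chunks):
    total_chars = sum(len(chunk) for chunk in chunks) or 1
    weighted = [0.0] * len(sentiment_labels)
    for scores, chunk in zip(chunk_scores_list, chunks):
        weight = len(chunk) / total_chars
        weighted = [w + weight * s for w, s in zip(weighted, scores)]
    return weighted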

def classify_sentences(text):
    # Split into sentences and keep only each sentence's argmax label for the
    # highlighted rendering below.
    sentences = sent_tokenize(text)
    sentence_scores = []
    for sentence in sentences:
        scores = classify_text(sentence, tokenizer.model_max_length)
        sentiment_idx = scores.index(max(scores))
        sentiment = sentiment_labels[sentiment_idx]
        sentence_scores.append((sentence, sentiment))
    return sentence_scores
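
# A hedged optimization sketch: the tokenizer and model both accept a list of
# strings, so sentences can be scored in one padded batch instead of one
# forward pass each; classify_sentences above could be adapted to use this.
# Illustrative only; not called by the app flow below.
def classify_texts_batch(texts, max_length):
    inputs = tokenizer(texts, return_tensors="pt", truncation=True,
                       padding=True, max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
    return torch.nn.functional.softmax(outputs.logits, dim=-1).tolist()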

st.title("Sentiment Classification Model by DEJAN")

url = st.text_input("Enter URL:")

if url:
    text = get_text_from_url(url)
    if text:
        scores, chunk_scores_list, chunks = classify_long_text(text)
        scores_dict = {sentiment_labels[i]: scores[i] for i in range(len(sentiment_labels))}

        # Fixed left-to-right display order for the charts.
        sentiment_order = [
            "very positive", "positive", "somewhat positive",
            "neutral",
            "somewhat negative", "negative", "very negative"
        ]
        ordered_scores_dict = OrderedDict((label, scores_dict[label]) for label in sentiment_order)

        df = pd.DataFrame.from_dict(ordered_scores_dict, orient='index', columns=['Likelihood']).reindex(sentiment_order)

        # Document-level sentiment distribution.
        chart = alt.Chart(df.reset_index()).mark_bar().encode(
            x=alt.X('index:N', sort=sentiment_order, title='Sentiment'),
            y='Likelihood'
        ).properties(
            width=600,
            height=400
        )

        st.altair_chart(chart, use_container_width=True)

        for i, (chunk_scores, chunk) in enumerate(zip(chunk_scores_list, chunks)):
            chunk_scores_dict = {sentiment_labels[j]: chunk_scores[j] for j in range(len(sentiment_labels))}
            ordered_chunk_scores_dict = OrderedDict((label, chunk_scores_dict[label]) for label in sentiment_order)
            df_chunk = pd.DataFrame.from_dict(ordered_chunk_scores_dict, orient='index', columns=['Likelihood']).reindex(sentiment_order)

            chunk_chart = alt.Chart(df_chunk.reset_index()).mark_bar().encode(
                x=alt.X('index:N', sort=sentiment_order, title='Sentiment'),
                y='Likelihood'
            ).properties(
                width=600,
                height=400
            )

            st.write(f"Chunk {i + 1}:")
            st.write(chunk)
            st.altair_chart(chunk_chart, use_container_width=True)

        # Sentence-level highlighting; escape the sentence text so stray HTML
        # from the scraped page can't break out of the styled span.
        st.write("Extracted Text with Sentiment Highlights:")
        sentence_scores = classify_sentences(text)
        for sentence, sentiment in sentence_scores:
            bg_color = background_colors[sentiment]
            st.markdown(f'<span style="background-color: {bg_color}">{html.escape(sentence)}</span>', unsafe_allow_html=True)

    else:
        st.write("Could not extract text from the provided URL.")

st.markdown("""
Multi-class sentiment classification model developed by [Dejan Marketing](https://dejanmarketing.com/).

The model is designed to be deployed in an automated pipeline capable of classifying text sentiment for thousands (or even millions) of text chunks, or as part of a scraping pipeline. This is a demo model which may occasionally misclassify some texts. In a typical commercial project, a larger model is deployed for the task, and in special cases, a domain-specific model is developed for the client.

### Engage Our Team

Interested in using this in an automated pipeline for bulk sentiment processing?

Please [book an appointment](https://dejanmarketing.com/conference/) to discuss your needs.
""")