import streamlit as st
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import requests
import pandas as pd
import altair as alt
import html
from collections import OrderedDict
from nltk.tokenize import sent_tokenize
import trafilatura

import nltk
nltk.download('punkt')  # Punkt sentence-tokenizer data used by sent_tokenize
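# NLTK 3.9+ renamed the Punkt resource to 'punkt_tab'; fetching it as well
# keeps sent_tokenize working on newer installs (on older versions this is a
# harmless failed download, not an exception).
nltk.download('punkt_tab')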

# Load the fine-tuned sentiment model and its tokenizer from the Hugging Face Hub.
model_name = 'dejanseo/sentiment'
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
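
# Optional: Streamlit reruns the whole script on every interaction, so a cached
# loader avoids reloading the weights each rerun. A minimal sketch, assuming
# Streamlit >= 1.18 (where st.cache_resource was introduced); illustrative only
# and not wired into the module-level load above.
@st.cache_resource
def load_model_and_tokenizer(name=model_name):
    return (AutoModelForSequenceClassification.from_pretrained(name),
            AutoTokenizer.from_pretrained(name))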

sentiment_labels = {
    0: "very positive",
    1: "positive",
    2: "somewhat positive",
    3: "neutral",
    4: "somewhat negative",
    5: "negative",
    6: "very negative"
}

background_colors = {
    "very positive": "rgba(0, 255, 0, 0.5)",
    "positive": "rgba(0, 255, 0, 0.3)",
    "somewhat positive": "rgba(0, 255, 0, 0.1)",
    "neutral": "rgba(128, 128, 128, 0.1)",
    "somewhat negative": "rgba(255, 0, 0, 0.1)",
    "negative": "rgba(255, 0, 0, 0.3)",
    "very negative": "rgba(255, 0, 0, 0.5)"
}

def get_text_from_url(url):
    # Fetch the page and pull out its main article text with trafilatura.
    # Both fetch_url and extract can return None, so normalize to "".
    downloaded = trafilatura.fetch_url(url)
    if downloaded:
        return trafilatura.extract(downloaded) or ""
    return ""
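
# A hedged fallback sketch using the already-imported requests library: some
# servers reject trafilatura's default fetcher, so a plain GET with a
# browser-like User-Agent is worth a second attempt. The helper name is
# illustrative and is not used by the app flow below.
def get_text_from_url_with_fallback(url):
    text = get_text_from_url(url)
    if text:
        return text
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        response.raise_for_status()
        return trafilatura.extract(response.text) or ""
    except requests.RequestException:
        return ""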

def classify_text(text, max_length):
    # Tokenize with truncation to the model's limit, run a no-grad forward
    # pass, and return the softmax distribution over the seven classes.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()
    return scores

def classify_long_text(text):
    # Chunk on token IDs so each chunk actually respects the model's token
    # limit; slicing the raw string by characters would only approximate it.
    max_length = tokenizer.model_max_length
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    stride = max_length - 2  # leave room for the special tokens re-added per chunk
    chunks = [tokenizer.decode(token_ids[i:i + stride]) for i in range(0, len(token_ids), stride)]

    aggregate_scores = [0.0] * len(sentiment_labels)
    chunk_scores_list = []
    for chunk in chunks:
        chunk_scores = classify_text(chunk, max_length)
        chunk_scores_list.append(chunk_scores)
        aggregate_scores = [x + y for x, y in zip(aggregate_scores, chunk_scores)]

    # Unweighted mean of the per-chunk distributions (guard against empty input).
    if chunks:
        aggregate_scores = [x / len(chunks) for x in aggregate_scores]
    return aggregate_scores, chunk_scores_list, chunks
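
# A hedged variant: the plain mean above gives a short trailing chunk the same
# weight as full-length ones. Weighting each chunk by its share of the text is
# one alternative; this helper is illustrative and not called by the app.
def aggregate_weighted(chunk_scores_list, chunks):
    total_chars = sum(len(chunk) for chunk in chunks) or 1
    weighted = [0.0] * len(sentiment_labels)
    for scores, chunk in zip(chunk_scores_list, chunks):
        weight = len(chunk) / total_chars
        weighted = [w + weight * s for w, s in zip(weighted, scores)]
    return weighted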

def classify_sentences(text):
    # Split into sentences and keep only each sentence's argmax label for the
    # highlighted rendering below.
    sentences = sent_tokenize(text)
    sentence_scores = []
    for sentence in sentences:
        scores = classify_text(sentence, tokenizer.model_max_length)
        sentiment_idx = scores.index(max(scores))
        sentiment = sentiment_labels[sentiment_idx]
        sentence_scores.append((sentence, sentiment))
    return sentence_scores
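
# A hedged optimization sketch: the tokenizer and model both accept a list of
# strings, so sentences can be scored in one padded batch instead of one
# forward pass each; classify_sentences above could be adapted to use this.
# Illustrative only; not called by the app flow below.
def classify_texts_batch(texts, max_length):
    inputs = tokenizer(texts, return_tensors="pt", truncation=True,
                       padding=True, max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
    return torch.nn.functional.softmax(outputs.logits, dim=-1).tolist()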

st.title("Sentiment Classification Model by DEJAN")

url = st.text_input("Enter URL:")

if url:
    text = get_text_from_url(url)
    if text:
        scores, chunk_scores_list, chunks = classify_long_text(text)
        scores_dict = {sentiment_labels[i]: scores[i] for i in range(len(sentiment_labels))}

        # Fixed left-to-right display order for the charts.
        sentiment_order = [
            "very positive", "positive", "somewhat positive",
            "neutral",
            "somewhat negative", "negative", "very negative"
        ]
        ordered_scores_dict = OrderedDict((label, scores_dict[label]) for label in sentiment_order)

        df = pd.DataFrame.from_dict(ordered_scores_dict, orient='index', columns=['Likelihood']).reindex(sentiment_order)

        # Document-level sentiment distribution.
        chart = alt.Chart(df.reset_index()).mark_bar().encode(
            x=alt.X('index:N', sort=sentiment_order, title='Sentiment'),
            y='Likelihood'
        ).properties(
            width=600,
            height=400
        )

        st.altair_chart(chart, use_container_width=True)

        for i, (chunk_scores, chunk) in enumerate(zip(chunk_scores_list, chunks)):
            chunk_scores_dict = {sentiment_labels[j]: chunk_scores[j] for j in range(len(sentiment_labels))}
            ordered_chunk_scores_dict = OrderedDict((label, chunk_scores_dict[label]) for label in sentiment_order)
            df_chunk = pd.DataFrame.from_dict(ordered_chunk_scores_dict, orient='index', columns=['Likelihood']).reindex(sentiment_order)

            chunk_chart = alt.Chart(df_chunk.reset_index()).mark_bar().encode(
                x=alt.X('index:N', sort=sentiment_order, title='Sentiment'),
                y='Likelihood'
            ).properties(
                width=600,
                height=400
            )

            st.write(f"Chunk {i + 1}:")
            st.write(chunk)
            st.altair_chart(chunk_chart, use_container_width=True)

        # Sentence-level highlighting; escape the sentence text so stray HTML
        # from the scraped page can't break out of the styled span.
        st.write("Extracted Text with Sentiment Highlights:")
        sentence_scores = classify_sentences(text)
        for sentence, sentiment in sentence_scores:
            bg_color = background_colors[sentiment]
            st.markdown(f'<span style="background-color: {bg_color}">{html.escape(sentence)}</span>', unsafe_allow_html=True)

    else:
        st.write("Could not extract text from the provided URL.")

st.markdown("""
Multi-class sentiment classification model developed by [Dejan Marketing](https://dejanmarketing.com/).

The model is designed to be deployed in an automated pipeline capable of classifying text sentiment for thousands (or even millions) of text chunks, or as part of a scraping pipeline. This is a demo model which may occasionally misclassify some texts. In a typical commercial project, a larger model is deployed for the task, and in special cases, a domain-specific model is developed for the client.

### Engage Our Team

Interested in using this in an automated pipeline for bulk sentiment processing?

Please [book an appointment](https://dejanmarketing.com/conference/) to discuss your needs.
""")