import streamlit as st import torch from transformers import AutoModelForSequenceClassification, AutoTokenizer import requests from bs4 import BeautifulSoup import pandas as pd import altair as alt from collections import OrderedDict from nltk.tokenize import sent_tokenize # Load the punkt tokenizer from nltk import nltk nltk.download('punkt') # Load model and tokenizer model_name = 'dejanseo/sentiment' model = AutoModelForSequenceClassification.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name) # Sentiment labels as textual descriptions sentiment_labels = { 0: "very positive", 1: "positive", 2: "somewhat positive", 3: "neutral", 4: "somewhat negative", 5: "negative", 6: "very negative" } # Background colors for sentiments background_colors = { "very positive": "rgba(0, 255, 0, 0.5)", "positive": "rgba(0, 255, 0, 0.3)", "somewhat positive": "rgba(0, 255, 0, 0.1)", "neutral": "rgba(128, 128, 128, 0.1)", "somewhat negative": "rgba(255, 0, 0, 0.1)", "negative": "rgba(255, 0, 0, 0.3)", "very negative": "rgba(255, 0, 0, 0.5)" } # Function to get text content from a URL def get_text_from_url(url): response = requests.get(url) if response.status_code == 200: soup = BeautifulSoup(response.content, 'html.parser') paragraphs = soup.find_all('p') return ' '.join(p.get_text() for p in paragraphs) return "" # Function to classify text def classify_text(text, max_length): inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length) with torch.no_grad(): outputs = model(**inputs) scores = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist() return scores # Function to handle long texts def classify_long_text(text): max_length = tokenizer.model_max_length # Split the text into chunks chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)] aggregate_scores = [0] * len(sentiment_labels) chunk_scores_list = [] for chunk in chunks: chunk_scores = classify_text(chunk, max_length) chunk_scores_list.append(chunk_scores) aggregate_scores = [x + y for x, y in zip(aggregate_scores, chunk_scores)] # Average the scores aggregate_scores = [x / len(chunks) for x in aggregate_scores] return aggregate_scores, chunk_scores_list, chunks # Function to classify each sentence in the text def classify_sentences(text): sentences = sent_tokenize(text) sentence_scores = [] for sentence in sentences: scores = classify_text(sentence, tokenizer.model_max_length) sentiment_idx = scores.index(max(scores)) sentiment = sentiment_labels[sentiment_idx] sentence_scores.append((sentence, sentiment)) return sentence_scores # Streamlit UI st.title("Sentiment Classification from URL") url = st.text_input("Enter URL:") # Additional information st.markdown(""" Multi-label sentiment classification model developed by [Dejan Marketing](https://dejanmarketing.com/). The model is designed to be deployed in an automated pipeline capable of classifying text sentiment for thousands (or even millions) of text chunks or as a part of a scraping pipeline. This is a demo model which may occassionally misclasify some texts. In a typical commercial project, a larger model is deployed for the task, and in special cases, a domain-specific model is developed for the client. # Engage Our Team Interested in using this in an automated pipeline for bulk query processing? Please [book an appointment](https://dejanmarketing.com/conference/) to discuss your needs. """) if url: text = get_text_from_url(url) if text: scores, chunk_scores_list, chunks = classify_long_text(text) scores_dict = {sentiment_labels[i]: scores[i] for i in range(len(sentiment_labels))} # Ensure the exact order of labels in the graph sentiment_order = [ "very positive", "positive", "somewhat positive", "neutral", "somewhat negative", "negative", "very negative" ] ordered_scores_dict = OrderedDict((label, scores_dict[label]) for label in sentiment_order) # Prepare the DataFrame and reindex df = pd.DataFrame.from_dict(ordered_scores_dict, orient='index', columns=['Likelihood']).reindex(sentiment_order) # Use Altair to plot the bar chart chart = alt.Chart(df.reset_index()).mark_bar().encode( x=alt.X('index', sort=sentiment_order, title='Sentiment'), y='Likelihood' ).properties( width=600, height=400 ) st.altair_chart(chart, use_container_width=True) # Display each chunk and its own chart for i, (chunk_scores, chunk) in enumerate(zip(chunk_scores_list, chunks)): chunk_scores_dict = {sentiment_labels[j]: chunk_scores[j] for j in range(len(sentiment_labels))} ordered_chunk_scores_dict = OrderedDict((label, chunk_scores_dict[label]) for label in sentiment_order) df_chunk = pd.DataFrame.from_dict(ordered_chunk_scores_dict, orient='index', columns=['Likelihood']).reindex(sentiment_order) chunk_chart = alt.Chart(df_chunk.reset_index()).mark_bar().encode( x=alt.X('index', sort=sentiment_order, title='Sentiment'), y='Likelihood' ).properties( width=600, height=400 ) st.write(f"Chunk {i + 1}:") st.write(chunk) st.altair_chart(chunk_chart, use_container_width=True) # Sentence-level classification with background colors st.write("Extracted Text with Sentiment Highlights:") sentence_scores = classify_sentences(text) for sentence, sentiment in sentence_scores: bg_color = background_colors[sentiment] st.markdown(f'{sentence}', unsafe_allow_html=True) else: st.write("Could not extract text from the provided URL.") # Additional information at the end st.markdown(""" Multi-label sentiment classification model developed by [Dejan Marketing](https://dejanmarketing.com/). The model is designed to be deployed in an automated pipeline capable of classifying text sentiment for thousands (or even millions) of text chunks or as a part of a scraping pipeline. This is a demo model which may occassionally misclasify some texts. In a typical commercial project, a larger model is deployed for the task, and in special cases, a domain-specific model is developed for the client. ### Engage Our Team Interested in using this in an automated pipeline for bulk query processing? Please [book an appointment](https://dejanmarketing.com/conference/) to discuss your needs. """)