File size: 5,575 Bytes

de55574

import html
from collections import OrderedDict

import altair as alt
import nltk
import pandas as pd
import requests
import streamlit as st
import torch
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Fetch the Punkt data that sent_tokenize relies on for sentence splitting.
nltk.download('punkt')

# Load model and tokenizer
model_name = 'C:/projects/sentiment/albert_sentiment_model/checkpoint-3000'


@st.cache_resource
def _load_model_and_tokenizer(path):
    """Load the classifier and its tokenizer from *path* once and reuse them.

    Streamlit re-executes this script on every widget interaction; without
    caching, the checkpoint would be read from disk again on each rerun.

    NOTE(review): objects returned by st.cache_resource are shared between
    sessions -- confirm that is acceptable for this deployment.
    """
    return (
        AutoModelForSequenceClassification.from_pretrained(path),
        AutoTokenizer.from_pretrained(path),
    )


model, tokenizer = _load_model_and_tokenizer(model_name)

# Sentiment class names, listed in the order of the integer keys (0..6)
# under which classification results are looked up.
_SENTIMENT_NAMES = (
    "very positive",
    "positive",
    "somewhat positive",
    "neutral",
    "somewhat negative",
    "negative",
    "very negative",
)

# Sentiment labels as textual descriptions (class index -> name)
sentiment_labels = dict(enumerate(_SENTIMENT_NAMES))

# Background colors for sentiments (name -> CSS rgba() value); the colors
# below are listed in the same order as _SENTIMENT_NAMES.
background_colors = dict(zip(
    _SENTIMENT_NAMES,
    (
        "rgba(0, 255, 0, 0.5)",      # very positive: strongest green
        "rgba(0, 255, 0, 0.3)",      # positive
        "rgba(0, 255, 0, 0.1)",      # somewhat positive: faintest green
        "rgba(128, 128, 128, 0.1)",  # neutral: faint grey
        "rgba(255, 0, 0, 0.1)",      # somewhat negative: faintest red
        "rgba(255, 0, 0, 0.3)",      # negative
        "rgba(255, 0, 0, 0.5)",      # very negative: strongest red
    ),
))

# Function to get text content from a URL
def get_text_from_url(url, timeout=10):
    """Download *url* and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot block the app indefinitely.

    Returns:
        The paragraph texts joined by single spaces, or ``""`` when the
        request fails, the status code is not 200, or the page contains
        no ``<p>`` elements.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Malformed URL, connection error, timeout, ...: report "no text"
        # the same way as a non-200 response instead of raising.
        return ""
    if response.status_code != 200:
        return ""
    soup = BeautifulSoup(response.content, 'html.parser')
    return ' '.join(p.get_text() for p in soup.find_all('p'))

# Function to classify text
def classify_text(text, max_length):
    """Score *text* with the module-level model.

    Args:
        text: Input passed to the tokenizer.
        max_length: Token limit handed to the tokenizer; longer input is
            truncated.

    Returns:
        A list of floats: the softmax over the logits of the first item
        in the batch.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        logits = model(**encoded).logits
    probabilities = torch.softmax(logits, dim=-1)
    return probabilities[0].tolist()

# Function to handle long texts
def classify_long_text(text):
    """Classify *text* piece by piece and average the per-piece scores.

    The text is cut into consecutive slices of ``tokenizer.model_max_length``
    characters (characters, not tokens), and each slice is scored with
    ``classify_text``.

    Args:
        text: The full text to classify.

    Returns:
        A tuple ``(aggregate_scores, chunk_scores_list, chunks)``:
        the element-wise mean of the chunk scores, the list of per-chunk
        score lists, and the list of text chunks. For empty *text* the
        result is all-zero aggregate scores and two empty lists.
    """
    max_length = tokenizer.model_max_length
    label_count = len(sentiment_labels)

    # Split the text into chunks
    chunks = [text[start:start + max_length]
              for start in range(0, len(text), max_length)]
    if not chunks:
        # Nothing to classify; returning early avoids dividing by zero below.
        return [0] * label_count, [], []

    chunk_scores_list = [classify_text(chunk, max_length) for chunk in chunks]

    totals = [0] * label_count
    for chunk_scores in chunk_scores_list:
        totals = [total + score for total, score in zip(totals, chunk_scores)]

    # Average the scores
    aggregate_scores = [total / len(chunks) for total in totals]
    return aggregate_scores, chunk_scores_list, chunks

# Function to classify each sentence in the text
def classify_sentences(text):
    """Split *text* into sentences and label each with its top sentiment.

    Returns:
        A list of ``(sentence, sentiment)`` tuples in sentence order, where
        ``sentiment`` is the ``sentiment_labels`` entry for the
        highest-scoring class (the first one on ties).
    """
    def top_label(sentence):
        probs = classify_text(sentence, tokenizer.model_max_length)
        # Index of the largest score; max() keeps the first of equal maxima.
        best = max(range(len(probs)), key=probs.__getitem__)
        return sentiment_labels[best]

    return [(sentence, top_label(sentence)) for sentence in sent_tokenize(text)]

# Streamlit UI

# Ensure the exact order of labels in the graph
sentiment_order = [
    "very positive", "positive", "somewhat positive",
    "neutral",
    "somewhat negative", "negative", "very negative"
]


def _likelihood_chart(class_scores):
    """Build the bar chart of per-class likelihoods in ``sentiment_order``.

    Args:
        class_scores: Sequence of scores indexed by the integer keys of
            ``sentiment_labels``.

    Returns:
        An Altair bar chart with one bar per sentiment label.
    """
    by_label = {label: class_scores[idx] for idx, label in sentiment_labels.items()}
    # Columns are named 'index' and 'Likelihood' to match the chart encodings.
    frame = pd.DataFrame({
        'index': sentiment_order,
        'Likelihood': [by_label[label] for label in sentiment_order],
    })
    return alt.Chart(frame).mark_bar().encode(
        x=alt.X('index', sort=sentiment_order, title='Sentiment'),
        y='Likelihood'
    ).properties(
        width=600,
        height=400
    )


st.title("Sentiment Classification from URL")

url = st.text_input("Enter URL:")
if url:
    text = get_text_from_url(url)
    if text:
        scores, chunk_scores_list, chunks = classify_long_text(text)

        # Overall chart for the averaged scores
        st.altair_chart(_likelihood_chart(scores), use_container_width=True)

        # Display each chunk and its own chart
        for number, (chunk_scores, chunk) in enumerate(zip(chunk_scores_list, chunks), start=1):
            st.write(f"Chunk {number}:")
            st.write(chunk)
            st.altair_chart(_likelihood_chart(chunk_scores), use_container_width=True)

        # Sentence-level classification with background colors
        st.write("Extracted Text with Sentiment Highlights:")
        for sentence, sentiment in classify_sentences(text):
            bg_color = background_colors[sentiment]
            # The sentence comes from an arbitrary web page and is rendered
            # as raw HTML, so escape it to keep page markup from being injected.
            safe_sentence = html.escape(sentence)
            st.markdown(f'<span style="background-color: {bg_color}">{safe_sentence}</span>', unsafe_allow_html=True)

    else:
        st.write("Could not extract text from the provided URL.")