dejanseo committed on
Commit
21357a8
1 Parent(s): 44fa285

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +155 -0
app.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import html
from collections import OrderedDict

import altair as alt
import nltk
import pandas as pd
import requests
import streamlit as st
import torch
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForSequenceClassification, AutoTokenizer
11
+
12
# sent_tokenize needs the Punkt sentence-splitter data. Older NLTK releases
# load it from the 'punkt' package, while NLTK 3.9+ looks for 'punkt_tab'
# and raises LookupError if only 'punkt' is present, so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')
13
+
14
# Load model and tokenizer
model_name = 'dejanseo/sentiment'


@st.cache_resource
def _load_model_and_tokenizer(name):
    """Load the sequence-classification model and its tokenizer for *name*.

    Streamlit re-executes the whole script on every user interaction;
    ``st.cache_resource`` keeps the loaded objects alive between reruns so
    the weights are not loaded again each time.

    Args:
        name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    loaded_model = AutoModelForSequenceClassification.from_pretrained(name)
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    return loaded_model, loaded_tokenizer


model, tokenizer = _load_model_and_tokenizer(model_name)
18
+
19
# Sentiment labels as textual descriptions.
# Keys are consecutive integers starting at 0; each value is the
# human-readable name shown for that class index.
sentiment_labels = dict(enumerate((
    "very positive",
    "positive",
    "somewhat positive",
    "neutral",
    "somewhat negative",
    "negative",
    "very negative",
)))
29
+
30
# Background colors for sentiments.
# Maps each sentiment label to a CSS rgba() color: green shades for the
# positive labels, grey for neutral, red shades for the negative labels.
background_colors = dict([
    ("very positive", "rgba(0, 255, 0, 0.5)"),
    ("positive", "rgba(0, 255, 0, 0.3)"),
    ("somewhat positive", "rgba(0, 255, 0, 0.1)"),
    ("neutral", "rgba(128, 128, 128, 0.1)"),
    ("somewhat negative", "rgba(255, 0, 0, 0.1)"),
    ("negative", "rgba(255, 0, 0, 0.3)"),
    ("very negative", "rgba(255, 0, 0, 0.5)"),
])
40
+
41
# Function to get text content from a URL
def get_text_from_url(url, timeout=10):
    """Download *url* and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        The text of every ``<p>`` element joined by single spaces, or an
        empty string when the request fails (invalid URL, connection
        error, timeout) or the response status is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Report network and URL errors the same way as a non-200
        # response so the caller only has to check for an empty string.
        return ""
    if response.status_code != 200:
        return ""
    soup = BeautifulSoup(response.content, 'html.parser')
    return ' '.join(p.get_text() for p in soup.find_all('p'))
49
+
50
# Function to classify text
def classify_text(text, max_length):
    """Return the softmax class scores the model assigns to *text*.

    Args:
        text: The string to classify.
        max_length: Passed to the tokenizer as ``max_length``; longer
            inputs are truncated.

    Returns:
        A list of floats: the softmax over the model's logits for the
        first row of the batch.
    """
    encoded = tokenizer(
        text,
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
    return probabilities[0].tolist()
57
+
58
# Function to handle long texts
def classify_long_text(text):
    """Classify *text* chunk by chunk and average the per-chunk scores.

    The text is cut into consecutive slices of
    ``tokenizer.model_max_length`` characters, each slice is scored with
    ``classify_text``, and the scores are averaged position by position.

    Args:
        text: The full text to classify.

    Returns:
        A tuple ``(aggregate_scores, chunk_scores_list, chunks)``:
        the averaged scores, the score list of each chunk, and the chunk
        strings themselves. For empty *text* the result is all-zero
        scores and two empty lists.
    """
    if not text:
        # No chunks means nothing to average; returning early avoids the
        # division by zero the averaging step would otherwise hit.
        return [0.0] * len(sentiment_labels), [], []

    max_length = tokenizer.model_max_length
    # NOTE: slices are measured in characters, while classify_text applies
    # the same number as the tokenizer's max_length.
    chunks = [text[start:start + max_length] for start in range(0, len(text), max_length)]

    aggregate_scores = [0] * len(sentiment_labels)
    chunk_scores_list = []
    for chunk in chunks:
        chunk_scores = classify_text(chunk, max_length)
        chunk_scores_list.append(chunk_scores)
        aggregate_scores = [total + score for total, score in zip(aggregate_scores, chunk_scores)]

    # Average the scores
    chunk_count = len(chunks)
    aggregate_scores = [total / chunk_count for total in aggregate_scores]
    return aggregate_scores, chunk_scores_list, chunks
72
+
73
# Function to classify each sentence in the text
def classify_sentences(text):
    """Split *text* into sentences and label each with its top sentiment.

    Args:
        text: The text to split with ``sent_tokenize``.

    Returns:
        A list of ``(sentence, sentiment)`` tuples in sentence order,
        where ``sentiment`` is the label of the highest-scoring class.
    """
    def _top_label(sentence):
        # Label of the first class that reaches the maximum score.
        probabilities = classify_text(sentence, tokenizer.model_max_length)
        return sentiment_labels[probabilities.index(max(probabilities))]

    return [(sentence, _top_label(sentence)) for sentence in sent_tokenize(text)]
83
+
84
# Streamlit UI
st.title("Sentiment Classification from URL")

url = st.text_input("Enter URL:")

# Ensure the exact order of labels in every graph
sentiment_order = [
    "very positive", "positive", "somewhat positive",
    "neutral",
    "somewhat negative", "negative", "very negative",
]


def _sentiment_chart(scores):
    """Build a bar chart of *scores* with bars in ``sentiment_order``.

    Args:
        scores: Sequence of likelihoods indexed by the keys of
            ``sentiment_labels``.

    Returns:
        An Altair bar chart with one bar per sentiment label.
    """
    likelihood_by_label = {label: scores[idx] for idx, label in sentiment_labels.items()}
    frame = pd.DataFrame({
        'index': sentiment_order,
        'Likelihood': [likelihood_by_label[label] for label in sentiment_order],
    })
    return alt.Chart(frame).mark_bar().encode(
        x=alt.X('index', sort=sentiment_order, title='Sentiment'),
        y='Likelihood',
    ).properties(
        width=600,
        height=400,
    )


if url:
    text = get_text_from_url(url)
    if text:
        scores, chunk_scores_list, chunks = classify_long_text(text)

        # Overall chart for the whole page
        st.altair_chart(_sentiment_chart(scores), use_container_width=True)

        # Display each chunk and its own chart
        for number, (chunk_scores, chunk) in enumerate(zip(chunk_scores_list, chunks), start=1):
            st.write(f"Chunk {number}:")
            st.write(chunk)
            st.altair_chart(_sentiment_chart(chunk_scores), use_container_width=True)

        # Sentence-level classification with background colors
        st.write("Extracted Text with Sentiment Highlights:")
        for sentence, sentiment in classify_sentences(text):
            bg_color = background_colors[sentiment]
            # The sentence comes from an arbitrary web page and is rendered
            # with unsafe_allow_html=True, so escape it to keep any markup
            # in the scraped text from being injected into the app.
            safe_sentence = html.escape(sentence)
            st.markdown(
                f'<span style="background-color: {bg_color}">{safe_sentence}</span>',
                unsafe_allow_html=True,
            )

    else:
        st.write("Could not extract text from the provided URL.")
144
+
145
# Additional information at the end: static description and contact links
# rendered as Markdown below the results.
st.markdown("""
Multi-label sentiment classification model developed by [Dejan Marketing](https://dejanmarketing.com/).

The model is designed to be deployed in an automated pipeline capable of classifying text sentiment for thousands (or even millions) of text chunks or as a part of a scraping pipeline. This is a demo model which may occasionally misclassify some texts. In a typical commercial project, a larger model is deployed for the task, and in special cases, a domain-specific model is developed for the client.

### Engage Our Team
Interested in using this in an automated pipeline for bulk query processing?

Please [book an appointment](https://dejanmarketing.com/conference/) to discuss your needs.
""")