Upload comparison.py

lib/comparison.py  ADDED  (+94 -0)
@@ -0,0 +1,94 @@
from transformers import BertTokenizer, BertForSequenceClassification, pipeline, BertModel, T5ForConditionalGeneration, T5Tokenizer
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the pre-trained FinBERT model for sentiment analysis
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
finbert_model_name = "yiyanghkust/finbert-tone"
finbert_tokenizer = BertTokenizer.from_pretrained(finbert_model_name)
finbert_model = BertForSequenceClassification.from_pretrained(finbert_model_name)
finbert_model.to(device)

finbert_pipeline = pipeline("sentiment-analysis", model=finbert_model, tokenizer=finbert_tokenizer, device=0 if device.type == "cuda" else -1)

# Load the pre-trained T5 model for summarization
t5_model_name = "t5-small"  # You can also use "t5-base" or "t5-large" for better summaries
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name).to(device)
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)


def analyze_and_summarize_paragraphs(paragraphs):
    """Perform sentiment analysis and summarization on each paragraph."""
    results = []

    for paragraph in paragraphs:
        # Perform sentiment analysis using FinBERT
        sentiment_result = finbert_pipeline(paragraph)

        # Perform summarization using T5
        t5_input = f"summarize: {paragraph}"
        input_ids = t5_tokenizer.encode(t5_input, return_tensors="pt", truncation=True, max_length=512).to(device)
        summary_ids = t5_model.generate(input_ids, max_length=80, num_beams=6, early_stopping=True)
        summary = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Keep paragraphs that received a recognized FinBERT label (Positive, Negative, or Neutral)
        if sentiment_result and sentiment_result[0]['label'] in ['Positive', 'Negative', 'Neutral']:
            results.append({
                "paragraph_text": paragraph,
                "summary": summary,
                "sentiment": sentiment_result[0]
            })

    return results


# Load plain BERT to embed the summary texts
bert_model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
model = BertModel.from_pretrained(bert_model_name).to(device)
model.eval()  # Set to evaluation mode


# Word embedding on summary text using BERT
def get_bert_embeddings(texts):
    """Obtain BERT embeddings for a list of texts."""
    embeddings = []
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
            outputs = model(**inputs)
            # Take the mean of token embeddings as the sentence embedding
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
            embeddings.append(embedding)
    return np.array(embeddings)


# Compute the similarity matrix over embeddings
def compute_similarity(embeddings1, embeddings2):
    """Compute pairwise cosine similarity between two sets of embeddings."""
    return cosine_similarity(embeddings1, embeddings2)


# For each summarized paragraph, find the closest summary from the other year and compare contents
def compare_summaries(results1, results2):
    """Compare summaries from two documents and return similarity scores."""
    # Get embeddings for each set of summaries
    summaries1 = [result['summary'] for result in results1]
    summaries2 = [result['summary'] for result in results2]
    sentiment1 = [result['sentiment'] for result in results1]
    sentiment2 = [result['sentiment'] for result in results2]
    embeddings1 = get_bert_embeddings(summaries1)
    embeddings2 = get_bert_embeddings(summaries2)

    # Compute similarity
    similarity_matrix = compute_similarity(embeddings1, embeddings2)

    # Analyze matches: for each summary in doc1, take the most similar summary in doc2
    matches = []
    for i, row in enumerate(similarity_matrix):
        most_similar_index = np.argmax(row)
        similarity_score = float(row[most_similar_index])
        matches.append({
            'summary_doc1': summaries1[i],
            'summary_doc2': summaries2[most_similar_index],
            'sentiment_doc1': sentiment1[i],
            'sentiment_doc2': sentiment2[most_similar_index],
            'similarity_score': similarity_score
        })

    return matches
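A minimal usage sketch, not part of the uploaded file: it assumes the two reports have already been split into lists of paragraph strings upstream of this module; the paragraph texts and variable names below are placeholders for illustration only.

# Hypothetical example of driving the module above.
# The paragraph lists are placeholder data, not real report text.
paragraphs_2022 = [
    "Revenue increased by 12% driven by strong demand in the services segment.",
    "Operating expenses rose due to higher logistics costs.",
]
paragraphs_2023 = [
    "Revenue grew 8%, with services again the main contributor.",
    "Cost pressures eased as logistics costs normalized.",
]

results_2022 = analyze_and_summarize_paragraphs(paragraphs_2022)
results_2023 = analyze_and_summarize_paragraphs(paragraphs_2023)

# Each match pairs a 2022 summary with its closest 2023 summary plus both sentiments.
for match in compare_summaries(results_2022, results_2023):
    print(f"{match['similarity_score']:.3f}",
          match['sentiment_doc1']['label'], "->", match['sentiment_doc2']['label'])
    print("  2022:", match['summary_doc1'])
    print("  2023:", match['summary_doc2'])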