Cachoups committed
Commit 367182a · verified · 1 Parent(s): ec643ee

Upload comparison.py

Files changed (1): lib/comparison.py (+94, -0)
lib/comparison.py ADDED
@@ -0,0 +1,94 @@
+ from transformers import BertTokenizer, BertForSequenceClassification, pipeline, BertModel, T5ForConditionalGeneration, T5Tokenizer
+ import torch
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+ import numpy as np
+ # Load the pre-trained FinBERT model for sentiment analysis
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ finbert_model_name = "yiyanghkust/finbert-tone"
+ finbert_tokenizer = BertTokenizer.from_pretrained(finbert_model_name)
+ finbert_model = BertForSequenceClassification.from_pretrained(finbert_model_name)
+ finbert_model.to(device)
+
+ finbert_pipeline = pipeline("sentiment-analysis", model=finbert_model, tokenizer=finbert_tokenizer, device=0 if device.type == "cuda" else -1)
+
+ # Load the pre-trained T5 model for summarization
+ t5_model_name = "t5-small"  # "t5-base" or "t5-large" give better summaries at higher cost
+ t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name).to(device)
+ t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
+
+ def analyze_and_summarize_paragraphs(paragraphs):
+     """Perform sentiment analysis and summarization on each paragraph."""
+     results = []
+
+     for paragraph in paragraphs:
+         # Perform sentiment analysis using FinBERT
+         sentiment_result = finbert_pipeline(paragraph)
+
+         # Summarize with T5, truncating inputs that exceed the model's limit
+         t5_input = f"summarize: {paragraph}"
+         input_ids = t5_tokenizer.encode(t5_input, return_tensors="pt", truncation=True).to(device)
+         summary_ids = t5_model.generate(input_ids, max_length=80, num_beams=6, early_stopping=True)
+         summary = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
+         # Keep results with a recognized label (FinBERT-tone emits Positive, Negative, or Neutral)
+         if sentiment_result and sentiment_result[0]['label'] in ['Positive', 'Negative', 'Neutral']:
+             results.append({
+                 "paragraph_text": paragraph,
+                 "summary": summary,
+                 "sentiment": sentiment_result[0]
+             })
+
+     return results
+
+ bert_model_name = "bert-base-uncased"
+ tokenizer = BertTokenizer.from_pretrained(bert_model_name)
+ model = BertModel.from_pretrained(bert_model_name).to(device)
+ model.eval()  # Set to evaluation mode
+
+ # Sentence embeddings for summary text using plain BERT (mean-pooled token states)
+ def get_bert_embeddings(texts):
+     """Obtain BERT embeddings for a list of texts."""
+     embeddings = []
+     with torch.no_grad():
+         for text in texts:
+             inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
+             outputs = model(**inputs)
+             # Take the mean of token embeddings as the sentence embedding
+             embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
+             embeddings.append(embedding)
+     return np.array(embeddings)
+
+ # Compute similarity matrices over embeddings
+ def compute_similarity(embeddings1, embeddings2):
+     """Compute pairwise cosine similarity between two sets of embeddings."""
+     return cosine_similarity(embeddings1, embeddings2)
+
+ # For each summarized paragraph, find the closest summary from the other year and compare contents
+ def compare_summaries(results1, results2):
+     """Compare summaries from two documents and return similarity scores."""
+     # Get embeddings for each set of summaries
+     summaries1 = [result['summary'] for result in results1]
+     summaries2 = [result['summary'] for result in results2]
+     sentiment1 = [result['sentiment'] for result in results1]
+     sentiment2 = [result['sentiment'] for result in results2]
+     embeddings1 = get_bert_embeddings(summaries1)
+     embeddings2 = get_bert_embeddings(summaries2)
+
+     # Compute similarity
+     similarity_matrix = compute_similarity(embeddings1, embeddings2)
+
+     # For each summary in doc1, keep its highest-scoring match in doc2
+     matches = []
+     for i, row in enumerate(similarity_matrix):
+         most_similar_index = np.argmax(row)
+         similarity_score = row[most_similar_index]
+         matches.append({
+             'summary_doc1': summaries1[i],
+             'summary_doc2': summaries2[most_similar_index],
+             'sentiment_doc1': sentiment1[i],
+             'sentiment_doc2': sentiment2[most_similar_index],
+             'similarity_score': similarity_score
+         })
+
+     return matches
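
For context, a minimal usage sketch (not part of the commit): it assumes the module is importable as lib.comparison, and the input lists paragraphs_2022 / paragraphs_2023 are illustrative names, not taken from the repository.

from lib.comparison import analyze_and_summarize_paragraphs, compare_summaries

# Hypothetical inputs: paragraph lists from two annual reports (illustrative only)
paragraphs_2022 = ["Revenue grew 4% year over year on strong demand."]
paragraphs_2023 = ["Revenue increased 6%, reflecting continued demand growth."]

results_2022 = analyze_and_summarize_paragraphs(paragraphs_2022)
results_2023 = analyze_and_summarize_paragraphs(paragraphs_2023)

# Each match pairs a doc1 summary with its closest doc2 summary plus both sentiments
for match in compare_summaries(results_2022, results_2023):
    print(f"{match['similarity_score']:.3f} {match['sentiment_doc1']['label']} -> {match['sentiment_doc2']['label']}")
    print("  doc1:", match['summary_doc1'])
    print("  doc2:", match['summary_doc2'])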