Cachoups committed
Commit 1aa667b · verified · 1 Parent(s): 367182a

Update lib/comparison.py

Files changed (1)
  1. lib/comparison.py +48 -94
lib/comparison.py CHANGED
@@ -1,94 +1,48 @@
- from transformers import BertTokenizer, BertForSequenceClassification, pipeline, BertModel, T5ForConditionalGeneration, T5Tokenizer
- import torch
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics.pairwise import cosine_similarity
- import numpy as np
- # Load the pre-trained FinBERT model for sentiment analysis
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- finbert_model_name = "yiyanghkust/finbert-tone"
- finbert_tokenizer = BertTokenizer.from_pretrained(finbert_model_name)
- finbert_model = BertForSequenceClassification.from_pretrained(finbert_model_name)
- finbert_model.to(device)
-
- finbert_pipeline = pipeline("sentiment-analysis", model=finbert_model, tokenizer=finbert_tokenizer, device=0 if device.type == "cuda" else -1)
-
- # Load the pre-trained T5 model for summarization
- t5_model_name = "t5-small"  # "t5-base" or "t5-large" give better summaries
- t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name).to(device)
- t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
-
- def analyze_and_summarize_paragraphs(paragraphs):
-     """Perform sentiment analysis and summarization on each paragraph."""
-     results = []
-
-     for paragraph in paragraphs:
-         # Perform sentiment analysis using FinBERT
-         sentiment_result = finbert_pipeline(paragraph)
-
-         # Perform summarization using T5
-         t5_input = f"summarize: {paragraph}"
-         input_ids = t5_tokenizer.encode(t5_input, return_tensors="pt").to(device)
-         summary_ids = t5_model.generate(input_ids, max_length=80, num_beams=6, early_stopping=True)
-         summary = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-
-         # Keep results whose sentiment label is Positive, Negative, or Neutral
-         if sentiment_result and sentiment_result[0]['label'] in ['Positive', 'Negative', 'Neutral']:
-             results.append({
-                 "paragraph_text": paragraph,
-                 "summary": summary,
-                 "sentiment": sentiment_result[0]
-             })
-
-     return results
-
- bert_model_name = "bert-base-uncased"
- tokenizer = BertTokenizer.from_pretrained(bert_model_name)
- model = BertModel.from_pretrained(bert_model_name)
- model.eval()  # Set to evaluation mode
-
- # Word embedding on summary text using BERT
- def get_bert_embeddings(texts):
-     """Obtain BERT embeddings for a list of texts."""
-     embeddings = []
-     with torch.no_grad():
-         for text in texts:
-             inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
-             outputs = model(**inputs)
-             # Take the mean of token embeddings as the sentence embedding
-             embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
-             embeddings.append(embedding)
-     return np.array(embeddings)
-
- # Compute similarity matrices over embeddings
- def compute_similarity(embeddings1, embeddings2):
-     """Compute pairwise cosine similarity between two sets of embeddings."""
-     return cosine_similarity(embeddings1, embeddings2)
-
- # For each summarized paragraph, find the closest summary from the other year and compare contents
- def compare_summaries(results1, results2):
-     """Compare summaries from two documents and return similarity scores."""
-     # Get embeddings for each set of summaries
-     summaries1 = [result['summary'] for result in results1]
-     summaries2 = [result['summary'] for result in results2]
-     sentiment1 = [result['sentiment'] for result in results1]
-     sentiment2 = [result['sentiment'] for result in results2]
-     embeddings1 = get_bert_embeddings(summaries1)
-     embeddings2 = get_bert_embeddings(summaries2)
-
-     # Compute similarity
-     similarity_matrix = compute_similarity(embeddings1, embeddings2)
-
-     # Analyze matches
-     matches = []
-     for i, row in enumerate(similarity_matrix):
-         most_similar_index = np.argmax(row)
-         similarity_score = row[most_similar_index]
-         matches.append({
-             'summary_doc1': summaries1[i],
-             'summary_doc2': summaries2[most_similar_index],
-             'sentiment_doc1': sentiment1[i],
-             'sentiment_doc2': sentiment2[most_similar_index],
-             'similarity_score': similarity_score
-         })
-
-     return matches
 
+ from transformers import BertTokenizer, BertModel
+ import torch
+ from sklearn.metrics.pairwise import cosine_similarity
+ import numpy as np
+
+ # Load BERT tokenizer and model
+ bert_model_name = "bert-base-uncased"
+ tokenizer = BertTokenizer.from_pretrained(bert_model_name)
+ model = BertModel.from_pretrained(bert_model_name)
+ model.eval()  # Set to evaluation mode
+
+ # Function to obtain BERT embeddings
+ def get_bert_embeddings(texts):
+     """Obtain BERT embeddings for a list of texts."""
+     embeddings = []
+     with torch.no_grad():
+         for text in texts:
+             inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
+             outputs = model(**inputs)
+             # Take the mean of token embeddings as the sentence embedding
+             embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
+             embeddings.append(embedding)
+     return np.array(embeddings)
+
+ # Compute similarity matrices over embeddings
+ def compute_similarity(embeddings1, embeddings2):
+     """Compute pairwise cosine similarity between two sets of embeddings."""
+     return cosine_similarity(embeddings1, embeddings2)
+
+ # Compare a paragraph with a list of other paragraphs
+ def compare_summaries(paragraph, paragraphs):
+     """
+     Compare a single paragraph with a list of summaries,
+     and return the most similar summary along with the similarity score.
+     """
+     # Get embeddings for the paragraph and the list of summaries
+     paragraph_embedding = get_bert_embeddings([paragraph])[0]  # Single paragraph embedding
+     summaries_embeddings = get_bert_embeddings(paragraphs)  # Embeddings for the list of paragraphs
+
+     # Compute similarity between the paragraph and each summary
+     similarities = compute_similarity([paragraph_embedding], summaries_embeddings)[0]
+
+     # Find the most similar summary
+     most_similar_index = np.argmax(similarities)  # Index of the most similar summary
+     most_similar_summary = paragraphs[most_similar_index]  # Corresponding summary
+     similarity_score = similarities[most_similar_index]  # Similarity score
+
+     return most_similar_summary, similarity_score
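Below is a minimal usage sketch of the updated compare_summaries signature. It assumes lib/comparison.py is importable as lib.comparison; the example strings and the summaries list are illustrative assumptions, not part of the commit.

# Minimal usage sketch (illustrative; the texts below are made-up assumptions)
from lib.comparison import compare_summaries

# Hypothetical summaries, e.g. produced from another year's report
summaries = [
    "Revenue grew 12% on strong product demand.",
    "Operating costs rose due to supply chain pressures.",
]

paragraph = "Sales increased this year on the back of robust demand."

best_summary, score = compare_summaries(paragraph, summaries)
print(f"Closest summary: {best_summary!r} (cosine similarity: {score:.3f})")

Mean pooling over last_hidden_state is a simple, common way to turn token embeddings into a sentence embedding; the returned score is a cosine similarity, so values closer to 1 indicate a closer match.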