FinanceReport / lib /comparison.py
Cachoups's picture
Upload comparison.py
367182a verified
raw
history blame
4.33 kB
from transformers import BertTokenizer, BertForSequenceClassification, pipeline, BertModel,T5ForConditionalGeneration, T5Tokenizer
import torch
# NOTE(review): TfidfVectorizer is imported but not used anywhere in this file
# as shown — possibly a leftover or used by code outside this view; confirm.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# --- Module-level setup: models are loaded once at import time (downloads
# --- weights on first run) and shared by the functions below.
# Load the pre-trained FinBERT model for sentiment analysis
# Prefer the GPU when one is available; FinBERT and T5 are both moved there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
finbert_model_name = "yiyanghkust/finbert-tone"
finbert_tokenizer = BertTokenizer.from_pretrained(finbert_model_name)
finbert_model = BertForSequenceClassification.from_pretrained(finbert_model_name)
finbert_model.to(device)
# transformers pipelines take a device *index*: 0 = first CUDA device, -1 = CPU.
finbert_pipeline = pipeline("sentiment-analysis", model=finbert_model, tokenizer=finbert_tokenizer, device=0 if device.type == "cuda" else -1)
# Load the pre-trained T5 model for summarization
t5_model_name = "t5-small" # You can also use "t5-base" or "t5-large" for better summaries
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name).to(device)
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
def analyze_and_summarize_paragraphs(paragraphs):
    """Run FinBERT sentiment analysis and T5 summarization on each paragraph.

    Parameters
    ----------
    paragraphs : iterable of str
        Paragraph texts to analyze.

    Returns
    -------
    list of dict
        One entry per paragraph whose FinBERT label is Positive, Negative,
        or Neutral, with keys:
        - "paragraph_text": the original paragraph text
        - "summary": the T5-generated summary
        - "sentiment": FinBERT's top result, e.g. {"label": ..., "score": ...}
    """
    results = []
    for paragraph in paragraphs:
        # Sentiment analysis with FinBERT; truncate so paragraphs longer than
        # BERT's 512-token limit don't raise a runtime error.
        sentiment_result = finbert_pipeline(paragraph, truncation=True)
        # Summarization with T5 (the "summarize:" prefix selects the task).
        t5_input = f"summarize: {paragraph}"
        # Truncate the encoder input to T5's 512-token context window.
        input_ids = t5_tokenizer.encode(t5_input, return_tensors="pt", truncation=True, max_length=512).to(device)
        summary_ids = t5_model.generate(input_ids, max_length=80, num_beams=6, early_stopping=True)
        summary = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        # Keep all three finbert-tone labels; this check mainly guards against
        # an empty or malformed pipeline result, not a sentiment filter.
        if sentiment_result and sentiment_result[0]['label'] in ['Positive', 'Negative', 'Neutral']:
            results.append({
                "paragraph_text": paragraph,
                "summary": summary,
                "sentiment": sentiment_result[0]
            })
    return results
# Plain BERT (not FinBERT) is loaded below solely to embed summaries for
# similarity comparison. Note it is never moved to `device`, so unlike the
# models above it runs on CPU — presumably intentional; confirm.
bert_model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
model = BertModel.from_pretrained(bert_model_name)
model.eval()  # Set to evaluation mode (disables dropout → deterministic embeddings)
# Word embedding on summary text using BERT
def get_bert_embeddings(texts):
    """Compute one BERT sentence embedding per input text.

    Each text is tokenized, passed through the module-level BERT model, and
    reduced to a single vector by mean-pooling the token embeddings.

    Returns a numpy array with one row per text.
    """
    def _embed(sentence):
        # Mean over the token dimension collapses (1, tokens, hidden)
        # down to a single sentence vector.
        encoded = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
        hidden_states = model(**encoded).last_hidden_state
        return hidden_states.mean(dim=1).squeeze().numpy()

    # Inference only — no gradients needed.
    with torch.no_grad():
        return np.array([_embed(sentence) for sentence in texts])
# Compute similarity matrices over embeddings
def compute_similarity(embeddings1, embeddings2):
    """Return the pairwise cosine-similarity matrix of two embedding sets.

    Entry (i, j) of the result is the cosine similarity between
    embeddings1[i] and embeddings2[j].
    """
    similarity_matrix = cosine_similarity(embeddings1, embeddings2)
    return similarity_matrix
# For each summarized paragraph, find the closest summary from the other year and compare contents
def compare_summaries(results1, results2):
    """Match each summary in results1 with its most similar one in results2.

    Parameters
    ----------
    results1, results2 : list of dict
        Outputs of analyze_and_summarize_paragraphs; each dict must have at
        least the "summary" and "sentiment" keys.

    Returns
    -------
    list of dict
        One entry per item in results1, pairing it with its best match from
        results2: the two summaries, the two sentiments, and the cosine
        similarity score as a plain Python float. Empty when either input
        is empty.
    """
    # Guard: with no summaries on one side there is nothing to match, and the
    # embedding/similarity calls below would fail on empty arrays.
    if not results1 or not results2:
        return []
    summaries1 = [result['summary'] for result in results1]
    summaries2 = [result['summary'] for result in results2]
    sentiment1 = [result['sentiment'] for result in results1]
    sentiment2 = [result['sentiment'] for result in results2]
    embeddings1 = get_bert_embeddings(summaries1)
    embeddings2 = get_bert_embeddings(summaries2)
    # Pairwise cosine similarity between the two sets of summary embeddings.
    similarity_matrix = compute_similarity(embeddings1, embeddings2)
    # For every summary in doc1, pick the highest-scoring summary in doc2.
    matches = []
    for i, row in enumerate(similarity_matrix):
        most_similar_index = int(np.argmax(row))
        matches.append({
            'summary_doc1': summaries1[i],
            'summary_doc2': summaries2[most_similar_index],
            'sentiment_doc1': sentiment1[i],
            'sentiment_doc2': sentiment2[most_similar_index],
            # Cast numpy scalar to float so results serialize (e.g. to JSON).
            'similarity_score': float(row[most_similar_index]),
        })
    return matches