from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    BertModel,
    T5ForConditionalGeneration,
    T5Tokenizer,
    pipeline,
)
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Load the pre-trained FinBERT model for sentiment analysis
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
finbert_model_name = "yiyanghkust/finbert-tone"
finbert_tokenizer = BertTokenizer.from_pretrained(finbert_model_name)
finbert_model = BertForSequenceClassification.from_pretrained(finbert_model_name)
finbert_model.to(device)
finbert_pipeline = pipeline("sentiment-analysis", model=finbert_model, tokenizer=finbert_tokenizer, device=0 if device.type == "cuda" else -1)
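# Quick sanity check (illustrative; the sample sentence is made up). The
# finbert-tone head returns one of 'Positive', 'Negative', or 'Neutral':
# finbert_pipeline("Quarterly revenue grew 12% year over year.")
# -> e.g. [{'label': 'Positive', 'score': ...}]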
# Load the pre-trained T5 model for summarization
t5_model_name = "t5-small"  # You can also use "t5-base" or "t5-large" for better summaries
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name).to(device)
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
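# Illustrative note: T5 is a text-to-text model, so the task is selected with a
# text prefix; prepending "summarize: " (as done below) triggers summarization.
# A one-off summary might look like this (some_text is a hypothetical string):
# ids = t5_tokenizer.encode("summarize: " + some_text, return_tensors="pt").to(device)
# print(t5_tokenizer.decode(t5_model.generate(ids, max_length=40)[0], skip_special_tokens=True))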
def analyze_and_summarize_paragraphs(paragraphs):
    """Perform sentiment analysis and summarization on each paragraph."""
    results = []
    for paragraph in paragraphs:
        # Perform sentiment analysis using FinBERT (truncate to the model's
        # 512-token limit so long paragraphs do not raise an error)
        sentiment_result = finbert_pipeline(paragraph, truncation=True)
        # Perform summarization using T5
        t5_input = f"summarize: {paragraph}"
        input_ids = t5_tokenizer.encode(t5_input, return_tensors="pt", truncation=True, max_length=512).to(device)
        summary_ids = t5_model.generate(input_ids, max_length=80, num_beams=6, early_stopping=True)
        summary = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        # Keep any paragraph with a recognized sentiment label
        # (finbert-tone emits 'Positive', 'Negative', or 'Neutral')
        if sentiment_result and sentiment_result[0]['label'] in ['Positive', 'Negative', 'Neutral']:
            results.append({
                "paragraph_text": paragraph,
                "summary": summary,
                "sentiment": sentiment_result[0]
            })
    return results
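# Illustrative call (the paragraphs are hypothetical filler, not real filings):
# for r in analyze_and_summarize_paragraphs(["Net income rose on strong demand.",
#                                            "Operating costs climbed, pressuring margins."]):
#     print(r["sentiment"]["label"], "|", r["summary"])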
# Load a plain BERT encoder for sentence embeddings, on the same device as above
bert_model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
model = BertModel.from_pretrained(bert_model_name).to(device)
model.eval()  # Set to evaluation mode
# Word embedding on summary text using BERT
def get_bert_embeddings(texts):
    """Obtain BERT embeddings for a list of texts."""
    embeddings = []
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512, padding=True).to(device)
            outputs = model(**inputs)
            # Take the mean of token embeddings as the sentence embedding
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
            embeddings.append(embedding)
    return np.array(embeddings)
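# Illustrative call (sample texts are made up): bert-base-uncased has hidden
# size 768, so mean pooling yields an array of shape (len(texts), 768).
# get_bert_embeddings(["Revenue rose.", "Margins fell."]).shape  # -> (2, 768)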
# Compute similarity matrices over embeddings
def compute_similarity(embeddings1, embeddings2):
    """Compute pairwise cosine similarity between two sets of embeddings."""
    return cosine_similarity(embeddings1, embeddings2)
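# Shape sketch using random stand-ins for real embeddings (hypothetical data):
# for N summaries in doc1 and M in doc2, the result is an (N, M) matrix whose
# [i, j] entry is the cosine similarity between summary i and summary j.
# compute_similarity(np.random.rand(3, 768), np.random.rand(5, 768)).shape  # -> (3, 5)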
# For each summarized paragraph, find the closest summary from the other
# document (e.g., the other year's filing) and pair up their contents
def compare_summaries(results1, results2):
    """Compare summaries from two documents and return similarity scores."""
    # Get embeddings for each set of summaries
    summaries1 = [result['summary'] for result in results1]
    summaries2 = [result['summary'] for result in results2]
    sentiment1 = [result['sentiment'] for result in results1]
    sentiment2 = [result['sentiment'] for result in results2]
    embeddings1 = get_bert_embeddings(summaries1)
    embeddings2 = get_bert_embeddings(summaries2)
    # Compute the pairwise similarity matrix
    similarity_matrix = compute_similarity(embeddings1, embeddings2)
    # For each summary in doc1, record its best match in doc2
    matches = []
    for i, row in enumerate(similarity_matrix):
        most_similar_index = np.argmax(row)
        similarity_score = row[most_similar_index]
        matches.append({
            'summary_doc1': summaries1[i],
            'summary_doc2': summaries2[most_similar_index],
            'sentiment_doc1': sentiment1[i],
            'sentiment_doc2': sentiment2[most_similar_index],
            'similarity_score': similarity_score
        })
    return matches
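# End-to-end sketch comparing two documents, assuming each has already been
# split into paragraphs upstream; the texts below are hypothetical stand-ins,
# not real filings.
if __name__ == "__main__":
    doc1_paragraphs = [
        "Revenue grew 10% on strong product demand, lifting gross margin.",
        "Litigation expenses increased, creating uncertainty for next year.",
    ]
    doc2_paragraphs = [
        "Revenue grew 4% as demand softened, compressing gross margin.",
        "The pending litigation was settled, removing a key uncertainty.",
    ]
    results1 = analyze_and_summarize_paragraphs(doc1_paragraphs)
    results2 = analyze_and_summarize_paragraphs(doc2_paragraphs)
    for match in compare_summaries(results1, results2):
        print(f"{match['similarity_score']:.3f} | "
              f"{match['sentiment_doc1']['label']} -> {match['sentiment_doc2']['label']}")
        print("  doc1:", match['summary_doc1'])
        print("  doc2:", match['summary_doc2'])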