Upload 8 files
- .env.example +17 -0
- .gitignore +18 -0
- Dockerfile +24 -0
- api.py +72 -0
- app.py +431 -0
- config.py +85 -0
- requirements.txt +21 -0
- utils.py +1402 -0
.env.example
ADDED
@@ -0,0 +1,17 @@
+# API Settings
+API_HOST=0.0.0.0
+API_PORT=8005
+API_BASE_URL=http://0.0.0.0:8005
+
+# News Scraping Settings
+ARTICLES_PER_SOURCE=10
+USER_AGENT=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36
+
+# Cache Settings
+CACHE_DIR=.cache
+CACHE_EXPIRY=3600
+CACHE_DURATION=300
+
+# Audio Settings
+AUDIO_OUTPUT_DIR=audio_output
+DEFAULT_LANG=hi
.gitignore
ADDED
@@ -0,0 +1,18 @@
+
+# Ignore virtual environment
+venv/
+.env
+audio_output/
+
+# Ignore compiled Python files
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+sentiment_history/
+# Ignore macOS system files
+.DS_Store
+
+# Ignore log files
+*.log
+audio_output
Dockerfile
ADDED
@@ -0,0 +1,24 @@
+FROM python:3.9-slim
+
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements first to leverage Docker cache
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the application
+COPY . .
+
+# Create necessary directories
+RUN mkdir -p audio_output sentiment_history
+
+# Expose the port Streamlit will run on
+EXPOSE 8501
+
+# Command to run the application
+CMD ["streamlit", "run", "app.py", "--server.address", "0.0.0.0"]
api.py
ADDED
@@ -0,0 +1,72 @@
+"""FastAPI backend for the News Summarization application."""
+
+from fastapi import FastAPI, HTTPException
+from fastapi.staticfiles import StaticFiles
+from pydantic import BaseModel
+from typing import List, Dict, Any
+import uvicorn
+from utils import NewsExtractor, SentimentAnalyzer, TextToSpeechConverter, ComparativeAnalyzer
+import os
+from config import API_PORT, AUDIO_OUTPUT_DIR
+import time
+
+app = FastAPI(title="News Summarization API")
+
+# Mount static directory for audio files
+os.makedirs(AUDIO_OUTPUT_DIR, exist_ok=True)
+app.mount("/audio", StaticFiles(directory=AUDIO_OUTPUT_DIR), name="audio")
+
+# Initialize components
+news_extractor = NewsExtractor()
+sentiment_analyzer = SentimentAnalyzer()
+tts_converter = TextToSpeechConverter()
+comparative_analyzer = ComparativeAnalyzer()
+
+class CompanyRequest(BaseModel):
+    name: str
+
+class AnalysisResponse(BaseModel):
+    company: str
+    articles: List[Dict[str, Any]]
+    comparative_sentiment_score: Dict[str, Any]
+    final_sentiment_analysis: str
+    audio_url: str = None
+
+@app.post("/api/analyze", response_model=AnalysisResponse)
+async def analyze_company(request: CompanyRequest):
+    """Analyze news articles for a given company."""
+    try:
+        # Extract news articles
+        articles = news_extractor.search_news(request.name)
+        if not articles:
+            raise HTTPException(status_code=404, detail="No articles found for the company")
+
+        # Analyze each article
+        analyzed_articles = []
+        for article in articles:
+            analysis = sentiment_analyzer.analyze_article(article)
+            # Add company name to each article
+            analysis['company'] = request.name
+            analyzed_articles.append(analysis)
+
+        # Perform comparative analysis
+        comparison = comparative_analyzer.analyze_coverage(analyzed_articles, company_name=request.name)
+        final_analysis = comparison["final_sentiment"]
+
+        # Generate Hindi audio for final analysis
+        audio_filename = f"{request.name.lower().replace(' ', '_')}_{int(time.time())}"
+        audio_path = tts_converter.generate_audio(final_analysis, audio_filename)
+        audio_url = f"/audio/{os.path.basename(audio_path)}"
+
+        return {
+            "company": request.name,
+            "articles": analyzed_articles,
+            "comparative_sentiment_score": comparison,
+            "final_sentiment_analysis": final_analysis,
+            "audio_url": audio_url
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=API_PORT)
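For reference, once the backend above is running (it serves on API_PORT, 8005 by default), the /api/analyze endpoint can be exercised with a short client script such as the sketch below. This is an illustration, not part of the uploaded files; the base URL and the company name are assumptions taken from the .env.example defaults.

# Illustrative client for the /api/analyze endpoint (not part of the upload).
import requests

BASE_URL = "http://0.0.0.0:8005"  # assumed: matches the API_BASE_URL default in .env.example

resp = requests.post(f"{BASE_URL}/api/analyze", json={"name": "Tesla"})
resp.raise_for_status()
data = resp.json()

print(data["final_sentiment_analysis"])           # overall sentiment summary
print(len(data["articles"]), "articles analyzed")

# The Hindi audio summary is exposed as a static file under /audio
if data.get("audio_url"):
    audio = requests.get(f"{BASE_URL}{data['audio_url']}")
    with open("analysis_hi.mp3", "wb") as f:
        f.write(audio.content)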
app.py
ADDED
@@ -0,0 +1,431 @@
+"""Streamlit frontend for the News Summarization application."""
+
+import streamlit as st
+import requests
+import pandas as pd
+import json
+from config import API_BASE_URL
+import os
+import plotly.express as px
+import altair as alt
+
+st.set_page_config(
+    page_title="News Summarization App",
+    page_icon="📰",
+    layout="wide"
+)
+
+def analyze_company(company_name):
+    """Send analysis request to API."""
+    try:
+        response = requests.post(
+            f"{API_BASE_URL}/api/analyze",
+            json={"name": company_name}
+        )
+        if response.status_code == 200:
+            data = response.json()
+            # Print the response data for debugging
+            print("API Response Data:")
+            print(json.dumps(data, indent=2))
+
+            # Download audio file if available
+            if 'audio_url' in data:
+                audio_response = requests.get(f"{API_BASE_URL}{data['audio_url']}")
+                if audio_response.status_code == 200:
+                    data['audio_content'] = audio_response.content
+            return data
+        else:
+            st.error(f"Error from API: {response.text}")
+            return {"articles": [], "comparative_sentiment_score": {}, "final_sentiment_analysis": "", "audio_url": None}
+    except Exception as e:
+        st.error(f"Error analyzing company: {str(e)}")
+        return {"articles": [], "comparative_sentiment_score": {}, "final_sentiment_analysis": "", "audio_url": None}
+
+def main():
+    st.title("📰 News Summarization and Analysis")
+
+    # Sidebar
+    st.sidebar.header("Settings")
+
+    # Replace dropdown with text input
+    company = st.sidebar.text_input(
+        "Enter Company Name",
+        placeholder="e.g., Tesla, Apple, Microsoft, or any other company",
+        help="Enter the name of any company you want to analyze"
+    )
+
+    if st.sidebar.button("Analyze") and company:
+        if len(company.strip()) < 2:
+            st.sidebar.error("Please enter a valid company name (at least 2 characters)")
+        else:
+            with st.spinner("Analyzing news articles..."):
+                result = analyze_company(company)
+
+                if result and result.get("articles"):
+                    # Display Articles
+                    st.header("📑 News Articles")
+                    for idx, article in enumerate(result["articles"], 1):
+                        with st.expander(f"Article {idx}: {article['title']}"):
+                            st.write("**Content:**", article.get("content", "No content available"))
+                            if "summary" in article:
+                                st.write("**Summary:**", article["summary"])
+                            st.write("**Source:**", article.get("source", "Unknown"))
+
+                            # Enhanced sentiment display
+                            if "sentiment" in article:
+                                sentiment_col1, sentiment_col2 = st.columns(2)
+                                with sentiment_col1:
+                                    st.write("**Sentiment:**", article["sentiment"])
+                                    st.write("**Confidence Score:**", f"{article.get('sentiment_score', 0)*100:.1f}%")
+
+                                with sentiment_col2:
+                                    # Display fine-grained sentiment if available
+                                    if "fine_grained_sentiment" in article and article["fine_grained_sentiment"]:
+                                        fine_grained = article["fine_grained_sentiment"]
+                                        if "category" in fine_grained:
+                                            st.write("**Detailed Sentiment:**", fine_grained["category"])
+                                        if "confidence" in fine_grained:
+                                            st.write("**Confidence:**", f"{fine_grained['confidence']*100:.1f}%")
+
+                            # Display sentiment indices if available
+                            if "sentiment_indices" in article and article["sentiment_indices"]:
+                                st.markdown("**Sentiment Indices:**")
+                                indices = article["sentiment_indices"]
+
+                                # Create columns for displaying indices
+                                idx_cols = st.columns(3)
+
+                                # Display positivity and negativity in first column
+                                with idx_cols[0]:
+                                    if "positivity_index" in indices:
+                                        st.markdown(f"**Positivity:** {indices['positivity_index']:.2f}")
+                                    if "negativity_index" in indices:
+                                        st.markdown(f"**Negativity:** {indices['negativity_index']:.2f}")
+
+                                # Display emotional intensity and controversy in second column
+                                with idx_cols[1]:
+                                    if "emotional_intensity" in indices:
+                                        st.markdown(f"**Emotional Intensity:** {indices['emotional_intensity']:.2f}")
+                                    if "controversy_score" in indices:
+                                        st.markdown(f"**Controversy:** {indices['controversy_score']:.2f}")
+
+                                # Display confidence and ESG in third column
+                                with idx_cols[2]:
+                                    if "confidence_score" in indices:
+                                        st.markdown(f"**Confidence:** {indices['confidence_score']:.2f}")
+                                    if "esg_relevance" in indices:
+                                        st.markdown(f"**ESG Relevance:** {indices['esg_relevance']:.2f}")
+
+                            # Display entities if available
+                            if "entities" in article and article["entities"]:
+                                st.markdown("**Named Entities:**")
+                                entities = article["entities"]
+
+                                # Organizations
+                                if "ORG" in entities and entities["ORG"]:
+                                    st.write("**Organizations:**", ", ".join(entities["ORG"]))
+
+                                # People
+                                if "PERSON" in entities and entities["PERSON"]:
+                                    st.write("**People:**", ", ".join(entities["PERSON"]))
+
+                                # Locations
+                                if "GPE" in entities and entities["GPE"]:
+                                    st.write("**Locations:**", ", ".join(entities["GPE"]))
+
+                                # Money
+                                if "MONEY" in entities and entities["MONEY"]:
+                                    st.write("**Financial Values:**", ", ".join(entities["MONEY"]))
+
+                            # Display sentiment targets if available
+                            if "sentiment_targets" in article and article["sentiment_targets"]:
+                                st.markdown("**Sentiment Targets:**")
+                                targets = article["sentiment_targets"]
+                                for target in targets:
+                                    st.markdown(f"**{target['entity']}** ({target['type']}): {target['sentiment']} ({target['confidence']*100:.1f}%)")
+                                    st.markdown(f"> {target['context']}")
+                                    st.markdown("---")
+
+                            if "url" in article:
+                                st.write("**[Read More](%s)**" % article["url"])
+
+                    # Display Comparative Analysis
+                    st.header("📊 Comparative Analysis")
+                    analysis = result.get("comparative_sentiment_score", {})
+
+                    # Sentiment Distribution
+                    if "sentiment_distribution" in analysis:
+                        st.subheader("Sentiment Distribution")
+
+                        # Debug: Print sentiment distribution data
+                        print("Sentiment Distribution Data:")
+                        print(json.dumps(analysis["sentiment_distribution"], indent=2))
+
+                        sentiment_dist = analysis["sentiment_distribution"]
+
+                        # Create a very simple visualization that will definitely work
+                        try:
+                            # Extract basic sentiment data
+                            if isinstance(sentiment_dist, dict):
+                                if "basic" in sentiment_dist and isinstance(sentiment_dist["basic"], dict):
+                                    basic_dist = sentiment_dist["basic"]
+                                elif any(k in sentiment_dist for k in ['positive', 'negative', 'neutral']):
+                                    basic_dist = {k: v for k, v in sentiment_dist.items()
+                                                  if k in ['positive', 'negative', 'neutral']}
+                                else:
+                                    basic_dist = {'positive': 0, 'negative': 0, 'neutral': 1}
+                            else:
+                                basic_dist = {'positive': 0, 'negative': 0, 'neutral': 1}
+
+                            # Calculate percentages
+                            total_articles = sum(basic_dist.values())
+                            if total_articles > 0:
+                                percentages = {
+                                    k: (v / total_articles) * 100
+                                    for k, v in basic_dist.items()
+                                }
+                            else:
+                                percentages = {k: 0 for k in basic_dist}
+
+                            # Display as simple text and metrics
+                            st.write("**Sentiment Distribution:**")
+
+                            col1, col2, col3 = st.columns(3)
+                            with col1:
+                                st.metric(
+                                    "Positive",
+                                    basic_dist.get('positive', 0),
+                                    f"{percentages.get('positive', 0):.1f}%"
+                                )
+                            with col2:
+                                st.metric(
+                                    "Negative",
+                                    basic_dist.get('negative', 0),
+                                    f"{percentages.get('negative', 0):.1f}%"
+                                )
+                            with col3:
+                                st.metric(
+                                    "Neutral",
+                                    basic_dist.get('neutral', 0),
+                                    f"{percentages.get('neutral', 0):.1f}%"
+                                )
+
+                            # Create a simple bar chart using Altair
+
+                            # Create a simple DataFrame with consistent capitalization and percentages
+                            chart_data = pd.DataFrame({
+                                'Sentiment': ['Positive', 'Negative', 'Neutral'],
+                                'Count': [
+                                    basic_dist.get('positive', 0),  # Map lowercase keys to capitalized display
+                                    basic_dist.get('negative', 0),
+                                    basic_dist.get('neutral', 0)
+                                ],
+                                'Percentage': [
+                                    f"{percentages.get('positive', 0):.1f}%",
+                                    f"{percentages.get('negative', 0):.1f}%",
+                                    f"{percentages.get('neutral', 0):.1f}%"
+                                ]
+                            })
+
+                            # Add debug output to see what's in the data
+                            print("Chart Data for Sentiment Distribution:")
+                            print(chart_data)
+
+                            # Create a simple bar chart with percentages
+                            chart = alt.Chart(chart_data).mark_bar().encode(
+                                y='Sentiment',  # Changed from x to y for horizontal bars
+                                x='Count',      # Changed from y to x for horizontal bars
+                                color=alt.Color('Sentiment', scale=alt.Scale(
+                                    domain=['Positive', 'Negative', 'Neutral'],
+                                    range=['green', 'red', 'gray']
+                                )),
+                                tooltip=['Sentiment', 'Count', 'Percentage']  # Add tooltip with percentage
+                            ).properties(
+                                width=600,
+                                height=300
+                            )
+
+                            # Add text labels with percentages
+                            text = chart.mark_text(
+                                align='left',
+                                baseline='middle',
+                                dx=3  # Nudge text to the right so it doesn't overlap with the bar
+                            ).encode(
+                                text='Percentage'
+                            )
+
+                            # Combine the chart and text
+                            chart_with_text = (chart + text)
+
+                            st.altair_chart(chart_with_text, use_container_width=True)
+
+                        except Exception as e:
+                            st.error(f"Error creating visualization: {str(e)}")
+                            st.write("Fallback to simple text display:")
+                            if isinstance(sentiment_dist, dict):
+                                if "basic" in sentiment_dist:
+                                    st.write(f"Positive: {sentiment_dist['basic'].get('positive', 0)}")
+                                    st.write(f"Negative: {sentiment_dist['basic'].get('negative', 0)}")
+                                    st.write(f"Neutral: {sentiment_dist['basic'].get('neutral', 0)}")
+                                else:
+                                    st.write(f"Positive: {sentiment_dist.get('positive', 0)}")
+                                    st.write(f"Negative: {sentiment_dist.get('negative', 0)}")
+                                    st.write(f"Neutral: {sentiment_dist.get('neutral', 0)}")
+                            else:
+                                st.write("No valid sentiment data available")
+
+                    # Display sentiment indices if available
+                    if "sentiment_indices" in analysis and analysis["sentiment_indices"]:
+                        st.subheader("Sentiment Indices")
+
+                        # Debug: Print sentiment indices
+                        print("Sentiment Indices:")
+                        print(json.dumps(analysis["sentiment_indices"], indent=2))
+
+                        # Get the indices data
+                        indices = analysis["sentiment_indices"]
+
+                        # Create a very simple visualization that will definitely work
+                        try:
+                            if isinstance(indices, dict):
+                                # Display as simple metrics in columns
+                                cols = st.columns(3)
+
+                                # Define display names and descriptions
+                                display_names = {
+                                    "positivity_index": "Positivity",
+                                    "negativity_index": "Negativity",
+                                    "emotional_intensity": "Emotional Intensity",
+                                    "controversy_score": "Controversy",
+                                    "confidence_score": "Confidence",
+                                    "esg_relevance": "ESG Relevance"
+                                }
+
+                                # Display each index as a metric
+                                for i, (key, value) in enumerate(indices.items()):
+                                    if isinstance(value, (int, float)):
+                                        with cols[i % 3]:
+                                            display_name = display_names.get(key, key.replace("_", " ").title())
+                                            st.metric(display_name, f"{value:.2f}")
+
+                                # Create a simple bar chart using Altair
+
+                                # Create a simple DataFrame
+                                chart_data = pd.DataFrame({
+                                    'Index': [display_names.get(k, k.replace("_", " ").title()) for k in indices.keys()],
+                                    'Value': [v if isinstance(v, (int, float)) else 0 for v in indices.values()]
+                                })
+
+                                # Create a simple bar chart
+                                chart = alt.Chart(chart_data).mark_bar().encode(
+                                    x='Value',
+                                    y='Index',
+                                    color=alt.Color('Index')
+                                ).properties(
+                                    width=600,
+                                    height=300
+                                )
+
+                                st.altair_chart(chart, use_container_width=True)
+
+                                # Add descriptions
+                                with st.expander("Sentiment Indices Explained"):
+                                    st.markdown("""
+                                    - **Positivity**: Measures the positive sentiment in the articles (0-1)
+                                    - **Negativity**: Measures the negative sentiment in the articles (0-1)
+                                    - **Emotional Intensity**: Measures the overall emotional content (0-1)
+                                    - **Controversy**: High when both positive and negative sentiments are strong (0-1)
+                                    - **Confidence**: Confidence in the sentiment analysis (0-1)
+                                    - **ESG Relevance**: Relevance to Environmental, Social, and Governance topics (0-1)
+                                    """)
+                            else:
+                                st.warning("Sentiment indices data is not in the expected format.")
+                                st.write("No valid sentiment indices available")
+                        except Exception as e:
+                            st.error(f"Error creating indices visualization: {str(e)}")
+                            st.write("Fallback to simple text display:")
+                            if isinstance(indices, dict):
+                                for key, value in indices.items():
+                                    if isinstance(value, (int, float)):
+                                        st.write(f"{key.replace('_', ' ').title()}: {value:.2f}")
+                            else:
+                                st.write("No valid sentiment indices data available")
+
+                    # Source Distribution
+                    if "source_distribution" in analysis:
+                        st.subheader("Source Distribution")
+                        source_df = pd.DataFrame.from_dict(
+                            analysis["source_distribution"],
+                            orient='index',
+                            columns=['Count']
+                        )
+                        st.bar_chart(source_df)
+
+                    # Common Topics
+                    if "common_topics" in analysis:
+                        st.subheader("Common Topics")
+                        st.write(", ".join(analysis["common_topics"]) if analysis["common_topics"] else "No common topics found")
+
+                    # Coverage Differences
+                    if "coverage_differences" in analysis:
+                        st.subheader("Coverage Analysis")
+                        for diff in analysis["coverage_differences"]:
+                            st.write("- " + diff)
+
+                    # Display Final Sentiment and Audio
+                    st.header("🎯 Final Analysis")
+                    if "final_sentiment_analysis" in result:
+                        st.write(result["final_sentiment_analysis"])
+
+                        # Display sentiment indices in the sidebar if available
+                        if "sentiment_indices" in analysis and analysis["sentiment_indices"]:
+                            indices = analysis["sentiment_indices"]
+                            # Verify we have valid data
+                            if indices and any(isinstance(v, (int, float)) for v in indices.values()):
+                                st.sidebar.markdown("### Sentiment Indices")
+                                for idx_name, idx_value in indices.items():
+                                    if isinstance(idx_value, (int, float)):
+                                        formatted_name = " ".join(word.capitalize() for word in idx_name.replace("_", " ").split())
+                                        st.sidebar.metric(formatted_name, f"{idx_value:.2f}")
+
+                        # Display ensemble model information if available
+                        if "ensemble_info" in result:
+                            with st.expander("Ensemble Model Details"):
+                                ensemble = result["ensemble_info"]
+
+                                # Model agreement
+                                if "agreement" in ensemble:
+                                    st.metric("Model Agreement", f"{ensemble['agreement']*100:.1f}%")
+
+                                # Individual model results
+                                if "models" in ensemble:
+                                    st.subheader("Individual Model Results")
+                                    models_data = []
+                                    for model_name, model_info in ensemble["models"].items():
+                                        models_data.append({
+                                            "Model": model_name,
+                                            "Sentiment": model_info.get("sentiment", "N/A"),
+                                            "Confidence": f"{model_info.get('confidence', 0)*100:.1f}%"
+                                        })
+
+                                    if models_data:
+                                        st.table(pd.DataFrame(models_data))
+
+                    # Audio Playback Section
+                    st.subheader("🔊 Listen to Analysis (Hindi)")
+                    if 'audio_content' in result:
+                        st.audio(result['audio_content'], format='audio/mp3')
+                    else:
+                        st.warning("Hindi audio summary not available")
+
+                    # Total Articles
+                    if "total_articles" in analysis:
+                        st.sidebar.info(f"Found {analysis['total_articles']} articles")
+
+    # Add a disclaimer
+    st.sidebar.markdown("---")
+    st.sidebar.markdown("### About")
+    st.sidebar.write("This app analyzes news articles and provides sentiment analysis for any company.")
+
+if __name__ == "__main__":
+    main()
config.py
ADDED
@@ -0,0 +1,85 @@
+"""Configuration settings for the News Summarization application."""
+
+import os
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+# API Settings
+API_HOST = os.getenv("API_HOST", "0.0.0.0")
+API_PORT = int(os.getenv("API_PORT", "8005"))
+API_BASE_URL = os.getenv("API_BASE_URL", f"http://{API_HOST}:{API_PORT}")
+
+# News Scraping Settings
+ARTICLES_PER_SOURCE = int(os.getenv("ARTICLES_PER_SOURCE", "10"))
+USER_AGENT = os.getenv("USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+
+# RSS Feed Settings
+RSS_FEEDS = {
+    "BBC": "http://feeds.bbci.co.uk/news/business/rss.xml",
+    "CNN": "http://rss.cnn.com/rss/money_news_international.rss",
+    "FoxBusiness": "http://feeds.foxnews.com/foxbusiness/latest"
+}
+
+# Model Settings
+SENTIMENT_MODEL = "yiyanghkust/finbert-tone"  # More advanced financial sentiment model
+SENTIMENT_FINE_GRAINED_MODEL = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
+SUMMARIZATION_MODEL = "t5-base"
+
+# Additional Fine-Grained Sentiment Models
+FINE_GRAINED_MODELS = {
+    "financial": "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
+    "emotion": "j-hartmann/emotion-english-distilroberta-base",
+    "aspect": "yangheng/deberta-v3-base-absa-v1.1",
+    "esg": "yiyanghkust/finbert-esg",
+    "news_tone": "ProsusAI/finbert"
+}
+
+# Fine-Grained Sentiment Categories
+SENTIMENT_CATEGORIES = {
+    "financial": ["positive", "negative", "neutral"],
+    "emotion": ["joy", "sadness", "anger", "fear", "surprise", "disgust", "neutral"],
+    "aspect": ["positive", "negative", "neutral"],
+    "esg": ["environmental", "social", "governance", "neutral"],
+    "news_tone": ["positive", "negative", "neutral"]
+}
+
+# Cache Settings
+CACHE_DIR = os.getenv("CACHE_DIR", ".cache")
+CACHE_EXPIRY = int(os.getenv("CACHE_EXPIRY", "3600"))  # 1 hour
+CACHE_DURATION = int(os.getenv("CACHE_DURATION", "300"))  # 5 minutes in seconds
+
+# Audio Settings
+AUDIO_OUTPUT_DIR = os.getenv("AUDIO_OUTPUT_DIR", "audio_output")
+DEFAULT_LANG = os.getenv("DEFAULT_LANG", "hi")  # Hindi
+
+# News Sources
+NEWS_SOURCES = {
+    # Major News Aggregators
+    "google": "https://www.google.com/search?q={}&tbm=nws",
+    "bing": "https://www.bing.com/news/search?q={}",
+    "yahoo": "https://news.search.yahoo.com/search?p={}",
+
+    # Financial News
+    "reuters": "https://www.reuters.com/search/news?blob={}",
+    "marketwatch": "https://www.marketwatch.com/search?q={}&ts=0&tab=All%20News",
+    "investing": "https://www.investing.com/search/?q={}&tab=news",
+
+    # Tech News
+    "techcrunch": "https://techcrunch.com/search/{}",
+    "zdnet": "https://www.zdnet.com/search/?q={}",
+}
+
+# Article limits
+MIN_ARTICLES = 20
+MAX_ARTICLES_PER_SOURCE = 10  # Adjusted for more sources
+MAX_ARTICLES = 50  # Increased to accommodate more sources
+
+# Browser Headers
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+    "Accept-Language": "en-US,en;q=0.5",
+    "Connection": "keep-alive"
+}
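To illustrate how these settings are consumed by the scraper in utils.py below, each NEWS_SOURCES entry is a URL template whose {} placeholder receives the search query, and HEADERS accompanies every request. A minimal sketch, using a made-up query:

# Minimal sketch of how NEWS_SOURCES and HEADERS are combined; the query is hypothetical.
import requests
from config import NEWS_SOURCES, HEADERS

query = "Tesla".replace(" ", "+")
url = NEWS_SOURCES["bing"].format(query)  # -> https://www.bing.com/news/search?q=Tesla
response = requests.get(url, headers=HEADERS, timeout=15)
print(response.status_code)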
requirements.txt
ADDED
@@ -0,0 +1,21 @@
+streamlit==1.31.1
+beautifulsoup4==4.12.2
+requests==2.31.0
+pandas==2.2.0
+nltk==3.8.1
+transformers==4.37.2
+torch==2.2.0
+fastapi==0.109.2
+uvicorn==0.27.1
+python-multipart==0.0.6
+gTTS==2.5.0
+scikit-learn==1.4.0
+numpy==1.26.3
+python-dotenv==1.0.1
+aiofiles==23.2.1
+googletrans==3.1.0a0
+lxml==4.9.3
+spacy==3.7.2
+plotly==5.18.0
+textblob==0.17.1
+vaderSentiment==3.3.2
utils.py
ADDED
@@ -0,0 +1,1402 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Utility functions for news extraction, sentiment analysis, and text-to-speech."""
|
2 |
+
|
3 |
+
import requests
|
4 |
+
from bs4 import BeautifulSoup
|
5 |
+
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
|
6 |
+
from gtts import gTTS
|
7 |
+
import os
|
8 |
+
from typing import List, Dict, Any
|
9 |
+
import pandas as pd
|
10 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
11 |
+
from config import *
|
12 |
+
import re
|
13 |
+
from datetime import datetime, timedelta
|
14 |
+
import time
|
15 |
+
import json
|
16 |
+
from googletrans import Translator
|
17 |
+
import statistics
|
18 |
+
|
19 |
+
class NewsExtractor:
|
20 |
+
def __init__(self):
|
21 |
+
self.headers = HEADERS
|
22 |
+
|
23 |
+
def search_news(self, company_name: str) -> List[Dict[str, str]]:
|
24 |
+
"""Extract news articles about the company ensuring minimum count."""
|
25 |
+
all_articles = []
|
26 |
+
retries = 2 # Number of retries if we don't get enough articles
|
27 |
+
|
28 |
+
while retries > 0 and len(all_articles) < MIN_ARTICLES:
|
29 |
+
for source, url_template in NEWS_SOURCES.items():
|
30 |
+
try:
|
31 |
+
url = url_template.format(company_name.replace(" ", "+"))
|
32 |
+
print(f"\nSearching {source} for news about {company_name}...")
|
33 |
+
|
34 |
+
# Try different page numbers for more articles
|
35 |
+
for page in range(2): # Try first two pages
|
36 |
+
page_url = url
|
37 |
+
if page > 0:
|
38 |
+
if source == "google":
|
39 |
+
page_url += f"&start={page * 10}"
|
40 |
+
elif source == "bing":
|
41 |
+
page_url += f"&first={page * 10 + 1}"
|
42 |
+
elif source == "yahoo":
|
43 |
+
page_url += f"&b={page * 10 + 1}"
|
44 |
+
elif source == "reuters":
|
45 |
+
page_url += f"&page={page + 1}"
|
46 |
+
elif source == "marketwatch":
|
47 |
+
page_url += f"&page={page + 1}"
|
48 |
+
elif source == "investing":
|
49 |
+
page_url += f"&page={page + 1}"
|
50 |
+
elif source == "techcrunch":
|
51 |
+
page_url += f"/page/{page + 1}"
|
52 |
+
elif source == "zdnet":
|
53 |
+
page_url += f"&page={page + 1}"
|
54 |
+
|
55 |
+
response = requests.get(page_url, headers=self.headers, timeout=15)
|
56 |
+
if response.status_code != 200:
|
57 |
+
print(f"Error: {source} page {page+1} returned status code {response.status_code}")
|
58 |
+
continue
|
59 |
+
|
60 |
+
soup = BeautifulSoup(response.content, 'html.parser')
|
61 |
+
|
62 |
+
source_articles = []
|
63 |
+
if source == "google":
|
64 |
+
source_articles = self._parse_google_news(soup)
|
65 |
+
elif source == "bing":
|
66 |
+
source_articles = self._parse_bing_news(soup)
|
67 |
+
elif source == "yahoo":
|
68 |
+
source_articles = self._parse_yahoo_news(soup)
|
69 |
+
elif source == "reuters":
|
70 |
+
source_articles = self._parse_reuters_news(soup)
|
71 |
+
elif source == "marketwatch":
|
72 |
+
source_articles = self._parse_marketwatch_news(soup)
|
73 |
+
elif source == "investing":
|
74 |
+
source_articles = self._parse_investing_news(soup)
|
75 |
+
elif source == "techcrunch":
|
76 |
+
source_articles = self._parse_techcrunch_news(soup)
|
77 |
+
elif source == "zdnet":
|
78 |
+
source_articles = self._parse_zdnet_news(soup)
|
79 |
+
|
80 |
+
# Limit articles per source
|
81 |
+
if source_articles:
|
82 |
+
source_articles = source_articles[:MAX_ARTICLES_PER_SOURCE]
|
83 |
+
all_articles.extend(source_articles)
|
84 |
+
print(f"Found {len(source_articles)} articles from {source} page {page+1}")
|
85 |
+
|
86 |
+
# If we have enough articles, break the page loop
|
87 |
+
if len(all_articles) >= MIN_ARTICLES:
|
88 |
+
break
|
89 |
+
|
90 |
+
except Exception as e:
|
91 |
+
print(f"Error fetching from {source}: {str(e)}")
|
92 |
+
continue
|
93 |
+
|
94 |
+
# If we have enough articles, break the source loop
|
95 |
+
if len(all_articles) >= MIN_ARTICLES:
|
96 |
+
break
|
97 |
+
|
98 |
+
retries -= 1
|
99 |
+
if len(all_articles) < MIN_ARTICLES and retries > 0:
|
100 |
+
print(f"\nFound only {len(all_articles)} articles, retrying...")
|
101 |
+
|
102 |
+
# Remove duplicates
|
103 |
+
unique_articles = self._remove_duplicates(all_articles)
|
104 |
+
print(f"\nFound {len(unique_articles)} unique articles")
|
105 |
+
|
106 |
+
if len(unique_articles) < MIN_ARTICLES:
|
107 |
+
print(f"Warning: Could only find {len(unique_articles)} unique articles, fewer than minimum {MIN_ARTICLES}")
|
108 |
+
|
109 |
+
# Balance articles across sources
|
110 |
+
balanced_articles = self._balance_sources(unique_articles)
|
111 |
+
return balanced_articles[:max(MIN_ARTICLES, MAX_ARTICLES)]
|
112 |
+
|
113 |
+
def _balance_sources(self, articles: List[Dict[str, str]]) -> List[Dict[str, str]]:
|
114 |
+
"""Balance articles across sources while maintaining minimum count."""
|
115 |
+
source_articles = {}
|
116 |
+
|
117 |
+
# Group articles by source
|
118 |
+
for article in articles:
|
119 |
+
source = article['source']
|
120 |
+
if source not in source_articles:
|
121 |
+
source_articles[source] = []
|
122 |
+
source_articles[source].append(article)
|
123 |
+
|
124 |
+
# Calculate target articles per source
|
125 |
+
total_sources = len(source_articles)
|
126 |
+
target_per_source = max(MIN_ARTICLES // total_sources,
|
127 |
+
MAX_ARTICLES_PER_SOURCE)
|
128 |
+
|
129 |
+
# Get articles from each source
|
130 |
+
balanced = []
|
131 |
+
for source, articles_list in source_articles.items():
|
132 |
+
balanced.extend(articles_list[:target_per_source])
|
133 |
+
|
134 |
+
# If we still need more articles to meet minimum, add more from sources
|
135 |
+
# that have additional articles
|
136 |
+
if len(balanced) < MIN_ARTICLES:
|
137 |
+
remaining = []
|
138 |
+
for articles_list in source_articles.values():
|
139 |
+
remaining.extend(articles_list[target_per_source:])
|
140 |
+
|
141 |
+
# Sort remaining by source to maintain balance
|
142 |
+
remaining.sort(key=lambda x: len([a for a in balanced if a['source'] == x['source']]))
|
143 |
+
|
144 |
+
while len(balanced) < MIN_ARTICLES and remaining:
|
145 |
+
balanced.append(remaining.pop(0))
|
146 |
+
|
147 |
+
return balanced
|
148 |
+
|
149 |
+
def _parse_google_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
|
150 |
+
"""Parse Google News search results."""
|
151 |
+
articles = []
|
152 |
+
for div in soup.find_all(['div', 'article'], class_=['g', 'xuvV6b', 'WlydOe']):
|
153 |
+
try:
|
154 |
+
title_elem = div.find(['h3', 'h4'])
|
155 |
+
snippet_elem = div.find('div', class_=['VwiC3b', 'yy6M1d'])
|
156 |
+
link_elem = div.find('a')
|
157 |
+
source_elem = div.find(['div', 'span'], class_='UPmit')
|
158 |
+
|
159 |
+
if title_elem and snippet_elem and link_elem:
|
160 |
+
source = source_elem.get_text(strip=True) if source_elem else 'Google News'
|
161 |
+
articles.append({
|
162 |
+
'title': title_elem.get_text(strip=True),
|
163 |
+
'content': snippet_elem.get_text(strip=True),
|
164 |
+
'url': link_elem['href'],
|
165 |
+
'source': source
|
166 |
+
})
|
167 |
+
except Exception as e:
|
168 |
+
print(f"Error parsing Google article: {str(e)}")
|
169 |
+
continue
|
170 |
+
return articles
|
171 |
+
|
172 |
+
def _parse_bing_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
|
173 |
+
"""Parse Bing News search results."""
|
174 |
+
articles = []
|
175 |
+
for article in soup.find_all(['div', 'article'], class_=['news-card', 'newsitem', 'item-info']):
|
176 |
+
try:
|
177 |
+
title_elem = article.find(['a', 'h3'], class_=['title', 'news-card-title'])
|
178 |
+
snippet_elem = article.find(['div', 'p'], class_=['snippet', 'description'])
|
179 |
+
source_elem = article.find(['div', 'span'], class_=['source', 'provider'])
|
180 |
+
|
181 |
+
if title_elem and snippet_elem:
|
182 |
+
source = source_elem.get_text(strip=True) if source_elem else 'Bing News'
|
183 |
+
url = title_elem['href'] if 'href' in title_elem.attrs else ''
|
184 |
+
articles.append({
|
185 |
+
'title': title_elem.get_text(strip=True),
|
186 |
+
'content': snippet_elem.get_text(strip=True),
|
187 |
+
'url': url,
|
188 |
+
'source': source
|
189 |
+
})
|
190 |
+
except Exception as e:
|
191 |
+
print(f"Error parsing Bing article: {str(e)}")
|
192 |
+
return articles
|
193 |
+
|
194 |
+
def _parse_yahoo_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
|
195 |
+
"""Parse Yahoo News search results."""
|
196 |
+
articles = []
|
197 |
+
for article in soup.find_all('div', class_='NewsArticle'):
|
198 |
+
try:
|
199 |
+
title_elem = article.find(['h4', 'h3', 'a'])
|
200 |
+
snippet_elem = article.find('p')
|
201 |
+
source_elem = article.find(['span', 'div'], class_=['provider', 'source'])
|
202 |
+
|
203 |
+
if title_elem and snippet_elem:
|
204 |
+
source = source_elem.get_text(strip=True) if source_elem else 'Yahoo News'
|
205 |
+
url = title_elem.find('a')['href'] if title_elem.find('a') else ''
|
206 |
+
articles.append({
|
207 |
+
'title': title_elem.get_text(strip=True),
|
208 |
+
'content': snippet_elem.get_text(strip=True),
|
209 |
+
'url': url,
|
210 |
+
'source': source
|
211 |
+
})
|
212 |
+
except Exception as e:
|
213 |
+
print(f"Error parsing Yahoo article: {str(e)}")
|
214 |
+
return articles
|
215 |
+
|
216 |
+
def _parse_reuters_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
|
217 |
+
"""Parse Reuters search results."""
|
218 |
+
articles = []
|
219 |
+
for article in soup.find_all(['div', 'article'], class_=['search-result-content', 'story']):
|
220 |
+
try:
|
221 |
+
title_elem = article.find(['h3', 'a'], class_='story-title')
|
222 |
+
snippet_elem = article.find(['p', 'div'], class_=['story-description', 'description'])
|
223 |
+
|
224 |
+
if title_elem:
|
225 |
+
url = title_elem.find('a')['href'] if title_elem.find('a') else ''
|
226 |
+
if url and not url.startswith('http'):
|
227 |
+
url = 'https://www.reuters.com' + url
|
228 |
+
|
229 |
+
articles.append({
|
230 |
+
'title': title_elem.get_text(strip=True),
|
231 |
+
'content': snippet_elem.get_text(strip=True) if snippet_elem else '',
|
232 |
+
'url': url,
|
233 |
+
'source': 'Reuters'
|
234 |
+
})
|
235 |
+
except Exception as e:
|
236 |
+
print(f"Error parsing Reuters article: {str(e)}")
|
237 |
+
return articles
|
238 |
+
|
239 |
+
def _parse_marketwatch_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
|
240 |
+
"""Parse MarketWatch search results."""
|
241 |
+
articles = []
|
242 |
+
for article in soup.find_all(['div', 'article'], class_=['element--article', 'article__content']):
|
243 |
+
try:
|
244 |
+
title_elem = article.find(['h3', 'h2'], class_=['article__headline', 'title'])
|
245 |
+
snippet_elem = article.find('p', class_=['article__summary', 'description'])
|
246 |
+
|
247 |
+
if title_elem:
|
248 |
+
url = title_elem.find('a')['href'] if title_elem.find('a') else ''
|
249 |
+
articles.append({
|
250 |
+
'title': title_elem.get_text(strip=True),
|
251 |
+
'content': snippet_elem.get_text(strip=True) if snippet_elem else '',
|
252 |
+
'url': url,
|
253 |
+
'source': 'MarketWatch'
|
254 |
+
})
|
255 |
+
except Exception as e:
|
256 |
+
print(f"Error parsing MarketWatch article: {str(e)}")
|
257 |
+
return articles
|
258 |
+
|
259 |
+
def _parse_investing_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
|
260 |
+
"""Parse Investing.com search results."""
|
261 |
+
articles = []
|
262 |
+
for article in soup.find_all(['div', 'article'], class_=['articleItem', 'news-item']):
|
263 |
+
try:
|
264 |
+
title_elem = article.find(['a', 'h3'], class_=['title', 'articleTitle'])
|
265 |
+
snippet_elem = article.find(['p', 'div'], class_=['description', 'articleContent'])
|
266 |
+
|
267 |
+
if title_elem:
|
268 |
+
url = title_elem['href'] if 'href' in title_elem.attrs else title_elem.find('a')['href']
|
269 |
+
if url and not url.startswith('http'):
|
270 |
+
url = 'https://www.investing.com' + url
|
271 |
+
|
272 |
+
articles.append({
|
273 |
+
'title': title_elem.get_text(strip=True),
|
274 |
+
'content': snippet_elem.get_text(strip=True) if snippet_elem else '',
|
275 |
+
'url': url,
|
276 |
+
'source': 'Investing.com'
|
277 |
+
})
|
278 |
+
except Exception as e:
|
279 |
+
print(f"Error parsing Investing.com article: {str(e)}")
|
280 |
+
return articles
|
281 |
+
|
282 |
+
def _parse_techcrunch_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
|
283 |
+
"""Parse TechCrunch search results."""
|
284 |
+
articles = []
|
285 |
+
for article in soup.find_all(['div', 'article'], class_=['post-block', 'article-block']):
|
286 |
+
try:
|
287 |
+
title_elem = article.find(['h2', 'h3', 'a'], class_=['post-block__title', 'article-title'])
|
288 |
+
snippet_elem = article.find(['div', 'p'], class_=['post-block__content', 'article-content'])
|
289 |
+
|
290 |
+
if title_elem:
|
291 |
+
url = title_elem.find('a')['href'] if title_elem.find('a') else ''
|
292 |
+
articles.append({
|
293 |
+
'title': title_elem.get_text(strip=True),
|
294 |
+
'content': snippet_elem.get_text(strip=True) if snippet_elem else '',
|
295 |
+
'url': url,
|
296 |
+
'source': 'TechCrunch'
|
297 |
+
})
|
298 |
+
except Exception as e:
|
299 |
+
print(f"Error parsing TechCrunch article: {str(e)}")
|
300 |
+
return articles
|
301 |
+
|
302 |
+
def _parse_zdnet_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
|
303 |
+
"""Parse ZDNet search results."""
|
304 |
+
articles = []
|
305 |
+
for article in soup.find_all(['div', 'article'], class_=['item', 'article']):
|
306 |
+
try:
|
307 |
+
title_elem = article.find(['h3', 'a'], class_=['title', 'headline'])
|
308 |
+
snippet_elem = article.find(['p', 'div'], class_=['summary', 'content'])
|
309 |
+
|
310 |
+
if title_elem:
|
311 |
+
url = title_elem.find('a')['href'] if title_elem.find('a') else ''
|
312 |
+
if url and not url.startswith('http'):
|
313 |
+
url = 'https://www.zdnet.com' + url
|
314 |
+
|
315 |
+
articles.append({
|
316 |
+
'title': title_elem.get_text(strip=True),
|
317 |
+
'content': snippet_elem.get_text(strip=True) if snippet_elem else '',
|
318 |
+
'url': url,
|
319 |
+
'source': 'ZDNet'
|
320 |
+
})
|
321 |
+
except Exception as e:
|
322 |
+
print(f"Error parsing ZDNet article: {str(e)}")
|
323 |
+
return articles
|
324 |
+
|
325 |
+
def _remove_duplicates(self, articles: List[Dict[str, str]]) -> List[Dict[str, str]]:
|
326 |
+
"""Remove duplicate articles based on title similarity."""
|
327 |
+
unique_articles = []
|
328 |
+
seen_titles = set()
|
329 |
+
|
330 |
+
for article in articles:
|
331 |
+
title = article['title'].lower()
|
332 |
+
if not any(title in seen_title or seen_title in title for seen_title in seen_titles):
|
333 |
+
unique_articles.append(article)
|
334 |
+
seen_titles.add(title)
|
335 |
+
|
336 |
+
return unique_articles
|
337 |
+
|
338 |
+
class SentimentAnalyzer:
|
339 |
+
def __init__(self):
|
340 |
+
try:
|
341 |
+
# Primary financial sentiment model
|
342 |
+
self.sentiment_pipeline = pipeline("sentiment-analysis",
|
343 |
+
model=SENTIMENT_MODEL)
|
344 |
+
|
345 |
+
# Initialize fine-grained sentiment models
|
346 |
+
self.fine_grained_models = {}
|
347 |
+
try:
|
348 |
+
# Initialize the default fine-grained model for backward compatibility
|
349 |
+
self.fine_grained_sentiment = pipeline("sentiment-analysis",
|
350 |
+
model=SENTIMENT_FINE_GRAINED_MODEL)
|
351 |
+
|
352 |
+
# Initialize additional fine-grained models
|
353 |
+
for model_name, model_path in FINE_GRAINED_MODELS.items():
|
354 |
+
try:
|
355 |
+
print(f"Loading fine-grained model: {model_name}")
|
356 |
+
self.fine_grained_models[model_name] = pipeline("sentiment-analysis",
|
357 |
+
model=model_path)
|
358 |
+
except Exception as e:
|
359 |
+
print(f"Error loading fine-grained model {model_name}: {str(e)}")
|
360 |
+
except Exception as e:
|
361 |
+
print(f"Error initializing fine-grained models: {str(e)}")
|
362 |
+
self.fine_grained_sentiment = None
|
363 |
+
|
364 |
+
# Initialize additional sentiment analyzers if available
|
365 |
+
self.has_textblob = False
|
366 |
+
self.has_vader = False
|
367 |
+
|
368 |
+
try:
|
369 |
+
from textblob import TextBlob
|
370 |
+
self.TextBlob = TextBlob
|
371 |
+
self.has_textblob = True
|
372 |
+
except:
|
373 |
+
print("TextBlob not available. Install with: pip install textblob")
|
374 |
+
|
375 |
+
try:
|
376 |
+
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
377 |
+
self.vader = SentimentIntensityAnalyzer()
|
378 |
+
self.has_vader = True
|
379 |
+
except:
|
380 |
+
print("VADER not available. Install with: pip install vaderSentiment")
|
381 |
+
|
382 |
+
self.summarizer = pipeline("summarization",
|
383 |
+
model=SUMMARIZATION_MODEL)
|
384 |
+
self.vectorizer = TfidfVectorizer(stop_words='english',
|
385 |
+
max_features=10)
|
386 |
+
|
387 |
+
# Initialize NER pipeline if spaCy is available
|
388 |
+
try:
|
389 |
+
import spacy
|
390 |
+
self.nlp = spacy.load("en_core_web_sm")
|
391 |
+
self.has_ner = True
|
392 |
+
except:
|
393 |
+
self.has_ner = False
|
394 |
+
print("spaCy not available for NER. Install with: pip install spacy && python -m spacy download en_core_web_sm")
|
395 |
+
|
396 |
+
except Exception as e:
|
397 |
+
print(f"Error initializing sentiment models: {str(e)}")
|
398 |
+
# Fallback to default models if specific models fail
|
399 |
+
self.sentiment_pipeline = pipeline("sentiment-analysis")
|
400 |
+
self.fine_grained_sentiment = None
|
401 |
+
self.fine_grained_models = {}
|
402 |
+
self.summarizer = pipeline("summarization")
|
403 |
+
self.vectorizer = TfidfVectorizer(stop_words='english', max_features=10)
|
404 |
+
self.has_ner = False
|
405 |
+
self.has_textblob = False
|
406 |
+
self.has_vader = False
|
407 |
+
|
408 |
+
def analyze_article(self, article: Dict[str, str]) -> Dict[str, Any]:
|
409 |
+
"""Analyze sentiment and generate summary for an article."""
|
410 |
+
try:
|
411 |
+
# Get the full text by combining title and content
|
412 |
+
full_text = f"{article['title']} {article['content']}"
|
413 |
+
|
414 |
+
# Generate summary
|
415 |
+
summary = self.summarize_text(full_text)
|
416 |
+
|
417 |
+
# Get ensemble sentiment analysis
|
418 |
+
sentiment_analysis = self._get_ensemble_sentiment(full_text)
|
419 |
+
sentiment_label = sentiment_analysis['ensemble_sentiment']
|
420 |
+
sentiment_score = sentiment_analysis['ensemble_score']
|
421 |
+
|
422 |
+
# Add fine-grained sentiment analysis
|
423 |
+
fine_grained_sentiment = self._get_fine_grained_sentiment(full_text)
|
424 |
+
|
425 |
+
# Extract key topics
|
426 |
+
topics = self.extract_topics(full_text)
|
427 |
+
|
428 |
+
# Extract named entities
|
429 |
+
entities = self._extract_entities(full_text)
|
430 |
+
|
431 |
+
# Extract sentiment targets (entities associated with sentiment)
|
432 |
+
sentiment_targets = self._extract_sentiment_targets(full_text, entities)
|
433 |
+
|
434 |
+
# Add analysis to article
|
435 |
+
analyzed_article = article.copy()
|
436 |
+
analyzed_article.update({
|
437 |
+
'summary': summary,
|
438 |
+
'sentiment': sentiment_label,
|
439 |
+
'sentiment_score': sentiment_score,
|
440 |
+
'sentiment_details': sentiment_analysis,
|
441 |
+
'fine_grained_sentiment': fine_grained_sentiment,
|
442 |
+
'topics': topics,
|
443 |
+
'entities': entities,
|
444 |
+
'sentiment_targets': sentiment_targets,
|
445 |
+
'sentiment_indices': fine_grained_sentiment.get('indices', {}),
|
446 |
+
'analysis_timestamp': datetime.now().isoformat()
|
447 |
+
})
|
448 |
+
|
449 |
+
return analyzed_article
|
450 |
+
|
451 |
+
except Exception as e:
|
452 |
+
print(f"Error analyzing article: {str(e)}")
|
453 |
+
# Return original article with default values if analysis fails
|
454 |
+
article.update({
|
455 |
+
'summary': article.get('content', '')[:200] + '...',
|
456 |
+
'sentiment': 'neutral',
|
457 |
+
'sentiment_score': 0.0,
|
458 |
+
'sentiment_details': {},
|
459 |
+
'fine_grained_sentiment': {},
|
460 |
+
'topics': [],
|
461 |
+
'entities': {},
|
462 |
+
'sentiment_targets': [],
|
463 |
+
'sentiment_indices': {
|
464 |
+
'positivity_index': 0.5,
|
465 |
+
'negativity_index': 0.5,
|
466 |
+
'emotional_intensity': 0.0,
|
467 |
+
'controversy_score': 0.0,
|
468 |
+
'confidence_score': 0.0,
|
469 |
+
'esg_relevance': 0.0
|
470 |
+
},
|
471 |
+
'analysis_timestamp': datetime.now().isoformat()
|
472 |
+
})
|
473 |
+
return article
|
474 |
+
|
475 |
+
    def _get_ensemble_sentiment(self, text: str) -> Dict[str, Any]:
        """Get ensemble sentiment by combining multiple sentiment models."""
        results = {}

        # Initialize with default values
        ensemble_result = {
            'ensemble_sentiment': 'neutral',
            'ensemble_score': 0.5,
            'models': {}
        }

        try:
            # 1. Primary transformer model (finbert)
            try:
                primary_result = self.sentiment_pipeline(text[:512])[0]  # Limit text length
                primary_label = primary_result['label'].lower()
                primary_score = primary_result['score']

                # Map to standard format
                if primary_label == 'positive':
                    primary_normalized = primary_score
                elif primary_label == 'negative':
                    primary_normalized = 1 - primary_score
                else:  # neutral
                    primary_normalized = 0.5

                ensemble_result['models']['transformer'] = {
                    'sentiment': primary_label,
                    'score': round(primary_score, 3),
                    'normalized_score': round(primary_normalized, 3)
                }
            except:
                ensemble_result['models']['transformer'] = {
                    'sentiment': 'error',
                    'score': 0,
                    'normalized_score': 0.5
                }

            # 2. TextBlob sentiment
            if self.has_textblob:
                try:
                    blob = self.TextBlob(text)
                    polarity = blob.sentiment.polarity

                    # Convert to standard format
                    if polarity > 0.1:
                        textblob_sentiment = 'positive'
                        textblob_score = polarity
                    elif polarity < -0.1:
                        textblob_sentiment = 'negative'
                        textblob_score = abs(polarity)
                    else:
                        textblob_sentiment = 'neutral'
                        textblob_score = 0.5

                    # Normalize to 0-1 scale
                    textblob_normalized = (polarity + 1) / 2

                    ensemble_result['models']['textblob'] = {
                        'sentiment': textblob_sentiment,
                        'score': round(textblob_score, 3),
                        'normalized_score': round(textblob_normalized, 3)
                    }
                except:
                    ensemble_result['models']['textblob'] = {
                        'sentiment': 'error',
                        'score': 0,
                        'normalized_score': 0.5
                    }

            # 3. VADER sentiment
            if self.has_vader:
                try:
                    vader_scores = self.vader.polarity_scores(text)
                    compound = vader_scores['compound']

                    # Convert to standard format
                    if compound > 0.05:
                        vader_sentiment = 'positive'
                        vader_score = compound
                    elif compound < -0.05:
                        vader_sentiment = 'negative'
                        vader_score = abs(compound)
                    else:
                        vader_sentiment = 'neutral'
                        vader_score = 0.5

                    # Normalize to 0-1 scale
                    vader_normalized = (compound + 1) / 2

                    ensemble_result['models']['vader'] = {
                        'sentiment': vader_sentiment,
                        'score': round(vader_score, 3),
                        'normalized_score': round(vader_normalized, 3)
                    }
                except:
                    ensemble_result['models']['vader'] = {
                        'sentiment': 'error',
                        'score': 0,
                        'normalized_score': 0.5
                    }

            # Calculate ensemble result
            # Get all normalized scores
            normalized_scores = []
            for model_name, model_result in ensemble_result['models'].items():
                if model_result['sentiment'] != 'error':
                    normalized_scores.append(model_result['normalized_score'])

            # Calculate average if we have scores
            if normalized_scores:
                avg_score = sum(normalized_scores) / len(normalized_scores)

                # Convert to sentiment label
                if avg_score > 0.6:
                    ensemble_sentiment = 'positive'
                elif avg_score < 0.4:
                    ensemble_sentiment = 'negative'
                else:
                    ensemble_sentiment = 'neutral'

                ensemble_result['ensemble_sentiment'] = ensemble_sentiment
                ensemble_result['ensemble_score'] = round(avg_score, 3)

                # Add confidence level
                if len(normalized_scores) > 1:
                    # Calculate standard deviation to measure agreement
                    std_dev = statistics.stdev(normalized_scores) if len(normalized_scores) > 1 else 0
                    agreement = 1 - (std_dev * 2)  # Lower std_dev means higher agreement
                    agreement = max(0, min(1, agreement))  # Clamp to 0-1

                    ensemble_result['model_agreement'] = round(agreement, 3)

            return ensemble_result

        except Exception as e:
            print(f"Error in ensemble sentiment analysis: {str(e)}")
            return {
                'ensemble_sentiment': 'neutral',
                'ensemble_score': 0.5,
                'models': {}
            }

    def _get_fine_grained_sentiment(self, text: str) -> Dict[str, Any]:
        """Get fine-grained sentiment analysis with more detailed categories."""
        # Initialize result structure
        result = {
            "primary": {"category": "unknown", "confidence": 0.0},
            "models": {}
        }

        # Check if we have any fine-grained models
        if not self.fine_grained_sentiment and not self.fine_grained_models:
            return result

        try:
            # Split text into manageable chunks if too long
            chunks = self._split_text(text)

            # Process with default fine-grained model for backward compatibility
            if self.fine_grained_sentiment:
                primary_results = []

                for chunk in chunks:
                    if not chunk.strip():
                        continue
                    chunk_result = self.fine_grained_sentiment(chunk)[0]
                    primary_results.append(chunk_result)

                if primary_results:
                    # Aggregate results from all chunks
                    categories = {}
                    for res in primary_results:
                        label = res['label'].lower()
                        score = res['score']
                        if label in categories:
                            categories[label] += score
                        else:
                            categories[label] = score

                    # Normalize scores
                    total = sum(categories.values())
                    if total > 0:
                        categories = {k: round(v/total, 3) for k, v in categories.items()}

                    # Get dominant category
                    dominant_category = max(categories.items(), key=lambda x: x[1])

                    result["primary"] = {
                        "category": dominant_category[0],
                        "confidence": dominant_category[1],
                        "distribution": categories
                    }

            # Process with additional fine-grained models
            for model_name, model in self.fine_grained_models.items():
                model_results = []

                for chunk in chunks:
                    if not chunk.strip():
                        continue
                    try:
                        chunk_result = model(chunk)[0]
                        model_results.append(chunk_result)
                    except Exception as e:
                        print(f"Error analyzing chunk with model {model_name}: {str(e)}")

                if model_results:
                    # Aggregate results from all chunks
                    categories = {}
                    for res in model_results:
                        # Ensure the label is lowercase for consistency
                        label = res['label'].lower() if isinstance(res.get('label'), str) else "unknown"
                        score = res['score']
                        if label in categories:
                            categories[label] += score
                        else:
                            categories[label] = score

                    # Normalize scores
                    total = sum(categories.values())
                    if total > 0:
                        categories = {k: round(v/total, 3) for k, v in categories.items()}

                    # Get dominant category
                    dominant_category = max(categories.items(), key=lambda x: x[1])

                    # Store results for this model
                    result["models"][model_name] = {
                        "category": dominant_category[0],
                        "confidence": dominant_category[1],
                        "distribution": categories
                    }

            # Calculate sentiment indices based on the fine-grained results
            result["indices"] = self._calculate_sentiment_indices(result)

            return result

        except Exception as e:
            print(f"Error in fine-grained sentiment analysis: {str(e)}")
            return result

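    # Note on the indices computed below: the positivity and negativity indices are
    # averages over the per-model distributions, and the controversy score is
    # positivity * negativity * 4. In the usual case where the two roughly sum to 1,
    # the product is largest when coverage is evenly split (0.5 * 0.5 * 4 = 1.0) and
    # small when it is one-sided (0.9 * 0.1 * 4 = 0.36); the raw product itself is
    # not clamped to [0, 1].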
    def _calculate_sentiment_indices(self, fine_grained_results: Dict[str, Any]) -> Dict[str, float]:
        """Calculate various sentiment indices based on fine-grained sentiment analysis."""
        indices = {
            "positivity_index": 0.5,  # Default neutral value
            "negativity_index": 0.5,
            "emotional_intensity": 0.0,
            "controversy_score": 0.0,
            "confidence_score": 0.0,
            "esg_relevance": 0.0
        }

        try:
            # Extract distributions from all models
            distributions = {}
            confidence_scores = {}

            # Add primary model if available
            if "category" in fine_grained_results.get("primary", {}):
                if "distribution" in fine_grained_results["primary"]:
                    distributions["primary"] = fine_grained_results["primary"]["distribution"]
                    confidence_scores["primary"] = fine_grained_results["primary"].get("confidence", 0.0)

            # Add other models
            for model_name, model_result in fine_grained_results.get("models", {}).items():
                if "distribution" in model_result:
                    distributions[model_name] = model_result["distribution"]
                    confidence_scores[model_name] = model_result.get("confidence", 0.0)

            # Calculate positivity index
            positive_scores = []
            for model_name, dist in distributions.items():
                if model_name == "financial" or model_name == "primary" or model_name == "news_tone" or model_name == "aspect":
                    pos_score = dist.get("positive", 0.0)
                    positive_scores.append(pos_score)
                elif model_name == "emotion":
                    # For emotion model, consider joy as positive
                    pos_score = dist.get("joy", 0.0) + dist.get("surprise", 0.0) * 0.5
                    positive_scores.append(pos_score)

            if positive_scores:
                indices["positivity_index"] = round(sum(positive_scores) / len(positive_scores), 3)

            # Calculate negativity index
            negative_scores = []
            for model_name, dist in distributions.items():
                if model_name == "financial" or model_name == "primary" or model_name == "news_tone" or model_name == "aspect":
                    neg_score = dist.get("negative", 0.0)
                    negative_scores.append(neg_score)
                elif model_name == "emotion":
                    # For emotion model, consider sadness, anger, fear, disgust as negative
                    neg_score = dist.get("sadness", 0.0) + dist.get("anger", 0.0) + \
                                dist.get("fear", 0.0) + dist.get("disgust", 0.0)
                    negative_scores.append(neg_score / 4)  # Average of 4 negative emotions

            if negative_scores:
                indices["negativity_index"] = round(sum(negative_scores) / len(negative_scores), 3)

            # Calculate emotional intensity
            emotion_dist = distributions.get("emotion", {})
            if emotion_dist:
                # Sum all emotional intensities except neutral
                emotional_sum = sum(v for k, v in emotion_dist.items() if k != "neutral")
                indices["emotional_intensity"] = round(emotional_sum, 3)

            # Calculate controversy score (high when both positive and negative are high)
            indices["controversy_score"] = round(indices["positivity_index"] * indices["negativity_index"] * 4, 3)

            # Calculate confidence score (average of all model confidences)
            if confidence_scores:
                indices["confidence_score"] = round(sum(confidence_scores.values()) / len(confidence_scores), 3)

            # Calculate ESG relevance if available
            esg_dist = distributions.get("esg", {})
            if esg_dist:
                # Sum of all ESG categories
                esg_sum = sum(v for k, v in esg_dist.items() if k in ["environmental", "social", "governance"])
                indices["esg_relevance"] = round(esg_sum, 3)

            return indices

        except Exception as e:
            print(f"Error calculating sentiment indices: {str(e)}")
            return indices

    def summarize_text(self, text: str) -> str:
        """Generate a concise summary of the text."""
        try:
            # Clean and prepare text
            text = text.replace('\n', ' ').strip()

            # Split text into chunks if it's too long
            chunks = self._split_text(text)

            summaries = []
            for chunk in chunks:
                # Generate summary for each chunk
                summary = self.summarizer(chunk,
                                          max_length=130,
                                          min_length=30,
                                          do_sample=False)[0]['summary_text']
                summaries.append(summary)

            # Combine summaries if there were multiple chunks
            final_summary = ' '.join(summaries)
            return final_summary

        except Exception as e:
            print(f"Error generating summary: {str(e)}")
            return text[:200] + '...'  # Return truncated text as fallback

    def extract_topics(self, text: str) -> List[str]:
        """Extract key topics from the text using TF-IDF."""
        try:
            # Prepare text
            text = text.lower()

            # Fit and transform the text
            tfidf_matrix = self.vectorizer.fit_transform([text])

            # Get feature names and scores
            feature_names = self.vectorizer.get_feature_names_out()
            scores = tfidf_matrix.toarray()[0]

            # Get top topics
            top_indices = scores.argsort()[-5:][::-1]  # Get top 5 topics
            topics = [feature_names[i] for i in top_indices]

            return topics

        except Exception as e:
            print(f"Error extracting topics: {str(e)}")
            return []

    def _split_text(self, text: str, max_length: int = 1024) -> List[str]:
        """Split text into chunks that fit within model's maximum token limit."""
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0

        for word in words:
            word_length = len(word) + 1  # +1 for space
            if current_length + word_length > max_length:
                chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_length = word_length
            else:
                current_chunk.append(word)
                current_length += word_length

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _extract_entities(self, text: str) -> Dict[str, List[str]]:
        """Extract named entities from text."""
        entities = {
            'PERSON': [],
            'ORG': [],
            'GPE': [],  # Countries, cities, states
            'MONEY': [],
            'PERCENT': [],
            'DATE': []
        }

        if not self.has_ner:
            return entities

        try:
            # Process text with spaCy
            doc = self.nlp(text[:10000])  # Limit text length for performance

            # Extract entities
            for ent in doc.ents:
                if ent.label_ in entities:
                    # Clean entity text and deduplicate
                    clean_text = ent.text.strip()
                    if clean_text and clean_text not in entities[ent.label_]:
                        entities[ent.label_].append(clean_text)

            return entities
        except Exception as e:
            print(f"Error extracting entities: {str(e)}")
            return entities

    def _extract_sentiment_targets(self, text: str, entities: Dict[str, List[str]]) -> List[Dict[str, Any]]:
        """Extract entities that are targets of sentiment expressions."""
        if not self.has_ner:
            return []

        try:
            # Get all entities as a flat list
            all_entities = []
            for entity_type, entity_list in entities.items():
                for entity in entity_list:
                    all_entities.append({
                        'text': entity,
                        'type': entity_type
                    })

            # Find sentiment targets
            targets = []

            # Split text into sentences
            doc = self.nlp(text[:10000])  # Limit text length

            for sentence in doc.sents:
                # Skip short sentences
                if len(sentence.text.split()) < 3:
                    continue

                # Check for sentiment in this sentence
                try:
                    sentiment = self.sentiment_pipeline(sentence.text)[0]
                    # Only process if sentiment is strong
                    if sentiment['score'] > 0.7:
                        # Find entities in this sentence
                        for entity in all_entities:
                            if entity['text'] in sentence.text:
                                targets.append({
                                    'entity': entity['text'],
                                    'type': entity['type'],
                                    'sentiment': sentiment['label'].lower(),
                                    'confidence': round(sentiment['score'], 3),
                                    'context': sentence.text
                                })
                except:
                    continue

            # Return unique targets
            unique_targets = []
            seen = set()
            for target in targets:
                key = f"{target['entity']}_{target['sentiment']}"
                if key not in seen:
                    seen.add(key)
                    unique_targets.append(target)

            return unique_targets

        except Exception as e:
            print(f"Error extracting sentiment targets: {str(e)}")
            return []


class TextToSpeechConverter:
    def __init__(self):
        self.output_dir = AUDIO_OUTPUT_DIR
        self.translator = Translator()
        os.makedirs(self.output_dir, exist_ok=True)

    def generate_audio(self, text: str, filename: str) -> str:
        """Convert text to Hindi speech and save as audio file."""
        try:
            print(f"Translating text to Hindi: {text[:100]}...")

            # First translate the text to Hindi
            # Use chunking for long text to avoid translation limits
            chunks = []
            for i in range(0, len(text), 1000):
                chunk = text[i:i+1000]
                try:
                    translated_chunk = self.translator.translate(chunk, dest='hi').text
                    chunks.append(translated_chunk)
                    print(f"Translated chunk {i//1000 + 1}")
                except Exception as e:
                    print(f"Error translating chunk {i//1000 + 1}: {str(e)}")
                    # If translation fails, use original text
                    chunks.append(chunk)

            hindi_text = ' '.join(chunks)
            print(f"Translation complete. Hindi text length: {len(hindi_text)}")

            # Generate Hindi speech
            print("Generating Hindi speech...")
            tts = gTTS(text=hindi_text, lang='hi', slow=False)
            output_path = os.path.join(self.output_dir, f"{filename}.mp3")
            tts.save(output_path)
            print(f"Audio saved to {output_path}")

            return output_path
        except Exception as e:
            print(f"Error in TTS conversion: {str(e)}")
            # Fallback to original text if translation fails
            print("Using fallback English TTS")
            tts = gTTS(text=text, lang='en')
            output_path = os.path.join(self.output_dir, f"{filename}.mp3")
            tts.save(output_path)
            return output_path


class ComparativeAnalyzer:
    def __init__(self):
        pass

    def analyze_coverage(self, articles: List[Dict[str, Any]], company_name: str = None) -> Dict[str, Any]:
        """Perform comparative analysis across articles."""
        if not articles:
            return {
                "topics": [],
                "sentiment_distribution": {},
                "coverage_differences": ["No articles found for analysis."],
                "final_sentiment": "No articles found for analysis.",
                "total_articles": 0,
                "sentiment_indices": {}
            }

        # Debug: Print articles for analysis
        print(f"Analyzing {len(articles)} articles for company: {company_name}")

        # Add company name to each article if provided
        if company_name:
            for article in articles:
                article['company'] = company_name

        # Calculate sentiment distribution
        print("Calculating sentiment distribution...")
        sentiment_dist = self._get_sentiment_distribution(articles)
        print("Sentiment distribution result:")
        print(sentiment_dist)

        # Analyze common topics
        topics = self._analyze_topics(articles)

        # Analyze coverage differences
        differences = self._analyze_coverage_differences(articles)

        # Get final sentiment analysis
        final_sentiment = self._get_final_sentiment(sentiment_dist, articles)

        result = {
            "topics": topics,
            "sentiment_distribution": sentiment_dist,
            "coverage_differences": differences,
            "final_sentiment": final_sentiment,
            "total_articles": len(articles),
            "sentiment_indices": sentiment_dist.get("sentiment_indices", {})
        }

        # Debug: Print final result
        print("Final comparative analysis result:")
        print(result)

        return result

    def _get_sentiment_distribution(self, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Calculate distribution of sentiments across articles."""
        # Basic sentiment distribution
        basic_distribution = {'positive': 0, 'negative': 0, 'neutral': 0}

        # Fine-grained sentiment distribution
        fine_grained_distribution = {}

        # Sentiment scores
        sentiment_scores = []

        # Sentiment indices aggregation
        sentiment_indices = {
            "positivity_index": [],
            "negativity_index": [],
            "emotional_intensity": [],
            "controversy_score": [],
            "confidence_score": [],
            "esg_relevance": []
        }

        # Debug: Print articles for sentiment distribution
        print(f"Processing {len(articles)} articles for sentiment distribution")

        # Process each article
        for i, article in enumerate(articles):
            try:
                # Debug: Print article sentiment data
                print(f"Article {i+1} sentiment data:")
                print(f"  Basic sentiment: {article.get('sentiment', 'N/A')}")
                print(f"  Fine-grained: {article.get('fine_grained_sentiment', {})}")
                print(f"  Sentiment indices: {article.get('sentiment_indices', {})}")

                # Basic sentiment
                sentiment = article.get('sentiment', 'neutral')
                if isinstance(sentiment, str):
                    sentiment = sentiment.lower()
                    # Ensure we have a valid sentiment category
                    if sentiment not in basic_distribution:
                        sentiment = 'neutral'
                    basic_distribution[sentiment] = basic_distribution.get(sentiment, 0) + 1
                else:
                    # Handle non-string sentiment values
                    basic_distribution['neutral'] = basic_distribution.get('neutral', 0) + 1

                # Sentiment score
                score = article.get('sentiment_score', 0.0)
                if isinstance(score, (int, float)):
                    sentiment_scores.append(score)

                # Fine-grained sentiment
                fine_grained = article.get('fine_grained_sentiment', {})
                if isinstance(fine_grained, dict) and 'category' in fine_grained:
                    category = fine_grained['category']
                    if isinstance(category, str):
                        category = category.lower()
                        fine_grained_distribution[category] = fine_grained_distribution.get(category, 0) + 1

                # Collect sentiment indices
                indices = article.get('sentiment_indices', {})
                if isinstance(indices, dict):
                    for index_name, index_values in sentiment_indices.items():
                        if index_name in indices and isinstance(indices[index_name], (int, float)):
                            index_values.append(indices[index_name])
            except Exception as e:
                print(f"Error processing article {i+1} for sentiment distribution: {str(e)}")
                # Continue with next article
                continue

        # Debug: Print collected data
        print("Collected sentiment data:")
        print(f"  Basic distribution: {basic_distribution}")
        print(f"  Fine-grained distribution: {fine_grained_distribution}")
        print(f"  Sentiment scores: {sentiment_scores}")
        print(f"  Sentiment indices collected: {sentiment_indices}")

        # Calculate average sentiment score with fallback
        avg_sentiment_score = 0.5  # Default neutral value
        if sentiment_scores:
            avg_sentiment_score = sum(sentiment_scores) / len(sentiment_scores)

        # Calculate sentiment volatility (standard deviation) with fallback
        sentiment_volatility = 0
        if len(sentiment_scores) > 1:
            try:
                sentiment_volatility = statistics.stdev(sentiment_scores)
            except Exception as e:
                print(f"Error calculating sentiment volatility: {str(e)}")

        # Calculate average sentiment indices with fallbacks
        avg_indices = {}
        for index_name, values in sentiment_indices.items():
            if values:
                avg_indices[index_name] = round(sum(values) / len(values), 3)
            else:
                # Provide default values for empty indices
                if index_name in ["positivity_index", "confidence_score"]:
                    avg_indices[index_name] = 0.5  # Neutral default
                else:
                    avg_indices[index_name] = 0.0  # Zero default for other indices

        # Ensure all expected indices exist
        for index_name in ["positivity_index", "negativity_index", "emotional_intensity",
                           "controversy_score", "confidence_score", "esg_relevance"]:
            if index_name not in avg_indices:
                avg_indices[index_name] = 0.5 if index_name in ["positivity_index", "confidence_score"] else 0.0

        # Ensure we have at least one item in each distribution
        if not any(basic_distribution.values()):
            basic_distribution['neutral'] = 1

        # Ensure fine_grained_distribution has at least one entry if empty
        if not fine_grained_distribution:
            fine_grained_distribution['neutral'] = 1

        result = {
            "basic": basic_distribution,
            "fine_grained": fine_grained_distribution,
            "avg_score": round(avg_sentiment_score, 3),
            "volatility": round(sentiment_volatility, 3),
            "sentiment_indices": avg_indices
        }

        # Debug: Print final sentiment distribution result
        print("Final sentiment distribution result:")
        print(result)

        return result

    def _analyze_topics(self, articles: List[Dict[str, Any]]) -> List[str]:
        """Analyze common topics across articles using TF-IDF."""
        try:
            # Combine title and content for better topic extraction
            texts = [f"{article.get('title', '')} {article.get('content', '')}" for article in articles]

            # Create and fit TF-IDF
            vectorizer = TfidfVectorizer(
                max_features=10,
                stop_words='english',
                ngram_range=(1, 2),
                token_pattern=r'(?u)\b[A-Za-z][A-Za-z+\'-]*[A-Za-z]+\b'  # Improved pattern
            )

            # Clean and normalize texts
            cleaned_texts = []
            for text in texts:
                # Remove numbers and special characters
                cleaned = re.sub(r'\d+', '', text)
                cleaned = re.sub(r'[^\w\s]', ' ', cleaned)
                cleaned_texts.append(cleaned.lower())

            tfidf_matrix = vectorizer.fit_transform(cleaned_texts)
            feature_names = vectorizer.get_feature_names_out()

            # Get average TF-IDF scores for each term
            avg_scores = tfidf_matrix.mean(axis=0).A1

            # Sort terms by score and return top meaningful terms
            sorted_indices = avg_scores.argsort()[-5:][::-1]
            meaningful_topics = []

            for idx in sorted_indices:
                topic = feature_names[idx]
                # Filter out single characters and common words
                if len(topic) > 1 and topic not in {'000', 'com', 'said', 'says', 'year', 'new', 'one'}:
                    meaningful_topics.append(topic)
                    if len(meaningful_topics) >= 5:
                        break

            return meaningful_topics

        except Exception as e:
            print(f"Error analyzing topics: {str(e)}")
            return []

    def _analyze_coverage_differences(self, articles: List[Dict[str, Any]]) -> List[str]:
        """Analyze how coverage differs across articles."""
        if not articles:
            return ["No articles available for comparison"]

        differences = []

        # Compare sentiment differences
        sentiments = [article.get('sentiment', 'neutral').lower() for article in articles]
        unique_sentiments = set(sentiments)
        if len(unique_sentiments) > 1:
            pos_count = sentiments.count('positive')
            neg_count = sentiments.count('negative')
            neu_count = sentiments.count('neutral')

            if pos_count > 0 and neg_count > 0:
                differences.append(f"Coverage sentiment varies significantly: {pos_count} positive, {neg_count} negative, and {neu_count} neutral articles.")

        # Compare fine-grained sentiment differences
        fine_grained_categories = []
        for article in articles:
            fine_grained = article.get('fine_grained_sentiment', {})
            if isinstance(fine_grained, dict) and 'category' in fine_grained:
                category = fine_grained['category']
                if isinstance(category, str):
                    fine_grained_categories.append(category.lower())

        unique_categories = set(fine_grained_categories)
        if len(unique_categories) > 2:  # More than 2 different categories
            category_counts = {}
            for category in fine_grained_categories:
                category_counts[category] = category_counts.get(category, 0) + 1

            top_categories = sorted(category_counts.items(), key=lambda x: x[1], reverse=True)[:3]
            categories_str = ", ".join([f"{cat} ({count})" for cat, count in top_categories])
            differences.append(f"Articles show diverse sentiment categories: {categories_str}")

        # Compare sentiment indices
        indices_differences = []
        positivity_values = []
        negativity_values = []
        controversy_values = []

        for article in articles:
            indices = article.get('sentiment_indices', {})
            if indices:
                if 'positivity_index' in indices:
                    positivity_values.append(indices['positivity_index'])
                if 'negativity_index' in indices:
                    negativity_values.append(indices['negativity_index'])
                if 'controversy_score' in indices:
                    controversy_values.append(indices['controversy_score'])

        # Check for high variance in positivity
        if positivity_values and len(positivity_values) > 1:
            if max(positivity_values) - min(positivity_values) > 0.4:
                indices_differences.append("Articles show significant variation in positivity levels")

        # Check for high variance in negativity
        if negativity_values and len(negativity_values) > 1:
            if max(negativity_values) - min(negativity_values) > 0.4:
                indices_differences.append("Articles show significant variation in negativity levels")

        # Check for high controversy scores
        if controversy_values:
            high_controversy = [v for v in controversy_values if v > 0.5]
            if high_controversy:
                indices_differences.append(f"{len(high_controversy)} articles show high controversy scores")

        if indices_differences:
            differences.append("Sentiment index analysis: " + "; ".join(indices_differences))

        # Compare source differences
        sources = [article.get('source', '').lower() for article in articles]
        source_counts = {}
        for source in sources:
            if source:
                source_counts[source] = source_counts.get(source, 0) + 1

        if len(source_counts) > 1:
            top_sources = sorted(source_counts.items(), key=lambda x: x[1], reverse=True)[:3]
            sources_str = ", ".join([f"{source} ({count})" for source, count in top_sources])
            differences.append(f"Coverage spans multiple sources: {sources_str}")

        # If no significant differences found
        if not differences:
            differences.append("Coverage is relatively consistent across articles")

        return differences

    def _get_final_sentiment(self, distribution: Dict[str, Any], articles: List[Dict[str, Any]]) -> str:
        """Generate final sentiment analysis based on distribution and article content."""
        try:
            # Get basic sentiment counts
            basic_dist = distribution.get('basic', {})
            positive_count = basic_dist.get('positive', 0)
            negative_count = basic_dist.get('negative', 0)
            neutral_count = basic_dist.get('neutral', 0)

            total_articles = positive_count + negative_count + neutral_count

            if total_articles == 0:
                return "No sentiment data available"

            # Calculate percentages
            positive_pct = (positive_count / total_articles) * 100
            negative_pct = (negative_count / total_articles) * 100
            neutral_pct = (neutral_count / total_articles) * 100

            # Get average sentiment score
            avg_score = distribution.get('avg_score', 0.5)

            # Get volatility
            volatility = distribution.get('volatility', 0)

            # Get sentiment indices
            indices = distribution.get('sentiment_indices', {})
            positivity_index = indices.get('positivity_index', 0.5)
            negativity_index = indices.get('negativity_index', 0.5)
            emotional_intensity = indices.get('emotional_intensity', 0)
            controversy_score = indices.get('controversy_score', 0)
            esg_relevance = indices.get('esg_relevance', 0)

            # Generate analysis text
            analysis = []

            # Overall sentiment
            if positive_pct > 60:
                analysis.append(f"Overall sentiment is predominantly positive ({positive_pct:.1f}%).")
            elif negative_pct > 60:
                analysis.append(f"Overall sentiment is predominantly negative ({negative_pct:.1f}%).")
            elif neutral_pct > 60:
                analysis.append(f"Overall sentiment is predominantly neutral ({neutral_pct:.1f}%).")
            elif positive_pct > negative_pct and positive_pct > neutral_pct:
                analysis.append(f"Overall sentiment leans positive ({positive_pct:.1f}%), with some mixed coverage.")
            elif negative_pct > positive_pct and negative_pct > neutral_pct:
                analysis.append(f"Overall sentiment leans negative ({negative_pct:.1f}%), with some mixed coverage.")
            else:
                analysis.append(f"Sentiment is mixed across sources (Positive: {positive_pct:.1f}%, Negative: {negative_pct:.1f}%, Neutral: {neutral_pct:.1f}%).")

            # Sentiment indices insights
            if positivity_index > 0.7:
                analysis.append(f"High positivity index ({positivity_index:.2f}) indicates strong positive sentiment.")
            elif positivity_index < 0.3 and negativity_index > 0.7:
                analysis.append(f"High negativity index ({negativity_index:.2f}) with low positivity suggests strongly negative coverage.")

            if emotional_intensity > 0.6:
                analysis.append(f"Coverage shows high emotional intensity ({emotional_intensity:.2f}).")

            if controversy_score > 0.5:
                analysis.append(f"Coverage shows significant controversy ({controversy_score:.2f}), with polarized opinions.")

            if esg_relevance > 0.4:
                analysis.append(f"Coverage includes significant ESG-related content ({esg_relevance:.2f}).")

            # Volatility
            if volatility > 0.2:
                analysis.append(f"Sentiment varies considerably across articles (volatility: {volatility:.2f}).")
            else:
                analysis.append(f"Sentiment is relatively consistent across articles (volatility: {volatility:.2f}).")

            return " ".join(analysis)

        except Exception as e:
            print(f"Error generating final sentiment: {str(e)}")
            return "Unable to generate final sentiment analysis due to an error."
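For reference, a minimal sketch of how the classes in this file might be exercised together. The article dicts are hypothetical sample data, and it assumes the SentimentAnalyzer class defined earlier in utils.py (whose summarize_text and extract_topics methods appear above) can load its models in the current environment; the TTS and translation steps additionally need network access.

    from utils import SentimentAnalyzer, TextToSpeechConverter, ComparativeAnalyzer

    analyzer = SentimentAnalyzer()        # loads the summarization/sentiment models defined earlier in utils.py
    comparator = ComparativeAnalyzer()
    tts = TextToSpeechConverter()

    # Hypothetical pre-scraped articles
    articles = [
        {"title": "Acme beats estimates", "content": "Acme Corp reported strong quarterly results ...", "source": "example-news"},
        {"title": "Acme faces probe", "content": "Regulators opened an inquiry into Acme Corp ...", "source": "example-wire"},
    ]

    summary = analyzer.summarize_text(articles[0]["content"])   # abstractive summary of one article
    topics = analyzer.extract_topics(articles[0]["content"])    # top TF-IDF terms

    report = comparator.analyze_coverage(articles, company_name="Acme Corp")
    audio_path = tts.generate_audio(report["final_sentiment"], "acme_final_sentiment")  # Hindi TTS mp3

    print(summary, topics, report["final_sentiment"], audio_path, sep="\n")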