proKBD commited on
Commit
d9cc29f
·
verified ·
1 Parent(s): 457e9a2

Upload 8 files

Browse files
Files changed (8) hide show
  1. .env.example +17 -0
  2. .gitignore +18 -0
  3. Dockerfile +24 -0
  4. api.py +72 -0
  5. app.py +431 -0
  6. config.py +85 -0
  7. requirements.txt +21 -0
  8. utils.py +1402 -0
.env.example ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # API Settings
2
+ API_HOST=0.0.0.0
3
+ API_PORT=8005
4
+ API_BASE_URL=http://0.0.0.0:8005
5
+
6
+ # News Scraping Settings
7
+ ARTICLES_PER_SOURCE=10
8
+ USER_AGENT=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36
9
+
10
+ # Cache Settings
11
+ CACHE_DIR=.cache
12
+ CACHE_EXPIRY=3600
13
+ CACHE_DURATION=300
14
+
15
+ # Audio Settings
16
+ AUDIO_OUTPUT_DIR=audio_output
17
+ DEFAULT_LANG=hi
.gitignore ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Ignore virtual environment
3
+ venv/
4
+ .env
5
+ audio_output/
6
+
7
+ # Ignore compiled Python files
8
+ __pycache__/
9
+ *.pyc
10
+ *.pyo
11
+ *.pyd
12
+ sentiment_history/
13
+ # Ignore macOS system files
14
+ .DS_Store
15
+
16
+ # Ignore log files
17
+ *.log
18
+ audio_output
Dockerfile ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Install system dependencies
6
+ RUN apt-get update && apt-get install -y \
7
+ build-essential \
8
+ && rm -rf /var/lib/apt/lists/*
9
+
10
+ # Copy requirements first to leverage Docker cache
11
+ COPY requirements.txt .
12
+ RUN pip install --no-cache-dir -r requirements.txt
13
+
14
+ # Copy the rest of the application
15
+ COPY . .
16
+
17
+ # Create necessary directories
18
+ RUN mkdir -p audio_output sentiment_history
19
+
20
+ # Expose the port Streamlit will run on
21
+ EXPOSE 8501
22
+
23
+ # Command to run the application
24
+ CMD ["streamlit", "run", "app.py", "--server.address", "0.0.0.0"]
api.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FastAPI backend for the News Summarization application."""
2
+
3
+ from fastapi import FastAPI, HTTPException
4
+ from fastapi.staticfiles import StaticFiles
5
+ from pydantic import BaseModel
6
+ from typing import List, Dict, Any
7
+ import uvicorn
8
+ from utils import NewsExtractor, SentimentAnalyzer, TextToSpeechConverter, ComparativeAnalyzer
9
+ import os
10
+ from config import API_PORT, AUDIO_OUTPUT_DIR
11
+ import time
12
+
13
+ app = FastAPI(title="News Summarization API")
14
+
15
+ # Mount static directory for audio files
16
+ os.makedirs(AUDIO_OUTPUT_DIR, exist_ok=True)
17
+ app.mount("/audio", StaticFiles(directory=AUDIO_OUTPUT_DIR), name="audio")
18
+
19
+ # Initialize components
20
+ news_extractor = NewsExtractor()
21
+ sentiment_analyzer = SentimentAnalyzer()
22
+ tts_converter = TextToSpeechConverter()
23
+ comparative_analyzer = ComparativeAnalyzer()
24
+
25
+ class CompanyRequest(BaseModel):
26
+ name: str
27
+
28
+ class AnalysisResponse(BaseModel):
29
+ company: str
30
+ articles: List[Dict[str, Any]]
31
+ comparative_sentiment_score: Dict[str, Any]
32
+ final_sentiment_analysis: str
33
+ audio_url: str = None
34
+
35
+ @app.post("/api/analyze", response_model=AnalysisResponse)
36
+ async def analyze_company(request: CompanyRequest):
37
+ """Analyze news articles for a given company."""
38
+ try:
39
+ # Extract news articles
40
+ articles = news_extractor.search_news(request.name)
41
+ if not articles:
42
+ raise HTTPException(status_code=404, detail="No articles found for the company")
43
+
44
+ # Analyze each article
45
+ analyzed_articles = []
46
+ for article in articles:
47
+ analysis = sentiment_analyzer.analyze_article(article)
48
+ # Add company name to each article
49
+ analysis['company'] = request.name
50
+ analyzed_articles.append(analysis)
51
+
52
+ # Perform comparative analysis
53
+ comparison = comparative_analyzer.analyze_coverage(analyzed_articles, company_name=request.name)
54
+ final_analysis = comparison["final_sentiment"]
55
+
56
+ # Generate Hindi audio for final analysis
57
+ audio_filename = f"{request.name.lower().replace(' ', '_')}_{int(time.time())}"
58
+ audio_path = tts_converter.generate_audio(final_analysis, audio_filename)
59
+ audio_url = f"/audio/{os.path.basename(audio_path)}"
60
+
61
+ return {
62
+ "company": request.name,
63
+ "articles": analyzed_articles,
64
+ "comparative_sentiment_score": comparison,
65
+ "final_sentiment_analysis": final_analysis,
66
+ "audio_url": audio_url
67
+ }
68
+ except Exception as e:
69
+ raise HTTPException(status_code=500, detail=str(e))
70
+
71
+ if __name__ == "__main__":
72
+ uvicorn.run(app, host="0.0.0.0", port=API_PORT)
app.py ADDED
@@ -0,0 +1,431 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Streamlit frontend for the News Summarization application."""
2
+
3
+ import streamlit as st
4
+ import requests
5
+ import pandas as pd
6
+ import json
7
+ from config import API_BASE_URL
8
+ import os
9
+ import plotly.express as px
10
+ import altair as alt
11
+
12
+ st.set_page_config(
13
+ page_title="News Summarization App",
14
+ page_icon="📰",
15
+ layout="wide"
16
+ )
17
+
18
+ def analyze_company(company_name):
19
+ """Send analysis request to API."""
20
+ try:
21
+ response = requests.post(
22
+ f"{API_BASE_URL}/api/analyze",
23
+ json={"name": company_name}
24
+ )
25
+ if response.status_code == 200:
26
+ data = response.json()
27
+ # Print the response data for debugging
28
+ print("API Response Data:")
29
+ print(json.dumps(data, indent=2))
30
+
31
+ # Download audio file if available
32
+ if 'audio_url' in data:
33
+ audio_response = requests.get(f"{API_BASE_URL}{data['audio_url']}")
34
+ if audio_response.status_code == 200:
35
+ data['audio_content'] = audio_response.content
36
+ return data
37
+ else:
38
+ st.error(f"Error from API: {response.text}")
39
+ return {"articles": [], "comparative_sentiment_score": {}, "final_sentiment_analysis": "", "audio_url": None}
40
+ except Exception as e:
41
+ st.error(f"Error analyzing company: {str(e)}")
42
+ return {"articles": [], "comparative_sentiment_score": {}, "final_sentiment_analysis": "", "audio_url": None}
43
+
44
+ def main():
45
+ st.title("📰 News Summarization and Analysis")
46
+
47
+ # Sidebar
48
+ st.sidebar.header("Settings")
49
+
50
+ # Replace dropdown with text input
51
+ company = st.sidebar.text_input(
52
+ "Enter Company Name",
53
+ placeholder="e.g., Tesla, Apple, Microsoft, or any other company",
54
+ help="Enter the name of any company you want to analyze"
55
+ )
56
+
57
+ if st.sidebar.button("Analyze") and company:
58
+ if len(company.strip()) < 2:
59
+ st.sidebar.error("Please enter a valid company name (at least 2 characters)")
60
+ else:
61
+ with st.spinner("Analyzing news articles..."):
62
+ result = analyze_company(company)
63
+
64
+ if result and result.get("articles"):
65
+ # Display Articles
66
+ st.header("📑 News Articles")
67
+ for idx, article in enumerate(result["articles"], 1):
68
+ with st.expander(f"Article {idx}: {article['title']}"):
69
+ st.write("**Content:**", article.get("content", "No content available"))
70
+ if "summary" in article:
71
+ st.write("**Summary:**", article["summary"])
72
+ st.write("**Source:**", article.get("source", "Unknown"))
73
+
74
+ # Enhanced sentiment display
75
+ if "sentiment" in article:
76
+ sentiment_col1, sentiment_col2 = st.columns(2)
77
+ with sentiment_col1:
78
+ st.write("**Sentiment:**", article["sentiment"])
79
+ st.write("**Confidence Score:**", f"{article.get('sentiment_score', 0)*100:.1f}%")
80
+
81
+ with sentiment_col2:
82
+ # Display fine-grained sentiment if available
83
+ if "fine_grained_sentiment" in article and article["fine_grained_sentiment"]:
84
+ fine_grained = article["fine_grained_sentiment"]
85
+ if "category" in fine_grained:
86
+ st.write("**Detailed Sentiment:**", fine_grained["category"])
87
+ if "confidence" in fine_grained:
88
+ st.write("**Confidence:**", f"{fine_grained['confidence']*100:.1f}%")
89
+
90
+ # Display sentiment indices if available
91
+ if "sentiment_indices" in article and article["sentiment_indices"]:
92
+ st.markdown("**Sentiment Indices:**")
93
+ indices = article["sentiment_indices"]
94
+
95
+ # Create columns for displaying indices
96
+ idx_cols = st.columns(3)
97
+
98
+ # Display positivity and negativity in first column
99
+ with idx_cols[0]:
100
+ if "positivity_index" in indices:
101
+ st.markdown(f"**Positivity:** {indices['positivity_index']:.2f}")
102
+ if "negativity_index" in indices:
103
+ st.markdown(f"**Negativity:** {indices['negativity_index']:.2f}")
104
+
105
+ # Display emotional intensity and controversy in second column
106
+ with idx_cols[1]:
107
+ if "emotional_intensity" in indices:
108
+ st.markdown(f"**Emotional Intensity:** {indices['emotional_intensity']:.2f}")
109
+ if "controversy_score" in indices:
110
+ st.markdown(f"**Controversy:** {indices['controversy_score']:.2f}")
111
+
112
+ # Display confidence and ESG in third column
113
+ with idx_cols[2]:
114
+ if "confidence_score" in indices:
115
+ st.markdown(f"**Confidence:** {indices['confidence_score']:.2f}")
116
+ if "esg_relevance" in indices:
117
+ st.markdown(f"**ESG Relevance:** {indices['esg_relevance']:.2f}")
118
+
119
+ # Display entities if available
120
+ if "entities" in article and article["entities"]:
121
+ st.markdown("**Named Entities:**")
122
+ entities = article["entities"]
123
+
124
+ # Organizations
125
+ if "ORG" in entities and entities["ORG"]:
126
+ st.write("**Organizations:**", ", ".join(entities["ORG"]))
127
+
128
+ # People
129
+ if "PERSON" in entities and entities["PERSON"]:
130
+ st.write("**People:**", ", ".join(entities["PERSON"]))
131
+
132
+ # Locations
133
+ if "GPE" in entities and entities["GPE"]:
134
+ st.write("**Locations:**", ", ".join(entities["GPE"]))
135
+
136
+ # Money
137
+ if "MONEY" in entities and entities["MONEY"]:
138
+ st.write("**Financial Values:**", ", ".join(entities["MONEY"]))
139
+
140
+ # Display sentiment targets if available
141
+ if "sentiment_targets" in article and article["sentiment_targets"]:
142
+ st.markdown("**Sentiment Targets:**")
143
+ targets = article["sentiment_targets"]
144
+ for target in targets:
145
+ st.markdown(f"**{target['entity']}** ({target['type']}): {target['sentiment']} ({target['confidence']*100:.1f}%)")
146
+ st.markdown(f"> {target['context']}")
147
+ st.markdown("---")
148
+
149
+ if "url" in article:
150
+ st.write("**[Read More](%s)**" % article["url"])
151
+
152
+ # Display Comparative Analysis
153
+ st.header("📊 Comparative Analysis")
154
+ analysis = result.get("comparative_sentiment_score", {})
155
+
156
+ # Sentiment Distribution
157
+ if "sentiment_distribution" in analysis:
158
+ st.subheader("Sentiment Distribution")
159
+
160
+ # Debug: Print sentiment distribution data
161
+ print("Sentiment Distribution Data:")
162
+ print(json.dumps(analysis["sentiment_distribution"], indent=2))
163
+
164
+ sentiment_dist = analysis["sentiment_distribution"]
165
+
166
+ # Create a very simple visualization that will definitely work
167
+ try:
168
+ # Extract basic sentiment data
169
+ if isinstance(sentiment_dist, dict):
170
+ if "basic" in sentiment_dist and isinstance(sentiment_dist["basic"], dict):
171
+ basic_dist = sentiment_dist["basic"]
172
+ elif any(k in sentiment_dist for k in ['positive', 'negative', 'neutral']):
173
+ basic_dist = {k: v for k, v in sentiment_dist.items()
174
+ if k in ['positive', 'negative', 'neutral']}
175
+ else:
176
+ basic_dist = {'positive': 0, 'negative': 0, 'neutral': 1}
177
+ else:
178
+ basic_dist = {'positive': 0, 'negative': 0, 'neutral': 1}
179
+
180
+ # Calculate percentages
181
+ total_articles = sum(basic_dist.values())
182
+ if total_articles > 0:
183
+ percentages = {
184
+ k: (v / total_articles) * 100
185
+ for k, v in basic_dist.items()
186
+ }
187
+ else:
188
+ percentages = {k: 0 for k in basic_dist}
189
+
190
+ # Display as simple text and metrics
191
+ st.write("**Sentiment Distribution:**")
192
+
193
+ col1, col2, col3 = st.columns(3)
194
+ with col1:
195
+ st.metric(
196
+ "Positive",
197
+ basic_dist.get('positive', 0),
198
+ f"{percentages.get('positive', 0):.1f}%"
199
+ )
200
+ with col2:
201
+ st.metric(
202
+ "Negative",
203
+ basic_dist.get('negative', 0),
204
+ f"{percentages.get('negative', 0):.1f}%"
205
+ )
206
+ with col3:
207
+ st.metric(
208
+ "Neutral",
209
+ basic_dist.get('neutral', 0),
210
+ f"{percentages.get('neutral', 0):.1f}%"
211
+ )
212
+
213
+ # Create a simple bar chart using Altair
214
+
215
+ # Create a simple DataFrame with consistent capitalization and percentages
216
+ chart_data = pd.DataFrame({
217
+ 'Sentiment': ['Positive', 'Negative', 'Neutral'],
218
+ 'Count': [
219
+ basic_dist.get('positive', 0), # Map lowercase keys to capitalized display
220
+ basic_dist.get('negative', 0),
221
+ basic_dist.get('neutral', 0)
222
+ ],
223
+ 'Percentage': [
224
+ f"{percentages.get('positive', 0):.1f}%",
225
+ f"{percentages.get('negative', 0):.1f}%",
226
+ f"{percentages.get('neutral', 0):.1f}%"
227
+ ]
228
+ })
229
+
230
+ # Add debug output to see what's in the data
231
+ print("Chart Data for Sentiment Distribution:")
232
+ print(chart_data)
233
+
234
+ # Create a simple bar chart with percentages
235
+ chart = alt.Chart(chart_data).mark_bar().encode(
236
+ y='Sentiment', # Changed from x to y for horizontal bars
237
+ x='Count', # Changed from y to x for horizontal bars
238
+ color=alt.Color('Sentiment', scale=alt.Scale(
239
+ domain=['Positive', 'Negative', 'Neutral'],
240
+ range=['green', 'red', 'gray']
241
+ )),
242
+ tooltip=['Sentiment', 'Count', 'Percentage'] # Add tooltip with percentage
243
+ ).properties(
244
+ width=600,
245
+ height=300
246
+ )
247
+
248
+ # Add text labels with percentages
249
+ text = chart.mark_text(
250
+ align='left',
251
+ baseline='middle',
252
+ dx=3 # Nudge text to the right so it doesn't overlap with the bar
253
+ ).encode(
254
+ text='Percentage'
255
+ )
256
+
257
+ # Combine the chart and text
258
+ chart_with_text = (chart + text)
259
+
260
+ st.altair_chart(chart_with_text, use_container_width=True)
261
+
262
+ except Exception as e:
263
+ st.error(f"Error creating visualization: {str(e)}")
264
+ st.write("Fallback to simple text display:")
265
+ if isinstance(sentiment_dist, dict):
266
+ if "basic" in sentiment_dist:
267
+ st.write(f"Positive: {sentiment_dist['basic'].get('positive', 0)}")
268
+ st.write(f"Negative: {sentiment_dist['basic'].get('negative', 0)}")
269
+ st.write(f"Neutral: {sentiment_dist['basic'].get('neutral', 0)}")
270
+ else:
271
+ st.write(f"Positive: {sentiment_dist.get('positive', 0)}")
272
+ st.write(f"Negative: {sentiment_dist.get('negative', 0)}")
273
+ st.write(f"Neutral: {sentiment_dist.get('neutral', 0)}")
274
+ else:
275
+ st.write("No valid sentiment data available")
276
+
277
+ # Display sentiment indices if available
278
+ if "sentiment_indices" in analysis and analysis["sentiment_indices"]:
279
+ st.subheader("Sentiment Indices")
280
+
281
+ # Debug: Print sentiment indices
282
+ print("Sentiment Indices:")
283
+ print(json.dumps(analysis["sentiment_indices"], indent=2))
284
+
285
+ # Get the indices data
286
+ indices = analysis["sentiment_indices"]
287
+
288
+ # Create a very simple visualization that will definitely work
289
+ try:
290
+ if isinstance(indices, dict):
291
+ # Display as simple metrics in columns
292
+ cols = st.columns(3)
293
+
294
+ # Define display names and descriptions
295
+ display_names = {
296
+ "positivity_index": "Positivity",
297
+ "negativity_index": "Negativity",
298
+ "emotional_intensity": "Emotional Intensity",
299
+ "controversy_score": "Controversy",
300
+ "confidence_score": "Confidence",
301
+ "esg_relevance": "ESG Relevance"
302
+ }
303
+
304
+ # Display each index as a metric
305
+ for i, (key, value) in enumerate(indices.items()):
306
+ if isinstance(value, (int, float)):
307
+ with cols[i % 3]:
308
+ display_name = display_names.get(key, key.replace("_", " ").title())
309
+ st.metric(display_name, f"{value:.2f}")
310
+
311
+ # Create a simple bar chart using Altair
312
+
313
+ # Create a simple DataFrame
314
+ chart_data = pd.DataFrame({
315
+ 'Index': [display_names.get(k, k.replace("_", " ").title()) for k in indices.keys()],
316
+ 'Value': [v if isinstance(v, (int, float)) else 0 for v in indices.values()]
317
+ })
318
+
319
+ # Create a simple bar chart
320
+ chart = alt.Chart(chart_data).mark_bar().encode(
321
+ x='Value',
322
+ y='Index',
323
+ color=alt.Color('Index')
324
+ ).properties(
325
+ width=600,
326
+ height=300
327
+ )
328
+
329
+ st.altair_chart(chart, use_container_width=True)
330
+
331
+ # Add descriptions
332
+ with st.expander("Sentiment Indices Explained"):
333
+ st.markdown("""
334
+ - **Positivity**: Measures the positive sentiment in the articles (0-1)
335
+ - **Negativity**: Measures the negative sentiment in the articles (0-1)
336
+ - **Emotional Intensity**: Measures the overall emotional content (0-1)
337
+ - **Controversy**: High when both positive and negative sentiments are strong (0-1)
338
+ - **Confidence**: Confidence in the sentiment analysis (0-1)
339
+ - **ESG Relevance**: Relevance to Environmental, Social, and Governance topics (0-1)
340
+ """)
341
+ else:
342
+ st.warning("Sentiment indices data is not in the expected format.")
343
+ st.write("No valid sentiment indices available")
344
+ except Exception as e:
345
+ st.error(f"Error creating indices visualization: {str(e)}")
346
+ st.write("Fallback to simple text display:")
347
+ if isinstance(indices, dict):
348
+ for key, value in indices.items():
349
+ if isinstance(value, (int, float)):
350
+ st.write(f"{key.replace('_', ' ').title()}: {value:.2f}")
351
+ else:
352
+ st.write("No valid sentiment indices data available")
353
+
354
+ # Source Distribution
355
+ if "source_distribution" in analysis:
356
+ st.subheader("Source Distribution")
357
+ source_df = pd.DataFrame.from_dict(
358
+ analysis["source_distribution"],
359
+ orient='index',
360
+ columns=['Count']
361
+ )
362
+ st.bar_chart(source_df)
363
+
364
+ # Common Topics
365
+ if "common_topics" in analysis:
366
+ st.subheader("Common Topics")
367
+ st.write(", ".join(analysis["common_topics"]) if analysis["common_topics"] else "No common topics found")
368
+
369
+ # Coverage Differences
370
+ if "coverage_differences" in analysis:
371
+ st.subheader("Coverage Analysis")
372
+ for diff in analysis["coverage_differences"]:
373
+ st.write("- " + diff)
374
+
375
+ # Display Final Sentiment and Audio
376
+ st.header("🎯 Final Analysis")
377
+ if "final_sentiment_analysis" in result:
378
+ st.write(result["final_sentiment_analysis"])
379
+
380
+ # Display sentiment indices in the sidebar if available
381
+ if "sentiment_indices" in analysis and analysis["sentiment_indices"]:
382
+ indices = analysis["sentiment_indices"]
383
+ # Verify we have valid data
384
+ if indices and any(isinstance(v, (int, float)) for v in indices.values()):
385
+ st.sidebar.markdown("### Sentiment Indices")
386
+ for idx_name, idx_value in indices.items():
387
+ if isinstance(idx_value, (int, float)):
388
+ formatted_name = " ".join(word.capitalize() for word in idx_name.replace("_", " ").split())
389
+ st.sidebar.metric(formatted_name, f"{idx_value:.2f}")
390
+
391
+ # Display ensemble model information if available
392
+ if "ensemble_info" in result:
393
+ with st.expander("Ensemble Model Details"):
394
+ ensemble = result["ensemble_info"]
395
+
396
+ # Model agreement
397
+ if "agreement" in ensemble:
398
+ st.metric("Model Agreement", f"{ensemble['agreement']*100:.1f}%")
399
+
400
+ # Individual model results
401
+ if "models" in ensemble:
402
+ st.subheader("Individual Model Results")
403
+ models_data = []
404
+ for model_name, model_info in ensemble["models"].items():
405
+ models_data.append({
406
+ "Model": model_name,
407
+ "Sentiment": model_info.get("sentiment", "N/A"),
408
+ "Confidence": f"{model_info.get('confidence', 0)*100:.1f}%"
409
+ })
410
+
411
+ if models_data:
412
+ st.table(pd.DataFrame(models_data))
413
+
414
+ # Audio Playback Section
415
+ st.subheader("🔊 Listen to Analysis (Hindi)")
416
+ if 'audio_content' in result:
417
+ st.audio(result['audio_content'], format='audio/mp3')
418
+ else:
419
+ st.warning("Hindi audio summary not available")
420
+
421
+ # Total Articles
422
+ if "total_articles" in analysis:
423
+ st.sidebar.info(f"Found {analysis['total_articles']} articles")
424
+
425
+ # Add a disclaimer
426
+ st.sidebar.markdown("---")
427
+ st.sidebar.markdown("### About")
428
+ st.sidebar.write("This app analyzes news articles and provides sentiment analysis for any company.")
429
+
430
+ if __name__ == "__main__":
431
+ main()
config.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Configuration settings for the News Summarization application."""
2
+
3
+ import os
4
+ from dotenv import load_dotenv
5
+
6
+ # Load environment variables
7
+ load_dotenv()
8
+
9
+ # API Settings
10
+ API_HOST = os.getenv("API_HOST", "0.0.0.0")
11
+ API_PORT = int(os.getenv("API_PORT", "8005"))
12
+ API_BASE_URL = os.getenv("API_BASE_URL", f"http://{API_HOST}:{API_PORT}")
13
+
14
+ # News Scraping Settings
15
+ ARTICLES_PER_SOURCE = int(os.getenv("ARTICLES_PER_SOURCE", "10"))
16
+ USER_AGENT = os.getenv("USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
17
+
18
+ # RSS Feed Settings
19
+ RSS_FEEDS = {
20
+ "BBC": "http://feeds.bbci.co.uk/news/business/rss.xml",
21
+ "CNN": "http://rss.cnn.com/rss/money_news_international.rss",
22
+ "FoxBusiness": "http://feeds.foxnews.com/foxbusiness/latest"
23
+ }
24
+
25
+ # Model Settings
26
+ SENTIMENT_MODEL = "yiyanghkust/finbert-tone" # More advanced financial sentiment model
27
+ SENTIMENT_FINE_GRAINED_MODEL = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
28
+ SUMMARIZATION_MODEL = "t5-base"
29
+
30
+ # Additional Fine-Grained Sentiment Models
31
+ FINE_GRAINED_MODELS = {
32
+ "financial": "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
33
+ "emotion": "j-hartmann/emotion-english-distilroberta-base",
34
+ "aspect": "yangheng/deberta-v3-base-absa-v1.1",
35
+ "esg": "yiyanghkust/finbert-esg",
36
+ "news_tone": "ProsusAI/finbert"
37
+ }
38
+
39
+ # Fine-Grained Sentiment Categories
40
+ SENTIMENT_CATEGORIES = {
41
+ "financial": ["positive", "negative", "neutral"],
42
+ "emotion": ["joy", "sadness", "anger", "fear", "surprise", "disgust", "neutral"],
43
+ "aspect": ["positive", "negative", "neutral"],
44
+ "esg": ["environmental", "social", "governance", "neutral"],
45
+ "news_tone": ["positive", "negative", "neutral"]
46
+ }
47
+
48
+ # Cache Settings
49
+ CACHE_DIR = os.getenv("CACHE_DIR", ".cache")
50
+ CACHE_EXPIRY = int(os.getenv("CACHE_EXPIRY", "3600")) # 1 hour
51
+ CACHE_DURATION = int(os.getenv("CACHE_DURATION", "300")) # 5 minutes in seconds
52
+
53
+ # Audio Settings
54
+ AUDIO_OUTPUT_DIR = os.getenv("AUDIO_OUTPUT_DIR", "audio_output")
55
+ DEFAULT_LANG = os.getenv("DEFAULT_LANG", "hi") # Hindi
56
+
57
+ # News Sources
58
+ NEWS_SOURCES = {
59
+ # Major News Aggregators
60
+ "google": "https://www.google.com/search?q={}&tbm=nws",
61
+ "bing": "https://www.bing.com/news/search?q={}",
62
+ "yahoo": "https://news.search.yahoo.com/search?p={}",
63
+
64
+ # Financial News
65
+ "reuters": "https://www.reuters.com/search/news?blob={}",
66
+ "marketwatch": "https://www.marketwatch.com/search?q={}&ts=0&tab=All%20News",
67
+ "investing": "https://www.investing.com/search/?q={}&tab=news",
68
+
69
+ # Tech News
70
+ "techcrunch": "https://techcrunch.com/search/{}",
71
+ "zdnet": "https://www.zdnet.com/search/?q={}",
72
+ }
73
+
74
+ # Article limits
75
+ MIN_ARTICLES = 20
76
+ MAX_ARTICLES_PER_SOURCE = 10 # Adjusted for more sources
77
+ MAX_ARTICLES = 50 # Increased to accommodate more sources
78
+
79
+ # Browser Headers
80
+ HEADERS = {
81
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
82
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
83
+ "Accept-Language": "en-US,en;q=0.5",
84
+ "Connection": "keep-alive"
85
+ }
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit==1.31.1
2
+ beautifulsoup4==4.12.2
3
+ requests==2.31.0
4
+ pandas==2.2.0
5
+ nltk==3.8.1
6
+ transformers==4.37.2
7
+ torch==2.2.0
8
+ fastapi==0.109.2
9
+ uvicorn==0.27.1
10
+ python-multipart==0.0.6
11
+ gTTS==2.5.0
12
+ scikit-learn==1.4.0
13
+ numpy==1.26.3
14
+ python-dotenv==1.0.1
15
+ aiofiles==23.2.1
16
+ googletrans==3.1.0a0
17
+ lxml==4.9.3
18
+ spacy==3.7.2
19
+ plotly==5.18.0
20
+ textblob==0.17.1
21
+ vaderSentiment==3.3.2
utils.py ADDED
@@ -0,0 +1,1402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utility functions for news extraction, sentiment analysis, and text-to-speech."""
2
+
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
+ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
6
+ from gtts import gTTS
7
+ import os
8
+ from typing import List, Dict, Any
9
+ import pandas as pd
10
+ from sklearn.feature_extraction.text import TfidfVectorizer
11
+ from config import *
12
+ import re
13
+ from datetime import datetime, timedelta
14
+ import time
15
+ import json
16
+ from googletrans import Translator
17
+ import statistics
18
+
19
+ class NewsExtractor:
20
+ def __init__(self):
21
+ self.headers = HEADERS
22
+
23
+ def search_news(self, company_name: str) -> List[Dict[str, str]]:
24
+ """Extract news articles about the company ensuring minimum count."""
25
+ all_articles = []
26
+ retries = 2 # Number of retries if we don't get enough articles
27
+
28
+ while retries > 0 and len(all_articles) < MIN_ARTICLES:
29
+ for source, url_template in NEWS_SOURCES.items():
30
+ try:
31
+ url = url_template.format(company_name.replace(" ", "+"))
32
+ print(f"\nSearching {source} for news about {company_name}...")
33
+
34
+ # Try different page numbers for more articles
35
+ for page in range(2): # Try first two pages
36
+ page_url = url
37
+ if page > 0:
38
+ if source == "google":
39
+ page_url += f"&start={page * 10}"
40
+ elif source == "bing":
41
+ page_url += f"&first={page * 10 + 1}"
42
+ elif source == "yahoo":
43
+ page_url += f"&b={page * 10 + 1}"
44
+ elif source == "reuters":
45
+ page_url += f"&page={page + 1}"
46
+ elif source == "marketwatch":
47
+ page_url += f"&page={page + 1}"
48
+ elif source == "investing":
49
+ page_url += f"&page={page + 1}"
50
+ elif source == "techcrunch":
51
+ page_url += f"/page/{page + 1}"
52
+ elif source == "zdnet":
53
+ page_url += f"&page={page + 1}"
54
+
55
+ response = requests.get(page_url, headers=self.headers, timeout=15)
56
+ if response.status_code != 200:
57
+ print(f"Error: {source} page {page+1} returned status code {response.status_code}")
58
+ continue
59
+
60
+ soup = BeautifulSoup(response.content, 'html.parser')
61
+
62
+ source_articles = []
63
+ if source == "google":
64
+ source_articles = self._parse_google_news(soup)
65
+ elif source == "bing":
66
+ source_articles = self._parse_bing_news(soup)
67
+ elif source == "yahoo":
68
+ source_articles = self._parse_yahoo_news(soup)
69
+ elif source == "reuters":
70
+ source_articles = self._parse_reuters_news(soup)
71
+ elif source == "marketwatch":
72
+ source_articles = self._parse_marketwatch_news(soup)
73
+ elif source == "investing":
74
+ source_articles = self._parse_investing_news(soup)
75
+ elif source == "techcrunch":
76
+ source_articles = self._parse_techcrunch_news(soup)
77
+ elif source == "zdnet":
78
+ source_articles = self._parse_zdnet_news(soup)
79
+
80
+ # Limit articles per source
81
+ if source_articles:
82
+ source_articles = source_articles[:MAX_ARTICLES_PER_SOURCE]
83
+ all_articles.extend(source_articles)
84
+ print(f"Found {len(source_articles)} articles from {source} page {page+1}")
85
+
86
+ # If we have enough articles, break the page loop
87
+ if len(all_articles) >= MIN_ARTICLES:
88
+ break
89
+
90
+ except Exception as e:
91
+ print(f"Error fetching from {source}: {str(e)}")
92
+ continue
93
+
94
+ # If we have enough articles, break the source loop
95
+ if len(all_articles) >= MIN_ARTICLES:
96
+ break
97
+
98
+ retries -= 1
99
+ if len(all_articles) < MIN_ARTICLES and retries > 0:
100
+ print(f"\nFound only {len(all_articles)} articles, retrying...")
101
+
102
+ # Remove duplicates
103
+ unique_articles = self._remove_duplicates(all_articles)
104
+ print(f"\nFound {len(unique_articles)} unique articles")
105
+
106
+ if len(unique_articles) < MIN_ARTICLES:
107
+ print(f"Warning: Could only find {len(unique_articles)} unique articles, fewer than minimum {MIN_ARTICLES}")
108
+
109
+ # Balance articles across sources
110
+ balanced_articles = self._balance_sources(unique_articles)
111
+ return balanced_articles[:max(MIN_ARTICLES, MAX_ARTICLES)]
112
+
113
+ def _balance_sources(self, articles: List[Dict[str, str]]) -> List[Dict[str, str]]:
114
+ """Balance articles across sources while maintaining minimum count."""
115
+ source_articles = {}
116
+
117
+ # Group articles by source
118
+ for article in articles:
119
+ source = article['source']
120
+ if source not in source_articles:
121
+ source_articles[source] = []
122
+ source_articles[source].append(article)
123
+
124
+ # Calculate target articles per source
125
+ total_sources = len(source_articles)
126
+ target_per_source = max(MIN_ARTICLES // total_sources,
127
+ MAX_ARTICLES_PER_SOURCE)
128
+
129
+ # Get articles from each source
130
+ balanced = []
131
+ for source, articles_list in source_articles.items():
132
+ balanced.extend(articles_list[:target_per_source])
133
+
134
+ # If we still need more articles to meet minimum, add more from sources
135
+ # that have additional articles
136
+ if len(balanced) < MIN_ARTICLES:
137
+ remaining = []
138
+ for articles_list in source_articles.values():
139
+ remaining.extend(articles_list[target_per_source:])
140
+
141
+ # Sort remaining by source to maintain balance
142
+ remaining.sort(key=lambda x: len([a for a in balanced if a['source'] == x['source']]))
143
+
144
+ while len(balanced) < MIN_ARTICLES and remaining:
145
+ balanced.append(remaining.pop(0))
146
+
147
+ return balanced
148
+
149
+ def _parse_google_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
150
+ """Parse Google News search results."""
151
+ articles = []
152
+ for div in soup.find_all(['div', 'article'], class_=['g', 'xuvV6b', 'WlydOe']):
153
+ try:
154
+ title_elem = div.find(['h3', 'h4'])
155
+ snippet_elem = div.find('div', class_=['VwiC3b', 'yy6M1d'])
156
+ link_elem = div.find('a')
157
+ source_elem = div.find(['div', 'span'], class_='UPmit')
158
+
159
+ if title_elem and snippet_elem and link_elem:
160
+ source = source_elem.get_text(strip=True) if source_elem else 'Google News'
161
+ articles.append({
162
+ 'title': title_elem.get_text(strip=True),
163
+ 'content': snippet_elem.get_text(strip=True),
164
+ 'url': link_elem['href'],
165
+ 'source': source
166
+ })
167
+ except Exception as e:
168
+ print(f"Error parsing Google article: {str(e)}")
169
+ continue
170
+ return articles
171
+
172
+ def _parse_bing_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
173
+ """Parse Bing News search results."""
174
+ articles = []
175
+ for article in soup.find_all(['div', 'article'], class_=['news-card', 'newsitem', 'item-info']):
176
+ try:
177
+ title_elem = article.find(['a', 'h3'], class_=['title', 'news-card-title'])
178
+ snippet_elem = article.find(['div', 'p'], class_=['snippet', 'description'])
179
+ source_elem = article.find(['div', 'span'], class_=['source', 'provider'])
180
+
181
+ if title_elem and snippet_elem:
182
+ source = source_elem.get_text(strip=True) if source_elem else 'Bing News'
183
+ url = title_elem['href'] if 'href' in title_elem.attrs else ''
184
+ articles.append({
185
+ 'title': title_elem.get_text(strip=True),
186
+ 'content': snippet_elem.get_text(strip=True),
187
+ 'url': url,
188
+ 'source': source
189
+ })
190
+ except Exception as e:
191
+ print(f"Error parsing Bing article: {str(e)}")
192
+ return articles
193
+
194
+ def _parse_yahoo_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
195
+ """Parse Yahoo News search results."""
196
+ articles = []
197
+ for article in soup.find_all('div', class_='NewsArticle'):
198
+ try:
199
+ title_elem = article.find(['h4', 'h3', 'a'])
200
+ snippet_elem = article.find('p')
201
+ source_elem = article.find(['span', 'div'], class_=['provider', 'source'])
202
+
203
+ if title_elem and snippet_elem:
204
+ source = source_elem.get_text(strip=True) if source_elem else 'Yahoo News'
205
+ url = title_elem.find('a')['href'] if title_elem.find('a') else ''
206
+ articles.append({
207
+ 'title': title_elem.get_text(strip=True),
208
+ 'content': snippet_elem.get_text(strip=True),
209
+ 'url': url,
210
+ 'source': source
211
+ })
212
+ except Exception as e:
213
+ print(f"Error parsing Yahoo article: {str(e)}")
214
+ return articles
215
+
216
+ def _parse_reuters_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
217
+ """Parse Reuters search results."""
218
+ articles = []
219
+ for article in soup.find_all(['div', 'article'], class_=['search-result-content', 'story']):
220
+ try:
221
+ title_elem = article.find(['h3', 'a'], class_='story-title')
222
+ snippet_elem = article.find(['p', 'div'], class_=['story-description', 'description'])
223
+
224
+ if title_elem:
225
+ url = title_elem.find('a')['href'] if title_elem.find('a') else ''
226
+ if url and not url.startswith('http'):
227
+ url = 'https://www.reuters.com' + url
228
+
229
+ articles.append({
230
+ 'title': title_elem.get_text(strip=True),
231
+ 'content': snippet_elem.get_text(strip=True) if snippet_elem else '',
232
+ 'url': url,
233
+ 'source': 'Reuters'
234
+ })
235
+ except Exception as e:
236
+ print(f"Error parsing Reuters article: {str(e)}")
237
+ return articles
238
+
239
+ def _parse_marketwatch_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
240
+ """Parse MarketWatch search results."""
241
+ articles = []
242
+ for article in soup.find_all(['div', 'article'], class_=['element--article', 'article__content']):
243
+ try:
244
+ title_elem = article.find(['h3', 'h2'], class_=['article__headline', 'title'])
245
+ snippet_elem = article.find('p', class_=['article__summary', 'description'])
246
+
247
+ if title_elem:
248
+ url = title_elem.find('a')['href'] if title_elem.find('a') else ''
249
+ articles.append({
250
+ 'title': title_elem.get_text(strip=True),
251
+ 'content': snippet_elem.get_text(strip=True) if snippet_elem else '',
252
+ 'url': url,
253
+ 'source': 'MarketWatch'
254
+ })
255
+ except Exception as e:
256
+ print(f"Error parsing MarketWatch article: {str(e)}")
257
+ return articles
258
+
259
+ def _parse_investing_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
260
+ """Parse Investing.com search results."""
261
+ articles = []
262
+ for article in soup.find_all(['div', 'article'], class_=['articleItem', 'news-item']):
263
+ try:
264
+ title_elem = article.find(['a', 'h3'], class_=['title', 'articleTitle'])
265
+ snippet_elem = article.find(['p', 'div'], class_=['description', 'articleContent'])
266
+
267
+ if title_elem:
268
+ url = title_elem['href'] if 'href' in title_elem.attrs else title_elem.find('a')['href']
269
+ if url and not url.startswith('http'):
270
+ url = 'https://www.investing.com' + url
271
+
272
+ articles.append({
273
+ 'title': title_elem.get_text(strip=True),
274
+ 'content': snippet_elem.get_text(strip=True) if snippet_elem else '',
275
+ 'url': url,
276
+ 'source': 'Investing.com'
277
+ })
278
+ except Exception as e:
279
+ print(f"Error parsing Investing.com article: {str(e)}")
280
+ return articles
281
+
282
+ def _parse_techcrunch_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
283
+ """Parse TechCrunch search results."""
284
+ articles = []
285
+ for article in soup.find_all(['div', 'article'], class_=['post-block', 'article-block']):
286
+ try:
287
+ title_elem = article.find(['h2', 'h3', 'a'], class_=['post-block__title', 'article-title'])
288
+ snippet_elem = article.find(['div', 'p'], class_=['post-block__content', 'article-content'])
289
+
290
+ if title_elem:
291
+ url = title_elem.find('a')['href'] if title_elem.find('a') else ''
292
+ articles.append({
293
+ 'title': title_elem.get_text(strip=True),
294
+ 'content': snippet_elem.get_text(strip=True) if snippet_elem else '',
295
+ 'url': url,
296
+ 'source': 'TechCrunch'
297
+ })
298
+ except Exception as e:
299
+ print(f"Error parsing TechCrunch article: {str(e)}")
300
+ return articles
301
+
302
+ def _parse_zdnet_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
303
+ """Parse ZDNet search results."""
304
+ articles = []
305
+ for article in soup.find_all(['div', 'article'], class_=['item', 'article']):
306
+ try:
307
+ title_elem = article.find(['h3', 'a'], class_=['title', 'headline'])
308
+ snippet_elem = article.find(['p', 'div'], class_=['summary', 'content'])
309
+
310
+ if title_elem:
311
+ url = title_elem.find('a')['href'] if title_elem.find('a') else ''
312
+ if url and not url.startswith('http'):
313
+ url = 'https://www.zdnet.com' + url
314
+
315
+ articles.append({
316
+ 'title': title_elem.get_text(strip=True),
317
+ 'content': snippet_elem.get_text(strip=True) if snippet_elem else '',
318
+ 'url': url,
319
+ 'source': 'ZDNet'
320
+ })
321
+ except Exception as e:
322
+ print(f"Error parsing ZDNet article: {str(e)}")
323
+ return articles
324
+
325
+ def _remove_duplicates(self, articles: List[Dict[str, str]]) -> List[Dict[str, str]]:
326
+ """Remove duplicate articles based on title similarity."""
327
+ unique_articles = []
328
+ seen_titles = set()
329
+
330
+ for article in articles:
331
+ title = article['title'].lower()
332
+ if not any(title in seen_title or seen_title in title for seen_title in seen_titles):
333
+ unique_articles.append(article)
334
+ seen_titles.add(title)
335
+
336
+ return unique_articles
337
+
338
+ class SentimentAnalyzer:
339
+ def __init__(self):
340
+ try:
341
+ # Primary financial sentiment model
342
+ self.sentiment_pipeline = pipeline("sentiment-analysis",
343
+ model=SENTIMENT_MODEL)
344
+
345
+ # Initialize fine-grained sentiment models
346
+ self.fine_grained_models = {}
347
+ try:
348
+ # Initialize the default fine-grained model for backward compatibility
349
+ self.fine_grained_sentiment = pipeline("sentiment-analysis",
350
+ model=SENTIMENT_FINE_GRAINED_MODEL)
351
+
352
+ # Initialize additional fine-grained models
353
+ for model_name, model_path in FINE_GRAINED_MODELS.items():
354
+ try:
355
+ print(f"Loading fine-grained model: {model_name}")
356
+ self.fine_grained_models[model_name] = pipeline("sentiment-analysis",
357
+ model=model_path)
358
+ except Exception as e:
359
+ print(f"Error loading fine-grained model {model_name}: {str(e)}")
360
+ except Exception as e:
361
+ print(f"Error initializing fine-grained models: {str(e)}")
362
+ self.fine_grained_sentiment = None
363
+
364
+ # Initialize additional sentiment analyzers if available
365
+ self.has_textblob = False
366
+ self.has_vader = False
367
+
368
+ try:
369
+ from textblob import TextBlob
370
+ self.TextBlob = TextBlob
371
+ self.has_textblob = True
372
+ except:
373
+ print("TextBlob not available. Install with: pip install textblob")
374
+
375
+ try:
376
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
377
+ self.vader = SentimentIntensityAnalyzer()
378
+ self.has_vader = True
379
+ except:
380
+ print("VADER not available. Install with: pip install vaderSentiment")
381
+
382
+ self.summarizer = pipeline("summarization",
383
+ model=SUMMARIZATION_MODEL)
384
+ self.vectorizer = TfidfVectorizer(stop_words='english',
385
+ max_features=10)
386
+
387
+ # Initialize NER pipeline if spaCy is available
388
+ try:
389
+ import spacy
390
+ self.nlp = spacy.load("en_core_web_sm")
391
+ self.has_ner = True
392
+ except:
393
+ self.has_ner = False
394
+ print("spaCy not available for NER. Install with: pip install spacy && python -m spacy download en_core_web_sm")
395
+
396
+ except Exception as e:
397
+ print(f"Error initializing sentiment models: {str(e)}")
398
+ # Fallback to default models if specific models fail
399
+ self.sentiment_pipeline = pipeline("sentiment-analysis")
400
+ self.fine_grained_sentiment = None
401
+ self.fine_grained_models = {}
402
+ self.summarizer = pipeline("summarization")
403
+ self.vectorizer = TfidfVectorizer(stop_words='english', max_features=10)
404
+ self.has_ner = False
405
+ self.has_textblob = False
406
+ self.has_vader = False
407
+
408
+ def analyze_article(self, article: Dict[str, str]) -> Dict[str, Any]:
409
+ """Analyze sentiment and generate summary for an article."""
410
+ try:
411
+ # Get the full text by combining title and content
412
+ full_text = f"{article['title']} {article['content']}"
413
+
414
+ # Generate summary
415
+ summary = self.summarize_text(full_text)
416
+
417
+ # Get ensemble sentiment analysis
418
+ sentiment_analysis = self._get_ensemble_sentiment(full_text)
419
+ sentiment_label = sentiment_analysis['ensemble_sentiment']
420
+ sentiment_score = sentiment_analysis['ensemble_score']
421
+
422
+ # Add fine-grained sentiment analysis
423
+ fine_grained_sentiment = self._get_fine_grained_sentiment(full_text)
424
+
425
+ # Extract key topics
426
+ topics = self.extract_topics(full_text)
427
+
428
+ # Extract named entities
429
+ entities = self._extract_entities(full_text)
430
+
431
+ # Extract sentiment targets (entities associated with sentiment)
432
+ sentiment_targets = self._extract_sentiment_targets(full_text, entities)
433
+
434
+ # Add analysis to article
435
+ analyzed_article = article.copy()
436
+ analyzed_article.update({
437
+ 'summary': summary,
438
+ 'sentiment': sentiment_label,
439
+ 'sentiment_score': sentiment_score,
440
+ 'sentiment_details': sentiment_analysis,
441
+ 'fine_grained_sentiment': fine_grained_sentiment,
442
+ 'topics': topics,
443
+ 'entities': entities,
444
+ 'sentiment_targets': sentiment_targets,
445
+ 'sentiment_indices': fine_grained_sentiment.get('indices', {}),
446
+ 'analysis_timestamp': datetime.now().isoformat()
447
+ })
448
+
449
+ return analyzed_article
450
+
451
+ except Exception as e:
452
+ print(f"Error analyzing article: {str(e)}")
453
+ # Return original article with default values if analysis fails
454
+ article.update({
455
+ 'summary': article.get('content', '')[:200] + '...',
456
+ 'sentiment': 'neutral',
457
+ 'sentiment_score': 0.0,
458
+ 'sentiment_details': {},
459
+ 'fine_grained_sentiment': {},
460
+ 'topics': [],
461
+ 'entities': {},
462
+ 'sentiment_targets': [],
463
+ 'sentiment_indices': {
464
+ 'positivity_index': 0.5,
465
+ 'negativity_index': 0.5,
466
+ 'emotional_intensity': 0.0,
467
+ 'controversy_score': 0.0,
468
+ 'confidence_score': 0.0,
469
+ 'esg_relevance': 0.0
470
+ },
471
+ 'analysis_timestamp': datetime.now().isoformat()
472
+ })
473
+ return article
474
+
475
+ def _get_ensemble_sentiment(self, text: str) -> Dict[str, Any]:
476
+ """Get ensemble sentiment by combining multiple sentiment models."""
477
+ results = {}
478
+
479
+ # Initialize with default values
480
+ ensemble_result = {
481
+ 'ensemble_sentiment': 'neutral',
482
+ 'ensemble_score': 0.5,
483
+ 'models': {}
484
+ }
485
+
486
+ try:
487
+ # 1. Primary transformer model (finbert)
488
+ try:
489
+ primary_result = self.sentiment_pipeline(text[:512])[0] # Limit text length
490
+ primary_label = primary_result['label'].lower()
491
+ primary_score = primary_result['score']
492
+
493
+ # Map to standard format
494
+ if primary_label == 'positive':
495
+ primary_normalized = primary_score
496
+ elif primary_label == 'negative':
497
+ primary_normalized = 1 - primary_score
498
+ else: # neutral
499
+ primary_normalized = 0.5
500
+
501
+ ensemble_result['models']['transformer'] = {
502
+ 'sentiment': primary_label,
503
+ 'score': round(primary_score, 3),
504
+ 'normalized_score': round(primary_normalized, 3)
505
+ }
506
+ except:
507
+ ensemble_result['models']['transformer'] = {
508
+ 'sentiment': 'error',
509
+ 'score': 0,
510
+ 'normalized_score': 0.5
511
+ }
512
+
513
+ # 2. TextBlob sentiment
514
+ if self.has_textblob:
515
+ try:
516
+ blob = self.TextBlob(text)
517
+ polarity = blob.sentiment.polarity
518
+
519
+ # Convert to standard format
520
+ if polarity > 0.1:
521
+ textblob_sentiment = 'positive'
522
+ textblob_score = polarity
523
+ elif polarity < -0.1:
524
+ textblob_sentiment = 'negative'
525
+ textblob_score = abs(polarity)
526
+ else:
527
+ textblob_sentiment = 'neutral'
528
+ textblob_score = 0.5
529
+
530
+ # Normalize to 0-1 scale
531
+ textblob_normalized = (polarity + 1) / 2
532
+
533
+ ensemble_result['models']['textblob'] = {
534
+ 'sentiment': textblob_sentiment,
535
+ 'score': round(textblob_score, 3),
536
+ 'normalized_score': round(textblob_normalized, 3)
537
+ }
538
+ except:
539
+ ensemble_result['models']['textblob'] = {
540
+ 'sentiment': 'error',
541
+ 'score': 0,
542
+ 'normalized_score': 0.5
543
+ }
544
+
545
+ # 3. VADER sentiment
546
+ if self.has_vader:
547
+ try:
548
+ vader_scores = self.vader.polarity_scores(text)
549
+ compound = vader_scores['compound']
550
+
551
+ # Convert to standard format
552
+ if compound > 0.05:
553
+ vader_sentiment = 'positive'
554
+ vader_score = compound
555
+ elif compound < -0.05:
556
+ vader_sentiment = 'negative'
557
+ vader_score = abs(compound)
558
+ else:
559
+ vader_sentiment = 'neutral'
560
+ vader_score = 0.5
561
+
562
+ # Normalize to 0-1 scale
563
+ vader_normalized = (compound + 1) / 2
564
+
565
+ ensemble_result['models']['vader'] = {
566
+ 'sentiment': vader_sentiment,
567
+ 'score': round(vader_score, 3),
568
+ 'normalized_score': round(vader_normalized, 3)
569
+ }
570
+ except:
571
+ ensemble_result['models']['vader'] = {
572
+ 'sentiment': 'error',
573
+ 'score': 0,
574
+ 'normalized_score': 0.5
575
+ }
576
+
577
+ # Calculate ensemble result
578
+ # Get all normalized scores
579
+ normalized_scores = []
580
+ for model_name, model_result in ensemble_result['models'].items():
581
+ if model_result['sentiment'] != 'error':
582
+ normalized_scores.append(model_result['normalized_score'])
583
+
584
+ # Calculate average if we have scores
585
+ if normalized_scores:
586
+ avg_score = sum(normalized_scores) / len(normalized_scores)
587
+
588
+ # Convert to sentiment label
589
+ if avg_score > 0.6:
590
+ ensemble_sentiment = 'positive'
591
+ elif avg_score < 0.4:
592
+ ensemble_sentiment = 'negative'
593
+ else:
594
+ ensemble_sentiment = 'neutral'
595
+
596
+ ensemble_result['ensemble_sentiment'] = ensemble_sentiment
597
+ ensemble_result['ensemble_score'] = round(avg_score, 3)
598
+
599
+ # Add confidence level
600
+ if len(normalized_scores) > 1:
601
+ # Calculate standard deviation to measure agreement
602
+ std_dev = statistics.stdev(normalized_scores) if len(normalized_scores) > 1 else 0
603
+ agreement = 1 - (std_dev * 2) # Lower std_dev means higher agreement
604
+ agreement = max(0, min(1, agreement)) # Clamp to 0-1
605
+
606
+ ensemble_result['model_agreement'] = round(agreement, 3)
607
+
608
+ return ensemble_result
609
+
610
+ except Exception as e:
611
+ print(f"Error in ensemble sentiment analysis: {str(e)}")
612
+ return {
613
+ 'ensemble_sentiment': 'neutral',
614
+ 'ensemble_score': 0.5,
615
+ 'models': {}
616
+ }
617
+
618
+ def _get_fine_grained_sentiment(self, text: str) -> Dict[str, Any]:
619
+ """Get fine-grained sentiment analysis with more detailed categories."""
620
+ # Initialize result structure
621
+ result = {
622
+ "primary": {"category": "unknown", "confidence": 0.0},
623
+ "models": {}
624
+ }
625
+
626
+ # Check if we have any fine-grained models
627
+ if not self.fine_grained_sentiment and not self.fine_grained_models:
628
+ return result
629
+
630
+ try:
631
+ # Split text into manageable chunks if too long
632
+ chunks = self._split_text(text)
633
+
634
+ # Process with default fine-grained model for backward compatibility
635
+ if self.fine_grained_sentiment:
636
+ primary_results = []
637
+
638
+ for chunk in chunks:
639
+ if not chunk.strip():
640
+ continue
641
+ chunk_result = self.fine_grained_sentiment(chunk)[0]
642
+ primary_results.append(chunk_result)
643
+
644
+ if primary_results:
645
+ # Aggregate results from all chunks
646
+ categories = {}
647
+ for res in primary_results:
648
+ label = res['label'].lower()
649
+ score = res['score']
650
+ if label in categories:
651
+ categories[label] += score
652
+ else:
653
+ categories[label] = score
654
+
655
+ # Normalize scores
656
+ total = sum(categories.values())
657
+ if total > 0:
658
+ categories = {k: round(v/total, 3) for k, v in categories.items()}
659
+
660
+ # Get dominant category
661
+ dominant_category = max(categories.items(), key=lambda x: x[1])
662
+
663
+ result["primary"] = {
664
+ "category": dominant_category[0],
665
+ "confidence": dominant_category[1],
666
+ "distribution": categories
667
+ }
668
+
669
+ # Process with additional fine-grained models
670
+ for model_name, model in self.fine_grained_models.items():
671
+ model_results = []
672
+
673
+ for chunk in chunks:
674
+ if not chunk.strip():
675
+ continue
676
+ try:
677
+ chunk_result = model(chunk)[0]
678
+ model_results.append(chunk_result)
679
+ except Exception as e:
680
+ print(f"Error analyzing chunk with model {model_name}: {str(e)}")
681
+
682
+ if model_results:
683
+ # Aggregate results from all chunks
684
+ categories = {}
685
+ for res in model_results:
686
+ # Ensure the label is lowercase for consistency
687
+ label = res['label'].lower() if isinstance(res.get('label'), str) else "unknown"
688
+ score = res['score']
689
+ if label in categories:
690
+ categories[label] += score
691
+ else:
692
+ categories[label] = score
693
+
694
+ # Normalize scores
695
+ total = sum(categories.values())
696
+ if total > 0:
697
+ categories = {k: round(v/total, 3) for k, v in categories.items()}
698
+
699
+ # Get dominant category
700
+ dominant_category = max(categories.items(), key=lambda x: x[1])
701
+
702
+ # Store results for this model
703
+ result["models"][model_name] = {
704
+ "category": dominant_category[0],
705
+ "confidence": dominant_category[1],
706
+ "distribution": categories
707
+ }
708
+
709
+ # Calculate sentiment indices based on the fine-grained results
710
+ result["indices"] = self._calculate_sentiment_indices(result)
711
+
712
+ return result
713
+
714
+ except Exception as e:
715
+ print(f"Error in fine-grained sentiment analysis: {str(e)}")
716
+ return result
717
+
718
+ def _calculate_sentiment_indices(self, fine_grained_results: Dict[str, Any]) -> Dict[str, float]:
719
+ """Calculate various sentiment indices based on fine-grained sentiment analysis."""
720
+ indices = {
721
+ "positivity_index": 0.5, # Default neutral value
722
+ "negativity_index": 0.5,
723
+ "emotional_intensity": 0.0,
724
+ "controversy_score": 0.0,
725
+ "confidence_score": 0.0,
726
+ "esg_relevance": 0.0
727
+ }
728
+
729
+ try:
730
+ # Extract distributions from all models
731
+ distributions = {}
732
+ confidence_scores = {}
733
+
734
+ # Add primary model if available
735
+ if "category" in fine_grained_results.get("primary", {}):
736
+ if "distribution" in fine_grained_results["primary"]:
737
+ distributions["primary"] = fine_grained_results["primary"]["distribution"]
738
+ confidence_scores["primary"] = fine_grained_results["primary"].get("confidence", 0.0)
739
+
740
+ # Add other models
741
+ for model_name, model_result in fine_grained_results.get("models", {}).items():
742
+ if "distribution" in model_result:
743
+ distributions[model_name] = model_result["distribution"]
744
+ confidence_scores[model_name] = model_result.get("confidence", 0.0)
745
+
746
+ # Calculate positivity index
747
+ positive_scores = []
748
+ for model_name, dist in distributions.items():
749
+ if model_name == "financial" or model_name == "primary" or model_name == "news_tone" or model_name == "aspect":
750
+ pos_score = dist.get("positive", 0.0)
751
+ positive_scores.append(pos_score)
752
+ elif model_name == "emotion":
753
+ # For emotion model, consider joy as positive
754
+ pos_score = dist.get("joy", 0.0) + dist.get("surprise", 0.0) * 0.5
755
+ positive_scores.append(pos_score)
756
+
757
+ if positive_scores:
758
+ indices["positivity_index"] = round(sum(positive_scores) / len(positive_scores), 3)
759
+
760
+ # Calculate negativity index
761
+ negative_scores = []
762
+ for model_name, dist in distributions.items():
763
+ if model_name == "financial" or model_name == "primary" or model_name == "news_tone" or model_name == "aspect":
764
+ neg_score = dist.get("negative", 0.0)
765
+ negative_scores.append(neg_score)
766
+ elif model_name == "emotion":
767
+ # For emotion model, consider sadness, anger, fear, disgust as negative
768
+ neg_score = dist.get("sadness", 0.0) + dist.get("anger", 0.0) + \
769
+ dist.get("fear", 0.0) + dist.get("disgust", 0.0)
770
+ negative_scores.append(neg_score / 4) # Average of 4 negative emotions
771
+
772
+ if negative_scores:
773
+ indices["negativity_index"] = round(sum(negative_scores) / len(negative_scores), 3)
774
+
775
+ # Calculate emotional intensity
776
+ emotion_dist = distributions.get("emotion", {})
777
+ if emotion_dist:
778
+ # Sum all emotional intensities except neutral
779
+ emotional_sum = sum(v for k, v in emotion_dist.items() if k != "neutral")
780
+ indices["emotional_intensity"] = round(emotional_sum, 3)
781
+
782
+ # Calculate controversy score (high when both positive and negative are high)
783
+ indices["controversy_score"] = round(indices["positivity_index"] * indices["negativity_index"] * 4, 3)
784
+
785
+ # Calculate confidence score (average of all model confidences)
786
+ if confidence_scores:
787
+ indices["confidence_score"] = round(sum(confidence_scores.values()) / len(confidence_scores), 3)
788
+
789
+ # Calculate ESG relevance if available
790
+ esg_dist = distributions.get("esg", {})
791
+ if esg_dist:
792
+ # Sum of all ESG categories
793
+ esg_sum = sum(v for k, v in esg_dist.items() if k in ["environmental", "social", "governance"])
794
+ indices["esg_relevance"] = round(esg_sum, 3)
795
+
796
+ return indices
797
+
798
+ except Exception as e:
799
+ print(f"Error calculating sentiment indices: {str(e)}")
800
+ return indices
801
+
802
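+ # Worked example of the controversy formula above (numbers are illustrative):
+ # positivity_index 0.5 with negativity_index 0.5 (maximally polarized coverage)
+ # gives 0.5 * 0.5 * 4 = 1.0, while a one-sided article at 0.9 / 0.1 gives
+ # 0.9 * 0.1 * 4 = 0.36.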
+ def summarize_text(self, text: str) -> str:
803
+ """Generate a concise summary of the text."""
804
+ try:
805
+ # Clean and prepare text
806
+ text = text.replace('\n', ' ').strip()
807
+
808
+ # Split text into chunks if it's too long
809
+ chunks = self._split_text(text)
810
+
811
+ summaries = []
812
+ for chunk in chunks:
813
+ # Generate summary for each chunk
814
+ summary = self.summarizer(chunk,
815
+ max_length=130,
816
+ min_length=30,
817
+ do_sample=False)[0]['summary_text']
818
+ summaries.append(summary)
819
+
820
+ # Combine summaries if there were multiple chunks
821
+ final_summary = ' '.join(summaries)
822
+ return final_summary
823
+
824
+ except Exception as e:
825
+ print(f"Error generating summary: {str(e)}")
826
+ return text[:200] + '...' # Return truncated text as fallback
827
+
828
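+ # Usage sketch (hedged): "analyzer" is assumed to be an initialized instance of
+ # this class whose self.summarizer behaves like a transformers summarization
+ # pipeline, as the call signature above suggests.
+ #   summary = analyzer.summarize_text(article["content"])
+ # Long inputs are first split by _split_text() and the per-chunk summaries are
+ # joined into one string.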
+ def extract_topics(self, text: str) -> List[str]:
829
+ """Extract key topics from the text using TF-IDF."""
830
+ try:
831
+ # Prepare text
832
+ text = text.lower()
833
+
834
+ # Fit and transform the text
835
+ tfidf_matrix = self.vectorizer.fit_transform([text])
836
+
837
+ # Get feature names and scores
838
+ feature_names = self.vectorizer.get_feature_names_out()
839
+ scores = tfidf_matrix.toarray()[0]
840
+
841
+ # Get top topics
842
+ top_indices = scores.argsort()[-5:][::-1] # Get top 5 topics
843
+ topics = [feature_names[i] for i in top_indices]
844
+
845
+ return topics
846
+
847
+ except Exception as e:
848
+ print(f"Error extracting topics: {str(e)}")
849
+ return []
850
+
851
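+ # Note: fitting TF-IDF on a single document makes IDF constant across terms, so
+ # the ranking above effectively picks the five most frequent vocabulary terms.
+ # Illustrative call (input and output are made-up examples):
+ #   analyzer.extract_topics("Tesla shares rallied after record quarterly deliveries")
+ #   # -> e.g. ["tesla", "deliveries", "shares", "record", "rallied"]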
+ def _split_text(self, text: str, max_length: int = 1024) -> List[str]:
852
+ """Split text into whitespace-delimited chunks of at most max_length characters (a rough character-based proxy for the model's token limit)."""
853
+ words = text.split()
854
+ chunks = []
855
+ current_chunk = []
856
+ current_length = 0
857
+
858
+ for word in words:
859
+ word_length = len(word) + 1 # +1 for space
860
+ if current_length + word_length > max_length:
861
+ chunks.append(' '.join(current_chunk))
862
+ current_chunk = [word]
863
+ current_length = word_length
864
+ else:
865
+ current_chunk.append(word)
866
+ current_length += word_length
867
+
868
+ if current_chunk:
869
+ chunks.append(' '.join(current_chunk))
870
+
871
+ return chunks
872
+
873
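+ # Example (illustrative): with the default max_length of 1024 characters, a
+ # 2,500-character article yields roughly three chunks; summarize_text() then
+ # summarizes each chunk independently and concatenates the partial summaries.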
+ def _extract_entities(self, text: str) -> Dict[str, List[str]]:
874
+ """Extract named entities from text."""
875
+ entities = {
876
+ 'PERSON': [],
877
+ 'ORG': [],
878
+ 'GPE': [], # Countries, cities, states
879
+ 'MONEY': [],
880
+ 'PERCENT': [],
881
+ 'DATE': []
882
+ }
883
+
884
+ if not self.has_ner:
885
+ return entities
886
+
887
+ try:
888
+ # Process text with spaCy
889
+ doc = self.nlp(text[:10000]) # Limit text length for performance
890
+
891
+ # Extract entities
892
+ for ent in doc.ents:
893
+ if ent.label_ in entities:
894
+ # Clean entity text and deduplicate
895
+ clean_text = ent.text.strip()
896
+ if clean_text and clean_text not in entities[ent.label_]:
897
+ entities[ent.label_].append(clean_text)
898
+
899
+ return entities
900
+ except Exception as e:
901
+ print(f"Error extracting entities: {str(e)}")
902
+ return entities
903
+
904
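+ # Illustrative return shape (entity values are examples, not real output):
+ #   {"PERSON": ["Elon Musk"], "ORG": ["Tesla"], "GPE": ["California"],
+ #    "MONEY": ["$2 billion"], "PERCENT": ["5%"], "DATE": ["Q3 2023"]}
+ # When self.has_ner is False (no spaCy model available), the empty template is
+ # returned unchanged.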
+ def _extract_sentiment_targets(self, text: str, entities: Dict[str, List[str]]) -> List[Dict[str, Any]]:
905
+ """Extract entities that are targets of sentiment expressions."""
906
+ if not self.has_ner:
907
+ return []
908
+
909
+ try:
910
+ # Get all entities as a flat list
911
+ all_entities = []
912
+ for entity_type, entity_list in entities.items():
913
+ for entity in entity_list:
914
+ all_entities.append({
915
+ 'text': entity,
916
+ 'type': entity_type
917
+ })
918
+
919
+ # Find sentiment targets
920
+ targets = []
921
+
922
+ # Split text into sentences
923
+ doc = self.nlp(text[:10000]) # Limit text length
924
+
925
+ for sentence in doc.sents:
926
+ # Skip short sentences
927
+ if len(sentence.text.split()) < 3:
928
+ continue
929
+
930
+ # Check for sentiment in this sentence
931
+ try:
932
+ sentiment = self.sentiment_pipeline(sentence.text)[0]
933
+ # Only process if sentiment is strong
934
+ if sentiment['score'] > 0.7:
935
+ # Find entities in this sentence
936
+ for entity in all_entities:
937
+ if entity['text'] in sentence.text:
938
+ targets.append({
939
+ 'entity': entity['text'],
940
+ 'type': entity['type'],
941
+ 'sentiment': sentiment['label'].lower(),
942
+ 'confidence': round(sentiment['score'], 3),
943
+ 'context': sentence.text
944
+ })
945
+ except Exception:  # skip sentences that fail sentiment scoring
946
+ continue
947
+
948
+ # Return unique targets
949
+ unique_targets = []
950
+ seen = set()
951
+ for target in targets:
952
+ key = f"{target['entity']}_{target['sentiment']}"
953
+ if key not in seen:
954
+ seen.add(key)
955
+ unique_targets.append(target)
956
+
957
+ return unique_targets
958
+
959
+ except Exception as e:
960
+ print(f"Error extracting sentiment targets: {str(e)}")
961
+ return []
962
+
963
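+ # Illustrative target record produced by the loop above (values are examples):
+ #   {"entity": "Tesla", "type": "ORG", "sentiment": "positive",
+ #    "confidence": 0.912, "context": "Tesla beat delivery estimates this quarter."}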
+ class TextToSpeechConverter:
964
+ def __init__(self):
965
+ self.output_dir = AUDIO_OUTPUT_DIR
966
+ self.translator = Translator()
967
+ os.makedirs(self.output_dir, exist_ok=True)
968
+
969
+ def generate_audio(self, text: str, filename: str) -> str:
970
+ """Convert text to Hindi speech and save as audio file."""
971
+ try:
972
+ print(f"Translating text to Hindi: {text[:100]}...")
973
+
974
+ # First translate the text to Hindi
975
+ # Use chunking for long text to avoid translation limits
976
+ chunks = []
977
+ for i in range(0, len(text), 1000):
978
+ chunk = text[i:i+1000]
979
+ try:
980
+ translated_chunk = self.translator.translate(chunk, dest='hi').text
981
+ chunks.append(translated_chunk)
982
+ print(f"Translated chunk {i//1000 + 1}")
983
+ except Exception as e:
984
+ print(f"Error translating chunk {i//1000 + 1}: {str(e)}")
985
+ # If translation fails, use original text
986
+ chunks.append(chunk)
987
+
988
+ hindi_text = ' '.join(chunks)
989
+ print(f"Translation complete. Hindi text length: {len(hindi_text)}")
990
+
991
+ # Generate Hindi speech
992
+ print("Generating Hindi speech...")
993
+ tts = gTTS(text=hindi_text, lang='hi', slow=False)
994
+ output_path = os.path.join(self.output_dir, f"{filename}.mp3")
995
+ tts.save(output_path)
996
+ print(f"Audio saved to {output_path}")
997
+
998
+ return output_path
999
+ except Exception as e:
1000
+ print(f"Error in TTS conversion: {str(e)}")
1001
+ # Fallback to original text if translation fails
1002
+ print("Using fallback English TTS")
1003
+ tts = gTTS(text=text, lang='en')
1004
+ output_path = os.path.join(self.output_dir, f"{filename}.mp3")
1005
+ tts.save(output_path)
1006
+ return output_path
1007
+
1008
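+ # Usage sketch (hedged): translation and gTTS both need network access; the
+ # company name and text below are illustrative only.
+ #   converter = TextToSpeechConverter()
+ #   path = converter.generate_audio("Overall sentiment is positive.", "tesla_summary")
+ #   # -> "<AUDIO_OUTPUT_DIR>/tesla_summary.mp3", spoken in Hindi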
+ class ComparativeAnalyzer:
1009
+ def __init__(self):
1010
+ pass
1011
+
1012
+ def analyze_coverage(self, articles: List[Dict[str, Any]], company_name: str = None) -> Dict[str, Any]:
1013
+ """Perform comparative analysis across articles."""
1014
+ if not articles:
1015
+ return {
1016
+ "topics": [],
1017
+ "sentiment_distribution": {},
1018
+ "coverage_differences": ["No articles found for analysis."],
1019
+ "final_sentiment": "No articles found for analysis.",
1020
+ "total_articles": 0,
1021
+ "sentiment_indices": {}
1022
+ }
1023
+
1024
+ # Debug: Print articles for analysis
1025
+ print(f"Analyzing {len(articles)} articles for company: {company_name}")
1026
+
1027
+ # Add company name to each article if provided
1028
+ if company_name:
1029
+ for article in articles:
1030
+ article['company'] = company_name
1031
+
1032
+ # Calculate sentiment distribution
1033
+ print("Calculating sentiment distribution...")
1034
+ sentiment_dist = self._get_sentiment_distribution(articles)
1035
+ print("Sentiment distribution result:")
1036
+ print(sentiment_dist)
1037
+
1038
+ # Analyze common topics
1039
+ topics = self._analyze_topics(articles)
1040
+
1041
+ # Analyze coverage differences
1042
+ differences = self._analyze_coverage_differences(articles)
1043
+
1044
+ # Get final sentiment analysis
1045
+ final_sentiment = self._get_final_sentiment(sentiment_dist, articles)
1046
+
1047
+ result = {
1048
+ "topics": topics,
1049
+ "sentiment_distribution": sentiment_dist,
1050
+ "coverage_differences": differences,
1051
+ "final_sentiment": final_sentiment,
1052
+ "total_articles": len(articles),
1053
+ "sentiment_indices": sentiment_dist.get("sentiment_indices", {})
1054
+ }
1055
+
1056
+ # Debug: Print final result
1057
+ print("Final comparative analysis result:")
1058
+ print(result)
1059
+
1060
+ return result
1061
+
1062
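+ # Illustrative result shape (values are examples only):
+ #   {"topics": ["earnings", "electric vehicles"],
+ #    "sentiment_distribution": {"basic": {...}, "avg_score": 0.62, ...},
+ #    "coverage_differences": ["Coverage spans multiple sources: ..."],
+ #    "final_sentiment": "Overall sentiment leans positive (55.0%), ...",
+ #    "total_articles": 10,
+ #    "sentiment_indices": {"positivity_index": 0.62, ...}}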
+ def _get_sentiment_distribution(self, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
1063
+ """Calculate distribution of sentiments across articles."""
1064
+ # Basic sentiment distribution
1065
+ basic_distribution = {'positive': 0, 'negative': 0, 'neutral': 0}
1066
+
1067
+ # Fine-grained sentiment distribution
1068
+ fine_grained_distribution = {}
1069
+
1070
+ # Sentiment scores
1071
+ sentiment_scores = []
1072
+
1073
+ # Sentiment indices aggregation
1074
+ sentiment_indices = {
1075
+ "positivity_index": [],
1076
+ "negativity_index": [],
1077
+ "emotional_intensity": [],
1078
+ "controversy_score": [],
1079
+ "confidence_score": [],
1080
+ "esg_relevance": []
1081
+ }
1082
+
1083
+ # Debug: Print articles for sentiment distribution
1084
+ print(f"Processing {len(articles)} articles for sentiment distribution")
1085
+
1086
+ # Process each article
1087
+ for i, article in enumerate(articles):
1088
+ try:
1089
+ # Debug: Print article sentiment data
1090
+ print(f"Article {i+1} sentiment data:")
1091
+ print(f" Basic sentiment: {article.get('sentiment', 'N/A')}")
1092
+ print(f" Fine-grained: {article.get('fine_grained_sentiment', {})}")
1093
+ print(f" Sentiment indices: {article.get('sentiment_indices', {})}")
1094
+
1095
+ # Basic sentiment
1096
+ sentiment = article.get('sentiment', 'neutral')
1097
+ if isinstance(sentiment, str):
1098
+ sentiment = sentiment.lower()
1099
+ # Ensure we have a valid sentiment category
1100
+ if sentiment not in basic_distribution:
1101
+ sentiment = 'neutral'
1102
+ basic_distribution[sentiment] = basic_distribution.get(sentiment, 0) + 1
1103
+ else:
1104
+ # Handle non-string sentiment values
1105
+ basic_distribution['neutral'] = basic_distribution.get('neutral', 0) + 1
1106
+
1107
+ # Sentiment score
1108
+ score = article.get('sentiment_score', 0.0)
1109
+ if isinstance(score, (int, float)):
1110
+ sentiment_scores.append(score)
1111
+
1112
+ # Fine-grained sentiment
1113
+ fine_grained = article.get('fine_grained_sentiment', {})
1114
+ if isinstance(fine_grained, dict) and 'category' in fine_grained:
1115
+ category = fine_grained['category']
1116
+ if isinstance(category, str):
1117
+ category = category.lower()
1118
+ fine_grained_distribution[category] = fine_grained_distribution.get(category, 0) + 1
1119
+
1120
+ # Collect sentiment indices
1121
+ indices = article.get('sentiment_indices', {})
1122
+ if isinstance(indices, dict):
1123
+ for index_name, index_values in sentiment_indices.items():
1124
+ if index_name in indices and isinstance(indices[index_name], (int, float)):
1125
+ index_values.append(indices[index_name])
1126
+ except Exception as e:
1127
+ print(f"Error processing article {i+1} for sentiment distribution: {str(e)}")
1128
+ # Continue with next article
1129
+ continue
1130
+
1131
+ # Debug: Print collected data
1132
+ print("Collected sentiment data:")
1133
+ print(f" Basic distribution: {basic_distribution}")
1134
+ print(f" Fine-grained distribution: {fine_grained_distribution}")
1135
+ print(f" Sentiment scores: {sentiment_scores}")
1136
+ print(f" Sentiment indices collected: {sentiment_indices}")
1137
+
1138
+ # Calculate average sentiment score with fallback
1139
+ avg_sentiment_score = 0.5 # Default neutral value
1140
+ if sentiment_scores:
1141
+ avg_sentiment_score = sum(sentiment_scores) / len(sentiment_scores)
1142
+
1143
+ # Calculate sentiment volatility (standard deviation) with fallback
1144
+ sentiment_volatility = 0
1145
+ if len(sentiment_scores) > 1:
1146
+ try:
1147
+ sentiment_volatility = statistics.stdev(sentiment_scores)
1148
+ except Exception as e:
1149
+ print(f"Error calculating sentiment volatility: {str(e)}")
1150
+
1151
+ # Calculate average sentiment indices with fallbacks
1152
+ avg_indices = {}
1153
+ for index_name, values in sentiment_indices.items():
1154
+ if values:
1155
+ avg_indices[index_name] = round(sum(values) / len(values), 3)
1156
+ else:
1157
+ # Provide default values for empty indices
1158
+ if index_name in ["positivity_index", "confidence_score"]:
1159
+ avg_indices[index_name] = 0.5 # Neutral default
1160
+ else:
1161
+ avg_indices[index_name] = 0.0 # Zero default for other indices
1162
+
1163
+ # Ensure all expected indices exist
1164
+ for index_name in ["positivity_index", "negativity_index", "emotional_intensity",
1165
+ "controversy_score", "confidence_score", "esg_relevance"]:
1166
+ if index_name not in avg_indices:
1167
+ avg_indices[index_name] = 0.5 if index_name in ["positivity_index", "confidence_score"] else 0.0
1168
+
1169
+ # Ensure we have at least one item in each distribution
1170
+ if not any(basic_distribution.values()):
1171
+ basic_distribution['neutral'] = 1
1172
+
1173
+ # Ensure fine_grained_distribution has at least one entry if empty
1174
+ if not fine_grained_distribution:
1175
+ fine_grained_distribution['neutral'] = 1
1176
+
1177
+ result = {
1178
+ "basic": basic_distribution,
1179
+ "fine_grained": fine_grained_distribution,
1180
+ "avg_score": round(avg_sentiment_score, 3),
1181
+ "volatility": round(sentiment_volatility, 3),
1182
+ "sentiment_indices": avg_indices
1183
+ }
1184
+
1185
+ # Debug: Print final sentiment distribution result
1186
+ print("Final sentiment distribution result:")
1187
+ print(result)
1188
+
1189
+ return result
1190
+
1191
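+ # Worked example (illustrative): sentiment scores [0.9, 0.2, 0.5] give
+ # avg_score 0.533 and a sample standard deviation ("volatility") of about 0.35,
+ # which _get_final_sentiment() reports as considerable variation (> 0.2).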
+ def _analyze_topics(self, articles: List[Dict[str, Any]]) -> List[str]:
1192
+ """Analyze common topics across articles using TF-IDF."""
1193
+ try:
1194
+ # Combine title and content for better topic extraction
1195
+ texts = [f"{article.get('title', '')} {article.get('content', '')}" for article in articles]
1196
+
1197
+ # Create and fit TF-IDF
1198
+ vectorizer = TfidfVectorizer(
1199
+ max_features=10,
1200
+ stop_words='english',
1201
+ ngram_range=(1, 2),
1202
+ token_pattern=r'(?u)\b[A-Za-z][A-Za-z+\'-]*[A-Za-z]+\b' # Improved pattern
1203
+ )
1204
+
1205
+ # Clean and normalize texts
1206
+ cleaned_texts = []
1207
+ for text in texts:
1208
+ # Remove numbers and special characters
1209
+ cleaned = re.sub(r'\d+', '', text)
1210
+ cleaned = re.sub(r'[^\w\s]', ' ', cleaned)
1211
+ cleaned_texts.append(cleaned.lower())
1212
+
1213
+ tfidf_matrix = vectorizer.fit_transform(cleaned_texts)
1214
+ feature_names = vectorizer.get_feature_names_out()
1215
+
1216
+ # Get average TF-IDF scores for each term
1217
+ avg_scores = tfidf_matrix.mean(axis=0).A1
1218
+
1219
+ # Sort terms by score and return top meaningful terms
1220
+ sorted_indices = avg_scores.argsort()[-5:][::-1]
1221
+ meaningful_topics = []
1222
+
1223
+ for idx in sorted_indices:
1224
+ topic = feature_names[idx]
1225
+ # Filter out single characters and common words
1226
+ if len(topic) > 1 and topic not in {'000', 'com', 'said', 'says', 'year', 'new', 'one'}:
1227
+ meaningful_topics.append(topic)
1228
+ if len(meaningful_topics) >= 5:
1229
+ break
1230
+
1231
+ return meaningful_topics
1232
+
1233
+ except Exception as e:
1234
+ print(f"Error analyzing topics: {str(e)}")
1235
+ return []
1236
+
1237
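+ # Note on the token_pattern above: it only admits tokens of two or more
+ # alphabetic characters (internal apostrophes/hyphens allowed), so digits and
+ # single letters never enter the vocabulary; common filler terms such as "said"
+ # are then dropped by the explicit exclusion set when the top five are chosen.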
+ def _analyze_coverage_differences(self, articles: List[Dict[str, Any]]) -> List[str]:
1238
+ """Analyze how coverage differs across articles."""
1239
+ if not articles:
1240
+ return ["No articles available for comparison"]
1241
+
1242
+ differences = []
1243
+
1244
+ # Compare sentiment differences
1245
+ sentiments = [str(article.get('sentiment', 'neutral')).lower() for article in articles]  # str() guards against non-string sentiment values
1246
+ unique_sentiments = set(sentiments)
1247
+ if len(unique_sentiments) > 1:
1248
+ pos_count = sentiments.count('positive')
1249
+ neg_count = sentiments.count('negative')
1250
+ neu_count = sentiments.count('neutral')
1251
+
1252
+ if pos_count > 0 and neg_count > 0:
1253
+ differences.append(f"Coverage sentiment varies significantly: {pos_count} positive, {neg_count} negative, and {neu_count} neutral articles.")
1254
+
1255
+ # Compare fine-grained sentiment differences
1256
+ fine_grained_categories = []
1257
+ for article in articles:
1258
+ fine_grained = article.get('fine_grained_sentiment', {})
1259
+ if isinstance(fine_grained, dict) and 'category' in fine_grained:
1260
+ category = fine_grained['category']
1261
+ if isinstance(category, str):
1262
+ fine_grained_categories.append(category.lower())
1263
+
1264
+ unique_categories = set(fine_grained_categories)
1265
+ if len(unique_categories) > 2: # More than 2 different categories
1266
+ category_counts = {}
1267
+ for category in fine_grained_categories:
1268
+ category_counts[category] = category_counts.get(category, 0) + 1
1269
+
1270
+ top_categories = sorted(category_counts.items(), key=lambda x: x[1], reverse=True)[:3]
1271
+ categories_str = ", ".join([f"{cat} ({count})" for cat, count in top_categories])
1272
+ differences.append(f"Articles show diverse sentiment categories: {categories_str}")
1273
+
1274
+ # Compare sentiment indices
1275
+ indices_differences = []
1276
+ positivity_values = []
1277
+ negativity_values = []
1278
+ controversy_values = []
1279
+
1280
+ for article in articles:
1281
+ indices = article.get('sentiment_indices', {})
1282
+ if indices:
1283
+ if 'positivity_index' in indices:
1284
+ positivity_values.append(indices['positivity_index'])
1285
+ if 'negativity_index' in indices:
1286
+ negativity_values.append(indices['negativity_index'])
1287
+ if 'controversy_score' in indices:
1288
+ controversy_values.append(indices['controversy_score'])
1289
+
1290
+ # Check for high variance in positivity
1291
+ if positivity_values and len(positivity_values) > 1:
1292
+ if max(positivity_values) - min(positivity_values) > 0.4:
1293
+ indices_differences.append("Articles show significant variation in positivity levels")
1294
+
1295
+ # Check for high variance in negativity
1296
+ if negativity_values and len(negativity_values) > 1:
1297
+ if max(negativity_values) - min(negativity_values) > 0.4:
1298
+ indices_differences.append("Articles show significant variation in negativity levels")
1299
+
1300
+ # Check for high controversy scores
1301
+ if controversy_values:
1302
+ high_controversy = [v for v in controversy_values if v > 0.5]
1303
+ if high_controversy:
1304
+ indices_differences.append(f"{len(high_controversy)} articles show high controversy scores")
1305
+
1306
+ if indices_differences:
1307
+ differences.append("Sentiment index analysis: " + "; ".join(indices_differences))
1308
+
1309
+ # Compare source differences
1310
+ sources = [article.get('source', '').lower() for article in articles]
1311
+ source_counts = {}
1312
+ for source in sources:
1313
+ if source:
1314
+ source_counts[source] = source_counts.get(source, 0) + 1
1315
+
1316
+ if len(source_counts) > 1:
1317
+ top_sources = sorted(source_counts.items(), key=lambda x: x[1], reverse=True)[:3]
1318
+ sources_str = ", ".join([f"{source} ({count})" for source, count in top_sources])
1319
+ differences.append(f"Coverage spans multiple sources: {sources_str}")
1320
+
1321
+ # If no significant differences found
1322
+ if not differences:
1323
+ differences.append("Coverage is relatively consistent across articles")
1324
+
1325
+ return differences
1326
+
1327
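+ # Illustrative output (counts and sources are examples only):
+ #   ["Coverage sentiment varies significantly: 4 positive, 3 negative, and 3 neutral articles.",
+ #    "Coverage spans multiple sources: reuters (3), bloomberg (2), cnbc (2)"]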
+ def _get_final_sentiment(self, distribution: Dict[str, Any], articles: List[Dict[str, Any]]) -> str:
1328
+ """Generate final sentiment analysis based on distribution and article content."""
1329
+ try:
1330
+ # Get basic sentiment counts
1331
+ basic_dist = distribution.get('basic', {})
1332
+ positive_count = basic_dist.get('positive', 0)
1333
+ negative_count = basic_dist.get('negative', 0)
1334
+ neutral_count = basic_dist.get('neutral', 0)
1335
+
1336
+ total_articles = positive_count + negative_count + neutral_count
1337
+
1338
+ if total_articles == 0:
1339
+ return "No sentiment data available"
1340
+
1341
+ # Calculate percentages
1342
+ positive_pct = (positive_count / total_articles) * 100
1343
+ negative_pct = (negative_count / total_articles) * 100
1344
+ neutral_pct = (neutral_count / total_articles) * 100
1345
+
1346
+ # Get average sentiment score
1347
+ avg_score = distribution.get('avg_score', 0.5)
1348
+
1349
+ # Get volatility
1350
+ volatility = distribution.get('volatility', 0)
1351
+
1352
+ # Get sentiment indices
1353
+ indices = distribution.get('sentiment_indices', {})
1354
+ positivity_index = indices.get('positivity_index', 0.5)
1355
+ negativity_index = indices.get('negativity_index', 0.5)
1356
+ emotional_intensity = indices.get('emotional_intensity', 0)
1357
+ controversy_score = indices.get('controversy_score', 0)
1358
+ esg_relevance = indices.get('esg_relevance', 0)
1359
+
1360
+ # Generate analysis text
1361
+ analysis = []
1362
+
1363
+ # Overall sentiment
1364
+ if positive_pct > 60:
1365
+ analysis.append(f"Overall sentiment is predominantly positive ({positive_pct:.1f}%).")
1366
+ elif negative_pct > 60:
1367
+ analysis.append(f"Overall sentiment is predominantly negative ({negative_pct:.1f}%).")
1368
+ elif neutral_pct > 60:
1369
+ analysis.append(f"Overall sentiment is predominantly neutral ({neutral_pct:.1f}%).")
1370
+ elif positive_pct > negative_pct and positive_pct > neutral_pct:
1371
+ analysis.append(f"Overall sentiment leans positive ({positive_pct:.1f}%), with some mixed coverage.")
1372
+ elif negative_pct > positive_pct and negative_pct > neutral_pct:
1373
+ analysis.append(f"Overall sentiment leans negative ({negative_pct:.1f}%), with some mixed coverage.")
1374
+ else:
1375
+ analysis.append(f"Sentiment is mixed across sources (Positive: {positive_pct:.1f}%, Negative: {negative_pct:.1f}%, Neutral: {neutral_pct:.1f}%).")
1376
+
1377
+ # Sentiment indices insights
1378
+ if positivity_index > 0.7:
1379
+ analysis.append(f"High positivity index ({positivity_index:.2f}) indicates strong positive sentiment.")
1380
+ elif positivity_index < 0.3 and negativity_index > 0.7:
1381
+ analysis.append(f"High negativity index ({negativity_index:.2f}) with low positivity suggests strongly negative coverage.")
1382
+
1383
+ if emotional_intensity > 0.6:
1384
+ analysis.append(f"Coverage shows high emotional intensity ({emotional_intensity:.2f}).")
1385
+
1386
+ if controversy_score > 0.5:
1387
+ analysis.append(f"Coverage shows significant controversy ({controversy_score:.2f}), with polarized opinions.")
1388
+
1389
+ if esg_relevance > 0.4:
1390
+ analysis.append(f"Coverage includes significant ESG-related content ({esg_relevance:.2f}).")
1391
+
1392
+ # Volatility
1393
+ if volatility > 0.2:
1394
+ analysis.append(f"Sentiment varies considerably across articles (volatility: {volatility:.2f}).")
1395
+ else:
1396
+ analysis.append(f"Sentiment is relatively consistent across articles (volatility: {volatility:.2f}).")
1397
+
1398
+ return " ".join(analysis)
1399
+
1400
+ except Exception as e:
1401
+ print(f"Error generating final sentiment: {str(e)}")
1402
+ return "Unable to generate final sentiment analysis due to an error."
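+ # Worked example (illustrative): with 4 positive, 3 negative and 3 neutral
+ # articles, positive_pct is 40.0, so the summary opens with "Overall sentiment
+ # leans positive (40.0%), with some mixed coverage."; a volatility of 0.15 then
+ # adds "Sentiment is relatively consistent across articles (volatility: 0.15)."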