Spaces:
Sleeping
Sleeping
File size: 17,332 Bytes
23d5d20 eb00ff4 23d5d20 e1e6f2f 487344e 5818aaa 407f9e3 eeab0b3 407f9e3 487344e 23d5d20 5818aaa 23d5d20 5818aaa 407f9e3 487344e 5818aaa 407f9e3 5818aaa 407f9e3 eb00ff4 407f9e3 487344e a3d7f9f 3e17624 a3d7f9f 3e17624 a3d7f9f 3e17624 a3d7f9f 487344e a3d7f9f 487344e a3d7f9f 3e17624 a3d7f9f 487344e a3d7f9f 3e17624 a3d7f9f 3e17624 a3d7f9f 487344e a3d7f9f 3e17624 a3d7f9f 487344e a3d7f9f 036735b 23d5d20 5818aaa e1e6f2f da70a42 5818aaa e1e6f2f 5818aaa 3cbcbb2 5818aaa 3cbcbb2 5818aaa 036735b 3cbcbb2 036735b 3cbcbb2 5818aaa 3cbcbb2 5818aaa 3cbcbb2 036735b a3d7f9f 487344e 23d5d20 a3d7f9f 23d5d20 e1e6f2f 23d5d20 3cbcbb2 407f9e3 a3d7f9f 3cbcbb2 a3d7f9f 5818aaa a3d7f9f 3cbcbb2 23d5d20 3e0283c 5818aaa 23d5d20 487344e 5818aaa 23d5d20 e1e6f2f 23d5d20 5818aaa 407f9e3 23d5d20 407f9e3 23d5d20 5818aaa 23d5d20 5818aaa 407f9e3 216b84c 23d5d20 e1e6f2f 23d5d20 487344e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 |
import gradio as gr
from openai import OpenAI
import requests
import json
import os
import logging
from typing import Dict, List, Optional, Tuple
from datetime import datetime
from bs4 import BeautifulSoup
from googlesearch import search
from newsapi import NewsApiClient
import markdown
import re
import time
# Logging configuration: INFO and above, each record stamped with the time,
# the emitting logger's name and the severity.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Logger for this module; used by everything defined in this file.
logger = logging.getLogger(__name__)
class RaindropSearchBot:
    """Answer a natural-language request from bookmarks, web search and news.

    The bot turns the request into search terms with an OpenAI chat model,
    queries Raindrop.io, Google and NewsAPI, summarises every hit, and then
    asks the model for an essay which is returned together with a numbered
    reference list.
    """

    # Chat model used for every OpenAI call made by this class.
    _MODEL = "gpt-4o-mini"
    # Seconds to wait on any outbound HTTP request before giving up.
    _HTTP_TIMEOUT = 10

    def __init__(self):
        """Read the API credentials from the environment and build the clients.

        Raises:
            EnvironmentError: If any of the ``openaikey``, ``raindroptoken``
                or ``newsapikey`` environment variables is unset or empty.
        """
        self.openai_api_key = os.getenv('openaikey')
        self.raindrop_api_token = os.getenv('raindroptoken')
        self.newsapi_key = os.getenv('newsapikey')

        if not all([self.openai_api_key, self.raindrop_api_token, self.newsapi_key]):
            raise EnvironmentError(
                "Missing required environment variables. Please ensure all API keys are set."
            )

        self.client = OpenAI(api_key=self.openai_api_key)
        self.newsapi = NewsApiClient(api_key=self.newsapi_key)

    def _chat(self, prompt: str, temperature: float, max_tokens: int) -> Optional[str]:
        """Send *prompt* as a single user message and return the reply text."""
        response = self.client.chat.completions.create(
            model=self._MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
            max_tokens=max_tokens,
        )
        return response.choices[0].message.content

    def get_google_results(self, query: str, num_results: int = 5) -> List[Dict]:
        """Get Google search results using googlesearch-python.

        Args:
            query: Search terms.
            num_results: Number of results requested from the search call.

        Returns:
            A list of ``{'title', 'link', 'snippet'}`` dicts, or ``[]`` if the
            search raised.
        """
        try:
            return [
                {
                    'title': result.title,
                    'link': result.url,
                    'snippet': result.description,
                }
                for result in search(query, num_results=num_results, advanced=True)
            ]
        except Exception as e:
            # Best effort: one failing source must not abort the whole request.
            logger.error("Google search error: %s", e)
            return []

    def get_news_results(self, query: str, num_results: int = 5) -> List[Dict]:
        """Get news articles using NewsAPI.

        Args:
            query: Search terms.
            num_results: Page size requested from NewsAPI.

        Returns:
            The ``articles`` list of the NewsAPI response, or ``[]`` on error.
        """
        try:
            news_results = self.newsapi.get_everything(
                q=query,
                language='en',
                sort_by='relevancy',
                page_size=num_results,
            )
            return news_results.get('articles', [])
        except Exception as e:
            logger.error("News API error: %s", e)
            return []

    def extract_content_from_url(self, url: str) -> Optional[str]:
        """Extract main content from a URL using BeautifulSoup.

        Args:
            url: Page to download.

        Returns:
            The page title followed by the main text, with all whitespace
            runs collapsed to single spaces, or ``None`` if the download or
            parsing failed or no text was found.
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, headers=headers, timeout=self._HTTP_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove unwanted elements
            for element in soup(['script', 'style', 'nav', 'header', 'footer', 'iframe']):
                element.decompose()

            # get_text() never returns None, unlike .string, which is None for
            # an empty <title> or one with nested tags and would otherwise be
            # rendered as the literal text "None".
            title = soup.title.get_text(strip=True) if soup.title else ''

            # First try common content containers, then fall back to all paragraphs.
            content_containers = soup.select('article, main, .content, .post-content, .entry-content')
            if content_containers:
                content = content_containers[0].get_text(separator='\n', strip=True)
            else:
                content = '\n'.join(p.get_text(strip=True) for p in soup.find_all('p'))

            # Collapse every whitespace run (including newlines) to one space.
            full_content = re.sub(r'\s+', ' ', f"{title}\n\n{content}").strip()
            return full_content or None
        except Exception as e:
            # Best effort: network errors, HTTP errors and parser failures all
            # mean "no content" for the caller.
            logger.error("Error extracting content from %s: %s", url, e)
            return None

    def get_content_and_summary(self, item: Dict, source_type: str) -> Dict:
        """Get content and generate summary for a single item.

        The item is modified in place: ``detailed_summary`` is set to the
        model's summary (or to a failure message), and ``processed_content``
        to the first 1000 characters of the text that was summarised.

        Args:
            item: Result dict carrying its address under ``link`` or ``url``.
            source_type: ``'raindrop'`` lets an existing ``excerpt`` be used
                instead of downloading the page; any other value always
                downloads.

        Returns:
            The same ``item`` object. It is returned untouched when it has no
            URL or when an unexpected error occurs.
        """
        try:
            url = item.get('link') or item.get('url')
            if not url:
                return item

            # For Raindrop items, use existing excerpt if available
            if source_type == 'raindrop' and item.get('excerpt'):
                content = item['excerpt']
            else:
                content = self.extract_content_from_url(url)

            if not content:
                logger.warning("No content extracted from %s", url)
                item['detailed_summary'] = "Content extraction failed."
                return item

            try:
                # Only the first 4000 characters are sent, to respect token limits.
                prompt = (
                    "\n"
                    "Analyze this content and provide a detailed summary focusing on key points.\n"
                    f"Content: {content[:4000]}\n"
                    "Requirements:\n"
                    "1. Focus on the most important facts and findings\n"
                    "2. Include specific data points and quotes if relevant\n"
                    "3. Organize the information logically\n"
                    "4. Keep the summary to 2-3 paragraphs\n"
                    "5. Highlight any unique insights from this source\n"
                )
                item['detailed_summary'] = self._chat(prompt, temperature=0.3, max_tokens=300)
                item['processed_content'] = content[:1000]  # Store truncated content for later use
            except Exception as e:
                logger.error("Error generating summary: %s", e)
                item['detailed_summary'] = "Summary generation failed."
            return item
        except Exception as e:
            logger.error("Error processing item: %s", e)
            return item

    def search_raindrop(self, search_query: str) -> List[Dict]:
        """Search Raindrop.io with enhanced error handling and logging.

        Args:
            search_query: Text passed as the ``search`` parameter.

        Returns:
            The ``items`` list of the search response, or ``[]`` if the
            credential check or the search itself fails.
        """
        logger.info("Searching Raindrop with query: %s", search_query)
        headers = {
            "Authorization": f"Bearer {self.raindrop_api_token}"
        }

        # Test API connection first
        try:
            test_response = requests.get(
                "https://api.raindrop.io/rest/v1/user",
                headers=headers,
                timeout=self._HTTP_TIMEOUT,
            )
            if test_response.status_code != 200:
                logger.error("API test failed: %s", test_response.status_code)
                return []
        except Exception as e:
            logger.error("API connection error: %s", e)
            return []

        # Perform search
        try:
            params = {
                "search": search_query,
                "perpage": 50,
                "sort": "-created",
                "page": 0,
            }
            response = requests.get(
                "https://api.raindrop.io/rest/v1/raindrops/0",
                headers=headers,
                params=params,
                timeout=self._HTTP_TIMEOUT,
            )
            if response.status_code != 200:
                logger.error("Search failed: %s", response.status_code)
                return []
            items = response.json().get("items", [])
            logger.info("Found %s results", len(items))
            return items
        except Exception as e:
            logger.error("Search error: %s", e)
            return []

    def _summarize_items(self, items: List[Dict], source_type: str) -> List[Dict]:
        """Enrich *items* and keep those that ended up with a ``detailed_summary``."""
        enriched = (self.get_content_and_summary(item, source_type) for item in items)
        return [item for item in enriched if item.get('detailed_summary')]

    def process_all_results(self, raindrop_results: List[Dict],
                            google_results: List[Dict],
                            news_results: List[Dict]) -> Tuple[List[Dict], List[Dict], List[Dict]]:
        """Process and enrich all results with content and summaries.

        Returns:
            ``(raindrop, google, news)`` lists containing only the items for
            which a ``detailed_summary`` value was set.
        """
        return (
            self._summarize_items(raindrop_results, 'raindrop'),
            self._summarize_items(google_results, 'google'),
            self._summarize_items(news_results, 'news'),
        )

    def generate_essay_response(self, results: Tuple[List[Dict], List[Dict], List[Dict]],
                                user_query: str) -> str:
        """Generate a structured essay-style response with references.

        Args:
            results: ``(raindrop, google, news)`` lists of enriched items.
            user_query: The user's original request, used as the essay topic.

        Returns:
            The essay in markdown, or ``"Error generating analysis."`` if the
            model call fails.
        """
        raindrop_results, google_results, news_results = results

        source_blocks = []
        reference_map = {}
        ref_counter = 1
        for source_list in (raindrop_results, google_results, news_results):
            for item in source_list:
                summary = item.get('detailed_summary')
                if summary:
                    # Label each summary with its reference number so the
                    # model's [n] citations can point at a real source.
                    source_blocks.append(f"\n[{ref_counter}] {summary}\n")
                    # News items carry 'url' rather than 'link'.
                    url = item.get('link') or item.get('url')
                    if url:
                        reference_map[url] = ref_counter
                # Count every item, as format_results does, so the numbers
                # cited in the essay match the printed reference list.
                ref_counter += 1
        all_content = "".join(source_blocks)

        try:
            prompt = (
                "\n"
                f"Create a comprehensive essay-style analysis about: {user_query}\n"
                "Use this content as your source material:\n"
                f"{all_content}\n"
                "Requirements:\n"
                "1. Structure the response in clear sections with markdown headers\n"
                "2. Include an introduction and conclusion\n"
                "3. Use reference numbers [n] to cite sources\n"
                "4. Make connections between different sources\n"
                "5. Highlight key findings and trends\n"
                "6. Address any contradictions or gaps\n"
                "7. Use markdown formatting for better readability\n"
                "Format the response as a proper academic essay with sections.\n"
            )
            essay = self._chat(prompt, temperature=0.5, max_tokens=1500)

            # Replace reference placeholders with actual reference numbers
            for url, ref_num in reference_map.items():
                essay = essay.replace(f'[URL:{url}]', f'[{ref_num}]')
            return essay
        except Exception as e:
            logger.error("Error generating essay: %s", e)
            return "Error generating analysis."

    def format_results(self, results: Tuple[List[Dict], List[Dict], List[Dict]],
                       essay: str) -> str:
        """Format the essay and results with detailed summaries.

        Args:
            results: ``(raindrop, google, news)`` lists of enriched items.
            essay: Text placed at the top of the output.

        Returns:
            The essay, a horizontal rule, and a reference list numbered
            consecutively across bookmarks, web results and news.
        """
        raindrop_results, google_results, news_results = results
        parts = [f"{essay}\n\n", "---\n\n", "# References and Detailed Summaries\n\n"]
        ref_counter = 1

        # Format Raindrop results
        if raindrop_results:
            parts.append("## π Bookmarked Sources\n\n")
            for item in raindrop_results:
                parts.append(f"### [{ref_counter}] {item.get('title', 'No Title')}\n")
                parts.append(f"**Link**: {item.get('link')}\n")
                if item.get('tags'):
                    parts.append(f"**Tags**: {', '.join(item['tags'])}\n")
                if item.get('created'):
                    parts.append(f"**Created**: {item['created'][:10]}\n")
                parts.append("\n**Summary**:\n")
                parts.append(f"{item.get('detailed_summary', 'No summary available.')}\n\n")
                ref_counter += 1

        # Format Google results
        if google_results:
            parts.append("## π Web Sources\n\n")
            for item in google_results:
                parts.append(f"### [{ref_counter}] {item.get('title', 'No Title')}\n")
                parts.append(f"**Link**: {item.get('link')}\n")
                parts.append("\n**Summary**:\n")
                parts.append(f"{item.get('detailed_summary', 'No summary available.')}\n\n")
                ref_counter += 1

        # Format News results
        if news_results:
            parts.append("## π° Recent News\n\n")
            for item in news_results:
                parts.append(f"### [{ref_counter}] {item.get('title', 'No Title')}\n")
                parts.append(f"**Link**: {item.get('url')}\n")
                # 'source' may be present but None; treat that like a missing source.
                source_name = (item.get('source') or {}).get('name')
                if source_name:
                    parts.append(f"**Source**: {source_name}\n")
                if item.get('publishedAt'):
                    parts.append(f"**Published**: {item['publishedAt'][:10]}\n")
                parts.append("\n**Summary**:\n")
                parts.append(f"{item.get('detailed_summary', 'No summary available.')}\n\n")
                ref_counter += 1

        return "".join(parts)

    def process_request(self, user_request: str) -> str:
        """Process the user request with enhanced content collection and analysis.

        Args:
            user_request: Free-text request typed by the user.

        Returns:
            The formatted essay plus reference list, or a generic error
            message if any step raised.
        """
        try:
            logger.info("Processing request: %s", user_request)

            # Generate search query
            search_query = self.generate_search_query(user_request)
            logger.info("Using search query: %s", search_query)

            # Get results from all sources
            raindrop_results = self.search_raindrop(search_query)
            google_results = self.get_google_results(search_query)
            news_results = self.get_news_results(search_query)

            # Process all results to get content and summaries
            processed_results = self.process_all_results(
                raindrop_results, google_results, news_results
            )

            # Generate essay-style analysis, then format and return results
            essay = self.generate_essay_response(processed_results, user_request)
            return self.format_results(processed_results, essay)
        except Exception as e:
            # Top-level boundary: log the traceback, show the user a neutral message.
            logger.error("Error processing request: %s", e, exc_info=True)
            return "An error occurred while processing your request. Please try again."

    def generate_search_query(self, user_request: str) -> str:
        """Convert user request to optimized search terms.

        Args:
            user_request: Free-text request typed by the user.

        Returns:
            The model's search terms with surrounding whitespace removed, or
            ``user_request`` unchanged if the model call fails or returns
            nothing.
        """
        logger.info("Generating search query for: %s", user_request)
        prompt = (
            "\n"
            "You are a search expert. Create a search query to find relevant documents about:\n"
            f"{user_request}\n"
            "Guidelines:\n"
            "- Focus on key concepts and synonyms\n"
            "- Use simple keywords that would appear in titles or descriptions\n"
            "- Avoid complex operators or special characters\n"
            "- Return only the search terms, no explanation\n"
            "- Include alternative phrasings\n"
            "- Keep it concise (max 3-4 key terms/phrases)\n"
            "Return only the search query terms.\n"
        )
        try:
            search_query = self._chat(prompt, temperature=0.3, max_tokens=50).strip()
            logger.info("Generated search query: %s", search_query)
            # An empty reply would make every downstream search pointless.
            return search_query or user_request
        except Exception as e:
            logger.error("Error generating search query: %s", e)
            return user_request
# Initialize bot
# A single instance is built at import time and used by the Gradio callback.
# The constructor raises EnvironmentError when any of the required API keys
# is missing from the environment, so a misconfigured deployment fails here.
bot = RaindropSearchBot()
# Gradio callback
def chatbot_interface(user_input: str) -> str:
    """Hand the submitted text to the shared bot and return its report."""
    report = bot.process_request(user_input)
    return report
# Page layout: intro text, a query box, a search button and a read-only
# results box, each on its own row.
with gr.Blocks(title="Enhanced Search Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "\n"
        "# π Enhanced Search Assistant\n"
        "Enter your search request in natural language, and I'll find and analyze information from multiple sources:\n"
        "- Your bookmarked content\n"
        "- Web search results\n"
        "- Recent news articles\n"
    )

    with gr.Row():
        input_text = gr.Textbox(
            lines=2,
            label="What would you like to search for?",
            placeholder="Enter your search query here...",
        )

    with gr.Row():
        search_button = gr.Button("π Search", variant="primary")

    with gr.Row():
        output_text = gr.Textbox(
            lines=20,
            label="Analysis and Results",
            interactive=False,
        )

    # Clicking the button runs the bot on the query box and shows the result.
    search_button.click(chatbot_interface, inputs=input_text, outputs=output_text)

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch(share=True)
|