Guiyom committed
Commit 3e17624 · verified · 1 Parent(s): 5dc7b85

Update app.py

Files changed (1)
  1. app.py +51 -16
app.py CHANGED
@@ -35,16 +35,43 @@ class RaindropSearchBot:
        self.newsapi = NewsApiClient(api_key=self.newsapi_key)

    def extract_content_from_url(self, url: str) -> Optional[str]:
-        """Extract main content from a URL using newspaper3k."""
+        """Extract main content from a URL using BeautifulSoup."""
        try:
-            article = Article(url)
-            article.download()
-            time.sleep(1)  # Polite delay between requests
-            article.parse()
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            }
+
+            response = requests.get(url, headers=headers, timeout=10)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.text, 'html.parser')

-            # Combine title and text
-            content = f"{article.title}\n\n{article.text}"
-            return content if content.strip() else None
+            # Remove unwanted elements
+            for element in soup(['script', 'style', 'nav', 'header', 'footer', 'iframe']):
+                element.decompose()
+
+            # Get title
+            title = soup.title.string if soup.title else ''
+
+            # Get main content
+            # First try common content containers
+            content_containers = soup.select('article, main, .content, .post-content, .entry-content')
+
+            if content_containers:
+                content = content_containers[0].get_text(separator='\n', strip=True)
+            else:
+                # Fallback to all paragraphs
+                paragraphs = soup.find_all('p')
+                content = '\n'.join(p.get_text(strip=True) for p in paragraphs)
+
+            # Combine and clean
+            full_content = f"{title}\n\n{content}"
+
+            # Clean up the text
+            full_content = re.sub(r'\n\s*\n', '\n\n', full_content)  # Remove extra newlines
+            full_content = re.sub(r'\s+', ' ', full_content)  # Normalize whitespace
+
+            return full_content if full_content.strip() else None

        except Exception as e:
            logger.error(f"Error extracting content from {url}: {e}")
@@ -57,27 +84,33 @@ class RaindropSearchBot:
            url = item.get('link') or item.get('url')
            if not url:
                return item
-
+
            # For Raindrop items, use existing excerpt if available
            if source_type == 'raindrop' and item.get('excerpt'):
                content = item['excerpt']
            else:
                content = self.extract_content_from_url(url)
-
+
            if not content:
+                logger.warning(f"No content extracted from {url}")
+                item['detailed_summary'] = "Content extraction failed."
                return item
-
+
            # Generate summary focused on the query topic
            try:
                prompt = f"""
-                Analyze this content and provide a detailed summary focusing on key points relevant
-                to our topic. Include specific details, data, and quotes if relevant.
+                Analyze this content and provide a detailed summary focusing on key points.

                Content: {content[:4000]}  # Limit content length for token constraints

-                Provide a concise but detailed summary in 2-3 paragraphs.
+                Requirements:
+                1. Focus on the most important facts and findings
+                2. Include specific data points and quotes if relevant
+                3. Organize the information logically
+                4. Keep the summary to 2-3 paragraphs
+                5. Highlight any unique insights from this source
                """
-
+
                response = self.client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[{"role": "user", "content": prompt}],
@@ -86,10 +119,12 @@ class RaindropSearchBot:
                )

                item['detailed_summary'] = response.choices[0].message.content
+                item['processed_content'] = content[:1000]  # Store truncated content for later use
+
            except Exception as e:
                logger.error(f"Error generating summary: {e}")
                item['detailed_summary'] = "Summary generation failed."
-
+
            return item

        except Exception as e:
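Note: the `chat.completions.create(...)` call spans the last two hunks, and the arguments that follow `messages` are not visible in this diff. The sketch below is a standalone version of the same summarization step; it assumes the OpenAI v1 Python client and uses placeholder values for the hidden arguments (marked as assumptions).

```python
# Minimal sketch of the summarization step above (not verbatim from app.py).
# Assumptions: an OpenAI client is available as `client`; the create()
# arguments hidden between the hunks are replaced with placeholder values.
import logging

from openai import OpenAI

logger = logging.getLogger(__name__)
client = OpenAI()  # reads OPENAI_API_KEY from the environment


def summarize(prompt: str) -> str:
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,  # assumption: actual value not visible in the diff
            max_tokens=500,   # assumption: actual value not visible in the diff
        )
        return response.choices[0].message.content
    except Exception as e:
        logger.error(f"Error generating summary: {e}")
        return "Summary generation failed."
```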
 