Shreyas094 committed
Commit f96e8a8 · verified · 1 Parent(s): 01447cf

Update app.py

Files changed (1)
  1. app.py +62 -38
app.py CHANGED
@@ -341,47 +341,71 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
     except Exception as e:
         return f"Error adding articles to the database: {str(e)}"
 
+
+def fetch_articles_from_page(url):
+    response = requests.get(url)
+    response.raise_for_status()
+    soup = BeautifulSoup(response.content, 'html.parser')
+    articles = soup.find_all('div', class_='gt-carousel-item gt-box-shadow-2')
+    return articles, soup
+
+def extract_articles(articles):
+    article_data = []
+    for article in articles:
+        title_div = article.find('h5', class_='gt-carousel-title')
+        title = title_div.get_text(strip=True) if title_div else "No Title"
+        date_div = article.find('div', class_='entry-date gt-meta')
+        date = date_div.get_text(strip=True) if date_div else "No Date"
+        link_tag = article.find('a')
+        link = link_tag['href'] if link_tag else "No Link"
+        if not link.startswith('http'):
+            link = "https://golomtbank.com" + link
+        article_response = requests.get(link)
+        article_response.raise_for_status()
+        article_soup = BeautifulSoup(article_response.content, 'html.parser')
+        article_content_div = article_soup.find('div', class_='entry-post')
+        article_content = article_content_div.get_text(strip=True) if article_content_div else "No content found"
+        article_data.append({
+            'title': title,
+            'date': date,
+            'link': link,
+            'content': article_content
+        })
+    return article_data
+
 def fetch_golomt_bank_news(num_results=10):
     base_url = "https://golomtbank.com/en/investor-relations"
+    current_page_url = base_url
+    all_articles = []
 
     try:
-        response = requests.get(base_url)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.content, 'html.parser')
-
-        articles = soup.find_all('article', class_='gt-post')
-        news_items = []
-
-        for article in articles[:num_results]:
-            title_div = article.find('h5', class_='gt-carousel-title')
-            title = title_div.get_text(strip=True) if title_div else "No Title"
-
-            date_div = article.find('div', class_='entry-date gt-meta')
-            date = date_div.get_text(strip=True) if date_div else "No Date"
-
-            link_tag = article.find('a')
-            link = link_tag['href'] if link_tag else "No Link"
-            if not link.startswith('http'):
-                link = "https://golomtbank.com" + link
-
-            try:
-                article_response = requests.get(link)
-                article_response.raise_for_status()
-                article_soup = BeautifulSoup(article_response.content, 'html.parser')
-                article_content_div = article_soup.find('div', class_='entry-post')
-                content = article_content_div.get_text(strip=True) if article_content_div else "No content found"
-            except Exception as e:
-                content = f"Error fetching article content: {str(e)}"
-
-            news_item = {
-                "published_date": date,
-                "title": title,
-                "url": link,
-                "content": content
-            }
-            news_items.append(news_item)
-
-        return news_items
+        while len(all_articles) < num_results:
+            print(f"Fetching articles from: {current_page_url}")
+            articles, soup = fetch_articles_from_page(current_page_url)
+            if not articles:
+                print("No articles found on this page.")
+                break
+            all_articles.extend(extract_articles(articles))
+            print(f"Total articles fetched so far: {len(all_articles)}")
+            if len(all_articles) >= num_results:
+                all_articles = all_articles[:num_results]
+                break
+            next_page_link = soup.find('a', class_='next')
+            if not next_page_link:
+                print("No next page link found.")
+                break
+            current_page_url = next_page_link['href']
+            if not current_page_url.startswith('http'):
+                current_page_url = "https://golomtbank.com" + current_page_url
+
+        return [
+            {
+                "published_date": article['date'],
+                "title": article['title'],
+                "url": article['link'],
+                "content": article['content']
+            } for article in all_articles
+        ]
     except Exception as e:
         print(f"Error fetching Golomt Bank news: {str(e)}")
         return []
@@ -598,7 +656,7 @@ with gr.Blocks() as demo:
 
     def fetch_news(query, temperature, top_p, repetition_penalty, news_source):
        return process_news(query, temperature, top_p, repetition_penalty, news_source)
-
+
    fetch_news_button.click(
        fetch_news,
        inputs=[news_query_input, temperature_slider, top_p_slider, repetition_penalty_slider, news_source_dropdown],
 
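
For a quick smoke test of the refactored fetcher, here is a minimal sketch of how fetch_golomt_bank_news might be exercised locally. The __main__ guard and the limit of 5 are illustrative, not part of this commit, and the sketch assumes app.py's existing requests/BeautifulSoup imports are in place:

# Illustrative smoke test, not part of the commit.
if __name__ == "__main__":
    # Follows the page's 'next' links until five articles are collected.
    items = fetch_golomt_bank_news(num_results=5)
    for item in items:
        print(item["published_date"], "|", item["title"], "|", item["url"])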