Shreyas094 committed
Commit f96e8a8 · verified · 1 Parent(s): 01447cf

Update app.py

Files changed (1)
  1. app.py +62 -38
app.py CHANGED
@@ -341,47 +341,71 @@ def process_news(query, temperature, top_p, repetition_penalty, news_source):
     except Exception as e:
         return f"Error adding articles to the database: {str(e)}"
 
+
+def fetch_articles_from_page(url):
+    response = requests.get(url)
+    response.raise_for_status()
+    soup = BeautifulSoup(response.content, 'html.parser')
+    articles = soup.find_all('div', class_='gt-carousel-item gt-box-shadow-2')
+    return articles, soup
+
+def extract_articles(articles):
+    article_data = []
+    for article in articles:
+        title_div = article.find('h5', class_='gt-carousel-title')
+        title = title_div.get_text(strip=True) if title_div else "No Title"
+        date_div = article.find('div', class_='entry-date gt-meta')
+        date = date_div.get_text(strip=True) if date_div else "No Date"
+        link_tag = article.find('a')
+        link = link_tag['href'] if link_tag else "No Link"
+        if not link.startswith('http'):
+            link = "https://golomtbank.com" + link
+        article_response = requests.get(link)
+        article_response.raise_for_status()
+        article_soup = BeautifulSoup(article_response.content, 'html.parser')
+        article_content_div = article_soup.find('div', class_='entry-post')
+        article_content = article_content_div.get_text(strip=True) if article_content_div else "No content found"
+        article_data.append({
+            'title': title,
+            'date': date,
+            'link': link,
+            'content': article_content
+        })
+    return article_data
+
 def fetch_golomt_bank_news(num_results=10):
     base_url = "https://golomtbank.com/en/investor-relations"
+    current_page_url = base_url
+    all_articles = []
 
     try:
-        response = requests.get(base_url)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.content, 'html.parser')
-
-        articles = soup.find_all('article', class_='gt-post')
-        news_items = []
-
-        for article in articles[:num_results]:
-            title_div = article.find('h5', class_='gt-carousel-title')
-            title = title_div.get_text(strip=True) if title_div else "No Title"
-
-            date_div = article.find('div', class_='entry-date gt-meta')
-            date = date_div.get_text(strip=True) if date_div else "No Date"
-
-            link_tag = article.find('a')
-            link = link_tag['href'] if link_tag else "No Link"
-            if not link.startswith('http'):
-                link = "https://golomtbank.com" + link
-
-            try:
-                article_response = requests.get(link)
-                article_response.raise_for_status()
-                article_soup = BeautifulSoup(article_response.content, 'html.parser')
-                article_content_div = article_soup.find('div', class_='entry-post')
-                content = article_content_div.get_text(strip=True) if article_content_div else "No content found"
-            except Exception as e:
-                content = f"Error fetching article content: {str(e)}"
-
-            news_item = {
-                "published_date": date,
-                "title": title,
-                "url": link,
-                "content": content
-            }
-            news_items.append(news_item)
-
-        return news_items
+        while len(all_articles) < num_results:
+            print(f"Fetching articles from: {current_page_url}")
+            articles, soup = fetch_articles_from_page(current_page_url)
+            if not articles:
+                print("No articles found on this page.")
+                break
+            all_articles.extend(extract_articles(articles))
+            print(f"Total articles fetched so far: {len(all_articles)}")
+            if len(all_articles) >= num_results:
+                all_articles = all_articles[:num_results]
+                break
+            next_page_link = soup.find('a', class_='next')
+            if not next_page_link:
+                print("No next page link found.")
+                break
+            current_page_url = next_page_link['href']
+            if not current_page_url.startswith('http'):
+                current_page_url = "https://golomtbank.com" + current_page_url
+
+        return [
+            {
+                "published_date": article['date'],
+                "title": article['title'],
+                "url": article['link'],
+                "content": article['content']
+            } for article in all_articles
+        ]
     except Exception as e:
         print(f"Error fetching Golomt Bank news: {str(e)}")
         return []
@@ -598,7 +656,7 @@ with gr.Blocks() as demo:
 
     def fetch_news(query, temperature, top_p, repetition_penalty, news_source):
        return process_news(query, temperature, top_p, repetition_penalty, news_source)
-
+
    fetch_news_button.click(
        fetch_news,
        inputs=[news_query_input, temperature_slider, top_p_slider, repetition_penalty_slider, news_source_dropdown],
 
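
For a quick smoke test of the refactored fetcher, here is a minimal sketch of how fetch_golomt_bank_news might be exercised locally. The __main__ guard and the limit of 5 are illustrative, not part of this commit, and the sketch assumes app.py's existing requests/BeautifulSoup imports are in place:

# Illustrative smoke test, not part of the commit.
if __name__ == "__main__":
    # Follows the page's 'next' links until five articles are collected.
    items = fetch_golomt_bank_news(num_results=5)
    for item in items:
        print(item["published_date"], "|", item["title"], "|", item["url"])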