import csv

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from transformers import pipeline
# from transformers import PegasusTokenizer, PegasusForConditionalGeneration, TFPegasusForConditionalGeneration

# Text summarization model
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")


# Function to scrape articles from Dawn
def scrape_dawn():
    url = 'https://www.dawn.com/business'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []
    count = 0  # Counter to track the number of articles scraped

    for item in soup.find_all('article', class_='story'):
        if count >= 5:  # Stop after 5 articles
            break

        title_tag = item.find('h2', class_='story__title')
        if title_tag:
            title = title_tag.get_text(strip=True)
            link = title_tag.find('a')['href']
            full_text = get_full_article_dawn(link)

            # Summarize the full article (truncated to stay within the model's input limit)
            summary_obj = summarizer(full_text[:1020])
            # Convert the summary object to a string
            summary = summary_obj[0]['summary_text'] if summary_obj else ""

            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
            count += 1  # Increment the counter

    return articles


# Function to get the full text of an article from Dawn
def get_full_article_dawn(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='story__content')
    if content_div:
        paragraphs = content_div.find_all('p')
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."


# Function to scrape articles from Business Recorder
def scrape_brecorder():
    url = 'https://www.brecorder.com/business-finance'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []
    count = 0  # Counter to track the number of articles scraped

    for item in soup.find_all('article', class_='story'):
        if count >= 5:  # Stop after 5 articles
            break

        title_tag = item.find('h2', class_='story__title')
        if title_tag:
            title = title_tag.get_text(strip=True)
            link = title_tag.find('a')['href']
            full_text = get_full_article_brecorder(link)

            # Summarize the full article (truncated to stay within the model's input limit)
            summary_obj = summarizer(full_text[:1020])
            # Convert the summary object to a string
            summary = summary_obj[0]['summary_text'] if summary_obj else ""

            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
            count += 1  # Increment the counter

    return articles


# Function to get the full text of an article from Business Recorder
def get_full_article_brecorder(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='story__content')
    if content_div:
        paragraphs = content_div.find_all(['p', 'li'])
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."
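
# Note: every request above passes verify=False, which makes urllib3 emit an
# InsecureRequestWarning per call. A minimal sketch for silencing that noise,
# assuming the urllib3 package bundled with requests (this suppression is an
# optional addition, not part of the original script):
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)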

# Function to scrape articles from The News
def scrape_tnews():
    url = 'https://www.thenews.com.pk/latest/category/business'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []
    count = 0  # Counter to track the number of articles scraped

    for item in soup.find_all('div', class_='most-popular-box'):
        if count >= 5:  # Stop after 5 articles
            break

        # Extract the title from the <h2> tag
        title_tag = item.find('h2')
        if title_tag:
            title = title_tag.get_text(strip=True)

            # Extract the link from the <a> tag inside the <div>
            link = item.find('a')['href']

            # Fetch and process the full article text
            full_text = get_full_article_tnews(link)

            # Summarize the full article (truncated to stay within the model's input limit)
            summary_obj = summarizer(full_text[:1020])
            summary = summary_obj[0]['summary_text'] if summary_obj else ""

            # Append the article details
            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
            count += 1  # Increment the counter

    return articles


# Function to get the full text of an article from The News
def get_full_article_tnews(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='detail-content')
    if content_div:
        paragraphs = content_div.find_all(['p', 'li'])
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."


# Function to save articles to a CSV file
def save_to_csv(filename, articles):
    if not articles:
        print(f"No articles found to save in {filename}.")
        return

    keys = articles[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(articles)


# # Main function to scrape articles from The News (and optionally Business Recorder), saving to CSV
# def main():
#     # Scraping articles from The News
#     tnews_articles = scrape_tnews()
#     save_to_csv('tnews_articles_full.csv', tnews_articles)
#     print("The News articles saved to CSV file successfully.")
#
#     # # Scraping articles from Business Recorder
#     # brecorder_articles = scrape_brecorder()
#     # save_to_csv('brecorder_articles_full.csv', brecorder_articles)
#     # print("Business Recorder articles saved to CSV file successfully.")
#
#
# if __name__ == '__main__':
#     main()
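

# streamlit and pandas are imported above but never used. A minimal sketch of
# how the three scrapers could be surfaced in a Streamlit app follows; the
# widget labels and the source-to-function mapping are assumptions, not part
# of the original script.
SCRAPERS = {
    'Dawn': scrape_dawn,
    'Business Recorder': scrape_brecorder,
    'The News': scrape_tnews,
}

st.title("Business News Summarizer")
source = st.selectbox("News source", list(SCRAPERS.keys()))

if st.button("Scrape and summarize"):
    with st.spinner(f"Scraping {source}..."):
        scraped = SCRAPERS[source]()
    if scraped:
        # Show title/link/summary in an interactive table
        # (the full 'content' column is omitted for readability)
        st.dataframe(pd.DataFrame(scraped)[['title', 'link', 'summary']])
    else:
        st.warning("No articles found.")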