import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
from transformers import pipeline
# from transformers import PegasusTokenizer, PegasusForConditionalGeneration, TFPegasusForConditionalGeneration

# Text summarization model
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")


# Function to scrape articles from Dawn
def scrape_dawn():
    url = 'https://www.dawn.com/business'
    response = requests.get(url, verify=False)  # verify=False disables SSL certificate checks
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []
    count = 0  # Counter to track the number of articles scraped

    for item in soup.find_all('article', class_='story'):
        if count >= 5:  # Stop after 5 articles
            break

        title_tag = item.find('h2', class_='story__title')
        if title_tag:
            title = title_tag.get_text(strip=True)
            link = title_tag.find('a')['href']
            full_text = get_full_article_dawn(link)
            # Summarize the full article (truncated to roughly the model's input limit)
            summary_obj = summarizer(full_text[:1020])
            # Convert the summary object to a string
            summary = summary_obj[0]['summary_text'] if summary_obj else ""
            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
            count += 1  # Increment the counter

    return articles


# Function to get the full text of an article from Dawn
def get_full_article_dawn(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='story__content')
    if content_div:
        paragraphs = content_div.find_all('p')
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."


# Function to scrape articles from Business Recorder
def scrape_brecorder():
    url = 'https://www.brecorder.com/business-finance'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []
    count = 0  # Counter to track the number of articles scraped

    for item in soup.find_all('article', class_='story'):
        if count >= 5:  # Stop after 5 articles
            break

        title_tag = item.find('h2', class_='story__title')
        if title_tag:
            title = title_tag.get_text(strip=True)
            link = title_tag.find('a')['href']
            full_text = get_full_article_brecorder(link)
            # Summarize the full article (truncated to roughly the model's input limit)
            summary_obj = summarizer(full_text[:1020])
            # Convert the summary object to a string
            summary = summary_obj[0]['summary_text'] if summary_obj else ""
            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
            count += 1  # Increment the counter

    return articles


# Function to get the full text of an article from Business Recorder
def get_full_article_brecorder(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='story__content')
    if content_div:
        # Business Recorder articles also use list items for body text
        paragraphs = content_div.find_all(['p', 'li'])
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."
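
# A possible refinement (a sketch, not part of the original script): each
# scraper calls requests.get(url, verify=False) with no timeout or error
# handling, so a single failed request raises and aborts the whole scrape.
# The helper below shows how those calls could be wrapped; the name
# `fetch_html` is hypothetical and nothing in this file calls it yet.
def fetch_html(url):
    try:
        # verify=False mirrors the existing calls: it disables SSL
        # certificate verification, which is unsafe in general.
        response = requests.get(url, verify=False, timeout=10)
        response.raise_for_status()  # Raise on 4xx/5xx responses
        return response.text
    except requests.RequestException:
        return None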
# Function to scrape articles from The News
def scrape_tnews():
    url = 'https://www.thenews.com.pk/latest/category/business'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []
    count = 0  # Counter to track the number of articles scraped

    for item in soup.find_all('div', class_='most-popular-box'):
        if count >= 5:  # Stop after 5 articles
            break

        # Extract the title from the listing entry
        title_tag = item.find('h2', class_='most-popular-list')
        if title_tag:
            title = title_tag.get_text(strip=True)
            link = title_tag.find('a')['href']
            full_text = get_full_article_tnews(link)
            # Summarize the full article (truncated to roughly the model's input limit)
            summary_obj = summarizer(full_text[:1020])
            # Convert the summary object to a string
            summary = summary_obj[0]['summary_text'] if summary_obj else ""
            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
            count += 1  # Increment the counter

    return articles
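
# A possible use of the `csv` import at the top of the file (a sketch, not
# part of the original script): the Streamlit UI and any export step likely
# follow later in the file, but the article dicts returned by the scrapers
# could be written out like this. The function name and the default
# filename 'articles.csv' are hypothetical.
def save_articles_to_csv(articles, filename='articles.csv'):
    # Each article dict carries the keys produced by the scrapers above
    fieldnames = ['title', 'link', 'content', 'summary']
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(articles)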