import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
from transformers import pipeline
# from transformers import PegasusTokenizer, PegasusForConditionalGeneration, TFPegasusForConditionalGeneration

# Text summarization model
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
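# Note (illustrative, not part of the original script): the summarization pipeline
# returns a list with one dict per input text, e.g.
#   [{'summary_text': 'Short summary ...'}]
# which is why the scrapers below read summary_obj[0]['summary_text'].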
# Function to scrape articles from the Dawn business section
def scrape_dawn():
    url = 'https://www.dawn.com/business'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []
    count = 0  # Counter to track the number of articles scraped

    for item in soup.find_all('article', class_='story'):
        if count >= 5:  # Stop after 5 articles
            break

        title_tag = item.find('h2', class_='story__title')
        if title_tag:
            title = title_tag.get_text(strip=True)
            link = title_tag.find('a')['href']
            full_text = get_full_article_dawn(link)

            # Summarize the full article (truncated to stay within the model's input limit)
            summary_obj = summarizer(full_text[:1020])
            # Convert the summary object to a string
            summary = summary_obj[0]['summary_text'] if summary_obj else ""

            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
            count += 1  # Increment the counter

    return articles
# Function to get the full text of an article from Dawn
def get_full_article_dawn(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='story__content')
    if content_div:
        paragraphs = content_div.find_all('p')
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."
# Function to scrape articles from Business Recorder
def scrape_brecorder():
    url = 'https://www.brecorder.com/business-finance'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []
    count = 0  # Counter to track the number of articles scraped

    for item in soup.find_all('article', class_='story'):
        if count >= 5:  # Stop after 5 articles
            break

        title_tag = item.find('h2', class_='story__title')
        if title_tag:
            title = title_tag.get_text(strip=True)
            link = title_tag.find('a')['href']
            full_text = get_full_article_brecorder(link)

            # Summarize the full article (truncated to stay within the model's input limit)
            summary_obj = summarizer(full_text[:1020])
            # Convert the summary object to a string
            summary = summary_obj[0]['summary_text'] if summary_obj else ""

            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
            count += 1  # Increment the counter

    return articles
# Function to get the full text of an article from Business Recorder
def get_full_article_brecorder(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='story__content')
    if content_div:
        paragraphs = content_div.find_all(['p', 'li'])
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."
# Function to scrape articles from The News business section
def scrape_tnews():
    url = 'https://www.thenews.com.pk/latest/category/business'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []
    count = 0  # Counter to track the number of articles scraped

    for item in soup.find_all('div', class_='most-popular-box'):
        if count >= 5:  # Stop after 5 articles
            break

        # Extract the title from the <h2> tag
        title_tag = item.find('h2')
        if title_tag:
            title = title_tag.get_text(strip=True)
            # Extract the link from the <a> tag inside the item
            link = item.find('a')['href']
            # Fetch and process the full article text
            full_text = get_full_article_tnews(link)

            # Summarize the full article (truncated to stay within the model's input limit)
            summary_obj = summarizer(full_text[:1020])
            summary = summary_obj[0]['summary_text'] if summary_obj else ""

            # Append the article details
            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
            count += 1  # Increment the counter

    return articles
# Function to get the full text of an article from The News
def get_full_article_tnews(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='detail-content')
    if content_div:
        paragraphs = content_div.find_all(['p', 'li'])
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."
# Function to save articles to a CSV file
def save_to_csv(filename, articles):
    if not articles:
        print(f"No articles found to save in {filename}.")
        return

    keys = articles[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(articles)
# # Main function to scrape articles and save them to CSV
# def main():
#     # Scrape articles from The News
#     tnews_articles = scrape_tnews()
#     save_to_csv('tnews_articles_full.csv', tnews_articles)
#     print("The News articles saved to CSV file successfully.")
#
#     # Scrape articles from Business Recorder
#     # brecorder_articles = scrape_brecorder()
#     # save_to_csv('brecorder_articles_full.csv', brecorder_articles)
#     # print("Business Recorder articles saved to CSV file successfully.")
#
#
# if __name__ == '__main__':
#     main()
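# A minimal sketch (not part of the original script) of how the scrapers above could be
# wired into the Streamlit app implied by the otherwise unused `streamlit` and `pandas`
# imports. The widget layout and the SOURCES mapping are assumptions, not the original UI.

SOURCES = {
    'Dawn': scrape_dawn,
    'Business Recorder': scrape_brecorder,
    'The News': scrape_tnews,
}

st.title("Business News Summarizer")
source = st.selectbox("Choose a news source", list(SOURCES.keys()))

if st.button("Scrape and summarize"):
    with st.spinner(f"Scraping {source} ..."):
        articles = SOURCES[source]()

    if not articles:
        st.warning("No articles found.")
    else:
        # Show each article's title, summary, and a link to the original story
        for article in articles:
            st.subheader(article['title'])
            st.write(article['summary'])
            st.markdown(f"[Read the full article]({article['link']})")

        # Display everything as a table and keep a CSV copy on disk
        st.dataframe(pd.DataFrame(articles)[['title', 'summary', 'link']])
        save_to_csv(f"{source.lower().replace(' ', '_')}_articles.csv", articles)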