import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
from transformers import pipeline
# from transformers import PegasusTokenizer, PegasusForConditionalGeneration, TFPegasusForConditionalGeneration

# Text summarization model
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
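# Note (illustrative, not part of the original script): the summarization pipeline
# returns a list with one dict per input text, e.g.
#   [{'summary_text': 'Short summary ...'}]
# which is why the scrapers below read summary_obj[0]['summary_text'].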
# Function to scrape articles from the Dawn business section
def scrape_dawn():
    url = 'https://www.dawn.com/business'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []
    count = 0  # Counter to track the number of articles scraped

    for item in soup.find_all('article', class_='story'):
        if count >= 5:  # Stop after 5 articles
            break

        title_tag = item.find('h2', class_='story__title')
        if title_tag:
            title = title_tag.get_text(strip=True)
            link = title_tag.find('a')['href']
            full_text = get_full_article_dawn(link)

            # Summarize the full article (truncated to stay within the model's input limit)
            summary_obj = summarizer(full_text[:1020])
            # Convert the summary object to a string
            summary = summary_obj[0]['summary_text'] if summary_obj else ""

            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
            count += 1  # Increment the counter

    return articles
# Function to get the full text of an article from Dawn
def get_full_article_dawn(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='story__content')
    if content_div:
        paragraphs = content_div.find_all('p')
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."
# Function to scrape articles from Business Recorder
def scrape_brecorder():
    url = 'https://www.brecorder.com/business-finance'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []
    count = 0  # Counter to track the number of articles scraped

    for item in soup.find_all('article', class_='story'):
        if count >= 5:  # Stop after 5 articles
            break

        title_tag = item.find('h2', class_='story__title')
        if title_tag:
            title = title_tag.get_text(strip=True)
            link = title_tag.find('a')['href']
            full_text = get_full_article_brecorder(link)

            # Summarize the full article (truncated to stay within the model's input limit)
            summary_obj = summarizer(full_text[:1020])
            # Convert the summary object to a string
            summary = summary_obj[0]['summary_text'] if summary_obj else ""

            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
            count += 1  # Increment the counter

    return articles
# Function to get the full text of an article from Business Recorder
def get_full_article_brecorder(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='story__content')
    if content_div:
        paragraphs = content_div.find_all(['p', 'li'])
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."
# Function to scrape articles from The News business section
def scrape_tnews():
    url = 'https://www.thenews.com.pk/latest/category/business'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []
    count = 0  # Counter to track the number of articles scraped

    for item in soup.find_all('div', class_='most-popular-box'):
        if count >= 5:  # Stop after 5 articles
            break

        # Extract the title from the <h2> tag
        title_tag = item.find('h2')
        if title_tag:
            title = title_tag.get_text(strip=True)
            # Extract the link from the <a> tag inside the item
            link = item.find('a')['href']
            # Fetch and process the full article text
            full_text = get_full_article_tnews(link)

            # Summarize the full article (truncated to stay within the model's input limit)
            summary_obj = summarizer(full_text[:1020])
            summary = summary_obj[0]['summary_text'] if summary_obj else ""

            # Append the article details
            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
            count += 1  # Increment the counter

    return articles
# Function to get the full text of an article from The News
def get_full_article_tnews(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='detail-content')
    if content_div:
        paragraphs = content_div.find_all(['p', 'li'])
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."
# Function to save articles to a CSV file
def save_to_csv(filename, articles):
    if not articles:
        print(f"No articles found to save in {filename}.")
        return

    keys = articles[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(articles)
# # Main function to scrape articles and save them to CSV
# def main():
#     # Scrape articles from The News
#     tnews_articles = scrape_tnews()
#     save_to_csv('tnews_articles_full.csv', tnews_articles)
#     print("The News articles saved to CSV file successfully.")
#
#     # Scrape articles from Business Recorder
#     # brecorder_articles = scrape_brecorder()
#     # save_to_csv('brecorder_articles_full.csv', brecorder_articles)
#     # print("Business Recorder articles saved to CSV file successfully.")
#
#
# if __name__ == '__main__':
#     main()
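# A minimal sketch (not part of the original script) of how the scrapers above could be
# wired into the Streamlit app implied by the otherwise unused `streamlit` and `pandas`
# imports. The widget layout and the SOURCES mapping are assumptions, not the original UI.

SOURCES = {
    'Dawn': scrape_dawn,
    'Business Recorder': scrape_brecorder,
    'The News': scrape_tnews,
}

st.title("Business News Summarizer")
source = st.selectbox("Choose a news source", list(SOURCES.keys()))

if st.button("Scrape and summarize"):
    with st.spinner(f"Scraping {source} ..."):
        articles = SOURCES[source]()

    if not articles:
        st.warning("No articles found.")
    else:
        # Show each article's title, summary, and a link to the original story
        for article in articles:
            st.subheader(article['title'])
            st.write(article['summary'])
            st.markdown(f"[Read the full article]({article['link']})")

        # Display everything as a table and keep a CSV copy on disk
        st.dataframe(pd.DataFrame(articles)[['title', 'summary', 'link']])
        save_to_csv(f"{source.lower().replace(' ', '_')}_articles.csv", articles)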