import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
from transformers import pipeline
# from transformers import PegasusTokenizer, PegasusForConditionalGeneration, TFPegasusForConditionalGeneration
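import urllib3

# Every request below uses verify=False, which makes urllib3 emit an
# InsecureRequestWarning on each call; silencing it (as sketched here)
# assumes that skipping TLS verification is an intentional trade-off.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)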
# Text summarization model
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
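
# A minimal defensive wrapper around the pipeline (a sketch; the scrapers
# below call summarizer directly): it guards against empty or failed fetches
# and keeps the hard-coded 1020-character cap in one place, since the model
# only accepts a limited input length per call.
def summarize_text(text, max_input_chars=1020):
    if not text or text == "Content not found.":
        return ""
    result = summarizer(text[:max_input_chars])
    return result[0]['summary_text'] if result else ""
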
# Function to scrape articles from Dawn
def scrape_dawn():
    url = 'https://www.dawn.com/business'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []
    count = 0  # Counter to track the number of articles scraped
    for item in soup.find_all('article', class_='story'):
        if count >= 5:  # Stop after 5 articles
            break
        title_tag = item.find('h2', class_='story__title')
        if title_tag:
            title = title_tag.get_text(strip=True)
            link = title_tag.find('a')['href']
            full_text = get_full_article_dawn(link)
            # Summarize the full article
            summary_obj = summarizer(full_text[:1020])
            # Convert the summary object to a string
            summary = summary_obj[0]['summary_text'] if summary_obj else ""
            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
            count += 1  # Increment the counter
    return articles

# Function to get the full text of an article from Dawn
def get_full_article_dawn(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='story__content')
    if content_div:
        paragraphs = content_div.find_all('p')
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."

# Function to scrape articles from Business Recorder
def scrape_brecorder():
    url = 'https://www.brecorder.com/business-finance'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []
    count = 0  # Counter to track the number of articles scraped
    for item in soup.find_all('article', class_='story'):
        if count >= 5:  # Stop after 5 articles
            break
        title_tag = item.find('h2', class_='story__title')
        if title_tag:
            title = title_tag.get_text(strip=True)
            link = title_tag.find('a')['href']
            full_text = get_full_article_brecorder(link)
            # Summarize the full article
            summary_obj = summarizer(full_text[:1020])
            # Convert the summary object to a string
            summary = summary_obj[0]['summary_text'] if summary_obj else ""
            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
            count += 1  # Increment the counter
    return articles

# Function to get the full text of an article from Business Recorder
def get_full_article_brecorder(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='story__content')
    if content_div:
        paragraphs = content_div.find_all(['p', 'li'])
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."
# Function to scrape articles from The News
def scrape_tnews():
    url = 'https://www.thenews.com.pk/latest/category/business'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []
    count = 0  # Counter to track the number of articles scraped
    for item in soup.find_all('div', class_='most-popular-box'):
        if count >= 5:  # Stop after 5 articles
            break
        # Extract the title from the <h2> tag
        title_tag = item.find('h2')
        if title_tag:
            title = title_tag.get_text(strip=True)
            # Extract the link from the <a> tag inside the item
            link = item.find('a')['href']
            # Fetch and summarize the full article text
            full_text = get_full_article_tnews(link)
            summary_obj = summarizer(full_text[:1020])
            summary = summary_obj[0]['summary_text'] if summary_obj else ""
            # Append the article details
            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
            count += 1  # Increment the counter
    return articles

# Function to get the full text of an article from The News
def get_full_article_tnews(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='detail-content')
    if content_div:
        paragraphs = content_div.find_all(['p', 'li'])
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."
# Function to save articles to a CSV file
def save_to_csv(filename, articles):
    if not articles:
        print(f"No articles found to save in {filename}.")
        return
    keys = articles[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(articles)
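
# pandas is imported at the top but otherwise unused; an equivalent one-liner
# via DataFrame.to_csv (a sketch of an alternative, not a replacement for
# save_to_csv above):
def save_to_csv_pandas(filename, articles):
    if articles:
        pd.DataFrame(articles).to_csv(filename, index=False, encoding='utf-8')
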
# Main entry point: scrape articles and save them to CSV (kept commented out
# for the hosted app; uncomment to run as a standalone script)
# def main():
#     # Scraping articles from The News
#     tnews_articles = scrape_tnews()
#     save_to_csv('tnews_articles_full.csv', tnews_articles)
#     print("The News articles saved to CSV file successfully.")
#
#     # Scraping articles from Business Recorder
#     # brecorder_articles = scrape_brecorder()
#     # save_to_csv('brecorder_articles_full.csv', brecorder_articles)
#     # print("Business Recorder articles saved to CSV file successfully.")
#
# if __name__ == '__main__':
#     main()
# Scratch code used while inspecting The News listing markup:
# url = 'https://www.thenews.com.pk/latest/category/business'
# response = requests.get(url, verify=False)
# soup = BeautifulSoup(response.text, 'html.parser')
# s = soup.find_all('div', class_='most-popular-box')
# print(s)
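
# streamlit is imported at the top but not used anywhere in the code above; a
# minimal sketch of how these scrapers might be surfaced in the Space
# (hypothetical UI wiring, one button per source as an assumption):
st.title('Business News Scraper & Summarizer')
if st.button('Scrape Dawn business news'):
    for article in scrape_dawn():
        st.subheader(article['title'])
        st.write(article['summary'])
        st.markdown(f"[Read more]({article['link']})")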