|
from urllib.parse import parse_qs, urlparse |
|
from bs4 import BeautifulSoup |
|
import requests |
|
|
|
|
|
def scrapeGoogleSearch(query):
    """Scrape the first page of Google search results for *query*.

    Returns a newline-joined string containing an optional "Verified
    Answer" box (when Google displays one) followed by up to 10 results
    (title, description, link). Returns an empty string when the request
    fails or the page layout is not recognized.

    NOTE: relies on the CSS class names Google serves to simple HTTP
    clients (the no-JS markup, e.g. ``BNeawe``); these are unstable and
    may change without notice.
    """
    finalResponse = []

    # Let requests URL-encode the query (spaces, '&', unicode, ...)
    # instead of interpolating it raw into the URL; add a timeout so a
    # stalled connection cannot hang the caller forever.
    searchUrl = "https://www.google.com/search"
    response = requests.get(searchUrl, params={'q': query}, timeout=10)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Debug artifact: dump the fetched page for offline inspection.
        with open('soup_dump.html', 'w', encoding='utf-8') as file:
            file.write(soup.prettify())

        mainDiv = soup.find('div', attrs={'id': 'main'})
        if mainDiv is None:
            # Unexpected layout (captcha / consent page, redesign, ...):
            # bail out instead of raising AttributeError below.
            return ""

        # "Answer box" shown above the organic results, if present.
        answerDiv = (
            mainDiv.select_one('div.PqksIc')
            or mainDiv.select_one('div.BNeawe.iBp4i')
        )
        if answerDiv:
            citationDateDiv = answerDiv.select_one('sub.gMUaMb.r0bn4c.rQMQod')
            citationDate = citationDateDiv.text if citationDateDiv else ""
            # Strip the citation date out of the answer body so it is not
            # duplicated in the output.
            answerText = answerDiv.text.replace(citationDate, '').strip()
            citationText = f"Citation Date: {citationDate}" if citationDate else ""
            finalResponse.append(f"Verified Answer:\n====\n{answerText}\n{citationText}\n====\n\n")

        results = mainDiv.select('div.egMi0.kCrYT')
        resultsDesc = mainDiv.select('div.BNeawe.s3v9rd.AP7Wnd .BNeawe.s3v9rd.AP7Wnd:last-child')

        if results:
            finalResponse.append("Search Results:\n====\n")

            for (i, result) in enumerate(results[:10]):
                titleTag = result.find('h3')
                linkTag = result.find('a')
                if titleTag is None or linkTag is None:
                    # Skip malformed/ad entries instead of crashing on
                    # a missing <h3> or <a>.
                    continue
                title = titleTag.text
                # Google wraps targets as /url?q=<real-url>&...; unwrap
                # the real destination from the 'q' query parameter.
                parsedUrl = urlparse(linkTag['href'])
                urlParams = parse_qs(parsedUrl.query)
                link = urlParams.get('q', [None])[0]
                # Descriptions come from a separate selector and can be
                # shorter than the results list; guard the index.
                desc = resultsDesc[i].text if i < len(resultsDesc) else ""
                finalResponse.append(f"Title: {title}")
                finalResponse.append(f"Description: {desc}")
                finalResponse.append(f"Link: {link}\n")
    else:
        print("Failed to retrieve search results.")

    return "\n".join(finalResponse)
|
|