import requests
from bs4 import BeautifulSoup
from zsvision.zs_utils import BlockTimer
import json
import json5
import argparse
import multiprocessing as mp
from zsvision.zs_multiproc import starmap_with_kwargs
from datetime import datetime
import urllib.robotparser
import urllib.request
import urllib.error
import urllib.parse
from urllib.parse import urlunparse
from utils import get_google_search_results
import time
from random import randint
from fake_useragent import UserAgent
from newspaper import Article, Config


class GoogleEvidence:
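    """Gather web evidence for a set of claims: run a Google search per claim,
    check robots.txt permissions, and download and parse the permitted result pages."""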
def __init__(
self,
model="gpt-3.5-turbo",
limit=0,
refresh=False,
num_search_results_to_keep=3,
filter_str="",
processes=8,
):
self.model = model
self.limit = limit
self.refresh = refresh
self.num_search_results_to_keep = num_search_results_to_keep
self.filter_str = filter_str
self.processes = processes

    def can_index(self, url, user_agent_name):
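        """Return True if the site's robots.txt allows `user_agent_name` to fetch the
        parsed `url` (a missing or unreachable robots.txt is treated as permission),
        and False if an unexpected error occurs while checking."""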
rp = urllib.robotparser.RobotFileParser()
robots_url = f"{url.scheme}://{url.netloc}/robots.txt"
headers = {
"User-Agent": user_agent_name,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
}
try:
req = urllib.request.Request(robots_url, headers=headers)
with urllib.request.urlopen(req) as response:
rp.parse(response.read().decode("utf-8").splitlines())
ok_to_index = rp.can_fetch(user_agent_name, url.geturl())
except urllib.error.URLError:
# If there is no robots.txt or there is an error accessing it, assume it's okay to index
ok_to_index = True
except Exception as e:
print(f"An unexpected error occurred in step42: {e}")
# going the safe route
ok_to_index = False
return ok_to_index

    def fetch_search_results_to_gather_evidence(
self,
        queryset: list,
):
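        """For each claim in `queryset`, fetch Google search results, skip links whose
        robots.txt disallows indexing, download and extract the page text (via newspaper,
        falling back to requests + BeautifulSoup), and keep up to
        `num_search_results_to_keep` parsed results per claim. Returns the updated
        documents together with the date the search results were fetched."""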
user_agent = UserAgent()
config = Config()
config.fetch_images = False
user_agent_name = "FiltirBot/1.0 (+https://filtir.com/filtirbot-info)"
headers = {
"User-Agent": user_agent_name,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
}
        # Request a few extra results, since some sites won't permit indexing and will be skipped
        num_results = self.num_search_results_to_keep + 5
results = {}
print(f"Found {len(queryset)} claims to fetch search results for")
for queryset_idx, item in enumerate(queryset):
with BlockTimer(
f"Fetching search results from Google {queryset_idx + 1}/{len(queryset)}"
):
search_results = get_google_search_results(
query_str=item["claim"], num_results=num_results
)
if search_results == [{"Result": "No good Google Search Result was found"}]:
item["search_results"] = []
continue
parsed_results = []
for search_result in search_results:
if not self.can_index(
urllib.parse.urlparse(search_result["link"]),
user_agent_name=user_agent_name,
):
print(
f"Skipping {search_result['link']} because it doesn't permit indexing"
)
continue
try:
config.browser_user_agent = user_agent.random
article = Article(
search_result["link"], language="en", config=config
)
article.download()
article.parse()
text = article.text
except Exception as e:
print(f"Error parsing article: {e}, trying with requests.get...")
try:
response = requests.get(
search_result["link"], timeout=15, headers=headers
)
html = response.text
soup = BeautifulSoup(html, features="html.parser")
text = soup.get_text()
except Exception as exception:
print(f"Error parsing article: {exception}, skipping")
continue
search_result["text"] = text
parsed_results.append(search_result)
if len(parsed_results) == self.num_search_results_to_keep:
break
item["search_results"] = parsed_results
# update the queryset with new information
date_str = datetime.now().strftime("%Y-%m-%d")
results = {"documents": queryset, "dates": {"search_results_fetched": date_str}}
print(f"Returning web pages for search results for {len(queryset)} queries")
return results
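
# Usage sketch (illustrative only, not the pipeline entry point): assumes each
# queryset item is a dict with a "claim" key, as consumed above.
#
#   fetcher = GoogleEvidence(num_search_results_to_keep=3)
#   claims = [{"claim": "The Eiffel Tower is located in Paris."}]
#   evidence = fetcher.fetch_search_results_to_gather_evidence(queryset=claims)
#   for doc in evidence["documents"]:
#       print(len(doc["search_results"]), "pages of evidence fetched")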