import requests
from bs4 import BeautifulSoup
from zsvision.zs_utils import BlockTimer
import json
import json5
import argparse
import multiprocessing as mp
from zsvision.zs_multiproc import starmap_with_kwargs
from datetime import datetime
import urllib.robotparser
import urllib.request
import urllib.error
import urllib.parse
from urllib.parse import urlunparse
from utils import get_google_search_results

import time
from random import randint
from fake_useragent import UserAgent
from newspaper import Article, Config


class GoogleEvidence:
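    """Gather web evidence for claims by fetching Google search results and extracting page text."""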
    def __init__(
        self,
        model="gpt-3.5-turbo",
        limit=0,
        refresh=False,
        num_search_results_to_keep=3,
        filter_str="",
        processes=8,
    ):
        self.model = model
        self.limit = limit
        self.refresh = refresh
        self.num_search_results_to_keep = num_search_results_to_keep
        self.filter_str = filter_str
        self.processes = processes

    def can_index(self, url, user_agent_name):
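        """Check robots.txt to see whether our user agent may fetch the given (parsed) URL."""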
        rp = urllib.robotparser.RobotFileParser()
        robots_url = f"{url.scheme}://{url.netloc}/robots.txt"

        headers = {
            "User-Agent": user_agent_name,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }

        try:
            req = urllib.request.Request(robots_url, headers=headers)
            with urllib.request.urlopen(req) as response:
                rp.parse(response.read().decode("utf-8").splitlines())

            ok_to_index = rp.can_fetch(user_agent_name, url.geturl())
        except urllib.error.URLError:
            # If there is no robots.txt or there is an error accessing it, assume it's okay to index
            ok_to_index = True
        except Exception as e:
            print(f"An unexpected error occurred in step42: {e}")
            # going the safe route
            ok_to_index = False
        return ok_to_index

    def fetch_search_results_to_gather_evidence(
        self,
        queryset: list,
    ):
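        """For each claim in `queryset`, fetch Google search results and attach the extracted page text."""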
        user_agent = UserAgent()
        config = Config()
        config.fetch_images = False

        user_agent_name = "FiltirBot/1.0 (+https://filtir.com/filtirbot-info)"

        headers = {
            "User-Agent": user_agent_name,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }

        # we assume some sites won't permit indexing, so we'll skip these
        num_results = self.num_search_results_to_keep + 5

        print(f"Found {len(queryset)} claims to fetch search results for")

        for queryset_idx, item in enumerate(queryset):
            with BlockTimer(
                f"Fetching search results from Google {queryset_idx + 1}/{len(queryset)}"
            ):
                search_results = get_google_search_results(
                    query_str=item["claim"], num_results=num_results
                )

            if search_results == [{"Result": "No good Google Search Result was found"}]:
                item["search_results"] = []
                continue

            parsed_results = []
            for search_result in search_results:
                if not self.can_index(
                    urllib.parse.urlparse(search_result["link"]),
                    user_agent_name=user_agent_name,
                ):
                    print(
                        f"Skipping {search_result['link']} because it doesn't permit indexing"
                    )
                    continue
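                # Prefer newspaper's article extraction; fall back to raw HTML text via requests below if it fails.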
                try:
                    config.browser_user_agent = user_agent.random
                    article = Article(
                        search_result["link"], language="en", config=config
                    )
                    article.download()
                    article.parse()
                    text = article.text
                except Exception as e:
                    print(f"Error parsing article: {e}, trying with requests.get...")
                    try:
                        response = requests.get(
                            search_result["link"], timeout=15, headers=headers
                        )
                        html = response.text
                        soup = BeautifulSoup(html, features="html.parser")
                        text = soup.get_text()
                    except Exception as exception:
                        print(f"Error parsing article: {exception}, skipping")
                        continue

                search_result["text"] = text
                parsed_results.append(search_result)
                if len(parsed_results) == self.num_search_results_to_keep:
                    break
            item["search_results"] = parsed_results

        # update the queryset with new information
        date_str = datetime.now().strftime("%Y-%m-%d")
        results = {"documents": queryset, "dates": {"search_results_fetched": date_str}}

        print(f"Returning web pages for search results for {len(queryset)} queries")
        return results
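

# Example usage (a minimal sketch; assumes each entry in `queryset` is a dict with
# a "claim" string, which is the only key this class reads):
#
#   fetcher = GoogleEvidence(num_search_results_to_keep=3)
#   evidence = fetcher.fetch_search_results_to_gather_evidence(
#       queryset=[{"claim": "The Eiffel Tower is in Paris."}],
#   )
#   # evidence["documents"] mirrors `queryset`, with "search_results" added per claim.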