import requests
from bs4 import BeautifulSoup
from zsvision.zs_utils import BlockTimer
import json
import json5
import argparse
import multiprocessing as mp
from zsvision.zs_multiproc import starmap_with_kwargs
from datetime import datetime
import urllib.robotparser
import urllib.request
import urllib.error
import urllib.parse
from urllib.parse import urlunparse
from utils import get_google_search_results
import time
from random import randint
from fake_useragent import UserAgent
from newspaper import Article, Config


class GoogleEvidence:
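    """Gather web evidence for a set of claims: run a Google search per claim,
    check robots.txt permissions, and download and parse the permitted result pages."""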
def __init__(
self,
model="gpt-3.5-turbo",
limit=0,
refresh=False,
num_search_results_to_keep=3,
filter_str="",
processes=8,
):
self.model = model
self.limit = limit
self.refresh = refresh
self.num_search_results_to_keep = num_search_results_to_keep
self.filter_str = filter_str
self.processes = processes

    def can_index(self, url, user_agent_name):
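        """Return True if the site's robots.txt allows `user_agent_name` to fetch the
        parsed `url` (a missing or unreachable robots.txt is treated as permission),
        and False if an unexpected error occurs while checking."""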
rp = urllib.robotparser.RobotFileParser()
robots_url = f"{url.scheme}://{url.netloc}/robots.txt"
headers = {
"User-Agent": user_agent_name,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
}
try:
req = urllib.request.Request(robots_url, headers=headers)
with urllib.request.urlopen(req) as response:
rp.parse(response.read().decode("utf-8").splitlines())
ok_to_index = rp.can_fetch(user_agent_name, url.geturl())
except urllib.error.URLError:
# If there is no robots.txt or there is an error accessing it, assume it's okay to index
ok_to_index = True
except Exception as e:
print(f"An unexpected error occurred in step42: {e}")
# going the safe route
ok_to_index = False
return ok_to_index

    def fetch_search_results_to_gather_evidence(
self,
        queryset: list,
):
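        """For each claim in `queryset`, fetch Google search results, skip links whose
        robots.txt disallows indexing, download and extract the page text (via newspaper,
        falling back to requests + BeautifulSoup), and keep up to
        `num_search_results_to_keep` parsed results per claim. Returns the updated
        documents together with the date the search results were fetched."""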
user_agent = UserAgent()
config = Config()
config.fetch_images = False
user_agent_name = "FiltirBot/1.0 (+https://filtir.com/filtirbot-info)"
headers = {
"User-Agent": user_agent_name,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
}
        # Request a few extra results, since some sites won't permit indexing and will be skipped
        num_results = self.num_search_results_to_keep + 5
results = {}
print(f"Found {len(queryset)} claims to fetch search results for")
for queryset_idx, item in enumerate(queryset):
with BlockTimer(
f"Fetching search results from Google {queryset_idx + 1}/{len(queryset)}"
):
search_results = get_google_search_results(
query_str=item["claim"], num_results=num_results
)
if search_results == [{"Result": "No good Google Search Result was found"}]:
item["search_results"] = []
continue
parsed_results = []
for search_result in search_results:
if not self.can_index(
urllib.parse.urlparse(search_result["link"]),
user_agent_name=user_agent_name,
):
print(
f"Skipping {search_result['link']} because it doesn't permit indexing"
)
continue
try:
config.browser_user_agent = user_agent.random
article = Article(
search_result["link"], language="en", config=config
)
article.download()
article.parse()
text = article.text
except Exception as e:
print(f"Error parsing article: {e}, trying with requests.get...")
try:
response = requests.get(
search_result["link"], timeout=15, headers=headers
)
html = response.text
soup = BeautifulSoup(html, features="html.parser")
text = soup.get_text()
except Exception as exception:
print(f"Error parsing article: {exception}, skipping")
continue
search_result["text"] = text
parsed_results.append(search_result)
if len(parsed_results) == self.num_search_results_to_keep:
break
item["search_results"] = parsed_results
# update the queryset with new information
date_str = datetime.now().strftime("%Y-%m-%d")
results = {"documents": queryset, "dates": {"search_results_fetched": date_str}}
print(f"Returning web pages for search results for {len(queryset)} queries")
return results
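
# Usage sketch (illustrative only, not the pipeline entry point): assumes each
# queryset item is a dict with a "claim" key, as consumed above.
#
#   fetcher = GoogleEvidence(num_search_results_to_keep=3)
#   claims = [{"claim": "The Eiffel Tower is located in Paris."}]
#   evidence = fetcher.fetch_search_results_to_gather_evidence(queryset=claims)
#   for doc in evidence["documents"]:
#       print(len(doc["search_results"]), "pages of evidence fetched")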