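# Fetch webpages via optional proxies and cache them as local HTML files,
# either one at a time (WebpageFetcher) or concurrently (BatchWebpageFetcher).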
import concurrent.futures
import threading
import requests
import tldextract
from pathlib import Path
from utils.enver import enver
from utils.logger import logger
from networks.filepath_converter import UrlToFilepathConverter
from networks.network_configs import IGNORE_HOSTS, REQUESTS_HEADERS


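# Fetches a single URL (through the proxies configured by enver, if any)
# and saves the response body to a local HTML path derived from the URL.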
class WebpageFetcher:
    def __init__(self):
        self.enver = enver
        self.enver.set_envs(proxies=True)
        self.filepath_converter = UrlToFilepathConverter()

    def is_ignored_host(self, url):
        # keep the registered domain for later logging, then check the ignore list
        self.host = tldextract.extract(url).registered_domain
        return self.host in IGNORE_HOSTS

    def send_request(self):
        try:
            self.request_response = requests.get(
                url=self.url,
                headers=REQUESTS_HEADERS,
                proxies=self.enver.requests_proxies,
                timeout=25,
            )
        except requests.RequestException as e:
            logger.warn(f"Failed to fetch: [{self.url}] ({e})")
            self.request_response = None

    def save_response(self):
        # skip saving when the request failed
        if self.request_response is None:
            return
        self.html_path.parent.mkdir(parents=True, exist_ok=True)
        logger.success(f"Saving to: [{self.html_path}]")
        with open(self.html_path, "wb") as wf:
            wf.write(self.request_response.content)

    def fetch(self, url, overwrite=False, output_parent=None):
        self.url = url
        logger.note(f"Fetching: [{self.url}]")
        self.html_path = self.filepath_converter.convert(
            self.url, parent=output_parent
        )

        if self.is_ignored_host(self.url):
            logger.warn(f"Ignored host: [{self.host}]")
            return self.html_path

        # reuse the cached HTML unless overwrite is requested
        if self.html_path.exists() and not overwrite:
            logger.success(f"HTML already exists: [{self.html_path}]")
        else:
            self.send_request()
            self.save_response()
        return self.html_path


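# Fetches a list of URLs concurrently with a thread pool, collecting each
# URL and its local HTML path, and tracking progress with a shared counter.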
class BatchWebpageFetcher:
    def __init__(self):
        self.done_count = 0
        self.total_count = 0
        self.url_and_html_path_list = []
        # guards shared state mutated by worker threads
        self.lock = threading.Lock()

    def fetch_single_webpage(self, url, overwrite=False, output_parent=None):
        webpage_fetcher = WebpageFetcher()
        html_path = webpage_fetcher.fetch(
            url=url, overwrite=overwrite, output_parent=output_parent
        )
        with self.lock:
            self.url_and_html_path_list.append({"url": url, "html_path": html_path})
            self.done_count += 1
            done_count = self.done_count
        logger.success(f"> [{done_count}/{self.total_count}] Fetched: {url}")

    def fetch(self, urls, overwrite=False, output_parent=None):
        self.urls = urls
        self.total_count = len(self.urls)
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(
                    self.fetch_single_webpage,
                    url=url,
                    overwrite=overwrite,
                    output_parent=output_parent,
                )
                for url in urls
            ]
            # surface any exception raised inside a worker thread
            for future in concurrent.futures.as_completed(futures):
                future.result()
        return self.url_and_html_path_list


if __name__ == "__main__":
    urls = [
        "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename",
        "https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528",
        "https://docs.python.org/zh-cn/3/tutorial/interpreter.html",
    ]
    batch_webpage_fetcher = BatchWebpageFetcher()
    batch_webpage_fetcher.fetch(
        urls=urls, overwrite=True, output_parent="python tutorials"
    )