File size: 5,218 Bytes
17af92c 2b35800 17af92c eaaaf3d 17af92c 2b35800 17af92c 6a2dc59 17af92c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
import os
import argparse
import csv
from time import sleep
import time
import json
import numpy as np
import fitz
import pandas as pd
import requests
from src.retrieval.html2lines import url2lines, line_correction
csv.field_size_limit(100000000)
MAX_RETRIES = 3
TIMEOUT = 5 # time limit for request
def scrape_text_from_url(url, temp_name):
    """Download *url* and return its text content as a list of corrected lines.

    The URL is first fetched with ``requests`` (at most ``MAX_RETRIES``
    attempts, each limited by ``TIMEOUT``). URLs ending in ``.pdf`` are saved
    to ``pdf_dir/<temp_name>.pdf`` and their text is extracted page by page
    with PyMuPDF (``fitz``); any other URL is handed to ``url2lines``. Both
    paths pass the resulting lines through ``line_correction``.

    Args:
        url: Address of the page or PDF to scrape.
        temp_name: Base name (without extension) used for the temporary PDF
            file written under ``pdf_dir``.

    Returns:
        Whatever ``line_correction`` returns for the extracted lines, or an
        empty list when no response could be obtained or the server answered
        with HTTP 503.
    """
    response = None
    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(url, timeout=TIMEOUT)
            # Stop at the first successful request; without this the URL
            # would be downloaded MAX_RETRIES times even when it works.
            break
        except requests.RequestException:
            if attempt < MAX_RETRIES - 1:
                sleep(3)  # Wait before retrying

    # trafilatura does not handle retry with 503 (often waiting 24 hours as
    # overwritten by the html), so give up on such URLs right away.
    if response is None or response.status_code == 503:
        return []

    if url.endswith(".pdf"):
        # Make sure the scratch directory exists before writing into it.
        os.makedirs("pdf_dir", exist_ok=True)
        pdf_path = f"pdf_dir/{temp_name}.pdf"
        with open(pdf_path, "wb") as f:
            f.write(response.content)

        doc = fitz.open(pdf_path)
        try:
            # Pages without extractable text contribute an empty string.
            extracted_text = "".join(page.get_text() or "" for page in doc)
        finally:
            doc.close()  # release the file handle held by PyMuPDF
        return line_correction(extracted_text.split("\n"))

    return line_correction(url2lines(url))
def _load_tsv_rows(tsv_path):
    """Load the rows of a TSV file, trying pandas, then NumPy, then csv.

    Some tsv files fail to be loaded by one library, so each loader is tried
    in turn and the first one that succeeds wins.

    Args:
        tsv_path: Path of the tab-separated input file (read without header).

    Returns:
        A sequence of rows indexable by column position, or ``None`` when
        every loader failed.
    """
    try:
        rows = pd.read_csv(tsv_path, sep="\t", header=None).values
        print("Data loaded successfully with Pandas.")
        return rows
    except Exception as e:
        print("Error loading with Pandas:", e)

    try:
        rows = np.genfromtxt(tsv_path, delimiter="\t", dtype=None, encoding=None)
        print("Data loaded successfully with NumPy.")
        return rows
    except Exception as e:
        print("Error loading with NumPy:", e)

    try:
        with open(tsv_path, "r", newline="") as tsvfile:
            rows = list(csv.reader(tsvfile, delimiter="\t"))
        print("Data loaded successfully with csv.")
        return rows
    except Exception as e:
        print("Error loading with csv:", e)

    return None


if __name__ == "__main__":
    # Command-line entry point: scrape the text of every URL listed in one
    # tsv file and append one JSON object per URL to <output_dir>/<name>.json.
    parser = argparse.ArgumentParser(description="Scraping text from URLs.")
    parser.add_argument(
        "-i",
        "--tsv_input_file",
        type=str,
        help="The path of the input files containing URLs from Google search.",
    )
    parser.add_argument(
        "-o",
        "--json_output_dir",
        type=str,
        default="output",
        help="The output directory to save the scraped JSON data.",
    )
    parser.add_argument(
        "--overwrite_out_file",
        action="store_true",
    )
    args = parser.parse_args()

    # Validate explicitly: `assert` is stripped under `python -O`, and a
    # missing -i would otherwise surface as an unrelated TypeError.
    if (
        not args.tsv_input_file
        or os.path.splitext(args.tsv_input_file)[-1] != ".tsv"
    ):
        parser.error("The input should be a tsv file.")

    os.makedirs(args.json_output_dir, exist_ok=True)

    total_scraped, empty, total_failed = 0, 0, 0

    print(f"Processing files {args.tsv_input_file}")

    st = time.time()

    # The input file name (without extension) doubles as the claim id and as
    # the name of the output file.
    claim_id = os.path.splitext(os.path.basename(args.tsv_input_file))[0]
    json_output_path = os.path.join(args.json_output_dir, f"{claim_id}.json")

    # Resume support: every output line corresponds to one input row, so the
    # number of lines already written is the number of rows to skip.
    lines_skipped = 0
    if os.path.exists(json_output_path):
        if args.overwrite_out_file:
            os.remove(json_output_path)
        else:
            with open(json_output_path, "r", encoding="utf-8") as json_file:
                lines_skipped = sum(1 for _ in json_file)
            print(f" Skipping {lines_skipped} lines in {json_output_path}")

    # Some tsv files will fail to be loaded, try different libs to load them
    data = _load_tsv_rows(args.tsv_input_file)
    if data is None:
        # Nothing could parse the file; stop with a clear message instead of
        # failing later on `len(None)`.
        raise SystemExit(f"Could not load {args.tsv_input_file} with any loader.")

    if len(data) <= lines_skipped:
        print(" No more lines need to be processed!")
    else:
        with open(json_output_path, "a", encoding="utf-8") as json_file:
            for index, row in enumerate(data):
                if index < lines_skipped:
                    continue

                # Columns read from each row: 1 -> type, 2 -> url, 3 -> query.
                url = row[2]
                json_data = {
                    "claim_id": claim_id,
                    "type": row[1],
                    "query": row[3],
                    "url": url,
                    "url2text": [],
                }
                print(f"Scraping text for url_{index}: {url}!")
                try:
                    scrape_result = scrape_text_from_url(url, claim_id)
                    json_data["url2text"] = scrape_result

                    if len(json_data["url2text"]) > 0:
                        total_scraped += 1
                    else:
                        empty += 1
                except Exception as e:
                    # Best effort: a failing URL is still recorded (with an
                    # empty "url2text") so the output stays aligned with the
                    # input rows, but the reason is no longer hidden.
                    total_failed += 1
                    print(f"Failed to scrape url_{index}: {e}")

                json_file.write(json.dumps(json_data, ensure_ascii=False) + "\n")
                # Flush after every row so an interrupted run can be resumed.
                json_file.flush()

        print(f"Output for {args.tsv_input_file} saved to {json_output_path}")
        elapsed_time = time.time() - st
        elapsed_minutes = int(elapsed_time // 60)
        elapsed_seconds = int(elapsed_time % 60)
        print(f"Time elapsed: {elapsed_minutes}min {elapsed_seconds}sec")
        print(f"{total_scraped} scraped, {empty} empty, {total_failed} failed")
|