Spaces:
Running
Running
import time | |
import requests | |
from bs4 import BeautifulSoup | |
import re | |
from markdownify import markdownify as md | |
import pandas as pd | |
import argparse | |
def extract_content(url: str, timeout: float = 10.0) -> str:
    """Download one page and return its subject and body as Markdown.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        Markdown converted from the ``#load_content .page-subject`` and
        ``#load_content .page-content`` elements (in that order). An empty
        string is returned when the page contains neither element.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # Without a timeout a stalled server would block the caller forever.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    page_subject = soup.select_one("#load_content .page-subject")
    page_content = soup.select_one("#load_content .page-content")

    # select_one() returns None for a missing element; str(None) would put the
    # literal text "None" into the output, so only convert what was found.
    html_parts = [
        str(element)
        for element in (page_subject, page_content)
        if element is not None
    ]
    if not html_parts:
        return ""

    markdown_content = md(
        "".join(html_parts),
        heading_style="ATX",
        bullets="-",
        strong_em_symbol="*",
        code_language="python",
        escape_asterisks=False,
        escape_underscores=False,
    )
    # Each pair of consecutive newlines is replaced by a single newline.
    normalized_text = re.sub(r"\n{2}", "\n", markdown_content)
    return normalized_text
def main(ebook_url):
    """Scrape the pages listed in an e-book's table of contents into parquet.

    The table of contents is read from ``ebook_url``; every entry after the
    first is downloaded with ``extract_content`` and the rows that received
    content are written to ``wikidocs_<book_id>.parquet`` in the current
    working directory.

    Args:
        ebook_url: URL of the e-book's main page. The last path segment is
            used as the book id in the output file name.

    Raises:
        requests.HTTPError: If the table-of-contents request returns an
            error status.
        requests.RequestException: On connection failure or timeout.
    """
    base_url = "https://wikidocs.net"

    # Extract the book id. Trailing slashes are stripped first so that a URL
    # such as ".../book/123/" does not produce an empty id.
    book_id = ebook_url.rstrip("/").split("/")[-1]

    # Fetch the page source; fail on an HTTP error status.
    response = requests.get(ebook_url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Keep only the table-of-contents anchors that call javascript:page(...).
    toc = soup.select(".list-group-toc a[href^='javascript:page(']")

    # Collect one {"title", "link"} record per table-of-contents entry.
    data_list = []
    for item in toc:
        title = item.get_text(strip=True)
        # The page id is the text between "page(" and the closing ")".
        page_id = item.get("href").split("page(")[-1].rstrip(")")
        link = f"{base_url}/{page_id}"
        data_list.append({"title": title, "link": link})

    # Download the content of every entry except the first one.
    # NOTE(review): the first entry is presumably the book's own cover/intro
    # page - confirm why it is skipped.
    for item in data_list[1:]:
        item["content"] = extract_content(item["link"])
        time.sleep(1)  # pause between requests

    # Naming the columns explicitly guarantees that "content" exists even when
    # the table of contents has zero or one entry; otherwise dropna() below
    # would raise KeyError.
    df = pd.DataFrame(data_list, columns=["title", "link", "content"])
    # Rows that never received content (e.g. the skipped first entry) are
    # dropped.
    df = df.dropna(subset=["content"])

    # Save the DataFrame as a parquet file.
    parquet_filename = f"wikidocs_{book_id}.parquet"
    df.to_parquet(parquet_filename, index=False)
    print(f"파일이 성공적으로 저장되었습니다: {parquet_filename}")
if __name__ == "__main__": | |
# ๋ช ๋ น์ด ์ค ์ธ์ ์ฒ๋ฆฌ | |
parser = argparse.ArgumentParser(description="Wikidocs ebook URL์ ์ ๋ ฅํ์ธ์.") | |
parser.add_argument("ebook_url", type=str, help="Wikidocs ebook URL") | |
args = parser.parse_args() | |
main(args.ebook_url) | |