Update file_loader.py
file_loader.py  (CHANGED, +70 -10)
@@ -9,31 +9,91 @@ from helpers import (
     list_docx_files,       # Get the list of .docx files
     get_splits,            # Process .docx files into splits
     get_json_splits_only,  # Process the JSON (FAQ) file
-    scrape_website,        # Process data from the web
+    # scrape_website,      # Process data from the web
 )
 
+
 import json
 
+os.system("playwright install chromium")
+
+import asyncio
+from urllib.parse import urljoin
+from playwright.async_api import async_playwright
+from langchain_community.document_loaders import AsyncHtmlLoader
+from langchain_community.document_transformers import Html2TextTransformer
+from tqdm.asyncio import tqdm
+
 SCRAPED_DATA_PATH = "scraped_data.json"
 
-
-
+# ----------- ASYNC SCRAPING FUNCTIONS -----------
+async def _fetch_urls(base_url):
+    """Extract all links from a JavaScript-rendered webpage."""
+    urls = set()
+    try:
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            page = await browser.new_page()
+            await page.goto(base_url)
+            await page.wait_for_load_state("networkidle")
+            links = await page.locator("a").all()
+            for link in links:
+                href = await link.get_attribute("href")
+                if href and "#" not in href:
+                    full_url = urljoin(base_url, href)
+                    if full_url.startswith(base_url):
+                        urls.add(full_url)
+            await browser.close()
+    except Exception as e:
+        print(f"⚠️ Unable to access {base_url}: {e}")
+    return list(urls)
+
+async def _fetch_web_content(urls):
+    """Fetch HTML content and convert it to text, with a progress bar."""
+    docs = []
+    progress_bar = tqdm(total=len(urls), desc="Scraping Pages", unit="page")
+    for page_url in urls:
+        try:
+            # Replace these with your actual async loader/transformer.
+            loader = AsyncHtmlLoader(page_url)
+            html2text = Html2TextTransformer()
+            html = await loader.aload()
+            doc = html2text.transform_documents(html)
+            docs.extend(doc)
+        except Exception as e:
+            print(f"Error loading {page_url}: {e}")
+        progress_bar.update(1)
+    progress_bar.close()
+    return docs
+
+async def scrape_website(base_urls):
+    """
+    Scrapes a list of base URLs and extracts their content.
+    """
+    all_urls = []
+    for base_url in base_urls:
+        urls = await _fetch_urls(base_url)
+        all_urls.extend(urls)
+    docs = await _fetch_web_content(all_urls)
+    return docs
+
+async def get_scraped_data(base_urls):
+    """
+    Automatically load scraped data from file if available;
+    otherwise, scrape and cache it.
+    """
     if os.path.exists(SCRAPED_DATA_PATH):
         print("🔄 Loading scraped website contents from file...")
         with open(SCRAPED_DATA_PATH, "r", encoding="utf-8") as f:
             return json.load(f)
 
     print("🌍 Scraping websites...")
-    website_contents = scrape_website(base_urls)
-
-    # Save the data so future runs do not need to scrape again
+    website_contents = await scrape_website(base_urls)
     with open(SCRAPED_DATA_PATH, "w", encoding="utf-8") as f:
         json.dump(website_contents, f, ensure_ascii=False, indent=4)
-
     return website_contents
 
-
-def get_vectorstore():
+async def get_vectorstore():
     ### Process all documents and feed them into the database
     folder_path = "syllabus_nct_word_format/"
     docx_files = list_docx_files(folder_path)
@@ -43,7 +103,7 @@ def get_vectorstore():
     #
     base_urls =['https://fda.neu.edu.vn/hoi-nghi-khoa-hoc-cong-nghe-dai-hoc-kinh-te-quoc-dan-nam-2025/']
    # ['https://nct.neu.edu.vn/', 'https://fsf.neu.edu.vn/', 'https://mfe.neu.edu.vn/', 'https://mis.neu.edu.vn/', 'https://fda.neu.edu.vn/', 'https://khoathongke.neu.edu.vn/', 'https://fit.neu.edu.vn/']
-    website_contents = get_scraped_data(base_urls=base_urls)
+    website_contents = await get_scraped_data(base_urls=base_urls)
     all_splits += website_contents
 
     print('Feeding .docx files')