Update process_documents.py
Browse files
- process_documents.py +3 -1
process_documents.py
CHANGED
@@ -36,14 +36,16 @@ def process_documents(urls):
|
|
36 |
def process_web(url, source_id):
|
37 |
data = WebBaseLoader(f"https://r.jina.ai/{url}").load()[0]
|
38 |
try:
|
|
|
39 |
page_content = data.page_content[data.page_content.index("Markdown Content:") + len("Markdown Content:"):].strip()
|
40 |
except Exception as e:
|
|
|
41 |
page_content = data.page_content.strip()
|
42 |
document_snippets = [
|
43 |
Document(
|
44 |
page_content=page_content,
|
45 |
metadata={
|
46 |
-
"header":
|
47 |
"source_url": url,
|
48 |
"source_type": "web",
|
49 |
"chunk_id": source_id,
|
|
|
36 |
def process_web(url, source_id):
|
37 |
data = WebBaseLoader(f"https://r.jina.ai/{url}").load()[0]
|
38 |
try:
|
39 |
+
header = re.search(r"Title: (.*)?", data.page_content).group(1)
|
40 |
page_content = data.page_content[data.page_content.index("Markdown Content:") + len("Markdown Content:"):].strip()
|
41 |
except Exception as e:
|
42 |
+
header = ""
|
43 |
page_content = data.page_content.strip()
|
44 |
document_snippets = [
|
45 |
Document(
|
46 |
page_content=page_content,
|
47 |
metadata={
|
48 |
+
"header": header,
|
49 |
"source_url": url,
|
50 |
"source_type": "web",
|
51 |
"chunk_id": source_id,
|