import pandas as pd
import requests
from langchain.document_loaders import PyPDFLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


class DocumentsLoader:
    """Load and split documents (web pages, PDFs) from a list of URLs."""

    def __init__(self) -> None:
        pass

    def load_urls_from_csv(self, url_path: str, column: str = "url") -> list:
        """Read a CSV file and return the values of the given URL column as a list."""
        df = pd.read_csv(url_path)
        doc_urls = df[column].to_list()
        return doc_urls
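
    # Example of the (hypothetical) CSV layout this helper assumes: one URL
    # per row under a "url" column.
    #
    #     url
    #     https://example.com/page.html
    #     https://example.com/report.pdf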

    # TODO: complete Notion scraper
    def is_notion_url(self, url: str) -> bool:
        """Naive check for Notion URLs based on a substring match."""
        return "notion" in url
        # Stricter regex alternative (currently disabled; would need `import re`):
        # notion_regex = r"https://(www\.)?(notion\.so|notion\.site)/"
        # return re.match(notion_regex, url) is not None

    def is_pdf_url(self, url: str) -> bool:
        """Check whether the URL ends with a known PDF file extension."""
        pdf_extensions = (".pdf",)
        return url.endswith(pdf_extensions)

    def is_valid_url(self, url: str) -> bool:
        """Check that the URL answers a HEAD request with status 200.

        TODO: handle non-200 status codes (e.g. redirects) more precisely.
        """
        try:
            response = requests.head(url)
            # Only a 200 status code is treated as a valid URL
            return response.status_code == 200
        except requests.RequestException:
            return False

    def load_docs(self, doc_urls: list) -> list:
        """Load documents from URLs, dispatching each to a PDF, Notion, or web loader."""
        web_urls, pdf_urls, notion_urls, docs = [], [], [], []
        # Flatten nested input such as [["url1"], ["url2"]]
        if isinstance(doc_urls[0], list):
            doc_urls = [doc[0] for doc in doc_urls]
        # Split URLs by type: PDF, Notion, or generic web page
        for url in doc_urls:
            if self.is_pdf_url(url):
                pdf_urls.append(url)
            elif self.is_notion_url(url):
                notion_urls.append(url)
            else:
                web_urls.append(url)
        # Load web URLs
        if len(web_urls) > 0:
            web_urls = [url for url in web_urls if self.is_valid_url(url)]
            for web_url in web_urls:
                try:
                    web_loader = WebBaseLoader(web_url)
                    web_docs = web_loader.load()
                    docs = docs + web_docs
                except Exception as e:
                    print(f"Error web loader, {web_url}: {str(e)}")
        # Load PDF URLs (note: is_valid_url is currently not applied to PDF URLs)
        if len(pdf_urls) > 0:
            for pdf_url in pdf_urls:
                try:
                    pdf_loader = PyPDFLoader(pdf_url)
                    pdf_docs = pdf_loader.load()
                    docs = docs + pdf_docs
                except Exception as e:
                    print(f"Error pdf loader, {pdf_url}: {str(e)}")
        # Notion loader: not working yet (NotionDirectoryLoader would also
        # need to be imported before enabling this block)
        # if len(notion_urls) > 0:
        #     for notion_url in notion_urls:
        #         try:
        #             notion_loader = NotionDirectoryLoader(notion_url)
        #             notion_docs = notion_loader.load()
        #             docs = notion_docs + docs
        #         except Exception as e:
        #             print(f"Error notion loader, {notion_url}: {str(e)}")
        return docs

    def split_docs(self, docs: list, chunk_size: int = 2000) -> list:
        """Split documents into chunks, preferring paragraph and sentence boundaries."""
        r_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=0,
            # Separators are matched literally by default, so use ". "
            # rather than the regex-style "\. "
            separators=["\n\n", "\n", ". ", " ", ""],
        )
        splits = r_splitter.split_documents(docs)
        return splits
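

# Minimal usage sketch, assuming a CSV file (hypothetical name "urls.csv")
# with a "url" column, as expected by load_urls_from_csv above.
if __name__ == "__main__":
    loader = DocumentsLoader()
    doc_urls = loader.load_urls_from_csv("urls.csv")  # hypothetical path
    docs = loader.load_docs(doc_urls)
    splits = loader.split_docs(docs, chunk_size=2000)
    print(f"Loaded {len(docs)} documents, split into {len(splits)} chunks")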