import re

import pandas as pd
import requests
from langchain.document_loaders import PyPDFLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


class DocumentsLoader:
    def __init__(self) -> None:
        pass

    def load_urls_from_csv(self, url_path: str, column: str = "url") -> list:
        # Read the CSV file and return the given column as a list of URLs.
        df = pd.read_csv(url_path)
        doc_urls = df[column].to_list()
        return doc_urls

    def is_notion_url(self, url: str) -> bool:
        # Regular expression matching Notion-hosted URLs.
        notion_regex = r"https://(www\.)?(notion\.so|notion\.site)/"
        return re.match(notion_regex, url) is not None

    def is_pdf_url(self, url: str) -> bool:
        # Check the URL against common PDF file extensions.
        pdf_extensions = (".pdf",)
        return url.endswith(pdf_extensions)

    def is_valid_url(self, url: str) -> bool:
        # TODO: handle status codes other than 200 (e.g. redirects).
        try:
            response = requests.head(url)
            # A 200 status code indicates a reachable URL; any other code
            # or a request error is treated as invalid.
            return response.status_code == 200
        except requests.RequestException:
            return False

    def load_docs(self, doc_urls: list) -> list:
        web_urls, pdf_urls, docs = [], [], []
        # Flatten rows such as [["https://..."], ...] into plain strings.
        if isinstance(doc_urls[0], list):
            doc_urls = [doc[0] for doc in doc_urls]
        # Route each URL to the matching loader.
        for url in doc_urls:
            if self.is_pdf_url(url):
                pdf_urls.append(url)
            else:
                web_urls.append(url)
        if len(web_urls) > 0:
            web_urls = [url for url in web_urls if self.is_valid_url(url)]
            for web_url in web_urls:
                try:
                    web_loader = WebBaseLoader(web_url)
                    docs += web_loader.load()
                except Exception as e:
                    print(f"Error web loader, {web_url}: {str(e)}")
        if len(pdf_urls) > 0:
            pdf_urls = [url for url in pdf_urls if self.is_valid_url(url)]
            for pdf_url in pdf_urls:
                try:
                    pdf_loader = PyPDFLoader(pdf_url)
                    docs += pdf_loader.load()
                except Exception as e:
                    print(f"Error pdf loader, {pdf_url}: {str(e)}")
        return docs

    def split_docs(self, docs: list, chunk_size: int = 2000) -> list:
        # Split recursively on paragraphs, lines, sentences, then words.
        r_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=0,
            separators=["\n\n", "\n", ". ", " ", ""],
        )
        splits = r_splitter.split_documents(docs)
        return splits
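

# A minimal usage sketch. The "urls.csv" file name and its "url" column are
# illustrative assumptions, not part of the class above.
if __name__ == "__main__":
    loader = DocumentsLoader()
    doc_urls = loader.load_urls_from_csv("urls.csv", column="url")
    # Fetch web pages and PDFs, then split them into ~2000-character chunks.
    docs = loader.load_docs(doc_urls)
    splits = loader.split_docs(docs, chunk_size=2000)
    print(f"Loaded {len(docs)} documents, produced {len(splits)} chunks")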