import pandas as pd
import requests
from langchain.document_loaders import PyPDFLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


class DocumentsLoader:
    def __init__(self) -> None:
        pass

    def load_urls_from_csv(self, url_path: str, column: str = "url") -> list:
        """Read a CSV file and return the values of the given column as a list of URLs."""
        df = pd.read_csv(url_path)
        doc_urls = df[column].to_list()
        return doc_urls

    # TODO: complete Notion scraper
    def is_notion_url(self, url: str) -> bool:
        # Simple substring check for Notion URLs.
        return "notion" in url
        # Stricter regex alternative:
        # notion_regex = r"https://(www\.)?(notion\.so|notion\.site)/"
        # return re.match(notion_regex, url) is not None

    def is_pdf_url(self, url: str) -> bool:
        # Check whether the URL ends with a known PDF file extension.
        pdf_extensions = [".pdf"]
        return any(url.endswith(extension) for extension in pdf_extensions)

    def is_valid_url(self, url: str) -> bool:
        # TODO: handle status codes other than 200
        try:
            response = requests.head(url)
            # A 200 status code indicates a valid URL.
            return response.status_code == 200
        except requests.RequestException:
            return False

    def load_docs(self, doc_urls: list) -> list:
        web_urls, pdf_urls, notion_urls, docs = [], [], [], []
        # Flatten nested input such as [[url], [url], ...].
        if isinstance(doc_urls[0], list):
            doc_urls = [doc[0] for doc in doc_urls]

        # Split URLs into PDF, Notion, and web URLs.
        for url in doc_urls:
            if self.is_pdf_url(url):
                pdf_urls.append(url)
            elif self.is_notion_url(url):
                notion_urls.append(url)
            else:
                web_urls.append(url)

        # Load web URLs.
        if len(web_urls) > 0:
            web_urls = [url for url in web_urls if self.is_valid_url(url)]
            for web_url in web_urls:
                try:
                    web_loader = WebBaseLoader(web_url)
                    web_docs = web_loader.load()
                    docs = docs + web_docs
                except Exception as e:
                    print(f"Error web loader, {web_url}: {str(e)}")

        # Load PDF URLs.
        if len(pdf_urls) > 0:
            for pdf_url in pdf_urls:
                try:
                    pdf_loader = PyPDFLoader(pdf_url)
                    pdf_docs = pdf_loader.load()
                    docs = docs + pdf_docs
                except Exception as e:
                    print(f"Error pdf loader, {pdf_url}: {str(e)}")

        # TODO: Notion loader, not working yet.
        # if len(notion_urls) > 0:
        #     for notion_url in notion_urls:
        #         try:
        #             notion_loader = NotionDirectoryLoader(notion_url)
        #             notion_docs = notion_loader.load()
        #             docs = notion_docs + docs
        #         except Exception as e:
        #             print(f"Error notion loader, {notion_url}: {str(e)}")

        return docs

    def split_docs(self, docs: list, chunk_size: int = 2000) -> list:
        """Split documents into chunks of roughly chunk_size characters, preferring
        paragraph, line, and sentence boundaries before falling back to words."""
        r_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=0,
            separators=["\n\n", "\n", ". ", " ", ""],
        )
        splits = r_splitter.split_documents(docs)
        return splits
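

# Minimal usage sketch of the load -> split pipeline above. The file name
# "urls.csv" and its "url" column are assumptions for illustration, not part
# of the class; any CSV with a column of URLs works the same way.
if __name__ == "__main__":
    loader = DocumentsLoader()
    # Read candidate URLs from a CSV file ("urls.csv" is a placeholder path).
    doc_urls = loader.load_urls_from_csv("urls.csv", column="url")
    # Fetch web pages and PDFs into LangChain Document objects.
    docs = loader.load_docs(doc_urls)
    # Chunk the documents for downstream embedding / retrieval.
    splits = loader.split_docs(docs, chunk_size=2000)
    print(f"Loaded {len(docs)} documents, produced {len(splits)} chunks.")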