"""Ingest PDF documents into a pickled FAISS vector store.

Scans a directory tree for PDFs, extracts their text with PyPDF2, splits
it into overlapping chunks, embeds the chunks with OpenAI embeddings,
and pickles the resulting FAISS store to ./data/store.pkl.
"""

from pathlib import Path
import faiss
import pickle
from PyPDF2 import PdfReader
from tqdm import tqdm
import glob
import os
import re
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
import dotenv

# Load OPENAI_API_KEY (and any other secrets) from a local .env file.
dotenv.load_dotenv()


def get_all_pdf_filenames(paths, recursive):
    """Return every ``*.pdf`` file found under the given directories.

    Args:
        paths: iterable of directory paths to search.
        recursive: when True, descend into subdirectories (glob ``**``).

    Returns:
        list[str]: matching PDF file paths (may be empty).
    """
    extensions = ["pdf"]
    filenames = []
    for ext_name in extensions:
        ext = f"**/*.{ext_name}" if recursive else f"*.{ext_name}"
        for path in paths:
            filenames.extend(glob.glob(os.path.join(path, ext), recursive=recursive))
    return filenames


class Ingester:
    """Vectorises chunks of the data and puts source as metadata."""

    def __init__(self, separator='\n', chunk_overlap=200, chunk_size=200):
        """Create the text splitter used to chunk extracted PDF text.

        Args:
            separator: string on which documents are split.
            chunk_overlap: characters of overlap between adjacent chunks.
            chunk_size: target size of each chunk, in characters.
        """
        self.splitter = CharacterTextSplitter(
            chunk_size=chunk_size, separator=separator, chunk_overlap=chunk_overlap
        )

    def ingest(self, path):
        """Extract, chunk, embed, and pickle every PDF found under ``path``.

        Args:
            path: root directory to scan recursively for PDF files.

        Raises:
            ValueError: if no text chunks could be extracted.

        Side effects:
            Writes the pickled FAISS store to ``./data/store.pkl``
            (creating ``./data`` if needed) and prints progress messages.
        """
        # BUG FIX: the original hard-coded a single debug PDF path here and
        # ignored `path`; restore the directory scan the commented-out call
        # intended.
        ps = get_all_pdf_filenames([path], recursive=True)

        data = []
        sources = []
        for p in tqdm(ps):
            # Concatenate the text of every page in the PDF.
            reader = PdfReader(p)
            page = '\n'.join(
                reader.pages[i].extract_text() for i in range(len(reader.pages))
            )
            data.append(page)
            sources.append(p)

        docs = []
        metadatas = []
        for i, d in tqdm(enumerate(data)):
            splits = self.splitter.split_text(d)
            # BUG FIX: the original skipped the ENTIRE document when any one
            # split was empty; keep the non-empty chunks instead.
            splits = [s for s in splits if s]
            docs.extend(splits)
            # Each chunk remembers which PDF it came from.
            metadatas.extend([{"source": sources[i]}] * len(splits))

        if not docs:
            # Raise (not assert) so the check survives `python -O`.
            raise ValueError(f"No text could be extracted from PDFs under {path!r}")

        print("Extracting embeddings")
        store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)

        # Persist the store; create ./data first so open() cannot fail on a
        # missing directory.
        os.makedirs('./data', exist_ok=True)
        with open(os.path.join('./data', 'store.pkl'), "wb") as f:
            pickle.dump(store, f)
        print(f"Saved store at {os.path.join('./data', 'store.pkl')}.")


if __name__ == "__main__":
    # Guard the script entry point so importing this module does not kick
    # off a full (and expensive) ingestion run.
    ingester = Ingester(chunk_size=2000)
    ingester.ingest("/mnt/c/users/elio/Downloads/UNHCR Emergency Manual")