import glob
import os
import pickle

import dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from PyPDF2 import PdfReader
from tqdm import tqdm

dotenv.load_dotenv()  # load environment variables (e.g. OPENAI_API_KEY for OpenAIEmbeddings) from a local .env file

def get_all_pdf_filenames(paths, recursive):
    """Collect every PDF file under the given directories, optionally recursing into subfolders."""
    extensions = ["pdf"]
    filenames = []
    for ext_name in extensions:
        ext = f"**/*.{ext_name}" if recursive else f"*.{ext_name}"
        for path in paths:
            filenames.extend(glob.glob(os.path.join(path, ext), recursive=recursive))
    return filenames

#all_pdf_paths = get_all_pdf_filenames(["/mnt/c/users/elio/Downloads/UNHCR Emergency Manual"], recursive=True)
#print(f"Found {len(all_pdf_paths)} PDF files")
#assert len(all_pdf_paths) > 0
#all_pdf_paths = ['/mnt/c/users/elio/Downloads/UNHCR Emergency Manual/UNHCR Emergency Manual/46a9e29a2.pdf']

class Ingester:
    """
    Vectorises chunks of the source documents and attaches each chunk's source path as metadata.
    """

    def __init__(
        self,
        separator='\n',
        chunk_overlap=200,
        chunk_size=200,
    ):
        self.splitter = CharacterTextSplitter(chunk_size=chunk_size, separator=separator, chunk_overlap=chunk_overlap)
    def ingest(self, path):
        # ps = get_all_pdf_filenames([path], recursive=True)  # get paths
        ps = ['/mnt/c/users/elio/Downloads/UNHCR Emergency Manual/UNHCR Emergency Manual/46a9e29a2.pdf']

        # Extract the full text of each PDF, keeping track of its source path.
        data = []
        sources = []
        for p in tqdm(ps):
            reader = PdfReader(p)
            text = '\n'.join(reader.pages[i].extract_text() for i in range(len(reader.pages)))
            data.append(text)
            sources.append(p)

        # Split each document into chunks and attach its source path as metadata.
        docs = []
        metadatas = []
        for i, d in tqdm(enumerate(data)):
            splits = [s for s in self.splitter.split_text(d) if s != ""]  # drop empty chunks
            docs.extend(splits)
            metadatas.extend([{"source": sources[i]}] * len(splits))
        assert len(docs) > 0

        print("Extracting embeddings")
        store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)

        # Pickle the vector store so downstream scripts can load it without re-embedding.
        os.makedirs('./data', exist_ok=True)
        store_path = os.path.join('./data', 'store.pkl')
        with open(store_path, "wb") as f:
            pickle.dump(store, f)
        print(f"Saved store at {store_path}.")


if __name__ == "__main__":
    ingester = Ingester(chunk_size=2000)
    ingester.ingest("/mnt/c/users/elio/Downloads/UNHCR Emergency Manual")