|
from pathlib import Path |
|
import faiss |
|
import pickle |
|
from PyPDF2 import PdfReader |
|
from tqdm import tqdm |
|
import glob |
|
import os |
|
import re |
|
|
|
from langchain.embeddings.openai import OpenAIEmbeddings |
|
from langchain.text_splitter import CharacterTextSplitter |
|
from langchain.vectorstores import FAISS |
|
from langchain.document_loaders import TextLoader |
|
|
|
|
|
import dotenv |
|
|
|
# Load settings from a local .env file before anything below runs.
# NOTE(review): presumably this supplies the API credentials used by
# OpenAIEmbeddings further down -- confirm against the deployment setup.
dotenv.load_dotenv()
|
|
|
def get_all_pdf_filenames(paths, recursive):
    """Return the PDF files found in one or more directories.

    Args:
        paths: Iterable of directory paths to search. A single ``str`` or
            ``os.PathLike`` is also accepted and treated as one directory.
        recursive: When true, sub-directories are searched as well
            (pattern ``**/*.pdf``); otherwise only the directories
            themselves are searched (pattern ``*.pdf``).

    Returns:
        A list of matching file names as strings, grouped in the order of
        ``paths`` and sorted within each directory.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(paths, (str, os.PathLike)):
        paths = [paths]

    extensions = ["pdf"]
    patterns = [f"**/*.{ext}" if recursive else f"*.{ext}" for ext in extensions]

    filenames = []
    for pattern in patterns:
        for path in paths:
            # Escape the directory part so glob metacharacters in its name
            # ("[", "]", "*", "?") are matched literally, not as a pattern.
            directory = glob.escape(os.fspath(path))
            matches = glob.glob(os.path.join(directory, pattern), recursive=recursive)
            # glob order follows the filesystem; sort for reproducible output.
            filenames.extend(sorted(matches))
    return filenames
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Ingester:
    """
    Vectorises chunks of the data and puts source as metadata.

    PDF text is split with a ``CharacterTextSplitter``, embedded with
    ``OpenAIEmbeddings`` into a ``FAISS`` store, and the store is pickled to
    ``./data/store.pkl``.
    """

    def __init__(
        self,
        separator='\n',
        chunk_overlap=200,
        chunk_size=200,
    ):
        """Create the text splitter used by ``ingest``.

        Args:
            separator: Passed to ``CharacterTextSplitter`` as ``separator``.
            chunk_overlap: Passed to ``CharacterTextSplitter`` as
                ``chunk_overlap``.
            chunk_size: Passed to ``CharacterTextSplitter`` as ``chunk_size``.
        """
        self.splitter = CharacterTextSplitter(
            chunk_size=chunk_size,
            separator=separator,
            chunk_overlap=chunk_overlap,
        )

    @staticmethod
    def _read_pdf_text(pdf_path):
        """Return the text of every page of ``pdf_path`` joined by newlines."""
        reader = PdfReader(pdf_path)
        # ``or ''`` guards against a page yielding no text at all.
        return '\n'.join(page.extract_text() or '' for page in reader.pages)

    def ingest(self, path):
        """Embed the PDFs found at ``path`` and pickle the resulting store.

        Args:
            path: Either a single PDF file, or a directory that is searched
                recursively for ``*.pdf`` files.

        Raises:
            ValueError: If no non-empty text chunk could be produced from
                ``path``.
        """
        if os.path.isfile(path):
            pdf_paths = [os.fspath(path)]
        else:
            pdf_paths = sorted(get_all_pdf_filenames([path], recursive=True))

        docs = []
        metadatas = []
        for pdf_path in tqdm(pdf_paths):
            text = self._read_pdf_text(pdf_path)
            # Drop only the empty chunks instead of discarding the whole file.
            chunks = [chunk for chunk in self.splitter.split_text(text) if chunk != ""]
            docs.extend(chunks)
            # One dict per chunk: a shared dict would make every chunk's
            # metadata change together if any of them were mutated later.
            metadatas.extend({"source": pdf_path} for _ in chunks)

        # Explicit check rather than ``assert``, which is stripped under -O.
        if not docs:
            raise ValueError(f"No text could be extracted from any PDF at {path!r}.")

        print("Extracting embeddings")
        store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)

        store_path = os.path.join('./data', 'store.pkl')
        # The output directory may not exist yet on a fresh checkout.
        os.makedirs(os.path.dirname(store_path), exist_ok=True)
        # NOTE(review): this pickles the whole store object; confirm that the
        # installed faiss/langchain versions support pickling the index, and
        # only ever unpickle store files from a trusted location.
        with open(store_path, "wb") as f:
            pickle.dump(store, f)

        print(f"Saved store at {store_path}.")
|
|
|
# Script body: runs whenever this module is executed or imported.
# Build an ingester with chunk_size=2000 and hand it the manual's directory.
ingester = Ingester(
    chunk_size=2000,
)
ingester.ingest(
    "/mnt/c/users/elio/Downloads/UNHCR Emergency Manual",
)
|
|