import glob
import os
import pickle

import dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from PyPDF2 import PdfReader
from tqdm import tqdm

dotenv.load_dotenv()
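# OpenAIEmbeddings below reads the OPENAI_API_KEY environment variable,
# so the .env file loaded here is expected to define it.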
def get_all_pdf_filenames(paths, recursive):
    """Collect the paths of all .pdf files under the given directories."""
    extensions = ["pdf"]
    filenames = []
    for ext_name in extensions:
        ext = f"**/*.{ext_name}" if recursive else f"*.{ext_name}"
        for path in paths:
            filenames.extend(glob.glob(os.path.join(path, ext), recursive=recursive))
    return filenames
#all_pdf_paths = get_all_pdf_filenames(["/mnt/c/users/elio/Downloads/UNHCR Emergency Manual"], recursive=True)
#print(f"Found {len(all_pdf_paths)} PDF files")
#assert len(all_pdf_paths) > 0
#all_pdf_paths = ['/mnt/c/users/elio/Downloads/UNHCR Emergency Manual/UNHCR Emergency Manual/46a9e29a2.pdf']
class Ingester:
    """
    Splits documents into chunks, embeds them, and attaches the source file path as metadata.
    """

    def __init__(
        self,
        separator='\n',
        chunk_overlap=200,
        chunk_size=200,
    ):
        self.splitter = CharacterTextSplitter(
            chunk_size=chunk_size, separator=separator, chunk_overlap=chunk_overlap
        )
    def ingest(self, path):
        """Read every PDF under `path`, chunk the text, embed it, and pickle a FAISS store."""
        ps = get_all_pdf_filenames([path], recursive=True)  # get paths

        data = []
        sources = []
        for p in tqdm(ps):  # extract text from each PDF
            reader = PdfReader(p)
            text = '\n'.join(reader.pages[i].extract_text() for i in range(len(reader.pages)))
            data.append(text)
            sources.append(p)

        docs = []
        metadatas = []
        for i, d in tqdm(enumerate(data)):  # split text and make documents
            # drop empty chunks so a single blank split does not discard the whole document
            splits = [s for s in self.splitter.split_text(d) if s.strip()]
            docs.extend(splits)
            metadatas.extend([{"source": sources[i]}] * len(splits))
        assert len(docs) > 0

        print("Extracting embeddings")
        store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)

        os.makedirs('./data', exist_ok=True)
        store_path = os.path.join('./data', 'store.pkl')
        with open(store_path, "wb") as f:
            pickle.dump(store, f)
        print(f"Saved store at {store_path}.")
ingester = Ingester(chunk_size=2000)
ingester.ingest("/mnt/c/users/elio/Downloads/UNHCR Emergency Manual")
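
# Sketch (not part of the original script): one way to load the pickled store and
# query it afterwards. The question text is only an example, and this assumes
# store.pkl was written by ingest() above.
# with open(os.path.join('./data', 'store.pkl'), "rb") as f:
#     store = pickle.load(f)
# for doc in store.similarity_search("How are refugee registrations handled?", k=4):
#     print(doc.metadata["source"], doc.page_content[:200])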