import glob
import os
import pickle

from PyPDF2 import PdfReader
from tqdm import tqdm

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

import dotenv

dotenv.load_dotenv()  # pick up OPENAI_API_KEY etc. from a local .env file
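
# OpenAIEmbeddings reads OPENAI_API_KEY from the environment, so a minimal
# .env next to this script looks like the following (placeholder value):
#
#   OPENAI_API_KEY=sk-...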

def get_all_pdf_filenames(paths, recursive):
    """Return the paths of all PDF files found under the given directories."""
    extensions = ["pdf"]
    filenames = []
    for ext_name in extensions:
        pattern = f"**/*.{ext_name}" if recursive else f"*.{ext_name}"
        for path in paths:
            filenames.extend(glob.glob(os.path.join(path, pattern), recursive=recursive))
    return filenames


# Example:
#   all_pdf_paths = get_all_pdf_filenames(["/mnt/c/users/elio/Downloads/UNHCR Emergency Manual"], recursive=True)
#   print(f"Found {len(all_pdf_paths)} PDF files")

class Ingester:
    """
    Splits documents into chunks, vectorises each chunk, and records the
    source file as metadata.
    """
    def __init__(
        self,
        separator='\n',
        chunk_size=200,
        chunk_overlap=200,
    ):
        self.splitter = CharacterTextSplitter(chunk_size=chunk_size, separator=separator, chunk_overlap=chunk_overlap)
        
    def ingest(self, path):
        ps = get_all_pdf_filenames([path], recursive=True)  # discover PDFs under the given directory
        data = []
        sources = []
        for p in tqdm(ps):  # extract the full text of each PDF
            reader = PdfReader(p)
            text = '\n'.join(page.extract_text() or '' for page in reader.pages)
            data.append(text)
            sources.append(p)

        docs = []
        metadatas = []
        for i, d in tqdm(enumerate(data), total=len(data)):  # split each text into chunks
            splits = [s for s in self.splitter.split_text(d) if s]  # drop empty chunks
            docs.extend(splits)
            metadatas.extend([{"source": sources[i]}] * len(splits))

        assert len(docs) > 0, "No text chunks produced; check the PDF extraction"
                
        print("Extracting embeddings")
        store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
        
        with open(os.path.join('./data', 'store.pkl'), "wb") as f:
            pickle.dump(store, f)
            
        print(f"Saved store at {os.path.join('./data', 'store.pkl')}.")
        
if __name__ == "__main__":
    ingester = Ingester(chunk_size=2000)
    ingester.ingest("/mnt/c/users/elio/Downloads/UNHCR Emergency Manual")
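
# Sketch of how the saved store could be queried afterwards. This assumes the
# same langchain version that produced the pickle is installed when loading;
# `similarity_search` is the standard langchain vectorstore query method, and
# the query string here is purely illustrative.
#
#   with open('./data/store.pkl', 'rb') as f:
#       store = pickle.load(f)
#   for doc in store.similarity_search("How is an emergency water supply assessed?", k=4):
#       print(doc.metadata["source"], doc.page_content[:100])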