File size: 1,252 Bytes
6ffd7f9
 
 
7117f9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ffd7f9
 
 
907da04
 
6ffd7f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS 
import PyPDF2


# Function to process a PDF file
def process_pdf(file_stream):
    """Extract the text of every page of a PDF as a single string.

    Args:
        file_stream: Either a dict whose ``'name'`` entry is the path of the
            PDF (the form received for Drag and Drop / Drive-link uploads),
            or any object that ``PyPDF2.PdfReader`` accepts directly.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    if isinstance(file_stream, dict):  # PDF obtained using Drag and Drop or Drive link
        # Use 'path' for local testing and 'name' for Gradio
        source = file_stream['name']
    else:
        source = file_stream
    pdf_reader = PyPDF2.PdfReader(source)
    # `or ""` is defensive: a page that yields no text must not break the join.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def create_dnd_database(file_list):
    """Build a FAISS vector store from the text of the given PDF files.

    The text of all PDFs is concatenated, split on newlines into chunks of
    up to 1000 characters with a 200-character overlap, and embedded with
    ``OpenAIEmbeddings``.

    Args:
        file_list: Iterable of PDF inputs, each in a form accepted by
            ``process_pdf``. May be ``None`` or empty.

    Returns:
        A ``FAISS`` store over the text chunks, or ``None`` when there is
        nothing to index (``file_list`` is ``None``/empty, or the PDFs
        produced no text chunks).
    """
    if not file_list:
        return None

    raw_text = "".join(process_pdf(pdf) for pdf in file_list)
    print('Length of text: ' + str(len(raw_text)))

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    texts = text_splitter.split_text(raw_text)
    if not texts:
        # FAISS.from_texts needs at least one chunk to size its index, so an
        # empty chunk list is reported the same way as "no files".
        return None

    # Created only once there is something to embed.
    embedding = OpenAIEmbeddings()
    return FAISS.from_texts(texts, embedding)