Spaces:
Sleeping
Sleeping
from langchain.embeddings.openai import OpenAIEmbeddings | |
from langchain.text_splitter import CharacterTextSplitter | |
from langchain.vectorstores import FAISS | |
import PyPDF2 | |
# Function to process a PDF file
def process_pdf(file_stream):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_stream: Either a dict carrying the file location under the
            ``'name'`` key (the drag-and-drop / Drive-link case), or any
            other object that ``PyPDF2.PdfReader`` accepts directly.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    if isinstance(file_stream, dict):
        # Use 'path' for local testing and 'name' for Gradio.
        source = file_stream['name']
    else:
        source = file_stream
    pdf_reader = PyPDF2.PdfReader(source)
    # Join once instead of growing a string page by page. `or ""` keeps a
    # page that yields no text (None) from raising TypeError.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
def create_dnd_database(file_list):
    """Build a FAISS vector store from the text of the given PDFs.

    Args:
        file_list: Collection of PDF inputs, each handed to ``process_pdf``.
            May be ``None`` or empty.

    Returns:
        The store produced by ``FAISS.from_texts``, or ``None`` when there
        are no files or when splitting the extracted text yields no chunks.
    """
    # Covers both None and an empty collection: nothing to index.
    if not file_list:
        return None

    raw_text = ''.join(process_pdf(pdf) for pdf in file_list)

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    texts = text_splitter.split_text(raw_text)
    print('Length of text: ' + str(len(raw_text)))

    # PDFs without extractable text produce no chunks; return None (as for
    # "no files") rather than asking FAISS to index nothing.
    if not texts:
        return None

    # Create the embedding client only once there is something to embed.
    embedding = OpenAIEmbeddings()
    return FAISS.from_texts(texts, embedding)