Spaces:
Sleeping
Sleeping
import gradio as gr | |
import PyPDF2 | |
#rom langchain.embeddings.openai import OpenAIEmbeddings | |
from langchain.embeddings import HuggingFaceEmbeddings | |
from langchain.vectorstores.faiss import FAISS | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain import HuggingFaceHub | |
from langchain.document_loaders import UnstructuredPDFLoader | |
from langchain.indexes import VectorstoreIndexCreator | |
from langchain import OpenAI, VectorDBQA | |
import os | |
# The visible pipeline runs entirely on HuggingFace models, so the OpenAI key
# is optional: read it with .get() so a missing variable yields None instead
# of crashing the app with KeyError at import time.
openai_api_key = os.environ.get("OPENAI_API_KEY")
def _extract_pdf_text(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*."""
    with open(pdf_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        # Pages without a text layer may yield nothing; treat that as empty
        # text so the join never fails.
        return "".join(page.extract_text() or "" for page in reader.pages)


def pdf_to_text(pdf_file, query):
    """Answer *query* using the contents of an uploaded PDF.

    The PDF text is extracted with PyPDF2, split into chunks of up to 1000
    characters, embedded with ``HuggingFaceEmbeddings`` into a FAISS index,
    and queried through a "stuff" ``RetrievalQA`` chain backed by the
    ``google/flan-t5-xl`` model on the HuggingFace Hub.

    Args:
        pdf_file: The uploaded file. Either an object exposing the on-disk
            path as ``.name`` (tempfile-style wrapper) or a plain path string.
        query: The question to ask about the document.

    Returns:
        The chain's answer as a string, or a short explanatory message when
        no file was uploaded, the query is blank, or the PDF has no
        extractable text.
    """
    if pdf_file is None:
        return "Please upload a PDF file."
    if not query or not query.strip():
        return "Please enter a question."

    # Keep the path under its own name: the original rebound ``pdf_file`` to
    # the opened handle and later passed that closed handle on as a path.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    text = _extract_pdf_text(pdf_path)

    # Embedding step: split the document into retrievable chunks.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunks = splitter.split_text(text)
    if not chunks:
        # An index cannot be built from zero chunks (e.g. a scanned,
        # image-only PDF), so report it instead of failing inside FAISS.
        return "No extractable text was found in this PDF."

    # Vector store: index the document once and reuse it as the retriever.
    vectorstore = FAISS.from_texts(chunks, HuggingFaceEmbeddings())
    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xl",
        model_kwargs={"temperature": 0, "max_length": 512},
    )

    # Inference. Imported locally, as in the original, because the file's
    # top-level imports do not provide RetrievalQA.
    from langchain.chains import RetrievalQA

    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(),
        input_key="question",
    )
    return chain.run(query)
# Define the Gradio interface: a PDF upload plus a free-text query in, the
# model's answer out. The top-level component classes are used because the
# gr.inputs / gr.outputs namespaces are deprecated and absent from newer
# Gradio releases.
pdf_input = gr.File(label="PDF File")
query_input = gr.Textbox(label="Query")
outputs = gr.Textbox(label="Chatbot Response")
interface = gr.Interface(
    fn=pdf_to_text,
    inputs=[pdf_input, query_input],
    outputs=outputs,
)
# Run the interface
interface.launch(debug=True)