# PDF_Based_QnA / app.py
# (Hugging Face page residue preserved as a comment so the file parses:
#  "Sreekumar1608's picture / Update app.py / 44bf37d / raw / history blame / 2.54 kB")
#creating directory
# NOTE(review): google.colab exists only inside Google Colab — this import
# fails in any other runtime (e.g. a Hugging Face Space); confirm where
# this script is meant to run.
from google.colab import files
# Interactively prompts the user to pick file(s); `upload` maps each
# uploaded filename to its contents.
upload = files.upload()
#necessary installations
# !pip install --upgrade langchain openai -q
# !pip install unstructured -q
# !pip install unstructured[local-inference] -q
# !pip install detectron2@git+https://github.com/facebookresearch/[email protected]#egg=detectron2 -q
# !apt-get install poppler-utils
# !pip install pinecone-client -q
#importing necessary modules
import os
import openai
import pinecone
from langchain.vectorstores import Pinecone

# SECURITY FIX: the original committed a live OpenAI secret key into source
# control AND stored it under "OpenAI_API_Key" — a variable name that
# neither the openai client nor langchain reads (both expect
# OPENAI_API_KEY). The leaked key must be revoked; supply a fresh one via
# the environment before running this script.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
from langchain.document_loaders import DirectoryLoader
#Providing directory to the file
# Name of the source PDF and the directory of documents to be indexed.
pdf = 'mod3.pdf'
# BUG FIX: the original called os.system(pdf), which tries to run the
# PDF's *filename* as a shell command — a no-op at best and a shell-
# injection hazard at worst. Removed; nothing downstream relied on it.
directory = '/content/Dir'
def load_docs(directory):
    """Load every document found under *directory*.

    Returns the list of langchain Document objects produced by the loader.
    """
    # BUG FIX: the original referenced PyPDFLoader, which is never imported
    # anywhere in this file (NameError at call time) and, in langchain,
    # takes a single PDF path rather than a directory. DirectoryLoader —
    # the loader actually imported above — walks a directory, which is
    # what this function is given.
    loader = DirectoryLoader(directory)
    documents = loader.load()
    return documents
# Load all documents from the configured directory.
documents = load_docs(directory)
# NOTE(review): bare expression — its value is shown in a notebook cell
# but silently discarded when run as a script; presumably Colab leftover.
len(documents)
#Splitting directory into chunks using RecursiveCharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
def split_docs(documents, chunk_size=1000, chunk_overlap=20):
    """Split *documents* into overlapping character chunks.

    chunk_size / chunk_overlap are forwarded to
    RecursiveCharacterTextSplitter; returns the resulting chunk list.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
# Chunk the loaded documents with the defaults (1000 chars, 20 overlap)
# and report how many chunks were produced.
docs = split_docs(documents)
print(len(docs))
#!pip install tiktoken -q
from langchain.embeddings.openai import OpenAIEmbeddings
#Creating embeddings for the chunks
#Creating embeddings for the chunks
# NOTE(review): "ada" is a legacy model alias — OpenAI's embeddings model
# is normally "text-embedding-ada-002"; confirm this alias resolves in the
# langchain version in use.
embeddings = OpenAIEmbeddings(model_name="ada")
# Smoke-test the embedding endpoint with a trivial query.
query_result = embeddings.embed_query("Hello world")
# Bare expression (embedding dimensionality); discarded outside a notebook.
len(query_result)
# Connect to Pinecone.
# SECURITY FIX: the original committed a live Pinecone API key into source
# control. Revoke it and provide credentials via the environment instead.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY", ""),
    environment=os.environ.get("PINECONE_ENVIRONMENT", "us-west4-gcp-free")
)
#creating a index in pinecone for storing the embeddings
# NOTE(review): Pinecone index names may only contain lowercase letters,
# digits and hyphens — "pdf_read" (underscore) is likely rejected by the
# service; confirm and rename to e.g. "pdf-read" if so.
index_name = "pdf_read"
# Embed every chunk and upsert the vectors into the named index; returns
# a langchain Pinecone vectorstore handle used for similarity search below.
index = Pinecone.from_documents(docs, embeddings, index_name=index_name)
#Checking similar texts
def get_similiar_docs(query, k=2, score=False):
    """Return the *k* chunks most similar to *query*.

    When *score* is true, each result is paired with its similarity score.
    (Name spelling kept as-is: callers elsewhere use `get_similiar_docs`.)
    """
    if score:
        return index.similarity_search_with_score(query, k=k)
    return index.similarity_search(query, k=k)
#Providing openAI model
from langchain.llms import OpenAI
# model_name = "text-davinci-003"
# model_name = "gpt-3.5-turbo"
model_name = "gpt-4"
# NOTE(review): "gpt-4" is a chat-completions model, while
# langchain.llms.OpenAI targets the legacy completions API — depending on
# the langchain version this warns and redirects or fails outright;
# ChatOpenAI (langchain.chat_models) is the intended class. Confirm.
llm = OpenAI(model_name=model_name)
#Chaining the relevant docs and query
from langchain.chains.question_answering import load_qa_chain
# "stuff" chain type: all retrieved chunks are concatenated into a single
# prompt for the LLM (simple, but bounded by the model's context window).
chain = load_qa_chain(llm, chain_type="stuff")
def get_answer(query):
    """Answer *query* from the indexed PDF content.

    Retrieves the most relevant chunks and runs the QA chain over them.
    """
    relevant_docs = get_similiar_docs(query)
    return chain.run(input_documents=relevant_docs, question=query)