# PDF question-answering pipeline (exported from a Google Colab notebook).
# NOTE: notebook UI residue ("Spaces", "Runtime error") removed from the export.
# Creating the working directory and uploading the source PDF (Colab-only upload UI).
from google.colab import files
upload = files.upload()
#necessary installations
# !pip install --upgrade langchain openai -q
# !pip install unstructured -q
# !pip install unstructured[local-inference] -q
# !pip install detectron2@git+https://github.com/facebookresearch/[email protected]#egg=detectron2 -q
# !apt-get install poppler-utils
# !pip install pinecone-client -q
#importing necessary modules
import os
import openai
import pinecone
from langchain.vectorstores import Pinecone
# SECURITY: never commit a real API key — this one is exposed and must be
# revoked; load it from the environment or a secret store instead.
# BUG FIX: the variable must be spelled exactly "OPENAI_API_KEY".
# Environment variables are case-sensitive and openai/langchain read only
# that exact name, so the original "OpenAI_API_Key" was silently ignored.
os.environ["OPENAI_API_KEY"] = "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"
from langchain.document_loaders import DirectoryLoader
# Providing the directory holding the uploaded file.
pdf = 'mod3.pdf'
directory = '/content/Dir'
# BUG FIX: the original called os.system(pdf), which tried to *execute* the
# PDF as a shell command. The comment above said "creating directory" — do that.
os.makedirs(directory, exist_ok=True)
def load_docs(directory):
    """Load every document found under *directory*.

    Args:
        directory: Path to a folder of files to ingest.

    Returns:
        List of langchain ``Document`` objects produced by the loader.
    """
    # BUG FIX: the original called PyPDFLoader, which is never imported
    # (NameError at call time) and expects a single file path, not a
    # directory. DirectoryLoader *is* imported at the top of this file and
    # walks the whole folder.
    loader = DirectoryLoader(directory)
    documents = loader.load()
    return documents
documents = load_docs(directory)
# FIX: a bare `len(documents)` only echoes a value inside a notebook cell;
# in a plain script it has no effect — print it so the count is visible.
print(len(documents))
#Splitting the documents into chunks using RecursiveCharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
def split_docs(documents, chunk_size=1000, chunk_overlap=20):
    """Split *documents* into overlapping text chunks.

    Args:
        documents: Sequence of langchain ``Document`` objects.
        chunk_size: Target size of each chunk, in characters.
        chunk_overlap: Characters shared between adjacent chunks.

    Returns:
        The list of chunked ``Document`` objects.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
docs = split_docs(documents)
print(len(docs))
#!pip install tiktoken -q
from langchain.embeddings.openai import OpenAIEmbeddings
# Creating embeddings for the chunks.
# BUG FIX: OpenAIEmbeddings selects its model via the `model` argument, and
# "ada" is a (deprecated) *completion* model, not an embedding model — the
# embedding model is "text-embedding-ada-002" (1536-dimensional output).
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
query_result = embeddings.embed_query("Hello world")
# FIX: bare `len(...)` only echoes in a notebook; print it in a script.
print(len(query_result))
pinecone.init(
    # SECURITY: hard-coded API key — revoke it and load from the
    # environment or a secret store instead of committing it.
    api_key="80e2dca6-e86a-4669-ad68-f751aaf739f4",
    environment="us-west4-gcp-free"
)
# Creating an index in Pinecone for storing the embeddings.
# BUG FIX: Pinecone index names may contain only lowercase letters, digits
# and hyphens — "pdf_read" (underscore) is rejected by the service.
index_name = "pdf-read"
# Pinecone.from_documents expects the index to already exist; create it on
# first run. 1536 is the output dimension of text-embedding-ada-002 —
# adjust if a different embedding model is used (TODO confirm model choice).
if index_name not in pinecone.list_indexes():
    pinecone.create_index(name=index_name, dimension=1536, metric="cosine")
index = Pinecone.from_documents(docs, embeddings, index_name=index_name)
#Checking similar texts
def get_similiar_docs(query, k=2, score=False):
    """Return the *k* documents in the index most similar to *query*.

    Args:
        query: Free-text query to embed and search with.
        k: Number of nearest documents to return.
        score: When True, each result is paired with its similarity score.

    Returns:
        List of documents, or (document, score) tuples when ``score`` is True.

    NOTE: the name keeps the original "similiar" spelling because callers
    elsewhere in this file use it.
    """
    search = index.similarity_search_with_score if score else index.similarity_search
    return search(query, k=k)
#Providing the OpenAI model
# BUG FIX: "gpt-4" is a *chat* model served by the chat-completions
# endpoint; langchain's completion wrapper `OpenAI` cannot call it and the
# request fails. ChatOpenAI speaks that endpoint and plugs into
# load_qa_chain the same way.
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
# model_name = "text-davinci-003"
# model_name = "gpt-3.5-turbo"
model_name = "gpt-4"
llm = ChatOpenAI(model_name=model_name)
#Chaining the relevant docs and query
from langchain.chains.question_answering import load_qa_chain
# "stuff" chain type concatenates ("stuffs") all retrieved documents into a
# single prompt — the simplest strategy; presumably the combined chunks fit
# in the model's context window (verify for large k / chunk sizes).
chain = load_qa_chain(llm, chain_type="stuff")
def get_answer(query):
    """Answer *query* using the most relevant documents from the index.

    Args:
        query: Natural-language question.

    Returns:
        The model's answer string produced by the QA chain.
    """
    relevant = get_similiar_docs(query)
    return chain.run(input_documents=relevant, question=query)