Sreekumar1608 committed on
Commit
443c2d6
·
1 Parent(s): b923833

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -85
app.py CHANGED
@@ -1,89 +1,85 @@
1
- #uploading the PDF from the local machine (Colab only)
2
-
3
- from google.colab import files
4
- upload = files.upload()
5
-
6
- #necessary installations
7
- # !pip install --upgrade langchain openai -q
8
- # !pip install unstructured -q
9
- # !pip install unstructured[local-inference] -q
10
- # !pip install detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2 -q
11
- # !apt-get install poppler-utils
12
- # !pip install pinecone-client -q
13
-
14
- #importing necessary modules
15
- import os
16
- import openai
17
- import pinecone
18
- from langchain.vectorstores import Pinecone
19
# SECURITY: an OpenAI secret key used to be hard-coded on this line. It has been
# published with the repository history and must be revoked. The key is now
# expected to be supplied through the process environment (or a secrets store).
#
# The OpenAI client reads the upper-case variable OPENAI_API_KEY, and environment
# variable names are case-sensitive on Linux, so the mixed-case name that was
# assigned here was never picked up. Carry a value exported under the legacy
# name over to the correct one so existing setups keep working.
if "OPENAI_API_KEY" not in os.environ and "OpenAI_API_Key" in os.environ:
    os.environ["OPENAI_API_KEY"] = os.environ["OpenAI_API_Key"]
20
- from langchain.document_loaders import DirectoryLoader
21
-
22
- #Providing the path to the file
23
# Name of the uploaded PDF and the working directory used on Colab.
# NOTE: the former `os.system(pdf)` call was dropped. It passed the file name
# to the shell as a command line, which does not open or register the PDF and
# only produces a shell error on Linux.
pdf = 'mod3.pdf'
directory = '/content/Dir'
26
-
27
def load_docs(directory):
    """Load PDF content found at *directory* as LangChain documents.

    Args:
        directory: Path to a single PDF file or to a folder containing PDFs.

    Returns:
        Whatever the selected loader's ``load()`` returns.
    """
    # Imported locally: the file never imported a PDF loader, so the original
    # reference to PyPDFLoader raised NameError.
    from langchain.document_loaders import PyPDFDirectoryLoader, PyPDFLoader

    # PyPDFLoader handles exactly one file; a folder needs the directory loader.
    if os.path.isdir(directory):
        loader = PyPDFDirectoryLoader(directory)
    else:
        loader = PyPDFLoader(directory)
    return loader.load()


documents = load_docs(directory)
# A bare `len(documents)` only displays a value in a notebook cell; print it so
# the count is also visible when run as a script.
print(len(documents))
33
-
34
- #Splitting documents into chunks using RecursiveCharacterTextSplitter
35
  from langchain.text_splitter import RecursiveCharacterTextSplitter
36
-
37
def split_docs(documents, chunk_size=1000, chunk_overlap=20):
    """Split *documents* into chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split, as produced by a LangChain loader.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.

    Returns:
        The result of the splitter's ``split_documents`` call.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)


docs = split_docs(documents)
print(len(docs))
44
-
45
- #!pip install tiktoken -q
46
-
47
  from langchain.embeddings.openai import OpenAIEmbeddings
48
-
49
#Creating embeddings for the chunks
embeddings = OpenAIEmbeddings(model_name="ada")

# Smoke test: embed a short string so a bad key or model name fails here,
# before any document is sent to the vector store.
query_result = embeddings.embed_query("Hello world")
print(len(query_result))

# SECURITY: the Pinecone API key used to be hard-coded in this call. It has been
# published with the repository history and must be revoked. Credentials are
# now read from the environment; a missing key fails fast with KeyError.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ.get("PINECONE_ENVIRONMENT", "us-west4-gcp-free"),
)

#Name of the Pinecone index that stores the embeddings
# NOTE(review): Pinecone documents index names as lower-case alphanumerics and
# '-' only; confirm that "pdf_read" is accepted or rename the index.
index_name = "pdf_read"

index = Pinecone.from_documents(docs, embeddings, index_name=index_name)
64
-
65
#Checking similar texts
def get_similiar_docs(query, k=2, score=False):
    """Return the *k* indexed chunks most similar to *query*.

    Args:
        query: Text to search the vector index with.
        k: Number of results requested from the index.
        score: When true, use ``similarity_search_with_score`` instead of
            ``similarity_search``.

    Returns:
        The result of the selected search method on the module-level ``index``.
    """
    search = index.similarity_search_with_score if score else index.similarity_search
    return search(query, k=k)
72
-
73
- #Providing openAI model
74
  from langchain.llms import OpenAI
75
-
76
# Model used to answer questions. "text-davinci-003" and "gpt-3.5-turbo" were
# the previously tried alternatives.
model_name = "gpt-4"
llm = OpenAI(model_name=model_name)
80
-
81
- #Chaining the relevant docs and query
82
  from langchain.chains.question_answering import load_qa_chain
83
-
84
# "stuff" chain type, built once and reused for every question.
chain = load_qa_chain(llm, chain_type="stuff")


def get_answer(query):
    """Answer *query* from the chunks that get_similiar_docs returns for it.

    Args:
        query: The user's question.

    Returns:
        The value returned by ``chain.run`` for the retrieved chunks and query.
    """
    context_docs = get_similiar_docs(query)
    return chain.run(input_documents=context_docs, question=query)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
3
+ import pickle
4
+ from PyPDF2 import PdfReader
5
+ from streamlit_extras.add_vertical_space import add_vertical_space
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
 
 
 
 
 
 
 
 
 
7
  from langchain.embeddings.openai import OpenAIEmbeddings
8
+ from langchain.vectorstores import FAISS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  from langchain.llms import OpenAI
 
 
 
 
 
 
 
10
  from langchain.chains.question_answering import load_qa_chain
11
+ from langchain.callbacks import get_openai_callback
12
+ import os
13
+
14
# Sidebar: app title, a short "About" section and the author credit.
with st.sidebar:
    st.title("🤗💬 LLM Chat App")
    st.markdown('''
    ## About
    This app is an LLM-powered chatbot built using:
    - [Streamlit](https://streamlit.io/)
    - [LangChain](https://python.langchain.com/)
    - [OpenAI](https://platform.openai.com/docs/models) LLM model

    ''')
    # Spacer between the description and the credit line.
    add_vertical_space(5)
    st.write("Made with ❤️ by [Prompt Engineer](https://youtube.com/@engineerprompt)")

# Load variables from a local .env file into the environment.
load_dotenv()
29
+
30
def _extract_text(pdf_reader):
    """Return the text of every page of *pdf_reader*, concatenated in page order."""
    # `or ""` keeps the join safe if a page yields no text (None) on extraction.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def _cache_file(filename, payload):
    """Return the pickle file name that caches the vector store for one upload.

    The name combines the upload's base name (extension removed) with a digest
    of its bytes, so two different PDFs that share a file name never reuse each
    other's embeddings.
    """
    import hashlib

    store_name = os.path.splitext(os.path.basename(filename))[0]
    digest = hashlib.sha256(payload).hexdigest()[:16]
    return f"{store_name}-{digest}.pkl"


def main():
    """Render the "Chat with PDF" page.

    Lets the user upload a PDF, splits its text into chunks, builds a FAISS
    vector store for them (or loads it from the on-disk pickle cache), and
    answers the question typed by the user with a "stuff" QA chain over the
    three most similar chunks.
    """
    st.header("Chat with PDF 💬")

    # upload a PDF file
    pdf = st.file_uploader("Upload your PDF", type='pdf')
    if pdf is None:
        return

    text = _extract_text(PdfReader(pdf))

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = text_splitter.split_text(text=text)
    if not chunks:
        # Nothing to embed (e.g. a scanned, image-only PDF); stop here rather
        # than handing an empty list to the vector store.
        st.warning("No extractable text was found in this PDF.")
        return

    # embeddings
    store_name = os.path.splitext(os.path.basename(pdf.name))[0]
    st.write(f'{store_name}')

    cache_file = _cache_file(pdf.name, pdf.getvalue())
    if os.path.exists(cache_file):
        # NOTE(security): unpickling runs code stored in the file. This is only
        # acceptable because the cache is written by this app; never point it
        # at files from an untrusted location.
        with open(cache_file, "rb") as f:
            vector_store = pickle.load(f)
    else:
        embeddings = OpenAIEmbeddings()
        vector_store = FAISS.from_texts(chunks, embedding=embeddings)
        with open(cache_file, "wb") as f:
            pickle.dump(vector_store, f)

    # Accept user questions/query
    query = st.text_input("Ask questions about your PDF file:")
    if not query:
        return

    docs = vector_store.similarity_search(query=query, k=3)

    llm = OpenAI()
    chain = load_qa_chain(llm=llm, chain_type="stuff")
    with get_openai_callback() as cb:
        response = chain.run(input_documents=docs, question=query)
        # Callback summary goes to the server console, not the page.
        print(cb)
    st.write(response)
83
+
84
# Run the page only when this file is executed as the entry script.
if __name__ == "__main__":
    main()