akashmishra358 committed on
Commit
5f11ab0
·
verified ·
1 Parent(s): 51f99e1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -0
app.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import BytesIO
2
+ import streamlit as st
3
+ from PyPDF2 import PdfReader
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain_openai import OpenAIEmbeddings
6
+ from langchain_community.vectorstores import Chroma
7
+ from langchain_openai import ChatOpenAI
8
+ from langchain.memory import ConversationBufferMemory
9
+ from langchain.chains import ConversationalRetrievalChain
10
+
11
def extract_text_from_pdf(pdf_file_obj):
    """Extract the concatenated text of every page of an uploaded PDF.

    Args:
        pdf_file_obj: a Streamlit ``UploadedFile`` (anything exposing
            ``.getbuffer()`` returning the raw PDF bytes).

    Returns:
        str: text of all pages joined in order; pages with no
        extractable text contribute an empty string.
    """
    pdf_reader = PdfReader(BytesIO(pdf_file_obj.getbuffer()))
    # Iterate pages directly and join once instead of quadratic `+=`
    # in an index loop; `or ""` guards pages where extract_text()
    # yields None/empty (e.g. image-only pages).
    return "".join((page.extract_text() or "") for page in pdf_reader.pages)
18
+
19
def get_text_chunks(text):
    """Split raw document text into overlapping chunks for embedding.

    Chunks are newline-separated, up to 1000 characters each with a
    200-character overlap, measured by plain ``len``.

    Args:
        text: the full document text.

    Returns:
        list[str]: the chunked text.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
28
+
29
def get_vectorstore(text_chunks):
    """Embed the text chunks with OpenAI and index them in Chroma.

    Args:
        text_chunks: list[str] of document chunks.

    Returns:
        Chroma: an in-memory vector store over the chunks, each tagged
        with a ``source`` metadata entry identifying its chunk index.
    """
    metadatas = [{"source": f"{i}-pl"} for i in range(len(text_chunks))]
    embeddings = OpenAIEmbeddings()
    # Bug fix: `metadatas` was built but never passed to Chroma, so the
    # per-chunk source tags were silently dropped. Attach them so
    # retrieved documents carry their provenance.
    vectorstore = Chroma.from_texts(
        texts=text_chunks, embedding=embeddings, metadatas=metadatas
    )
    return vectorstore
34
+
35
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over the given vector store.

    Pairs a ChatOpenAI model with a buffer memory keyed as
    ``chat_history`` so follow-up questions retain conversational context.

    Args:
        vectorstore: a vector store exposing ``as_retriever()``.

    Returns:
        ConversationalRetrievalChain: the ready-to-call chain.
    """
    chat_model = ChatOpenAI()
    history_buffer = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=vectorstore.as_retriever(),
        memory=history_buffer,
    )
46
+
47
def handle_userinput(user_question):
    """Run a question through the conversation chain and render the history.

    Stores the updated history in session state, then renders every
    message; the history alternates user/AI turns, so even indices are
    shown with a "User: " prefix and odd indices with "AI: ".

    Args:
        user_question: the question typed by the user.
    """
    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for idx, msg in enumerate(st.session_state.chat_history):
        prefix = "User: " if idx % 2 == 0 else "AI: "
        st.markdown(prefix + msg.content)
56
+
57
def main():
    """Streamlit entry point: upload a PDF, index it, then chat over it.

    Session state keys:
        conversation: the ConversationalRetrievalChain, or None until a
            PDF has been processed.
        chat_history: the message list returned by the chain, or None.
    """
    st.title("PDF Question Answering")
    # Initialise session state on the first script run.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    # Q&A section: only once a PDF has been indexed.
    if st.session_state.conversation is not None:
        st.header("Ask questions from your PDF")
        user_question = st.text_input("Ask a question about your document:")
        if user_question:
            handle_userinput(user_question)

    # Upload section: shown until a conversation chain exists.
    if st.session_state.conversation is None:
        st.header("Upload your PDF here")
        pdf_doc = st.file_uploader("Browse your file here", type="pdf")
        if pdf_doc is not None:
            with st.spinner("Processing"):
                # get pdf text
                raw_text = extract_text_from_pdf(pdf_doc)

                # get the text chunks
                text_chunks = get_text_chunks(raw_text)

                # create vector store
                vectorstore = get_vectorstore(text_chunks)

                # create conversation chain
                st.session_state.conversation = get_conversation_chain(
                    vectorstore)
            # Bug fix: the Q&A section above already rendered (skipped,
            # since conversation was still None at that point), so
            # without a rerun the question input stays hidden until the
            # user's next interaction. Rerun immediately to show it.
            st.rerun()
87
+
88
# Run the Streamlit app only when executed as a script, not on import.
if __name__ == '__main__':
    main()