Spaces:

fracapuano
/

AISandbox

Runtime error

App Files Files Community

fracapuano committed on Sep 5, 2023

Commit

4f5c619

1 Parent(s): 0e17089

fix: bug fixing inheritance

Browse files

Files changed (1) hide show

qa/qa.py +130 -79

qa/qa.py CHANGED Viewed

@@ -5,11 +5,63 @@ from typing import Text, Union
 multiple_files = True
-def clear_submit():
     """
-    Toggles the file_submitted internal session state variable to False.
     """
-    st.session_state["file_submitted"] = False
 def set_openai_api_key(api_key:Text)->bool:
     """Sets the internal OpenAI API key to the given value.
@@ -17,15 +69,14 @@ def set_openai_api_key(api_key:Text)->bool:
     Args:
         api_key (Text): OpenAI API key
     """
-    if not (api_key.startswith('sk-') and len(api_key)==51):
-        st.error("Invalid OpenAI API key! Please provide a valid key.")
-        return False
     st.session_state["OPENAI_API_KEY"] = api_key
     st.session_state["api_key_configured"] = True
     return True
-def file_to_doc(file:Union[PDFFile, DocxFile, TxtFile, CodeFile]) -> None:
     """Converts a file to a document using specialized parsers."""
     if file.name.endswith(".pdf"):
         doc = parse_pdf(file)
@@ -43,14 +94,9 @@ def file_to_doc(file:Union[PDFFile, DocxFile, TxtFile, CodeFile]) -> None:
 # def document_embedding_pipeline(file:Union[PDFFile, DocxFile, TxtFile, CodeFile]) -> None:
 def qa_main():
-    st.markdown("<h2>This app allows to chat with files!</h2>", unsafe_allow_html=True)
     st.write("Just upload something using and start chatting with a version of GPT4 that has read the file!")
-    index = None
-    doc = None
-    upload_document_greenlight = False
-    uploaded_processed_document_greenlight = False
     # OpenAI API Key - TODO: consider adding a key valid for everyone
     # st.header("Configure OpenAI API Key")
     # st.warning('Please enter your OpenAI API Key!', icon='⚠️')
@@ -63,88 +109,93 @@ def qa_main():
     #     help="You can get your API key from https://platform.openai.com/account/api-keys.",
     #     value=st.session_state.get("OPENAI_API_KEY", ""),
     # )
     user_secret = st.secrets["OPENAI_API_KEY"]
     if user_secret:
         if set_openai_api_key(user_secret):
-            st.success('OpenAI API key successfully accessed!', icon='✅')
-            upload_document_greenlight = True
-    if upload_document_greenlight:
         # File that needs to be queried
         st.header("Upload a file")
-        uploaded_file = st.file_uploader(
             "Upload a pdf, docx, or txt file (scanned documents not supported)",
             type=["pdf", "docx", "txt", "py", "json", "html", "css", "md"],
             help="Scanned documents are not supported yet 🥲",
-            on_change=clear_submit,
-            accept_multiple_files=multiple_files
         )
-        # reading the uploaded files
-        text = []
-        if len(uploaded_file) != 0:
-            # toggle internal file submission state to True
-            st.session_state["file_submitted"] = True
-            for file in uploaded_file:
-                # parse the file using custom parsers
-                file_doc = file_to_doc(file)
-                # converts the files into a list of documents
-                file_text = text_to_docs(text=tuple(file_doc), file_name=file.name)
-                text.extend(file_text)
-            # embeds the documents using OpenAI API
             try:
-                with st.spinner("Indexing the document... This might take a while!"):
-                    index = embed_docs(tuple(text))
-                    st.session_state["api_key_configured"] = True
             except OpenAIError as e:
                 st.error("OpenAI error encountered: ", e._message)
-            uploaded_processed_document_greenlight = True
-    if uploaded_processed_document_greenlight:
-        if "messages" not in st.session_state:
-            st.session_state["messages"] = []
-        for message in st.session_state.messages:
-            with st.chat_message(message["role"]):
-                st.markdown(message["content"])
-        if prompt := st.chat_input("Ask the document something..."):
-            st.session_state.messages.append({"role": "user", "content": prompt})
-            with st.chat_message("user"):
-                st.markdown(prompt)
-            with st.chat_message("assistant"):
-                message_placeholder = st.empty()
-                # retrieving the most relevant sources
-                sources = search_docs(index, prompt)
-                # producing the answer, live
-                full_response = ""
-                for answer_bit in get_answer(sources, prompt)["output_text"]:
-                    full_response += answer_bit
-                    message_placeholder.markdown(full_response + "▌")
-                message_placeholder.markdown(full_response)
-                # answer = get_answer(sources, prompt)
-                # message_placeholder.markdown(answer["output_text"])
-            # st.session_state.messages.append({"role": "assistant", "content": answer["output_text"]})
-            st.session_state.messages.append({"role": "assistant", "content": full_response})
-# This might be useful to add memory to the chatbot harnessing a more low-level approach
-# llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
-# memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True, output_key='answer')
-# retriever = your_vector_store.as_retriever()
-# # Create the multipurpose chain
-# qachat = ConversationalRetrievalChain.from_llm(
-#     llm=ChatOpenAI(temperature=0),
-#     memory=memory,
-#     retriever=retriever,
-#     return_source_documents=True
-# )
-# qachat("Ask your question here...")

 multiple_files = True
+def query_pipeline(index:VectorStore, query:Text, stream_answer:bool=False)->Text:
+    """This function reproduces the querying pipeline considering a given input index."""
+    # retrieving the most relevant pieces of information within the knowledge base
+    sources = search_docs(index, query=query)
+    # getting the answer, all at once
+    answer = get_answer(sources, query=query, stream_answer=stream_answer)["output_text"]
+    return answer
+def toggle_process_document():
+    """Toggles the greenlight for the next step in the pipeline, i.e. processing the document."""
+    if "processing_document_greenlight" not in st.session_state:
+        st.session_state["processing_document_greenlight"] = True
+    st.session_state["processing_document_greenlight"] = not st.session_state["processing_document_greenlight"]
+def register_new_file_name(file_name):
     """
+    Registers a new file name in the internal session state.
     """
+    if "uploaded_file_names" not in st.session_state:
+        st.session_state["uploaded_file_names"] = []
+    st.session_state["uploaded_file_names"].append(file_name)
+def clear_index():
+    """
+    Clears the index from the internal session state.
+    This is a non reversible operation.
+    """
+    if "index" in st.session_state:
+        del globals()["index"]
+def clear_session_state():
+    """
+    Clears the session state iterating over keys.
+    This is a non reversible operation.
+    """
+    for k in st.session_state.keys():
+        del st.session_state[k]
+def register_new_file(new_file):
+    """
+    Registers a new file in the internal session state.
+    """
+    if "uploaded_files" not in st.session_state:
+        st.session_state["uploaded_files"] = []
+    st.session_state["uploaded_files"].extend(new_file)
+def clear_all_files():
+    """Removes all uploaded files from the interal session state."""
+    st.session_state["uploaded_files"] = []
+def append_uploaded_files(file):
+    """Appends the uploaded files to the internal session state."""
+    st.session_state.get("uploaded_files", []).extend(file)
 def set_openai_api_key(api_key:Text)->bool:
     """Sets the internal OpenAI API key to the given value.
     Args:
         api_key (Text): OpenAI API key
     """
+    if not check_openai_api_key(api_key=api_key):
+        raise ValueError("Invalid OpenAI API key! Please provide a valid key.")
     st.session_state["OPENAI_API_KEY"] = api_key
     st.session_state["api_key_configured"] = True
     return True
+def parse_file(file:Union[PDFFile, DocxFile, TxtFile, CodeFile]) -> None:
     """Converts a file to a document using specialized parsers."""
     if file.name.endswith(".pdf"):
         doc = parse_pdf(file)
 # def document_embedding_pipeline(file:Union[PDFFile, DocxFile, TxtFile, CodeFile]) -> None:
 def qa_main():
+    """Main function for the QA app."""
     st.write("Just upload something using and start chatting with a version of GPT4 that has read the file!")
     # OpenAI API Key - TODO: consider adding a key valid for everyone
     # st.header("Configure OpenAI API Key")
     # st.warning('Please enter your OpenAI API Key!', icon='⚠️')
     #     help="You can get your API key from https://platform.openai.com/account/api-keys.",
     #     value=st.session_state.get("OPENAI_API_KEY", ""),
     # )
     user_secret = st.secrets["OPENAI_API_KEY"]
     if user_secret:
         if set_openai_api_key(user_secret):
+            # removing this when the OpenAI API key is hardcoded
+            # st.success('OpenAI API key successfully accessed!', icon='✅')
+            # greenlight for next step, i.e. uploading the document to chat with
+            st.session_state["upload_document_greenlight"] = True
+    if st.session_state.get("upload_document_greenlight"):
         # File that needs to be queried
         st.header("Upload a file")
+        st.file_uploader(
             "Upload a pdf, docx, or txt file (scanned documents not supported)",
             type=["pdf", "docx", "txt", "py", "json", "html", "css", "md"],
             help="Scanned documents are not supported yet 🥲",
+            accept_multiple_files=multiple_files,
+            #on_change=toggle_process_document,
+            key="uploaded_file"
         )
+    documents = {}
+    indexes = {}
+    for file in st.session_state["uploaded_file"]:
+        parsed_file = parse_file(file)
+        # converts the files into a list of documents
+        document = text_to_docs(pages=tuple(parsed_file), file_name=file.name)
+        documents[file.name] = document
+        with st.spinner(f"Indexing {file.name} (might take some time)"):
             try:
+                # indexing the document uploaded
+                indexes[file.name] = embed_docs(file_name=file.name, _docs=tuple(document))
             except OpenAIError as e:
                 st.error("OpenAI error encountered: ", e._message)
+    if len(documents)>1:
+        # documents to be indexed when providing the query
+        st.multiselect(
+            label="Select the documents to be indexed",
+            options=list(documents.keys()),
+            key="multiselect_documents_choices",
+        )
+    elif len(documents)==1:
+        st.session_state["multiselect_documents_choices"] = [list(documents.keys())[0]]
+    # this is the code that actually performs the chat process
+    if "messages" not in st.session_state:  # checking if there is any cache history
+        st.session_state["messages"] = []
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"], unsafe_allow_html=True)
+    if prompt:=st.chat_input("Ask the document something..."):
+        if prompt=="1":
+            prompt="What is this document about?"
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        with st.chat_message("user"):
+            st.markdown(prompt)
+        with st.chat_message("assistant"):
+            # full_response will store every question asked to all the document(s) considered
+            full_response = ""
+            message_placeholder = st.empty()
+            # asking the same question to all of the documents considered
+            for chat_document in st.session_state["multiselect_documents_choices"]:
+                # keeping track of what is asked to what document
+                full_response += \
+                    f"<i>Asking</i> <b>{chat_document}</b> <i>question</i> <b>{prompt}</b></i><br>"
+                message_placeholder.markdown(full_response, unsafe_allow_html=True)
+                # retrieving the vector store associated to the chat document considered
+                chat_index = indexes[chat_document]
+                # producing the answer considered, live
+                for answer_bit in query_pipeline(chat_index, prompt, stream_answer=True):
+                    full_response += answer_bit
+                    message_placeholder.markdown(full_response + "▌", unsafe_allow_html=True)
+                # appending a final entering
+                full_response += "<br>"
+            message_placeholder.markdown(full_response, unsafe_allow_html=True)
+        # appending the final response obtained after having asked all the documents
+        st.session_state.messages.append({"role": "assistant", "content": full_response})