captain-awesome
committed
Commit f18103b
1 Parent(s): d9b4100
Update app.py
app.py CHANGED
@@ -106,8 +106,17 @@ def create_vector_database(loaded_documents):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=30, length_function = len)
     chunked_documents = text_splitter.split_documents(loaded_documents)

+    # embeddings = HuggingFaceBgeEmbeddings(
+    #     model_name = "BAAI/bge-large-en"
+    # )
+
+    model_name = "BAAI/bge-large-en"
+    model_kwargs = {'device': 'cpu'}
+    encode_kwargs = {'normalize_embeddings': False}
     embeddings = HuggingFaceBgeEmbeddings(
-        model_name = "BAAI/bge-large-en"
+        model_name=model_name,
+        model_kwargs=model_kwargs,
+        encode_kwargs=encode_kwargs
     )

     persist_directory = 'db'
@@ -122,3 +131,176 @@ def create_vector_database(loaded_documents):
     # db = Chroma(persist_directory=persist_directory,
     #             embedding_function=embedding)
     return db
+
+def set_custom_prompt():
+    """
+    Prompt template for retrieval for each vectorstore
+    """
+    prompt_template = """Use the following pieces of information to answer the user's question.
+If you don't know the answer, just say that you don't know, don't try to make up an answer.
+Context: {context}
+Question: {question}
+Only return the helpful answer below and nothing else.
+Helpful answer:
+"""
+
+    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
+    return prompt
+
+def create_chain(llm, prompt, db):
+    """
+    Creates a Retrieval Question-Answering (QA) chain using a given language model, prompt, and database.
+    This function initializes a ConversationalRetrievalChain object with a specific chain type and configurations,
+    and returns this chain. The retriever is set up to return the top 3 results (k=3).
+    Args:
+        llm (any): The language model to be used in the RetrievalQA.
+        prompt (str): The prompt to be used in the chain type.
+        db (any): The database to be used as the
+        retriever.
+    Returns:
+        ConversationalRetrievalChain: The initialized conversational chain.
+    """
+    memory = ConversationTokenBufferMemory(llm=llm, memory_key="chat_history", return_messages=True, input_key='question', output_key='answer')
+    # chain = ConversationalRetrievalChain.from_llm(
+    #     llm=llm,
+    #     chain_type="stuff",
+    #     retriever=db.as_retriever(search_kwargs={"k": 3}),
+    #     return_source_documents=True,
+    #     max_tokens_limit=256,
+    #     combine_docs_chain_kwargs={"prompt": prompt},
+    #     condense_question_prompt=CONDENSE_QUESTION_PROMPT,
+    #     memory=memory,
+    # )
+    chain = RetrievalQA.from_chain_type(llm=llm,
+                                        chain_type='stuff',
+                                        retriever=db.as_retriever(search_kwargs={'k': 3}),
+                                        return_source_documents=True,
+                                        chain_type_kwargs={'prompt': prompt}
+                                        )
+    return chain
+
+def create_retrieval_qa_bot(loaded_documents):
+    # if not os.path.exists(persist_dir):
+    #     raise FileNotFoundError(f"No directory found at {persist_dir}")
+
+    try:
+        llm = load_model()  # Assuming this function exists and works as expected
+    except Exception as e:
+        raise Exception(f"Failed to load model: {str(e)}")
+
+    try:
+        prompt = set_custom_prompt()  # Assuming this function exists and works as expected
+    except Exception as e:
+        raise Exception(f"Failed to get prompt: {str(e)}")
+
+    # try:
+    #     CONDENSE_QUESTION_PROMPT = set_custom_prompt_condense()  # Assuming this function exists and works as expected
+    # except Exception as e:
+    #     raise Exception(f"Failed to get condense prompt: {str(e)}")
+
+    try:
+        db = create_vector_database(loaded_documents)  # Assuming this function exists and works as expected
+    except Exception as e:
+        raise Exception(f"Failed to get database: {str(e)}")
+
+    try:
+        # qa = create_chain(
+        #     llm=llm, prompt=prompt,CONDENSE_QUESTION_PROMPT=CONDENSE_QUESTION_PROMPT, db=db
+        # )  # Assuming this function exists and works as expected
+        qa = create_chain(
+            llm=llm, prompt=prompt, db=db
+        )  # Assuming this function exists and works as expected
+    except Exception as e:
+        raise Exception(f"Failed to create retrieval QA chain: {str(e)}")
+
+    return qa
+
+def retrieve_bot_answer(query, loaded_documents):
+    """
+    Retrieves the answer to a given query using a QA bot.
+    This function creates an instance of a QA bot, passes the query to it,
+    and returns the bot's response.
+    Args:
+        query (str): The question to be answered by the QA bot.
+    Returns:
+        dict: The QA bot's response, typically a dictionary with response details.
+    """
+    qa_bot_instance = create_retrieval_qa_bot(loaded_documents)
+    # bot_response = qa_bot_instance({"question": query})
+    bot_response = qa_bot_instance({"query": query})
+    # Check if the 'answer' key exists in the bot_response dictionary
+    # if 'answer' in bot_response:
+    #     # answer = bot_response['answer']
+    #     return bot_response
+    # else:
+    #     raise KeyError("Expected 'answer' key in bot_response, but it was not found.")
+    # result = bot_response['answer']
+    result = bot_response['result']
+    sources = []
+    for source in bot_response["source_documents"]:
+        sources.append(source.metadata['source'])
+    return result, sources
+
+def main():
+
+    st.title("Docuverse")
+
+    # Upload files
+    uploaded_files = st.file_uploader("Upload your documents", type=["pdf", "md", "txt", "csv", "py", "epub", "html", "ppt", "pptx", "doc", "docx", "odt", "ipynb"], accept_multiple_files=True)
+    loaded_documents = []
+
+    if uploaded_files:
+        # Create a temporary directory
+        with tempfile.TemporaryDirectory() as td:
+            # Move the uploaded files to the temporary directory and process them
+            for uploaded_file in uploaded_files:
+                st.write(f"Uploaded: {uploaded_file.name}")
+                ext = os.path.splitext(uploaded_file.name)[-1][1:].lower()
+                st.write(f"Uploaded: {ext}")
+
+                # Check if the extension is in FILE_LOADER_MAPPING
+                if ext in FILE_LOADER_MAPPING:
+                    loader_class, loader_args = FILE_LOADER_MAPPING[ext]
+                    # st.write(f"loader_class: {loader_class}")
+
+                    # Save the uploaded file to the temporary directory
+                    file_path = os.path.join(td, uploaded_file.name)
+                    with open(file_path, 'wb') as temp_file:
+                        temp_file.write(uploaded_file.read())
+
+                    # Use Langchain loader to process the file
+                    loader = loader_class(file_path, **loader_args)
+                    loaded_documents.extend(loader.load())
+                else:
+                    st.warning(f"Unsupported file extension: {ext}")
+
+        # st.write(f"loaded_documents: {loaded_documents}")
+        st.write("Chat with the Document:")
+        query = st.text_input("Ask a question:")
+
+        if st.button("Get Answer"):
+            if query:
+                # Load model, set prompts, create vector database, and retrieve answer
+                try:
+                    start = timeit.default_timer()
+                    llm = load_model()
+                    prompt = set_custom_prompt()
+                    CONDENSE_QUESTION_PROMPT = set_custom_prompt_condense()
+                    db = create_vector_database(loaded_documents)
+                    # st.write(f"db: {db}")
+                    result, sources = retrieve_bot_answer(query,loaded_documents)
+                    end = timeit.default_timer()
+                    st.write("Elapsed time:")
+                    st.write(end - start)
+                    # st.write(f"response: {response}")
+                    # Display bot response
+                    st.write("Bot Response:")
+                    st.write(result)
+                    st.write(sources)
+                except Exception as e:
+                    st.error(f"An error occurred: {str(e)}")
+            else:
+                st.warning("Please enter a question.")
+
+if __name__ == "__main__":
+    main()
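For context, a minimal standalone sketch of the embedding setup this commit switches to in create_vector_database(). It assumes the langchain and sentence-transformers packages that app.py already relies on are installed; the query string and printed dimensionality are only illustrative:

```python
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Same configuration as the new code in create_vector_database():
# BGE large model, CPU inference, unnormalised output vectors.
embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-large-en",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": False},
)

vector = embeddings.embed_query("What is Docuverse?")  # list of floats
print(len(vector))  # 1024 for bge-large-en
```

And a hedged end-to-end sketch of the retrieval path added here (create_retrieval_qa_bot -> retrieve_bot_answer) without the Streamlit UI. It assumes app.py is importable as a module and that its load_model() helper and FILE_LOADER_MAPPING (defined earlier in the file, outside this diff) work on the local machine; the Document contents and file name are placeholders:

```python
from langchain.docstore.document import Document

from app import retrieve_bot_answer  # hypothetical import of this Space's app.py

# Placeholder documents standing in for files a user would normally upload.
loaded_documents = [
    Document(page_content="Docuverse lets you chat with uploaded documents.",
             metadata={"source": "notes.txt"}),
]

# Builds BGE embeddings, a Chroma store, and a RetrievalQA chain, then queries it.
result, sources = retrieve_bot_answer("What does Docuverse do?", loaded_documents)
print(result)
print(sources)  # e.g. ['notes.txt']
```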