Fangrui Liu committed
Commit eb820e1 · Parent(s): 1a24bbc

revised prompt
Files changed (3)
  1. app.py +65 -52
  2. chains/arxiv_chains.py +131 -0
  3. prompts/arxiv_prompt.py +7 -8
app.py CHANGED
@@ -10,15 +10,12 @@ from langchain.vectorstores import MyScale, MyScaleSettings
 from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.retrievers.self_query.base import SelfQueryRetriever
 from langchain.chains.query_constructor.base import AttributeInfo
-from langchain.chains import RetrievalQAWithSourcesChain
 from langchain import OpenAI
 from langchain.chat_models import ChatOpenAI
 
-from prompts.arxiv_prompt import combine_prompt_template, _myscale_prompt
-from callbacks.arxiv_callbacks import ChatDataSelfSearchCallBackHandler, \
-    ChatDataSelfAskCallBackHandler, ChatDataSQLSearchCallBackHandler, \
-    ChatDataSQLAskCallBackHandler
 from langchain.prompts.prompt import PromptTemplate
+from langchain.prompts import PromptTemplate, ChatPromptTemplate, \
+    SystemMessagePromptTemplate, HumanMessagePromptTemplate
 from sqlalchemy import create_engine, MetaData
 from langchain.chains.sql_database.base import SQLDatabaseChain
 from langchain.chains.sql_database.parser import VectorSQLRetrieveAllOutputParser
@@ -26,12 +23,17 @@ from langchain.chains import LLMChain
 from langchain.sql_database import SQLDatabase
 from langchain.retrievers import SQLDatabaseChainRetriever
 
+from chains.arxiv_chains import ArXivQAwithSourcesChain, ArXivStuffDocumentChain
+from callbacks.arxiv_callbacks import ChatDataSelfSearchCallBackHandler, \
+    ChatDataSelfAskCallBackHandler, ChatDataSQLSearchCallBackHandler, \
+    ChatDataSQLAskCallBackHandler
+from prompts.arxiv_prompt import combine_prompt_template, _myscale_prompt
 
 st.set_page_config(page_title="ChatData")
 
 st.header("ChatData")
 
-columns = ['title', 'id', 'categories', 'abstract', 'authors', 'pubdate']
+columns = ['ref_id', 'title', 'id', 'categories', 'abstract', 'authors', 'pubdate']
 
 
 def try_eval(x):
@@ -41,7 +43,9 @@ def try_eval(x):
     return x
 
 
-def display(dataframe, columns=None):
+def display(dataframe, columns=None, index=None):
+    if index:
+        dataframe = dataframe.set_index(index)
     if len(dataframe) > 0:
         if columns:
             st.dataframe(dataframe[columns])
@@ -108,24 +112,35 @@ def build_retriever():
         doc_search, "Scientific papers indexes with abstracts. All in English.", metadata_field_info,
         use_original_query=False)
 
-    with st.spinner('Building RetrievalQAWithSourcesChain...'):
-        document_with_metadata_prompt = PromptTemplate(
-            input_variables=["page_content", "id", "title",
-                             "authors", "pubdate", "categories"],
-            template="Content:\n\tTitle: {title}\n\tAbstract: {page_content}\n\tAuthors: {authors}\n\tDate of Publication: {pubdate}\n\tCategories: {categories}\nSOURCE: {id}")
-        COMBINE_PROMPT = PromptTemplate(
-            template=combine_prompt_template, input_variables=["summaries", "question"])
-        chain = RetrievalQAWithSourcesChain.from_chain_type(
-            ChatOpenAI(model_name='gpt-3.5-turbo-16k',
-                       openai_api_key=st.secrets['OPENAI_API_KEY'], temperature=0.6),
+
+    document_with_metadata_prompt = PromptTemplate(
+        input_variables=["page_content", "id", "title", "ref_id",
+                         "authors", "pubdate", "categories"],
+        template="Title for PDF #{ref_id}: {title}\n\tAbstract: {page_content}\n\tAuthors: {authors}\n\tDate of Publication: {pubdate}\n\tCategories: {categories}\nSOURCE: {id}")
+
+    COMBINE_PROMPT = ChatPromptTemplate.from_strings(
+        string_messages=[(SystemMessagePromptTemplate, combine_prompt_template),
+                         (HumanMessagePromptTemplate, '{question}')])
+    OPENAI_API_KEY = st.secrets['OPENAI_API_KEY']
+
+    with st.spinner('Building QA Chain with Self-query...'):
+        chain = ArXivQAwithSourcesChain(
             retriever=retriever,
-            chain_type='stuff',
-            chain_type_kwargs={
-                'prompt': COMBINE_PROMPT,
-                'document_prompt': document_with_metadata_prompt,
-            }, return_source_documents=True)
+            combine_documents_chain=ArXivStuffDocumentChain(
+                llm_chain=LLMChain(
+                    prompt=COMBINE_PROMPT,
+                    llm=ChatOpenAI(model_name='gpt-3.5-turbo-16k',
+                                   openai_api_key=OPENAI_API_KEY, temperature=0.6),
+                ),
+                document_prompt=document_with_metadata_prompt,
+                document_variable_name="summaries",
 
-    with st.spinner('Building Vector SQL Database Chain'):
+            ),
+            return_source_documents=True,
+            max_tokens_limit=12000,
+        )
+
+    with st.spinner('Building Vector SQL Database Retriever'):
         MYSCALE_USER = st.secrets['MYSCALE_USER']
         MYSCALE_PASSWORD = st.secrets['MYSCALE_PASSWORD']
         MYSCALE_HOST = st.secrets['MYSCALE_HOST']
@@ -141,7 +156,7 @@ def build_retriever():
         output_parser = VectorSQLRetrieveAllOutputParser.from_embeddings(
            model=embeddings)
         sql_query_chain = SQLDatabaseChain.from_llm(
-            llm=OpenAI(openai_api_key=st.secrets['OPENAI_API_KEY'], temperature=0),
+            llm=OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0),
            prompt=PROMPT,
            top_k=10,
            return_direct=True,
@@ -151,15 +166,23 @@ def build_retriever():
         )
         sql_retriever = SQLDatabaseChainRetriever(
            sql_db_chain=sql_query_chain, page_content_key="abstract")
-    sql_chain = RetrievalQAWithSourcesChain.from_chain_type(
-        ChatOpenAI(model_name='gpt-3.5-turbo-16k',
-                   openai_api_key=st.secrets['OPENAI_API_KEY'], temperature=0.6),
+
+    with st.spinner('Building QA Chain with Vector SQL...'):
+        sql_chain = ArXivQAwithSourcesChain(
             retriever=sql_retriever,
-        chain_type='stuff',
-        chain_type_kwargs={
-            'prompt': COMBINE_PROMPT,
-            'document_prompt': document_with_metadata_prompt,
-        }, return_source_documents=True)
+            combine_documents_chain=ArXivStuffDocumentChain(
+                llm_chain=LLMChain(
+                    prompt=COMBINE_PROMPT,
+                    llm=ChatOpenAI(model_name='gpt-3.5-turbo-16k',
+                                   openai_api_key=OPENAI_API_KEY, temperature=0.6),
+                ),
+                document_prompt=document_with_metadata_prompt,
+                document_variable_name="summaries",
 
+            ),
+            return_source_documents=True,
+            max_tokens_limit=12000,
+        )
 
     return [{'name': m.name, 'desc': m.description, 'type': m.type} for m in metadata_field_info], retriever, chain, sql_retriever, sql_chain
 
@@ -220,7 +243,7 @@ ENGINE = ReplacingMergeTree ORDER BY id
             display(docs)
         except Exception as e:
             st.write('Oops 😡 Something bad happened...')
-            # raise e
+            raise e
 
     if st.session_state.ask_sql:
         plc_hldr = st.empty()
@@ -233,17 +256,12 @@ ENGINE = ReplacingMergeTree ORDER BY id
             callback.progress_bar.progress(value=1.0, text="Done!")
             st.markdown(
                 f"### Answer from LLM\n{ret['answer']}\n### References")
-            docs = ret['source_documents']
-            ref = re.findall(
-                '(http://arxiv.org/abs/\d{4}.\d+v\d)', ret['sources'])
-            ref += re.findall(
-                '(http://arxiv.org/abs/\d{4}.\d+v\d)', ret['answer'])
-            docs = pd.DataFrame([{**d.metadata, 'abstract': d.page_content}
-                                 for d in docs if d.metadata['id'] in set(ref)])
-            display(docs, columns)
+            docs = ret['sources']
+            docs = pd.DataFrame([{**d.metadata, 'abstract': d.page_content} for d in docs])
+            display(docs, columns, index='ref_id')
         except Exception as e:
             st.write('Oops 😡 Something bad happened...')
-            # raise e
+            raise e
 
 
 with tab_self_query:
@@ -270,7 +288,7 @@ with tab_self_query:
             display(docs, columns)
         except Exception as e:
             st.write('Oops 😡 Something bad happened...')
-            # raise e
+            raise e
 
     if st.session_state.ask_self:
         plc_hldr = st.empty()
@@ -284,14 +302,9 @@ with tab_self_query:
             callback.progress_bar.progress(value=1.0, text="Done!")
             st.markdown(
                 f"### Answer from LLM\n{ret['answer']}\n### References")
-            docs = ret['source_documents']
-            ref = re.findall(
-                '(http://arxiv.org/abs/\d{4}.\d+v\d)', ret['sources'])
-            ref += re.findall(
-                '(http://arxiv.org/abs/\d{4}.\d+v\d)', ret['answer'])
-            docs = pd.DataFrame([{**d.metadata, 'abstract': d.page_content}
-                                 for d in docs if d.metadata['id'] in set(ref)])
-            display(docs, columns)
+            docs = ret['sources']
+            docs = pd.DataFrame([{**d.metadata, 'abstract': d.page_content} for d in docs])
+            display(docs, columns, index='ref_id')
         except Exception as e:
             st.write('Oops 😡 Something bad happened...')
-            # raise e
+            raise e
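
Taken together, the app.py changes drop the regex-over-URLs reference matching: `ret['sources']` now carries LangChain `Document` objects whose metadata already includes the `ref_id` assigned by the new chain. A minimal sketch of the new rendering path follows; the stub `Doc` class and all sample values are made up for illustration, and only the pandas calls mirror the committed code:

```python
import pandas as pd

# Stand-in for langchain.docstore.document.Document (illustration only).
class Doc:
    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata

# Assumed shape of the dict returned by ArXivQAwithSourcesChain.
ret = {
    'answer': 'The Transformer was introduced in Attention Is All You Need [1].',
    'sources': [Doc('We propose a new simple network architecture...',
                    {'ref_id': 1, 'title': 'Attention Is All You Need',
                     'id': 'http://arxiv.org/abs/1706.03762v5',
                     'authors': 'Vaswani et al.', 'pubdate': '2017-06-12',
                     'categories': 'cs.CL'})],
}

# Mirrors the new app.py path: no regex over ret['sources'] is needed.
docs = pd.DataFrame([{**d.metadata, 'abstract': d.page_content}
                     for d in ret['sources']])
print(docs.set_index('ref_id'))  # what display(docs, columns, index='ref_id') renders
```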
chains/arxiv_chains.py ADDED
@@ -0,0 +1,131 @@
+import re
+import inspect
+from typing import Dict, Any, Optional, List, Tuple
+
+
+from langchain.callbacks.manager import (
+    AsyncCallbackManagerForChainRun,
+    CallbackManagerForChainRun,
+)
+from langchain.schema import BaseRetriever
+from langchain.callbacks.manager import Callbacks
+from langchain.schema.prompt_template import format_document
+from langchain.docstore.document import Document
+from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
+from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
+from langchain.chains.combine_documents.stuff import StuffDocumentsChain
+
+
+class ArXivStuffDocumentChain(StuffDocumentsChain):
+    """Combine arXiv documents with a PDF reference number"""
+
+    def _get_inputs(self, docs: List[Document], **kwargs: Any) -> dict:
+        """Construct inputs from kwargs and docs.
+
+        Format and then join all the documents together into one input with name
+        `self.document_variable_name`. Then pluck any additional variables
+        from **kwargs.
+
+        Args:
+            docs: List of documents to format and then join into single input
+            **kwargs: additional inputs to chain, will pluck any other required
+                arguments from here.
+
+        Returns:
+            dictionary of inputs to LLMChain
+        """
+        # Format each document according to the prompt
+        doc_strings = []
+        for doc_id, doc in enumerate(docs):
+            # add temp reference number in metadata
+            doc.metadata.update({'ref_id': doc_id})
+            doc.page_content = doc.page_content.replace('\n', ' ')
+            doc_strings.append(format_document(doc, self.document_prompt))
+        # Join the documents together to put them in the prompt.
+        inputs = {
+            k: v
+            for k, v in kwargs.items()
+            if k in self.llm_chain.prompt.input_variables
+        }
+        inputs[self.document_variable_name] = self.document_separator.join(
+            doc_strings)
+        return inputs
+
+    def combine_docs(
+        self, docs: List[Document], callbacks: Callbacks = None, **kwargs: Any
+    ) -> Tuple[str, dict]:
+        """Stuff all documents into one prompt and pass to LLM.
+
+        Args:
+            docs: List of documents to join together into one variable
+            callbacks: Optional callbacks to pass along
+            **kwargs: additional parameters to use to get inputs to LLMChain.
+
+        Returns:
+            The first element returned is the single string output. The second
+            element returned is a dictionary of other keys to return.
+        """
+        inputs = self._get_inputs(docs, **kwargs)
+        # Call predict on the LLM.
+        output = self.llm_chain.predict(callbacks=callbacks, **inputs)
+        return output, {}
+
+    @property
+    def _chain_type(self) -> str:
+        return "referenced_stuff_documents_chain"
+
+
+class ArXivQAwithSourcesChain(RetrievalQAWithSourcesChain):
+    """QA with sources chain for the Chat ArXiv app, with references
+
+    This chain automatically assigns a reference number to each article,
+    then parses it back to titles or anything else.
+    """
+
+    def _call(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[CallbackManagerForChainRun] = None,
+    ) -> Dict[str, str]:
+        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
+        accepts_run_manager = (
+            "run_manager" in inspect.signature(self._get_docs).parameters
+        )
+        if accepts_run_manager:
+            docs = self._get_docs(inputs, run_manager=_run_manager)
+        else:
+            docs = self._get_docs(inputs)  # type: ignore[call-arg]
+
+        answer = self.combine_documents_chain.run(
+            input_documents=docs, callbacks=_run_manager.get_child(), **inputs
+        )
+        # parse source with ref_id
+        sources = []
+        ref_cnt = 1
+        for d in docs:
+            ref_id = d.metadata['ref_id']
+            if f"PDF #{ref_id}" in answer:
+                title = d.metadata['title'].replace('\n', '')
+                d.metadata['ref_id'] = ref_cnt
+                answer = answer.replace(f"PDF #{ref_id}", f"{title} [{ref_cnt}]")
+                sources.append(d)
+                ref_cnt += 1
+
+        result: Dict[str, Any] = {
+            self.answer_key: answer,
+            self.sources_answer_key: sources,
+        }
+        if self.return_source_documents:
+            result["source_documents"] = docs
+        return result
+
+    async def _acall(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
+    ) -> Dict[str, Any]:
+        raise NotImplementedError
+
+    @property
+    def _chain_type(self) -> str:
+        return "arxiv_qa_with_sources_chain"
prompts/arxiv_prompt.py CHANGED
@@ -1,15 +1,14 @@
-from langchain.chains.qa_with_sources.map_reduce_prompt import combine_prompt_template
-combine_prompt_template_ = (
-    "You are a helpful paper assistant. Your task is to provide information and answer any questions "
-    + "related to PDFs given below. You should only use the abstract of the selected papers as your source of information "
+combine_prompt_template = (
+    "You are a helpful PDF assistant. Your task is to provide information and answer any questions "
+    + "related to PDFs given below. You should use the sections, title and abstract of the selected PDFs as your source of information "
     + "and try to provide concise and accurate answers to any questions asked by the user. If you are unable to find "
     + "relevant information in the given sections, you will need to let the user know that the source does not contain "
-    + "relevant information but still try to provide an answer based on your general knowledge. The following is the related information "
-    + "about the paper that will help you answer users' questions, you MUST answer it using question's language:\n\n"
+    + "relevant information but still try to provide an answer based on your general knowledge. You must refer to the "
+    + "corresponding section name and page that you refer to when answering. The following is the related information "
+    + "about the PDF file that will help you answer users' questions, you MUST answer it using the question's language:\n\n {summaries}"
+    + "Now you should answer the user's question. Remember you must use the PDF # to refer to papers:\n\n"
 )
 
-combine_prompt_template = combine_prompt_template_ + combine_prompt_template
-
 _myscale_prompt = """You are a MyScale expert. Given an input question, first create a syntactically correct MyScale query to run, then look at the results of the query and return the answer to the input question.
 MyScale queries have a vector distance function called `DISTANCE(column, array)` to compute relevance to the user's question and sort the feature array column by the relevance.
 When the query asks for the {top_k} closest rows, you have to use this distance function to calculate the distance to the entity's array on the vector column and order by the distance to retrieve relevant rows.
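
The revised template embeds the stuffed `{summaries}` directly in the system message, leaving only `{question}` for the human message — which is why app.py now builds `COMBINE_PROMPT` via `ChatPromptTemplate.from_strings` instead of a plain `PromptTemplate`. A quick sketch of how the two slots get filled at run time, using plain `str.format` and a shortened stand-in template (all sample texts are made up):

```python
from string import Formatter

# Shortened stand-in for the revised combine_prompt_template.
combine_prompt_template = (
    "You are a helpful PDF assistant. The following is the related information "
    "about the PDF file that will help you answer users' questions:\n\n {summaries}"
    "Now you should answer the user's question. Remember you must use the PDF # "
    "to refer to papers:\n\n"
)

# The system message owns {summaries}; the human message is just the question.
fields = [f for _, f, _, _ in Formatter().parse(combine_prompt_template) if f]
print(fields)  # ['summaries'] -- filled by ArXivStuffDocumentChain at run time
system = combine_prompt_template.format(
    summaries="Title for PDF #0: Attention Is All You Need\n\tAbstract: ...")
human = "What architecture does PDF #0 propose?"
print(system + "\n---\n" + human)
```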