Joshua Sundance Bailey committed on
Commit 622ac66
1 Parent(s): e4344c4
.idea/.name CHANGED
@@ -1 +1 @@
-langchain-streamlit-demo
+langchain-streamlit-demo
.idea/inspectionProfiles/Project_Default.xml CHANGED
@@ -18,4 +18,4 @@
     </inspection_tool>
     <inspection_tool class="PyShadowingNamesInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
   </profile>
-</component>
+</component>
.idea/inspectionProfiles/profiles_settings.xml CHANGED
@@ -3,4 +3,4 @@
     <option name="USE_PROJECT_PROFILE" value="false" />
     <version value="1.0" />
   </settings>
-</component>
+</component>
.idea/kubernetes-settings.xml CHANGED
@@ -3,4 +3,4 @@
   <component name="KubernetesSettings">
     <option name="contextName" value="swca-aks" />
   </component>
-</project>
+</project>
.idea/langchain-streamlit-demo.iml CHANGED
@@ -5,4 +5,4 @@
     <orderEntry type="jdk" jdkName="Remote Python 3.11.4 Docker (&lt;none&gt;:&lt;none&gt;) (5)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
-</module>
+</module>
.idea/misc.xml CHANGED
@@ -1,4 +1,4 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="ProjectRootManager" version="2" project-jdk-name="Remote Python 3.11.4 Docker (&lt;none&gt;:&lt;none&gt;) (5)" project-jdk-type="Python SDK" />
-</project>
+</project>
.idea/modules.xml CHANGED
@@ -5,4 +5,4 @@
       <module fileurl="file://$PROJECT_DIR$/.idea/langchain-streamlit-demo.iml" filepath="$PROJECT_DIR$/.idea/langchain-streamlit-demo.iml" />
     </modules>
   </component>
-</project>
+</project>
.idea/vcs.xml CHANGED
@@ -3,4 +3,4 @@
   <component name="VcsDirectoryMappings">
     <mapping directory="$PROJECT_DIR$" vcs="Git" />
   </component>
-</project>
+</project>
langchain-streamlit-demo/app.py CHANGED
@@ -1,7 +1,7 @@
 import os
 from datetime import datetime
 from tempfile import NamedTemporaryFile
-from typing import Union
+from typing import Tuple, List, Dict, Any, Union
 
 import anthropic
 import langsmith.utils
@@ -18,12 +18,15 @@ from langchain.document_loaders import PyPDFLoader
 from langchain.embeddings import OpenAIEmbeddings
 from langchain.memory import ConversationBufferMemory, StreamlitChatMessageHistory
 from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain.schema.document import Document
 from langchain.schema.retriever import BaseRetriever
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import FAISS
 from langsmith.client import Client
 from streamlit_feedback import streamlit_feedback
 
+from qagen import get_qa_gen_chain, combine_qa_pair_lists
+
 __version__ = "0.0.6"
 
 # --- Initialization ---
@@ -46,6 +49,7 @@ st_init_null(
     "document_chat_chain_type",
     "llm",
     "ls_tracer",
+    "provider",
     "retriever",
     "run",
     "run_id",
@@ -120,11 +124,11 @@ DEFAULT_CHUNK_OVERLAP = 0
 
 
 @st.cache_data
-def get_retriever(
+def get_texts_and_retriever(
     uploaded_file_bytes: bytes,
     chunk_size: int = DEFAULT_CHUNK_SIZE,
     chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
-) -> BaseRetriever:
+) -> Tuple[List[Document], BaseRetriever]:
     with NamedTemporaryFile() as temp_file:
         temp_file.write(uploaded_file_bytes)
         temp_file.seek(0)
@@ -138,7 +142,7 @@ def get_retriever(
     texts = text_splitter.split_documents(documents)
     embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
     db = FAISS.from_documents(texts, embeddings)
-    return db.as_retriever()
+    return texts, db.as_retriever()
 
 
 # --- Sidebar ---
@@ -152,10 +156,12 @@ with sidebar:
         index=SUPPORTED_MODELS.index(DEFAULT_MODEL),
    )
 
-    provider = MODEL_DICT[model]
+    st.session_state.provider = MODEL_DICT[model]
 
-    provider_api_key = PROVIDER_KEY_DICT.get(provider) or st.text_input(
-        f"{provider} API key",
+    provider_api_key = PROVIDER_KEY_DICT.get(
+        st.session_state.provider,
+    ) or st.text_input(
+        f"{st.session_state.provider} API key",
         type="password",
     )
 
@@ -170,7 +176,7 @@ with sidebar:
 
     openai_api_key = (
         provider_api_key
-        if provider == "OpenAI"
+        if st.session_state.provider == "OpenAI"
         else OPENAI_API_KEY
         or st.sidebar.text_input("OpenAI API Key: ", type="password")
     )
@@ -210,7 +216,7 @@ with sidebar:
     )
     document_chat_chain_type = st.selectbox(
         label="Document Chat Chain Type",
-        options=["stuff", "refine", "map_reduce", "map_rerank"],
+        options=["stuff", "refine", "map_reduce", "map_rerank", "Q&A Generation"],
        index=0,
         help=chain_type_help,
         disabled=not document_chat,
@@ -218,7 +224,10 @@ with sidebar:
 
     if uploaded_file:
         if openai_api_key:
-            st.session_state.retriever = get_retriever(
+            (
+                st.session_state.texts,
+                st.session_state.retriever,
+            ) = get_texts_and_retriever(
                 uploaded_file_bytes=uploaded_file.getvalue(),
                 chunk_size=chunk_size,
                 chunk_overlap=chunk_overlap,
@@ -280,7 +289,7 @@ with sidebar:
 
 # --- LLM Instantiation ---
 if provider_api_key:
-    if provider == "OpenAI":
+    if st.session_state.provider == "OpenAI":
         st.session_state.llm = ChatOpenAI(
             model=model,
             openai_api_key=provider_api_key,
@@ -288,7 +297,7 @@ if provider_api_key:
             streaming=True,
             max_tokens=max_tokens,
         )
-    elif provider == "Anthropic":
+    elif st.session_state.provider == "Anthropic":
         st.session_state.llm = ChatAnthropic(
             model_name=model,
             anthropic_api_key=provider_api_key,
@@ -296,7 +305,7 @@ if provider_api_key:
             streaming=True,
             max_tokens_to_sample=max_tokens,
         )
-    elif provider == "Anyscale Endpoints":
+    elif st.session_state.provider == "Anyscale Endpoints":
         st.session_state.llm = ChatAnyscale(
             model=model,
             anyscale_api_key=provider_api_key,
@@ -321,18 +330,24 @@ for msg in STMEMORY.messages:
 if st.session_state.llm:
     # --- Document Chat ---
     if st.session_state.retriever:
-        # st.session_state.doc_chain = ConversationalRetrievalChain.from_llm(
-        #     st.session_state.llm,
-        #     st.session_state.retriever,
-        #     memory=MEMORY,
-        # )
-
-        st.session_state.doc_chain = RetrievalQA.from_chain_type(
-            llm=st.session_state.llm,
-            chain_type=document_chat_chain_type,
-            retriever=st.session_state.retriever,
-            memory=MEMORY,
-        )
+        if document_chat_chain_type == "Summarization":
+            raise NotImplementedError
+            # st.session_state.doc_chain = RetrievalQA.from_chain_type(
+            #     llm=st.session_state.llm,
+            #     chain_type=chain_type,
+            #     retriever=st.session_state.retriever,
+            #     memory=MEMORY,
+            # )
+        elif document_chat_chain_type == "Q&A Generation":
+            st.session_state.doc_chain = get_qa_gen_chain(st.session_state.llm)
+
+        else:
+            st.session_state.doc_chain = RetrievalQA.from_chain_type(
+                llm=st.session_state.llm,
+                chain_type=document_chat_chain_type,
+                retriever=st.session_state.retriever,
+                memory=MEMORY,
+            )
 
     else:
         # --- Regular Chat ---
@@ -375,17 +390,45 @@ if st.session_state.llm:
         )
 
         try:
+            full_response: Union[str, None]
             if use_document_chat:
-                st_handler = StreamlitCallbackHandler(st.container())
-                callbacks.append(st_handler)
-                full_response = st.session_state.doc_chain(
-                    {"query": prompt},
-                    callbacks=callbacks,
-                    tags=["Streamlit Chat"],
-                    return_only_outputs=True,
-                )[st.session_state.doc_chain.output_key]
-                st_handler._complete_current_thought()
-                st.markdown(full_response)
+                if document_chat_chain_type == "Summarization":
+                    raise NotImplementedError
+                elif document_chat_chain_type == "Q&A Generation":
+                    config: Dict[str, Any] = dict(
+                        callbacks=callbacks,
+                        tags=["Streamlit Chat"],
+                    )
+                    if st.session_state.provider == "Anthropic":
+                        config["max_concurrency"] = 5
+                    raw_results = st.session_state.doc_chain.batch(
+                        [
+                            {"input": doc.page_content, "prompt": prompt}
+                            for doc in st.session_state.texts
+                        ],
+                        config,
+                    )
+                    results = combine_qa_pair_lists(raw_results).QuestionAnswerPairs
+                    full_response = "\n".join(
+                        f"**Q:** {result.question}\n**A:** {result.answer}\n"
+                        for result in results
+                    )
+                    for idx, result in enumerate(results, start=1):
+                        st.markdown(f"{idx}. **Q:** {result.question}")
+                        st.markdown(f"{idx}. **A:** {result.answer}")
+                        st.markdown("\n")
+
+                else:
+                    st_handler = StreamlitCallbackHandler(st.container())
+                    callbacks.append(st_handler)
+                    full_response = st.session_state.doc_chain(
+                        {"query": prompt},
+                        callbacks=callbacks,
+                        tags=["Streamlit Chat"],
+                        return_only_outputs=True,
+                    )[st.session_state.doc_chain.output_key]
+                    st_handler._complete_current_thought()
+                    st.markdown(full_response)
             else:
                 message_placeholder = st.empty()
                 stream_handler = StreamHandler(message_placeholder)
@@ -399,7 +442,7 @@ if st.session_state.llm:
                 message_placeholder.markdown(full_response)
         except (openai.error.AuthenticationError, anthropic.AuthenticationError):
             st.error(
-                f"Please enter a valid {provider} API key.",
+                f"Please enter a valid {st.session_state.provider} API key.",
                 icon="❌",
             )
             full_response = None
@@ -468,4 +511,4 @@ if st.session_state.llm:
             st.warning("Invalid feedback score.")
 
 else:
-    st.error(f"Please enter a valid {provider} API key.", icon="❌")
+    st.error(f"Please enter a valid {st.session_state.provider} API key.", icon="❌")
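For context, a minimal sketch (not part of the commit) of how the refactored helper is consumed: it now returns both the split texts and the retriever. The PDF path and query below are illustrative assumptions, and get_texts_and_retriever still reads openai_api_key from the enclosing module scope, as in app.py:

    # Hypothetical usage sketch; "paper.pdf" and the query are made up.
    from pathlib import Path

    pdf_bytes = Path("paper.pdf").read_bytes()  # stand-in for uploaded_file.getvalue()
    texts, retriever = get_texts_and_retriever(uploaded_file_bytes=pdf_bytes)

    # `texts` (the raw chunks) feeds the new "Q&A Generation" branch;
    # `retriever` still backs the stuff/refine/map_reduce/map_rerank chain types.
    docs = retriever.get_relevant_documents("What does this document conclude?")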
langchain-streamlit-demo/qagen.py ADDED
@@ -0,0 +1,75 @@
+from functools import reduce
+from typing import List
+
+from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+)
+from langchain.schema.language_model import BaseLanguageModel
+from langchain.schema.runnable import RunnableSequence
+from pydantic import BaseModel, field_validator, Field
+
+
+class QuestionAnswerPair(BaseModel):
+    question: str = Field(..., description="The question that will be answered.")
+    answer: str = Field(..., description="The answer to the question that was asked.")
+
+    @field_validator("question")
+    def validate_question(cls, v: str) -> str:
+        if not v.endswith("?"):
+            raise ValueError("Question must end with a question mark.")
+        return v
+
+
+class QuestionAnswerPairList(BaseModel):
+    QuestionAnswerPairs: List[QuestionAnswerPair]
+
+
+PYDANTIC_PARSER: PydanticOutputParser = PydanticOutputParser(
+    pydantic_object=QuestionAnswerPairList,
+)
+
+
+templ1 = """You are a smart assistant designed to help college professors come up with reading comprehension questions.
+Given a piece of text, you must come up with question and answer pairs that can be used to test a student's reading comprehension abilities.
+Generate as many question/answer pairs as you can.
+When coming up with the question/answer pairs, you must respond in the following format:
+{format_instructions}
+
+Do not provide additional commentary and do not wrap your response in Markdown formatting. Return RAW, VALID JSON.
+"""
+templ2 = """{prompt}
+Please create question/answer pairs, in the specified JSON format, for the following text:
+----------------
+{input}"""
+CHAT_PROMPT = ChatPromptTemplate.from_messages(
+    [
+        ("system", templ1),
+        ("human", templ2),
+    ],
+).partial(format_instructions=PYDANTIC_PARSER.get_format_instructions)
+
+
+def combine_qa_pair_lists(
+    qa_pair_lists: List[QuestionAnswerPairList],
+) -> QuestionAnswerPairList:
+    def reducer(
+        accumulator: QuestionAnswerPairList,
+        current: QuestionAnswerPairList,
+    ) -> QuestionAnswerPairList:
+        return QuestionAnswerPairList(
+            QuestionAnswerPairs=accumulator.QuestionAnswerPairs
+            + current.QuestionAnswerPairs,
+        )
+
+    return reduce(
+        reducer,
+        qa_pair_lists,
+        QuestionAnswerPairList(QuestionAnswerPairs=[]),
+    )
+
+
+def get_qa_gen_chain(llm: BaseLanguageModel) -> RunnableSequence:
+    return (
+        CHAT_PROMPT | llm | OutputFixingParser.from_llm(llm=llm, parser=PYDANTIC_PARSER)
+    )
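qagen.py composes an LCEL pipeline (CHAT_PROMPT | llm | parser) whose output is already a parsed QuestionAnswerPairList; OutputFixingParser re-asks the same llm to repair malformed JSON before parsing. A minimal usage sketch (not part of the commit), assuming an OpenAI key is available in the environment; the model name and sample inputs are illustrative only:

    # Hypothetical usage sketch for the new module.
    from langchain.chat_models import ChatOpenAI

    from qagen import combine_qa_pair_lists, get_qa_gen_chain

    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.0)
    chain = get_qa_gen_chain(llm)

    # One input dict per document chunk, mirroring the .batch() call in app.py;
    # each call yields a QuestionAnswerPairList, which are then merged.
    raw_results = chain.batch(
        [
            {"input": "Streamlit turns Python scripts into web apps.", "prompt": "Focus on key facts."},
            {"input": "FAISS enables fast similarity search over embeddings.", "prompt": "Focus on key facts."},
        ],
    )
    for pair in combine_qa_pair_lists(raw_results).QuestionAnswerPairs:
        print(f"Q: {pair.question}\nA: {pair.answer}\n")

The reduce-based merge keeps combine_qa_pair_lists total: given an empty list of results it returns an empty QuestionAnswerPairList rather than raising.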