Merge pull request #34 from joshuasundance-swca/summarize

This PR adds two new document-chat modes, Summarization and Q&A Generation, implemented in two new modules (summarize.py and qagen.py) and wired into app.py. The .idea changes are trivial IDE-config updates: each removes and re-adds the file's final line, most likely newline-at-end-of-file normalization.

Files changed:
- .idea/.name +1 -1
- .idea/inspectionProfiles/Project_Default.xml +1 -1
- .idea/inspectionProfiles/profiles_settings.xml +1 -1
- .idea/kubernetes-settings.xml +1 -1
- .idea/langchain-streamlit-demo.iml +1 -1
- .idea/misc.xml +1 -1
- .idea/modules.xml +1 -1
- .idea/vcs.xml +1 -1
- langchain-streamlit-demo/app.py +100 -38
- langchain-streamlit-demo/qagen.py +75 -0
- langchain-streamlit-demo/summarize.py +51 -0
.idea/.name CHANGED
@@ -1 +1 @@
-langchain-streamlit-demo
+langchain-streamlit-demo

.idea/inspectionProfiles/Project_Default.xml CHANGED
@@ -18,4 +18,4 @@
     </inspection_tool>
     <inspection_tool class="PyShadowingNamesInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
   </profile>
-</component>
+</component>

.idea/inspectionProfiles/profiles_settings.xml CHANGED
@@ -3,4 +3,4 @@
     <option name="USE_PROJECT_PROFILE" value="false" />
     <version value="1.0" />
   </settings>
-</component>
+</component>

.idea/kubernetes-settings.xml CHANGED
@@ -3,4 +3,4 @@
   <component name="KubernetesSettings">
     <option name="contextName" value="swca-aks" />
   </component>
-</project>
+</project>

.idea/langchain-streamlit-demo.iml CHANGED
@@ -5,4 +5,4 @@
     <orderEntry type="jdk" jdkName="Remote Python 3.11.4 Docker (<none>:<none>) (5)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
   </component>
-</module>
+</module>

.idea/misc.xml CHANGED
@@ -1,4 +1,4 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="ProjectRootManager" version="2" project-jdk-name="Remote Python 3.11.4 Docker (<none>:<none>) (5)" project-jdk-type="Python SDK" />
-</project>
+</project>

.idea/modules.xml CHANGED
@@ -5,4 +5,4 @@
     <module fileurl="file://$PROJECT_DIR$/.idea/langchain-streamlit-demo.iml" filepath="$PROJECT_DIR$/.idea/langchain-streamlit-demo.iml" />
   </modules>
 </component>
-</project>
+</project>

.idea/vcs.xml CHANGED
@@ -3,4 +3,4 @@
   <component name="VcsDirectoryMappings">
     <mapping directory="$PROJECT_DIR$" vcs="Git" />
   </component>
-</project>
+</project>
langchain-streamlit-demo/app.py CHANGED
@@ -1,29 +1,33 @@
 import os
 from datetime import datetime
 from tempfile import NamedTemporaryFile
-from typing import Union
+from typing import Tuple, List, Dict, Any, Union
 
 import anthropic
 import langsmith.utils
 import openai
 import streamlit as st
-from langchain import LLMChain
 from langchain.callbacks import StreamlitCallbackHandler
 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.callbacks.tracers.langchain import LangChainTracer, wait_for_all_tracers
 from langchain.callbacks.tracers.run_collector import RunCollectorCallbackHandler
 from langchain.chains import RetrievalQA
+from langchain.chains.llm import LLMChain
 from langchain.chat_models import ChatOpenAI, ChatAnyscale, ChatAnthropic
 from langchain.document_loaders import PyPDFLoader
 from langchain.embeddings import OpenAIEmbeddings
 from langchain.memory import ConversationBufferMemory, StreamlitChatMessageHistory
 from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain.schema.document import Document
 from langchain.schema.retriever import BaseRetriever
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import FAISS
 from langsmith.client import Client
 from streamlit_feedback import streamlit_feedback
 
+from qagen import get_qa_gen_chain, combine_qa_pair_lists
+from summarize import get_summarization_chain
+
 __version__ = "0.0.6"
 
 # --- Initialization ---
@@ -46,6 +50,7 @@ st_init_null(
     "document_chat_chain_type",
     "llm",
     "ls_tracer",
+    "provider",
     "retriever",
     "run",
     "run_id",
@@ -120,11 +125,11 @@ DEFAULT_CHUNK_OVERLAP = 0
 
 
 @st.cache_data
-def get_retriever(
+def get_texts_and_retriever(
     uploaded_file_bytes: bytes,
     chunk_size: int = DEFAULT_CHUNK_SIZE,
     chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
-) -> BaseRetriever:
+) -> Tuple[List[Document], BaseRetriever]:
     with NamedTemporaryFile() as temp_file:
         temp_file.write(uploaded_file_bytes)
         temp_file.seek(0)
@@ -138,7 +143,7 @@ def get_retriever(
     texts = text_splitter.split_documents(documents)
     embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
     db = FAISS.from_documents(texts, embeddings)
-    return db.as_retriever()
+    return texts, db.as_retriever()
 
 
 # --- Sidebar ---
@@ -152,10 +157,12 @@ with sidebar:
         index=SUPPORTED_MODELS.index(DEFAULT_MODEL),
     )
 
-    provider = MODEL_DICT[model]
+    st.session_state.provider = MODEL_DICT[model]
 
-    provider_api_key = PROVIDER_KEY_DICT.get(provider) or st.text_input(
-        f"{provider} API key",
+    provider_api_key = PROVIDER_KEY_DICT.get(
+        st.session_state.provider,
+    ) or st.text_input(
+        f"{st.session_state.provider} API key",
         type="password",
     )
 
@@ -170,7 +177,7 @@ with sidebar:
 
     openai_api_key = (
         provider_api_key
-        if provider == "OpenAI"
+        if st.session_state.provider == "OpenAI"
         else OPENAI_API_KEY
        or st.sidebar.text_input("OpenAI API Key: ", type="password")
     )
@@ -210,7 +217,14 @@ with sidebar:
     )
     document_chat_chain_type = st.selectbox(
         label="Document Chat Chain Type",
-        options=["stuff", "refine", "map_reduce", "map_rerank"],
+        options=[
+            "stuff",
+            "refine",
+            "map_reduce",
+            "map_rerank",
+            "Q&A Generation",
+            "Summarization",
+        ],
         index=0,
         help=chain_type_help,
         disabled=not document_chat,
@@ -218,7 +232,10 @@ with sidebar:
 
     if uploaded_file:
         if openai_api_key:
-            st.session_state.retriever = get_retriever(
+            (
+                st.session_state.texts,
+                st.session_state.retriever,
+            ) = get_texts_and_retriever(
                 uploaded_file_bytes=uploaded_file.getvalue(),
                 chunk_size=chunk_size,
                 chunk_overlap=chunk_overlap,
@@ -280,7 +297,7 @@ with sidebar:
 
 # --- LLM Instantiation ---
 if provider_api_key:
-    if provider == "OpenAI":
+    if st.session_state.provider == "OpenAI":
         st.session_state.llm = ChatOpenAI(
             model=model,
             openai_api_key=provider_api_key,
@@ -288,7 +305,7 @@ if provider_api_key:
             streaming=True,
             max_tokens=max_tokens,
         )
-    elif provider == "Anthropic":
+    elif st.session_state.provider == "Anthropic":
         st.session_state.llm = ChatAnthropic(
             model_name=model,
             anthropic_api_key=provider_api_key,
@@ -296,7 +313,7 @@ if provider_api_key:
             streaming=True,
             max_tokens_to_sample=max_tokens,
         )
-    elif provider == "Anyscale Endpoints":
+    elif st.session_state.provider == "Anyscale Endpoints":
         st.session_state.llm = ChatAnyscale(
             model=model,
             anyscale_api_key=provider_api_key,
@@ -321,18 +338,18 @@ for msg in STMEMORY.messages:
 if st.session_state.llm:
     # --- Document Chat ---
     if st.session_state.retriever:
-        [12 removed lines not legible in this capture]
+        if document_chat_chain_type == "Summarization":
+            st.session_state.doc_chain = "summarization"
+        elif document_chat_chain_type == "Q&A Generation":
+            st.session_state.doc_chain = get_qa_gen_chain(st.session_state.llm)
+
+        else:
+            st.session_state.doc_chain = RetrievalQA.from_chain_type(
+                llm=st.session_state.llm,
+                chain_type=document_chat_chain_type,
+                retriever=st.session_state.retriever,
+                memory=MEMORY,
+            )
 
     else:
         # --- Regular Chat ---
@@ -375,17 +392,62 @@ if st.session_state.llm:
             )
 
         try:
+            full_response: Union[str, None]
             if use_document_chat:
-                [10 removed lines not legible in this capture]
+                if document_chat_chain_type == "Summarization":
+                    st.session_state.doc_chain = get_summarization_chain(
+                        st.session_state.llm,
+                        prompt,
+                    )
+                    full_response = st.session_state.doc_chain.run(
+                        st.session_state.texts,
+                        callbacks=callbacks,
+                        tags=["Streamlit Chat"],
+                    )
+
+                    st.markdown(full_response)
+                elif document_chat_chain_type == "Q&A Generation":
+                    config: Dict[str, Any] = dict(
+                        callbacks=callbacks,
+                        tags=["Streamlit Chat"],
+                    )
+                    if st.session_state.provider == "Anthropic":
+                        config["max_concurrency"] = 5
+                    raw_results = st.session_state.doc_chain.batch(
+                        [
+                            {"input": doc.page_content, "prompt": prompt}
+                            for doc in st.session_state.texts
+                        ],
+                        config,
+                    )
+                    results = combine_qa_pair_lists(raw_results).QuestionAnswerPairs
+
+                    def _to_str(idx, qap):
+                        question_piece = f"{idx}. **Q:** {qap.question}"
+                        whitespace = " " * (len(str(idx)) + 2)
+                        answer_piece = f"{whitespace}**A:** {qap.answer}"
+                        return f"{question_piece}\n{answer_piece}"
+
+                    output_text = "\n\n".join(
+                        [
+                            _to_str(idx, qap)
+                            for idx, qap in enumerate(results, start=1)
+                        ],
+                    )
+
+                    st.markdown(output_text)
+
+                else:
+                    st_handler = StreamlitCallbackHandler(st.container())
+                    callbacks.append(st_handler)
+                    full_response = st.session_state.doc_chain(
+                        {"query": prompt},
+                        callbacks=callbacks,
+                        tags=["Streamlit Chat"],
+                        return_only_outputs=True,
+                    )[st.session_state.doc_chain.output_key]
+                    st_handler._complete_current_thought()
+                    st.markdown(full_response)
             else:
                 message_placeholder = st.empty()
                 stream_handler = StreamHandler(message_placeholder)
@@ -399,7 +461,7 @@ if st.session_state.llm:
                 message_placeholder.markdown(full_response)
         except (openai.error.AuthenticationError, anthropic.AuthenticationError):
             st.error(
-                f"Please enter a valid {provider} API key.",
+                f"Please enter a valid {st.session_state.provider} API key.",
                 icon="❌",
             )
             full_response = None
@@ -468,4 +530,4 @@ if st.session_state.llm:
             st.warning("Invalid feedback score.")
 
 else:
-    st.error(f"Please enter a valid {provider} API key.", icon="❌")
+    st.error(f"Please enter a valid {st.session_state.provider} API key.", icon="❌")
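Reviewer note: to make the new control flow easier to follow, here is the document-chat dispatch distilled into a standalone function. This is a paraphrase of the diff above, not code from it; build_doc_chain is a hypothetical name, and llm/retriever/memory stand in for the session-state values.

# Sketch of the dispatch added in app.py (names mirror the diff).
from langchain.chains import RetrievalQA

from qagen import get_qa_gen_chain
from summarize import get_summarization_chain


def build_doc_chain(chain_type: str, llm, retriever, memory):
    """Two special modes; everything else falls through to RetrievalQA."""
    if chain_type == "Summarization":
        # Sentinel: the real summarization chain is built per-query inside the
        # try block, because get_summarization_chain() needs the user's prompt.
        return "summarization"
    if chain_type == "Q&A Generation":
        return get_qa_gen_chain(llm)
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type=chain_type,  # "stuff", "refine", "map_reduce", or "map_rerank"
        retriever=retriever,
        memory=memory,
    )

This also explains the get_retriever -> get_texts_and_retriever rename: the Summarization and Q&A Generation paths consume the raw split Documents directly, so the splitter output must survive alongside the FAISS retriever.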
langchain-streamlit-demo/qagen.py ADDED
@@ -0,0 +1,75 @@
+from functools import reduce
+from typing import List
+
+from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+)
+from langchain.schema.language_model import BaseLanguageModel
+from langchain.schema.runnable import RunnableSequence
+from pydantic import BaseModel, field_validator, Field
+
+
+class QuestionAnswerPair(BaseModel):
+    question: str = Field(..., description="The question that will be answered.")
+    answer: str = Field(..., description="The answer to the question that was asked.")
+
+    @field_validator("question")
+    def validate_question(cls, v: str) -> str:
+        if not v.endswith("?"):
+            raise ValueError("Question must end with a question mark.")
+        return v
+
+
+class QuestionAnswerPairList(BaseModel):
+    QuestionAnswerPairs: List[QuestionAnswerPair]
+
+
+PYDANTIC_PARSER: PydanticOutputParser = PydanticOutputParser(
+    pydantic_object=QuestionAnswerPairList,
+)
+
+
+templ1 = """You are a smart assistant designed to help college professors come up with reading comprehension questions.
+Given a piece of text, you must come up with question and answer pairs that can be used to test a student's reading comprehension abilities.
+Generate as many question/answer pairs as you can.
+When coming up with the question/answer pairs, you must respond in the following format:
+{format_instructions}
+
+Do not provide additional commentary and do not wrap your response in Markdown formatting. Return RAW, VALID JSON.
+"""
+templ2 = """{prompt}
+Please create question/answer pairs, in the specified JSON format, for the following text:
+----------------
+{input}"""
+CHAT_PROMPT = ChatPromptTemplate.from_messages(
+    [
+        ("system", templ1),
+        ("human", templ2),
+    ],
+).partial(format_instructions=PYDANTIC_PARSER.get_format_instructions)
+
+
+def combine_qa_pair_lists(
+    qa_pair_lists: List[QuestionAnswerPairList],
+) -> QuestionAnswerPairList:
+    def reducer(
+        accumulator: QuestionAnswerPairList,
+        current: QuestionAnswerPairList,
+    ) -> QuestionAnswerPairList:
+        return QuestionAnswerPairList(
+            QuestionAnswerPairs=accumulator.QuestionAnswerPairs
+            + current.QuestionAnswerPairs,
+        )
+
+    return reduce(
+        reducer,
+        qa_pair_lists,
+        QuestionAnswerPairList(QuestionAnswerPairs=[]),
+    )
+
+
+def get_qa_gen_chain(llm: BaseLanguageModel) -> RunnableSequence:
+    return (
+        CHAT_PROMPT | llm | OutputFixingParser.from_llm(llm=llm, parser=PYDANTIC_PARSER)
+    )
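For context, a minimal self-contained sketch of how these pieces compose; this is not code from the PR, and the model name and sample text are placeholders. The OutputFixingParser wrapper gives malformed JSON one repair round-trip through the LLM before parsing fails, which matters here because the prompt demands raw JSON from chat models that like to add commentary.

# Hypothetical usage sketch for qagen.py; model and text are placeholders.
from langchain.chat_models import ChatOpenAI
from langchain.schema.document import Document

from qagen import combine_qa_pair_lists, get_qa_gen_chain

llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.0)
chain = get_qa_gen_chain(llm)  # CHAT_PROMPT | llm | OutputFixingParser

docs = [Document(page_content="Photosynthesis converts light into chemical energy.")]
# One chain invocation per chunk; each returns a QuestionAnswerPairList.
raw_results = chain.batch(
    [{"input": doc.page_content, "prompt": "Focus on key concepts."} for doc in docs],
)
# Flatten the per-chunk lists into one QuestionAnswerPairList, as app.py does.
for pair in combine_qa_pair_lists(raw_results).QuestionAnswerPairs:
    print(f"Q: {pair.question}\nA: {pair.answer}")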
langchain-streamlit-demo/summarize.py ADDED
@@ -0,0 +1,51 @@
+from langchain.chains.base import Chain
+from langchain.chains.summarize import load_summarize_chain
+from langchain.prompts import PromptTemplate
+from langchain.schema.language_model import BaseLanguageModel
+
+prompt_template = """Write a concise summary of the following text, based on the user input.
+User input: {query}
+Text:
+```
+{text}
+```
+CONCISE SUMMARY:"""
+
+refine_template = (
+    "You are iteratively crafting a summary of the text below based on the user input\n"
+    "User input: {query}"
+    "We have provided an existing summary up to a certain point: {existing_answer}\n"
+    "We have the opportunity to refine the existing summary"
+    "(only if needed) with some more context below.\n"
+    "------------\n"
+    "{text}\n"
+    "------------\n"
+    "Given the new context, refine the original summary.\n"
+    "If the context isn't useful, return the original summary.\n"
+    "If the context is useful, refine the summary to include the new context.\n"
+    "Your contribution is helping to build a comprehensive summary of a large body of knowledge.\n"
+    "You do not have the complete context, so do not discard pieces of the original summary."
+)
+
+
+def get_summarization_chain(
+    llm: BaseLanguageModel,
+    prompt: str,
+) -> Chain:
+    _prompt = PromptTemplate.from_template(
+        prompt_template,
+        partial_variables={"query": prompt},
+    )
+    refine_prompt = PromptTemplate.from_template(
+        refine_template,
+        partial_variables={"query": prompt},
+    )
+    return load_summarize_chain(
+        llm=llm,
+        chain_type="refine",
+        question_prompt=_prompt,
+        refine_prompt=refine_prompt,
+        return_intermediate_steps=False,
+        input_key="input_documents",
+        output_key="output_text",
+    )
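Two small observations on refine_template: the "User input: {query}" literal has no trailing \n, and "refine the existing summary" runs straight into "(only if needed)", so the rendered prompt reads "summary(only if needed)". Functionally harmless, but worth a follow-up. For context, a minimal usage sketch; this is not code from the PR, and the document contents are placeholders.

# Hypothetical usage sketch for summarize.py; documents are placeholders.
from langchain.chat_models import ChatOpenAI
from langchain.schema.document import Document

from summarize import get_summarization_chain

llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.0)
chain = get_summarization_chain(llm, "Summarize the key findings.")

texts = [
    Document(page_content="First chunk of the uploaded PDF..."),
    Document(page_content="Second chunk of the uploaded PDF..."),
]
# chain_type="refine": summarize the first chunk, then fold each later chunk
# into the running summary via refine_prompt. input_key="input_documents"
# lets the document list be passed positionally, matching app.py's
# doc_chain.run(st.session_state.texts, ...).
summary = chain.run(texts)
print(summary)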