Suat ATAN committed on
Commit 9177215
1 Parent(s): b9d2468

first commit
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,129 @@
+ import streamlit as st
+
+ from knowledge_gpt.components.sidebar import sidebar
+
+ from knowledge_gpt.ui import (
+     wrap_doc_in_html,
+     is_query_valid,
+     is_file_valid,
+     is_open_ai_key_valid,
+     display_file_read_error,
+ )
+
+ from knowledge_gpt.core.caching import bootstrap_caching
+
+ from knowledge_gpt.core.parsing import read_file
+ from knowledge_gpt.core.chunking import chunk_file
+ from knowledge_gpt.core.embedding import embed_files
+ from knowledge_gpt.core.qa import query_folder
+ from knowledge_gpt.core.utils import get_llm
+
+
+ EMBEDDING = "openai"
+ VECTOR_STORE = "faiss"
+ MODEL_LIST = ["gpt-3.5-turbo", "gpt-4"]
+
+ # Uncomment to enable debug mode
+ # MODEL_LIST.insert(0, "debug")
+
+ st.set_page_config(page_title="ReportIO", page_icon="☘️", layout="wide")
+ st.header("☘️ReportIO")
+
+ # Enable caching for expensive functions
+ bootstrap_caching()
+
+ sidebar()
+
+ openai_api_key = st.session_state.get("OPENAI_API_KEY")
+
+
+ if not openai_api_key:
+     st.warning(
+         "Enter your OpenAI API key in the sidebar. You can get a key at"
+         " https://platform.openai.com/account/api-keys."
+     )
+
+
+ uploaded_file = st.file_uploader(
+     "Upload a pdf, docx, or txt file",
+     type=["pdf", "docx", "txt"],
+     help="Scanned documents are not supported yet!",
+ )
+
+ # model: str = st.selectbox("Model", options=MODEL_LIST)  # type: ignore
+
+ model = MODEL_LIST[0]
+
+
+
+ with st.expander("Advanced Options"):
+     return_all_chunks = st.checkbox("Show all chunks retrieved from vector search")
+     show_full_doc = st.checkbox("Show parsed contents of the document")
+
+
+ if not uploaded_file:
+     st.stop()
+
+ try:
+     file = read_file(uploaded_file)
+ except Exception as e:
+     display_file_read_error(e, file_name=uploaded_file.name)
+
+ chunked_file = chunk_file(file, chunk_size=300, chunk_overlap=0)
+
+ if not is_file_valid(file):
+     st.stop()
+
+
+ if not is_open_ai_key_valid(openai_api_key, model):
+     st.stop()
+
+
+ with st.spinner("Indexing document... This may take a while⏳"):
+     folder_index = embed_files(
+         files=[chunked_file],
+         embedding=EMBEDDING if model != "debug" else "debug",
+         vector_store=VECTOR_STORE if model != "debug" else "debug",
+         openai_api_key=openai_api_key,
+     )
+
+
+ with st.form(key="qa_form"):
+     options = ["List all pre-existing conditions which may affect home insurance", "Show the problematic components!", "Show repair needs!"]
+     query = st.selectbox("Select an option", options)
+     submit = st.form_submit_button("Submit")
+
+
+ if show_full_doc:
+     with st.expander("Document"):
+         # Hack to get around st.markdown rendering LaTeX
+         st.markdown(f"<p>{wrap_doc_in_html(file.docs)}</p>", unsafe_allow_html=True)
+
+
+ if submit:
+     if not is_query_valid(query):
+         st.stop()
+
+     # Output Columns
+     answer_col, sources_col = st.columns(2)
+
+     llm = get_llm(model=model, openai_api_key=openai_api_key, temperature=0)
+     result = query_folder(
+         folder_index=folder_index,
+         query=query,
+         return_all=return_all_chunks,
+         llm=llm,
+     )
+
+     with answer_col:
+         st.markdown("#### Answer")
+         st.markdown(result.answer)
+
+     with sources_col:
+         st.markdown("#### Sources")
+         for source in result.sources:
+             st.markdown(source.page_content)
+             st.markdown(source.metadata["source"])
+             st.markdown("---")
+
+
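
For readers who want to exercise the same flow outside Streamlit, here is a minimal sketch of the pipeline app.py wires together (read_file → chunk_file → embed_files → query_folder). It assumes the knowledge_gpt package from this commit is importable and that OPENAI_API_KEY is set; the sample text and file name are made up for illustration.

```python
# Minimal sketch (not part of the commit): the app.py pipeline, run headless.
import os
from io import BytesIO

from knowledge_gpt.core.parsing import read_file
from knowledge_gpt.core.chunking import chunk_file
from knowledge_gpt.core.embedding import embed_files
from knowledge_gpt.core.qa import query_folder
from knowledge_gpt.core.utils import get_llm

api_key = os.environ["OPENAI_API_KEY"]

# Stand in for Streamlit's UploadedFile: a BytesIO with a .name attribute
buf = BytesIO(b"The roof shingles show hail damage on the north slope.")
buf.name = "report.txt"

file = read_file(buf)
chunked = chunk_file(file, chunk_size=300, chunk_overlap=0)
index = embed_files(files=[chunked], embedding="openai", vector_store="faiss",
                    openai_api_key=api_key)

llm = get_llm(model="gpt-3.5-turbo", openai_api_key=api_key, temperature=0)
result = query_folder(folder_index=index, query="What roof damage is reported?",
                      return_all=False, llm=llm)
print(result.answer)
for source in result.sources:
    print(source.metadata["source"], source.page_content)
```
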
components/__init__.py ADDED
File without changes
components/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (190 Bytes)
components/__pycache__/faq.cpython-311.pyc ADDED
Binary file (1.8 kB)
components/__pycache__/sidebar.cpython-311.pyc ADDED
Binary file (2.15 kB)
components/faq.py ADDED
@@ -0,0 +1,39 @@
+ # flake8: noqa
+ import streamlit as st
+
+
+ def faq():
+     st.markdown(
+         """
+ # FAQ
+ ## How does ReportIO work?
+ When you upload a document, it will be divided into smaller chunks
+ and stored in a special type of database called a vector index
+ that allows for semantic search and retrieval.
+
+ When you ask a question, ReportIO will search through the
+ document chunks and find the most relevant ones using the vector index.
+ Then, it will use a GPT model to generate a final answer.
+
+ ## Is my data safe?
+ Yes, your data is safe. ReportIO does not store your documents or
+ questions. All uploaded data is deleted after you close the browser tab.
+
+ ## Why does it take so long to index my document?
+ If you are using a free OpenAI API key, it will take a while to index
+ your document. This is because the free API key has strict [rate limits](https://platform.openai.com/docs/guides/rate-limits/overview).
+ To speed up the indexing process, you can use a paid API key.
+
+ ## What do the numbers mean under each source?
+ For a PDF document, you will see a citation number like this: 3-12.
+ The first number is the page number and the second number is
+ the chunk number on that page. For DOCX and TXT documents,
+ the first number is set to 1 and the second number is the chunk number.
+
+ ## Are the answers 100% accurate?
+ No, the answers are not guaranteed to be 100% accurate.
+ But for most use cases, ReportIO is very accurate and can answer
+ most questions. Always check with the sources to make sure that the answers
+ are correct.
+ """
+     )
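
The "What do the numbers mean under each source?" answer above maps directly onto the source metadata written in core/chunking.py below. As a hypothetical helper (not part of this commit), decoding a citation key back into its page and chunk parts looks like:

```python
# Hypothetical helper (not in this commit): decode a "page-chunk" source key
# as produced in core/chunking.py, e.g. "3-12" -> page 3, chunk 12.
def parse_source_key(key: str) -> tuple[int, int]:
    page, chunk = key.split("-")
    return int(page), int(chunk)

assert parse_source_key("3-12") == (3, 12)
```
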
components/sidebar.py ADDED
@@ -0,0 +1,42 @@
+ import streamlit as st
+
+ from knowledge_gpt.components.faq import faq
+ from dotenv import load_dotenv
+ import os
+
+ load_dotenv()
+
+
+ def sidebar():
+     with st.sidebar:
+         st.markdown(
+             "## How can I help you?\n"
+
+             "1. Upload a pdf, docx, or txt file of a home inspection report 📄\n"
+             "2. Ask a question about the report\n"
+             "3. Or use an existing extractor button to see the analyses ⭐\n"
+         )
+         api_key_input = st.text_input(
+             "OpenAI API Key",
+             type="password",
+             placeholder="Paste your OpenAI API key here (sk-...)",
+             help="You can get your API key from https://platform.openai.com/account/api-keys.",  # noqa: E501
+             value=os.environ.get("OPENAI_API_KEY", None)
+             or st.session_state.get("OPENAI_API_KEY", ""),
+         )
+
+         st.session_state["OPENAI_API_KEY"] = api_key_input
+
+         st.markdown("---")
+         st.markdown("# About")
+         st.markdown(
+             "☘️ReportIO allows you to ask questions about your "
+             "home inspection reports and get accurate answers with instant citations."
+         )
+         st.markdown(
+             "This tool is a work in progress."
+         )
+         st.markdown("Made by S.Atan")
+         st.markdown("---")
+
+         faq()
core/__init__.py ADDED
File without changes
core/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (184 Bytes)
core/__pycache__/caching.cpython-311.pyc ADDED
Binary file (2.38 kB)
core/__pycache__/chunking.cpython-311.pyc ADDED
Binary file (1.84 kB)
core/__pycache__/debug.cpython-311.pyc ADDED
Binary file (3.93 kB)
core/__pycache__/embedding.cpython-311.pyc ADDED
Binary file (3.64 kB)
core/__pycache__/parsing.cpython-311.pyc ADDED
Binary file (7.47 kB)
core/__pycache__/prompts.cpython-311.pyc ADDED
Binary file (2.34 kB)
core/__pycache__/qa.cpython-311.pyc ADDED
Binary file (3.42 kB)
core/__pycache__/utils.cpython-311.pyc ADDED
Binary file (1.85 kB)
core/caching.py ADDED
@@ -0,0 +1,33 @@
+ import streamlit as st
+ from streamlit.runtime.caching.hashing import HashFuncsDict
+
+ import knowledge_gpt.core.parsing as parsing
+ import knowledge_gpt.core.chunking as chunking
+ import knowledge_gpt.core.embedding as embedding
+ from knowledge_gpt.core.parsing import File
+
+
+ def file_hash_func(file: File) -> str:
+     """Get a unique hash for a file"""
+     return file.id
+
+
+ @st.cache_data(show_spinner=False)
+ def bootstrap_caching():
+     """Patch module functions with caching"""
+
+     # Get all subtypes of File from the parsing module
+     file_subtypes = [
+         cls
+         for cls in vars(parsing).values()
+         if isinstance(cls, type) and issubclass(cls, File) and cls != File
+     ]
+     file_hash_funcs: HashFuncsDict = {cls: file_hash_func for cls in file_subtypes}
+
+     parsing.read_file = st.cache_data(show_spinner=False)(parsing.read_file)
+     chunking.chunk_file = st.cache_data(show_spinner=False, hash_funcs=file_hash_funcs)(
+         chunking.chunk_file
+     )
+     embedding.embed_files = st.cache_data(
+         show_spinner=False, hash_funcs=file_hash_funcs
+     )(embedding.embed_files)
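
bootstrap_caching works by wrapping the module-level functions with st.cache_data and telling Streamlit to hash File arguments by their id instead of by value. A stripped-down illustration of the same hash_funcs mechanism, standalone and with a made-up Report class (not part of the commit):

```python
# Standalone sketch of the hash_funcs mechanism used above.
import streamlit as st


class Report:
    """Made-up class for this sketch; stands in for File."""

    def __init__(self, id: str, text: str):
        self.id = id
        self.text = text


# Hash Report arguments by their id, mirroring file_hash_func above
@st.cache_data(show_spinner=False, hash_funcs={Report: lambda r: r.id})
def word_count(report: Report) -> int:
    return len(report.text.split())
```
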
core/chunking.py ADDED
@@ -0,0 +1,38 @@
+ from langchain.docstore.document import Document
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from knowledge_gpt.core.parsing import File
+
+
+ def chunk_file(
+     file: File, chunk_size: int, chunk_overlap: int = 0, model_name="gpt-3.5-turbo"
+ ) -> File:
+     """Chunks each document in a file into smaller documents
+     according to the specified chunk size and overlap,
+     where the size is measured in tokens for the specified model.
+     """
+
+     # Split each document into chunks
+     chunked_docs = []
+     for doc in file.docs:
+         text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+             model_name=model_name,
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+         )
+
+         chunks = text_splitter.split_text(doc.page_content)
+
+         for i, chunk in enumerate(chunks):
+             chunk_doc = Document(
+                 page_content=chunk,
+                 metadata={
+                     "page": doc.metadata.get("page", 1),
+                     "chunk": i + 1,
+                     "source": f"{doc.metadata.get('page', 1)}-{i + 1}",
+                 },
+             )
+             chunked_docs.append(chunk_doc)
+
+     chunked_file = file.copy()
+     chunked_file.docs = chunked_docs
+     return chunked_file
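
chunk_file measures chunk_size in tokens, not characters, via langchain's tiktoken-backed splitter. A quick way to see the splitter's behavior in isolation (the sample text and tiny chunk size are made up to make the splits visible):

```python
# Sketch: the same tiktoken-based splitter used by chunk_file, in isolation.
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    model_name="gpt-3.5-turbo",
    chunk_size=10,   # 10 tokens per chunk, deliberately small
    chunk_overlap=0,
)
text = "The furnace is 22 years old and past its expected service life."
for piece in splitter.split_text(text):
    print(repr(piece))
```
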
core/debug.py ADDED
@@ -0,0 +1,49 @@
+ from typing import Any, Iterable, List, Optional
+
+ from langchain.vectorstores import VectorStore
+ from langchain.docstore.document import Document
+ from langchain.embeddings.base import Embeddings
+ from langchain.embeddings.fake import FakeEmbeddings as FakeEmbeddingsBase
+ from langchain.chat_models.fake import FakeListChatModel
+
+
+ class FakeChatModel(FakeListChatModel):
+     def __init__(self, **kwargs):
+         responses = ["The answer is 42. SOURCES: 1, 2, 3, 4"]
+         super().__init__(responses=responses, **kwargs)
+
+
+ class FakeEmbeddings(FakeEmbeddingsBase):
+     def __init__(self, **kwargs):
+         super().__init__(size=4, **kwargs)
+
+
+ class FakeVectorStore(VectorStore):
+     """Fake vector store for testing purposes."""
+
+     def __init__(self, texts: List[str]):
+         self.texts: List[str] = texts
+
+     def add_texts(
+         self, texts: Iterable[str], metadatas: Optional[List[dict]] = None, **kwargs: Any
+     ) -> List[str]:
+         self.texts.extend(texts)
+         return self.texts
+
+     @classmethod
+     def from_texts(
+         cls,
+         texts: List[str],
+         embedding: Embeddings,
+         metadatas: Optional[List[dict]] = None,
+         **kwargs: Any,
+     ) -> "FakeVectorStore":
+         return cls(texts=list(texts))
+
+     def similarity_search(
+         self, query: str, k: int = 4, **kwargs: Any
+     ) -> List[Document]:
+         return [
+             Document(page_content=text, metadata={"source": f"{i + 1}-1"})
+             for i, text in enumerate(self.texts)
+         ]
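
These fakes are what the commented-out debug mode in app.py switches to: with model == "debug", embed_files selects FakeEmbeddings/FakeVectorStore and get_llm returns FakeChatModel, so the whole pipeline runs without an API key. A minimal offline sketch, assuming the package is importable (sample text is made up):

```python
# Sketch (not part of the commit): run the pipeline offline with the fakes.
from io import BytesIO

from knowledge_gpt.core.debug import FakeChatModel
from knowledge_gpt.core.embedding import embed_files
from knowledge_gpt.core.parsing import TxtFile
from knowledge_gpt.core.qa import query_folder

buf = BytesIO(b"Water staining was observed on the basement ceiling.")
buf.name = "sample.txt"  # parsing dispatches on this extension
file = TxtFile.from_bytes(buf)

index = embed_files(files=[file], embedding="debug", vector_store="debug")
result = query_folder(folder_index=index, query="Any water damage?",
                      llm=FakeChatModel())
print(result.answer)  # always the canned "The answer is 42. ..." response
```
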
core/embedding.py ADDED
@@ -0,0 +1,74 @@
+ from langchain.vectorstores import VectorStore
+ from knowledge_gpt.core.parsing import File
+ from langchain.vectorstores.faiss import FAISS
+ from langchain.embeddings import OpenAIEmbeddings
+ from langchain.embeddings.base import Embeddings
+ from typing import List, Type
+ from langchain.docstore.document import Document
+ from knowledge_gpt.core.debug import FakeVectorStore, FakeEmbeddings
+
+
+ class FolderIndex:
+     """Index for a collection of files (a folder)"""
+
+     def __init__(self, files: List[File], index: VectorStore):
+         self.name: str = "default"
+         self.files = files
+         self.index: VectorStore = index
+
+     @staticmethod
+     def _combine_files(files: List[File]) -> List[Document]:
+         """Combines all the documents in a list of files into a single list."""
+
+         all_texts = []
+         for file in files:
+             for doc in file.docs:
+                 doc.metadata["file_name"] = file.name
+                 doc.metadata["file_id"] = file.id
+                 all_texts.append(doc)
+
+         return all_texts
+
+     @classmethod
+     def from_files(
+         cls, files: List[File], embeddings: Embeddings, vector_store: Type[VectorStore]
+     ) -> "FolderIndex":
+         """Creates an index from files."""
+
+         all_docs = cls._combine_files(files)
+
+         index = vector_store.from_documents(
+             documents=all_docs,
+             embedding=embeddings,
+         )
+
+         return cls(files=files, index=index)
+
+
+ def embed_files(
+     files: List[File], embedding: str, vector_store: str, **kwargs
+ ) -> FolderIndex:
+     """Embeds a collection of files and stores them in a FolderIndex."""
+
+     supported_embeddings: dict[str, Type[Embeddings]] = {
+         "openai": OpenAIEmbeddings,
+         "debug": FakeEmbeddings,
+     }
+     supported_vector_stores: dict[str, Type[VectorStore]] = {
+         "faiss": FAISS,
+         "debug": FakeVectorStore,
+     }
+
+     if embedding in supported_embeddings:
+         _embeddings = supported_embeddings[embedding](**kwargs)
+     else:
+         raise NotImplementedError(f"Embedding {embedding} not supported.")
+
+     if vector_store in supported_vector_stores:
+         _vector_store = supported_vector_stores[vector_store]
+     else:
+         raise NotImplementedError(f"Vector store {vector_store} not supported.")
+
+     return FolderIndex.from_files(
+         files=files, embeddings=_embeddings, vector_store=_vector_store
+     )
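
embed_files is a thin dispatcher: the strings map to classes, and **kwargs are forwarded to the embeddings constructor, which is how openai_api_key reaches OpenAIEmbeddings from app.py. A runnable sketch that mixes real FAISS with the fake embeddings, so no API key is needed (requires faiss installed; sample text is made up):

```python
# Sketch (not part of the commit): FolderIndex over real FAISS,
# using FakeEmbeddings from core/debug.py instead of OpenAI.
from io import BytesIO

from knowledge_gpt.core.debug import FakeEmbeddings
from knowledge_gpt.core.embedding import FolderIndex
from knowledge_gpt.core.parsing import TxtFile
from langchain.vectorstores.faiss import FAISS

buf = BytesIO(b"The water heater shows corrosion at the supply fittings.")
buf.name = "sample.txt"
file = TxtFile.from_bytes(buf)

index = FolderIndex.from_files(
    files=[file], embeddings=FakeEmbeddings(), vector_store=FAISS
)
print(index.index.similarity_search("corrosion", k=1))
```
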
core/parsing.py ADDED
@@ -0,0 +1,108 @@
+ from io import BytesIO
+ from typing import List, Any, Optional
+ import re
+
+ import docx2txt
+ from langchain.docstore.document import Document
+ import fitz
+ from hashlib import md5
+
+ from abc import abstractmethod, ABC
+ from copy import deepcopy
+
+
+ class File(ABC):
+     """Represents an uploaded file comprised of Documents"""
+
+     def __init__(
+         self,
+         name: str,
+         id: str,
+         metadata: Optional[dict[str, Any]] = None,
+         docs: Optional[List[Document]] = None,
+     ):
+         self.name = name
+         self.id = id
+         self.metadata = metadata or {}
+         self.docs = docs or []
+
+     @classmethod
+     @abstractmethod
+     def from_bytes(cls, file: BytesIO) -> "File":
+         """Creates a File from a BytesIO object"""
+
+     def __repr__(self) -> str:
+         return (
+             f"File(name={self.name}, id={self.id},"
+             f" metadata={self.metadata}, docs={self.docs})"
+         )
+
+     def __str__(self) -> str:
+         return f"File(name={self.name}, id={self.id}, metadata={self.metadata})"
+
+     def copy(self) -> "File":
+         """Create a deep copy of this File"""
+         return self.__class__(
+             name=self.name,
+             id=self.id,
+             metadata=deepcopy(self.metadata),
+             docs=deepcopy(self.docs),
+         )
+
+
+ def strip_consecutive_newlines(text: str) -> str:
+     """Strips consecutive newlines from a string,
+     possibly with whitespace in between
+     """
+     return re.sub(r"\s*\n\s*", "\n", text)
+
+
+ class DocxFile(File):
+     @classmethod
+     def from_bytes(cls, file: BytesIO) -> "DocxFile":
+         text = docx2txt.process(file)
+         text = strip_consecutive_newlines(text)
+         doc = Document(page_content=text.strip())
+         doc.metadata["source"] = "p-1"
+         return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=[doc])
+
+
+ class PdfFile(File):
+     @classmethod
+     def from_bytes(cls, file: BytesIO) -> "PdfFile":
+         pdf = fitz.open(stream=file.read(), filetype="pdf")  # type: ignore
+         docs = []
+         for i, page in enumerate(pdf):
+             text = page.get_text(sort=True)
+             text = strip_consecutive_newlines(text)
+             doc = Document(page_content=text.strip())
+             doc.metadata["page"] = i + 1
+             doc.metadata["source"] = f"p-{i+1}"
+             docs.append(doc)
+         # file.read() mutates the file object, which can affect caching,
+         # so we need to reset the file pointer to the beginning
+         file.seek(0)
+         return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=docs)
+
+
+ class TxtFile(File):
+     @classmethod
+     def from_bytes(cls, file: BytesIO) -> "TxtFile":
+         text = file.read().decode("utf-8", errors="replace")
+         text = strip_consecutive_newlines(text)
+         file.seek(0)
+         doc = Document(page_content=text.strip())
+         doc.metadata["source"] = "p-1"
+         return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=[doc])
+
+
+ def read_file(file: BytesIO) -> File:
+     """Reads an uploaded file and returns a File object"""
+     if file.name.lower().endswith(".docx"):
+         return DocxFile.from_bytes(file)
+     elif file.name.lower().endswith(".pdf"):
+         return PdfFile.from_bytes(file)
+     elif file.name.lower().endswith(".txt"):
+         return TxtFile.from_bytes(file)
+     else:
+         raise NotImplementedError(f"File type {file.name.split('.')[-1]} not supported")
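
read_file dispatches on the file name, so any BytesIO with a .name attribute works; Streamlit's UploadedFile satisfies this. A minimal sketch for the txt path (sample bytes are made up):

```python
# Sketch: read_file only needs a BytesIO with a .name attribute.
from io import BytesIO

from knowledge_gpt.core.parsing import read_file

buf = BytesIO(b"Page one text.\n\n\nMore text.")
buf.name = "inspection.txt"       # drives the extension dispatch

file = read_file(buf)             # -> TxtFile
print(file.id)                    # md5 hex digest of the raw bytes
print(file.docs[0].page_content)  # newlines collapsed by strip_consecutive_newlines
```
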
core/prompts.py ADDED
@@ -0,0 +1,31 @@
+ # flake8: noqa
+ from langchain.prompts import PromptTemplate
+
+ ## Use a shorter template to reduce the number of tokens in the prompt
+ template = """Create a final answer to the given questions using the provided document excerpts (given in no particular order) as sources. ALWAYS include a "SOURCES" section in your answer citing only the minimal set of sources needed to answer the question. If you are unable to answer the question, simply state that you do not have enough information to answer the question and leave the SOURCES section empty. Use only the provided documents and do not attempt to fabricate an answer.
+
+ ---------
+
+ QUESTION: What is the purpose of ARPA-H?
+ =========
+ Content: More support for patients and families. \n\nTo get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. \n\nIt's based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more. \n\nARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer's, diabetes, and more.
+ SOURCES: 1-32
+ Content: While we're at it, let's make sure every American can get the health care they need. \n\nWe've already made historic investments in health care. \n\nWe've made it easier for Americans to get the care they need, when they need it. \n\nWe've made it easier for Americans to get the treatments they need, when they need them. \n\nWe've made it easier for Americans to get the medications they need, when they need them.
+ SOURCES: 1-33
+ Content: The V.A. is pioneering new ways of linking toxic exposures to disease, already helping veterans get the care they deserve. \n\nWe need to extend that same care to all Americans. \n\nThat's why I'm calling on Congress to pass legislation that would establish a national registry of toxic exposures, and provide health care and financial assistance to those affected.
+ SOURCES: 1-30
+ =========
+ FINAL ANSWER: The purpose of ARPA-H is to drive breakthroughs in cancer, Alzheimer's, diabetes, and more.
+ SOURCES: 1-32
+
+ ---------
+
+ QUESTION: {question}
+ =========
+ {summaries}
+ =========
+ FINAL ANSWER:"""
+
+ STUFF_PROMPT = PromptTemplate(
+     template=template, input_variables=["summaries", "question"]
+ )
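
STUFF_PROMPT is an ordinary PromptTemplate, so the final prompt can be previewed by formatting it directly; at runtime the stuff chain fills {summaries} with content/source pairs for the retrieved chunks. A small preview sketch (the summaries string here is hand-written, not the chain's exact formatting):

```python
# Sketch: preview the final prompt before it is sent to the model.
from knowledge_gpt.core.prompts import STUFF_PROMPT

summaries = (
    "Content: The roof shingles show hail damage.\n"
    "SOURCES: 2-1"
)
print(STUFF_PROMPT.format(question="Is there roof damage?", summaries=summaries))
```
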
core/qa.py ADDED
@@ -0,0 +1,65 @@
+ from typing import List
+ from langchain.chains.qa_with_sources import load_qa_with_sources_chain
+ from knowledge_gpt.core.prompts import STUFF_PROMPT
+ from langchain.docstore.document import Document
+ from knowledge_gpt.core.embedding import FolderIndex
+ from pydantic import BaseModel
+ from langchain.chat_models.base import BaseChatModel
+
+
+ class AnswerWithSources(BaseModel):
+     answer: str
+     sources: List[Document]
+
+
+ def query_folder(
+     query: str,
+     folder_index: FolderIndex,
+     llm: BaseChatModel,
+     return_all: bool = False,
+ ) -> AnswerWithSources:
+     """Queries a folder index for an answer.
+
+     Args:
+         query (str): The query to search for.
+         folder_index (FolderIndex): The folder index to search.
+         llm (BaseChatModel): The language model used to generate
+             the final answer from the retrieved documents.
+         return_all (bool): Whether to return all the retrieved documents
+             or just the sources cited in the answer.
+
+     Returns:
+         AnswerWithSources: The answer and the source documents.
+     """
+
+     chain = load_qa_with_sources_chain(
+         llm=llm,
+         chain_type="stuff",
+         prompt=STUFF_PROMPT,
+     )
+
+     relevant_docs = folder_index.index.similarity_search(query, k=5)
+     result = chain(
+         {"input_documents": relevant_docs, "question": query}, return_only_outputs=True
+     )
+     sources = relevant_docs
+
+     if not return_all:
+         sources = get_sources(result["output_text"], folder_index)
+
+     answer = result["output_text"].split("SOURCES: ")[0]
+
+     return AnswerWithSources(answer=answer, sources=sources)
+
+
+ def get_sources(answer: str, folder_index: FolderIndex) -> List[Document]:
+     """Retrieves the documents that were cited as sources in the generated answer."""
+
+     source_keys = answer.split("SOURCES: ")[-1].split(", ")
+
+     source_docs = []
+     for file in folder_index.files:
+         for doc in file.docs:
+             if doc.metadata["source"] in source_keys:
+                 source_docs.append(doc)
+     return source_docs
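
Note that get_sources depends on the model echoing the SOURCES: line requested by the prompt; keys that do not match any chunk's source metadata are silently dropped. A self-contained check of just the mapping step, using the fakes from core/debug.py (the answer string is made up):

```python
# Sketch (not part of the commit): map a model answer back to chunk metadata.
from io import BytesIO

from knowledge_gpt.core.chunking import chunk_file
from knowledge_gpt.core.debug import FakeVectorStore
from knowledge_gpt.core.embedding import FolderIndex
from knowledge_gpt.core.parsing import TxtFile
from knowledge_gpt.core.qa import get_sources

buf = BytesIO(b"The attic insulation is below current code requirements.")
buf.name = "report.txt"
file = chunk_file(TxtFile.from_bytes(buf), chunk_size=300)

index = FolderIndex(files=[file], index=FakeVectorStore([]))
answer = "Insulation is below code.\nSOURCES: 1-1"  # made-up model output
print(get_sources(answer, index))                   # -> the chunk with source "1-1"
```
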
core/utils.py ADDED
@@ -0,0 +1,32 @@
+ from typing import List
+ from langchain.chains.combine_documents.stuff import StuffDocumentsChain
+ from langchain.docstore.document import Document
+
+ from langchain.chat_models import ChatOpenAI
+ from knowledge_gpt.core.debug import FakeChatModel
+ from langchain.chat_models.base import BaseChatModel
+
+
+ def pop_docs_upto_limit(
+     query: str, chain: StuffDocumentsChain, docs: List[Document], max_len: int
+ ) -> List[Document]:
+     """Pops documents from a list until the final prompt length is less
+     than the max length."""
+
+     token_count: int = chain.prompt_length(docs, question=query)  # type: ignore
+
+     while token_count > max_len and len(docs) > 0:
+         docs.pop()
+         token_count = chain.prompt_length(docs, question=query)  # type: ignore
+
+     return docs
+
+
+ def get_llm(model: str, **kwargs) -> BaseChatModel:
+     if model == "debug":
+         return FakeChatModel()
+
+     if "gpt" in model:
+         return ChatOpenAI(model=model, **kwargs)  # type: ignore
+
+     raise NotImplementedError(f"Model {model} not supported!")
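
get_llm is the single switch between the real ChatOpenAI client and the canned FakeChatModel: anything gpt-like goes to OpenAI with the forwarded kwargs, everything else raises. For example ("sk-..." is a placeholder, not a real key):

```python
# Sketch: the two paths through get_llm.
from knowledge_gpt.core.utils import get_llm

llm = get_llm(model="debug")              # -> FakeChatModel, no key needed
# llm = get_llm(model="gpt-3.5-turbo",    # -> ChatOpenAI
#               openai_api_key="sk-...",  # placeholder key
#               temperature=0)
```
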
ui.py ADDED
@@ -0,0 +1,63 @@
+ from typing import List, NoReturn
+ import streamlit as st
+ from langchain.docstore.document import Document
+ from knowledge_gpt.core.parsing import File
+ import openai
+ from streamlit.logger import get_logger
+
+
+ logger = get_logger(__name__)
+
+
+ def wrap_doc_in_html(docs: List[Document]) -> str:
+     """Wraps each line of a document in <p> tags, with <hr/> rules between pages"""
+     pages = [doc.page_content for doc in docs]
+     # Add horizontal rules between pages
+     text = "\n<hr/>\n".join(pages)
+     return "".join([f"<p>{line}</p>" for line in text.split("\n")])
+
+
+ def is_query_valid(query: str) -> bool:
+     if not query:
+         st.error("Please enter a question!")
+         return False
+     return True
+
+
+ def is_file_valid(file: File) -> bool:
+     if (
+         len(file.docs) == 0
+         or "".join([doc.page_content for doc in file.docs]).strip() == ""
+     ):
+         st.error("Cannot read document! Make sure the document has selectable text")
+         logger.error("Cannot read document")
+         return False
+     return True
+
+
+ def display_file_read_error(e: Exception, file_name: str) -> NoReturn:
+     st.error("Error reading file. Make sure the file is not corrupted or encrypted")
+     logger.error(f"{e.__class__.__name__}: {e}. Extension: {file_name.split('.')[-1]}")
+     st.stop()
+
+
+ @st.cache_data(show_spinner=False)
+ def is_open_ai_key_valid(openai_api_key, model: str) -> bool:
+     if model == "debug":
+         return True
+
+     if not openai_api_key:
+         st.error("Please enter your OpenAI API key in the sidebar!")
+         return False
+     try:
+         openai.ChatCompletion.create(
+             model=model,
+             messages=[{"role": "user", "content": "test"}],
+             api_key=openai_api_key,
+         )
+     except Exception as e:
+         st.error(f"{e.__class__.__name__}: {e}")
+         logger.error(f"{e.__class__.__name__}: {e}")
+         return False
+
+     return True