Suat ATAN committed
Commit 9177215
Parent(s): b9d2468
first commit
Browse files
- __init__.py +0 -0
- app.py +129 -0
- components/__init__.py +0 -0
- components/__pycache__/__init__.cpython-311.pyc +0 -0
- components/__pycache__/faq.cpython-311.pyc +0 -0
- components/__pycache__/sidebar.cpython-311.pyc +0 -0
- components/faq.py +39 -0
- components/sidebar.py +42 -0
- core/__init__.py +0 -0
- core/__pycache__/__init__.cpython-311.pyc +0 -0
- core/__pycache__/caching.cpython-311.pyc +0 -0
- core/__pycache__/chunking.cpython-311.pyc +0 -0
- core/__pycache__/debug.cpython-311.pyc +0 -0
- core/__pycache__/embedding.cpython-311.pyc +0 -0
- core/__pycache__/parsing.cpython-311.pyc +0 -0
- core/__pycache__/prompts.cpython-311.pyc +0 -0
- core/__pycache__/qa.cpython-311.pyc +0 -0
- core/__pycache__/utils.cpython-311.pyc +0 -0
- core/caching.py +33 -0
- core/chunking.py +38 -0
- core/debug.py +49 -0
- core/embedding.py +74 -0
- core/parsing.py +108 -0
- core/prompts.py +31 -0
- core/qa.py +65 -0
- core/utils.py +32 -0
- ui.py +64 -0
__init__.py
ADDED
File without changes
app.py
ADDED
@@ -0,0 +1,129 @@
import streamlit as st

from knowledge_gpt.components.sidebar import sidebar

from knowledge_gpt.ui import (
    wrap_doc_in_html,
    is_query_valid,
    is_file_valid,
    is_open_ai_key_valid,
    display_file_read_error,
)

from knowledge_gpt.core.caching import bootstrap_caching

from knowledge_gpt.core.parsing import read_file
from knowledge_gpt.core.chunking import chunk_file
from knowledge_gpt.core.embedding import embed_files
from knowledge_gpt.core.qa import query_folder
from knowledge_gpt.core.utils import get_llm


EMBEDDING = "openai"
VECTOR_STORE = "faiss"
MODEL_LIST = ["gpt-3.5-turbo", "gpt-4"]

# Uncomment to enable debug mode
# MODEL_LIST.insert(0, "debug")

st.set_page_config(page_title="ReportIO", page_icon="☘️", layout="wide")
st.header("☘️ReportIO")

# Enable caching for expensive functions
bootstrap_caching()

sidebar()

openai_api_key = st.session_state.get("OPENAI_API_KEY")


if not openai_api_key:
    st.warning(
        "Enter your OpenAI API key in the sidebar. You can get a key at"
        " https://platform.openai.com/account/api-keys."
    )


uploaded_file = st.file_uploader(
    "Upload a pdf, docx, or txt file",
    type=["pdf", "docx", "txt"],
    help="Scanned documents are not supported yet!",
)

# model: str = st.selectbox("Model", options=MODEL_LIST)  # type: ignore

model = MODEL_LIST[0]


with st.expander("Advanced Options"):
    return_all_chunks = st.checkbox("Show all chunks retrieved from vector search")
    show_full_doc = st.checkbox("Show parsed contents of the document")


if not uploaded_file:
    st.stop()

try:
    file = read_file(uploaded_file)
except Exception as e:
    display_file_read_error(e, file_name=uploaded_file.name)

chunked_file = chunk_file(file, chunk_size=300, chunk_overlap=0)

if not is_file_valid(file):
    st.stop()


if not is_open_ai_key_valid(openai_api_key, model):
    st.stop()


with st.spinner("Indexing document... This may take a while⏳"):
    folder_index = embed_files(
        files=[chunked_file],
        embedding=EMBEDDING if model != "debug" else "debug",
        vector_store=VECTOR_STORE if model != "debug" else "debug",
        openai_api_key=openai_api_key,
    )


with st.form(key="qa_form"):
    options = ['List all pre-existing conditions which may affect home insurance', 'Show the problematic components!', 'Show repair needs!']
    query = st.selectbox('Select an option', options)
    submit = st.form_submit_button("Submit")


if show_full_doc:
    with st.expander("Document"):
        # Hack to get around st.markdown rendering LaTeX
        st.markdown(f"<p>{wrap_doc_in_html(file.docs)}</p>", unsafe_allow_html=True)


if submit:
    if not is_query_valid(query):
        st.stop()

    # Output Columns
    answer_col, sources_col = st.columns(2)

    llm = get_llm(model=model, openai_api_key=openai_api_key, temperature=0)
    result = query_folder(
        folder_index=folder_index,
        query=query,
        return_all=return_all_chunks,
        llm=llm,
    )

    with answer_col:
        st.markdown("#### Answer")
        st.markdown(result.answer)

    with sources_col:
        st.markdown("#### Sources")
        for source in result.sources:
            st.markdown(source.page_content)
            st.markdown(source.metadata["source"])
            st.markdown("---")
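
The app above wires the whole pipeline together: read the upload, chunk it, embed the chunks, then answer a selected extractor question. A minimal sketch of the same flow outside Streamlit, assuming the knowledge_gpt package from this commit is importable and OPENAI_API_KEY is set; the report.pdf path is hypothetical.

import os

from knowledge_gpt.core.parsing import read_file
from knowledge_gpt.core.chunking import chunk_file
from knowledge_gpt.core.embedding import embed_files
from knowledge_gpt.core.qa import query_folder
from knowledge_gpt.core.utils import get_llm

# Hypothetical input file; any .pdf, .docx, or .txt works.
with open("report.pdf", "rb") as f:
    file = read_file(f)

chunked = chunk_file(file, chunk_size=300, chunk_overlap=0)
index = embed_files(
    files=[chunked],
    embedding="openai",
    vector_store="faiss",
    openai_api_key=os.environ["OPENAI_API_KEY"],
)
llm = get_llm(
    model="gpt-3.5-turbo",
    openai_api_key=os.environ["OPENAI_API_KEY"],
    temperature=0,
)
result = query_folder(folder_index=index, query="Show repair needs!", llm=llm)
print(result.answer)
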
components/__init__.py
ADDED
File without changes
components/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (190 Bytes)

components/__pycache__/faq.cpython-311.pyc
ADDED
Binary file (1.8 kB)

components/__pycache__/sidebar.cpython-311.pyc
ADDED
Binary file (2.15 kB)
components/faq.py
ADDED
@@ -0,0 +1,39 @@
# flake8: noqa
import streamlit as st


def faq():
    st.markdown(
        """
# FAQ
## How does ReportIO work?
When you upload a document, it will be divided into smaller chunks
and stored in a special type of database called a vector index
that allows for semantic search and retrieval.

When you ask a question, ReportIO will search through the
document chunks and find the most relevant ones using the vector index.
Then, it will use GPT-3 to generate a final answer.

## Is my data safe?
Yes, your data is safe. ReportIO does not store your documents or
questions. All uploaded data is deleted after you close the browser tab.

## Why does it take so long to index my document?
If you are using a free OpenAI API key, it will take a while to index
your document. This is because the free API key has strict [rate limits](https://platform.openai.com/docs/guides/rate-limits/overview).
To speed up the indexing process, you can use a paid API key.

## What do the numbers mean under each source?
For a PDF document, you will see a citation number like this: 3-12.
The first number is the page number and the second number is
the chunk number on that page. For DOCX and TXT documents,
the first number is set to 1 and the second number is the chunk number.

## Are the answers 100% accurate?
No, the answers are not 100% accurate.
But for most use cases, ReportIO is very accurate and can answer
most questions. Always check with the sources to make sure that the answers
are correct.
"""
    )
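
The retrieval story in this FAQ (split the document, index the chunks, find the most relevant ones) can be illustrated without any model at all. A toy token-overlap score stands in for embedding similarity below, purely as an assumption for the sketch; it is not the app's actual similarity math.

# Toy stand-in for vector-index retrieval: score chunks against the query.
def overlap_score(query: str, chunk: str) -> int:
    # naive token overlap, an illustrative proxy for embedding similarity
    return len(set(query.lower().split()) & set(chunk.lower().split()))

chunks = ["The roof shows water damage.", "The furnace was serviced in 2020."]
query = "is there roof damage"
best = max(chunks, key=lambda c: overlap_score(query, c))
print(best)  # -> "The roof shows water damage."
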
components/sidebar.py
ADDED
@@ -0,0 +1,42 @@
import streamlit as st

from knowledge_gpt.components.faq import faq
from dotenv import load_dotenv
import os

load_dotenv()


def sidebar():
    with st.sidebar:
        st.markdown(
            "## How can I help you?\n"
            "1. Upload a pdf, docx, or txt file of a home inspection report 📄\n"
            "2. Ask a question about the report\n"
            "3. Or use an existing extractor button to see analyses. ⭐\n"
        )
        api_key_input = st.text_input(
            "OpenAI API Key",
            type="password",
            placeholder="Paste your OpenAI API key here (sk-...)",
            help="You can get your API key from https://platform.openai.com/account/api-keys.",  # noqa: E501
            value=os.environ.get("OPENAI_API_KEY", None)
            or st.session_state.get("OPENAI_API_KEY", ""),
        )

        st.session_state["OPENAI_API_KEY"] = api_key_input

        st.markdown("---")
        st.markdown("# About")
        st.markdown(
            "☘️ReportIO allows you to ask questions about your "
            "home inspection reports and get accurate answers with instant citations. "
        )
        st.markdown(
            "This tool is a work in progress. "
        )
        st.markdown("Made by S.Atan")
        st.markdown("---")

        faq()
core/__init__.py
ADDED
File without changes
core/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (184 Bytes)

core/__pycache__/caching.cpython-311.pyc
ADDED
Binary file (2.38 kB)

core/__pycache__/chunking.cpython-311.pyc
ADDED
Binary file (1.84 kB)

core/__pycache__/debug.cpython-311.pyc
ADDED
Binary file (3.93 kB)

core/__pycache__/embedding.cpython-311.pyc
ADDED
Binary file (3.64 kB)

core/__pycache__/parsing.cpython-311.pyc
ADDED
Binary file (7.47 kB)

core/__pycache__/prompts.cpython-311.pyc
ADDED
Binary file (2.34 kB)

core/__pycache__/qa.cpython-311.pyc
ADDED
Binary file (3.42 kB)

core/__pycache__/utils.cpython-311.pyc
ADDED
Binary file (1.85 kB)
core/caching.py
ADDED
@@ -0,0 +1,33 @@
import streamlit as st
from streamlit.runtime.caching.hashing import HashFuncsDict

import knowledge_gpt.core.parsing as parsing
import knowledge_gpt.core.chunking as chunking
import knowledge_gpt.core.embedding as embedding
from knowledge_gpt.core.parsing import File


def file_hash_func(file: File) -> str:
    """Get a unique hash for a file"""
    return file.id


@st.cache_data(show_spinner=False)
def bootstrap_caching():
    """Patch module functions with caching"""

    # Get all subtypes of File from the module
    file_subtypes = [
        cls
        for cls in vars(parsing).values()
        if isinstance(cls, type) and issubclass(cls, File) and cls != File
    ]
    file_hash_funcs: HashFuncsDict = {cls: file_hash_func for cls in file_subtypes}

    parsing.read_file = st.cache_data(show_spinner=False)(parsing.read_file)
    chunking.chunk_file = st.cache_data(show_spinner=False, hash_funcs=file_hash_funcs)(
        chunking.chunk_file
    )
    embedding.embed_files = st.cache_data(
        show_spinner=False, hash_funcs=file_hash_funcs
    )(embedding.embed_files)
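
The trick here is that Streamlit cannot hash arbitrary objects, so every File subclass is hashed by its stable id instead. A self-contained sketch of the same hash_funcs mechanism with a hypothetical class, assuming a Streamlit version where st.cache_data accepts hash_funcs (as this module already does):

import streamlit as st

class Report:  # hypothetical stand-in for a File subclass
    def __init__(self, id: str, text: str):
        self.id = id
        self.text = text

# Hash Report arguments by their id, exactly as file_hash_func does for File.
@st.cache_data(show_spinner=False, hash_funcs={Report: lambda r: r.id})
def first_line(report: Report) -> str:
    return report.text.splitlines()[0]
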
core/chunking.py
ADDED
@@ -0,0 +1,38 @@
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from knowledge_gpt.core.parsing import File


def chunk_file(
    file: File, chunk_size: int, chunk_overlap: int = 0, model_name="gpt-3.5-turbo"
) -> File:
    """Chunks each document in a file into smaller documents
    according to the specified chunk size and overlap,
    where the size is determined by the number of tokens for the specified model.
    """

    # split each document into chunks
    chunked_docs = []
    for doc in file.docs:
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            model_name=model_name,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

        chunks = text_splitter.split_text(doc.page_content)

        for i, chunk in enumerate(chunks):
            # use a distinct name so the outer `doc` is not shadowed mid-loop
            chunk_doc = Document(
                page_content=chunk,
                metadata={
                    "page": doc.metadata.get("page", 1),
                    "chunk": i + 1,
                    "source": f"{doc.metadata.get('page', 1)}-{i + 1}",
                },
            )
            chunked_docs.append(chunk_doc)

    chunked_file = file.copy()
    chunked_file.docs = chunked_docs
    return chunked_file
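
chunk_file delegates the actual splitting to langchain's token-based splitter. A quick standalone check of that splitter, assuming langchain and tiktoken are installed as this repo requires:

from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    model_name="gpt-3.5-turbo", chunk_size=300, chunk_overlap=0
)
# Any long report text; chunk boundaries land near 300 tokens each.
chunks = splitter.split_text("Roof: minor hail damage. " * 200)
print(len(chunks), repr(chunks[0][:60]))
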
core/debug.py
ADDED
@@ -0,0 +1,49 @@
from langchain.vectorstores import VectorStore
from typing import Iterable, List, Any, Optional
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.embeddings.fake import FakeEmbeddings as FakeEmbeddingsBase
from langchain.chat_models.fake import FakeListChatModel


class FakeChatModel(FakeListChatModel):
    def __init__(self, **kwargs):
        responses = ["The answer is 42. SOURCES: 1, 2, 3, 4"]
        super().__init__(responses=responses, **kwargs)


class FakeEmbeddings(FakeEmbeddingsBase):
    def __init__(self, **kwargs):
        super().__init__(size=4, **kwargs)


class FakeVectorStore(VectorStore):
    """Fake vector store for testing purposes."""

    def __init__(self, texts: List[str]):
        self.texts: List[str] = texts

    def add_texts(
        self, texts: Iterable[str], metadatas: List[dict] | None = None, **kwargs: Any
    ) -> List[str]:
        self.texts.extend(texts)
        return self.texts

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "FakeVectorStore":
        return cls(texts=list(texts))

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        return [
            Document(page_content=text, metadata={"source": f"{i + 1}-1"})
            for i, text in enumerate(self.texts)
        ]
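
These fakes let the whole pipeline run offline. A short sketch of driving them directly, assuming the langchain version pinned here still exposes BaseChatModel.predict; FakeListChatModel cycles through its canned responses, so the chat fake always returns the same string.

from knowledge_gpt.core.debug import FakeChatModel, FakeEmbeddings, FakeVectorStore

store = FakeVectorStore.from_texts(
    ["chunk one", "chunk two"], embedding=FakeEmbeddings()
)
print([d.page_content for d in store.similarity_search("anything")])

llm = FakeChatModel()
print(llm.predict("any question"))  # -> "The answer is 42. SOURCES: 1, 2, 3, 4"
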
core/embedding.py
ADDED
@@ -0,0 +1,74 @@
from langchain.vectorstores import VectorStore
from knowledge_gpt.core.parsing import File
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.base import Embeddings
from typing import List, Type
from langchain.docstore.document import Document
from knowledge_gpt.core.debug import FakeVectorStore, FakeEmbeddings


class FolderIndex:
    """Index for a collection of files (a folder)"""

    def __init__(self, files: List[File], index: VectorStore):
        self.name: str = "default"
        self.files = files
        self.index: VectorStore = index

    @staticmethod
    def _combine_files(files: List[File]) -> List[Document]:
        """Combines all the documents in a list of files into a single list."""

        all_texts = []
        for file in files:
            for doc in file.docs:
                doc.metadata["file_name"] = file.name
                doc.metadata["file_id"] = file.id
                all_texts.append(doc)

        return all_texts

    @classmethod
    def from_files(
        cls, files: List[File], embeddings: Embeddings, vector_store: Type[VectorStore]
    ) -> "FolderIndex":
        """Creates an index from files."""

        all_docs = cls._combine_files(files)

        index = vector_store.from_documents(
            documents=all_docs,
            embedding=embeddings,
        )

        return cls(files=files, index=index)


def embed_files(
    files: List[File], embedding: str, vector_store: str, **kwargs
) -> FolderIndex:
    """Embeds a collection of files and stores them in a FolderIndex."""

    supported_embeddings: dict[str, Type[Embeddings]] = {
        "openai": OpenAIEmbeddings,
        "debug": FakeEmbeddings,
    }
    supported_vector_stores: dict[str, Type[VectorStore]] = {
        "faiss": FAISS,
        "debug": FakeVectorStore,
    }

    if embedding in supported_embeddings:
        _embeddings = supported_embeddings[embedding](**kwargs)
    else:
        raise NotImplementedError(f"Embedding {embedding} not supported.")

    if vector_store in supported_vector_stores:
        _vector_store = supported_vector_stores[vector_store]
    else:
        raise NotImplementedError(f"Vector store {vector_store} not supported.")

    return FolderIndex.from_files(
        files=files, embeddings=_embeddings, vector_store=_vector_store
    )
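
embed_files is a small dispatcher over the two string-keyed registries. With the debug backends it needs no API key, which makes for an easy smoke test; the in-memory .txt buffer below is a hypothetical stand-in for a Streamlit upload.

from io import BytesIO

from knowledge_gpt.core.embedding import embed_files
from knowledge_gpt.core.parsing import read_file

buf = BytesIO(b"The water heater leaks at the base.")
buf.name = "note.txt"  # read_file dispatches on the extension
file = read_file(buf)

index = embed_files(files=[file], embedding="debug", vector_store="debug")
print(index.index.similarity_search("leak")[0].page_content)
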
core/parsing.py
ADDED
@@ -0,0 +1,108 @@
from io import BytesIO
from typing import List, Any, Optional
import re

import docx2txt
from langchain.docstore.document import Document
import fitz
from hashlib import md5

from abc import abstractmethod, ABC
from copy import deepcopy


class File(ABC):
    """Represents an uploaded file comprised of Documents"""

    def __init__(
        self,
        name: str,
        id: str,
        metadata: Optional[dict[str, Any]] = None,
        docs: Optional[List[Document]] = None,
    ):
        self.name = name
        self.id = id
        self.metadata = metadata or {}
        self.docs = docs or []

    @classmethod
    @abstractmethod
    def from_bytes(cls, file: BytesIO) -> "File":
        """Creates a File from a BytesIO object"""

    def __repr__(self) -> str:
        return (
            f"File(name={self.name}, id={self.id},"
            f" metadata={self.metadata}, docs={self.docs})"
        )

    def __str__(self) -> str:
        return f"File(name={self.name}, id={self.id}, metadata={self.metadata})"

    def copy(self) -> "File":
        """Create a deep copy of this File"""
        return self.__class__(
            name=self.name,
            id=self.id,
            metadata=deepcopy(self.metadata),
            docs=deepcopy(self.docs),
        )


def strip_consecutive_newlines(text: str) -> str:
    """Strips consecutive newlines from a string,
    possibly with whitespace in between
    """
    return re.sub(r"\s*\n\s*", "\n", text)


class DocxFile(File):
    @classmethod
    def from_bytes(cls, file: BytesIO) -> "DocxFile":
        text = docx2txt.process(file)
        text = strip_consecutive_newlines(text)
        doc = Document(page_content=text.strip())
        doc.metadata["source"] = "p-1"
        return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=[doc])


class PdfFile(File):
    @classmethod
    def from_bytes(cls, file: BytesIO) -> "PdfFile":
        pdf = fitz.open(stream=file.read(), filetype="pdf")  # type: ignore
        docs = []
        for i, page in enumerate(pdf):
            text = page.get_text(sort=True)
            text = strip_consecutive_newlines(text)
            doc = Document(page_content=text.strip())
            doc.metadata["page"] = i + 1
            doc.metadata["source"] = f"p-{i + 1}"
            docs.append(doc)
        # file.read() mutates the file object, which can affect caching
        # so we need to reset the file pointer to the beginning
        file.seek(0)
        return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=docs)


class TxtFile(File):
    @classmethod
    def from_bytes(cls, file: BytesIO) -> "TxtFile":
        text = file.read().decode("utf-8", errors="replace")
        text = strip_consecutive_newlines(text)
        file.seek(0)
        doc = Document(page_content=text.strip())
        doc.metadata["source"] = "p-1"
        return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=[doc])


def read_file(file: BytesIO) -> File:
    """Reads an uploaded file and returns a File object"""
    if file.name.lower().endswith(".docx"):
        return DocxFile.from_bytes(file)
    elif file.name.lower().endswith(".pdf"):
        return PdfFile.from_bytes(file)
    elif file.name.lower().endswith(".txt"):
        return TxtFile.from_bytes(file)
    else:
        raise NotImplementedError(f"File type {file.name.split('.')[-1]} not supported")
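
Two details worth noticing above: the file id is an MD5 of the raw bytes (which is what the caching layer hashes on), and strip_consecutive_newlines collapses blank runs before chunking. A small in-memory check of both, with a made-up report string:

from io import BytesIO

from knowledge_gpt.core.parsing import read_file, strip_consecutive_newlines

print(strip_consecutive_newlines("Roof: ok.\n\n  \nGutters: clogged."))
# -> "Roof: ok.\nGutters: clogged."

buf = BytesIO(b"Roof: ok.\n\n\nGutters: clogged.")
buf.name = "report.txt"
f = read_file(buf)
print(f.id)  # md5 hex digest of the file bytes
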
core/prompts.py
ADDED
@@ -0,0 +1,31 @@
# flake8: noqa
from langchain.prompts import PromptTemplate

## Use a shorter template to reduce the number of tokens in the prompt
template = """Create a final answer to the given questions using the provided document excerpts (given in no particular order) as sources. ALWAYS include a "SOURCES" section in your answer citing only the minimal set of sources needed to answer the question. If you are unable to answer the question, simply state that you do not have enough information to answer the question and leave the SOURCES section empty. Use only the provided documents and do not attempt to fabricate an answer.

---------

QUESTION: What is the purpose of ARPA-H?
=========
Content: More support for patients and families. \n\nTo get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. \n\nIt's based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more. \n\nARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer's, diabetes, and more.
SOURCES: 1-32
Content: While we're at it, let's make sure every American can get the health care they need. \n\nWe've already made historic investments in health care. \n\nWe've made it easier for Americans to get the care they need, when they need it. \n\nWe've made it easier for Americans to get the treatments they need, when they need them. \n\nWe've made it easier for Americans to get the medications they need, when they need them.
SOURCES: 1-33
Content: The V.A. is pioneering new ways of linking toxic exposures to disease, already helping veterans get the care they deserve. \n\nWe need to extend that same care to all Americans. \n\nThat's why I'm calling on Congress to pass legislation that would establish a national registry of toxic exposures, and provide health care and financial assistance to those affected.
SOURCES: 1-30
=========
FINAL ANSWER: The purpose of ARPA-H is to drive breakthroughs in cancer, Alzheimer's, diabetes, and more.
SOURCES: 1-32

---------

QUESTION: {question}
=========
{summaries}
=========
FINAL ANSWER:"""

STUFF_PROMPT = PromptTemplate(
    template=template, input_variables=["summaries", "question"]
)
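
Rendering the template by hand shows exactly what the stuff chain sends to the model; the sample summary below is made up for illustration.

from knowledge_gpt.core.prompts import STUFF_PROMPT

rendered = STUFF_PROMPT.format(
    question="Show repair needs!",
    summaries="Content: The deck railing is loose.\nSOURCES: 2-1",
)
print(rendered[-200:])  # ends with the QUESTION / FINAL ANSWER scaffold
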
core/qa.py
ADDED
@@ -0,0 +1,65 @@
from typing import List
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from knowledge_gpt.core.prompts import STUFF_PROMPT
from langchain.docstore.document import Document
from knowledge_gpt.core.embedding import FolderIndex
from pydantic import BaseModel
from langchain.chat_models.base import BaseChatModel


class AnswerWithSources(BaseModel):
    answer: str
    sources: List[Document]


def query_folder(
    query: str,
    folder_index: FolderIndex,
    llm: BaseChatModel,
    return_all: bool = False,
) -> AnswerWithSources:
    """Queries a folder index for an answer.

    Args:
        query (str): The query to search for.
        folder_index (FolderIndex): The folder index to search.
        llm (BaseChatModel): The language model to use for answer generation.
        return_all (bool): Whether to return all the documents from the
            embedding or just the sources for the answer.

    Returns:
        AnswerWithSources: The answer and the source documents.
    """

    chain = load_qa_with_sources_chain(
        llm=llm,
        chain_type="stuff",
        prompt=STUFF_PROMPT,
    )

    relevant_docs = folder_index.index.similarity_search(query, k=5)
    result = chain(
        {"input_documents": relevant_docs, "question": query}, return_only_outputs=True
    )
    sources = relevant_docs

    if not return_all:
        sources = get_sources(result["output_text"], folder_index)

    answer = result["output_text"].split("SOURCES: ")[0]

    return AnswerWithSources(answer=answer, sources=sources)


def get_sources(answer: str, folder_index: FolderIndex) -> List[Document]:
    """Retrieves the docs that were used to generate the answer."""

    source_keys = [s for s in answer.split("SOURCES: ")[-1].split(", ")]

    source_docs = []
    for file in folder_index.files:
        for doc in file.docs:
            if doc.metadata["source"] in source_keys:
                source_docs.append(doc)
    return source_docs
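
get_sources depends entirely on the model echoing a well-formed SOURCES line; the parsing itself is plain string splitting, as this standalone snippet (with a made-up answer) shows.

# How a model answer's "SOURCES:" tail maps back to chunk keys.
answer = "The deck railing is loose and should be refastened.\nSOURCES: 2-1, 3-4"
source_keys = answer.split("SOURCES: ")[-1].split(", ")
print(source_keys)  # -> ['2-1', '3-4'], matched against doc.metadata["source"]
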
core/utils.py
ADDED
@@ -0,0 +1,32 @@
from typing import List
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.docstore.document import Document

from langchain.chat_models import ChatOpenAI
from knowledge_gpt.core.debug import FakeChatModel
from langchain.chat_models.base import BaseChatModel


def pop_docs_upto_limit(
    query: str, chain: StuffDocumentsChain, docs: List[Document], max_len: int
) -> List[Document]:
    """Pops documents from a list until the final prompt length is less
    than the max length."""

    token_count: int = chain.prompt_length(docs, question=query)  # type: ignore

    while token_count > max_len and len(docs) > 0:
        docs.pop()
        token_count = chain.prompt_length(docs, question=query)  # type: ignore

    return docs


def get_llm(model: str, **kwargs) -> BaseChatModel:
    if model == "debug":
        return FakeChatModel()

    if "gpt" in model:
        return ChatOpenAI(model=model, **kwargs)  # type: ignore

    raise NotImplementedError(f"Model {model} not supported!")
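
get_llm keys off the model name, so switching to the offline fake is a one-argument change; the key below is a placeholder.

from knowledge_gpt.core.utils import get_llm

llm = get_llm(model="debug")  # no network, canned answers
# llm = get_llm(model="gpt-3.5-turbo", openai_api_key="sk-...", temperature=0)
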
ui.py
ADDED
@@ -0,0 +1,64 @@
from typing import List, NoReturn
import streamlit as st
from langchain.docstore.document import Document
from knowledge_gpt.core.parsing import File
import openai
from streamlit.logger import get_logger

logger = get_logger(__name__)


def wrap_doc_in_html(docs: List[Document]) -> str:
    """Wraps each page in document separated by newlines in <p> tags"""
    text = [doc.page_content for doc in docs]
    if isinstance(text, list):
        # Add horizontal rules between pages
        text = "\n<hr/>\n".join(text)
    return "".join([f"<p>{line}</p>" for line in text.split("\n")])


def is_query_valid(query: str) -> bool:
    if not query:
        st.error("Please enter a question!")
        return False
    return True


def is_file_valid(file: File) -> bool:
    if (
        len(file.docs) == 0
        or "".join([doc.page_content for doc in file.docs]).strip() == ""
    ):
        st.error("Cannot read document! Make sure the document has selectable text")
        logger.error("Cannot read document")
        return False
    return True


def display_file_read_error(e: Exception, file_name: str) -> NoReturn:
    st.error("Error reading file. Make sure the file is not corrupted or encrypted")
    logger.error(f"{e.__class__.__name__}: {e}. Extension: {file_name.split('.')[-1]}")
    st.stop()


@st.cache_data(show_spinner=False)
def is_open_ai_key_valid(openai_api_key, model: str) -> bool:
    if model == "debug":
        return True

    if not openai_api_key:
        st.error("Please enter your OpenAI API key in the sidebar!")
        return False
    try:
        openai.ChatCompletion.create(
            model=model,
            messages=[{"role": "user", "content": "test"}],
            api_key=openai_api_key,
        )
    except Exception as e:
        st.error(f"{e.__class__.__name__}: {e}")
        logger.error(f"{e.__class__.__name__}: {e}")
        return False

    return True
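
wrap_doc_in_html flattens pages into one <p>-per-line string with <hr/> separators between pages; a quick check of its output:

from langchain.docstore.document import Document

from knowledge_gpt.ui import wrap_doc_in_html

docs = [Document(page_content="Page one.\nStill page one."), Document(page_content="Page two.")]
print(wrap_doc_in_html(docs))
# -> <p>Page one.</p><p>Still page one.</p><p><hr/></p><p>Page two.</p>
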