Spaces: Runtime error

Adrian Cowham committed
Commit e71c4e6 • 1 Parent(s): e8442b6

restarting

Browse files
- .gitattributes +3 -0
- .gitignore +160 -0
- README.md +1 -1
- resources/design-by-fire.pdf +3 -0
- resources/lets-talk.pdf +3 -0
- resources/progit.pdf +3 -0
- src/.DS_Store +0 -0
- src/app.py +138 -0
- src/components/__init__.py +0 -0
- src/components/faq.py +46 -0
- src/core/__init__.py +0 -0
- src/core/caching.py +32 -0
- src/core/chunking.py +61 -0
- src/core/debug.py +49 -0
- src/core/embedding.py +76 -0
- src/core/parsing.py +105 -0
- src/core/prompts.py +31 -0
- src/core/qa.py +78 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+resources/design-by-fire.pdf filter=lfs diff=lfs merge=lfs -text
+resources/lets-talk.pdf filter=lfs diff=lfs merge=lfs -text
+resources/progit.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
README.md
CHANGED
@@ -5,7 +5,7 @@ colorFrom: green
 colorTo: green
 sdk: gradio
 sdk_version: 3.40.1
-app_file: app.py
+app_file: src/app.py
 pinned: false
 ---
resources/design-by-fire.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0cf8176ae0f4873ca6547f22ecad1fcc366b170488782087a3be48801721eba0
size 1353204
resources/lets-talk.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1545cbf074f8e363200e32f1da24b0319163bfafd2871078c8e750a32b02098b
size 4156635
resources/progit.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dea5f1cce14aabd2d3e7246fd4d0e6fe632c13561060bb47eee069b7f257289a
size 18915172
src/.DS_Store
ADDED
Binary file (6.15 kB)
src/app.py
ADDED
@@ -0,0 +1,138 @@
import os
from threading import Lock
from typing import Any, Dict, Optional, Tuple

import gradio as gr
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.prompts.chat import (ChatPromptTemplate,
                                    HumanMessagePromptTemplate,
                                    SystemMessagePromptTemplate)

from .core.chunking import chunk_file
from .core.embedding import embed_files
from .core.parsing import read_file

VECTOR_STORE = "faiss"
MODEL = "openai"
EMBEDDING = "openai"
MODEL = "gpt-3.5-turbo-16k"
K = 5
USE_VERBOSE = True
API_KEY = os.environ["OPENAI_API_KEY"]
system_template = """
Use the context below to answer questions. You must only use the Context to answer questions. If I ask you about 'the book' or 'this book' or similar references, then answer using the Context. If you cannot find the answer from the Context below, you must respond with
"I'm sorry, but I can't find the answer to your question in the book, 'Design by Fire,' by Emily Elizabeth Schlickman and Brett Milligan." All answers must be in English unless you are explicitly asked to translate to a different language.
----------------
{context}
{chat_history}
"""

# Create the chat prompt templates
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}")
]
qa_prompt = ChatPromptTemplate.from_messages(messages)

class AnswerConversationBufferMemory(ConversationBufferMemory):
    def save_context(self, inputs: Dict[str, Any], outputs: Dict[str, str]) -> None:
        return super(AnswerConversationBufferMemory, self).save_context(inputs, {'response': outputs['answer']})

def getretriever():
    with open("./resources/design-by-fire.pdf", 'rb') as uploaded_file:
        try:
            file = read_file(uploaded_file)
        except Exception as e:
            print(e)

    chunked_file = chunk_file(file, chunk_size=512, chunk_overlap=0)
    folder_index = embed_files(
        files=[chunked_file],
        embedding=EMBEDDING,
        vector_store=VECTOR_STORE,
        openai_api_key=API_KEY,
    )
    return folder_index.index.as_retriever(verbose=True, search_type="similarity", search_kwargs={"k": K})

retriever = getretriever()

def getanswer(chain, question, history):
    if hasattr(chain, "value"):
        chain = chain.value
    if hasattr(history, "value"):
        history = history.value
    if hasattr(question, "value"):
        question = question.value

    history = history or []
    lock = Lock()
    lock.acquire()
    try:
        output = chain({"question": question})
        output = output["answer"]
        history.append((question, output))
    except Exception as e:
        raise e
    finally:
        lock.release()
    return history, history, gr.update(value="")

def load_chain(inputs = None):
    llm = ChatOpenAI(
        openai_api_key=API_KEY,
        model_name=MODEL,
        verbose=True)
    chain = ConversationalRetrievalChain.from_llm(
        llm,
        retriever=retriever,
        return_source_documents=USE_VERBOSE,
        memory=AnswerConversationBufferMemory(memory_key="chat_history", return_messages=True),
        verbose=USE_VERBOSE,
        combine_docs_chain_kwargs={"prompt": qa_prompt})
    return chain

CSS = """
.contain { display: flex; flex-direction: column; }
.gradio-container { height: 100vh !important; }
#component-0 { height: 100%; }
#chatbot { flex-grow: 1; overflow: auto;}
"""

with gr.Blocks() as block:
    with gr.Row():
        with gr.Column(scale=0.75):
            with gr.Row():
                gr.Markdown("<h1>Design by Fire</h1>")
            with gr.Row():
                gr.Markdown("by Emily Elizabeth Schlickman and Brett Milligan")
            chatbot = gr.Chatbot(elem_id="chatbot").style(height=600)

            with gr.Row():
                message = gr.Textbox(
                    label="",
                    placeholder="Design by Fire",
                    lines=1,
                )
            with gr.Row():
                submit = gr.Button(value="Send", variant="primary", scale=1)

            state = gr.State()
            chain_state = gr.State(load_chain)

            submit.click(getanswer, inputs=[chain_state, message, state], outputs=[chatbot, state, message])
            message.submit(getanswer, inputs=[chain_state, message, state], outputs=[chatbot, state, message])

        with gr.Column(scale=0.25):
            with gr.Row():
                gr.Markdown("<h1><center>Suggestions</center></h1>")
            ex1 = gr.Button(value="What are the main factors and trends discussed in the book that contribute to the changing behavior of wildfires?", variant="primary")
            ex1.click(getanswer, inputs=[chain_state, ex1, state], outputs=[chatbot, state, message])
            ex2 = gr.Button(value="How does the book explore the relationship between fire and different landscapes, such as wilderness and urban areas?", variant="primary")
            ex2.click(getanswer, inputs=[chain_state, ex2, state], outputs=[chatbot, state, message])
            ex3 = gr.Button(value="What are the three approaches to designing with fire that the book presents?", variant="primary")
            ex3.click(getanswer, inputs=[chain_state, ex3, state], outputs=[chatbot, state, message])

block.launch(debug=True)
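Note on `AnswerConversationBufferMemory` above: because the chain is built with `return_source_documents=True`, it emits both `answer` and `source_documents`, and a stock `ConversationBufferMemory` cannot tell which output to persist. A minimal standalone sketch of the same idea (not part of this commit; the example strings and the `AnswerOnlyMemory` name are illustrative):

```python
from typing import Any, Dict

from langchain.memory import ConversationBufferMemory


class AnswerOnlyMemory(ConversationBufferMemory):
    """Persist only the 'answer' output when a chain returns several keys."""

    def save_context(self, inputs: Dict[str, Any], outputs: Dict[str, str]) -> None:
        super().save_context(inputs, {"response": outputs["answer"]})


memory = AnswerOnlyMemory(memory_key="chat_history", return_messages=True)
# A chain with return_source_documents=True yields something like:
outputs = {"answer": "Prescribed burns reduce fuel loads.", "source_documents": []}
memory.save_context({"question": "How do prescribed burns help?"}, outputs)
print(memory.load_memory_variables({})["chat_history"])
```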
src/components/__init__.py
ADDED
File without changes
src/components/faq.py
ADDED
@@ -0,0 +1,46 @@
# flake8: noqa
import streamlit as st


def faq():
    st.markdown(
        """
# FAQ
## How does KnowledgeGPT work?
When you upload a document, it will be divided into smaller chunks
and stored in a special type of database called a vector index
that allows for semantic search and retrieval.

When you ask a question, KnowledgeGPT will search through the
document chunks and find the most relevant ones using the vector index.
Then, it will use GPT3 to generate a final answer.

## Is my data safe?
Yes, your data is safe. KnowledgeGPT does not store your documents or
questions. All uploaded data is deleted after you close the browser tab.

## Why does it take so long to index my document?
If you are using a free OpenAI API key, it will take a while to index
your document. This is because the free API key has strict [rate limits](https://platform.openai.com/docs/guides/rate-limits/overview).
To speed up the indexing process, you can use a paid API key.

## What do the numbers mean under each source?
For a PDF document, you will see a citation number like this: 3-12.
The first number is the page number and the second number is
the chunk number on that page. For DOCS and TXT documents,
the first number is set to 1 and the second number is the chunk number.

## Are the answers 100% accurate?
No, the answers are not 100% accurate. KnowledgeGPT uses GPT-3 to generate
answers. GPT-3 is a powerful language model, but it sometimes makes mistakes
and is prone to hallucinations. Also, KnowledgeGPT uses semantic search
to find the most relevant chunks and does not see the entire document,
which means that it may not be able to find all the relevant information and
may not be able to answer all questions (especially summary-type questions
or questions that require a lot of context from the document).

But for most use cases, KnowledgeGPT is very accurate and can answer
most questions. Always check with the sources to make sure that the answers
are correct.
"""
    )
src/core/__init__.py
ADDED
File without changes
src/core/caching.py
ADDED
@@ -0,0 +1,32 @@
import canonical_demo_memory.core.chunking as chunking
import canonical_demo_memory.core.embedding as embedding
import canonical_demo_memory.core.parsing as parsing
import streamlit as st
from canonical_demo_memory.core.parsing import File
from streamlit.runtime.caching.hashing import HashFuncsDict


def file_hash_func(file: File) -> str:
    """Get a unique hash for a file"""
    return file.id


@st.cache_data(show_spinner=False)
def bootstrap_caching():
    """Patch module functions with caching"""

    # Get all subtypes of File from module
    file_subtypes = [
        cls
        for cls in vars(parsing).values()
        if isinstance(cls, type) and issubclass(cls, File) and cls != File
    ]
    file_hash_funcs: HashFuncsDict = {cls: file_hash_func for cls in file_subtypes}

    parsing.read_file = st.cache_data(show_spinner=False)(parsing.read_file)
    chunking.chunk_file = st.cache_data(show_spinner=False, hash_funcs=file_hash_funcs)(
        chunking.chunk_file
    )
    embedding.embed_files = st.cache_data(
        show_spinner=False, hash_funcs=file_hash_funcs
    )(embedding.embed_files)
src/core/chunking.py
ADDED
@@ -0,0 +1,61 @@
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

from .parsing import File


def chunk_sentences(sentences, chunk_size=512):
    sents = []
    current_sent = ""

    for sentence in sentences:
        # If adding the next sentence doesn't exceed the chunk_size,
        # we add the sentence to the current chunk.
        if len(current_sent) + len(sentence) <= chunk_size:
            current_sent += " " + sentence
        else:
            # If adding the sentence would make the chunk too long,
            # we add the current_sent chunk to the list of chunks and start a new chunk.
            sents.append(current_sent)
            current_sent = sentence

    # After going through all the sentences, there may be a chunk that hasn't yet been added to the list.
    # We add it now:
    if current_sent:
        sents.append(current_sent)

    return sents


def chunk_file(
    file: File, chunk_size: int, chunk_overlap: int = 0, model_name="gpt-3.5-turbo"
) -> File:
    """Chunks each document in a file into smaller documents
    according to the specified chunk size and overlap,
    where the size is determined by the number of tokens for the specified model.
    """

    # split each document into chunks
    chunked_docs = []
    for doc in file.docs:
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            model_name=model_name,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

        chunks = text_splitter.split_text(doc.page_content)

        for i, chunk in enumerate(chunks):
            doc = Document(
                page_content=chunk,
                metadata={
                    "page": doc.metadata.get("page", 1),
                    "chunk": i + 1,
                    "source": f"{doc.metadata.get('page', 1)}-{i + 1}",
                },
            )
            chunked_docs.append(doc)

    chunked_file = file.copy()
    chunked_file.docs = chunked_docs
    return chunked_file
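The `source` metadata written here (`page-chunk`) is the citation format that the FAQ in `src/components/faq.py` describes as numbers like `3-12`. A quick inspection sketch (illustrative only; `some_file` is assumed to be a `File` returned by `read_file` in `src/core/parsing.py`):

```python
# Sketch: inspect the page/chunk citation keys produced by chunk_file.
chunked = chunk_file(some_file, chunk_size=512, chunk_overlap=0)
for doc in chunked.docs[:3]:
    # e.g. page 3, chunk 12 -> source "3-12"
    print(doc.metadata["page"], doc.metadata["chunk"], doc.metadata["source"])
```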
src/core/debug.py
ADDED
@@ -0,0 +1,49 @@
from langchain.vectorstores import VectorStore
from typing import Iterable, List, Any
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.embeddings.fake import FakeEmbeddings as FakeEmbeddingsBase
from langchain.chat_models.fake import FakeListChatModel
from typing import Optional


class FakeChatModel(FakeListChatModel):
    def __init__(self, **kwargs):
        responses = ["The answer is 42. SOURCES: 1, 2, 3, 4"]
        super().__init__(responses=responses, **kwargs)


class FakeEmbeddings(FakeEmbeddingsBase):
    def __init__(self, **kwargs):
        super().__init__(size=4, **kwargs)


class FakeVectorStore(VectorStore):
    """Fake vector store for testing purposes."""

    def __init__(self, texts: List[str]):
        self.texts: List[str] = texts

    def add_texts(
        self, texts: Iterable[str], metadatas: List[dict] | None = None, **kwargs: Any
    ) -> List[str]:
        self.texts.extend(texts)
        return self.texts

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "FakeVectorStore":
        return cls(texts=list(texts))

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        return [
            Document(page_content=text, metadata={"source": f"{i+1}-{1}"})
            for i, text in enumerate(self.texts)
        ]
src/core/embedding.py
ADDED
@@ -0,0 +1,76 @@
from typing import List, Type

from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import VectorStore
from langchain.vectorstores.faiss import FAISS

from .debug import FakeEmbeddings, FakeVectorStore
from .parsing import File


class FolderIndex:
    """Index for a collection of files (a folder)"""

    def __init__(self, files: List[File], index: VectorStore):
        self.name: str = "default"
        self.files = files
        self.index: VectorStore = index

    @staticmethod
    def _combine_files(files: List[File]) -> List[Document]:
        """Combines all the documents in a list of files into a single list."""

        all_texts = []
        for file in files:
            for doc in file.docs:
                doc.metadata["file_name"] = file.name
                doc.metadata["file_id"] = file.id
                all_texts.append(doc)

        return all_texts

    @classmethod
    def from_files(
        cls, files: List[File], embeddings: Embeddings, vector_store: Type[VectorStore]
    ) -> "FolderIndex":
        """Creates an index from files."""

        all_docs = cls._combine_files(files)

        index = vector_store.from_documents(
            documents=all_docs,
            embedding=embeddings,
        )

        return cls(files=files, index=index)


def embed_files(
    files: List[File], embedding: str, vector_store: str, **kwargs
) -> FolderIndex:
    """Embeds a collection of files and stores them in a FolderIndex."""

    supported_embeddings: dict[str, Type[Embeddings]] = {
        "openai": OpenAIEmbeddings,
        "debug": FakeEmbeddings,
    }
    supported_vector_stores: dict[str, Type[VectorStore]] = {
        "faiss": FAISS,
        "debug": FakeVectorStore,
    }

    if embedding in supported_embeddings:
        _embeddings = supported_embeddings[embedding](**kwargs)
    else:
        raise NotImplementedError(f"Embedding {embedding} not supported.")

    if vector_store in supported_vector_stores:
        _vector_store = supported_vector_stores[vector_store]
    else:
        raise NotImplementedError(f"Vector store {vector_store} not supported.")

    return FolderIndex.from_files(
        files=files, embeddings=_embeddings, vector_store=_vector_store
    )
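`embed_files` is what `src/app.py` calls with `embedding="openai"` and `vector_store="faiss"`. The `"debug"` entries route to the fakes in `src/core/debug.py`, which makes it possible to exercise the indexing path without an API key. A rough sketch (not part of the commit), assuming the repository's `src` package is importable from the working directory:

```python
from io import BytesIO

from src.core.embedding import embed_files
from src.core.parsing import read_file

# Build a tiny File from an in-memory text "upload"; read_file dispatches on the name.
buf = BytesIO(b"Fire-adapted landscapes change over time.")
buf.name = "example.txt"
file = read_file(buf)

# "debug" selects FakeEmbeddings / FakeVectorStore, so no OpenAI key is required.
index = embed_files(files=[file], embedding="debug", vector_store="debug")
print(index.index.similarity_search("fire", k=1))
```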
src/core/parsing.py
ADDED
@@ -0,0 +1,105 @@
import re
from abc import ABC, abstractmethod
from copy import deepcopy
from hashlib import md5
from io import BytesIO
from typing import Any, List, Optional

import docx2txt
import fitz
from langchain.docstore.document import Document


class File(ABC):
    """Represents an uploaded file comprised of Documents"""

    def __init__(
        self,
        name: str,
        id: str,
        metadata: Optional[dict[str, Any]] = None,
        docs: Optional[List[Document]] = None,
    ):
        self.name = name
        self.id = id
        self.metadata = metadata or {}
        self.docs = docs or []

    @classmethod
    @abstractmethod
    def from_bytes(cls, file: BytesIO) -> "File":
        """Creates a File from a BytesIO object"""

    def __repr__(self) -> str:
        return (
            f"File(name={self.name}, id={self.id},"
            " metadata={self.metadata}, docs={self.docs})"
        )

    def __str__(self) -> str:
        return f"File(name={self.name}, id={self.id}, metadata={self.metadata})"

    def copy(self) -> "File":
        """Create a deep copy of this File"""
        return self.__class__(
            name=self.name,
            id=self.id,
            metadata=deepcopy(self.metadata),
            docs=deepcopy(self.docs),
        )


def strip_consecutive_newlines(text: str) -> str:
    """Strips consecutive newlines from a string
    possibly with whitespace in between
    """
    return re.sub(r"\s*\n\s*", "\n", text)


class DocxFile(File):
    @classmethod
    def from_bytes(cls, file: BytesIO) -> "DocxFile":
        text = docx2txt.process(file)
        text = strip_consecutive_newlines(text)
        doc = Document(page_content=text.strip())
        return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=[doc])


class PdfFile(File):
    @classmethod
    def from_bytes(cls, file: BytesIO) -> "PdfFile":
        pdf = fitz.open(stream=file.read(), filetype="pdf")  # type: ignore
        docs = []
        for i, page in enumerate(pdf):
            text = page.get_text(sort=True)
            text = strip_consecutive_newlines(text)
            doc = Document(page_content=text.strip())
            doc.metadata["page"] = i + 1
            docs.append(doc)
        # file.read() mutates the file object, which can affect caching
        # so we need to reset the file pointer to the beginning
        file.seek(0)
        return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=docs)


class TxtFile(File):
    @classmethod
    def from_bytes(cls, file: BytesIO) -> "TxtFile":
        text = file.read().decode("utf-8")
        text = strip_consecutive_newlines(text)
        file.seek(0)
        doc = Document(page_content=text.strip())
        return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=[doc])


def read_file(file: BytesIO) -> File:
    """Reads an uploaded file and returns a File object"""
    if file.name.lower().endswith(".docx"):
        return DocxFile.from_bytes(file)
    elif file.name.lower().endswith(".pdf"):
        return PdfFile.from_bytes(file)
    elif file.name.lower().endswith(".txt"):
        return TxtFile.from_bytes(file)
    else:
        raise NotImplementedError(f"File type {file.name.split('.')[-1]} not supported")
src/core/prompts.py
ADDED
@@ -0,0 +1,31 @@
# flake8: noqa
from langchain.prompts import PromptTemplate

## Use a shorter template to reduce the number of tokens in the prompt
template = """Create a final answer to the given questions using the provided document excerpts (in no particular order) as references. ALWAYS include a "SOURCES" section in your answer including only the minimal set of sources needed to answer the question. If you are unable to answer the question, simply state that you do not know. Do not attempt to fabricate an answer and leave the SOURCES section empty.

---------

QUESTION: What is the purpose of ARPA-H?
=========
Content: More support for patients and families. \n\nTo get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. \n\nIt’s based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more. \n\nARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more.
Source: 1-32
Content: While we’re at it, let’s make sure every American can get the health care they need. \n\nWe’ve already made historic investments in health care. \n\nWe’ve made it easier for Americans to get the care they need, when they need it. \n\nWe’ve made it easier for Americans to get the treatments they need, when they need them. \n\nWe’ve made it easier for Americans to get the medications they need, when they need them.
Source: 1-33
Content: The V.A. is pioneering new ways of linking toxic exposures to disease, already helping veterans get the care they deserve. \n\nWe need to extend that same care to all Americans. \n\nThat’s why I’m calling on Congress to pass legislation that would establish a national registry of toxic exposures, and provide health care and financial assistance to those affected.
Source: 1-30
=========
FINAL ANSWER: The purpose of ARPA-H is to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more.
SOURCES: 1-32

---------

QUESTION: {question}
=========
{summaries}
=========
FINAL ANSWER:"""

STUFF_PROMPT = PromptTemplate(
    template=template, input_variables=["summaries", "question"]
)
src/core/qa.py
ADDED
@@ -0,0 +1,78 @@
from typing import Any, List

from canonical_demo_memory.core.debug import FakeChatModel
from canonical_demo_memory.core.embedding import FolderIndex
from canonical_demo_memory.core.prompts import STUFF_PROMPT
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from pydantic import BaseModel


class AnswerWithSources(BaseModel):
    answer: str
    sources: List[Document]


def query_folder(
    query: str,
    folder_index: FolderIndex,
    return_all: bool = False,
    model: str = "openai",
    **model_kwargs: Any,
) -> AnswerWithSources:
    """Queries a folder index for an answer.

    Args:
        query (str): The query to search for.
        folder_index (FolderIndex): The folder index to search.
        return_all (bool): Whether to return all the documents from the embedding or
            just the sources for the answer.
        model (str): The model to use for the answer generation.
        **model_kwargs (Any): Keyword arguments for the model.

    Returns:
        AnswerWithSources: The answer and the source documents.
    """
    supported_models = {
        "openai": ChatOpenAI,
        "debug": FakeChatModel,
    }

    if model in supported_models:
        llm = supported_models[model](**model_kwargs)
    else:
        raise ValueError(f"Model {model} not supported.")

    chain = load_qa_with_sources_chain(
        llm=llm,
        chain_type="stuff",
        prompt=STUFF_PROMPT,
    )

    relevant_docs = folder_index.index.similarity_search(query, k=5)
    result = chain(
        {"input_documents": relevant_docs, "question": query}, return_only_outputs=True
    )
    sources = relevant_docs

    if not return_all:
        sources = get_sources(result["output_text"], folder_index)

    answer = result["output_text"].split("SOURCES: ")[0]

    return AnswerWithSources(answer=answer, sources=sources)


def get_sources(answer: str, folder_index: FolderIndex) -> List[Document]:
    """Retrieves the docs that were cited as sources in the generated answer."""

    source_keys = [s for s in answer.split("SOURCES: ")[-1].split(", ")]

    source_docs = []
    for file in folder_index.files:
        for doc in file.docs:
            if doc.metadata["source"] in source_keys:
                source_docs.append(doc)

    return source_docs
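`query_folder` is not wired into `src/app.py` in this commit (the app builds a `ConversationalRetrievalChain` directly), but it can be exercised against a `FolderIndex` on its own. A sketch with the `"debug"` model, assuming the `canonical_demo_memory` imports above resolve to the modules that live under `src/core` here, and that `folder_index` comes from `embed_files(...)`:

```python
# Sketch: run the sources-QA chain against an existing FolderIndex with the fake model.
result = query_folder(
    query="What is the book about?",
    folder_index=folder_index,
    model="debug",      # FakeChatModel always answers "The answer is 42. SOURCES: 1, 2, 3, 4"
)
print(result.answer)    # text before "SOURCES:"
print(result.sources)   # docs whose metadata["source"] matched the cited keys
```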