Spaces:
Runtime error
Runtime error
Corentin
committed on
Commit
•
ea4e986
1
Parent(s):
f8a66c4
spaces
Browse files- .gitattributes +4 -0
- LICENSE +21 -0
- README.md +47 -7
- awk.sh +1 -0
- data/dnd_spell.pdf +3 -0
- data/dnd_spell.txt +0 -0
- data/dnd_spell_split.txt +0 -0
- db_spells/chroma-collections.parquet +3 -0
- db_spells/chroma-embeddings.parquet +3 -0
- db_spells/index/id_to_uuid_abb61b80-18ff-4301-8b99-627ee62ef944.pkl +3 -0
- db_spells/index/index_abb61b80-18ff-4301-8b99-627ee62ef944.bin +3 -0
- db_spells/index/index_metadata_abb61b80-18ff-4301-8b99-627ee62ef944.pkl +3 -0
- db_spells/index/uuid_to_id_abb61b80-18ff-4301-8b99-627ee62ef944.pkl +3 -0
- ingest.py +35 -0
- main.py +100 -0
- qa.py +55 -0
- requirements.txt +8 -0
.gitattributes
CHANGED
@@ -32,3 +32,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
data/dnd_spell.pdf filter=lfs diff=lfs merge=lfs -text
|
36 |
+
db_spells/index/index_abb61b80-18ff-4301-8b99-627ee62ef944.bin filter=lfs diff=lfs merge=lfs -text
|
37 |
+
db_spells/chroma-collections.parquet filter=lfs diff=lfs merge=lfs -text
|
38 |
+
db_spells/chroma-embeddings.parquet filter=lfs diff=lfs merge=lfs -text
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2022 Harrison Chase
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -1,13 +1,53 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: streamlit
|
7 |
sdk_version: 1.19.0
|
8 |
-
app_file:
|
9 |
-
pinned: false
|
10 |
license: agpl-3.0
|
|
|
11 |
---
|
12 |
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: DnD QA Bot
|
3 |
+
emoji: 🗡️
|
4 |
+
colorFrom: purple
|
5 |
+
colorTo: yellow
|
6 |
sdk: streamlit
|
7 |
sdk_version: 1.19.0
|
8 |
+
app_file: main.py
|
|
|
9 |
license: agpl-3.0
|
10 |
+
python: 3.10.9
|
11 |
---
|
12 |
|
13 |
+
# 🗡️D&D Spell QA Bot🗡️
|
14 |
+
|
15 |
+
This is a chatbot that can answer questions about **Dungeons & Dragons spells** based on this [database](https://www.aidedd.org/dnd-filters/spells-5e.php) and built with LangChain and the OpenAI API. Useful for finding information quickly instead of browsing through 50 pages of PDF.
|
16 |
+
The creator of this bot is **[Corentin Meyer (@corentinm_py)](https://twitter.com/corentinm_py)**.
|
17 |
+
💪 This bot is based on the Notion Question-Answering demo from [LangChain](https://github.com/hwchase17/langchain)
|
18 |
+
|
19 |
+
# 🌲 Environment Setup
|
20 |
+
|
21 |
+
To set up your environment and run the code, first install all requirements and then launch the Streamlit app:
|
22 |
+
|
23 |
+
```shell
|
24 |
+
python -m venv .venv
|
25 |
+
source .venv/bin/activate
|
26 |
+
pip install -r requirements.txt
|
27 |
+
streamlit run main.py
|
28 |
+
```
|
29 |
+
|
30 |
+
The app reads your OpenAI API key from the environment, so export it in the same shell before launching (if you don't have one, get one [here](https://beta.openai.com/playground)):
|
31 |
+
|
32 |
+
```shell
|
33 |
+
export OPENAI_API_KEY=....
|
34 |
+
```
|
35 |
+
|
36 |
+
## 🚀 Code to deploy on Streamlit
|
37 |
+
|
38 |
+
The code to run the Streamlit app is in `main.py`.
|
39 |
+
Note that when setting up your Streamlit app you should make sure to add `OPENAI_API_KEY` as a secret environment variable.
|
40 |
+
|
41 |
+
## 🧑 Reproduce the embeddings
|
42 |
+
|
43 |
+
Run the following command to ingest the data and rebuild the `db_spells` vector store.
|
44 |
+
|
45 |
+
```shell
|
46 |
+
python ingest.py
|
47 |
+
```
|
48 |
+
|
49 |
+
Boom! Now you're done, and you can ask it questions like:
|
50 |
+
|
51 |
+
```shell
|
52 |
+
python qa.py "What's the size of tsunami spell ?"
|
53 |
+
```
|
awk.sh
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
awk '{B[NR%3]=$0} NR>2{ print B[(NR+1)%3]} /^level/ {print ""} END {print B[(NR+2)%3]; print B[(NR+3)%3]}' dnd_spell.txt > dnd_spell_split.txt
|
data/dnd_spell.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3b89814c7fe82a5b318fbf0aace794533ed9a4501e28dec72bb1eccd113b5027
|
3 |
+
size 2307284
|
data/dnd_spell.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/dnd_spell_split.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
db_spells/chroma-collections.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:085aa69d06c9189b9a861beb67583680dc3ada6e910077896432aed1739558c5
|
3 |
+
size 557
|
db_spells/chroma-embeddings.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a15ac51b1f91a063e3e4b3ddfd8e22d4fd0df42d527e5b8459f26d0edc486de6
|
3 |
+
size 3148551
|
db_spells/index/id_to_uuid_abb61b80-18ff-4301-8b99-627ee62ef944.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dfc6dbe52b6e9d5c12281a1d253f9950a4dbaf6cba9d5b421ea0608e1d3817b3
|
3 |
+
size 10498
|
db_spells/index/index_abb61b80-18ff-4301-8b99-627ee62ef944.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:16ed23c2bb32432023b2665ee2f0eb9b812f24959266df681a128a5aec1679ec
|
3 |
+
size 2076836
|
db_spells/index/index_metadata_abb61b80-18ff-4301-8b99-627ee62ef944.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:47a774357132affe4f362595e475f07233eed4702f998ad5cd1e3b67455dbaa4
|
3 |
+
size 74
|
db_spells/index/uuid_to_id_abb61b80-18ff-4301-8b99-627ee62ef944.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:08c067aeda07cadf4c22b364e1b009e6352806af610923970c00c9bc227b5a9b
|
3 |
+
size 12300
|
ingest.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#%%
|
2 |
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
3 |
+
|
4 |
+
from langchain.vectorstores import Chroma
|
5 |
+
from langchain.text_splitter import CharacterTextSplitter
|
6 |
+
|
7 |
+
persist_directory = "db_spells"
|
8 |
+
with open("data/dnd_spell_split.txt") as f:
|
9 |
+
dnd_spell = f.read()
|
10 |
+
text_splitter = CharacterTextSplitter(
|
11 |
+
separator="\n\n",
|
12 |
+
chunk_size=1000,
|
13 |
+
chunk_overlap=0,
|
14 |
+
length_function=len,
|
15 |
+
)
|
16 |
+
texts = text_splitter.split_text(dnd_spell)
|
17 |
+
|
18 |
+
docs = text_splitter.create_documents([dnd_spell])
|
19 |
+
embeddings = OpenAIEmbeddings()
|
20 |
+
|
21 |
+
metadatas = []
|
22 |
+
for i in texts:
|
23 |
+
source = i.split("\n")[0]
|
24 |
+
metadatas.append({"source": f"Spell {source} in dnd_spell_split.txt"})
|
25 |
+
#%%
|
26 |
+
docsearch = Chroma.from_texts(
|
27 |
+
texts,
|
28 |
+
embeddings,
|
29 |
+
persist_directory=persist_directory,
|
30 |
+
metadatas=metadatas,
|
31 |
+
)
|
32 |
+
docsearch.persist()
|
33 |
+
docsearch = None
|
34 |
+
|
35 |
+
# %%
|
main.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Python file to serve as the frontend"""
|
2 |
+
import streamlit as st
|
3 |
+
from streamlit_chat import message
|
4 |
+
|
5 |
+
from langchain.chains import VectorDBQAWithSourcesChain
|
6 |
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
7 |
+
from langchain.vectorstores import Chroma
|
8 |
+
from langchain.chat_models import ChatOpenAI
|
9 |
+
from langchain.prompts.chat import (
|
10 |
+
ChatPromptTemplate,
|
11 |
+
SystemMessagePromptTemplate,
|
12 |
+
HumanMessagePromptTemplate,
|
13 |
+
)
|
14 |
+
|
15 |
+
st.set_page_config(page_title="D&D 🗡️ Spell QA Bot", page_icon="🗡️")
|
16 |
+
|
17 |
+
# Build the LangChain prompt and question-answering chain.
|
18 |
+
system_template = """Use the following pieces of context to answer the users question.
|
19 |
+
If you don't know the answer, just say that you don't know, don't try to make up an answer.
|
20 |
+
ALWAYS return a "SOURCES" part in your answer.
|
21 |
+
The "SOURCES" part should be a reference to the source of the document from which you got your answer.
|
22 |
+
|
23 |
+
Example of your response should be:
|
24 |
+
|
25 |
+
```
|
26 |
+
The answer is foo
|
27 |
+
SOURCES: xyz
|
28 |
+
```
|
29 |
+
|
30 |
+
Begin!
|
31 |
+
----------------
|
32 |
+
{summaries}"""
|
33 |
+
messages = [
|
34 |
+
SystemMessagePromptTemplate.from_template(system_template),
|
35 |
+
HumanMessagePromptTemplate.from_template("{question}"),
|
36 |
+
]
|
37 |
+
prompt = ChatPromptTemplate.from_messages(messages)
|
38 |
+
|
39 |
+
|
40 |
+
@st.cache_resource
|
41 |
+
def load_chroma():
|
42 |
+
persist_directory = "db_spells"
|
43 |
+
embeddings = OpenAIEmbeddings()
|
44 |
+
vectordb = Chroma(
|
45 |
+
persist_directory=persist_directory, embedding_function=embeddings
|
46 |
+
)
|
47 |
+
return vectordb
|
48 |
+
|
49 |
+
|
50 |
+
vectordb = load_chroma()
|
51 |
+
chain_type_kwargs = {"prompt": prompt}
|
52 |
+
chain = VectorDBQAWithSourcesChain.from_chain_type(
|
53 |
+
ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
|
54 |
+
chain_type="stuff",
|
55 |
+
vectorstore=vectordb,
|
56 |
+
chain_type_kwargs=chain_type_kwargs,
|
57 |
+
)
|
58 |
+
|
59 |
+
|
60 |
+
# From here down is all the Streamlit UI.
|
61 |
+
st.header("D&D 🗡️ Spell QA Bot")
|
62 |
+
st.markdown(
|
63 |
+
"""
|
64 |
+
This is a chatbot that can answer questions about **Dungeon and Dragons spells** based on this [database](https://www.aidedd.org/dnd-filters/spells-5e.php) and built with LangChain and OpenAI API.
|
65 |
+
The creator of this bot is **[Corentin Meyer (@corentinm_py)](https://twitter.com/corentinm_py)**.
|
66 |
+
Try by yourself by typing something like: "What's the size of tsunami spell ?"
|
67 |
+
"""
|
68 |
+
)
|
69 |
+
|
70 |
+
if "generated" not in st.session_state:
|
71 |
+
st.session_state["generated"] = []
|
72 |
+
|
73 |
+
if "past" not in st.session_state:
|
74 |
+
st.session_state["past"] = []
|
75 |
+
|
76 |
+
|
77 |
+
def get_text():
|
78 |
+
input_text = st.text_input(
|
79 |
+
"You: ", "What's the size of tsunami spell ?", key="input"
|
80 |
+
)
|
81 |
+
return input_text
|
82 |
+
|
83 |
+
|
84 |
+
user_input = get_text()
|
85 |
+
|
86 |
+
if user_input:
|
87 |
+
result = chain(
|
88 |
+
{"question": user_input},
|
89 |
+
return_only_outputs=True,
|
90 |
+
)
|
91 |
+
output = f"Answer: {result['answer']}\nSources: {result['sources']}"
|
92 |
+
|
93 |
+
st.session_state.past.append(user_input)
|
94 |
+
st.session_state.generated.append(output)
|
95 |
+
|
96 |
+
if st.session_state["generated"]:
|
97 |
+
|
98 |
+
for i in range(len(st.session_state["generated"]) - 1, -1, -1):
|
99 |
+
message(st.session_state["generated"][i], key=str(i))
|
100 |
+
message(st.session_state["past"][i], is_user=True, key=str(i) + "_user")
|
qa.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Ask a question to the database."""
|
2 |
+
#%%
|
3 |
+
from langchain.chains import VectorDBQAWithSourcesChain
|
4 |
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
5 |
+
from langchain.vectorstores import Chroma
|
6 |
+
from langchain.chat_models import ChatOpenAI
|
7 |
+
from langchain.prompts.chat import (
|
8 |
+
ChatPromptTemplate,
|
9 |
+
SystemMessagePromptTemplate,
|
10 |
+
HumanMessagePromptTemplate,
|
11 |
+
)
|
12 |
+
import argparse
|
13 |
+
|
14 |
+
system_template = """Use the following pieces of context to answer the users question.
|
15 |
+
If you don't know the answer, just say that you don't know, don't try to make up an answer.
|
16 |
+
ALWAYS return a "SOURCES" part in your answer.
|
17 |
+
The "SOURCES" part should be a reference to the source of the document from which you got your answer.
|
18 |
+
|
19 |
+
Example of your response should be:
|
20 |
+
|
21 |
+
```
|
22 |
+
The answer is foo
|
23 |
+
SOURCES: xyz
|
24 |
+
```
|
25 |
+
|
26 |
+
Begin!
|
27 |
+
----------------
|
28 |
+
{summaries}"""
|
29 |
+
messages = [
|
30 |
+
SystemMessagePromptTemplate.from_template(system_template),
|
31 |
+
HumanMessagePromptTemplate.from_template("{question}"),
|
32 |
+
]
|
33 |
+
prompt = ChatPromptTemplate.from_messages(messages)
|
34 |
+
|
35 |
+
persist_directory = "db_spells"
|
36 |
+
embeddings = OpenAIEmbeddings()
|
37 |
+
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
|
38 |
+
chain_type_kwargs = {"prompt": prompt}
|
39 |
+
chain = VectorDBQAWithSourcesChain.from_chain_type(
|
40 |
+
ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
|
41 |
+
chain_type="stuff",
|
42 |
+
vectorstore=vectordb,
|
43 |
+
chain_type_kwargs=chain_type_kwargs,
|
44 |
+
)
|
45 |
+
|
46 |
+
parser = argparse.ArgumentParser(description="Ask a question to the DB.")
|
47 |
+
parser.add_argument("question", type=str, help="The question to ask the DB")
|
48 |
+
args = parser.parse_args()
|
49 |
+
|
50 |
+
result = chain(
|
51 |
+
{"question": args.question},
|
52 |
+
return_only_outputs=True,
|
53 |
+
)
|
54 |
+
print(f"Answer: {result['answer']}")
|
55 |
+
print(f"Sources: {result['sources']}")
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
langchain
|
2 |
+
openai
|
3 |
+
streamlit
|
4 |
+
streamlit-chat
|
5 |
+
ipykernel
|
6 |
+
tiktoken
|
7 |
+
chromadb
|
8 |
+
ipykernel
|