heikowagner committed · Commit fbb697c
Parent(s): 19b8811

add document upload
Files changed:
- app/VectorStore/chroma-collections.parquet +2 -2
- app/VectorStore/chroma-embeddings.parquet +2 -2
- app/VectorStore/index/id_to_uuid_0244568c-57df-4dab-9a52-e4703f31eeaa.pkl +3 -0
- app/VectorStore/index/id_to_uuid_52984ff2-d9c3-459b-acc0-0b0aa559d50f.pkl +2 -2
- app/VectorStore/index/id_to_uuid_90530179-2196-4073-89e7-11f14538d27c.pkl +3 -0
- app/VectorStore/index/index_0244568c-57df-4dab-9a52-e4703f31eeaa.bin +3 -0
- app/VectorStore/index/index_52984ff2-d9c3-459b-acc0-0b0aa559d50f.bin +2 -2
- app/VectorStore/index/index_90530179-2196-4073-89e7-11f14538d27c.bin +3 -0
- app/VectorStore/index/index_metadata_0244568c-57df-4dab-9a52-e4703f31eeaa.pkl +3 -0
- app/VectorStore/index/index_metadata_52984ff2-d9c3-459b-acc0-0b0aa559d50f.pkl +1 -1
- app/VectorStore/index/index_metadata_90530179-2196-4073-89e7-11f14538d27c.pkl +3 -0
- app/VectorStore/index/uuid_to_id_0244568c-57df-4dab-9a52-e4703f31eeaa.pkl +3 -0
- app/VectorStore/index/uuid_to_id_52984ff2-d9c3-459b-acc0-0b0aa559d50f.pkl +2 -2
- app/VectorStore/index/uuid_to_id_90530179-2196-4073-89e7-11f14538d27c.pkl +3 -0
- app/app.py +38 -37
- app/load_model.py +0 -3
- app/load_vectors.py +23 -2
- app/requirements.txt +1 -1
- app/utils.py +51 -0
app/VectorStore/chroma-collections.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:e65624a226acdd91b0686aede21cb17c270204829fcc86602f16a6352b877337
+size 943
app/VectorStore/chroma-embeddings.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:514430ced16df82f6b5355cc14ed912c5af38661418efb691ea8e73e6333ffed
+size 5782971
app/VectorStore/index/id_to_uuid_0244568c-57df-4dab-9a52-e4703f31eeaa.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df8c9c0ad0e24164c8cdea96715e56553fc72fcb3dc7e7d7da60f0f9cf38ef1c
+size 1640
app/VectorStore/index/id_to_uuid_52984ff2-d9c3-459b-acc0-0b0aa559d50f.pkl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:f39dc0dcfa56bb6584759d134c28bd53ac0165a2873cdd5b9e0ff70244840542
+size 131496
app/VectorStore/index/id_to_uuid_90530179-2196-4073-89e7-11f14538d27c.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7296f8de5fb49d35a4e1b00cdc056b260b4a57b3d320f72d1c20982acd6c5f37
+size 3371
app/VectorStore/index/index_0244568c-57df-4dab-9a52-e4703f31eeaa.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae356cfeea07ada1ba8342dfd128fdb76f212e37ba0e2876fccafe2b16bd95e3
+size 164384
app/VectorStore/index/index_52984ff2-d9c3-459b-acc0-0b0aa559d50f.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:de4cc0ee24b85680520f48ee0ee673443133127e18533255c60528cfe2f925be
+size 13050028
app/VectorStore/index/index_90530179-2196-4073-89e7-11f14538d27c.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3015aa28224dce1211e2498699823bdee0958c6f024dd28ed317c0ec7e401556
+size 341400
app/VectorStore/index/index_metadata_0244568c-57df-4dab-9a52-e4703f31eeaa.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:757150b880764d792751d3d3675056820575f39244ff3401bbf602f213ba7df9
+size 73
app/VectorStore/index/index_metadata_52984ff2-d9c3-459b-acc0-0b0aa559d50f.pkl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:64da3bdfe4bc7727e421826a6459753a44eabcd37df7fe207fbde1014c0c2fe6
 size 74
app/VectorStore/index/index_metadata_90530179-2196-4073-89e7-11f14538d27c.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6d8aafc2b81de7e6a55297e5029654ee387b8774a6f91d5d702420e1ff80c78
+size 73
app/VectorStore/index/uuid_to_id_0244568c-57df-4dab-9a52-e4703f31eeaa.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3a5ae6784f1c41a78ce924bc4fd48d24083fb9df13ce10e271a25d00303f9e4
+size 1903
app/VectorStore/index/uuid_to_id_52984ff2-d9c3-459b-acc0-0b0aa559d50f.pkl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:b153d0a7649253a0b5b095f1d126ba5a36b6a650e177b03393bd76cf8b399896
+size 153763
app/VectorStore/index/uuid_to_id_90530179-2196-4073-89e7-11f14538d27c.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a97f316acd65482df9ed256c51cdb8c113d53e1d70346a666350ed625f2da76
+size 3938
app/app.py CHANGED
@@ -1,13 +1,9 @@
 import streamlit as st
-import langchain
 import load_model
 import utils as ut
-import chromadb
-from chromadb.config import Settings
 import os
 
 persist_directory = load_model.persist_directory
-
 st.title('myGPT')
 st.header('An GPT example brought to you by Heiko Wagner')
 
@@ -15,38 +11,43 @@ st.markdown('*\"Parametrised models are simply functions that depend on inputs a
 
 st.latex(r'''h(\boldsymbol x, \boldsymbol w)= \sum_{k=1}^{K}\boldsymbol w_{k} \phi_{k}(\boldsymbol x)''')
 
-
-
-
-model_type = st.selectbox(
-    'Select the Documents to be used to answer your question',
-    ('OpenAI', 'local_model') )
-
-if model_type=='OpenAI':
-    openai_key= st.text_area('OpenAI Key:', '')
-    os.environ["OPENAI_API_KEY"] = openai_key
-    llm= load_model.load_openai_model()
+agree = st.checkbox('Load new Documents')
+if agree:
+    ut.load_files()
 else:
-    llm = load_model.load_gpu_model("decapoda-research/llama-7b-hf")
-
-
-client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
-                    persist_directory=persist_directory
-))
-
-collections = tuple( [collection.name for collection in client.list_collections()] )
-print(collections)
-option = st.selectbox(
-    'Select the Documents to be used to answer your question',
-    collections )
-
-st.write('You selected:', option)
 
-
-
-
-
-
-
-
+    import torch
+    torch.cuda.empty_cache()
+
+    model_type = st.selectbox(
+        'Select the Documents to be used to answer your question',
+        ('OpenAI', 'local_model') )
+
+    if model_type=='OpenAI':
+        if 'openai_key' not in st.session_state:
+            openai_key= st.text_area('OpenAI Key:', '')
+            if len(openai_key)>10:
+                st.session_state['openai_key'] = openai_key
+                os.environ["OPENAI_API_KEY"] = openai_key
+        else:
+            os.environ["OPENAI_API_KEY"] = st.session_state.openai_key
+        llm= load_model.load_openai_model()
+    else:
+        llm = load_model.load_gpu_model("decapoda-research/llama-7b-hf")
+
+
+    collections = ut.retrieve_collections()
+    option = st.selectbox(
+        'Select the Documents to be used to answer your question',
+        collections )
+
+    st.write('You selected:', option)
+
+    chain = load_model.create_chain(llm, collection=option)
+    try:
+        query = st.text_area('Ask a question:', 'Hallo how are you today?')
+        result = chain({"query": query})
+        ut.format_result_set(result)
+    finally:
+        del chain
+        torch.cuda.empty_cache()
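For reference, the new query path builds a chain with load_model.create_chain(llm, collection=option), calls it with a {"query": ...} dict, and hands the result to ut.format_result_set. A minimal sketch of that call pattern using a plain langchain RetrievalQA chain; the collection name, embedding model, and persist directory below are illustrative assumptions rather than values confirmed by this commit, and create_chain's internals are not shown here:

# Hedged sketch (assumes a langchain 0.0.x-era API and an existing Chroma
# collection persisted under ./VectorStore).
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceInstructEmbeddings

embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
vectorstore = Chroma(collection_name="my_docs",            # hypothetical collection
                     embedding_function=embeddings,
                     persist_directory="./VectorStore")

chain = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0),
    retriever=vectorstore.as_retriever(),
    return_source_documents=True,   # so the caller can list the source documents
)

result = chain({"query": "Hallo how are you today?"})
print(result["result"])             # answer text
print(result["source_documents"])   # documents the answer was drawn from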
app/load_model.py CHANGED
@@ -27,9 +27,6 @@ print(current_path)
 persist_directory = current_path + "/VectorStore"
 
 # %%
-llm =OpenAI(temperature=0.9)
-llm
-
 @st.cache_resource
 def load_cpu_model():
     """Does not work atm, bc cpu model is not persisted"""
app/load_vectors.py CHANGED
@@ -2,7 +2,7 @@
 import nltk
 from langchain.indexes import VectorstoreIndexCreator
 from langchain.text_splitter import CharacterTextSplitter, NLTKTextSplitter
-from langchain.document_loaders import OnlinePDFLoader
+from langchain.document_loaders import OnlinePDFLoader, UnstructuredPDFLoader
 from langchain.vectorstores import Chroma
 from langchain.embeddings import LlamaCppEmbeddings, HuggingFaceInstructEmbeddings
 from chromadb.config import Settings
@@ -16,6 +16,8 @@ from load_model import load_embedding
 import torch
 import re
 import pathlib
+import tempfile
+
 
 current_path = str( pathlib.Path(__file__).parent.resolve() )
 
@@ -47,7 +49,7 @@ def create_and_add(collection_name, sub_docs, model_name):
     )
 
     client = chromadb.Client(client_settings)
-    collection_name = collection_name + "_" + re.sub('[^A-Za-z0-9]+', '', model_name)
+    collection_name = collection_name # + "_" + re.sub('[^A-Za-z0-9]+', '', model_name)
 
     embeddings = load_embedding(model_name)
     logging.info(f"Adding documents to {collection_name}")
@@ -71,6 +73,25 @@ def create_and_add(collection_name, sub_docs, model_name):
 
     return vectorstore
 
+def load_from_file(files):
+
+    saved_files=[]
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        for file in files:
+            temp_dir = pathlib.Path(tmpdirname)
+            file_name = os.path.join(temp_dir,file.name)
+            saved_files.append(file_name)
+            with open(file_name, mode='wb') as w:
+                w.write(file.read())
+
+        print(saved_files)
+        loaders=[UnstructuredPDFLoader(pdf) for pdf in saved_files]
+        docs = []
+        print(loaders)
+        for loader in loaders:
+            docs.extend(loader.load())
+    return docs
+
 def load_from_web(urls, cache=True):
     docs_list = urls
     filename=f"./{sha256(str(urls).encode('utf-8')).hexdigest()}.pkl"
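The new load_from_file helper exists because Streamlit's uploader hands back in-memory file objects while UnstructuredPDFLoader expects a path on disk, so each upload is first written into a temporary directory and parsed from there. A rough usage sketch outside Streamlit; the FakeUpload class below is a hypothetical stand-in that only mimics the .name/.read() interface the helper relies on:

# Hedged sketch: driving load_from_file without Streamlit by faking the
# minimal UploadedFile interface it uses (a .name attribute and .read()).
import io

from load_vectors import load_from_file, load_and_split, create_and_add

class FakeUpload(io.BytesIO):   # hypothetical stand-in for st.file_uploader output
    def __init__(self, name: str, data: bytes):
        super().__init__(data)
        self.name = name

with open("some_paper.pdf", "rb") as f:               # any local PDF, just for the demo
    uploads = [FakeUpload("some_paper.pdf", f.read())]

docs = load_from_file(uploads)                        # writes temp files, parses via UnstructuredPDFLoader
sub_docs = load_and_split(docs, chunk_size=1000)      # same chunking call used by the app
create_and_add("my_collection", sub_docs, "hkunlp/instructor-large")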
app/requirements.txt CHANGED
@@ -9,4 +9,4 @@ streamlit
 requests==2.28.0
 latex2markdown
 openai
-unstructured
+unstructured[local-inference]
app/utils.py CHANGED
@@ -1,6 +1,11 @@
 import streamlit as st
 import latex2markdown
 from langchain.docstore.document import Document
+import chromadb
+from chromadb.config import Settings
+import load_model
+from load_vectors import load_from_file, load_and_split, create_and_add
+persist_directory = load_model.persist_directory
 
 def format_document(document: Document):
     """TODO: Implement a nice style"""
@@ -16,4 +21,50 @@ def format_result_set(result):
     for document in source_documents:
         st.write(format_document(document))
 
+@st.cache_resource
+def get_chroma_client():
+    return chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
+                    persist_directory=persist_directory
+))
+@st.cache_data
+def retrieve_collections():
+    client = get_chroma_client()
+    collections = tuple( [collection.name for collection in client.list_collections()] )
+    return collections
 
+def load_files():
+
+    client = get_chroma_client()
+
+    option = st.radio(
+        "",
+        options=["Add Documents", "Start new collection"],
+    )
+
+    collections = retrieve_collections()
+
+    if option == "Add Documents":
+        selected_collection = st.selectbox(
+            'Add to exsisting collection or create a new one',
+            collections )
+        if st.button('Delete Collection (⚠️ This is destructive and not reversible)'):
+            client.delete_collection(name=selected_collection)
+            retrieve_collections.clear()
+            collections = retrieve_collections()
+
+        st.write('Source Documents:')
+        uploaded_files = st.file_uploader("Choose a PDF file", accept_multiple_files=True)
+        chunk_size = st.text_area('chunk Size:', 1000)
+
+        if st.button('Upload'):
+            docs = load_from_file(uploaded_files)
+            sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
+            create_and_add(selected_collection, sub_docs, "hkunlp/instructor-large")
+            uploaded_files=None
+    else:
+        collection = st.text_area('Name of your new collection:', '')
+        if st.button('Create'):
+            if len(collection)>3:
+                client.create_collection(collection) #collection_name + "_" + re.sub('[^A-Za-z0-9]+', '', model_name) --Problem i added the model to the name -> Better use Metadata :)
+                retrieve_collections.clear()
+                st.write("Collection " +collection+" succesfully created.")