Commit 1f84a9a · upload
Parent(s): 8d717c1
Build status: Build error
Files changed:
- app/VectorStore/chroma-collections.parquet +2 -2
- app/VectorStore/index/id_to_uuid_3c194f90-478a-4f8e-a5ac-67776218c783.pkl +0 -3
- app/VectorStore/index/index_3c194f90-478a-4f8e-a5ac-67776218c783.bin +0 -3
- app/VectorStore/index/index_metadata_3c194f90-478a-4f8e-a5ac-67776218c783.pkl +0 -3
- app/VectorStore/index/uuid_to_id_3c194f90-478a-4f8e-a5ac-67776218c783.pkl +0 -3
- app/app.py +2 -2
- app/exploration.py +23 -0
- app/load_model.py +4 -4
- app/load_test.py +29 -0
- app/load_vectors.py +3 -3
- app/run.py +37 -5
- app/utils.py +12 -6
app/VectorStore/chroma-collections.parquet
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:6500348785bdf69480c86a933feaa0dd3328a9acffda71e251ca9928c6813627
+size 957
app/VectorStore/index/id_to_uuid_3c194f90-478a-4f8e-a5ac-67776218c783.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b3fd923d38dbc7773fa8ddd035a3a12b35b36c0596120795d5441fa2631aa500
-size 7657
app/VectorStore/index/index_3c194f90-478a-4f8e-a5ac-67776218c783.bin
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e8012c468a836e45dec5264f07e79a82dd9b0cfbd57b7db82ab3e5f87659e004
-size 779728
app/VectorStore/index/index_metadata_3c194f90-478a-4f8e-a5ac-67776218c783.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fe883ac5dc1e9c3d5b56fe942e1fef13b990df4e9b32e59c5eb7b12bba00e7c0
-size 73
app/VectorStore/index/uuid_to_id_3c194f90-478a-4f8e-a5ac-67776218c783.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d94d83b22ad6a388ffd24e1151e31ff2b22aaee250d0a8e442f0744bc00cffda
-size 8970
app/app.py
CHANGED
@@ -42,9 +42,9 @@ else:
         'Select the Documents to be used to answer your question',
         collections )
 
-    st.write('You selected:', option)
+    st.write('You selected:', option['name'])
 
-    chain = load_model.create_chain(llm, collection=option, model_name=
+    chain = load_model.create_chain(llm, collection=option['name'], model_name=option['model_name'])
     try:
        query = st.text_area('Ask a question:', 'Hallo how are you today?')
        result = chain({"query": query})
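Both changed lines assume that each entry in collections is now a dict with 'name' and 'model_name' keys, the shape produced by retrieve_collections() in app/utils.py further down. A minimal sketch of that consumption, assuming one collection named "axaterms"; the format_func argument is illustrative only and not part of the actual file:

    # Minimal sketch, not the app itself: the option shape app.py now expects from
    # retrieve_collections() (see app/utils.py below).
    import streamlit as st

    collections = (
        {'name': 'axaterms', 'model_name': 'hkunlp/instructor-large'},
    )
    option = st.selectbox(
        'Select the Documents to be used to answer your question',
        collections,
        format_func=lambda c: c['name'],  # hypothetical: display only the collection name
    )
    st.write('You selected:', option['name'])
    # app.py then builds the chain from both fields:
    # chain = load_model.create_chain(llm, collection=option['name'], model_name=option['model_name'])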
app/exploration.py
ADDED
@@ -0,0 +1,23 @@
+# %%
+
+from utils import retrieve_collections, get_chroma_client
+
+
+from load_model import load_embedding
+
+#retrieve_collections()
+
+client = get_chroma_client()
+
+# %%
+client.reset()
+# %%
+collections = tuple( [collection.name for collection in client.list_collections()] ) ## No embedding function set up in the collection...
+
+ef = load_embedding("hkunlp/instructor-large")
+collection="heikostest2"
+client.create_collection(collection, embedding_function=ef, metadata={"loaded_docs":[]})
+
+
+# %%
+client.list_collections()
app/load_model.py
CHANGED
@@ -97,9 +97,8 @@ def load_embedding(model_name):
     )
     return embeddings
 
-def load_vectorstore(model_name, collection):
+def load_vectorstore(model_name, collection, metadata):
     embeddings = load_embedding(model_name)
-
     client_settings = Settings(
         chroma_db_impl="duckdb+parquet",
         persist_directory=persist_directory,
@@ -110,11 +109,12 @@ def load_vectorstore(model_name, collection):
         embedding_function=embeddings,
         client_settings=client_settings,
         persist_directory=persist_directory,
+        collection_metadata=metadata
     )
     return vectorstore
 
-def create_chain(_llm, collection, model_name):
-    vectorstore = load_vectorstore(model_name, collection)
+def create_chain(_llm, collection, model_name, metadata=None):
+    vectorstore = load_vectorstore(model_name, collection, metadata=metadata)
     retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
     chain = RetrievalQA.from_chain_type(llm=_llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
     return chain
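load_vectorstore() now takes a metadata argument and forwards it to Chroma as collection_metadata, and create_chain() exposes it with a default of None. A hedged sketch of the two ways the new signature can be called; it assumes the OpenAI key is configured and an "axaterms" collection exists (values borrowed from app/run.py):

    # Sketch only: exercising the new create_chain signature.
    import load_model

    llm = load_model.load_openai_model()

    # metadata defaults to None, so existing callers keep working:
    chain = load_model.create_chain(llm, collection="axaterms",
                                    model_name="hkunlp/instructor-large")

    # with metadata, load_vectorstore forwards it to Chroma as collection_metadata:
    metadata = {"loaded_docs": [], "Subject": "AXA Terms",
                "model_name": "hkunlp/instructor-large"}
    chain = load_model.create_chain(llm, collection="axaterms",
                                    model_name="hkunlp/instructor-large",
                                    metadata=metadata)

    result = chain({"query": "Hallo how are you today?"})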
app/load_test.py
ADDED
@@ -0,0 +1,29 @@
+# %%
+# %%
+import os
+import pathlib
+
+from load_model import load_embedding
+from utils import get_chroma_client
+from load_vectors import load_from_web, create_and_add, load_and_split
+
+collection="axaterms"
+client = get_chroma_client()
+# Load collection to get metadata
+loaded_collection = client.get_collection(collection)
+
+# %%
+model_name = loaded_collection.metadata['model_name']
+
+# %%
+print( loaded_collection.json() )
+
+
+# %%
+client.get_collection(collection).json() #add documents destroys the metadata... maybe :)
+# %%
+
+#loaded_collection.modify(metadata={"Test":99})
+
+# %%
+loaded_collection.json()
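load_test.py is a scratch notebook that inspects a collection's stored metadata and probes whether it survives later operations (see the "add documents destroys the metadata... maybe" comment). A minimal sketch of the inspection step, assuming the "axaterms" collection from app/run.py already exists:

    # Sketch: reading back the metadata stored on an existing collection.
    from utils import get_chroma_client

    client = get_chroma_client()
    loaded_collection = client.get_collection("axaterms")

    print(loaded_collection.metadata)                # full metadata dict
    print(loaded_collection.metadata["model_name"])  # embedding model recorded by run.py
    print(loaded_collection.json())                  # full collection record as JSON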
app/load_vectors.py
CHANGED
@@ -41,10 +41,10 @@ def create_collection(collection_name, model_name, client):
     client.get_or_create_collection(collection_name, embedding_function=ef)
     return True
 
-def create_and_add(collection_name, sub_docs, model_name):
+def create_and_add(collection_name, sub_docs, model_name, metadata):
     logging.info(f"Adding documents to {collection_name}")
-    embeddings = load_embedding(model_name)
-    vectorstore = load_vectorstore(model_name, collection_name)
+    embeddings = load_embedding(model_name)
+    vectorstore = load_vectorstore(model_name, collection_name, metadata = metadata)
     vectorstore.add_documents(documents=sub_docs, embedding=embeddings)
     vectorstore.persist()
 
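create_and_add() now takes metadata as a fourth argument and threads it into load_vectorstore(). A short sketch of the updated call, mirroring app/run.py below with the URL list shortened to a single document:

    # Sketch of the new create_and_add signature in use; values follow app/run.py.
    from load_vectors import load_from_web, load_and_split, create_and_add

    metadata = {"loaded_docs": [], "Subject": "AXA Terms",
                "model_name": "hkunlp/instructor-large"}

    docs = load_from_web([
        "https://www.axa.de/site/axa-de/get/documents_E1883536226/axade/medien/privatkunden/haus-und-wohnen/hausratversicherung/hausrat-versicherungsbedingungen-L.pdf",
    ])
    sub_docs = load_and_split(docs, chunk_size=1000)
    create_and_add("axaterms", sub_docs, "hkunlp/instructor-large", metadata)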
app/run.py
CHANGED
@@ -1,17 +1,49 @@
+# This script inits the models and adds an example collection to the Vectorstore
 # %%
 import os
 import pathlib
+
+from load_model import load_embedding
+from utils import get_chroma_client
+from load_vectors import load_from_web, create_and_add, load_and_split
+
 current_path = str( pathlib.Path(__file__).parent.resolve() )
 with open(current_path+'/.openaiapikey', 'r') as reader:
     os.environ['OPENAI_API_KEY']=reader.read()
 import load_model
-import cloudpickle
 
 # %%
-#
+#load_model.load_gpu_model("decapoda-research/llama-7b-hf") #Download local model
 llm= load_model.load_openai_model()
 
+# %%
+#Load example Data
+client = get_chroma_client()
+client.reset()
+ef = load_embedding("hkunlp/instructor-large")
+collection_name="axaterms"
+metadata= {"loaded_docs":[], "Subject":"AXA Terms", "model_name": ef.model_name}
+selected_collection = client.create_collection(collection_name, embedding_function=ef, metadata=metadata)
+
+docs_tarifs= [
+    "https://www.axa.de/site/axa-de/get/documents_E1805589786/axade/medien/privatkunden/fahrzeugversicherungen/kfz-versicherung/start-and-drive/start-and-drive-versicherungsbedingungen.pdf",
+    "https://www.axa.de/site/axa-de/get/documents_E-298610932/axade/medien/privatkunden/haftpflicht-und-recht/rechtsschutz/versicherungsbedingungen-roland-rechtsschutz.pdf",
+    "https://www.axa.de/site/axa-de/get/documents_E1450059874/axade/medien/privatkunden/haftpflicht-und-recht/private%20haftpflichtversicherung/privathaftpflicht-versicherungsbedingungen-leistungspaket-L.pdf",
+    "https://www.axa.de/site/axa-de/get/documents_E1883536226/axade/medien/privatkunden/haus-und-wohnen/hausratversicherung/hausrat-versicherungsbedingungen-L.pdf",
+    ]
+
+# %%
+# Load collection to get metadata
+loaded_collection = client.get_collection(collection_name)
+model_name = loaded_collection.metadata['model_name']
+
+# %%
+
+docs = load_from_web(docs_tarifs)
+sub_docs = load_and_split(docs, chunk_size=1000)
+create_and_add(collection_name, sub_docs, model_name, metadata)
+
 # %%
-chain = load_model.create_chain(llm, collection=
-result = chain({"query": "
-print(result)
+chain = load_model.create_chain(llm, collection=collection_name, model_name=model_name)
+#result = chain({"query": "Ist mein Kinderwagen bei einem Leitungswasserschaden mitversichert?"})
+#print(result)
app/utils.py
CHANGED
@@ -4,6 +4,7 @@ from langchain.docstore.document import Document
 import chromadb
 from chromadb.config import Settings
 import load_model
+from load_model import load_embedding
 from load_vectors import load_from_file, load_and_split, create_and_add, load_from_web
 persist_directory = load_model.persist_directory
 
@@ -21,15 +22,18 @@ def format_result_set(result):
     for document in source_documents:
         st.write(format_document(document))
 
-
+#@st.cache_resource
 def get_chroma_client():
     return chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
                     persist_directory=persist_directory
                 ))
-
+#@st.cache_data
 def retrieve_collections():
     client = get_chroma_client()
-
+    all_collections = client.list_collections()
+    print(all_collections)
+    print(all_collections[0].metadata)
+    collections = tuple( [{'name': collection.name, 'model_name': collection.metadata['model_name']} for collection in all_collections] )
     return collections
 
 def load_files():
@@ -64,7 +68,7 @@ def load_files():
     if st.button('Upload'):
         docs = load_from_file(uploaded_files)
         sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
-        create_and_add(selected_collection, sub_docs,
+        create_and_add(selected_collection, sub_docs, None)
         uploaded_files=None
     else:
         st.write('Urls of Source Documents (Comma separated):')
@@ -75,12 +79,14 @@
     if st.button('Upload'):
         docs = load_from_web(urls)
         sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
-        create_and_add(selected_collection, sub_docs,
+        create_and_add(selected_collection, sub_docs, None)
         uploaded_files=None
     else:
         collection = st.text_area('Name of your new collection:', '')
+        model_name = st.text_area('Choose the embedding function:', "hkunlp/instructor-large")
         if st.button('Create'):
             if len(collection)>3:
-
+                ef = load_embedding(model_name)
+                client.create_collection(collection, embedding_function=ef)
                 retrieve_collections.clear()
                 st.write("Collection " +collection+" succesfully created.")
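retrieve_collections() now returns a tuple of dicts instead of bare collection names, which is the shape the app.py change above consumes. A short sketch of reading that return value; it assumes every listed collection carries a 'model_name' entry in its metadata, since the comprehension looks that key up unconditionally:

    # Sketch of the return shape of the updated retrieve_collections().
    from utils import retrieve_collections

    collections = retrieve_collections()
    # e.g. ({'name': 'axaterms', 'model_name': 'hkunlp/instructor-large'},)
    for entry in collections:
        print(entry['name'], '->', entry['model_name'])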