Commit 1f84a9a · upload
Parent(s): 8d717c1
Build status: Build error
Files changed:
- app/VectorStore/chroma-collections.parquet +2 -2
- app/VectorStore/index/id_to_uuid_3c194f90-478a-4f8e-a5ac-67776218c783.pkl +0 -3
- app/VectorStore/index/index_3c194f90-478a-4f8e-a5ac-67776218c783.bin +0 -3
- app/VectorStore/index/index_metadata_3c194f90-478a-4f8e-a5ac-67776218c783.pkl +0 -3
- app/VectorStore/index/uuid_to_id_3c194f90-478a-4f8e-a5ac-67776218c783.pkl +0 -3
- app/app.py +2 -2
- app/exploration.py +23 -0
- app/load_model.py +4 -4
- app/load_test.py +29 -0
- app/load_vectors.py +3 -3
- app/run.py +37 -5
- app/utils.py +12 -6
app/VectorStore/chroma-collections.parquet
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:6500348785bdf69480c86a933feaa0dd3328a9acffda71e251ca9928c6813627
+size 957
app/VectorStore/index/id_to_uuid_3c194f90-478a-4f8e-a5ac-67776218c783.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b3fd923d38dbc7773fa8ddd035a3a12b35b36c0596120795d5441fa2631aa500
-size 7657
app/VectorStore/index/index_3c194f90-478a-4f8e-a5ac-67776218c783.bin
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e8012c468a836e45dec5264f07e79a82dd9b0cfbd57b7db82ab3e5f87659e004
-size 779728
app/VectorStore/index/index_metadata_3c194f90-478a-4f8e-a5ac-67776218c783.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fe883ac5dc1e9c3d5b56fe942e1fef13b990df4e9b32e59c5eb7b12bba00e7c0
-size 73
app/VectorStore/index/uuid_to_id_3c194f90-478a-4f8e-a5ac-67776218c783.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d94d83b22ad6a388ffd24e1151e31ff2b22aaee250d0a8e442f0744bc00cffda
-size 8970
app/app.py
CHANGED
@@ -42,9 +42,9 @@ else:
         'Select the Documents to be used to answer your question',
         collections )
 
-    st.write('You selected:', option)
+    st.write('You selected:', option['name'])
 
-    chain = load_model.create_chain(llm, collection=option, model_name=
+    chain = load_model.create_chain(llm, collection=option['name'], model_name=option['model_name'])
     try:
        query = st.text_area('Ask a question:', 'Hallo how are you today?')
        result = chain({"query": query})
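Both changed lines assume that each entry in collections is now a dict with 'name' and 'model_name' keys, the shape produced by retrieve_collections() in app/utils.py further down. A minimal sketch of that consumption, assuming one collection named "axaterms"; the format_func argument is illustrative only and not part of the actual file:

    # Minimal sketch, not the app itself: the option shape app.py now expects from
    # retrieve_collections() (see app/utils.py below).
    import streamlit as st

    collections = (
        {'name': 'axaterms', 'model_name': 'hkunlp/instructor-large'},
    )
    option = st.selectbox(
        'Select the Documents to be used to answer your question',
        collections,
        format_func=lambda c: c['name'],  # hypothetical: display only the collection name
    )
    st.write('You selected:', option['name'])
    # app.py then builds the chain from both fields:
    # chain = load_model.create_chain(llm, collection=option['name'], model_name=option['model_name'])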
app/exploration.py
ADDED
@@ -0,0 +1,23 @@
+# %%
+
+from utils import retrieve_collections, get_chroma_client
+
+
+from load_model import load_embedding
+
+#retrieve_collections()
+
+client = get_chroma_client()
+
+# %%
+client.reset()
+# %%
+collections = tuple( [collection.name for collection in client.list_collections()] ) ## No embedding function set up in the collection...
+
+ef = load_embedding("hkunlp/instructor-large")
+collection="heikostest2"
+client.create_collection(collection, embedding_function=ef, metadata={"loaded_docs":[]})
+
+
+# %%
+client.list_collections()
app/load_model.py
CHANGED
@@ -97,9 +97,8 @@ def load_embedding(model_name):
     )
     return embeddings
 
-def load_vectorstore(model_name, collection):
+def load_vectorstore(model_name, collection, metadata):
     embeddings = load_embedding(model_name)
-
     client_settings = Settings(
         chroma_db_impl="duckdb+parquet",
         persist_directory=persist_directory,
@@ -110,11 +109,12 @@ def load_vectorstore(model_name, collection):
         embedding_function=embeddings,
         client_settings=client_settings,
         persist_directory=persist_directory,
+        collection_metadata=metadata
     )
     return vectorstore
 
-def create_chain(_llm, collection, model_name):
-    vectorstore = load_vectorstore(model_name, collection)
+def create_chain(_llm, collection, model_name, metadata=None):
+    vectorstore = load_vectorstore(model_name, collection, metadata=metadata)
     retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
     chain = RetrievalQA.from_chain_type(llm=_llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
     return chain
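load_vectorstore() now takes a metadata argument and forwards it to Chroma as collection_metadata, and create_chain() exposes it with a default of None. A hedged sketch of the two ways the new signature can be called; it assumes the OpenAI key is configured and an "axaterms" collection exists (values borrowed from app/run.py):

    # Sketch only: exercising the new create_chain signature.
    import load_model

    llm = load_model.load_openai_model()

    # metadata defaults to None, so existing callers keep working:
    chain = load_model.create_chain(llm, collection="axaterms",
                                    model_name="hkunlp/instructor-large")

    # with metadata, load_vectorstore forwards it to Chroma as collection_metadata:
    metadata = {"loaded_docs": [], "Subject": "AXA Terms",
                "model_name": "hkunlp/instructor-large"}
    chain = load_model.create_chain(llm, collection="axaterms",
                                    model_name="hkunlp/instructor-large",
                                    metadata=metadata)

    result = chain({"query": "Hallo how are you today?"})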
app/load_test.py
ADDED
@@ -0,0 +1,29 @@
+# %%
+# %%
+import os
+import pathlib
+
+from load_model import load_embedding
+from utils import get_chroma_client
+from load_vectors import load_from_web, create_and_add, load_and_split
+
+collection="axaterms"
+client = get_chroma_client()
+# Load collection to get metadata
+loaded_collection = client.get_collection(collection)
+
+# %%
+model_name = loaded_collection.metadata['model_name']
+
+# %%
+print( loaded_collection.json() )
+
+
+# %%
+client.get_collection(collection).json() #add documents destroys the metadata... maybe :)
+# %%
+
+#loaded_collection.modify(metadata={"Test":99})
+
+# %%
+loaded_collection.json()
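load_test.py is a scratch notebook that inspects a collection's stored metadata and probes whether it survives later operations (see the "add documents destroys the metadata... maybe" comment). A minimal sketch of the inspection step, assuming the "axaterms" collection from app/run.py already exists:

    # Sketch: reading back the metadata stored on an existing collection.
    from utils import get_chroma_client

    client = get_chroma_client()
    loaded_collection = client.get_collection("axaterms")

    print(loaded_collection.metadata)                # full metadata dict
    print(loaded_collection.metadata["model_name"])  # embedding model recorded by run.py
    print(loaded_collection.json())                  # full collection record as JSON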
app/load_vectors.py
CHANGED
@@ -41,10 +41,10 @@ def create_collection(collection_name, model_name, client):
     client.get_or_create_collection(collection_name, embedding_function=ef)
     return True
 
-def create_and_add(collection_name, sub_docs, model_name):
+def create_and_add(collection_name, sub_docs, model_name, metadata):
     logging.info(f"Adding documents to {collection_name}")
-    embeddings = load_embedding(model_name)
-    vectorstore = load_vectorstore(model_name, collection_name)
+    embeddings = load_embedding(model_name)
+    vectorstore = load_vectorstore(model_name, collection_name, metadata = metadata)
     vectorstore.add_documents(documents=sub_docs, embedding=embeddings)
     vectorstore.persist()
 
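create_and_add() now takes metadata as a fourth argument and threads it into load_vectorstore(). A short sketch of the updated call, mirroring app/run.py below with the URL list shortened to a single document:

    # Sketch of the new create_and_add signature in use; values follow app/run.py.
    from load_vectors import load_from_web, load_and_split, create_and_add

    metadata = {"loaded_docs": [], "Subject": "AXA Terms",
                "model_name": "hkunlp/instructor-large"}

    docs = load_from_web([
        "https://www.axa.de/site/axa-de/get/documents_E1883536226/axade/medien/privatkunden/haus-und-wohnen/hausratversicherung/hausrat-versicherungsbedingungen-L.pdf",
    ])
    sub_docs = load_and_split(docs, chunk_size=1000)
    create_and_add("axaterms", sub_docs, "hkunlp/instructor-large", metadata)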
app/run.py
CHANGED
@@ -1,17 +1,49 @@
+# This script inits the models and adds an example collection to the Vectorstore
 # %%
 import os
 import pathlib
+
+from load_model import load_embedding
+from utils import get_chroma_client
+from load_vectors import load_from_web, create_and_add, load_and_split
+
 current_path = str( pathlib.Path(__file__).parent.resolve() )
 with open(current_path+'/.openaiapikey', 'r') as reader:
     os.environ['OPENAI_API_KEY']=reader.read()
 import load_model
-import cloudpickle
 
 # %%
-#
+#load_model.load_gpu_model("decapoda-research/llama-7b-hf") #Download local model
 llm= load_model.load_openai_model()
 
+# %%
+#Load example Data
+client = get_chroma_client()
+client.reset()
+ef = load_embedding("hkunlp/instructor-large")
+collection_name="axaterms"
+metadata= {"loaded_docs":[], "Subject":"AXA Terms", "model_name": ef.model_name}
+selected_collection = client.create_collection(collection_name, embedding_function=ef, metadata=metadata)
+
+docs_tarifs= [
+    "https://www.axa.de/site/axa-de/get/documents_E1805589786/axade/medien/privatkunden/fahrzeugversicherungen/kfz-versicherung/start-and-drive/start-and-drive-versicherungsbedingungen.pdf",
+    "https://www.axa.de/site/axa-de/get/documents_E-298610932/axade/medien/privatkunden/haftpflicht-und-recht/rechtsschutz/versicherungsbedingungen-roland-rechtsschutz.pdf",
+    "https://www.axa.de/site/axa-de/get/documents_E1450059874/axade/medien/privatkunden/haftpflicht-und-recht/private%20haftpflichtversicherung/privathaftpflicht-versicherungsbedingungen-leistungspaket-L.pdf",
+    "https://www.axa.de/site/axa-de/get/documents_E1883536226/axade/medien/privatkunden/haus-und-wohnen/hausratversicherung/hausrat-versicherungsbedingungen-L.pdf",
+    ]
+
+# %%
+# Load collection to get metadata
+loaded_collection = client.get_collection(collection_name)
+model_name = loaded_collection.metadata['model_name']
+
+# %%
+
+docs = load_from_web(docs_tarifs)
+sub_docs = load_and_split(docs, chunk_size=1000)
+create_and_add(collection_name, sub_docs, model_name, metadata)
+
 # %%
-chain = load_model.create_chain(llm, collection=
-result = chain({"query": "
-print(result)
+chain = load_model.create_chain(llm, collection=collection_name, model_name=model_name)
+#result = chain({"query": "Ist mein Kinderwagen bei einem Leitungswasserschaden mitversichert?"})
+#print(result)
app/utils.py
CHANGED
@@ -4,6 +4,7 @@ from langchain.docstore.document import Document
 import chromadb
 from chromadb.config import Settings
 import load_model
+from load_model import load_embedding
 from load_vectors import load_from_file, load_and_split, create_and_add, load_from_web
 persist_directory = load_model.persist_directory
 
@@ -21,15 +22,18 @@ def format_result_set(result):
     for document in source_documents:
         st.write(format_document(document))
 
-
+#@st.cache_resource
 def get_chroma_client():
     return chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
                     persist_directory=persist_directory
                 ))
-
+#@st.cache_data
 def retrieve_collections():
     client = get_chroma_client()
-
+    all_collections = client.list_collections()
+    print(all_collections)
+    print(all_collections[0].metadata)
+    collections = tuple( [{'name': collection.name, 'model_name': collection.metadata['model_name']} for collection in all_collections] )
     return collections
 
 def load_files():
@@ -64,7 +68,7 @@ def load_files():
     if st.button('Upload'):
         docs = load_from_file(uploaded_files)
         sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
-        create_and_add(selected_collection, sub_docs,
+        create_and_add(selected_collection, sub_docs, None)
         uploaded_files=None
     else:
         st.write('Urls of Source Documents (Comma separated):')
@@ -75,12 +79,14 @@
     if st.button('Upload'):
         docs = load_from_web(urls)
         sub_docs = load_and_split(docs, chunk_size=int(chunk_size))
-        create_and_add(selected_collection, sub_docs,
+        create_and_add(selected_collection, sub_docs, None)
         uploaded_files=None
     else:
         collection = st.text_area('Name of your new collection:', '')
+        model_name = st.text_area('Choose the embedding function:', "hkunlp/instructor-large")
         if st.button('Create'):
             if len(collection)>3:
-
+                ef = load_embedding(model_name)
+                client.create_collection(collection, embedding_function=ef)
                 retrieve_collections.clear()
                 st.write("Collection " +collection+" succesfully created.")
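retrieve_collections() now returns a tuple of dicts instead of bare collection names, which is the shape the app.py change above consumes. A short sketch of reading that return value; it assumes every listed collection carries a 'model_name' entry in its metadata, since the comprehension looks that key up unconditionally:

    # Sketch of the return shape of the updated retrieve_collections().
    from utils import retrieve_collections

    collections = retrieve_collections()
    # e.g. ({'name': 'axaterms', 'model_name': 'hkunlp/instructor-large'},)
    for entry in collections:
        print(entry['name'], '->', entry['model_name'])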