Commit 9fb0f7d
Zwea Htet committed · 1 Parent(s): 0a665f4

integrated pinecone with llama index to store vector embeddings

Files changed:
- models/vector_database.py +41 -2
- pages/llama_custom_demo.py +6 -23
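For context, the hunks below refer to a Pinecone client pc, an index name pc_index_name, and an index_exists helper defined in the unchanged part of models/vector_database.py. As a point of reference only, a minimal sketch of that setup with the v3 Pinecone client might look like the following; the index name, dimension, cloud, and region are assumptions, not values taken from the repository:

import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

load_dotenv()

# Assumed setup: the real values live in the unchanged part of the file.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc_index_name = "demo-index"  # hypothetical name


def index_exists(name: str) -> bool:
    # Check whether the named index already exists in this Pinecone project.
    return name in pc.list_indexes().names()


if not index_exists(pc_index_name):
    pc.create_index(
        name=pc_index_name,
        dimension=1536,  # assumed embedding dimension (e.g. OpenAI ada-002)
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),  # assumed cloud/region
    )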
models/vector_database.py CHANGED

@@ -1,6 +1,14 @@
+from typing import List
 from pinecone import Pinecone, ServerlessSpec
 from llama_index.vector_stores.pinecone import PineconeVectorStore
 from dotenv import load_dotenv
+from llama_index.core import (
+    SimpleDirectoryReader,
+    Document,
+    VectorStoreIndex,
+    StorageContext,
+)
+from huggingface_hub import HfFileSystem
 
 import os
 

@@ -30,5 +38,36 @@ if not index_exists(pc_index_name):
 # Initialize your index
 pinecone_index = pc.Index(pc_index_name)
 
-#
-
+# print("Deleting all vectors in the pinecone index: ", pinecone_index.delete(delete_all=True))
+# print("Deleting all vectors with the namespace 'calregs_pdf': ", pinecone_index.delete(namespace="calregs_pdf"))
+
+SAVE_DIR = "uploaded_files"
+
+
+def _namespace_exists(namespace: str):
+    namespaces = pinecone_index.describe_index_stats()["namespaces"]
+    return namespace in namespaces
+
+
+def get_pinecone_index(filename: str) -> VectorStoreIndex:
+    """This function loads the index from Pinecone if it exists, otherwise it creates a new index from the document."""
+    namespace = filename.replace(".", "_").replace(" ", "_")
+    pinecone_vector_store = PineconeVectorStore(
+        pinecone_index=pinecone_index,
+        namespace=namespace,
+    )
+    index = None
+    if _namespace_exists(namespace=namespace):
+        print(f"Namespace {namespace} exists.")
+        index = VectorStoreIndex.from_vector_store(vector_store=pinecone_vector_store)
+    else:
+        reader = SimpleDirectoryReader(input_files=[f"{SAVE_DIR}/{filename}"])
+        docs = reader.load_data(show_progress=True)
+        storage_context = StorageContext.from_defaults(
+            vector_store=pinecone_vector_store
+        )
+        index = VectorStoreIndex.from_documents(
+            documents=docs, show_progress=True, storage_context=storage_context
+        )
+
+    return index
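A hypothetical caller of the new get_pinecone_index helper, assuming PINECONE_API_KEY is set in .env, the LLM and embedding model are configured via Settings as in the demo page, and a file has already been saved under uploaded_files/; the file name and question below are illustrative only:

from models.vector_database import get_pinecone_index

# "report.pdf" is a placeholder; it maps to the Pinecone namespace "report_pdf".
index = get_pinecone_index("report.pdf")

# Query the Pinecone-backed index; similarity_top_k is an illustrative choice.
query_engine = index.as_query_engine(similarity_top_k=3)
print(query_engine.query("What topics does this document cover?"))

Keying the namespace off the sanitized file name keeps each upload's vectors isolated within the shared Pinecone index, so _namespace_exists is enough to decide whether to reuse stored embeddings or re-read and re-embed the document.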
pages/llama_custom_demo.py CHANGED

@@ -5,11 +5,11 @@ from typing import List
 
 # local imports
 from models.llms import load_llm, integrated_llms
-from models.embeddings import
+from models.embeddings import openai_embed_model
 from models.llamaCustom import LlamaCustom
-from models.llamaCustomV2 import LlamaCustomV2
+# from models.llamaCustomV2 import LlamaCustomV2
 
-
+from models.vector_database import get_pinecone_index
 from utils.chatbox import show_previous_messages, show_chat_input
 from utils.util import validate_openai_api_key
 

@@ -22,6 +22,7 @@ from llama_index.core import (
     Settings,
     load_index_from_storage,
 )
+from llama_index.vector_stores.pinecone import PineconeVectorStore
 from llama_index.core.memory import ChatMemoryBuffer
 from llama_index.core.base.llms.types import ChatMessage
 

@@ -93,24 +94,6 @@ def get_index(
         raise e
     return index
 
-
-# def get_pinecone_index(filename: str) -> VectorStoreIndex:
-#     """Thie function loads the index from Pinecone if it exists, otherwise it creates a new index from the document."""
-#     reader = SimpleDirectoryReader(input_files=[f"{SAVE_DIR}/{filename}"])
-#     docs = reader.load_data(show_progress=True)
-#     storage_context = StorageContext.from_defaults(vector_store=pinecone_vector_store)
-#     index = VectorStoreIndex.from_documents(
-#         documents=docs, show_progress=True, storage_context=storage_context
-#     )
-
-#     return index
-
-
-def get_chroma_index(filename: str) -> VectorStoreIndex:
-    """This function loads the index from Chroma if it exists, otherwise it creates a new index from the document."""
-    pass
-
-
 def check_api_key(model_name: str, source: str):
     if source.startswith("openai"):
         if not st.session_state.openai_api_key:

@@ -205,8 +188,8 @@ with tab1:
         Settings.llm = llama_llm
 
         st.write("Processing Data ...")
-        index = get_index(selected_file)
-
+        # index = get_index(selected_file)
+        index = get_pinecone_index(selected_file)
 
         st.write("Finishing Up ...")
         llama_custom = LlamaCustom(model_name=selected_llm_name, index=index)
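The chat logic itself lives in models/llamaCustom.py and is not shown in this commit. Purely as a sketch, the Pinecone-backed index returned above could be wired into a chat loop with the ChatMemoryBuffer imported in this file; the chat mode and token limit here are assumptions, not values from the repository:

from llama_index.core.memory import ChatMemoryBuffer

# Hypothetical wiring; the real behavior is defined by LlamaCustom.
memory = ChatMemoryBuffer.from_defaults(token_limit=3000)
chat_engine = index.as_chat_engine(chat_mode="context", memory=memory)

answer = chat_engine.chat("Summarize the uploaded file.")
print(answer.response)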