marcellopoliti committed · Commit c5a0a6e
Parent(s): f85c548

feat: add pdf docs
Files changed:
- .streamlit/secrets.toml +1 -0
- app.py +1 -0
- pages/manage_knowledge_box.py +24 -0
- requirements.txt +1 -0
- services/document_manager/document_loader.py +31 -4
- services/vectordb_manager/vectordb_manager.py +5 -3
- utils.py +2 -1
.streamlit/secrets.toml CHANGED
@@ -1 +1,2 @@
 password = "brianknowsai"
+OPENAI_API_KEY = "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"
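Streamlit loads this file automatically and exposes its keys through the `st.secrets` mapping, which is how the rest of this commit reads the value. A minimal sketch of that lookup (the environment-variable fallback here is illustrative, not part of the commit):

```python
import os

import streamlit as st

# st.secrets behaves like a dict over .streamlit/secrets.toml;
# the os.getenv fallback is a hypothetical convenience, not app code.
openai_key = st.secrets.get("OPENAI_API_KEY", os.getenv("OPENAI_API_KEY"))
```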
app.py CHANGED
@@ -5,6 +5,7 @@ from utils import get_chroma_client, get_embedding_function
 
 import hmac
 import streamlit as st
+import os
 
 __import__("pysqlite3")
 import sys
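The `__import__("pysqlite3")` / `import sys` lines in the surrounding context are the start of the common workaround for Chroma requiring a newer SQLite than some hosts ship. The full idiom usually looks like the sketch below; the final module swap is assumed from that convention, since it falls outside this hunk:

```python
# Common pysqlite3 shim: make the stdlib name "sqlite3" resolve to
# pysqlite3-binary, which bundles a modern SQLite for Chroma.
__import__("pysqlite3")
import sys

sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
```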
pages/manage_knowledge_box.py CHANGED
@@ -3,6 +3,8 @@ from retrieve_kb import get_current_knowledge_bases, get_knowledge_base_informat
 from generate_kb import add_links_to_knowledge_base
 from app import client, default_embedding_function
 import pandas as pd
+from tempfile import NamedTemporaryFile
+import os
 
 st.title("Get knowledge boxes")
 
@@ -56,6 +58,28 @@ if st.button("add url to collection"):
     res = add_links_to_knowledge_base(client=client, kb_name=collection_name, urls=urls)
     st.write(res)
 
+st.header("Add pdf to existing collection")
+uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
+if st.button("add pdf"):
+    # Create a temporary file
+    with NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+        # Write the uploaded PDF to the temporary file
+        tmp_file.write(uploaded_file.getvalue())
+        tmp_path = tmp_file.name
+    print("PATH: ", tmp_path)
+    urls = [tmp_path]
+    res = add_links_to_knowledge_base(
+        client=client, kb_name=collection_name, urls=urls
+    )
+    st.write(res)
+    # Clean up: delete the temporary file
+    os.remove(tmp_path)
+
+# if st.button("add pdf"):
+#     urls = [url_text]  # put in a list even if only one
+#     res = add_links_to_knowledge_base(client=client, kb_name=collection_name, urls=urls)
+#     st.write(res)
+
 
 st.header("Add csv to existing collection")
 uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
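The temp-file dance in this hunk exists because `st.file_uploader` returns an in-memory `UploadedFile`, while the loader behind `add_links_to_knowledge_base` expects a path it can open, so the bytes are spilled to disk first and the path is passed along. A standalone sketch of the same pattern (the helper name is illustrative, not from the repo):

```python
import os
from tempfile import NamedTemporaryFile


def persist_upload(uploaded_file, suffix: str = ".pdf") -> str:
    """Write an in-memory Streamlit upload to disk and return its path."""
    # delete=False keeps the file after the context manager closes it,
    # so a path-based loader can read it; the caller removes it later.
    with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(uploaded_file.getvalue())
        return tmp.name
```

One caveat visible in the diff: `uploaded_file.getvalue()` runs without a `None` check, so clicking "add pdf" before choosing a file will raise.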
requirements.txt CHANGED
@@ -10,6 +10,7 @@ streamlit>=1.27.2
 python-dotenv==1.0.0
 fastapi>=0.104.0
 uvicorn>=0.23.2
+pypdf
 #pypdf==3.16.4
 #python-multipart==0.0.6
 #matplotlib==3.8.3
services/document_manager/document_loader.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
 from langchain.document_loaders import WebBaseLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import requests
+from langchain_community.document_loaders import NotionDirectoryLoader
 
 
 class DocumentsLoader:
@@ -17,9 +18,10 @@ class DocumentsLoader:
 
     def is_notion_url(self, url):
         # Regular expressions to match Notion URLs
-        notion_regex = r"https://(www\.)?(notion\.so|notion\.site)/"
+        return "notion" in url
+        # notion_regex = r"https://(www\.)?(notion\.so|notion\.site)/"
         # Check if the URL matches the Notion regex
-        return re.match(notion_regex, url) is not None
+        # return re.match(notion_regex, url) is not None
 
     def is_pdf_url(self, url):
         # Define a list of common PDF file extensions
@@ -42,16 +44,24 @@ class DocumentsLoader:
         return False
 
     def load_docs(self, doc_urls: list) -> list:
-        web_urls, pdf_urls, docs = [], [], []
+        web_urls, pdf_urls, notion_urls, docs = [], [], [], []
         if isinstance(doc_urls[0], list):
             doc_urls = [doc[0] for doc in doc_urls]
             # doc_urls = doc_urls[0]
+
+        # split urls on pdf, web
+        print("docs urls: ", doc_urls)
         for url in doc_urls:
+            print("URL : ", url)
+            print(self.is_pdf_url(url))
             if self.is_pdf_url(url):
                 pdf_urls.append(url)
+            if self.is_notion_url(url):
+                notion_urls.append(url)
             else:
                 web_urls.append(url)
 
+        # load web urls
         if len(web_urls) > 0:
             web_urls = [url for url in web_urls if self.is_valid_url(url)]
             for web_url in web_urls:
@@ -62,8 +72,11 @@ class DocumentsLoader:
             except Exception as e:
                 print(f"Error web loader, {web_url}: {str(e)}")
 
+        # load pdf urls
         if len(pdf_urls) > 0:
-            pdf_urls = [url for url in pdf_urls if self.is_valid_url(url)]
+            # print("n urls", pdf_urls)
+            # pdf_urls = [url for url in pdf_urls if self.is_valid_url(url)]
+            # print("n urls", pdf_urls)
             for pdf_url in pdf_urls:
                 try:
                     pdf_loader = PyPDFLoader(pdf_url)
@@ -71,6 +84,20 @@ class DocumentsLoader:
                     docs = docs + pdf_docs
                 except Exception as e:
                     print(f"Error pdf loader, {pdf_url}: {str(e)}")
+
+        # notion loader: not working
+        # if len(notion_urls) > 0:
+        #     print("ADDING NOTION URLS")
+        #     notion_urls = [url for url in notion_urls if self.is_notion_url(url)]
+        #     for notion_url in notion_urls:
+        #         print(notion_url)
+        #         try:
+        #             notion_loader = NotionDirectoryLoader(notion_url)
+        #             notion_docs = notion_loader.load()
+        #             print("Notion docs ", notion_docs)
+        #             docs = notion_docs + docs
+        #         except Exception as e:
+        #             print(f"Error notion loader, {notion_url}: {str(e)}")
         return docs
 
     def split_docs(self, docs, chunk_size=2000):
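One behavioral note on the new routing in `load_docs`: `if self.is_pdf_url(url)` is followed by an independent `if self.is_notion_url(url)` / `else`, so a PDF URL that is not a Notion URL also falls through to the `else` branch and is appended to `web_urls`. An `elif` chain keeps the buckets disjoint; a sketch of that fix (not what this commit ships):

```python
def route_urls(urls, is_pdf_url, is_notion_url):
    """Split URLs into disjoint pdf/notion/web buckets."""
    pdf_urls, notion_urls, web_urls = [], [], []
    for url in urls:
        if is_pdf_url(url):
            pdf_urls.append(url)
        elif is_notion_url(url):  # elif prevents double-booking PDF URLs
            notion_urls.append(url)
        else:
            web_urls.append(url)
    return pdf_urls, notion_urls, web_urls
```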
services/vectordb_manager/vectordb_manager.py CHANGED
@@ -21,8 +21,10 @@ sys.path.append("../..")
 from dotenv import load_dotenv, find_dotenv
 
 _ = load_dotenv(find_dotenv())  # read local .env file
-#openai.api_key = os.environ["OPENAI_API_KEY"]
+# openai.api_key = os.environ["OPENAI_API_KEY"]
 openai.api_key = st.secrets["OPENAI_API_KEY"]
+openai_key = st.secrets["OPENAI_API_KEY"]
+
 
 class VectordbManager:
     def __init__(
@@ -59,7 +61,7 @@ class VectordbManager:
         collection = client.get_collection(
             self.knowledge_base_name,
             embedding_function=embedding_functions.OpenAIEmbeddingFunction(
-                api_key=
+                api_key=openai_key
             ),
         )
         return collection
@@ -80,7 +82,7 @@ class VectordbManager:
         collection = client.get_or_create_collection(
             knowledge_base_name,
             embedding_function=embedding_functions.OpenAIEmbeddingFunction(
-                api_key=
+                api_key=openai_key
             ),
         )
 
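For reference, `OpenAIEmbeddingFunction` lives in `chromadb.utils.embedding_functions`, and the hunks above just standardize on passing the key read from `st.secrets`. A minimal standalone sketch of the same call shape (client type, collection name, and model are placeholders, not the app's values):

```python
import chromadb
from chromadb.utils import embedding_functions

# In-process client for the sketch; the app talks to a remote Chroma server.
client = chromadb.Client()
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key="sk-...",  # placeholder; the app reads this from st.secrets
    model_name="text-embedding-ada-002",
)
collection = client.get_or_create_collection(
    "demo_kb", embedding_function=openai_ef
)
```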
utils.py CHANGED
@@ -6,9 +6,10 @@ import os
 import streamlit as st
 
 load_dotenv()
-#openai_key = os.getenv("OPENAI_API_KEY")
+# openai_key = os.getenv("OPENAI_API_KEY")
 openai_key = st.secrets["OPENAI_API_KEY"]
 
+
 def get_chroma_client(
     host: str = "chroma.brianknows.org",
     port: int = 443,