marcellopoliti committed · Commit c5a0a6e
Parent(s): f85c548

feat: add pdf docs
Files changed:
- .streamlit/secrets.toml +1 -0
- app.py +1 -0
- pages/manage_knowledge_box.py +24 -0
- requirements.txt +1 -0
- services/document_manager/document_loader.py +31 -4
- services/vectordb_manager/vectordb_manager.py +5 -3
- utils.py +2 -1
.streamlit/secrets.toml CHANGED
@@ -1 +1,2 @@
 password = "brianknowsai"
+OPENAI_API_KEY = "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"
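Streamlit loads this file automatically and exposes its keys through the `st.secrets` mapping, which is how the rest of this commit reads the value. A minimal sketch of that lookup (the environment-variable fallback here is illustrative, not part of the commit):

```python
import os

import streamlit as st

# st.secrets behaves like a dict over .streamlit/secrets.toml;
# the os.getenv fallback is a hypothetical convenience, not app code.
openai_key = st.secrets.get("OPENAI_API_KEY", os.getenv("OPENAI_API_KEY"))
```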
app.py CHANGED
@@ -5,6 +5,7 @@ from utils import get_chroma_client, get_embedding_function
 
 import hmac
 import streamlit as st
+import os
 
 __import__("pysqlite3")
 import sys
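The `__import__("pysqlite3")` / `import sys` lines in the surrounding context are the start of the common workaround for Chroma requiring a newer SQLite than some hosts ship. The full idiom usually looks like the sketch below; the final module swap is assumed from that convention, since it falls outside this hunk:

```python
# Common pysqlite3 shim: make the stdlib name "sqlite3" resolve to
# pysqlite3-binary, which bundles a modern SQLite for Chroma.
__import__("pysqlite3")
import sys

sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
```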
pages/manage_knowledge_box.py CHANGED
@@ -3,6 +3,8 @@ from retrieve_kb import get_current_knowledge_bases, get_knowledge_base_informat
 from generate_kb import add_links_to_knowledge_base
 from app import client, default_embedding_function
 import pandas as pd
+from tempfile import NamedTemporaryFile
+import os
 
 st.title("Get knowledge boxes")
 
@@ -56,6 +58,28 @@ if st.button("add url to collection"):
     res = add_links_to_knowledge_base(client=client, kb_name=collection_name, urls=urls)
     st.write(res)
 
+st.header("Add pdf to existing collection")
+uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
+if st.button("add pdf"):
+    # Create a temporary file
+    with NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+        # Write the uploaded PDF to the temporary file
+        tmp_file.write(uploaded_file.getvalue())
+        tmp_path = tmp_file.name
+    print("PATH: ", tmp_path)
+    urls = [tmp_path]
+    res = add_links_to_knowledge_base(
+        client=client, kb_name=collection_name, urls=urls
+    )
+    st.write(res)
+    # Clean up: delete the temporary file
+    os.remove(tmp_path)
+
+# if st.button("add pdf"):
+#     urls = [url_text]  # put in a list even if only one
+#     res = add_links_to_knowledge_base(client=client, kb_name=collection_name, urls=urls)
+#     st.write(res)
+
 
 st.header("Add csv to existing collection")
 uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
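The temp-file dance in this hunk exists because `st.file_uploader` returns an in-memory `UploadedFile`, while the loader behind `add_links_to_knowledge_base` expects a path it can open, so the bytes are spilled to disk first and the path is passed along. A standalone sketch of the same pattern (the helper name is illustrative, not from the repo):

```python
import os
from tempfile import NamedTemporaryFile


def persist_upload(uploaded_file, suffix: str = ".pdf") -> str:
    """Write an in-memory Streamlit upload to disk and return its path."""
    # delete=False keeps the file after the context manager closes it,
    # so a path-based loader can read it; the caller removes it later.
    with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(uploaded_file.getvalue())
        return tmp.name
```

One caveat visible in the diff: `uploaded_file.getvalue()` runs without a `None` check, so clicking "add pdf" before choosing a file will raise.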
requirements.txt CHANGED
@@ -10,6 +10,7 @@ streamlit>=1.27.2
 python-dotenv==1.0.0
 fastapi>=0.104.0
 uvicorn>=0.23.2
+pypdf
 #pypdf==3.16.4
 #python-multipart==0.0.6
 #matplotlib==3.8.3
services/document_manager/document_loader.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
 from langchain.document_loaders import WebBaseLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import requests
+from langchain_community.document_loaders import NotionDirectoryLoader
 
 
 class DocumentsLoader:
@@ -17,9 +18,10 @@ class DocumentsLoader:
 
     def is_notion_url(self, url):
         # Regular expressions to match Notion URLs
-        notion_regex = r"https://(www\.)?(notion\.so|notion\.site)/"
+        return "notion" in url
+        # notion_regex = r"https://(www\.)?(notion\.so|notion\.site)/"
         # Check if the URL matches the Notion regex
-        return re.match(notion_regex, url) is not None
+        # return re.match(notion_regex, url) is not None
 
     def is_pdf_url(self, url):
         # Define a list of common PDF file extensions
@@ -42,16 +44,24 @@ class DocumentsLoader:
         return False
 
     def load_docs(self, doc_urls: list) -> list:
-        web_urls, pdf_urls, docs = [], [], []
+        web_urls, pdf_urls, notion_urls, docs = [], [], [], []
         if isinstance(doc_urls[0], list):
             doc_urls = [doc[0] for doc in doc_urls]
             # doc_urls = doc_urls[0]
+
+        # split urls on pdf, web
+        print("docs urls: ", doc_urls)
         for url in doc_urls:
+            print("URL : ", url)
+            print(self.is_pdf_url(url))
             if self.is_pdf_url(url):
                 pdf_urls.append(url)
+            if self.is_notion_url(url):
+                notion_urls.append(url)
             else:
                 web_urls.append(url)
 
+        # load web urls
         if len(web_urls) > 0:
             web_urls = [url for url in web_urls if self.is_valid_url(url)]
             for web_url in web_urls:
@@ -62,8 +72,11 @@ class DocumentsLoader:
             except Exception as e:
                 print(f"Error web loader, {web_url}: {str(e)}")
 
+        # load pdf urls
         if len(pdf_urls) > 0:
-            pdf_urls = [url for url in pdf_urls if self.is_valid_url(url)]
+            # print("n urls", pdf_urls)
+            # pdf_urls = [url for url in pdf_urls if self.is_valid_url(url)]
+            # print("n urls", pdf_urls)
             for pdf_url in pdf_urls:
                 try:
                     pdf_loader = PyPDFLoader(pdf_url)
@@ -71,6 +84,20 @@ class DocumentsLoader:
                     docs = docs + pdf_docs
                 except Exception as e:
                     print(f"Error pdf loader, {pdf_url}: {str(e)}")
+
+        # notion loader: not working
+        # if len(notion_urls) > 0:
+        #     print("ADDING NOTION URLS")
+        #     notion_urls = [url for url in notion_urls if self.is_notion_url(url)]
+        #     for notion_url in notion_urls:
+        #         print(notion_url)
+        #         try:
+        #             notion_loader = NotionDirectoryLoader(notion_url)
+        #             notion_docs = notion_loader.load()
+        #             print("Notion docs ", notion_docs)
+        #             docs = notion_docs + docs
+        #         except Exception as e:
+        #             print(f"Error notion loader, {notion_url}: {str(e)}")
         return docs
 
     def split_docs(self, docs, chunk_size=2000):
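One behavioral note on the new routing in `load_docs`: `if self.is_pdf_url(url)` is followed by an independent `if self.is_notion_url(url)` / `else`, so a PDF URL that is not a Notion URL also falls through to the `else` branch and is appended to `web_urls`. An `elif` chain keeps the buckets disjoint; a sketch of that fix (not what this commit ships):

```python
def route_urls(urls, is_pdf_url, is_notion_url):
    """Split URLs into disjoint pdf/notion/web buckets."""
    pdf_urls, notion_urls, web_urls = [], [], []
    for url in urls:
        if is_pdf_url(url):
            pdf_urls.append(url)
        elif is_notion_url(url):  # elif prevents double-booking PDF URLs
            notion_urls.append(url)
        else:
            web_urls.append(url)
    return pdf_urls, notion_urls, web_urls
```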
services/vectordb_manager/vectordb_manager.py CHANGED
@@ -21,8 +21,10 @@ sys.path.append("../..")
 from dotenv import load_dotenv, find_dotenv
 
 _ = load_dotenv(find_dotenv())  # read local .env file
-#openai.api_key = os.environ["OPENAI_API_KEY"]
+# openai.api_key = os.environ["OPENAI_API_KEY"]
 openai.api_key = st.secrets["OPENAI_API_KEY"]
+openai_key = st.secrets["OPENAI_API_KEY"]
+
 
 class VectordbManager:
     def __init__(
@@ -59,7 +61,7 @@ class VectordbManager:
         collection = client.get_collection(
             self.knowledge_base_name,
             embedding_function=embedding_functions.OpenAIEmbeddingFunction(
-                api_key=
+                api_key=openai_key
             ),
         )
         return collection
@@ -80,7 +82,7 @@ class VectordbManager:
         collection = client.get_or_create_collection(
             knowledge_base_name,
             embedding_function=embedding_functions.OpenAIEmbeddingFunction(
-                api_key=
+                api_key=openai_key
             ),
         )
 
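For reference, `OpenAIEmbeddingFunction` lives in `chromadb.utils.embedding_functions`, and the hunks above just standardize on passing the key read from `st.secrets`. A minimal standalone sketch of the same call shape (client type, collection name, and model are placeholders, not the app's values):

```python
import chromadb
from chromadb.utils import embedding_functions

# In-process client for the sketch; the app talks to a remote Chroma server.
client = chromadb.Client()
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key="sk-...",  # placeholder; the app reads this from st.secrets
    model_name="text-embedding-ada-002",
)
collection = client.get_or_create_collection(
    "demo_kb", embedding_function=openai_ef
)
```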
utils.py CHANGED
@@ -6,9 +6,10 @@ import os
 import streamlit as st
 
 load_dotenv()
-#openai_key = os.getenv("OPENAI_API_KEY")
+# openai_key = os.getenv("OPENAI_API_KEY")
 openai_key = st.secrets["OPENAI_API_KEY"]
 
+
 def get_chroma_client(
     host: str = "chroma.brianknows.org",
     port: int = 443,