marcellopoliti committed
Commit c5a0a6e
1 Parent(s): f85c548

feat: add pdf docs

.streamlit/secrets.toml CHANGED
@@ -1 +1,2 @@
 password = "brianknowsai"
+OPENAI_API_KEY = "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"
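The value added here is what the rest of this commit reads through Streamlit's secrets API (see utils.py and services/vectordb_manager/vectordb_manager.py below); a minimal sketch of that lookup:

    import streamlit as st

    # .streamlit/secrets.toml is exposed at runtime as a read-only mapping
    openai_key = st.secrets["OPENAI_API_KEY"]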
app.py CHANGED
@@ -5,6 +5,7 @@ from utils import get_chroma_client, get_embedding_function
 
 import hmac
 import streamlit as st
+import os
 
 __import__("pysqlite3")
 import sys
pages/manage_knowledge_box.py CHANGED
@@ -3,6 +3,8 @@ from retrieve_kb import get_current_knowledge_bases, get_knowledge_base_informat
 from generate_kb import add_links_to_knowledge_base
 from app import client, default_embedding_function
 import pandas as pd
+from tempfile import NamedTemporaryFile
+import os
 
 st.title("Get knowledge boxes")
 
@@ -56,6 +58,28 @@ if st.button("add url to collection"):
     res = add_links_to_knowledge_base(client=client, kb_name=collection_name, urls=urls)
     st.write(res)
 
+st.header("Add pdf to existing collection")
+uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
+if st.button("add pdf"):
+    # Create a temporary file
+    with NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+        # Write the uploaded PDF to the temporary file
+        tmp_file.write(uploaded_file.getvalue())
+        tmp_path = tmp_file.name
+    print("PATH: ", tmp_path)
+    urls = [tmp_path]
+    res = add_links_to_knowledge_base(
+        client=client, kb_name=collection_name, urls=urls
+    )
+    st.write(res)
+    # Clean up: delete the temporary file
+    os.remove(tmp_path)
+
+# if st.button("add pdf"):
+#     urls = [url_text]  # put in a list even if only one
+#     res = add_links_to_knowledge_base(client=client, kb_name=collection_name, urls=urls)
+#     st.write(res)
+
 
 st.header("Add csv to existing collection")
 uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
requirements.txt CHANGED
@@ -10,6 +10,7 @@ streamlit>=1.27.2
 python-dotenv==1.0.0
 fastapi>=0.104.0
 uvicorn>=0.23.2
+pypdf
 #pypdf==3.16.4
 #python-multipart==0.0.6
 #matplotlib==3.8.3
services/document_manager/document_loader.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
 from langchain.document_loaders import WebBaseLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import requests
+from langchain_community.document_loaders import NotionDirectoryLoader
 
 
 class DocumentsLoader:
@@ -17,9 +18,10 @@ class DocumentsLoader:
 
     def is_notion_url(self, url):
         # Regular expressions to match Notion URLs
-        notion_regex = r"https://(www\.)?(notion\.so|notion\.site)/"
+        return "notion" in url
+        # notion_regex = r"https://(www\.)?(notion\.so|notion\.site)/"
         # Check if the URL matches the Notion regex
-        return re.match(notion_regex, url) is not None
+        # return re.match(notion_regex, url) is not None
 
     def is_pdf_url(self, url):
         # Define a list of common PDF file extensions
@@ -42,16 +44,24 @@ class DocumentsLoader:
         return False
 
     def load_docs(self, doc_urls: list) -> list:
-        web_urls, pdf_urls, docs = [], [], []
+        web_urls, pdf_urls, notion_urls, docs = [], [], [], []
         if isinstance(doc_urls[0], list):
             doc_urls = [doc[0] for doc in doc_urls]
             # doc_urls = doc_urls[0]
+
+        # split urls on pdf,web,
+        print("docs urls: ", doc_urls)
         for url in doc_urls:
+            print("URL : ", url)
+            print(self.is_pdf_url(url))
             if self.is_pdf_url(url):
                 pdf_urls.append(url)
+            if self.is_notion_url(url):
+                notion_urls.append(url)
             else:
                 web_urls.append(url)
 
+        # load web urls
         if len(web_urls) > 0:
            web_urls = [url for url in web_urls if self.is_valid_url(url)]
            for web_url in web_urls:
@@ -62,8 +72,11 @@
                 except Exception as e:
                     print(f"Error web loader, {web_url}: {str(e)}")
 
+        # load pdf urls
         if len(pdf_urls) > 0:
-            pdf_urls = [url for url in pdf_urls if self.is_valid_url(url)]
+            # print("n urls", pdf_urls)
+            # pdf_urls = [url for url in pdf_urls if self.is_valid_url(url)]
+            # print("n urls", pdf_urls)
             for pdf_url in pdf_urls:
                 try:
                     pdf_loader = PyPDFLoader(pdf_url)
@@ -71,6 +84,20 @@
                     docs = docs + pdf_docs
                 except Exception as e:
                     print(f"Error pdf loader, {pdf_url}: {str(e)}")
+
+        # notion loader: not working
+        # if len(notion_urls) > 0:
+        #     print("ADDING NOTION URLS")
+        #     notion_urls = [url for url in notion_urls if self.is_notion_url(url)]
+        #     for notion_url in notion_urls:
+        #         print(notion_url)
+        #         try:
+        #             notion_loader = NotionDirectoryLoader(notion_url)
+        #             notion_docs = notion_loader.load()
+        #             print("Notion docs ", notion_docs)
+        #             docs = notion_docs + docs
+        #         except Exception as e:
+        #             print(f"Error notion loader, {notion_url}: {str(e)}")
         return docs
 
     def split_docs(self, docs, chunk_size=2000):
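For reference, a minimal standalone sketch of the PDF branch above, assuming langchain and pypdf are installed as listed in requirements.txt; the path is a placeholder (for example, the temporary file written by pages/manage_knowledge_box.py):

    from langchain.document_loaders import PyPDFLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    pdf_path = "/tmp/example.pdf"  # placeholder: local PDF path or PDF URL

    pdf_loader = PyPDFLoader(pdf_path)  # accepts local paths as well as URLs
    pdf_docs = pdf_loader.load()        # one Document per page
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
    chunks = splitter.split_documents(pdf_docs)
    print(len(pdf_docs), "pages ->", len(chunks), "chunks")

Temporary-file paths are not valid URLs, which is presumably why the is_valid_url filter on pdf_urls is commented out in the hunk above.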
services/vectordb_manager/vectordb_manager.py CHANGED
@@ -21,8 +21,10 @@ sys.path.append("../..")
 from dotenv import load_dotenv, find_dotenv
 
 _ = load_dotenv(find_dotenv()) # read local .env file
-#openai.api_key = os.environ["OPENAI_API_KEY"]
+# openai.api_key = os.environ["OPENAI_API_KEY"]
 openai.api_key = st.secrets["OPENAI_API_KEY"]
+openai_key = st.secrets["OPENAI_API_KEY"]
+
 
 class VectordbManager:
     def __init__(
@@ -59,7 +61,7 @@
         collection = client.get_collection(
             self.knowledge_base_name,
             embedding_function=embedding_functions.OpenAIEmbeddingFunction(
-                api_key=os.environ["OPENAI_API_KEY"]
+                api_key=openai_key
             ),
         )
         return collection
@@ -80,7 +82,7 @@
         collection = client.get_or_create_collection(
             knowledge_base_name,
             embedding_function=embedding_functions.OpenAIEmbeddingFunction(
-                api_key=os.environ["OPENAI_API_KEY"]
+                api_key=openai_key
             ),
         )
 
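A minimal sketch of the configuration these hunks converge on: read the key from Streamlit secrets and pass it to Chroma's OpenAI embedding function. Host and port follow the defaults in utils.py; the collection name is hypothetical and ssl=True is an assumption based on the port:

    import chromadb
    import streamlit as st
    from chromadb.utils import embedding_functions

    openai_key = st.secrets["OPENAI_API_KEY"]

    # Host/port mirror get_chroma_client() in utils.py; ssl=True is assumed
    # because the server is exposed on port 443.
    client = chromadb.HttpClient(host="chroma.brianknows.org", port=443, ssl=True)
    collection = client.get_or_create_collection(
        "example_kb",  # hypothetical collection name
        embedding_function=embedding_functions.OpenAIEmbeddingFunction(api_key=openai_key),
    )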
 
utils.py CHANGED
@@ -6,9 +6,10 @@ import os
 import streamlit as st
 
 load_dotenv()
-#openai_key = os.getenv("OPENAI_API_KEY")
+# openai_key = os.getenv("OPENAI_API_KEY")
 openai_key = st.secrets["OPENAI_API_KEY"]
 
+
 def get_chroma_client(
     host: str = "chroma.brianknows.org",
     port: int = 443,