Spaces:
Running
Running
marcellopoliti
committed on
Commit
•
b6bc4e2
1
Parent(s):
f0ba710
bug fix video upload
Browse files- .streamlit/secrets.toml +2 -1
- generate_kb.py +7 -0
- pages/manage_knowledge_box.py +39 -24
- services/document_manager/document_loader.py +23 -21
- test_marcello.csv +0 -3
.streamlit/secrets.toml
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
password = "brianknowsai"
|
2 |
-
OPENAI_API_KEY = "sk-
|
|
|
|
1 |
password = "brianknowsai"
|
2 |
+
OPENAI_API_KEY = "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"
|
3 |
+
BRIAN_API_KEY="brian_Hun5m3s59XSvopywo"
|
generate_kb.py
CHANGED
@@ -83,6 +83,8 @@ def add_links_to_knowledge_base(
|
|
83 |
urls: list,
|
84 |
chunk_size: int = 2_000,
|
85 |
pdf_optional_link=None,
|
|
|
|
|
86 |
pdf_title=None,
|
87 |
embedding_fct=default_embedding_function,
|
88 |
):
|
@@ -95,6 +97,11 @@ def add_links_to_knowledge_base(
|
|
95 |
for md in metadatas:
|
96 |
md["source"] = pdf_optional_link
|
97 |
md["title"] = pdf_title
|
|
|
|
|
|
|
|
|
|
|
98 |
cleaned_contents = [
|
99 |
re.sub(r"\n+", " ", content) for content in contents
|
100 |
] # clean text a bit
|
|
|
83 |
urls: list,
|
84 |
chunk_size: int = 2_000,
|
85 |
pdf_optional_link=None,
|
86 |
+
youtube_optional_link=None,
|
87 |
+
video_title=None,
|
88 |
pdf_title=None,
|
89 |
embedding_fct=default_embedding_function,
|
90 |
):
|
|
|
97 |
for md in metadatas:
|
98 |
md["source"] = pdf_optional_link
|
99 |
md["title"] = pdf_title
|
100 |
+
|
101 |
+
if youtube_optional_link and video_title:
|
102 |
+
for md in metadatas:
|
103 |
+
md["source"] = youtube_optional_link
|
104 |
+
md["title"] = video_title
|
105 |
cleaned_contents = [
|
106 |
re.sub(r"\n+", " ", content) for content in contents
|
107 |
] # clean text a bit
|
pages/manage_knowledge_box.py
CHANGED
@@ -16,7 +16,6 @@ open_ai_key = "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"
|
|
16 |
|
17 |
st.title("Get knowledge boxes")
|
18 |
|
19 |
-
|
20 |
if st.button("Get current knowledge bases"):
|
21 |
kbs = get_current_knowledge_bases(client=client)
|
22 |
st.json(kbs)
|
@@ -53,7 +52,9 @@ if len(st.session_state["df"]) != 0:
|
|
53 |
st.text(f"unique urls: {len(unique_df)}")
|
54 |
st.dataframe(unique_df)
|
55 |
|
56 |
-
|
|
|
|
|
57 |
st.header("Remove a split")
|
58 |
id = st.text_input("Insert a split id")
|
59 |
if st.button("Remove Id from collection"):
|
@@ -64,6 +65,9 @@ if st.button("Remove Id from collection"):
|
|
64 |
st.error(f"id {id} not in kb")
|
65 |
|
66 |
|
|
|
|
|
|
|
67 |
st.header("Remove url from collection")
|
68 |
url = st.text_input("remove url")
|
69 |
if st.button("Remove url from collection"):
|
@@ -75,6 +79,9 @@ if st.button("Remove url from collection"):
|
|
75 |
st.error(str(e))
|
76 |
|
77 |
|
|
|
|
|
|
|
78 |
st.header("Add url to existing collection")
|
79 |
url_text = st.text_input("Insert a url link")
|
80 |
if st.button("add url to collection"):
|
@@ -107,7 +114,9 @@ if st.button("add pdf"):
|
|
107 |
# Clean up: delete the temporary file
|
108 |
os.remove(tmp_path)
|
109 |
|
110 |
-
|
|
|
|
|
111 |
st.header("Add csv to existing collection")
|
112 |
uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
|
113 |
df = None
|
@@ -131,6 +140,8 @@ if uploaded_file is not None:
|
|
131 |
#############################
|
132 |
########## YOUTUBE ##########
|
133 |
#############################
|
|
|
|
|
134 |
def transcribe_audio(audio_path, chunk_length=10000):
|
135 |
"""
|
136 |
Transcribe audio by breaking it into chunks using wave and numpy.
|
@@ -210,31 +221,35 @@ def download_and_transcribe_youtube(youtube_url):
|
|
210 |
|
211 |
# audio_file = open("video.wav", "rb")
|
212 |
text = transcribe_audio("video.wav")
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
|
219 |
|
220 |
st.header("Add youtube video to collection")
|
221 |
-
|
|
|
|
|
|
|
222 |
|
|
|
|
|
223 |
if st.button("Add video"):
|
224 |
# Create a temporary file
|
225 |
# Write the uploaded PDF to the temporary file
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
# urls = [tmp_path]
|
232 |
-
# res = add_links_to_knowledge_base(
|
233 |
-
# client=client,
|
234 |
-
# kb_name=collection_name,
|
235 |
-
# urls=urls,
|
236 |
-
# pdf_optional_link=pdf_optional_link,
|
237 |
-
# pdf_title=pdf_title,
|
238 |
-
# )
|
239 |
-
# st.write(res)
|
240 |
-
# Clean up: delete the temporary file
|
|
|
16 |
|
17 |
st.title("Get knowledge boxes")
|
18 |
|
|
|
19 |
if st.button("Get current knowledge bases"):
|
20 |
kbs = get_current_knowledge_bases(client=client)
|
21 |
st.json(kbs)
|
|
|
52 |
st.text(f"unique urls: {len(unique_df)}")
|
53 |
st.dataframe(unique_df)
|
54 |
|
55 |
+
#############################
|
56 |
+
#### REMOVE A SPLIT #########
|
57 |
+
#############################
|
58 |
st.header("Remove a split")
|
59 |
id = st.text_input("Insert a split id")
|
60 |
if st.button("Remove Id from collection"):
|
|
|
65 |
st.error(f"id {id} not in kb")
|
66 |
|
67 |
|
68 |
+
#############################
|
69 |
+
#### REMOVE URL ############
|
70 |
+
#############################
|
71 |
st.header("Remove url from collection")
|
72 |
url = st.text_input("remove url")
|
73 |
if st.button("Remove url from collection"):
|
|
|
79 |
st.error(str(e))
|
80 |
|
81 |
|
82 |
+
#############################
|
83 |
+
########### ADD URL #########
|
84 |
+
#############################
|
85 |
st.header("Add url to existing collection")
|
86 |
url_text = st.text_input("Insert a url link")
|
87 |
if st.button("add url to collection"):
|
|
|
114 |
# Clean up: delete the temporary file
|
115 |
os.remove(tmp_path)
|
116 |
|
117 |
+
#############################
|
118 |
+
########### ADD CSV #########
|
119 |
+
#############################
|
120 |
st.header("Add csv to existing collection")
|
121 |
uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
|
122 |
df = None
|
|
|
140 |
#############################
|
141 |
########## YOUTUBE ##########
|
142 |
#############################
|
143 |
+
|
144 |
+
|
145 |
def transcribe_audio(audio_path, chunk_length=10000):
|
146 |
"""
|
147 |
Transcribe audio by breaking it into chunks using wave and numpy.
|
|
|
221 |
|
222 |
# audio_file = open("video.wav", "rb")
|
223 |
text = transcribe_audio("video.wav")
|
224 |
+
f_out_path = f"{video_title}.txt"
|
225 |
+
with open(f"{video_title}.txt", "w") as f_out:
|
226 |
+
f_out.write(text)
|
227 |
+
urls = [f_out_path]
|
228 |
+
add_links_to_knowledge_base(
|
229 |
+
client=client,
|
230 |
+
kb_name=collection_name,
|
231 |
+
urls=urls,
|
232 |
+
youtube_optional_link=youtube_url,
|
233 |
+
video_title=video_title,
|
234 |
+
)
|
235 |
+
os.remove(f"{video_title}.txt")
|
236 |
+
os.remove("video.wav")
|
237 |
+
os.remove("temp_chunk.wav")
|
238 |
|
239 |
|
240 |
st.header("Add youtube video to collection")
|
241 |
+
st.image(
|
242 |
+
"",
|
243 |
+
width=200, # Manually Adjust the width of the image as per requirement
|
244 |
+
)
|
245 |
|
246 |
+
video_url = st.text_input("Youtube video url")
|
247 |
+
st.text("Aggiungere il video puo impiegare un bel pò. Avvia e vatti a fare una canna")
|
248 |
if st.button("Add video"):
|
249 |
# Create a temporary file
|
250 |
# Write the uploaded PDF to the temporary file
|
251 |
+
try:
|
252 |
+
download_and_transcribe_youtube(video_url)
|
253 |
+
st.success("Video Added")
|
254 |
+
except Exception as e:
|
255 |
+
st.error(f"{str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
services/document_manager/document_loader.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from langchain.document_loaders import PyPDFLoader
|
2 |
import pandas as pd
|
3 |
from langchain.document_loaders import WebBaseLoader
|
4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
@@ -18,9 +18,6 @@ class DocumentsLoader:
|
|
18 |
def is_notion_url(self, url):
|
19 |
# Regular expressions to match Notion URLs
|
20 |
return "notion" in url
|
21 |
-
# notion_regex = r"https://(www\.)?(notion\.so|notion\.site)/"
|
22 |
-
# Check if the URL matches the Notion regex
|
23 |
-
# return re.match(notion_regex, url) is not None
|
24 |
|
25 |
def is_pdf_url(self, url):
|
26 |
# Define a list of common PDF file extensions
|
@@ -32,6 +29,16 @@ class DocumentsLoader:
|
|
32 |
return True
|
33 |
return False
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
def is_valid_url(self, url):
|
36 |
# TODO: handle status codes not 200
|
37 |
try:
|
@@ -42,7 +49,7 @@ class DocumentsLoader:
|
|
42 |
return False
|
43 |
|
44 |
def load_docs(self, doc_urls: list) -> list:
|
45 |
-
web_urls, pdf_urls, notion_urls, docs = [], [], [], []
|
46 |
if isinstance(doc_urls[0], list):
|
47 |
doc_urls = [doc[0] for doc in doc_urls]
|
48 |
# doc_urls = doc_urls[0]
|
@@ -56,6 +63,8 @@ class DocumentsLoader:
|
|
56 |
pdf_urls.append(url)
|
57 |
if self.is_notion_url(url):
|
58 |
notion_urls.append(url)
|
|
|
|
|
59 |
else:
|
60 |
web_urls.append(url)
|
61 |
|
@@ -72,9 +81,6 @@ class DocumentsLoader:
|
|
72 |
|
73 |
# load pdf urls
|
74 |
if len(pdf_urls) > 0:
|
75 |
-
# print("n urls", pdf_urls)
|
76 |
-
# pdf_urls = [url for url in pdf_urls if self.is_valid_url(url)]
|
77 |
-
# print("n urls", pdf_urls)
|
78 |
for pdf_url in pdf_urls:
|
79 |
try:
|
80 |
pdf_loader = PyPDFLoader(pdf_url)
|
@@ -83,19 +89,15 @@ class DocumentsLoader:
|
|
83 |
except Exception as e:
|
84 |
print(f"Error pdf loader, {pdf_url}: {str(e)}")
|
85 |
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
# print("Notion docs ", notion_docs)
|
96 |
-
# docs = notion_docs + docs
|
97 |
-
# except Exception as e:
|
98 |
-
# print(f"Error notion loader, {notion_url}: {str(e)}")
|
99 |
return docs
|
100 |
|
101 |
def split_docs(self, docs, chunk_size=2000):
|
|
|
1 |
+
from langchain.document_loaders import PyPDFLoader, TextLoader
|
2 |
import pandas as pd
|
3 |
from langchain.document_loaders import WebBaseLoader
|
4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
18 |
def is_notion_url(self, url):
|
19 |
# Regular expressions to match Notion URLs
|
20 |
return "notion" in url
|
|
|
|
|
|
|
21 |
|
22 |
def is_pdf_url(self, url):
|
23 |
# Define a list of common PDF file extensions
|
|
|
29 |
return True
|
30 |
return False
|
31 |
|
32 |
+
def is_txt_url(self, url):
|
33 |
+
# Define the list of plain-text file extensions
|
34 |
+
pdf_extensions = [".txt"]
|
35 |
+
|
36 |
+
# Check if the URL ends with a text file extension
|
37 |
+
for extension in pdf_extensions:
|
38 |
+
if url.endswith(extension):
|
39 |
+
return True
|
40 |
+
return False
|
41 |
+
|
42 |
def is_valid_url(self, url):
|
43 |
# TODO: handle status codes not 200
|
44 |
try:
|
|
|
49 |
return False
|
50 |
|
51 |
def load_docs(self, doc_urls: list) -> list:
|
52 |
+
web_urls, pdf_urls, notion_urls, text_urls, docs = [], [], [], [], []
|
53 |
if isinstance(doc_urls[0], list):
|
54 |
doc_urls = [doc[0] for doc in doc_urls]
|
55 |
# doc_urls = doc_urls[0]
|
|
|
63 |
pdf_urls.append(url)
|
64 |
if self.is_notion_url(url):
|
65 |
notion_urls.append(url)
|
66 |
+
if self.is_txt_url(url):
|
67 |
+
text_urls.append(url)
|
68 |
else:
|
69 |
web_urls.append(url)
|
70 |
|
|
|
81 |
|
82 |
# load pdf urls
|
83 |
if len(pdf_urls) > 0:
|
|
|
|
|
|
|
84 |
for pdf_url in pdf_urls:
|
85 |
try:
|
86 |
pdf_loader = PyPDFLoader(pdf_url)
|
|
|
89 |
except Exception as e:
|
90 |
print(f"Error pdf loader, {pdf_url}: {str(e)}")
|
91 |
|
92 |
+
if len(text_urls) > 0:
|
93 |
+
for txt_url in text_urls:
|
94 |
+
try:
|
95 |
+
txt_loader = TextLoader(txt_url)
|
96 |
+
txt_docs = txt_loader.load()
|
97 |
+
docs = docs + txt_docs
|
98 |
+
except Exception as e:
|
99 |
+
print(f"Error pdf loader, {txt_url}: {str(e)}")
|
100 |
+
|
|
|
|
|
|
|
|
|
101 |
return docs
|
102 |
|
103 |
def split_docs(self, docs, chunk_size=2000):
|
test_marcello.csv
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
url
|
2 |
-
https://en.wikipedia.org/wiki/Dragon_Ball
|
3 |
-
https://en.wikipedia.org/wiki/Naruto
|
|
|
|
|
|
|
|