marcellopoliti committed on
Commit
b6bc4e2
1 Parent(s): f0ba710

bug fix video upload

Browse files
.streamlit/secrets.toml CHANGED
@@ -1,2 +1,3 @@
1
  password = "brianknowsai"
2
- OPENAI_API_KEY = "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"
 
 
1
  password = "brianknowsai"
2
+ OPENAI_API_KEY = "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"
3
+ BRIAN_API_KEY="brian_Hun5m3s59XSvopywo"
generate_kb.py CHANGED
@@ -83,6 +83,8 @@ def add_links_to_knowledge_base(
83
  urls: list,
84
  chunk_size: int = 2_000,
85
  pdf_optional_link=None,
 
 
86
  pdf_title=None,
87
  embedding_fct=default_embedding_function,
88
  ):
@@ -95,6 +97,11 @@ def add_links_to_knowledge_base(
95
  for md in metadatas:
96
  md["source"] = pdf_optional_link
97
  md["title"] = pdf_title
 
 
 
 
 
98
  cleaned_contents = [
99
  re.sub(r"\n+", " ", content) for content in contents
100
  ] # clean text a bit
 
83
  urls: list,
84
  chunk_size: int = 2_000,
85
  pdf_optional_link=None,
86
+ youtube_optional_link=None,
87
+ video_title=None,
88
  pdf_title=None,
89
  embedding_fct=default_embedding_function,
90
  ):
 
97
  for md in metadatas:
98
  md["source"] = pdf_optional_link
99
  md["title"] = pdf_title
100
+
101
+ if youtube_optional_link and video_title:
102
+ for md in metadatas:
103
+ md["source"] = youtube_optional_link
104
+ md["title"] = video_title
105
  cleaned_contents = [
106
  re.sub(r"\n+", " ", content) for content in contents
107
  ] # clean text a bit
pages/manage_knowledge_box.py CHANGED
@@ -16,7 +16,6 @@ open_ai_key = "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"
16
 
17
  st.title("Get knowledge boxes")
18
 
19
-
20
  if st.button("Get current knowledge bases"):
21
  kbs = get_current_knowledge_bases(client=client)
22
  st.json(kbs)
@@ -53,7 +52,9 @@ if len(st.session_state["df"]) != 0:
53
  st.text(f"unique urls: {len(unique_df)}")
54
  st.dataframe(unique_df)
55
 
56
-
 
 
57
  st.header("Remove a split")
58
  id = st.text_input("Insert a split id")
59
  if st.button("Remove Id from collection"):
@@ -64,6 +65,9 @@ if st.button("Remove Id from collection"):
64
  st.error(f"id {id} not in kb")
65
 
66
 
 
 
 
67
  st.header("Remove url from collection")
68
  url = st.text_input("remove url")
69
  if st.button("Remove url from collection"):
@@ -75,6 +79,9 @@ if st.button("Remove url from collection"):
75
  st.error(str(e))
76
 
77
 
 
 
 
78
  st.header("Add url to existing collection")
79
  url_text = st.text_input("Insert a url link")
80
  if st.button("add url to collection"):
@@ -107,7 +114,9 @@ if st.button("add pdf"):
107
  # Clean up: delete the temporary file
108
  os.remove(tmp_path)
109
 
110
-
 
 
111
  st.header("Add csv to existing collection")
112
  uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
113
  df = None
@@ -131,6 +140,8 @@ if uploaded_file is not None:
131
  #############################
132
  ########## YOUTUBE ##########
133
  #############################
 
 
134
  def transcribe_audio(audio_path, chunk_length=10000):
135
  """
136
  Transcribe audio by breaking it into chunks using wave and numpy.
@@ -210,31 +221,35 @@ def download_and_transcribe_youtube(youtube_url):
210
 
211
  # audio_file = open("video.wav", "rb")
212
  text = transcribe_audio("video.wav")
213
- st.write(text)
214
- # save text
215
- # out_path = os.path.join("../data/files", video_title + ".txt")
216
- # with open(out_path, "w+") as f_out:
217
- # f_out.write(text["text"])
 
 
 
 
 
 
 
 
 
218
 
219
 
220
  st.header("Add youtube video to collection")
221
- video_url = st.text_input("Youtube video url")
 
 
 
222
 
 
 
223
  if st.button("Add video"):
224
  # Create a temporary file
225
  # Write the uploaded PDF to the temporary file
226
- download_and_transcribe_youtube(video_url)
227
-
228
- # tmp_file.write(uploaded_file.getvalue())
229
- # tmp_path = tmp_file.name
230
- # print("PATH: ", tmp_path)
231
- # urls = [tmp_path]
232
- # res = add_links_to_knowledge_base(
233
- # client=client,
234
- # kb_name=collection_name,
235
- # urls=urls,
236
- # pdf_optional_link=pdf_optional_link,
237
- # pdf_title=pdf_title,
238
- # )
239
- # st.write(res)
240
- # Clean up: delete the temporary file
 
16
 
17
  st.title("Get knowledge boxes")
18
 
 
19
  if st.button("Get current knowledge bases"):
20
  kbs = get_current_knowledge_bases(client=client)
21
  st.json(kbs)
 
52
  st.text(f"unique urls: {len(unique_df)}")
53
  st.dataframe(unique_df)
54
 
55
+ #############################
56
+ #### REMOVE A SPLIT #########
57
+ #############################
58
  st.header("Remove a split")
59
  id = st.text_input("Insert a split id")
60
  if st.button("Remove Id from collection"):
 
65
  st.error(f"id {id} not in kb")
66
 
67
 
68
+ #############################
69
+ #### REMOVE URL ############
70
+ #############################
71
  st.header("Remove url from collection")
72
  url = st.text_input("remove url")
73
  if st.button("Remove url from collection"):
 
79
  st.error(str(e))
80
 
81
 
82
+ #############################
83
+ ########### ADD URL #########
84
+ #############################
85
  st.header("Add url to existing collection")
86
  url_text = st.text_input("Insert a url link")
87
  if st.button("add url to collection"):
 
114
  # Clean up: delete the temporary file
115
  os.remove(tmp_path)
116
 
117
+ #############################
118
+ ########### ADD CSV #########
119
+ #############################
120
  st.header("Add csv to existing collection")
121
  uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
122
  df = None
 
140
  #############################
141
  ########## YOUTUBE ##########
142
  #############################
143
+
144
+
145
  def transcribe_audio(audio_path, chunk_length=10000):
146
  """
147
  Transcribe audio by breaking it into chunks using wave and numpy.
 
221
 
222
  # audio_file = open("video.wav", "rb")
223
  text = transcribe_audio("video.wav")
224
+ f_out_path = f"{video_title}.txt"
225
+ with open(f"{video_title}.txt", "w") as f_out:
226
+ f_out.write(text)
227
+ urls = [f_out_path]
228
+ add_links_to_knowledge_base(
229
+ client=client,
230
+ kb_name=collection_name,
231
+ urls=urls,
232
+ youtube_optional_link=youtube_url,
233
+ video_title=video_title,
234
+ )
235
+ os.remove(f"{video_title}.txt")
236
+ os.remove("video.wav")
237
+ os.remove("temp_chunk.wav")
238
 
239
 
240
  st.header("Add youtube video to collection")
241
+ st.image(
242
+ "",
243
+ width=200, # Manually Adjust the width of the image as per requirement
244
+ )
245
 
246
+ video_url = st.text_input("Youtube video url")
247
+ st.text("Aggiungere il video puo impiegare un bel pò. Avvia e vatti a fare una canna")
248
  if st.button("Add video"):
249
  # Create a temporary file
250
  # Write the uploaded PDF to the temporary file
251
+ try:
252
+ download_and_transcribe_youtube(video_url)
253
+ st.success("Video Added")
254
+ except Exception as e:
255
+ st.error(f"{str(e)}")
 
 
 
 
 
 
 
 
 
 
services/document_manager/document_loader.py CHANGED
@@ -1,4 +1,4 @@
1
- from langchain.document_loaders import PyPDFLoader
2
  import pandas as pd
3
  from langchain.document_loaders import WebBaseLoader
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -18,9 +18,6 @@ class DocumentsLoader:
18
  def is_notion_url(self, url):
19
  # Regular expressions to match Notion URLs
20
  return "notion" in url
21
- # notion_regex = r"https://(www\.)?(notion\.so|notion\.site)/"
22
- # Check if the URL matches the Notion regex
23
- # return re.match(notion_regex, url) is not None
24
 
25
  def is_pdf_url(self, url):
26
  # Define a list of common PDF file extensions
@@ -32,6 +29,16 @@ class DocumentsLoader:
32
  return True
33
  return False
34
 
 
 
 
 
 
 
 
 
 
 
35
  def is_valid_url(self, url):
36
  # TODO: handle status codes not 200
37
  try:
@@ -42,7 +49,7 @@ class DocumentsLoader:
42
  return False
43
 
44
  def load_docs(self, doc_urls: list) -> list:
45
- web_urls, pdf_urls, notion_urls, docs = [], [], [], []
46
  if isinstance(doc_urls[0], list):
47
  doc_urls = [doc[0] for doc in doc_urls]
48
  # doc_urls = doc_urls[0]
@@ -56,6 +63,8 @@ class DocumentsLoader:
56
  pdf_urls.append(url)
57
  if self.is_notion_url(url):
58
  notion_urls.append(url)
 
 
59
  else:
60
  web_urls.append(url)
61
 
@@ -72,9 +81,6 @@ class DocumentsLoader:
72
 
73
  # load pdf urls
74
  if len(pdf_urls) > 0:
75
- # print("n urls", pdf_urls)
76
- # pdf_urls = [url for url in pdf_urls if self.is_valid_url(url)]
77
- # print("n urls", pdf_urls)
78
  for pdf_url in pdf_urls:
79
  try:
80
  pdf_loader = PyPDFLoader(pdf_url)
@@ -83,19 +89,15 @@ class DocumentsLoader:
83
  except Exception as e:
84
  print(f"Error pdf loader, {pdf_url}: {str(e)}")
85
 
86
- # notion loade: not working
87
- # if len(notion_urls) > 0:
88
- # print("ADDING NOTION URLS")
89
- # notion_urls = [url for url in notion_urls if self.is_notion_url(url)]
90
- # for notion_url in notion_urls:
91
- # print(notion_url)
92
- # try:
93
- # notion_loader = NotionDirectoryLoader(notion_url)
94
- # notion_docs = notion_loader.load()
95
- # print("Notion docs ", notion_docs)
96
- # docs = notion_docs + docs
97
- # except Exception as e:
98
- # print(f"Error notion loader, {notion_url}: {str(e)}")
99
  return docs
100
 
101
  def split_docs(self, docs, chunk_size=2000):
 
1
+ from langchain.document_loaders import PyPDFLoader, TextLoader
2
  import pandas as pd
3
  from langchain.document_loaders import WebBaseLoader
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
18
  def is_notion_url(self, url):
19
  # Regular expressions to match Notion URLs
20
  return "notion" in url
 
 
 
21
 
22
  def is_pdf_url(self, url):
23
  # Define a list of common PDF file extensions
 
29
  return True
30
  return False
31
 
32
+ def is_txt_url(self, url):
33
+ # Define the list of plain-text file extensions
34
+ pdf_extensions = [".txt"]
35
+
36
+ # Check if the URL ends with a plain-text file extension
37
+ for extension in pdf_extensions:
38
+ if url.endswith(extension):
39
+ return True
40
+ return False
41
+
42
  def is_valid_url(self, url):
43
  # TODO: handle status codes not 200
44
  try:
 
49
  return False
50
 
51
  def load_docs(self, doc_urls: list) -> list:
52
+ web_urls, pdf_urls, notion_urls, text_urls, docs = [], [], [], [], []
53
  if isinstance(doc_urls[0], list):
54
  doc_urls = [doc[0] for doc in doc_urls]
55
  # doc_urls = doc_urls[0]
 
63
  pdf_urls.append(url)
64
  if self.is_notion_url(url):
65
  notion_urls.append(url)
66
+ if self.is_txt_url(url):
67
+ text_urls.append(url)
68
  else:
69
  web_urls.append(url)
70
 
 
81
 
82
  # load pdf urls
83
  if len(pdf_urls) > 0:
 
 
 
84
  for pdf_url in pdf_urls:
85
  try:
86
  pdf_loader = PyPDFLoader(pdf_url)
 
89
  except Exception as e:
90
  print(f"Error pdf loader, {pdf_url}: {str(e)}")
91
 
92
+ if len(text_urls) > 0:
93
+ for txt_url in text_urls:
94
+ try:
95
+ txt_loader = TextLoader(txt_url)
96
+ txt_docs = txt_loader.load()
97
+ docs = docs + txt_docs
98
+ except Exception as e:
99
+ print(f"Error pdf loader, {txt_url}: {str(e)}")
100
+
 
 
 
 
101
  return docs
102
 
103
  def split_docs(self, docs, chunk_size=2000):
test_marcello.csv DELETED
@@ -1,3 +0,0 @@
1
- url
2
- https://en.wikipedia.org/wiki/Dragon_Ball
3
- https://en.wikipedia.org/wiki/Naruto