Spaces:

brianknowsai
/

collection-manager

Running

App Files Files Community

collection-manager / pages /manage_knowledge_box.py

marcellopoliti

feat: add pdf docs

c5a0a6e 8 months ago

raw

history blame

3.45 kB

	import streamlit as st
	from retrieve_kb import get_current_knowledge_bases, get_knowledge_base_information
	from generate_kb import add_links_to_knowledge_base
	from app import client, default_embedding_function
	import pandas as pd
	from tempfile import NamedTemporaryFile
	import os

	# Page heading, then a button that renders whatever
	# get_current_knowledge_bases returns for the shared client as JSON.
	st.title("Get knowledge boxes")

	list_requested = st.button("Get current knowledge bases")
	if list_requested:
	    st.json(get_current_knowledge_bases(client=client))

	# Name of the knowledge base that the actions further down operate on.
	collection_name = st.text_input(label="knowledge base name")
	collection = None
	info = {}

	# Seed the cached table with an empty frame the first time this session
	# runs the page, so later code can always read st.session_state["df"].
	if "df" not in st.session_state:
	    st.session_state.df = pd.DataFrame()

	col1, col2 = st.columns(2)

	# "Get All": fetch the named knowledge base, remember the collection handle
	# in session state and cache a trimmed table of its records for display.
	get_all_clicked = st.button("Get All")
	if get_all_clicked:
	    collection_info, coll = get_knowledge_base_information(
	        client=client,
	        embedding_function=default_embedding_function,
	        kb_name=collection_name,
	    )
	    collection = coll
	    st.session_state["collection"] = coll

	    df = pd.DataFrame.from_records(collection_info)
	    # Lift the two metadata fields shown in the UI into their own columns,
	    # using the same fallback text when a record lacks the field.
	    for field in ("source", "title"):
	        df[field] = df["metadatas"].apply(
	            lambda meta, key=field: meta.get(key, "unkown")
	        )
	    shown_columns = ["documents", "source", "title", "ids"]
	    df = df[shown_columns]
	    st.session_state["df"] = df


	# Show the cached knowledge base table and offer removal of a single split.
	# The whole section depends on data cached by the "Get All" button (the
	# table and the collection handle in session state), so it is rendered only
	# once that table is non-empty.
	if len(st.session_state["df"]) != 0:
	    st.dataframe(st.session_state["df"], width=3_000)
	    unique_df = st.session_state["df"]["source"].unique()
	    st.text(f"unique urls: {len(unique_df)}")
	    st.dataframe(unique_df)

	    st.header("Remove a split")
	    # Named split_id rather than `id` so the builtin is not shadowed.
	    split_id = st.text_input("Insert a split id")
	    if st.button("Remove Id from collection"):
	        if split_id in st.session_state["df"]["ids"].values.tolist():
	            # Pass the entered id itself; the previous f"id" literal always
	            # asked the collection to delete a record literally named "id".
	            st.session_state["collection"].delete(ids=[split_id])
	            # Drop the row from the cached table too, so the next rerun
	            # does not keep listing a split that no longer exists.
	            cached = st.session_state["df"]
	            st.session_state["df"] = cached[cached["ids"] != split_id]
	            st.success(f"id {split_id} deleted")
	        else:
	            st.error(f"id {split_id} not in kb")

	# Single-URL ingestion into the knowledge base named in the text input.
	st.header("Add url to existing collection")
	url_text = st.text_input("Insert a url link")
	add_url_clicked = st.button("add url to collection")
	if add_url_clicked:
	    # The helper takes a list of links, so wrap the single entry.
	    res = add_links_to_knowledge_base(
	        client=client, kb_name=collection_name, urls=[url_text]
	    )
	    st.write(res)

	# PDF ingestion: the uploaded bytes are spooled to a temporary .pdf file
	# whose path is handed to add_links_to_knowledge_base in place of a URL.
	st.header("Add pdf to existing collection")
	uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
	if st.button("add pdf"):
	    if uploaded_file is None:
	        # The uploader yields None until a file is chosen; without this
	        # guard the click crashed with AttributeError on .getvalue().
	        st.error("Please upload a PDF file first")
	    else:
	        # delete=False keeps the file on disk after the handle is closed,
	        # so it can be reopened by path; it is removed explicitly below.
	        with NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
	            tmp_file.write(uploaded_file.getvalue())
	            tmp_path = tmp_file.name
	        # The handle is closed here, so the written bytes are flushed
	        # before anything reads the file back by path.
	        try:
	            print("PATH: ", tmp_path)
	            urls = [tmp_path]
	            res = add_links_to_knowledge_base(
	                client=client, kb_name=collection_name, urls=urls
	            )
	            st.write(res)
	        finally:
	            # Always clean up, even when ingestion raises.
	            os.remove(tmp_path)

	# if st.button("add pdf"):
	# urls = [url_text] # put in a list even if only one
	# res = add_links_to_knowledge_base(client=client, kb_name=collection_name, urls=urls)
	# st.write(res)


	# CSV ingestion: preview the uploaded file, then pass its rows to
	# add_links_to_knowledge_base when the button is pressed.
	st.header("Add csv to existing collection")
	uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
	df = None
	# Stays None until a CSV has been parsed successfully, so the button handler
	# can tell "nothing usable uploaded" apart from real data instead of
	# hitting a NameError on an unbound variable.
	new_df = None

	if uploaded_file is not None:
	    try:
	        new_df = pd.read_csv(uploaded_file)
	        st.write("DataFrame:")
	        st.write(new_df)
	    except Exception as e:
	        # Surface parse/render problems in the page rather than crashing.
	        st.error(str(e))

	if st.button("add csv urls to collection"):
	    if new_df is None:
	        st.error("Upload a valid CSV file before adding urls")
	    else:
	        # NOTE(review): .values.tolist() yields one list per CSV row, while
	        # the other call sites in this file pass a flat list of strings -
	        # confirm add_links_to_knowledge_base accepts nested rows.
	        urls = new_df.values.tolist()
	        st.write(urls)
	        res = add_links_to_knowledge_base(
	            client=client, kb_name=collection_name, urls=urls
	        )
	        st.write(res)