# collection-manager / pages /manage_knowledge_box.py
# marcellopoliti's picture
# feat: add pdf-url-title triplet
# d258ef6
# raw
# history blame
# 4.14 kB
import streamlit as st
from retrieve_kb import get_current_knowledge_bases, get_knowledge_base_information
from generate_kb import add_links_to_knowledge_base
from app import client, default_embedding_function
import pandas as pd
from tempfile import NamedTemporaryFile
import os
# Page heading, followed by an on-demand JSON dump of whatever
# get_current_knowledge_bases() returns for the shared client.
st.title("Get knowledge boxes")

list_requested = st.button("Get current knowledge bases")
if list_requested:
    st.json(get_current_knowledge_bases(client=client))
# --- Load a knowledge base into the page ------------------------------------
# Name of the knowledge base that the sections of this page operate on.
collection_name = st.text_input(label="knowledge base name")
info = {}
collection = None

# Streamlit re-executes the script on every interaction, so the table of
# splits is kept in the session state instead of a plain variable.
if "df" not in st.session_state:
    st.session_state["df"] = pd.DataFrame()

col1, col2 = st.columns(2)  # NOTE(review): not used anywhere visible - confirm

if st.button("Get All"):
    # The helper hands back a client as its third value; it replaces the
    # imported `client` for the remainder of this run and is also cached.
    collection_info, coll, client = get_knowledge_base_information(
        client=client,
        embedding_function=default_embedding_function,
        kb_name=collection_name,
    )
    st.session_state["collection"] = coll
    st.session_state["client"] = client
    collection = coll

    table_columns = ["documents", "source", "title", "ids"]
    df = pd.DataFrame.from_records(collection_info)
    if df.empty:
        # No rows: the "metadatas" column may not exist at all, so skip the
        # column derivation and cache an empty table that still has the
        # expected columns (this also clears a previously loaded table).
        df = pd.DataFrame(columns=table_columns)
        st.warning(f"knowledge base '{collection_name}' contains no splits")
    else:

        def _metadata_value(metadata, key):
            """Return metadata[key], or the placeholder when it is unavailable."""
            # A split may carry no metadata dict at all (e.g. None); treat
            # that the same way as a missing key instead of raising.
            if isinstance(metadata, dict):
                return metadata.get(key, "unkown")
            return "unkown"

        df["source"] = df["metadatas"].apply(lambda m: _metadata_value(m, "source"))
        df["title"] = df["metadatas"].apply(lambda m: _metadata_value(m, "title"))
        df = df[table_columns]
    st.session_state["df"] = df
# Render the cached table of splits, if there is one, together with the
# distinct values of its "source" column.
splits_table = st.session_state["df"]
if len(splits_table) > 0:
    st.dataframe(splits_table, width=3_000)
    unique_df = splits_table["source"].unique()
    st.text(f"unique urls: {len(unique_df)}")
    st.dataframe(unique_df)
# --- Remove a single split by id --------------------------------------------
st.header("Remove a split")
split_id = st.text_input("Insert a split id")
if st.button("Remove Id from collection"):
    # Both the collection handle and the table of ids are only placed in the
    # session state by the "Get All" button; without them there is nothing
    # to look the id up in, so report that instead of raising KeyError.
    kb_loaded = (
        "collection" in st.session_state
        and "df" in st.session_state
        and "ids" in st.session_state["df"]
    )
    if not kb_loaded:
        st.error("load a knowledge base with 'Get All' first")
    elif split_id in st.session_state["df"]["ids"].values.tolist():
        st.session_state["collection"].delete(ids=[f"{split_id}"])
        # Keep the cached table in step with the collection so the removed
        # split is no longer listed (or deletable) on the next rerun.
        cached = st.session_state["df"]
        st.session_state["df"] = cached[cached["ids"] != split_id]
        st.success(f"id {split_id} deleted")
    else:
        st.error(f"id {split_id} not in kb")
# --- Remove every split that came from one url ------------------------------
st.header("Remove url from collection")
url = st.text_input("remove url")
if st.button("Remove url from collection"):
    if "collection" not in st.session_state:
        # The collection handle is only stored by the "Get All" button.
        st.error("load a knowledge base with 'Get All' first")
    elif not url.strip():
        st.error("insert the url to remove")
    else:
        # Broad catch on purpose: this is the UI boundary, and any backend
        # failure should be shown on the page rather than abort the script.
        try:
            loaded_collection = st.session_state["collection"]
            ids = loaded_collection.get(where={"source": url})["ids"]
            if ids:
                loaded_collection.delete(ids=ids)
                st.success("deleted")
            else:
                # Never call delete() with an empty id list, and do not
                # claim success when nothing matched.
                # NOTE(review): the collection looks like a Chroma collection
                # - confirm how its delete() treats an empty id list.
                st.warning(f"no splits found for url {url}")
        except Exception as e:
            st.error(str(e))
# --- Add the content behind one url to the selected knowledge base ----------
st.header("Add url to existing collection")
url_text = st.text_input("Insert a url link")
if st.button("add url to collection"):
    cleaned_url = url_text.strip()
    if not cleaned_url:
        # An empty text box would otherwise be submitted as the url "".
        st.error("insert a url link first")
    else:
        urls = [cleaned_url]  # the helper takes a list, even for a single url
        res = add_links_to_knowledge_base(
            client=client, kb_name=collection_name, urls=urls
        )
        st.write(res)
# --- Add an uploaded pdf to the selected knowledge base ---------------------
st.header("Add pdf to existing collection")
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
pdf_optional_link = st.text_input(
    "Insert a URL link you want to associate with the pdf"
)
pdf_title = st.text_input("This title will be displayed as a resource in ask brian")
if st.button("add pdf"):
    if uploaded_file is None:
        # The uploader yields None until a file has been chosen; stop before
        # a temporary file is created for nothing.
        st.error("choose a PDF file first")
    else:
        # The helper is given a path, so the upload is spooled to a temporary
        # file. delete=False keeps the file after the `with` block closes
        # (and flushes) it; it is removed explicitly below.
        with NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(uploaded_file.getvalue())
            tmp_path = tmp_file.name
        try:
            print("PATH: ", tmp_path)
            urls = [tmp_path]
            res = add_links_to_knowledge_base(
                client=client,
                kb_name=collection_name,
                urls=urls,
                pdf_optional_link=pdf_optional_link,
                pdf_title=pdf_title,
            )
            st.write(res)
        finally:
            # Clean up even when ingestion raises, so failed uploads do not
            # pile up in the temp directory.
            os.remove(tmp_path)
# if st.button("add pdf"):
# urls = [url_text] # put in a list even if only one
# res = add_links_to_knowledge_base(client=client, kb_name=collection_name, urls=urls)
# st.write(res)
# --- Add the rows of an uploaded csv to the selected knowledge base ---------
st.header("Add csv to existing collection")
uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])

# Stays None until a CSV has been uploaded AND parsed, so the button handler
# below can tell "nothing usable yet" apart from a real table.
new_df = None
if uploaded_file is not None:
    # Broad catch on purpose: any parsing problem is shown on the page.
    try:
        new_df = pd.read_csv(uploaded_file)
        st.write("DataFrame:")
        st.write(new_df)
    except Exception as e:
        st.error(str(e))

if st.button("add csv urls to collection"):
    if new_df is None:
        st.error("upload a valid CSV file first")
    else:
        # One inner list per CSV row, holding that row's cell values.
        # NOTE(review): the other sections pass a flat list of strings -
        # confirm the helper accepts row lists here.
        urls = new_df.values.tolist()
        st.write(urls)
        res = add_links_to_knowledge_base(
            client=client, kb_name=collection_name, urls=urls
        )
        st.write(res)