from dotenv import load_dotenv
import os
import re
import secrets

import pandas as pd
import streamlit as st

from services.document_manager.document_loader import DocumentsLoader
from app import client, default_embedding_function

load_dotenv()

# Prefer the Streamlit secrets store and fall back to the environment variable
# (the original assigned both in a row, so the first value was always discarded).
openai_key = st.secrets.get("OPENAI_API_KEY", os.getenv("OPENAI_API_KEY"))


def generate_knowledge_box_from_url(
    client,
    kb_name: str,
    urls: list,
    embedding_fct=default_embedding_function,
    chunk_size: int = 2_000,
):
    """Create a new Chroma collection and fill it with chunks scraped from `urls`."""
    dl = DocumentsLoader()
    docs = dl.load_docs(urls)
    splits = dl.split_docs(docs, chunk_size=chunk_size)

    contents = [split.page_content for split in splits]
    metadatas = [split.metadata for split in splits]
    # Collapse runs of newlines so each chunk is a single block of text.
    cleaned_contents = [re.sub(r"\n+", " ", content) for content in contents]

    chroma_collection = client.create_collection(
        kb_name,
        embedding_function=embedding_fct,
        metadata={"hnsw:space": "cosine"},  # cosine distance for similarity search
    )
    # Chroma needs a unique id per document; random hex tokens are sufficient here.
    ids = [secrets.token_hex(16) for _ in cleaned_contents]
    chroma_collection.add(documents=cleaned_contents, ids=ids, metadatas=metadatas)
    n_splits = chroma_collection.count()
    return {"status": 200, "n_split": n_splits}


def add_links_to_knowledge_base(
    client,
    kb_name: str,
    urls: list,
    chunk_size: int = 2_000,
    embedding_fct=default_embedding_function,
):
    """Append chunks scraped from `urls` to an existing Chroma collection."""
    dl = DocumentsLoader()
    docs = dl.load_docs(urls)
    splits = dl.split_docs(docs, chunk_size=chunk_size)

    contents = [split.page_content for split in splits]
    metadatas = [split.metadata for split in splits]
    # Collapse runs of newlines so each chunk is a single block of text.
    cleaned_contents = [re.sub(r"\n+", " ", content) for content in contents]

    # Embed with the caller-supplied function (this was hard-coded to the
    # default before, silently ignoring the `embedding_fct` argument).
    embeddings = embedding_fct(cleaned_contents)
    chroma_collection = client.get_collection(name=kb_name)
    ids = [secrets.token_hex(16) for _ in cleaned_contents]
    chroma_collection.add(
        documents=cleaned_contents, embeddings=embeddings, ids=ids, metadatas=metadatas
    )
    n_splits = chroma_collection.count()
    return {"status": 200, "n_split": n_splits}


if __name__ == "__main__":
    # Example: build a new knowledge base from a CSV with one URL per row.
    # df = pd.read_csv("test_marcello.csv")
    # urls = [row[0] for row in df.values.tolist()]
    # res = generate_knowledge_box_from_url(
    #     client=client,
    #     kb_name="new_new_test",
    #     urls=urls,
    #     embedding_fct=default_embedding_function,
    #     chunk_size=2_000,
    # )

    # Append links from a second CSV to an existing knowledge base.
    # `df.values.tolist()` returns a list of rows, so flatten it to a flat
    # list of URL strings (assumes the CSV has a single URL column).
    df = pd.read_csv("test2.csv")
    urls = [row[0] for row in df.values.tolist()]
    res = add_links_to_knowledge_base(
        client=client,
        kb_name="test",
        urls=urls,
    )
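

# --- Usage sketch ----------------------------------------------------------
# A minimal sketch of reading back from a knowledge base built above. It
# assumes the shared `client` from app.py is a chromadb client and that the
# collection was created with `default_embedding_function`; the function name,
# kb_name, and query text below are purely illustrative, not part of the
# original module.
def query_knowledge_base(client, kb_name: str, query: str, n_results: int = 3):
    collection = client.get_collection(
        name=kb_name, embedding_function=default_embedding_function
    )
    # Returns the n_results chunks closest to the query under cosine distance.
    return collection.query(query_texts=[query], n_results=n_results)


# Example call (hypothetical collection name):
# hits = query_knowledge_base(client, kb_name="test", query="What is RAG?")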