# collection-manager / generate_kb.py
import os
import re
import secrets

import pandas as pd
from dotenv import load_dotenv

from main import client, default_embedding_function
from services.document_manager.document_loader import DocumentsLoader

load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")
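
# `client` and `default_embedding_function` are imported from main.py, which is
# not shown here. A minimal sketch of what main.py is assumed to expose (the
# real module may differ):
#
#     import chromadb
#     import chromadb.utils.embedding_functions as embedding_functions
#
#     client = chromadb.PersistentClient(path="chroma_db")
#     default_embedding_function = embedding_functions.OpenAIEmbeddingFunction(
#         api_key=openai_key,
#         model_name="text-embedding-ada-002",
#     )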


def generate_knowledge_box_from_url(
    client,
    kb_name: str,
    urls: list,
    embedding_fct=default_embedding_function,
    chunk_size: int = 2_000,
):
    """Create a new Chroma collection named kb_name from the documents at urls."""
    dl = DocumentsLoader()
    docs = dl.load_docs(urls)
    splits = dl.split_docs(docs, chunk_size=chunk_size)
    contents = [split.page_content for split in splits]
    metadatas = [split.metadata for split in splits]
    # Collapse runs of newlines so each chunk is a single block of text.
    cleaned_contents = [re.sub(r"\n+", " ", content) for content in contents]
    chroma_collection = client.create_collection(
        kb_name,
        embedding_function=embedding_fct,
        metadata={"hnsw:space": "cosine"},  # cosine distance for the HNSW index
    )
    # Chroma requires caller-supplied ids; random hex tokens avoid collisions.
    ids = [secrets.token_hex(16) for _ in cleaned_contents]
    chroma_collection.add(documents=cleaned_contents, ids=ids, metadatas=metadatas)
    n_splits = chroma_collection.count()
    return {"status": 200, "n_split": n_splits}
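
# Hypothetical usage (collection name and URL are illustrative only):
#
#     res = generate_knowledge_box_from_url(
#         client=client,
#         kb_name="docs_kb",
#         urls=["https://example.com/docs"],
#     )
#     # -> {"status": 200, "n_split": <number of stored chunks>}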


def add_links_to_knowledge_base(
    client,
    kb_name: str,
    urls: list,
    chunk_size: int = 2_000,
    embedding_fct=default_embedding_function,
):
    """Append the documents at urls to the existing Chroma collection kb_name."""
    dl = DocumentsLoader()
    docs = dl.load_docs(urls)
    splits = dl.split_docs(docs, chunk_size=chunk_size)
    contents = [split.page_content for split in splits]
    metadatas = [split.metadata for split in splits]
    # Collapse runs of newlines so each chunk is a single block of text.
    cleaned_contents = [re.sub(r"\n+", " ", content) for content in contents]
    # Embed with the caller-supplied function: get_collection() does not
    # re-attach an embedding function, so add() needs explicit embeddings.
    embeddings = embedding_fct(cleaned_contents)
    chroma_collection = client.get_collection(name=kb_name)
    ids = [secrets.token_hex(16) for _ in cleaned_contents]
    chroma_collection.add(
        documents=cleaned_contents, embeddings=embeddings, ids=ids, metadatas=metadatas
    )
    n_splits = chroma_collection.count()
    return {"status": 200, "n_split": n_splits}
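
# Sketch of querying a collection built by the functions above. Chroma does not
# persist embedding functions, so pass the same one back when fetching the
# collection (names and query text are illustrative only):
#
#     collection = client.get_collection(
#         name="docs_kb", embedding_function=default_embedding_function
#     )
#     hits = collection.query(query_texts=["what is a knowledge box?"], n_results=3)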


if __name__ == "__main__":
    # Smoke test against local CSV files of URLs. Note that df.values.tolist()
    # yields one list per CSV row, which DocumentsLoader is expected to accept.
    df = pd.read_csv("test_marcello.csv")
    kb_name = "new_new_test"
    urls = df.values.tolist()
    # res = generate_knowledge_box_from_url(
    #     client=client,
    #     urls=urls,
    #     kb_name=kb_name,
    #     embedding_fct=default_embedding_function,
    #     chunk_size=2_000,
    # )

    df = pd.read_csv("test2.csv")
    urls = df.values.tolist()
    res = add_links_to_knowledge_base(
        client=client,
        kb_name="test",
        urls=urls,
    )
    print(res)