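"""Knowledge-base ingestion helpers: load web pages, chunk them, and store
the chunks in a Chroma collection."""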
import os
import re
import secrets

import pandas as pd
import streamlit as st
from dotenv import load_dotenv

from app import client, default_embedding_function
from services.document_manager.document_loader import DocumentsLoader
load_dotenv()
# Read the OpenAI key from the environment, falling back to Streamlit
# secrets when deployed (e.g. on a Hugging Face Space).
openai_key = os.getenv("OPENAI_API_KEY") or st.secrets["OPENAI_API_KEY"]
def generate_knowledge_box_from_url(
    client,
    kb_name: str,
    urls: list,
    embedding_fct=default_embedding_function,
    chunk_size: int = 2_000,
):
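    """Create a new Chroma collection named `kb_name` and fill it with the
    chunked contents of `urls`.

    Documents are loaded, split into chunks of roughly `chunk_size`
    characters, lightly cleaned, and stored under random hex IDs.
    """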
    dl = DocumentsLoader()
    docs = dl.load_docs(urls)
    splits = dl.split_docs(docs, chunk_size=chunk_size)

    contents = [split.page_content for split in splits]
    metadatas = [split.metadata for split in splits]
    # Collapse newline runs so the chunks embed a little more cleanly.
    cleaned_contents = [re.sub(r"\n+", " ", content) for content in contents]

    chroma_collection = client.create_collection(
        kb_name,
        embedding_function=embedding_fct,
        metadata={"hnsw:space": "cosine"},
    )
    # Chroma requires caller-supplied IDs; random hex tokens keep them unique.
    ids = [secrets.token_hex(16) for _ in cleaned_contents]
    chroma_collection.add(documents=cleaned_contents, ids=ids, metadatas=metadatas)

    n_splits = chroma_collection.count()
    return {"status": 200, "n_splits": n_splits}
def add_links_to_knowledge_base(
    client,
    kb_name: str,
    urls: list,
    chunk_size: int = 2_000,
    embedding_fct=default_embedding_function,
):
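    """Append the chunked contents of `urls` to the existing Chroma
    collection `kb_name`.

    Embeddings are computed up front with `embedding_fct` and passed to
    Chroma explicitly, instead of relying on the collection's own
    embedding function.
    """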
    dl = DocumentsLoader()
    docs = dl.load_docs(urls)
    splits = dl.split_docs(docs, chunk_size=chunk_size)

    contents = [split.page_content for split in splits]
    metadatas = [split.metadata for split in splits]
    # Collapse newline runs so the chunks embed a little more cleanly.
    cleaned_contents = [re.sub(r"\n+", " ", content) for content in contents]

    # Embed with the caller-supplied function rather than always using the
    # module default, so a custom embedding_fct is actually honoured.
    embeddings = embedding_fct(cleaned_contents)

    chroma_collection = client.get_collection(name=kb_name)
    ids = [secrets.token_hex(16) for _ in cleaned_contents]
    chroma_collection.add(
        documents=cleaned_contents, embeddings=embeddings, ids=ids, metadatas=metadatas
    )

    n_splits = chroma_collection.count()
    return {"status": 200, "n_splits": n_splits}
if __name__ == "__main__":
    # Ad-hoc smoke test: read URLs from a one-column CSV and index them.
    # df.values.tolist() would yield one-element lists per row, so take the
    # first column instead (assuming one URL per row).
    df = pd.read_csv("test_marcello.csv")
    kb_name = "new_new_test"
    urls = df.iloc[:, 0].tolist()

    # res = generate_knowledge_box_from_url(
    #     client=client,
    #     urls=urls,
    #     kb_name=kb_name,
    #     embedding_fct=default_embedding_function,
    #     chunk_size=2_000,
    # )

    df = pd.read_csv("test2.csv")
    urls = df.iloc[:, 0].tolist()
    res = add_links_to_knowledge_base(
        client=client,
        kb_name="test",
        urls=urls,
    )
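
# A minimal sketch of how the populated collection might be queried afterwards.
# `query_texts` / `n_results` are standard chromadb Collection.query parameters;
# the collection name "test" matches the call above. Hypothetical usage, kept
# commented out so importing this module stays side-effect free:
#
#     collection = client.get_collection(name="test")
#     hits = collection.query(
#         query_texts=["What does the indexed site say about pricing?"],
#         n_results=3,
#     )
#     print(hits["documents"])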