# collection-manager / generate_kb.py
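"""Utilities for building and extending Chroma knowledge-base collections from lists of URLs."""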
from services.document_manager.document_loader import DocumentsLoader
# from services.vectordb_manager.vectordb_manager import VectordbManager
import pandas as pd
from dotenv import load_dotenv
import os
import re
from app import client, default_embedding_function
import secrets
import streamlit as st
from spellchecker import SpellChecker
load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")
def clean_text(text):
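    """Replace non-ASCII characters (and '�' artifact runs) with spaces."""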
# Remove non-ASCII characters
cleaned_text = re.sub(r"[^\x00-\x7F]+", " ", text)
    # Collapse runs of the Unicode replacement character (encoding artifacts) into a single space
    cleaned_text = re.sub(r"�+", " ", cleaned_text)
# Further cleaning steps can be added here as needed
return cleaned_text
def correct_spelling(text):
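    """Spell-correct each whitespace-separated word; words with no suggestion are kept as-is."""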
spell = SpellChecker()
    corrected_text = " ".join(
        [spell.correction(word) or word for word in text.split()]
    )
return corrected_text
def clean_list(contents, metadatas):
    """Drop chunks whose text is more than 20% non-ASCII, keeping metadatas aligned."""
    cleaned_contents = []
    cleaned_metadatas = []
    for text, metadata in zip(contents, metadatas):
        # Skip empty chunks to avoid dividing by zero below
        if not text:
            continue
        # Count weird (non-ASCII) characters
        weird_chars = sum(ord(char) >= 128 for char in text)
        # Keep the chunk only if weird characters make up at most 20% of the text
        if weird_chars / len(text) <= 0.20:
            cleaned_contents.append(text)
            cleaned_metadatas.append(metadata)
    return cleaned_contents, cleaned_metadatas
def generate_knowledge_box_from_url(
client,
kb_name: str,
urls: list,
embedding_fct=default_embedding_function,
chunk_size: int = 2_000,
):
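    """Create a new Chroma collection `kb_name` from the documents at `urls`.

    Documents are loaded, split into chunks, cleaned, spell-corrected, and stored
    with random hex ids. Returns a status dict with the collection's chunk count.
    """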
dl = DocumentsLoader()
docs = dl.load_docs(urls)
splits = dl.split_docs(docs, chunk_size=chunk_size)
contents = [split.page_content for split in splits]
metadatas = [split.metadata for split in splits]
    # Filter out noisy chunks; metadatas stay aligned with the surviving contents
    cleaned_contents, metadatas = clean_list(contents, metadatas)
    # Collapse runs of newlines into single spaces
    cleaned_contents = [re.sub(r"\n+", " ", content) for content in cleaned_contents]
cleaned_contents = [clean_text(content) for content in cleaned_contents]
cleaned_contents = [correct_spelling(content) for content in cleaned_contents]
chroma_collection = client.create_collection(
kb_name,
embedding_function=embedding_fct,
metadata={"hnsw:space": "cosine"},
)
ids = [secrets.token_hex(16) for _ in cleaned_contents]
chroma_collection.add(documents=cleaned_contents, ids=ids, metadatas=metadatas)
n_splits = chroma_collection.count()
return {"status": 200, "n_split": n_splits}
def add_links_to_knowledge_base(
client,
kb_name: str,
urls: list,
chunk_size: int = 2_000,
pdf_optional_link=None,
youtube_optional_link=None,
video_title=None,
pdf_title=None,
embedding_fct=default_embedding_function,
):
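    """Add the documents at `urls` to the existing collection `kb_name`.

    If a PDF or YouTube link and title are provided, they override the source and
    title metadata of every chunk added by this call. Returns a status dict with
    the collection's chunk count.
    """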
dl = DocumentsLoader()
docs = dl.load_docs(urls)
splits = dl.split_docs(docs, chunk_size=chunk_size)
contents = [split.page_content for split in splits]
metadatas = [split.metadata for split in splits]
if pdf_optional_link and pdf_title:
for md in metadatas:
md["source"] = pdf_optional_link
md["title"] = pdf_title
if youtube_optional_link and video_title:
for md in metadatas:
md["source"] = youtube_optional_link
md["title"] = video_title
    # Collapse runs of newlines into single spaces
    cleaned_contents = [re.sub(r"\n+", " ", content) for content in contents]
    # Use the caller-provided embedding function (defaults to default_embedding_function)
    embeddings = embedding_fct(cleaned_contents)
chroma_collection = client.get_collection(name=kb_name)
ids = [secrets.token_hex(16) for _ in cleaned_contents]
chroma_collection.add(
documents=cleaned_contents, embeddings=embeddings, ids=ids, metadatas=metadatas
)
n_splits = chroma_collection.count()
return {"status": 200, "n_split": n_splits}
if __name__ == "__main__":
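    # Local smoke test: read URL lists from CSV files and populate collections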
df = pd.read_csv("test_marcello.csv")
kb_name = "new_new_test"
urls = df.values.tolist()
# res = generate_knowledge_box_from_url(
# client=client,
# urls=urls,
# kb_name=kb_name,
# embedding_fct=default_embedding_function,
# chunk_size=2_000,
# )
df = pd.read_csv("test2.csv")
urls = df.values.tolist()
res = add_links_to_knowledge_base(
client=client,
kb_name="test",
urls=urls,
)