# handbookreader/chroma_utils.py
import os

from langchain_chroma import Chroma
# from langchain_ollama import OllamaEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document

# Cache downloaded sentence-transformers models under ./.cache so they are
# not re-downloaded on every start.
os.environ['SENTENCE_TRANSFORMERS_HOME'] = './.cache'

embed = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2")
# Alternative: initialize Ollama embeddings instead.
# embed = OllamaEmbeddings(
#     model="jina/jina-embeddings-v2-base-es")
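
# Quick sanity check for the embedding model (a minimal sketch, kept as a
# comment so importing this module does not force a model download):
#
#     vector = embed.embed_query("hello world")
#     print(len(vector))  # all-mpnet-base-v2 produces 768-dimensional vectors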


def save_handbook_to_chroma(handbook_data: list) -> bool:
    """
    Saves the entire handbook to Chroma with embeddings.

    Args:
        handbook_data (list): List of chapters, where each chapter is a list
            of dictionaries containing the title, URL, and text content of a
            section.

    Returns:
        bool: True if the handbook is saved correctly, False otherwise.
    """
    documents = []
    for chapter in handbook_data:
        for section in chapter:
            document = Document(
                page_content=section.get('text', ''),
                metadata={
                    'title': section.get('title', ''),
                    'url': section.get('url', '')
                }
            )
            documents.append(document)
    print("Saving handbook to Chroma. This process can take a long time.")
    try:
        ids = [str(i) for i in range(1, len(documents) + 1)]
        Chroma.from_documents(
            documents=documents,
            embedding=embed,
            persist_directory="./chroma_data",
            ids=ids,
        )
        return True
    except Exception as e:
        print(f"Error saving handbook to Chroma: {e}")
        return False
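

# Example input shape for save_handbook_to_chroma (an illustrative sketch:
# the real handbook scraper's output may differ, but the function iterates a
# list of chapters, each a list of section dicts):
#
#     sample_handbook = [
#         [
#             {'title': 'Intro', 'url': 'https://example.com/intro',
#              'text': 'Welcome to the handbook.'},
#         ],
#     ]
#     save_handbook_to_chroma(sample_handbook)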


def ask_chroma(question: str, k: int = 3) -> list:
    """
    Asks Chroma a question and returns the k most similar documents.

    Args:
        question (str): The question to ask Chroma.
        k (int): The number of most similar results to return. Defaults to 3.

    Returns:
        list: The k most similar Document objects, or an empty list if the
            search fails.
    """
    try:
        vectorstore = Chroma(
            embedding_function=embed,  # Reuse the module-level embedding model
            persist_directory="./chroma_data"
        )
        results = vectorstore.similarity_search(question, k=k)
        return results
    except Exception as e:
        print(f"Error asking Chroma: {e}")
        return []
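

# Minimal usage sketch: query the persisted store from the command line.
# Assumes "./chroma_data" was already populated via save_handbook_to_chroma;
# the sample question below is purely illustrative.
if __name__ == "__main__":
    for doc in ask_chroma("How do I request time off?", k=3):
        print(f"{doc.metadata.get('title', '')} -> {doc.metadata.get('url', '')}")
        print(doc.page_content[:200])
        print()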