Spaces:

jessica45
/

rag

Runtime error

App Files Files Community

jessica45 committed on Feb 12

Commit

64103af

verified ·

1 Parent(s): 1a10750

Update chroma_db_utils.py

Browse files

Files changed (1) hide show

chroma_db_utils.py +0 -167

chroma_db_utils.py CHANGED Viewed

@@ -1,170 +1,3 @@
-# import os
-# import chromadb
-# import numpy as np
-# from typing import List, Tuple
-# from gemini_embedding import GeminiEmbeddingFunction
-# def create_chroma_db(documents: List[str], dataset_name: str, base_path: str = "chroma_db"):
-#     """
-#     Creates a Chroma database using the provided documents.
-#     Automatically generates path and collection name based on dataset_name.
-#     """
-#     path = os.path.join(base_path, dataset_name)
-#     name = f"{dataset_name}_collection"
-#     if not os.path.exists(path):
-#         os.makedirs(path)
-#     chroma_client = chromadb.PersistentClient(path=path)
-#     db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())
-#     for i, doc in enumerate(documents):
-#         db.add(documents=[doc], ids=[str(i)])
-#     return db
-# def load_chroma_collection(dataset_name: str, base_path: str = "chroma_db"):
-#     """
-#     Loads an existing Chroma collection.
-#     """
-#     path = os.path.join(base_path, dataset_name)
-#     name = f"{dataset_name}_collection"
-#     chroma_client = chromadb.PersistentClient(path=path)
-#     return chroma_client.get_collection(name=name, embedding_function=GeminiEmbeddingFunction())
-# def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
-#     """
-#     Calculate cosine similarity between two vectors.
-#     Returns a value between -1 and 1, where 1 means most similar.
-#     """
-#     dot_product = np.dot(vec1, vec2)
-#     norm1 = np.linalg.norm(vec1)
-#     norm2 = np.linalg.norm(vec2)
-#     return dot_product / (norm1 * norm2)
-# def get_relevant_passage(query: str, db, n_results: int = 5) -> List[str]:
-#     """
-#     Retrieves relevant passages using explicit cosine similarity calculation.
-#     """
-#     # Get query embedding
-#     query_embedding = db._embedding_function([query])[0]
-#     # Get all document embeddings
-#     all_docs = db.get(include=['embeddings', 'documents'])
-#     doc_embeddings = all_docs['embeddings']
-#     documents = all_docs['documents']
-#     # Calculate cosine similarity for each document
-#     similarities = []
-#     for doc_embedding in doc_embeddings:
-#         similarity = cosine_similarity(query_embedding, doc_embedding)
-#         similarities.append(similarity)
-#     # Sort documents by similarity
-#     doc_similarities = list(zip(documents, similarities))
-#     doc_similarities.sort(key=lambda x: x[1], reverse=True)
-#     # Take top n results
-#     top_results = doc_similarities[:n_results]
-#     # Print results for debugging
-#     print(f"Number of relevant passages retrieved: {len(top_results)}")
-#     for i, (doc, similarity) in enumerate(top_results):
-#         print(f"Passage {i+1} (Cosine Similarity: {similarity:.4f}): {doc[:100]}...")
-#     # Return just the documents
-#     return [doc for doc, _ in top_results]
-# in memory
-# import chromadb
-# from typing import List
-# from gemini_embedding import GeminiEmbeddingFunction  # Ensure this is correctly implemented
-# import time
-# from chromadb.config import Settings
-# def create_chroma_db(chunks: List[str]):
-#     """Create and return an in-memory ChromaDB collection."""
-#     try:
-#         # Initialize in-memory ChromaDB with current recommended configuration
-#         client = chromadb.Client()
-#         # Create collection with unique name to avoid conflicts
-#         collection_name = f"temp_collection_{int(time.time())}"
-#         collection = client.create_collection(name=collection_name)
-#         # Add documents with unique IDs
-#         collection.add(
-#             documents=chunks,
-#             ids=[f"doc_{i}" for i in range(len(chunks))]
-#         )
-#         # Verify the data was added
-#         verify_count = collection.count()
-#         print(f"Verified: Added {verify_count} documents to collection {collection_name}")
-#         # Test query to ensure collection is working
-#         test_results = collection.query(
-#             query_texts=["test"],
-#             n_results=1
-#         )
-#         print("Verified: Collection is queryable")
-#         return collection
-#     except Exception as e:
-#         print(f"Error creating ChromaDB: {str(e)}")
-#         return None
-# def get_relevant_passage(query: str, db, n_results: int = 5) -> List[str]:
-#     """
-#     Retrieves relevant passages using ChromaDB's similarity search.
-#     """
-#     try:
-#         if db is None:
-#             print("Database not initialized")
-#             return []
-#         # Verify collection has documents
-#         count = db.count()
-#         if count == 0:
-#             print("Collection is empty")
-#             return []
-#         # Query the database
-#         results = db.query(
-#             query_texts=[query],
-#             n_results=min(n_results, count)  # Ensure we don't request more than we have
-#         )
-#         # Ensure results exist
-#         if not results["documents"]:
-#             print("No relevant passages found.")
-#             return []
-#         documents = results["documents"][0]  # First result batch
-#         distances = results["distances"][0]  # Corresponding distances
-#         # Debug output
-#         print(f"Number of relevant passages retrieved: {len(documents)}")
-#         for i, (doc, distance) in enumerate(zip(documents, distances)):
-#             similarity = 1 - distance  # Convert distance to similarity
-#             print(f"Passage {i+1} (Similarity: {similarity:.4f}): {doc[:100]}...")
-#         return documents
-#     except Exception as e:
-#         print(f"Error in get_relevant_passage: {str(e)}")
-#         return []
 import chromadb
 from chromadb.config import Settings
 from typing import List

 import chromadb
 from chromadb.config import Settings
 from typing import List