PleIAs
/

Cassandre-RAG

Model card Files Files and versions Community

Carlos Rosas committed on Oct 18, 2024

Commit

9d45bed

·

verified ·

1 Parent(s): 8b9eee8

Update README.md

Files changed (1) hide show

README.md +38 -4

README.md CHANGED Viewed

@@ -66,16 +66,18 @@ This answer will also contain the excerpts of the documents used and the ID of t
 ### Example Usage
-In this example, we will be using LanceDB for the retrieval part. You can use your preferred embedding model to create the embeddings together with LanceDB and add them to the database, and its hybrid search feature allows us to combine vector search with keyword search for better retrieval.
 ```python
 import lancedb
 from vllm import LLM, SamplingParams
 import pandas as pd
 # Initialize LanceDB
 db = lancedb.connect("lancedb_data")
-table = db.open_table("education")
 # We will create some fictitious education documents to add to the database
 documents = [
@@ -96,8 +98,40 @@ documents = [
     }
 ]
-# Add documents to LanceDB
-table.add(documents)
 # Load the model
 model_name = "PleIAs/Cassandre-RAG"

 ### Example Usage
+In this example, we will be using BGE for the embeddings and LanceDB for the retrieval part. You can use your preferred embedding model to create the embeddings and add them to the database. LanceDB's hybrid search feature allows us to combine vector search with keyword search for better retrieval.
 ```python
 import lancedb
 from vllm import LLM, SamplingParams
 import pandas as pd
+from tqdm import tqdm
+from lancedb.pydantic import LanceModel, Vector
+from lancedb.embeddings import get_registry
 # Initialize LanceDB
 db = lancedb.connect("lancedb_data")
 # We will create some fictitious education documents to add to the database
 documents = [
     }
 ]
+# Load the BGE embedding model and define the LanceDB table schema
+model = get_registry().get("sentence-transformers").create(name="BAAI/bge-m3", device="cuda")
+class Documents(LanceModel):
+    main_title: str
+    text: str = model.SourceField()
+    hash: str
+    vector: Vector(model.ndims()) = model.VectorField()
+#Create table
+table = db.create_table("example", schema=Documents, mode="overwrite")
+def process_batch(batch):
+    processed_documents = []
+    for item in batch:
+        try:
+            processed_documents.append({
+                "hash": item.get("hash", ""),
+                "main_title": item.get("main_title", ""),
+                "text": item.get("text", "")
+                # Add any other fields you want to include
+            })
+        except Exception as e:
+            print(f"Error processing item: {item}")
+            print(f"Error message: {str(e)}")
+    return processed_documents
+# Process and add documents in batches
+batch_size = 2  # Adjust as needed
+for i in tqdm(range(0, len(documents), batch_size)):
+    batch = documents[i:i+batch_size]
+    processed_batch = process_batch(batch)
+    if processed_batch:  # Only add if the batch is not empty
+        table.add(processed_batch)
 # Load the model
 model_name = "PleIAs/Cassandre-RAG"