Spaces:

colonelwatch
/

abstracts-index

Running on Zero

colonelwatch committed on Feb 23

Commit

d56bf2d

1 Parent(s): 2db96ca

Drop the plan to use shards

Files changed (1) hide show

app.py CHANGED Viewed

@@ -128,22 +128,15 @@ def get_model(
     )
-def merge_shards(dir: Path) -> faiss.Index:
-    empty_path = dir / "empty.faiss"
-    shard_paths = [str(p) for p in dir.glob("shard_*.faiss")]
-    merged_ivfdata_path = Path("temp.ivfdata")
-    index = faiss.read_index(str(empty_path))
-    merged_ivfdata_path.unlink(missing_ok=True)  # overwrite previous if it exists  (TODO: do I need this?)
-    merge_ondisk(index, shard_paths, str(merged_ivfdata_path))
-    return index
 def get_index(dir: Path, search_time_s: float) -> Dataset:
-    # NOTE: a private attr is used to get the faiss.IO_FLAG_ONDISK_SAME_DIR flag!
     index: Dataset = Dataset.from_parquet(str(dir / "ids.parquet"))  # type: ignore
-    faiss_index = merge_shards(dir / "shards")
     index._indexes["embedding"] = FaissIndex(None, None, None, faiss_index)
     with open(dir / "params.json", "r") as f:

     )
+def open_ondisk(dir: Path) -> faiss.Index:
+    # without IO_FLAG_ONDISK_SAME_DIR, read_index gets on-disk indices in working dir
+    return faiss.read_index(str(dir / "index.faiss"), faiss.IO_FLAG_ONDISK_SAME_DIR)
 def get_index(dir: Path, search_time_s: float) -> Dataset:
+    # NOTE: use a private attr to load the index with IO_FLAG_ONDISK_SAME_DIR!
     index: Dataset = Dataset.from_parquet(str(dir / "ids.parquet"))  # type: ignore
+    faiss_index = open_ondisk(dir)
     index._indexes["embedding"] = FaissIndex(None, None, None, faiss_index)
     with open(dir / "params.json", "r") as f: