colonelwatch committed on
Commit
d56bf2d
·
1 Parent(s): 2db96ca

Drop the plan to use shards

Browse files
Files changed (1) hide show
  1. app.py +5 -12
app.py CHANGED
@@ -128,22 +128,15 @@ def get_model(
128
  )
129
 
130
 
131
- def merge_shards(dir: Path) -> faiss.Index:
132
- empty_path = dir / "empty.faiss"
133
- shard_paths = [str(p) for p in dir.glob("shard_*.faiss")]
134
- merged_ivfdata_path = Path("temp.ivfdata")
135
-
136
- index = faiss.read_index(str(empty_path))
137
- merged_ivfdata_path.unlink(missing_ok=True) # overwrite previous if it exists (TODO: do I need this?)
138
- merge_ondisk(index, shard_paths, str(merged_ivfdata_path))
139
-
140
- return index
141
 
142
 
143
  def get_index(dir: Path, search_time_s: float) -> Dataset:
144
- # NOTE: a private attr is used to get the faiss.IO_FLAG_ONDISK_SAME_DIR flag!
145
  index: Dataset = Dataset.from_parquet(str(dir / "ids.parquet")) # type: ignore
146
- faiss_index = merge_shards(dir / "shards")
147
  index._indexes["embedding"] = FaissIndex(None, None, None, faiss_index)
148
 
149
  with open(dir / "params.json", "r") as f:
 
128
  )
129
 
130
 
131
+ def open_ondisk(dir: Path) -> faiss.Index:
132
+ # without IO_FLAG_ONDISK_SAME_DIR, read_index gets on-disk indices in working dir
133
+ return faiss.read_index(str(dir / "index.faiss"), faiss.IO_FLAG_ONDISK_SAME_DIR)
 
 
 
 
 
 
 
134
 
135
 
136
  def get_index(dir: Path, search_time_s: float) -> Dataset:
137
+ # NOTE: use a private attr to load the index with IO_FLAG_ONDISK_SAME_DIR!
138
  index: Dataset = Dataset.from_parquet(str(dir / "ids.parquet")) # type: ignore
139
+ faiss_index = open_ondisk(dir)
140
  index._indexes["embedding"] = FaissIndex(None, None, None, faiss_index)
141
 
142
  with open(dir / "params.json", "r") as f: