Spaces:

bioscan-ml
/

browser-backend

Runtime error

App Files Files Community

browser-backend / prepare_index.py

atwang

update code to download dataset files from separate repo

07356cd 5 days ago

raw

history blame contribute delete

3.05 kB

	import pickle
	from pathlib import Path

	import click
	import faiss
	import h5py

	ALL_KEY_TYPES = ["dna", "image"]
	ALL_INDEX_TYPES = ["IndexFlatIP", "IndexFlatL2", "IndexIVFFlat", "IndexHNSWFlat", "IndexLSH"]
	EMBEDDING_SIZE = 768


	def process(embedding_data, output: Path, key_type: str, index_type: str):
	# load embeddings
	embeddings = embedding_data[f"encoded_{key_type}_feature"][:]

	# FlatIP and FlatL2
	if index_type == "IndexFlatIP":
	test_index = faiss.IndexFlatIP(embeddings.shape[-1])
	elif index_type == "IndexFlatL2":
	test_index = faiss.IndexFlatL2(embeddings.shape[-1])
	elif index_type == "IndexIVFFlat":
	# IVFFlat
	quantizer = faiss.IndexFlatIP(embeddings.shape[-1])
	test_index = faiss.IndexIVFFlat(quantizer, embeddings.shape[-1], 128)
	test_index.train(embeddings)
	elif index_type == "IndexHNSWFlat":
	# HNSW
	# 16: connections for each vertex. efSearch: depth of search during search. efConstruction: depth of search during build
	test_index = faiss.IndexHNSWFlat(embeddings.shape[-1])
	test_index.hnsw.efSearch = 32
	test_index.hnsw.efConstruction = 64
	elif index_type == "IndexLSH":
	# LSH
	test_index = faiss.IndexLSH(embeddings.shape[-1], embeddings.shape[-1] * 2)
	else:
	raise ValueError(f"Index type {index_type} is not supported")

	test_index.add(embeddings)

	faiss.write_index(test_index, str(output / f"bioscan_5m_{key_type}_{index_type}.index"))
	print("Saved index to", output / f"bioscan_5m_{key_type}_{index_type}.index")


	@click.command()
	@click.option(
	"--input",
	type=click.Path(path_type=Path),
	default="bioscan-clip-scripts/extracted_features",
	help="Path to extracted features",
	)
	@click.option(
	"--output", type=click.Path(path_type=Path), default="bioscan-clip-scripts/index", help="Path to save the index"
	)
	@click.option(
	"--key-type", "key_type", type=click.Choice(["all", *ALL_KEY_TYPES]), default="all", help="Type of key to use"
	)
	@click.option(
	"--index-type",
	"index_type",
	type=click.Choice(["all", *ALL_INDEX_TYPES]),
	default="all",
	help="Type of index to use",
	)
	def main(input, output, key_type, index_type):
	output.mkdir(parents=True, exist_ok=True)

	if key_type == "all":
	key_types = ALL_KEY_TYPES
	else:
	key_types = [key_type]

	if index_type == "all":
	index_types = ALL_INDEX_TYPES
	else:
	index_types = [index_type]

	embedding_data = h5py.File(input / "extracted_features_for_all_5m_data.hdf5", "r", libver="latest")
	for key_type in key_types:
	for index_type in index_types:
	process(embedding_data, output, key_type, index_type)

	sample_ids = [raw_id.decode("utf-8") for raw_id in embedding_data["file_name_list"][:]]
	index_to_id = {index: id for index, id in enumerate(sample_ids)}
	with open(output / "big_indx_to_id_dict.pickle", "wb") as f:
	pickle.dump(index_to_id, f)


	if __name__ == "__main__":
	main()