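# NOTE(review): the banner below looks like web-page residue captured with this
# file rather than part of the module; kept as a comment so the file parses.
# Spaces: Runtime error
# Runtime error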
from pathlib import Path
from typing import List, Union

from config import CORPORA
from utils.f import memoize, delegates, GetAttr

from .corpus_data_wrapper import CorpusDataWrapper
from .index_wrapper import Indexes, ContextIndexes
def get_dir_names(path: Path) -> List[str]:
    """Return the names of the immediate subdirectories of `path`.

    Args:
        path: Directory whose direct children are inspected.

    Returns:
        The `name` of every entry matched by `path.glob("*")` for which
        `is_dir()` is true, in the order the glob yields them (not sorted).
    """
    return [child.name for child in path.glob("*") if child.is_dir()]
def from_model(model_name, corpus_name):
    """Get the convenience corpus wrapper for a model and a corpus.

    Args:
        model_name: Name of the model's directory under `CORPORA`.
        corpus_name: Name of the corpus directory inside the model's directory.

    Returns:
        A `ConvenienceCorpus` built from `CORPORA / model_name / corpus_name`.

    Raises:
        FileNotFoundError: If the model directory does not exist or contains
            no subdirectories, or if the requested corpus directory does not
            exist.
    """
    model_dir = Path(CORPORA) / model_name

    # Test existence first so the directory is only scanned when it is present.
    if not model_dir.exists() or not get_dir_names(model_dir):
        raise FileNotFoundError("There are no corpora present for this model")

    base_dir = model_dir / corpus_name
    if not base_dir.exists():
        raise FileNotFoundError(f"Desired corpus '{corpus_name}' not available")

    return ConvenienceCorpus(base_dir)
def files_available(base_dir: Union[str, Path], glob_pattern: str = "*.faiss") -> bool:
    """Determine whether the base_dir contains indexed files.

    Args:
        base_dir: Directory to inspect; a string path is also accepted.
        glob_pattern: Glob pattern, relative to `base_dir`, that an indexed
            file must match.

    Returns:
        True if `base_dir` exists and at least one entry matches
        `glob_pattern`; False otherwise.
    """
    base_dir = Path(base_dir)
    if not base_dir.exists():
        return False

    # Only the first match matters, so avoid materialising the whole listing.
    return next(base_dir.glob(glob_pattern), None) is not None
class ConvenienceCorpus(GetAttr):
    """Bundle a corpus' HDF5 data with its embedding and context indexes.

    The instance exposes the paths derived from a corpus directory, a
    `CorpusDataWrapper` over `data.hdf5`, and `Indexes` / `ContextIndexes`
    objects over the `embedding_faiss` and `context_faiss` subdirectories.

    NOTE(review): attributes not defined here (e.g. `find2d`) are presumably
    forwarded to `self.default` by `GetAttr` -- confirm against `utils.f`.
    """

    def __init__(self, base_dir):
        """Resolve all paths under `base_dir` and open the corpus and indexes.

        Args:
            base_dir: Corpus directory, laid out as `<model>/<corpus>`; its
                parent directory is treated as the model directory.

        Raises:
            FileNotFoundError: If `base_dir / 'data.hdf5'` does not exist.
        """
        bd = Path(base_dir)

        self.base_dir = bd
        self.model_dir = bd.parent
        self.available_corpora = get_dir_names(self.model_dir)
        self.model_name = self.model_dir.name
        self.corpus_name = bd.name
        self.name = f"{self.model_name}_{self.corpus_name}"

        self.corpus_f = bd / 'data.hdf5'
        self.embedding_dir = bd / 'embedding_faiss'
        self.context_dir = bd / 'context_faiss'

        # Define whether these different files exist or not
        if not self.corpus_f.exists():
            raise FileNotFoundError("Main HDF5 file does not exist")

        self.embeddings_available = files_available(self.embedding_dir)
        self.contexts_available = files_available(self.context_dir)

        # The index wrappers are built regardless of the availability flags
        # above; the flags are only recorded for callers to inspect.
        self.corpus = CorpusDataWrapper(self.corpus_f, self.name)
        self.embedding_faiss = Indexes(self.embedding_dir)
        self.context_faiss = ContextIndexes(self.context_dir)

        # Almost acts like an inherited class, but is rather a composed class
        self.default = self.corpus

    def search_embeddings(self, layer, query, k):
        """Search the embedding index and look the hits up in the corpus.

        Args:
            layer: Passed through to `self.embedding_faiss.search`.
            query: Passed through to `self.embedding_faiss.search`.
            k: Passed through to `self.embedding_faiss.search`.

        Returns:
            The first element of `self.find2d(...)` applied to the second
            value returned by the index search.
        """
        # The first returned value is unused here (presumably the distances
        # of a FAISS-style `(D, I)` pair -- confirm against `Indexes.search`).
        _, indices = self.embedding_faiss.search(layer, query, k)
        return self.find2d(indices)[0]

    def search_contexts(self, layer, heads, query, k):
        """Search the context index and look the hits up in the corpus.

        Args:
            layer: Passed through to `self.context_faiss.search`.
            heads: Passed through to `self.context_faiss.search`.
            query: Passed through to `self.context_faiss.search`.
            k: Passed through to `self.context_faiss.search`.

        Returns:
            The first element of `self.find2d(...)` applied to the second
            value returned by the index search.
        """
        # The first returned value is unused here (presumably the distances
        # of a FAISS-style `(D, I)` pair -- confirm against `ContextIndexes.search`).
        _, indices = self.context_faiss.search(layer, heads, query, k)
        return self.find2d(indices)[0]

    def __repr__(self):
        """Return `ConvenienceCorpus(<model_name>_<corpus_name>)`."""
        return f"ConvenienceCorpus({self.name})"