exbert / server /data_processing /convenience_corpus.py
bhoov's picture
First commit
63858e7
raw
history blame
2.62 kB
from pathlib import Path
from .corpus_data_wrapper import CorpusDataWrapper
from .index_wrapper import Indexes, ContextIndexes
from config import CORPORA
from utils.f import memoize, delegates, GetAttr
from typing import List
def get_dir_names(path: Path) -> List[str]:
    """Return the names of the immediate subdirectories of `path`.

    Only direct children are inspected (non-recursive glob on "*");
    regular files are skipped. The order is whatever the glob yields.
    """
    return [child.name for child in path.glob("*") if child.is_dir()]
@memoize
def from_model(model_name, corpus_name):
    """Build the ConvenienceCorpus for a given model / corpus pair.

    The corpus is looked up at `CORPORA/<model_name>/<corpus_name>`.

    Raises:
        FileNotFoundError: if the model directory is missing or has no
            subdirectories, or if the requested corpus directory is absent.
    """
    model_root = Path(CORPORA) / model_name
    corpus_names = get_dir_names(model_root)

    # The model must exist and contain at least one corpus directory
    if not (model_root.exists() and corpus_names):
        raise FileNotFoundError("There are no corpora present for this model")

    corpus_root = model_root / corpus_name
    if corpus_root.exists():
        return ConvenienceCorpus(corpus_root)

    raise FileNotFoundError(f"Desired corpus '{corpus_name}' not available")
def files_available(base_dir, glob_pattern="*.faiss"):
    """Report whether `base_dir` exists and holds a file matching `glob_pattern`.

    Returns True only when the directory is present and the (non-recursive)
    glob produces at least one match; otherwise False.
    """
    if not base_dir.exists():
        return False

    # A single match is enough to answer the question
    for _match in base_dir.glob(glob_pattern):
        return True
    return False
class ConvenienceCorpus(GetAttr):
    """Bundle a corpus' HDF5 data with its embedding and context faiss indexes.

    The expected layout under `base_dir` is:
        data.hdf5          main corpus file (required)
        embedding_faiss/   files for the embedding index
        context_faiss/     files for the context index

    `self.default` is set to the wrapped corpus; presumably GetAttr forwards
    unknown attribute lookups (e.g. `find2d`) to it -- confirm in utils.f.
    """

    def __init__(self, base_dir):
        """Resolve all paths under `base_dir` and open the corpus and indexes.

        Raises:
            FileNotFoundError: if `data.hdf5` is not present in `base_dir`.
        """
        root = Path(base_dir)

        # Identity of this corpus, derived from the directory layout
        self.base_dir = root
        self.model_dir = root.parent
        self.available_corpora = get_dir_names(self.model_dir)
        self.model_name = self.model_dir.name
        self.corpus_name = root.name
        self.name = f"{self.model_name}_{self.corpus_name}"

        # Locations of the data file and the two index directories
        self.corpus_f = root / 'data.hdf5'
        self.embedding_dir = root / 'embedding_faiss'
        self.context_dir = root / 'context_faiss'

        # The HDF5 file is mandatory; the index files are only recorded as flags
        if not self.corpus_f.exists():
            raise FileNotFoundError("Main HDF5 file does not exist")

        self.embeddings_available = files_available(self.embedding_dir)
        self.contexts_available = files_available(self.context_dir)

        self.corpus = CorpusDataWrapper(self.corpus_f, self.name)
        self.embedding_faiss = Indexes(self.embedding_dir)
        self.context_faiss = ContextIndexes(self.context_dir)

        # Composition rather than inheritance: the corpus is the delegation target
        self.default = self.corpus

    def search_embeddings(self, layer, query, k):
        """Search the embedding index and look up the hits via `find2d`.

        The first element of the search result is discarded; the second is
        passed to `find2d`, and the first entry of that result is returned.
        """
        _distances, indices = self.embedding_faiss.search(layer, query, k)
        return self.find2d(indices)[0]

    def search_contexts(self, layer, heads, query, k):
        """Search the context index for `heads` and look up the hits via `find2d`.

        The first element of the search result is discarded; the second is
        passed to `find2d`, and the first entry of that result is returned.
        """
        _distances, indices = self.context_faiss.search(layer, heads, query, k)
        return self.find2d(indices)[0]

    def __repr__(self):
        return f"ConvenienceCorpus({self.name})"