File size: 2,619 Bytes
63858e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from pathlib import Path
from .corpus_data_wrapper import CorpusDataWrapper
from .index_wrapper import Indexes, ContextIndexes
from config import CORPORA
from utils.f import memoize, delegates, GetAttr
from typing import List

def get_dir_names(path: Path) -> List[str]:
    """List the names of the directories found directly inside `path`.

    Entries come from ``path.glob("*")``; anything that is not a directory is
    dropped. The order is whatever the glob produces.
    """
    return [entry.name for entry in path.glob("*") if entry.is_dir()]


@memoize
def from_model(model_name, corpus_name):
    """Return the ConvenienceCorpus located at CORPORA/<model_name>/<corpus_name>.

    Raises FileNotFoundError when the model directory is missing or holds no
    subdirectories, and also when the requested corpus directory is absent.
    """
    # NOTE(review): @memoize comes from utils.f; presumably repeated calls with
    # the same arguments reuse the first result -- confirm there.
    model_root = Path(CORPORA) / model_name
    corpora_found = get_dir_names(model_root)
    if not (model_root.exists() and corpora_found):
        raise FileNotFoundError("There are no corpora present for this model")

    corpus_root = model_root / corpus_name
    if corpus_root.exists():
        return ConvenienceCorpus(corpus_root)

    raise FileNotFoundError(f"Desired corpus '{corpus_name}' not available")

def files_available(base_dir, glob_pattern="*.faiss"):
    """Determine whether the base_dir contains indexed files

    Args:
        base_dir: Directory to inspect, given as a ``Path`` or a plain path
            string.
        glob_pattern: Glob used to recognise index files inside ``base_dir``.

    Returns:
        True when ``base_dir`` exists and at least one entry matches
        ``glob_pattern``; False otherwise.
    """
    # Coerce so plain strings work too, mirroring ConvenienceCorpus.__init__.
    base_dir = Path(base_dir)
    if not base_dir.exists():
        return False

    # Stop at the first match instead of materialising every hit.
    return any(True for _ in base_dir.glob(glob_pattern))
class ConvenienceCorpus(GetAttr):
    """Group one corpus directory's HDF5 data with its two FAISS index folders.

    `self.default` is pointed at the corpus data wrapper so that this object
    can stand in for it (composition rather than inheritance).
    NOTE(review): the forwarding itself is presumably done by GetAttr from
    utils.f -- confirm there.
    """

    def __init__(self, base_dir):
        """Work out the file layout under `base_dir` and open its resources.

        Raises FileNotFoundError when `base_dir`/data.hdf5 does not exist.
        """
        root = Path(base_dir)
        model_root = root.parent

        self.base_dir = root
        self.model_dir = model_root
        self.available_corpora = get_dir_names(model_root)

        # Naming is derived purely from the directory structure.
        self.model_name = model_root.name
        self.corpus_name = root.name
        self.name = f"{self.model_name}_{self.corpus_name}"

        # Expected locations of the data file and the index folders.
        self.corpus_f = root / 'data.hdf5'
        self.embedding_dir = root / 'embedding_faiss'
        self.context_dir = root / 'context_faiss'

        # The HDF5 file is mandatory; the index folders are only probed.
        if not self.corpus_f.exists():
            raise FileNotFoundError("Main HDF5 file does not exist")

        self.embeddings_available = files_available(self.embedding_dir)
        self.contexts_available = files_available(self.context_dir)

        self.corpus = CorpusDataWrapper(self.corpus_f, self.name)
        self.embedding_faiss = Indexes(self.embedding_dir)
        self.context_faiss = ContextIndexes(self.context_dir)

        # Composition target: the wrapped corpus plays the role of a parent.
        self.default = self.corpus

    def search_embeddings(self, layer, query, k):
        """Search the embedding index and resolve the hits through `find2d`.

        Returns element 0 of ``self.find2d(...)`` applied to the second item
        of the search result (assumed to be the neighbour ids -- confirm
        against Indexes.search).
        """
        _, neighbor_ids = self.embedding_faiss.search(layer, query, k)
        resolved = self.find2d(neighbor_ids)
        return resolved[0]

    def search_contexts(self, layer, heads, query, k):
        """Search the context index for `heads` and resolve via `find2d`.

        Returns element 0 of ``self.find2d(...)`` applied to the second item
        of the search result (assumed to be the neighbour ids -- confirm
        against ContextIndexes.search).
        """
        _, neighbor_ids = self.context_faiss.search(layer, heads, query, k)
        resolved = self.find2d(neighbor_ids)
        return resolved[0]

    def __repr__(self):
        """Show the class name together with the combined model/corpus name."""
        return f"ConvenienceCorpus({self.name})"