[#1] checkpoint before amending builders.py
- explore/explore_bart.py +16 -0
- main_upload_idiom2context.py → explore/explore_bart_for_conditional_generation.py +3 -4
- explore/explore_fetch_epie.py +1 -1
- explore/explore_fetch_epie_counts.py +0 -1
- explore/explore_fetch_idiom2def.py +0 -15
- explore/explore_fetch_idioms.py +1 -1
- explore/explore_fetch_literal2idiom.py +10 -0
- explore/explore_fetch_pie.py +14 -0
- idiomify/builders.py +26 -26
- idiomify/fetchers.py +66 -47
- idiomify/models.py +13 -110
- idiomify/paths.py +4 -4
- idiomify/urls.py +5 -0
- main_infer.py +37 -36
- main_upload_idioms.py +32 -4
- main_upload_literal2idiom.py +46 -0
- main_upload_tokenizer.py +0 -13
explore/explore_bart.py
ADDED
@@ -0,0 +1,16 @@
+from transformers import BartTokenizer, BartModel
+
+
+def main():
+
+    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
+    model = BartModel.from_pretrained('facebook/bart-large')
+
+    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+    outputs = model(**inputs)
+    H_all = outputs.last_hidden_state  # noqa
+    print(H_all.shape)  # (1, 8, 1024)
+
+
+if __name__ == '__main__':
+    main()
main_upload_idiom2context.py → explore/explore_bart_for_conditional_generation.py
RENAMED
@@ -1,6 +1,5 @@
-
-
-"""
+
+from transformers import BartTokenizer, BartForConditionalGeneration
 
 
 def main():
@@ -8,4 +7,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
+    main()
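Note: the body of the renamed exploration script is barely touched by this hunk; only the BartForConditionalGeneration import is new. For orientation, a minimal sketch of what such an exploration typically looks like (the prompt, max_length and num_beams below are illustrative and not taken from the repository; without fine-tuning, facebook/bart-large will mostly reconstruct its input):

from transformers import BartTokenizer, BartForConditionalGeneration


def main():
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
    # encode a literal sentence and let BART generate a sequence for it
    inputs = tokenizer("He decided to not say it directly", return_tensors="pt")
    out_ids = model.generate(**inputs, max_length=32, num_beams=4)
    print(tokenizer.decode(out_ids[0], skip_special_tokens=True))


if __name__ == '__main__':
    main()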
explore/explore_fetch_epie.py
CHANGED
@@ -11,7 +11,7 @@ def main():
 
     # so, what do you want? you want to build an idiom-masked language modeling?
     for idiom, context, tag in epie:
-        print(context)
+        print(idiom, context)
 
     for idx, idiom in enumerate(idioms):
         print(idx, idiom)
explore/explore_fetch_epie_counts.py
CHANGED
@@ -1,4 +1,3 @@
-
 from idiomify.fetchers import fetch_epie
 
 
explore/explore_fetch_idiom2def.py
DELETED
@@ -1,15 +0,0 @@
-from idiomify.fetchers import fetch_idiom2def
-
-
-def main():
-    idiom2def = fetch_idiom2def("c")
-    for idiom, definition in idiom2def:
-        print(idiom, definition)
-
-    df = fetch_idiom2def("d")
-    for idiom, definition in idiom2def:
-        print(idiom, definition)
-
-
-if __name__ == '__main__':
-    main()
explore/explore_fetch_idioms.py
CHANGED
@@ -2,7 +2,7 @@ from idiomify.fetchers import fetch_idioms
 
 
 def main():
-    print(fetch_idioms("
+    print(fetch_idioms("pie_v0"))
 
 
 if __name__ == '__main__':
explore/explore_fetch_literal2idiom.py
ADDED
@@ -0,0 +1,10 @@
+from idiomify.fetchers import fetch_literal2idiom
+
+
+def main():
+    for src, tgt in fetch_literal2idiom("pie_v0"):
+        print(src, "->", tgt)
+
+
+if __name__ == '__main__':
+    main()
explore/explore_fetch_pie.py
ADDED
@@ -0,0 +1,14 @@
+
+from idiomify.fetchers import fetch_pie
+
+
+def main():
+    for idx, row in enumerate(fetch_pie()):
+        print(idx, row)
+        # the first 105 = V0.
+        if idx == 105:
+            break
+
+
+if __name__ == '__main__':
+    main()
idiomify/builders.py
CHANGED
@@ -19,6 +19,16 @@ class TensorBuilder:
 class Idiom2SubwordsBuilder(TensorBuilder):
 
     def __call__(self, idioms: List[str], k: int) -> torch.Tensor:
+        """
+        1. The function takes in a list of idioms, and a maximum length of the input sequence.
+        2. It then splits the idioms into subwords, and pads the sequence to the maximum length.
+        3. It masks the padding tokens, and returns the input ids.
+        :param idioms: a list of idioms
+        :type idioms: List[str]
+        :param k: the maximum length of the idioms
+        :type k: int
+        :return: the input_ids of the idioms, with the pad tokens replaced by the mask token
+        """
         mask_id = self.tokenizer.mask_token_id
         pad_id = self.tokenizer.pad_token_id
         # temporarily disable single-token status of the idioms
@@ -31,38 +41,20 @@ class Idiom2SubwordsBuilder(TensorBuilder):
                                    max_length=k,  # set to k
                                    return_tensors="pt")
         input_ids = encodings['input_ids']
-        input_ids[input_ids == pad_id] = mask_id
+        input_ids[input_ids == pad_id] = mask_id
         return input_ids
 
 
-class Idiom2DefBuilder(TensorBuilder):
-
-    def __call__(self, idiom2def: List[Tuple[str, str]], k: int) -> torch.Tensor:
-        defs = [definition for _, definition in idiom2def]
-        lefts = [" ".join(["[MASK]"] * k)] * len(defs)
-        encodings = self.tokenizer(text=lefts,
-                                   text_pair=defs,
-                                   return_tensors="pt",
-                                   add_special_tokens=True,
-                                   truncation=True,
-                                   padding=True,
-                                   verbose=True)
-        input_ids: torch.Tensor = encodings['input_ids']
-        cls_id: int = self.tokenizer.cls_token_id
-        sep_id: int = self.tokenizer.sep_token_id
-        mask_id: int = self.tokenizer.mask_token_id
-        wisdom_mask = torch.where(input_ids == mask_id, 1, 0)
-        desc_mask = torch.where(((input_ids != cls_id) & (input_ids != sep_id) & (input_ids != mask_id)), 1, 0)
-        return torch.stack([input_ids,
-                            encodings['token_type_ids'],
-                            encodings['attention_mask'],
-                            wisdom_mask,
-                            desc_mask], dim=1)
-
-
 class Idiom2ContextBuilder(TensorBuilder):
 
     def __call__(self, idiom2context: List[Tuple[str, str]]):
+        """
+        Given a list of tuples of idiom and context,
+        it returns a tensor of shape (batch_size, 3, max_seq_len).
+        :param idiom2context: a list of tuples of idiom and context
+        :type idiom2context: List[Tuple[str, str]]
+        :return: the input_ids, token_type_ids, and attention_mask for each context
+        """
         contexts = [context for _, context in idiom2context]
         encodings = self.tokenizer(text=contexts,
                                    return_tensors="pt",
@@ -78,6 +70,14 @@ class Idiom2ContextBuilder(TensorBuilder):
 class TargetsBuilder(TensorBuilder):
 
     def __call__(self, idiom2sent: List[Tuple[str, str]], idioms: List[str]) -> torch.Tensor:
+        """
+        Given a list of (idiom, sentence) pairs and the list of all idioms, return the index of each pair's idiom in that list.
+        :param idiom2sent: a list of tuples, where each tuple is an idiom and its corresponding sentence
+        :type idiom2sent: List[Tuple[str, str]]
+        :param idioms: a list of idioms
+        :type idioms: List[str]
+        :return: a tensor of indices of the idioms in the list of idioms
+        """
         return torch.LongTensor([
             idioms.index(idiom)
             for idiom, _ in idiom2sent
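Note: as a sanity check of what the new Idiom2SubwordsBuilder docstring describes (idioms tokenized into at most k subword slots, with pad tokens swapped for mask tokens), here is a minimal standalone sketch using a plain BERT tokenizer; the idioms and k are made up, and the builder's step of temporarily disabling the idioms' single-token status is skipped:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
idioms = ["beat around the bush", "call it a day"]  # illustrative only
k = 8
encodings = tokenizer(idioms,
                      add_special_tokens=False,
                      padding='max_length',
                      truncation=True,
                      max_length=k,  # set to k
                      return_tensors="pt")
input_ids = encodings['input_ids']  # (|W|, K)
# shorter idioms are padded; replace those pad positions with [MASK] ids
input_ids[input_ids == tokenizer.pad_token_id] = tokenizer.mask_token_id
print(input_ids.shape)  # torch.Size([2, 8])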
idiomify/fetchers.py
CHANGED
@@ -1,73 +1,91 @@
 import csv
+from os import path
 import yaml
 import wandb
 import requests
 from typing import Tuple, List
-
 from wandb.sdk.wandb_run import Run
-
+from transformers import AutoModelForMaskedLM, AutoConfig, BertTokenizer
+from idiomify.builders import Idiom2SubwordsBuilder
 from idiomify.models import Alpha, RD
-from idiomify.paths import
+from idiomify.paths import CONFIG_YAML, idioms_dir, alpha_dir, literal2idiom
 from idiomify.urls import (
     EPIE_IMMUTABLE_IDIOMS_URL,
     EPIE_IMMUTABLE_IDIOMS_CONTEXTS_URL,
     EPIE_IMMUTABLE_IDIOMS_TAGS_URL,
     EPIE_MUTABLE_IDIOMS_URL,
     EPIE_MUTABLE_IDIOMS_CONTEXTS_URL,
-    EPIE_MUTABLE_IDIOMS_TAGS_URL
+    EPIE_MUTABLE_IDIOMS_TAGS_URL,
+    PIE_URL
 )
-from idiomify.builders import Idiom2SubwordsBuilder
-from transformers import AutoModelForMaskedLM, AutoConfig, BertTokenizer
 
 
 # sources for dataset
-def fetch_epie() -> List[Tuple[str, str, str]]:
-
-
-
-
-
-
+def fetch_epie(ver: str) -> List[Tuple[str, str, str]]:
+    """
+    It fetches the EPIE idioms, contexts, and tags from the web
+    :param ver: str
+    :type ver: str
+    :return: A list of tuples. Each tuple contains three strings: an idiom, a context, and a tag.
+    """
+    if ver == "immutable":
+        idioms_url = EPIE_IMMUTABLE_IDIOMS_URL
+        contexts_url = EPIE_IMMUTABLE_IDIOMS_CONTEXTS_URL
+        tags_url = EPIE_IMMUTABLE_IDIOMS_TAGS_URL
+    elif ver == "mutable":
+        idioms_url = EPIE_MUTABLE_IDIOMS_URL
+        contexts_url = EPIE_MUTABLE_IDIOMS_CONTEXTS_URL
+        tags_url = EPIE_MUTABLE_IDIOMS_TAGS_URL
+    else:
+        raise ValueError
+    idioms = requests.get(idioms_url).text
+    contexts = requests.get(contexts_url).text
+    tags = requests.get(tags_url).text
     return list(zip(idioms.strip().split("\n"),
                     contexts.strip().split("\n"),
                     tags.strip().split("\n")))
 
 
-
-
+def fetch_pie() -> list:
+    text = requests.get(PIE_URL).text
+    lines = (line for line in text.split("\n") if line)
+    reader = csv.reader(lines)
+    next(reader)  # skip the header
+    return [
+        row
+        for row in reader
+    ]
+
+
+# --- from wandb --- #
+def fetch_idioms(ver: str, run: Run = None) -> List[str]:
     """
-
+    why do you need this? -> you need this to have access to the idiom embeddings.
     """
+    # if run object is given, we track the lineage of the data.
+    # if not, we get the dataset via wandb Api.
     if run:
-
-
-
-
-
-
-
-        artifact.download(root=str(artifact_path))
-        tsv_path = artifact_path / "all.tsv"
-        with open(tsv_path, 'r') as fh:
-            reader = csv.reader(fh, delimiter="\t")
-            return [
-                (row[0], row[1])
-                for row in reader
-            ]
+        artifact = run.use_artifact("idioms", type="dataset", aliases=ver)
+    else:
+        artifact = wandb.Api().artifact(f"eubinecto/idiomify/idioms:{ver}", type="dataset")
+    artifact_dir = artifact.download(root=idioms_dir(ver))
+    txt_path = path.join(artifact_dir, "all.txt")
+    with open(txt_path, 'r') as fh:
+        return [line.strip() for line in fh]
 
 
-def
-
-
-
-
+def fetch_literal2idiom(ver: str, run: Run = None) -> List[Tuple[str, str]]:
+    # if run object is given, we track the lineage of the data.
+    # if not, we get the dataset via wandb Api.
+    if run:
+        artifact = run.use_artifact("literal2idiom", type="dataset", aliases=ver)
+    else:
+        artifact = wandb.Api().artifact(f"eubinecto/idiomify/literal2idiom:{ver}", type="dataset")
+    artifact_dir = artifact.download(root=literal2idiom(ver))
+    tsv_path = path.join(artifact_dir, "all.tsv")
     with open(tsv_path, 'r') as fh:
         reader = csv.reader(fh, delimiter="\t")
-
-        return [
-            row[0]
-            for row in reader
-        ]
+        return [(row[0], row[1]) for row in reader]
 
 
 def fetch_rd(model: str, ver: str) -> RD:
@@ -80,12 +98,13 @@ def fetch_rd(model: str, ver: str) -> RD:
     idioms = fetch_idioms(config['idioms_ver'])
     tokenizer = BertTokenizer.from_pretrained(config['bert'])
    idiom2subwords = Idiom2SubwordsBuilder(tokenizer)(idioms, config['k'])
-    if model == Alpha.name():
-
-    elif model == Gamma.name():
-
-    else:
-
+    # if model == Alpha.name():
+    #     rd = Alpha.load_from_checkpoint(str(ckpt_path), mlm=mlm, idiom2subwords=idiom2subwords)
+    # elif model == Gamma.name():
+    #     rd = Gamma.load_from_checkpoint(str(ckpt_path), mlm=mlm, idiom2subwords=idiom2subwords)
+    # else:
+    #     raise ValueError
+    rd = ...
     return rd
 
 
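Note: the column meanings of the PIE csv are not documented in this commit; the mapping below is inferred from how the two new upload scripts index each row (row[0] as the idiom, row[2] as the idiomatic sentence, row[3] as its literal paraphrase). A small sketch of that relationship:

from idiomify.fetchers import fetch_pie

rows = fetch_pie()
# what main_upload_idioms.py uploads for pie_v0
idioms = set(row[0] for row in rows[:106])
# what main_upload_literal2idiom.py uploads for pie_v0: (src, tgt) = (literal, idiomatic)
literal2idiom = [(row[3], row[2]) for row in rows[:106]]
print(len(idioms), len(literal2idiom))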
idiomify/models.py
CHANGED
@@ -8,14 +8,12 @@ import pytorch_lightning as pl
 from transformers import BertForMaskedLM
 
 
-class
+class Idiomifier(pl.LightningModule):
     """
     @eubinecto
     The superclass of all the reverse-dictionaries. This class houses any methods that are required by
     whatever reverse-dictionaries we define.
     """
-
-    # --- boilerplate; the loaders are defined in datamodules, so we don't define them here
     # passing them to avoid warnings --- #
     def train_dataloader(self):
         pass
@@ -35,119 +33,24 @@ class RD(pl.LightningModule):
         :param idiom2subwords: (|W|, K)
         :return: (N, K, |V|); (num samples, k, the size of the vocabulary of subwords)
         """
-
-        # -- hyper params --- #
-        # should be saved to self.hparams
-        # https://github.com/PyTorchLightning/pytorch-lightning/issues/4390#issue-730493746
-        self.save_hyperparameters(ignore=["mlm", "idiom2subwords"])
-        # -- the only neural network we need -- #
-        self.mlm = mlm
-        # --- to be used for getting H_k --- #
-        self.wisdom_mask: Optional[torch.Tensor] = None  # (N, L)
-        # --- to be used for getting H_desc --- #
-        self.desc_mask: Optional[torch.Tensor] = None  # (N, L)
-        # -- constant tensors -- #
-        self.register_buffer("idiom2subwords", idiom2subwords)  # (|W|, K)
+        pass
 
     def forward(self, X: torch.Tensor) -> torch.Tensor:
         """
-
-
-        :return: (N, L, H)
-        """
-        input_ids = X[:, 0]  # (N, 4, L) -> (N, L)
-        token_type_ids = X[:, 1]  # (N, 4, L) -> (N, L)
-        attention_mask = X[:, 2]  # (N, 4, L) -> (N, L)
-        self.wisdom_mask = X[:, 3]  # (N, 4, L) -> (N, L)
-        self.desc_mask = X[:, 4]  # (N, 4, L) -> (N, L)
-        H_all = self.mlm.bert.forward(input_ids, attention_mask, token_type_ids)[0]  # (N, 3, L) -> (N, L, H)
-        return H_all
-
-    def H_k(self, H_all: torch.Tensor) -> torch.Tensor:
-        """
-        You may want to override this. (e.g. RDGamma - the k's could be anywhere)
-        :param H_all (N, L, H)
-        :return H_k (N, K, H)
-        """
-        N, _, H = H_all.size()
-        # refer to: wisdomify/examples/explore_masked_select.py
-        wisdom_mask = self.wisdom_mask.unsqueeze(2).expand(H_all.shape)  # (N, L) -> (N, L, 1) -> (N, L, H)
-        H_k = torch.masked_select(H_all, wisdom_mask.bool())  # (N, L, H), (N, L, H) -> (N * K * H)
-        H_k = H_k.reshape(N, self.hparams['k'], H)  # (N * K * H) -> (N, K, H)
-        return H_k
-
-    def H_desc(self, H_all: torch.Tensor) -> torch.Tensor:
-        """
-
-        :return H_desc (N, L - (K + 3), H)
-        """
-        N, L, H = H_all.size()
-        desc_mask = self.desc_mask.unsqueeze(2).expand(H_all.shape)
-        H_desc = torch.masked_select(H_all, desc_mask.bool())  # (N, L, H), (N, L, H) -> (N * (L - (K + 3)) * H)
-        H_desc = H_desc.reshape(N, L - (self.hparams['k'] + 3), H)  # (N * (L - (K + 3)) * H) -> (N, L - (K + 3), H)
-        return H_desc
-
-    def S_wisdom_literal(self, H_k: torch.Tensor) -> torch.Tensor:
-        """
-        To be used for both RDAlpha & RDBeta
-        :param H_k: (N, K, H)
-        :return: S_wisdom_literal (N, |W|)
-        """
-        S_vocab = self.mlm.cls(H_k)  # bmm; (N, K, H) * (H, |V|) -> (N, K, |V|)
-        indices = self.idiom2subwords.T.repeat(S_vocab.shape[0], 1, 1)  # (|W|, K) -> (N, K, |W|)
-        S_wisdom_literal = S_vocab.gather(dim=-1, index=indices)  # (N, K, |V|) -> (N, K, |W|)
-        S_wisdom_literal = S_wisdom_literal.sum(dim=1)  # (N, K, |W|) -> (N, |W|)
-        return S_wisdom_literal
-
-    def S_wisdom(self, H_all: torch.Tensor) -> torch.Tensor:
-        """
-        :param H_all: (N, L, H)
-        :return S_wisdom: (N, |W|)
-        """
-        raise NotImplementedError("An RD class must implement S_wisdom")
-
-    def P_wisdom(self, X: torch.Tensor) -> torch.Tensor:
-        """
-        :param X: (N, 3, L)
-        :return P_wisdom: (N, |W|), normalized over dim 1.
-        """
-        H_all = self.forward(X)  # (N, 3, L) -> (N, L, H)
-        S_wisdom = self.S_wisdom(H_all)  # (N, L, H) -> (N, W)
-        P_wisdom = F.softmax(S_wisdom, dim=1)  # (N, W) -> (N, W)
-        return P_wisdom
-
-    def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> dict:
-        X, y = batch
-        H_all = self.forward(X)  # (N, 3, L) -> (N, L, H)
-        S_wisdom = self.S_wisdom(H_all)  # (N, L, H) -> (N, |W|)
-        loss = F.cross_entropy(S_wisdom, y)  # (N, |W|), (N,) -> (N,)
-        loss = loss.sum()  # (N,) -> (1,)
-        # so that the metrics accumulate over the course of this epoch
-        # why dict? - just a boilerplate
-        return {
-            # you cannot change the keyword for the loss
-            "loss": loss,
-        }
-
-    def on_train_batch_end(self, outputs: dict, *args, **kwargs) -> None:
-        # watch the loss for this batch
-        self.log("Train/Loss", outputs['loss'])
-
-    def training_epoch_end(self, outputs: List[dict]) -> None:
-        # to see an average performance over the batches in this specific epoch
-        avg_loss = torch.stack([output['loss'].detach() for output in outputs]).mean()
-        self.log("Train/Average Loss", avg_loss)
-
-    def
-
-    def
-
-    def
-
-        avg_loss = torch.stack([output['loss'].detach() for output in outputs]).mean()
-        self.log("Validation/Average Loss", avg_loss)
+        given a batch, forward returns a batch of hidden vectors
+        :param X: (N, 3, L). input_ids, token_type_ids, and what was the last one...?
+        :return: (N, L, H)
+        """
+        pass
+
+    def step(self):
+        pass
+
+    def predict(self):
+        pass
+
+    def training_step(self):
+        pass
 
     def configure_optimizers(self) -> torch.optim.Optimizer:
         """
@@ -162,7 +65,7 @@ class RD(pl.LightningModule):
         return cls.__name__.lower()
 
 
-class Alpha(
+class Alpha(Idiomifier):
     """
     @eubinecto
     The first prototype.
idiomify/paths.py
CHANGED
@@ -5,14 +5,14 @@ ARTIFACTS_DIR = ROOT_DIR / "artifacts"
 CONFIG_YAML = ROOT_DIR / "config.yaml"
 
 
-def idiom2def_dir(ver: str) -> Path:
-    return ARTIFACTS_DIR / f"idiom2def_{ver}"
-
-
 def idioms_dir(ver: str) -> Path:
     return ARTIFACTS_DIR / f"idioms_{ver}"
 
 
+def literal2idiom(ver: str) -> Path:
+    return ARTIFACTS_DIR / f"literal2idiom_{ver}"
+
+
 def alpha_dir(ver: str) -> Path:
     return ARTIFACTS_DIR / f"alpha_{ver}"
 
idiomify/urls.py
CHANGED
@@ -7,5 +7,10 @@ EPIE_MUTABLE_IDIOMS_TAGS_URL = "https://raw.githubusercontent.com/prateeksaxena2
 EPIE_MUTABLE_IDIOMS_URL = "https://raw.githubusercontent.com/prateeksaxena2809/EPIE_Corpus/master/Formal_Idioms_Corpus/Formal_Idioms_Candidates.txt"  # noqa
 EPIE_MUTABLE_IDIOMS_CONTEXTS_URL = "https://github.com/prateeksaxena2809/EPIE_Corpus/blob/master/Formal_Idioms_Corpus/Formal_Idioms_Words.txt"  # noqa
 
+# PIE dataset (Zhou, 2021)
+# https://aclanthology.org/2021.mwe-1.5/
+# right, let's just work on it.
+PIE_URL = "https://raw.githubusercontent.com/zhjjn/MWE_PIE/main/data_cleaned.csv"
+
 
 
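Note: a quick way to peek at the PIE csv before relying on the row indices used by the upload scripts; the header columns are not spelled out anywhere in this commit, so printing them is the easiest check:

import csv
import requests
from idiomify.urls import PIE_URL

text = requests.get(PIE_URL).text
reader = csv.reader(line for line in text.split("\n") if line)
print(next(reader))  # the header row, i.e. the column names of data_cleaned.csv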
main_infer.py
CHANGED
@@ -1,36 +1,37 @@
-
-
-from idiomify.fetchers import fetch_config, fetch_rd, fetch_idioms
-from transformers import BertTokenizer
-from termcolor import colored
+# we disable them for now.
+# import argparse
+# from idiomify.fetchers import fetch_config, fetch_rd, fetch_idioms
+# from transformers import BertTokenizer
+# from termcolor import colored
+#
+#
+# def main():
+#     parser = argparse.ArgumentParser()
+#     parser.add_argument("--model", type=str,
+#                         default="alpha")
+#     parser.add_argument("--ver", type=str,
+#                         default="eng2eng")
+#     parser.add_argument("--sent", type=str,
+#                         default="to avoid getting to the point")
+#     args = parser.parse_args()
+#     config = fetch_config()[args.model][args.ver]
+#     config.update(vars(args))
+#     idioms = fetch_idioms(config['idioms_ver'])
+#     rd = fetch_rd(config['model'], config['ver'])
+#     rd.eval()
+#     tokenizer = BertTokenizer.from_pretrained(config['bert'])
+#     X = T.inputs([("", config['sent'])], tokenizer, config['k'])
+#     probs = rd.P_wisdom(X).squeeze().tolist()
+#     wisdom2prob = [
+#         (wisdom, prob)
+#         for wisdom, prob in zip(idioms, probs)
+#     ]
+#     # sort and append
+#     res = list(sorted(wisdom2prob, key=lambda x: x[1], reverse=True))
+#     print(f"query: {colored(text=config['sent'], color='blue')}")
+#     for idx, (idiom, prob) in enumerate(res):
+#         print(idx, idiom, prob)
+#
+#
+# if __name__ == '__main__':
+#     main()
main_upload_idioms.py
CHANGED
@@ -1,12 +1,40 @@
 """
-Here,
-
-ver b:
+Here, what should you do here?
+just upload all idioms here - name it as epie.
 """
+import os
+from idiomify.paths import ROOT_DIR
+from idiomify.fetchers import fetch_pie
+import argparse
+import wandb
 
 
 def main():
-
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ver", type=str, default="pie_v0",
+                        choices=["pie_v0", "pie_v1"])
+    config = vars(parser.parse_args())
+
+    # get the idioms here
+    if config['ver'] == "pie_v0":
+        # only the first 106, and this is for piloting
+        idioms = set([row[0] for row in fetch_pie()[:106]])
+    elif config['ver'] == "pie_v1":
+        # just include all
+        idioms = set([row[0] for row in fetch_pie()])
+    else:
+        raise NotImplementedError
+    idioms = list(idioms)
+
+    with wandb.init(entity="eubinecto", project="idiomify", config=config) as run:
+        artifact = wandb.Artifact(name="idioms", type="dataset")
+        txt_path = ROOT_DIR / "all.txt"
+        with open(txt_path, 'w') as fh:
+            for idiom in idioms:
+                fh.write(idiom + "\n")
+        artifact.add_file(txt_path)
+        run.log_artifact(artifact, aliases=["latest", config['ver']])
+        os.remove(txt_path)
 
 
 if __name__ == '__main__':
main_upload_literal2idiom.py
ADDED
@@ -0,0 +1,46 @@
+"""
+Here, what should you do here?
+just upload all idioms here - name it as epie.
+"""
+import csv
+import os
+from idiomify.paths import ROOT_DIR
+from idiomify.fetchers import fetch_pie
+import argparse
+import wandb
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ver", type=str, default="pie_v0",
+                        choices=["pie_v0", "pie_v1"])
+    config = vars(parser.parse_args())
+
+    # get the idioms here
+    if config['ver'] == "pie_v0":
+        # only the first 106, and we use this just for piloting
+        literal2idiom = [
+            (row[3], row[2]) for row in fetch_pie()[:106]
+        ]
+    elif config['ver'] == "pie_v1":
+        # just include all
+        literal2idiom = [
+            (row[3], row[2]) for row in fetch_pie()
+        ]
+    else:
+        raise NotImplementedError
+
+    with wandb.init(entity="eubinecto", project="idiomify", config=config) as run:
+        artifact = wandb.Artifact(name="literal2idiom", type="dataset")
+        tsv_path = ROOT_DIR / "all.tsv"
+        with open(tsv_path, 'w') as fh:
+            writer = csv.writer(fh, delimiter="\t")
+            for row in literal2idiom:
+                writer.writerow(row)
+        artifact.add_file(tsv_path)
+        run.log_artifact(artifact, aliases=["latest", config['ver']])
+        os.remove(tsv_path)
+
+
+if __name__ == '__main__':
+    main()
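Note: a minimal round-trip check for the artifact this script uploads, assuming the pie_v0 upload has already been run; it relies only on fetch_literal2idiom as defined in idiomify/fetchers.py above:

from idiomify.fetchers import fetch_literal2idiom

pairs = fetch_literal2idiom("pie_v0")  # downloads eubinecto/idiomify/literal2idiom:pie_v0
print(len(pairs))   # 106 for the pilot version
print(pairs[0])     # (literal sentence, idiomatic sentence)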
main_upload_tokenizer.py
DELETED
@@ -1,13 +0,0 @@
-"""
-Build & upload a tokenizer to wandb.
-You need this if you were to add more tokens there.
-"""
-
-
-def main():
-    pass
-    # TODO: fetch the dataset from wandb first!
-
-
-if __name__ == '__main__':
-    main()