initialized repo
- .gitignore +4 -0
- app.py +56 -0
- components/attention.py +130 -0
- components/k_lstm.py +218 -0
- components/linear_scheduler.py +24 -0
- components/rnn.py +0 -0
- components/rnn_base.py +199 -0
- config.yaml +117 -0
- data_utils.py +230 -0
- dataloader.py +64 -0
- diac_utils.py +223 -0
- model_dd.py +526 -0
- model_partial.py +348 -0
- predict.py +170 -0
- segment.py +89 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
*.pyc
*.pt
*.vec
.DS_Store
app.py
ADDED
@@ -0,0 +1,56 @@
import os
import yaml
import gdown
import gradio as gr
from predict import PredictTri

output_path = "tashkeela-d2.pt"
if not os.path.exists(output_path):
    model_gdrive_id = "1FGelqImFkESbTyRsx_elkKIOZ9VbhRuo"
    gdown.download(id=model_gdrive_id, output=output_path, quiet=False)

output_path = "vocab.vec"
if not os.path.exists(output_path):
    vocab_gdrive_id = "1-0muGvcSYEf8RAVRcwXay4MRex6kmCii"
    gdown.download(id=vocab_gdrive_id, output=output_path, quiet=False)

with open("config.yaml", 'r', encoding="utf-8") as file:
    config = yaml.load(file, Loader=yaml.FullLoader)

config["train"]["max-sent-len"] = config["predictor"]["window"]
config["train"]["max-token-count"] = config["predictor"]["window"] * 3

def diacritze(text):
    print(text)
    predictor = PredictTri(config, text)
    diacritized_lines = predictor.predict_majority_vote()
    return '\n'.join(diacritized_lines)

with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Partial Diacritization
        TODO: put paper links here
        """)
    input_txt = gr.Textbox(
        placeholder="اكتب هنا",
        lines=5,
        label="Input",
        type='text',
        # rtl=True,
        # text_align='right',
    )

    output_txt = gr.Textbox(
        lines=5,
        label="Output",
        type='text',
        # rtl=True,
        # text_align='right',
    )

    btn = gr.Button(value="Shakkel")
    btn.click(diacritze, inputs=input_txt, outputs=output_txt)

if __name__ == "__main__":
    demo.launch()
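
For quick checks outside the Gradio UI, the same entry points can be driven directly from Python. The sketch below mirrors app.py's config handling; it assumes config.yaml, tashkeela-d2.pt, and vocab.vec are already present (app.py downloads the latter two), and it relies only on PredictTri's interface as it is used above. The input string is an invented example.

# Minimal sketch reusing the same entry points as app.py (not a repo file).
import yaml
from predict import PredictTri

with open("config.yaml", 'r', encoding="utf-8") as file:
    config = yaml.load(file, Loader=yaml.FullLoader)
config["train"]["max-sent-len"] = config["predictor"]["window"]
config["train"]["max-token-count"] = config["predictor"]["window"] * 3

predictor = PredictTri(config, "نص غير مشكول")
print('\n'.join(predictor.predict_majority_vote()))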
components/attention.py
ADDED
@@ -0,0 +1,130 @@
from typing import (
    Optional,
)
import math

import torch as T
from torch import nn
from torch.nn import functional as F

import opt_einsum as oe

from torch import Tensor

einsum = oe.contract


def masked_softmax(xs: Tensor, mask: Tensor, dim: int = -1, eps=1e-12):
    xs = xs.masked_fill(~mask, -1e9)
    xs = F.softmax(xs, dim=dim)
    return xs

class Attention(nn.Module):
    def __init__(
        self,
        kind: str,
        query_dim: int,
        input_dim: int,
        output_dim: int = None,
        activation: str = 'auto',
        scaled = True,
    ):
        super().__init__()
        assert kind in [
            'dot',
            'linear',
        ]

        self.kind = kind
        self.Dq = query_dim
        self.Din = input_dim
        self.Dout = output_dim or self.Din
        self.activation = 'auto'
        self.scaled = scaled

        self.Wq_ = nn.Linear(self.Dq, self.Din)
        self.Wk_ = nn.Linear(self.Din, self.Din)
        self.Wv_ = nn.Linear(self.Din, self.Dout)
        self.Wz_ = nn.Linear(self.Din, self.Dout)

    def forward(
        self,
        query: Tensor,
        data: Tensor,
        content_mask: Optional[Tensor] = None,
        prejudice_mask: Optional[Tensor] = None,
    ):
        #^ query: [b, ts, tw, dq]
        #^ data: [b, ts, di]
        #^ content_mask: [b, ts, tw]
        #^ prejudice_mask: [b, ts, ts]
        #^ => output: [b, ts, tw, dz]

        dimB, dimS, dimW, dimI = query.shape

        # TODO: Optimize out the [ts, ts, *] intermediate
        qs = self.Wq_(query)
        ks = self.Wk_(data)
        vs = self.Wv_(data)

        if content_mask is not None:
            words_mask = content_mask.any(2)
            #^ words_mask : [b, ts]
        else:
            words_mask = qs.new_ones((dimB, dimS))

        if self.kind == 'linear':
            # Ref: https://twitter.com/francoisfleuret/status/1267455240007188486
            assert prejudice_mask is None, "Linear mode does not support prejudice_mask."
            assert content_mask is not None, "Linear mode requires a content_mask."
            qs = T.relu(qs) * content_mask.unsqueeze(3)
            #^ qs: [bswi]
            ks = T.relu(ks) * words_mask.unsqueeze(2)
            #^ ks: [bsi]
            vks = einsum("bsi, bsz -> bzi", ks, vs)
            #^ vks : [b, dz, di]
            zs = einsum("bswi, bzi -> bswz", qs, vks)
            #^ zs : [b, ts, tw, dz]
            if self.scaled:
                ks = ks.sum(1)
                #^ ks: [bi]
                denom = einsum("bswi, bi -> bsw", qs, ks) + 1e-9
                zs = zs / denom

        elif self.kind == 'dot':
            # Ref: https://arxiv.org/abs/1706.03762
            # s=ts in q
            # S=ts in ks,vs
            att_map = einsum("bqwi, bki -> bqkw", qs, ks)
            #^ [b, ts:q, ts:k, tw]
            if self.scaled == 'seqlen':
                att_map_ndim = len(att_map.shape) - 1
                norm_coeff = words_mask.sum(1).view(-1, *([1] * att_map_ndim))
                #^ [b, _, _, _]
                att_map = att_map / T.sqrt(norm_coeff.float())
            else:
                att_map = att_map / math.sqrt(self.Din)

            if content_mask is None and prejudice_mask is None:
                att_map = F.softmax(att_map, dim=2)
            else:
                if content_mask is None:
                    assert prejudice_mask is not None  # !for mypy
                    qk_mask = prejudice_mask.unsqueeze(3)
                    #^ qk_mask : [b, ts:q, ts:k, tw^]
                elif prejudice_mask is None:
                    qk_mask = words_mask.unsqueeze(1).unsqueeze(3) * content_mask.unsqueeze(2)
                    #^ qk_mask : [b, ts:q, ts:k^, tw]
                else:
                    qk_mask = words_mask.unsqueeze(1).unsqueeze(3)
                    # qk_mask = words_mask.unsqueeze(1).unsqueeze(3) * content_mask.unsqueeze(2)
                    qk_mask = qk_mask * prejudice_mask.unsqueeze(3)
                    #^ qk_mask : [b, ts:q^, ts:k, tw]

                att_map = masked_softmax(att_map, qk_mask.bool(), dim=2)

            #^ att_map : [b, ts:q, ts:k, tw]
            zs = einsum("bqkw, bkz -> bqwz", att_map, vs)

        zs = self.Wz_(zs)
        return zs, att_map
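
The shape comments above can be checked in isolation. The following is a small, self-contained sketch (batch, sentence, and word sizes are made up) that runs the dot-product path with only a content_mask; this is the same path DiacritizerD2 exercises, minus its extra prejudice_mask.

# Shape-check sketch for Attention (illustrative sizes, not from the config).
import torch as T
from components.attention import Attention

attn = Attention(kind='dot', query_dim=64, input_dim=32)

b, ts, tw = 2, 5, 7
query = T.randn(b, ts, tw, 64)                    # per-character queries
data = T.randn(b, ts, 32)                         # per-word encodings
content_mask = T.ones(b, ts, tw, dtype=T.bool)    # all positions valid

zs, att_map = attn(query, data, content_mask=content_mask)
print(zs.shape)       # [2, 5, 7, 32]  -> [b, ts, tw, dz]
print(att_map.shape)  # [2, 5, 5, 7]   -> [b, ts:q, ts:k, tw]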
components/k_lstm.py
ADDED
@@ -0,0 +1,218 @@
from typing import (
    Tuple,
    List,
    Optional,
    Dict,
    Callable,
    Union,
    cast,
)
from collections import namedtuple
from abc import ABC, abstractmethod
from dataclasses import dataclass

import numpy as np

import torch as T
from torch import nn
from torch.nn import functional as F

from torch import Tensor

from .rnn_base import (
    IRecurrentCell,
    IRecurrentCellBuilder,
    RecurrentLayer,
    RecurrentLayerStack,
)

__all__ = [
    'K_LSTM',
    'K_LSTM_Cell',
    'K_LSTM_Cell_Builder',
]

ACTIVATIONS = {
    'sigmoid': nn.Sigmoid(),
    'tanh': nn.Tanh(),
    'hard_tanh': nn.Hardtanh(),
    'relu': nn.ReLU(),
}

GateSpans = namedtuple('GateSpans', ['I', 'F', 'G', 'O'])

@dataclass
class K_LSTM_Cell_Builder(IRecurrentCellBuilder):
    vertical_dropout : float = 0.0
    recurrent_dropout : float = 0.0
    recurrent_dropout_mode : str = 'gal_tied'
    input_kernel_initialization : str = 'xavier_uniform'
    recurrent_activation : str = 'sigmoid'
    tied_forget_gate : bool = False

    def make(self, input_size: int):
        return K_LSTM_Cell(input_size, self)

class K_LSTM_Cell(IRecurrentCell):
    def __repr__(self):
        return (
            f'{self.__class__.__name__}('
            + ', '.join(
                [
                    f'in: {self.Dx}',
                    f'hid: {self.Dh}',
                    f'rdo: {self.recurrent_dropout_p} @{self.recurrent_dropout_mode}',
                    f'vdo: {self.vertical_dropout_p}'
                ]
            )
            + ')'
        )

    def __init__(
        self,
        input_size: int,
        args: K_LSTM_Cell_Builder,
    ):
        super().__init__()
        self._args = args
        self.Dx = input_size
        self.Dh = args.hidden_size
        self.recurrent_kernel = nn.Linear(self.Dh, self.Dh * 4)
        self.input_kernel = nn.Linear(self.Dx, self.Dh * 4)

        self.recurrent_dropout_p = args.recurrent_dropout or 0.0
        self.vertical_dropout_p = args.vertical_dropout or 0.0
        self.recurrent_dropout_mode = args.recurrent_dropout_mode

        self.recurrent_dropout = nn.Dropout(self.recurrent_dropout_p)
        self.vertical_dropout = nn.Dropout(self.vertical_dropout_p)

        self.tied_forget_gate = args.tied_forget_gate

        if isinstance(args.recurrent_activation, str):
            self.fun_rec = ACTIVATIONS[args.recurrent_activation]
        else:
            self.fun_rec = args.recurrent_activation

        self.reset_parameters_()

    # @T.jit.ignore
    def get_recurrent_weights(self):
        # type: () -> Tuple[GateSpans, GateSpans]
        W = self.recurrent_kernel.weight.chunk(4, 0)
        b = self.recurrent_kernel.bias.chunk(4, 0)
        W = GateSpans(W[0], W[1], W[2], W[3])
        b = GateSpans(b[0], b[1], b[2], b[3])
        return W, b

    # @T.jit.ignore
    def get_input_weights(self):
        # type: () -> Tuple[GateSpans, GateSpans]
        W = self.input_kernel.weight.chunk(4, 0)
        b = self.input_kernel.bias.chunk(4, 0)
        W = GateSpans(W[0], W[1], W[2], W[3])
        b = GateSpans(b[0], b[1], b[2], b[3])
        return W, b

    @T.jit.ignore
    def reset_parameters_(self):
        rw, rb = self.get_recurrent_weights()
        iw, ib = self.get_input_weights()

        nn.init.zeros_(self.input_kernel.bias)
        nn.init.zeros_(self.recurrent_kernel.bias)
        nn.init.ones_(rb.F)
        #^ forget bias

        for W in rw:
            nn.init.orthogonal_(W)
        for W in iw:
            nn.init.xavier_uniform_(W)

    @T.jit.export
    def get_init_state(self, input: Tensor) -> Tuple[Tensor, Tensor]:
        batch_size = input.shape[1]
        h0 = T.zeros(batch_size, self.Dh, device=input.device)
        c0 = T.zeros(batch_size, self.Dh, device=input.device)
        return (h0, c0)

    def apply_input_kernel(self, xt: Tensor) -> List[Tensor]:
        xto = self.vertical_dropout(xt)
        out = self.input_kernel(xto).chunk(4, 1)
        # return cast(List[Tensor], out)
        return out

    def apply_recurrent_kernel(self, h_tm1: Tensor):
        #^ h_tm1 : [b h]
        mode = self.recurrent_dropout_mode
        if mode == 'gal_tied':
            hto = self.recurrent_dropout(h_tm1)
            out = self.recurrent_kernel(hto)
            #^ out : [b 4h]
            outs = out.chunk(4, -1)
        elif mode == 'gal_gates':
            outs = []
            WW, bb = self.get_recurrent_weights()
            for i in range(4):
                hto = self.recurrent_dropout(h_tm1)
                outs.append(F.linear(hto, WW[i], bb[i]))
        else:
            outs = self.recurrent_kernel(h_tm1).chunk(4, -1)
        return outs

    def forward(self, input, state):
        # type: (Tensor, Tuple[Tensor, Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]
        #^ input : [b i]
        #^ state.h : [b h]

        (h_tm1, c_tm1) = state

        Xi, Xf, Xg, Xo = self.apply_input_kernel(input)
        Hi, Hf, Hg, Ho = self.apply_recurrent_kernel(h_tm1)

        ft = self.fun_rec(Xf + Hf)
        ot = self.fun_rec(Xo + Ho)
        if self.tied_forget_gate:
            it = 1.0 - ft
        else:
            it = self.fun_rec(Xi + Hi)

        gt = T.tanh(Xg + Hg)  # * np.sqrt(3)
        if self.recurrent_dropout_mode == 'semeniuta':
            #* https://arxiv.org/abs/1603.05118
            gt = self.recurrent_dropout(gt)

        ct = (ft * c_tm1) + (it * gt)

        ht = ot * T.tanh(ct)

        return ht, (ht, ct)

    @T.jit.export
    def loop(self, inputs, state_t0, mask=None):
        # type: (List[Tensor], Tuple[Tensor, Tensor], Optional[List[Tensor]]) -> Tuple[List[Tensor], Tuple[Tensor, Tensor]]
        '''
        This loops over t (time) steps
        '''
        #^ inputs : t * [b i]
        #^ state_t0[i] : [b s]
        #^ out : [t b h]
        state = state_t0
        outs = []
        for xt in inputs:
            ht, state = self(xt, state)
            outs.append(ht)

        return outs, state

class K_LSTM(RecurrentLayerStack):
    def __init__(
        self,
        *args,
        **kargs,
    ):
        builder = K_LSTM_Cell_Builder
        super().__init__(
            builder,
            *args, **kargs
        )
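
A minimal construction sketch for the stack defined above, with illustrative sizes (the real values come from config.yaml via model_dd.py). With the default return_states=False the stack returns only the output sequence; bidirectional layers double the feature dimension.

# Construction sketch (illustrative sizes, not a repo file).
import torch as T
from components.k_lstm import K_LSTM

rnn = K_LSTM(
    input_size=300,
    hidden_size=256,
    num_layers=2,
    bidirectional=True,
    vertical_dropout=0.25,
    recurrent_dropout=0.25,
    batch_first=True,
)

x = T.randn(4, 10, 300)   # [batch, time, features]
out = rnn(x)              # return_states=False by default
print(out.shape)          # [4, 10, 512] = 2 * hidden_size (bidirectional)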
components/linear_scheduler.py
ADDED
@@ -0,0 +1,24 @@
class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        """Linear interpolation between initial_p and final_p over
        schedule_timesteps. After this many timesteps pass final_p is
        returned.
        Parameters
        ----------
        schedule_timesteps: int
            Number of timesteps for which to linearly anneal initial_p
            to final_p
        initial_p: float
            initial output value
        final_p: float
            final output value
        """
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        """See Schedule.value"""
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)
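
A short usage example with made-up numbers: anneal a probability from 1.0 to 0.1 over 1000 steps, then hold it at 0.1.

# Usage sketch for LinearSchedule (invented numbers).
from components.linear_scheduler import LinearSchedule

sched = LinearSchedule(schedule_timesteps=1000, final_p=0.1, initial_p=1.0)
print(sched.value(0))     # 1.0
print(sched.value(500))   # 0.55
print(sched.value(2000))  # 0.1 (clamped once t exceeds schedule_timesteps)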
components/rnn.py
ADDED
File without changes
components/rnn_base.py
ADDED
@@ -0,0 +1,199 @@
from typing import (
    Tuple,
    List,
    Union,
    Dict,
    Optional,
    Callable,
)
from collections import namedtuple
from abc import ABC, abstractmethod

import torch as T
from torch import nn
from torch.nn import functional as F

from torch import Tensor

import pdb

from dataclasses import dataclass


class IRecurrentCell(ABC, nn.Module):
    @abstractmethod
    def get_init_state(self, input: Tensor):
        pass

    @abstractmethod
    def loop(self, inputs, state_t0, mask=None):
        pass

    # def forward(self, input, state, mask=None):
    #     pass

@dataclass
class IRecurrentCellBuilder(ABC):
    hidden_size: int

    def make(self, input_size: int) -> IRecurrentCell:
        pass

    def make_scripted(self, *p, **ks) -> IRecurrentCell:
        return T.jit.script(self.make(*p, **ks))

class RecurrentLayer(nn.Module):
    def reorder_inputs(self, inputs: Union[List[T.Tensor], T.Tensor]):
        #^ inputs : [t b i]
        if self.direction == 'backward':
            return inputs[::-1]
        return inputs

    def __init__(
        self,
        cell: IRecurrentCell,
        direction='forward',
        batch_first=False,
    ):
        super().__init__()
        if isinstance(batch_first, bool):
            batch_first = (batch_first, batch_first)
        self.batch_first = batch_first
        self.direction = direction
        self.cell_: IRecurrentCell = cell

    @T.jit.ignore
    def forward(self, input, state_t0, return_state=None):
        if self.batch_first[0]:
            #^ input : [b t i]
            input = input.transpose(1, 0)
        #^ input : [t b i]
        inputs = input.unbind(0)

        if state_t0 is None:
            state_t0 = self.cell_.get_init_state(input)

        inputs = self.reorder_inputs(inputs)

        if return_state:
            sequence, state = self.cell_.loop(inputs, state_t0)
        else:
            sequence, _ = self.cell_.loop(inputs, state_t0)
        #^ sequence : t * [b h]
        sequence = self.reorder_inputs(sequence)
        sequence = T.stack(sequence)
        #^ sequence : [t b h]

        if self.batch_first[1]:
            sequence = sequence.transpose(1, 0)
            #^ sequence : [b t h]

        if return_state:
            return sequence, state
        else:
            return sequence, None

class BidirectionalRecurrentLayer(nn.Module):
    def __init__(
        self,
        input_size: int,
        cell_builder: IRecurrentCellBuilder,
        batch_first=False,
        return_states=False
    ):
        super().__init__()
        self.batch_first = batch_first
        self.cell_builder = cell_builder
        self.batch_first = batch_first
        self.return_states = return_states
        self.fwd = RecurrentLayer(
            cell_builder.make_scripted(input_size),
            direction='forward',
            batch_first=batch_first
        )
        self.bwd = RecurrentLayer(
            cell_builder.make_scripted(input_size),
            direction='backward',
            batch_first=batch_first
        )

    @T.jit.ignore
    def forward(self, input, state_t0, is_last):
        return_states = is_last and self.return_states
        if return_states:
            fwd, state_fwd = self.fwd(input, state_t0, return_states)
            bwd, state_bwd = self.bwd(input, state_t0, return_states)
            return T.cat([fwd, bwd], dim=-1), (T.cat([state_fwd[0], state_bwd[0]], dim=-1), T.cat([state_fwd[1], state_bwd[1]], dim=-1))
        else:
            fwd, _ = self.fwd(input, state_t0, return_states)
            bwd, _ = self.bwd(input, state_t0, return_states)
            return T.cat([fwd, bwd], dim=-1), None

class RecurrentLayerStack(nn.Module):
    def __init__(
        self,
        cell_builder : Callable[..., IRecurrentCellBuilder],
        input_size : int,
        num_layers : int,
        bidirectional : bool = False,
        batch_first : bool = False,
        scripted : bool = True,
        return_states : bool = False,
        *args, **kargs,
    ):
        super().__init__()
        cell_builder_: IRecurrentCellBuilder = cell_builder(*args, **kargs)
        self._cell_builder = cell_builder_

        if bidirectional:
            Dh = cell_builder_.hidden_size * 2
            def make(isize: int, last=False):
                return BidirectionalRecurrentLayer(isize, cell_builder_,
                    batch_first=batch_first, return_states=return_states)
        else:
            Dh = cell_builder_.hidden_size
            def make(isize: int, last=False):
                cell = cell_builder_.make_scripted(isize)
                return RecurrentLayer(cell, isize,
                    batch_first=batch_first)


        if num_layers > 1:
            rnns = [
                make(input_size),
                *[
                    make(Dh)
                    for _ in range(num_layers - 2)
                ],
                make(Dh, last=True)
            ]
        else:
            rnns = [make(input_size, last=True)]

        self.rnn = nn.Sequential(*rnns)

        self.input_size = input_size
        self.hidden_size = self._cell_builder.hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.return_states = return_states

    def __repr__(self):
        return (
            f'${self.__class__.__name__}'
            + '('
            + f'in={self.input_size}, '
            + f'hid={self.hidden_size}, '
            + f'layers={self.num_layers}, '
            + f'bi={self.bidirectional}'
            + '; '
            + str(self._cell_builder)
        )

    def forward(self, input, state_t0=None):
        for layer_idx, rnn in enumerate(self.rnn):
            is_last = (layer_idx == (len(self.rnn) - 1))
            input, state = rnn(input, state_t0, is_last)
        if self.return_states:
            return input, state
        return input
config.yaml
ADDED
@@ -0,0 +1,117 @@
run-title: tashkeela-d2
debug: false

paths:
  base: ./dataset/ashaar
  save: ./models
  load: tashkeela-d2.pt
  resume: ./models/Tashkeela-D2/tashkeela-d2.pt
  constants: ./dataset/helpers/constants
  word-embs: vocab.vec
  test: test

loader:
  wembs-limit: -1
  num-workers: 0

train:
  epochs: 1000
  batch-size: 32
  char-embed-dim: 32
  resume: false
  resume-lr: false

  max-word-len: 13
  max-sent-len: 10

  rnn-cell: lstm
  sent-lstm-layers: 2
  word-lstm-layers: 2

  sent-lstm-units: 256
  word-lstm-units: 512
  decoder-units: 256

  sent-dropout: 0.2
  diac-dropout: 0
  final-dropout: 0.2

  sent-mask-zero: false

  lr-factor: 0.5
  lr-patience: 1
  lr-min: 1.e-7
  lr-init: 0.002

  weight-decay: 0
  vertical-dropout: 0.25
  recurrent-dropout: 0.25

  stopping-delta: 1.e-7
  stopping-patience: 3

predictor:
  batch-size: 75
  stride: 2
  window: 20
  gt-signal-prob: 0
  seed-idx: 0

sentence-break:
  stride: 2
  window: 10
  min-window: 1
  export-map: false
  files:
    - train/train.txt
    - val/val.txt
  delimeters:
    - ،
    - ؛
    - ','
    - ;
    - «
    - »
    - '{'
    - '}'
    - '('
    - ')'
    - '['
    - ']'
    - '.'
    - '*'
    - '-'
    - ':'
    - '?'
    - '!'
    - ؟


segment:
  stride: 2
  window: 10
  min-window: 1
  export-map: false
  files:
    - train/train.txt
    - val/val.txt
  delimeters:
    - ،
    - ؛
    - ','
    - ;
    - «
    - »
    - '{'
    - '}'
    - '('
    - ')'
    - '['
    - ']'
    - '.'
    - '*'
    - '-'
    - ':'
    - '?'
    - '!'
    - ؟
data_utils.py
ADDED
@@ -0,0 +1,230 @@
import os
import pickle
import numpy as np

from tqdm import tqdm
from prettytable import PrettyTable
from pyarabic.araby import tokenize, strip_tashkeel
import diac_utils as du

class DatasetUtils:
    def __init__(self, config):
        self.base_path = config["paths"]["base"]
        self.special_tokens = ['<pad>', '<unk>', '<num>', '<punc>']
        self.delimeters = config["sentence-break"]["delimeters"]
        self.load_constants(config["paths"]["constants"])
        self.debug = config["debug"]

        self.stride = config["sentence-break"]["stride"]
        self.window = config["sentence-break"]["window"]
        self.val_stride = config["sentence-break"].get("val-stride", self.stride)

        self.test_stride = config["predictor"]["stride"]
        self.test_window = config["predictor"]["window"]

        self.max_word_len = config["train"]["max-word-len"]
        self.max_sent_len = config["train"]["max-sent-len"]
        self.max_token_count = config["train"]["max-token-count"]
        self.pad_target_val = -100
        self.pad_char_id = du.LETTER_LIST.index('<pad>')

        self.markov_signal = config['train'].get('markov-signal', False)
        self.batch_first = config['train'].get('batch-first', True)

        self.gt_prob = config["predictor"]["gt-signal-prob"]
        if self.gt_prob > 0:
            self.s_idx = config["predictor"]["seed-idx"]
            subpath = f"test_gt_mask_{self.gt_prob}_{self.s_idx}.txt"
            mask_path = os.path.join(self.base_path, "test", subpath)
            with open(mask_path, 'r') as fin:
                self.gt_mask = fin.readlines()

        if "word-embs" in config["paths"] and config["paths"]["word-embs"].strip() != "":
            self.pad_val = self.special_tokens.index("<pad>")
            self.embeddings, self.vocab = self.load_embeddings(config["paths"]["word-embs"], config["loader"]["wembs-limit"])
            self.embeddings = self.normalize(self.embeddings, ["unit", "centeremb", "unit"])
            self.w2idx = {word: i for i, word in enumerate(self.vocab)}

    def load_file(self, path):
        with open(path, 'rb') as f:
            return list(pickle.load(f))

    def normalize(self, matrix, actions, mean=None):
        def length_normalize(matrix):
            norms = np.sqrt(np.sum(matrix**2, axis=1))
            norms[norms == 0] = 1
            matrix = matrix / norms[:, np.newaxis]
            return matrix

        def mean_center(matrix):
            return matrix - mean

        def length_normalize_dimensionwise(matrix):
            norms = np.sqrt(np.sum(matrix**2, axis=0))
            norms[norms == 0] = 1
            matrix = matrix / norms
            return matrix

        def mean_center_embeddingwise(matrix):
            avg = np.mean(matrix, axis=1)
            matrix = matrix - avg[:, np.newaxis]
            return matrix

        for action in actions:
            if action == 'unit':
                matrix = length_normalize(matrix)
            elif action == 'center':
                matrix = mean_center(matrix)
            elif action == 'unitdim':
                matrix = length_normalize_dimensionwise(matrix)
            elif action == 'centeremb':
                matrix = mean_center_embeddingwise(matrix)

        return matrix

    def load_constants(self, path):
        # self.numbers = [c for c in "0123456789"]
        # self.letter_list = self.special_tokens + self.load_file(os.path.join(path, 'ARABIC_LETTERS_LIST.pickle'))
        # self.diacritic_list = [' '] + self.load_file(os.path.join(path, 'DIACRITICS_LIST.pickle'))
        self.numbers = du.NUMBERS
        self.letter_list = du.LETTER_LIST
        self.diacritic_list = du.DIACRITICS_SHORT

    def split_word_on_characters_with_diacritics(self, word: str):
        return du.split_word_on_characters_with_diacritics(word)

    def load_mapping_v3(self, dtype, file_ext=None):
        mapping = {}
        if file_ext is None:
            file_ext = f"-{self.test_stride}-{self.test_window}.map"
        f_name = os.path.join(self.base_path, dtype, dtype + file_ext)
        with open(f_name, 'r') as fin:
            for line in fin:
                sent_idx, seg_idx, t_idx, c_idx = map(int, line.split(','))
                if sent_idx not in mapping:
                    mapping[sent_idx] = {}
                if seg_idx not in mapping[sent_idx]:
                    mapping[sent_idx][seg_idx] = {}
                if t_idx not in mapping[sent_idx][seg_idx]:
                    mapping[sent_idx][seg_idx][t_idx] = []
                mapping[sent_idx][seg_idx][t_idx] += [c_idx]
        return mapping

    def load_mapping_v3_from_list(self, mapping_list):
        mapping = {}
        for line in mapping_list:
            sent_idx, seg_idx, t_idx, c_idx = map(int, line.split(','))
            if sent_idx not in mapping:
                mapping[sent_idx] = {}
            if seg_idx not in mapping[sent_idx]:
                mapping[sent_idx][seg_idx] = {}
            if t_idx not in mapping[sent_idx][seg_idx]:
                mapping[sent_idx][seg_idx][t_idx] = []
            mapping[sent_idx][seg_idx][t_idx] += [c_idx]
        return mapping

    def load_embeddings(self, embs_path, limit=-1):
        if self.debug:
            return np.zeros((200+len(self.special_tokens),300)), self.special_tokens + ["c"] * 200

        words = [self.special_tokens[0]]
        print(f"[INFO] Reading Embeddings from {embs_path}")
        with open(embs_path, encoding='utf-8', mode='r') as fin:
            n, d = map(int, fin.readline().split())
            limit = n if limit <= 0 else limit
            embeddings = np.zeros((limit+1, d))
            for i, line in tqdm(enumerate(fin), total=limit):
                if i >= limit: break
                tokens = line.rstrip().split()
                words += [tokens[0]]
                embeddings[i+1] = list(map(float, tokens[1:]))
        return embeddings, words

    def load_file_clean(self, dtype, strip=False):
        f_name = os.path.join(self.base_path, dtype, dtype + ".txt")
        with open(f_name, 'r', encoding="utf-8", newline='\n') as fin:
            if strip:
                original_lines = [strip_tashkeel(self.preprocess(line)) for line in fin.readlines()]
            else:
                original_lines = [self.preprocess(line) for line in fin.readlines()]
        return original_lines

    def preprocess(self, line):
        return ' '.join(tokenize(line))

    def pad_and_truncate_sequence(self, tokens, max_len, pad=None):
        if pad is None:
            pad = self.special_tokens.index("<pad>")
        if len(tokens) < max_len:
            offset = max_len - len(tokens)
            return tokens + [pad] * offset
        else:
            return tokens[:max_len]

    def stats(self, freq, percentile=90, name="stats"):
        table = PrettyTable(["Dataset", "Mean", "Std", "Min", "Max", f"{percentile}th Percentile"])
        freq = np.array(sorted(freq))
        table.add_row([name, freq.mean(), freq.std(), freq.min(), freq.max(), np.percentile(freq, percentile)])
        print(table)

    def create_gt_mask(self, lines, prob, idx, seed=1111):
        np.random.seed(seed)

        gt_masks = []
        for line in lines:
            tokens = tokenize(line.strip())
            gt_mask_token = ""
            for t_idx, token in enumerate(tokens):
                gt_mask_token += ''.join(map(str, np.random.binomial(1, prob, len(token))))
                if t_idx+1 < len(tokens):
                    gt_mask_token += " "
            gt_masks += [gt_mask_token]

        subpath = f"test_gt_mask_{prob}_{idx}.txt"
        mask_path = os.path.join(self.base_path, "test", subpath)

        with open(mask_path, 'w') as fout:
            fout.write('\n'.join(gt_masks))

    def create_gt_labels(self, lines):
        gt_labels = []
        for line in lines:
            gt_labels_line = []
            tokens = tokenize(line.strip())
            for w_idx, word in enumerate(tokens):
                split_word = self.split_word_on_characters_with_diacritics(word)
                _, cy_flat, _ = du.create_label_for_word(split_word)

                gt_labels_line.extend(cy_flat)
                if w_idx+1 < len(tokens):
                    gt_labels_line += [0]

            gt_labels += [gt_labels_line]
        return gt_labels

    def get_ce(self, diac_word_y, e_idx=None, return_idx=False):
        #^ diac_word_y: [Tw 3]
        if e_idx is None: e_idx = len(diac_word_y)
        for c_idx in reversed(range(e_idx)):
            if diac_word_y[c_idx] != [0,0,0]:
                return diac_word_y[c_idx] if not return_idx else c_idx
        return diac_word_y[e_idx-1] if not return_idx else e_idx-1

    def create_decoder_input(self, diac_code_y, prob=0):
        #^ diac_code_y: [Ts Tw 3]
        diac_code_x = np.zeros((*np.array(diac_code_y).shape[:-1], 8))
        if not self.markov_signal:
            return list(diac_code_x)
        prev_ce = list(np.eye(6)[-1]) + [0,0]  # bos tag
        for w_idx, word in enumerate(diac_code_y):
            diac_code_x[w_idx, 0, :] = prev_ce
            for c_idx, char in enumerate(word[:-1]):
                # if np.random.rand() < prob:
                #     continue
                if char[0] == self.pad_target_val:
                    break
                haraka = list(np.eye(6)[char[0]])
                diac_code_x[w_idx, c_idx+1, :] = haraka + char[1:]
                ce = self.get_ce(diac_code_y[w_idx], c_idx)
                prev_ce = list(np.eye(6)[ce[0]]) + ce[1:]
        return list(diac_code_x)
dataloader.py
ADDED
@@ -0,0 +1,64 @@
import os

from pyarabic.araby import tokenize, strip_tashkeel

import numpy as np
import torch as T
from torch.utils.data import Dataset

from data_utils import DatasetUtils
import diac_utils as du

class DataRetriever(Dataset):
    def __init__(self, data_utils : DatasetUtils, lines: list):
        super(DataRetriever).__init__()

        self.data_utils = data_utils
        self.lines = lines

    def preprocess(self, data, dtype=T.long):
        return [T.tensor(np.array(x), dtype=dtype) for x in data]

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        word_x, char_x, diac_x, diac_y = self.create_sentence(idx)
        return self.preprocess((word_x, char_x, diac_x)), T.tensor(diac_y, dtype=T.long), T.tensor(diac_y, dtype=T.long)

    def create_sentence(self, idx):
        line = self.lines[idx]
        tokens = tokenize(line.strip())

        word_x = []
        char_x = []
        diac_x = []
        diac_y = []
        diac_y_tmp = []

        for word in tokens:
            word = du.strip_unknown_tashkeel(word)
            word_chars = du.split_word_on_characters_with_diacritics(word)
            cx, cy, cy_3head = du.create_label_for_word(word_chars)

            word_strip = strip_tashkeel(word)
            word_x += [self.data_utils.w2idx[word_strip] if word_strip in self.data_utils.w2idx else self.data_utils.w2idx["<pad>"]]

            char_x += [self.data_utils.pad_and_truncate_sequence(cx, self.data_utils.max_word_len)]

            diac_y += [self.data_utils.pad_and_truncate_sequence(cy, self.data_utils.max_word_len, pad=self.data_utils.pad_target_val)]
            diac_y_tmp += [self.data_utils.pad_and_truncate_sequence(cy_3head, self.data_utils.max_word_len, pad=[self.data_utils.pad_target_val]*3)]

        diac_x = self.data_utils.create_decoder_input(diac_y_tmp)

        max_slen = self.data_utils.max_sent_len
        max_wlen = self.data_utils.max_word_len
        p_val = self.data_utils.pad_val
        pt_val = self.data_utils.pad_target_val

        word_x = self.data_utils.pad_and_truncate_sequence(word_x, max_slen)
        char_x = self.data_utils.pad_and_truncate_sequence(char_x, max_slen, pad=[p_val]*max_wlen)
        diac_x = self.data_utils.pad_and_truncate_sequence(diac_x, max_slen, pad=[[p_val]*8]*max_wlen)
        diac_y = self.data_utils.pad_and_truncate_sequence(diac_y, max_slen, pad=[pt_val]*max_wlen)

        return word_x, char_x, diac_x, diac_y
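
A sketch of wiring DataRetriever into a standard torch DataLoader. It mirrors app.py's config tweaks (max-sent-len and max-token-count are not set in config.yaml itself), assumes vocab.vec has already been downloaded (app.py does this), and uses two invented input lines.

# Sketch only: DatasetUtils + DataRetriever behind a DataLoader.
import yaml
from torch.utils.data import DataLoader

from data_utils import DatasetUtils
from dataloader import DataRetriever

with open("config.yaml", 'r', encoding="utf-8") as fin:
    config = yaml.load(fin, Loader=yaml.FullLoader)
config["train"]["max-sent-len"] = config["predictor"]["window"]
config["train"]["max-token-count"] = config["predictor"]["window"] * 3

data_utils = DatasetUtils(config)  # reads the vocab.vec embeddings
lines = ["ذهب الولد إلى المدرسة", "قرأ الطالب الكتاب"]  # invented examples
loader = DataLoader(
    DataRetriever(data_utils, lines),
    batch_size=config["predictor"]["batch-size"],
    num_workers=config["loader"]["num-workers"],
)
for (word_x, char_x, diac_x), diac_y, _ in loader:
    print(word_x.shape, char_x.shape, diac_y.shape)
    break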
diac_utils.py
ADDED
@@ -0,0 +1,223 @@
from typing import List

import torch as T
import numpy as np

from pyarabic.araby import (
    tokenize,
    strip_tashkeel,
    strip_tatweel,
    DIACRITICS
)

SEPARATE_DIACRITICS = {
    "FATHA": 1,
    "KASRA": 2,
    "DAMMA": 3,
    "SUKUN": 4
}

HARAKAT_MAP = [
    #^ (haraka, tanween, shadda)
    (0,0,0), #< No diacs on char
    (1,0,0),
    (1,1,0), #< Tanween on 2nd slot
    (2,0,0),
    (2,1,0),
    (3,0,0),
    (3,1,0),
    (4,0,0),
    (0,0,1), #< shadda on 3rd slot
    (1,0,1),
    (1,1,1),
    (2,0,1),
    (2,1,1),
    (3,0,1),
    (3,1,1),
    (0,0,0), #< Padding == -1 (also for spaces)
]

SPECIAL_TOKENS = ['<pad>', '<unk>', '<num>', '<punc>']
LETTER_LIST = SPECIAL_TOKENS + list("ءآأؤإئابةتثجحخدذرزسشصضطظعغفقكلمنهوىي")
CLASSES_LIST = [' ', 'َ', 'ً', 'ُ', 'ٌ', 'ِ', 'ٍ', 'ْ', 'ّ', 'َّ', 'ًّ', 'ُّ', 'ٌّ', 'ِّ', 'ٍّ']
DIACRITICS_SHORT = [' ', 'َ', 'ً', 'ِ', 'ٍ', 'ُ', 'ٌ', 'ْ', 'ّ']
NUMBERS = list("0123456789")
DELIMITERS = ["،","؛",",",";","«","»","{","}","(",")","[","]",".","*","-",":","?","!","؟"]

UNKNOWN_DIACRITICS = list(set(DIACRITICS).difference(set(DIACRITICS_SHORT)))

def shakkel_char(diac: int, tanween: bool, shadda: bool) -> str:
    returned_text = ""
    if shadda and diac != SEPARATE_DIACRITICS["SUKUN"]:
        returned_text += "\u0651"

    if diac == SEPARATE_DIACRITICS["FATHA"]:
        returned_text += "\u064E" if not tanween else "\u064B"
    elif diac == SEPARATE_DIACRITICS["KASRA"]:
        returned_text += "\u0650" if not tanween else "\u064D"
    elif diac == SEPARATE_DIACRITICS["DAMMA"]:
        returned_text += "\u064F" if not tanween else "\u064C"
    elif diac == SEPARATE_DIACRITICS["SUKUN"]:
        returned_text += "\u0652"

    return returned_text

def diac_ids_of_line(line: str):
    words = tokenize(line)
    diacs = []
    for word in words:
        word_chars = split_word_on_characters_with_diacritics(word)
        cx, cy, cy_3head = create_label_for_word(word_chars)
        diacs.extend(cy)
        diacs.append(-1)
    return np.array(diacs[:-1])

def strip_unknown_tashkeel(word: str):
    #! FIXME! warnings.warn("Stripping unknown tashkeel is disabled.")
    return word
    return ''.join(c for c in word if c not in UNKNOWN_DIACRITICS)

def split_word_on_characters_with_diacritics(word: str):
    '''
    TODO! Make faster without deque and looping
    Returns: List[List[char: "letter or diacritic"]]
    '''
    chars_w_diac = []
    i_start = 0
    for i_c, c in enumerate(word):
        #! FIXME! DIACRITICS_SHORT is missing a lot of less common diacritics ...
        #! which are then treated as letters during splitting.
        # if c not in DIACRITICS:
        if c not in DIACRITICS_SHORT:
            sub = list(word[i_start:i_c])
            chars_w_diac.append(sub)
            i_start = i_c
    sub = list(word[i_start:])
    if sub:
        chars_w_diac.append(sub)
    if not chars_w_diac[0]:
        chars_w_diac = chars_w_diac[1:]
    return chars_w_diac


def char_type(char: str):
    if char in LETTER_LIST:
        return LETTER_LIST.index(char)
    elif char in NUMBERS:
        return LETTER_LIST.index('<num>')
    elif char in DELIMITERS:
        return LETTER_LIST.index('<punc>')
    else:
        return LETTER_LIST.index('<unk>')

def create_labels(char_w_diac: str):
    remap_dict = {0: 0, 1: 1, 3: 2, 5: 3, 7: 4}
    char_w_diac = [char_w_diac[0]] + list(set(char_w_diac[1:]))
    if len(char_w_diac) > 3:
        char_w_diac = char_w_diac[:2] if DIACRITICS_SHORT[8] not in char_w_diac else char_w_diac[:3]

    char_idx = None
    diacritic_index = None
    head_3 = None

    char_idx = char_type(char_w_diac[0])
    diacs = set(char_w_diac[1:])
    diac_h3 = [0, 0, 0]
    for diac in diacs:
        if diac in DIACRITICS_SHORT:
            diac_idx = DIACRITICS_SHORT.index(diac)
            if diac_idx in [2, 4, 6]: #< Tanween
                diac_h3[0] = remap_dict[diac_idx - 1]
                diac_h3[1] = 1
            elif diac_idx == 8: #< shadda
                diac_h3[2] = 1
            else: #< Haraka or sukoon
                diac_h3[0] = remap_dict[diac_idx]
    assert not (diac_h3[0] == 4 and (diac_h3[1] or diac_h3[2]))
    diacritic_index = HARAKAT_MAP.index(tuple(diac_h3))
    return char_idx, diacritic_index, diac_h3
    if len(char_w_diac) == 1:
        return char_idx, 0, [remap_dict[0], 0, 0]
    elif len(char_w_diac) == 2: # If shadda OR diac
        diacritic_index = DIACRITICS_SHORT.index(char_w_diac[1])
        if diacritic_index in [2, 4, 6]: # list of tanween
            head_3 = [remap_dict[diacritic_index - 1], 1, 0]
        elif diacritic_index == 8:
            head_3 = [0, 0, 1]
        else:
            head_3 = [remap_dict[diacritic_index], 0, 0]
    elif len(char_w_diac) == 3: # If shadda AND diac
        if DIACRITICS_SHORT[8] == char_w_diac[1]:
            diacritic_index = DIACRITICS_SHORT.index(char_w_diac[2])
        else:
            diacritic_index = DIACRITICS_SHORT.index(char_w_diac[1])

        if diacritic_index in [2, 4, 6]: # list of tanween
            head_3 = [remap_dict[diacritic_index - 1], 1, 1]
        else:
            head_3 = [remap_dict[diacritic_index], 0, 1]
        diacritic_index = diacritic_index+8

    return char_idx, diacritic_index, head_3

def create_label_for_word(split_word: List[List[str]]):
    word_char_indices = []
    word_diac_indices = []
    word_diac_indices_h3 = []
    for char_w_diac in split_word:
        char_idx, diac_idx, diac_h3 = create_labels(char_w_diac)
        if char_idx == None:
            print(split_word)
            raise ValueError(char_idx)
        word_char_indices.append(char_idx)
        word_diac_indices.append(diac_idx)
        word_diac_indices_h3.append(diac_h3)
    return word_char_indices, word_diac_indices, word_diac_indices_h3


def flat_2_3head(output: T.Tensor):
    '''
    output: [b tw tc]
    '''
    haraka, tanween, shadda = [], [], []

    # 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    # 0, F, FF, K, KK, D, DD, S, Sh, ShF, ShFF, ShK, ShKK, ShD, ShDD

    b, ts, tw = output.shape

    for b_idx in range(b):
        h_s, t_s, s_s = [], [], []
        for w_idx in range(ts):
            h_w, t_w, s_w = [], [], []
            for c_idx in range(tw):
                c = HARAKAT_MAP[int(output[b_idx, w_idx, c_idx])]
                h_w += [c[0]]
                t_w += [c[1]]
                s_w += [c[2]]
            h_s += [h_w]
            t_s += [t_w]
            s_s += [s_w]

        haraka += [h_s]
        tanween += [t_s]
        shadda += [s_s]


    return haraka, tanween, shadda

def flat2_3head(diac_idx):
    '''
    diac_idx: [tw]
    '''
    haraka, tanween, shadda = [], [], []
    # 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    # 0, F, FF, K, KK, D, DD, S, Sh, ShF, ShFF, ShK, ShKK, ShD, ShDD

    for diac in diac_idx:
        c_out = HARAKAT_MAP[diac]
        haraka += [c_out[0]]
        tanween += [c_out[1]]
        shadda += [c_out[2]]

    return np.array(haraka), np.array(tanween), np.array(shadda)
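
A small sanity-check sketch for the splitting and labelling helpers above; the example word is ours, not from the dataset.

# Sketch: inspect how a fully diacritized word is split and labelled.
import diac_utils as du

word = "كَتَبَ"
chars = du.split_word_on_characters_with_diacritics(word)
print(chars)            # [['ك', 'َ'], ['ت', 'َ'], ['ب', 'َ']]

cx, cy, cy_3head = du.create_label_for_word(chars)
print(cx)               # indices of the bare letters in du.LETTER_LIST
print(cy)               # flat class ids, i.e. indices into du.HARAKAT_MAP
print(cy_3head)         # per-character [haraka, tanween, shadda] triples
print(du.shakkel_char(1, False, False))  # '\u064E' (fatha) back from the ids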
model_dd.py
ADDED
@@ -0,0 +1,526 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import torch as T
|
3 |
+
|
4 |
+
from tqdm import tqdm
|
5 |
+
from torch import nn
|
6 |
+
from torch.nn import functional as F
|
7 |
+
|
8 |
+
from components.k_lstm import K_LSTM
|
9 |
+
from components.attention import Attention
|
10 |
+
from data_utils import DatasetUtils
|
11 |
+
from diac_utils import flat2_3head, flat_2_3head
|
12 |
+
|
13 |
+
class DiacritizerD2(nn.Module):
|
14 |
+
def __init__(self, config):
|
15 |
+
super(DiacritizerD2, self).__init__()
|
16 |
+
self.max_word_len = config["train"]["max-word-len"]
|
17 |
+
self.max_sent_len = config["train"]["max-sent-len"]
|
18 |
+
self.char_embed_dim = config["train"]["char-embed-dim"]
|
19 |
+
|
20 |
+
self.final_dropout_p = config["train"]["final-dropout"]
|
21 |
+
self.sent_dropout_p = config["train"]["sent-dropout"]
|
22 |
+
self.diac_dropout_p = config["train"]["diac-dropout"]
|
23 |
+
self.vertical_dropout = config['train']['vertical-dropout']
|
24 |
+
self.recurrent_dropout = config['train']['recurrent-dropout']
|
25 |
+
self.recurrent_dropout_mode = config['train'].get('recurrent-dropout-mode', 'gal_tied')
|
26 |
+
self.recurrent_activation = config['train'].get('recurrent-activation', 'sigmoid')
|
27 |
+
|
28 |
+
self.sent_lstm_units = config["train"]["sent-lstm-units"]
|
29 |
+
self.word_lstm_units = config["train"]["word-lstm-units"]
|
30 |
+
self.decoder_units = config["train"]["decoder-units"]
|
31 |
+
|
32 |
+
self.sent_lstm_layers = config["train"]["sent-lstm-layers"]
|
33 |
+
self.word_lstm_layers = config["train"]["word-lstm-layers"]
|
34 |
+
|
35 |
+
self.cell = config['train'].get('rnn-cell', 'lstm')
|
36 |
+
self.num_layers = config["train"].get("num-layers", 2)
|
37 |
+
self.RNN_Layer = K_LSTM
|
38 |
+
|
39 |
+
self.batch_first = config['train'].get('batch-first', True)
|
40 |
+
self.device = 'cuda' if T.cuda.is_available() else 'cpu'
|
41 |
+
self.num_classes = 15
|
42 |
+
|
43 |
+
def build(self, wembs: T.Tensor, abjad_size: int):
|
44 |
+
self.closs = F.cross_entropy
|
45 |
+
        self.bloss = F.binary_cross_entropy_with_logits

        rnn_kargs = dict(
            recurrent_dropout_mode=self.recurrent_dropout_mode,
            recurrent_activation=self.recurrent_activation,
        )

        self.sent_lstm = self.RNN_Layer(
            input_size=300,
            hidden_size=self.sent_lstm_units,
            num_layers=self.sent_lstm_layers,
            bidirectional=True,
            vertical_dropout=self.vertical_dropout,
            recurrent_dropout=self.recurrent_dropout,
            batch_first=self.batch_first,
            **rnn_kargs,
        )

        self.word_lstm = self.RNN_Layer(
            input_size=self.sent_lstm_units * 2 + self.char_embed_dim,
            hidden_size=self.word_lstm_units,
            num_layers=self.word_lstm_layers,
            bidirectional=True,
            vertical_dropout=self.vertical_dropout,
            recurrent_dropout=self.recurrent_dropout,
            batch_first=self.batch_first,
            return_states=True,
            **rnn_kargs,
        )

        self.char_embs = nn.Embedding(
            abjad_size,
            self.char_embed_dim,
            padding_idx=0,
        )

        self.attention = Attention(
            kind="dot",
            query_dim=self.word_lstm_units * 2,
            input_dim=self.sent_lstm_units * 2,
        )

        self.word_embs = T.tensor(wembs).clone().to(dtype=T.float32)
        self.word_embs = self.word_embs.to(self.device)

        self.classifier = nn.Linear(self.attention.Dout + self.word_lstm_units * 2, self.num_classes)
        self.dropout = nn.Dropout(self.final_dropout_p)

    def forward(self, sents, words, labels=None, subword_lengths=None):
        #^ sents : [b ts]
        #^ words : [b ts tw]
        #^ labels: [b ts tw]
        max_words = min(self.max_sent_len, sents.shape[1])

        word_mask = words.ne(0.).float()
        #^ word_mask: [b ts tw]

        if self.training:
            q = 1.0 - self.sent_dropout_p
            sdo = T.bernoulli(T.full(sents.shape, q))
            sents_do = sents * sdo.long()
            #^ sents_do : [b ts] ; DO(ts)
            wembs = self.word_embs[sents_do]
            #^ wembs : [b ts dw] ; DO(ts)
        else:
            wembs = self.word_embs[sents]
            #^ wembs : [b ts dw]

        sent_enc = self.sent_lstm(wembs.to(self.device))
        #^ sent_enc : [b ts dwe]

        sentword_do = sent_enc.unsqueeze(2)
        #^ sentword_do : [b ts _ dwe]

        sentword_do = self.dropout(sentword_do * word_mask.unsqueeze(-1))
        #^ sentword_do : [b ts tw dwe]

        word_index = words.view(-1, self.max_word_len)
        #^ word_index: [b*ts tw]

        cembs = self.char_embs(word_index)
        #^ cembs : [b*ts tw dc]

        sentword_do = sentword_do.view(-1, self.max_word_len, self.sent_lstm_units * 2)
        #^ sentword_do : [b*ts tw dwe]

        char_embs = T.cat([cembs, sentword_do], dim=-1)
        #^ char_embs : [b*ts tw dcw] ; dcw = dc + dwe

        char_enc, _ = self.word_lstm(char_embs)
        #^ char_enc: [b*ts tw dce]

        char_enc_reshaped = char_enc.view(-1, max_words, self.max_word_len, self.word_lstm_units * 2)
        #^ char_enc_reshaped: [b ts tw dce]

        omit_self_mask = (1.0 - T.eye(max_words)).unsqueeze(0).to(self.device)
        attn_enc, attn_map = self.attention(char_enc_reshaped, sent_enc, word_mask.bool(), prejudice_mask=omit_self_mask)
        #^ attn_enc: [b ts tw dae]

        attn_enc = attn_enc.reshape(-1, self.max_word_len, self.attention.Dout)
        #^ attn_enc: [b*ts tw dae]

        final_vec = T.cat([attn_enc, char_enc], dim=-1)

        diac_out = self.classifier(self.dropout(final_vec))
        #^ diac_out: [b*ts tw 7]

        diac_out = diac_out.view(-1, max_words, self.max_word_len, self.num_classes)
        #^ diac_out: [b ts tw 7]

        if not self.batch_first:
            diac_out = diac_out.swapaxes(1, 0)

        return diac_out

    def step(self, xt, yt, mask=None):
        xt[1] = xt[1].to(self.device)
        xt[2] = xt[2].to(self.device)

        yt = yt.to(self.device)
        #^ yt: [b ts tw]

        diac = self(*xt)
        loss = self.closs(diac.view(-1, self.num_classes), yt.view(-1))

        return loss

    def predict(self, dataloader):
        training = self.training
        self.eval()

        preds = {'haraka': [], 'shadda': [], 'tanween': []}
        print("> Predicting...")
        for inputs, _ in tqdm(dataloader, total=len(dataloader)):
            inputs[0] = inputs[0].to(self.device)
            inputs[1] = inputs[1].to(self.device)
            diac = self(*inputs)

            output = np.argmax(T.softmax(diac.detach(), dim=-1).cpu().numpy(), axis=-1)
            #^ output: [b ts tw]

            haraka, tanween, shadda = flat_2_3head(output)

            preds['haraka'].extend(haraka)
            preds['tanween'].extend(tanween)
            preds['shadda'].extend(shadda)

        self.train(training)
        return (
            np.array(preds['haraka']),
            np.array(preds["tanween"]),
            np.array(preds["shadda"]),
        )

class DiacritizerD3(nn.Module):
    def __init__(self, config, device='cuda'):
        super(DiacritizerD3, self).__init__()
        self.max_word_len = config["train"]["max-word-len"]
        self.max_sent_len = config["train"]["max-sent-len"]
        self.char_embed_dim = config["train"]["char-embed-dim"]

        self.sent_dropout_p = config["train"]["sent-dropout"]
        self.diac_dropout_p = config["train"]["diac-dropout"]
        self.vertical_dropout = config['train']['vertical-dropout']
        self.recurrent_dropout = config['train']['recurrent-dropout']
        self.recurrent_dropout_mode = config['train'].get('recurrent-dropout-mode', 'gal_tied')
        self.recurrent_activation = config['train'].get('recurrent-activation', 'sigmoid')

        self.sent_lstm_units = config["train"]["sent-lstm-units"]
        self.word_lstm_units = config["train"]["word-lstm-units"]
        self.decoder_units = config["train"]["decoder-units"]

        self.sent_lstm_layers = config["train"]["sent-lstm-layers"]
        self.word_lstm_layers = config["train"]["word-lstm-layers"]

        self.cell = config['train'].get('rnn-cell', 'lstm')
        self.num_layers = config["train"].get("num-layers", 2)
        self.RNN_Layer = K_LSTM

        self.batch_first = config['train'].get('batch-first', True)

        self.baseline = config["train"].get("baseline", False)
        self.device = device

    def build(self, wembs: T.Tensor, abjad_size: int):
        self.closs = F.cross_entropy
        self.bloss = F.binary_cross_entropy_with_logits

        rnn_kargs = dict(
            recurrent_dropout_mode=self.recurrent_dropout_mode,
            recurrent_activation=self.recurrent_activation,
        )

        self.sent_lstm = self.RNN_Layer(
            input_size=300,
            hidden_size=self.sent_lstm_units,
            num_layers=self.sent_lstm_layers,
            bidirectional=True,
            vertical_dropout=self.vertical_dropout,
            recurrent_dropout=self.recurrent_dropout,
            batch_first=self.batch_first,
            **rnn_kargs,
        )

        self.word_lstm = self.RNN_Layer(
            input_size=self.sent_lstm_units * 2 + self.char_embed_dim,
            hidden_size=self.word_lstm_units,
            num_layers=self.word_lstm_layers,
            bidirectional=True,
            vertical_dropout=self.vertical_dropout,
            recurrent_dropout=self.recurrent_dropout,
            batch_first=self.batch_first,
            return_states=True,
            **rnn_kargs,
        )

        self.char_embs = nn.Embedding(
            abjad_size,
            self.char_embed_dim,
            padding_idx=0,
        )

        self.attention = Attention(
            kind="dot",
            query_dim=self.word_lstm_units * 2,
            input_dim=self.sent_lstm_units * 2,
        )

        self.lstm_decoder = self.RNN_Layer(
            input_size=self.word_lstm_units * 2 + self.attention.Dout + 8,
            hidden_size=self.word_lstm_units * 2,
            num_layers=1,
            bidirectional=False,
            vertical_dropout=self.vertical_dropout,
            recurrent_dropout=self.recurrent_dropout,
            batch_first=self.batch_first,
            return_states=True,
            **rnn_kargs,
        )

        self.word_embs = T.tensor(wembs, dtype=T.float32)

        self.classifier = nn.Linear(self.lstm_decoder.hidden_size, 15)
        self.dropout = nn.Dropout(0.2)

    def forward(self, sents, words, labels):
        #^ sents : [b ts]
        #^ words : [b ts tw]
        #^ labels: [b ts tw]

        word_mask = words.ne(0.).float()
        #^ word_mask: [b ts tw]

        if self.training:
            q = 1.0 - self.sent_dropout_p
            sdo = T.bernoulli(T.full(sents.shape, q))
            sents_do = sents * sdo.long()
            #^ sents_do : [b ts] ; DO(ts)
            wembs = self.word_embs[sents_do]
            #^ wembs : [b ts dw] ; DO(ts)
        else:
            wembs = self.word_embs[sents]
            #^ wembs : [b ts dw]

        sent_enc = self.sent_lstm(wembs.to(self.device))
        #^ sent_enc : [b ts dwe]

        sentword_do = sent_enc.unsqueeze(2)
        #^ sentword_do : [b ts _ dwe]

        sentword_do = self.dropout(sentword_do * word_mask.unsqueeze(-1))
        #^ sentword_do : [b ts tw dwe]

        word_index = words.view(-1, self.max_word_len)
        #^ word_index: [b*ts tw]

        cembs = self.char_embs(word_index)
        #^ cembs : [b*ts tw dc]

        sentword_do = sentword_do.view(-1, self.max_word_len, self.sent_lstm_units * 2)
        #^ sentword_do : [b*ts tw dwe]

        char_embs = T.cat([cembs, sentword_do], dim=-1)
        #^ char_embs : [b*ts tw dcw] ; dcw = dc + dwe

        char_enc, _ = self.word_lstm(char_embs)
        #^ char_enc: [b*ts tw dce]

        char_enc_reshaped = char_enc.view(-1, self.max_sent_len, self.max_word_len, self.word_lstm_units * 2)
        #^ char_enc_reshaped: [b ts tw dce]

        omit_self_mask = (1.0 - T.eye(self.max_sent_len)).unsqueeze(0).to(self.device)
        attn_enc, attn_map = self.attention(char_enc_reshaped, sent_enc, word_mask.bool(), prejudice_mask=omit_self_mask)
        #^ attn_enc: [b ts tw dae]

        attn_enc = attn_enc.view(-1, self.max_sent_len*self.max_word_len, self.attention.Dout)
        #^ attn_enc: [b*ts tw dae]

        if self.training and self.diac_dropout_p > 0:
            q = 1.0 - self.diac_dropout_p
            ddo = T.bernoulli(T.full(labels.shape[:-1], q))
            labels = labels * ddo.unsqueeze(-1).long().to(self.device)
            #^ labels : [b ts tw] ; DO(ts)

        labels = labels.view(-1, self.max_sent_len*self.max_word_len, 8).float()
        #^ labels: [b*ts tw 8]

        char_enc = char_enc.view(-1, self.max_sent_len*self.max_word_len, self.word_lstm_units * 2)

        final_vec = T.cat([attn_enc, char_enc, labels], dim=-1)
        #^ final_vec: [b ts*tw dae+8]

        dec_out, _ = self.lstm_decoder(final_vec)
        #^ dec_out: [b*ts tw du]

        dec_out = dec_out.reshape(-1, self.max_word_len, self.lstm_decoder.hidden_size)

        diac_out = self.classifier(self.dropout(dec_out))
        #^ diac_out: [b*ts tw 15]

        diac_out = diac_out.view(-1, self.max_sent_len, self.max_word_len, 15)
        #^ diac_out: [b ts tw 15]

        if not self.batch_first:
            diac_out = diac_out.swapaxes(1, 0)

        return diac_out, attn_map

    def predict_sample(self, sents, words, labels):

        word_mask = words.ne(0.).float()
        #^ word_mask: [b ts tw]

        if self.training:
            q = 1.0 - self.sent_dropout_p
            sdo = T.bernoulli(T.full(sents.shape, q))
            sents_do = sents * sdo.long()
            #^ sents_do : [b ts] ; DO(ts)
            wembs = self.word_embs[sents_do]
            #^ wembs : [b ts dw] ; DO(ts)
        else:
            wembs = self.word_embs[sents]
            #^ wembs : [b ts dw]

        sent_enc = self.sent_lstm(wembs.to(self.device))
        #^ sent_enc : [b ts dwe]

        sentword_do = sent_enc.unsqueeze(2)
        #^ sentword_do : [b ts _ dwe]

        sentword_do = self.dropout(sentword_do * word_mask.unsqueeze(-1))
        #^ sentword_do : [b ts tw dwe]

        word_index = words.view(-1, self.max_word_len)
        #^ word_index: [b*ts tw]

        cembs = self.char_embs(word_index)
        #^ cembs : [b*ts tw dc]

        sentword_do = sentword_do.view(-1, self.max_word_len, self.sent_lstm_units * 2)
        #^ sentword_do : [b*ts tw dwe]

        char_embs = T.cat([cembs, sentword_do], dim=-1)
        #^ char_embs : [b*ts tw dcw] ; dcw = dc + dwe

        char_enc, _ = self.word_lstm(char_embs)
        #^ char_enc: [b*ts tw dce]
        #^ word_states: ([b*ts dce], [b*ts dce])

        char_enc = char_enc.view(-1, self.max_sent_len, self.max_word_len, self.word_lstm_units*2)
        #^ char_enc: [b ts tw dce]

        omit_self_mask = (1.0 - T.eye(self.max_sent_len)).unsqueeze(0).to(self.device)
        attn_enc, _ = self.attention(char_enc, sent_enc, word_mask.bool(), prejudice_mask=omit_self_mask)
        #^ attn_enc: [b ts tw dae]

        all_out = T.zeros(*char_enc.size()[:-1], 15).to(self.device)
        #^ all_out: [b ts tw 15]

        batch_sz = char_enc.size()[0]
        #^ batch_sz: b

        zeros = T.zeros(1, batch_sz, self.lstm_decoder.hidden_size).to(self.device)
        #^ zeros: [1 b du]

        bos_tag = T.tensor([0,0,0,0,0,1,0,0]).unsqueeze(0)
        #^ bos_tag: [1 8]

        prev_label = T.cat([bos_tag]*batch_sz).to(self.device).float()
        # bos_vec = T.cat([bos_tag]*batch_sz).to(self.device).float()
        #^ prev_label: [b 8]

        for ts in range(self.max_sent_len):
            dec_hx = (zeros, zeros)
            #^ dec_hx: [1 b du]
            for tw in range(self.max_word_len):
                final_vec = T.cat([attn_enc[:,ts,tw,:], char_enc[:,ts,tw,:], prev_label], dim=-1).unsqueeze(1)
                #^ final_vec: [b 1 dce+dae+8]
                dec_out, dec_hx = self.lstm_decoder(final_vec, dec_hx)
                #^ dec_out: [b 1 du]
                dec_out = dec_out.squeeze(0)
                dec_out = dec_out.transpose(0,1)

                logits_raw = self.classifier(self.dropout(dec_out))
                #^ logits_raw: [b 1 15]

                out_idx = T.max(T.softmax(logits_raw.squeeze(), dim=-1), dim=-1)[1]

                haraka, tanween, shadda = flat_2_3head(out_idx.detach().cpu().numpy())

                haraka_onehot = T.eye(6)[haraka].float().to(self.device)
                #^ haraka_onehot: [b 6]

                tanween = T.tensor(tanween).float().unsqueeze(-1).to(self.device)
                shadda = T.tensor(shadda).float().unsqueeze(-1).to(self.device)

                prev_label = T.cat([haraka_onehot, tanween, shadda], dim=-1)

                all_out[:,ts,tw,:] = logits_raw.squeeze()

        if not self.batch_first:
            all_out = all_out.swapaxes(1, 0)

        return all_out

    def step(self, xt, yt, mask=None):
        xt[1] = xt[1].to(self.device)
        xt[2] = xt[2].to(self.device)
        #^ yt: [b ts tw]
        yt = yt.to(self.device)

        if self.training:
            diac, _ = self(*xt)
        else:
            diac = self.predict_sample(*xt)
        #^ diac: [b ts tw 15]

        loss = self.closs(diac.view(-1, 15), yt.view(-1))
        return loss

    def predict(self, dataloader):
        training = self.training
        self.eval()

        preds = {'haraka': [], 'shadda': [], 'tanween': []}
        print("> Predicting...")
        for inputs, _ in tqdm(dataloader, total=len(dataloader)):
            inputs[1] = inputs[1].to(self.device)
            inputs[2] = inputs[2].to(self.device)
            diac = self.predict_sample(*inputs)
            output = np.argmax(T.softmax(diac.detach(), dim=-1).cpu().numpy(), axis=-1)
            #^ output: [b ts tw]

            haraka, tanween, shadda = flat_2_3head(output)

            preds['haraka'].extend(haraka)
            preds['tanween'].extend(tanween)
            preds['shadda'].extend(shadda)

        self.train(training)
        return (
            np.array(preds['haraka']),
            np.array(preds["tanween"]),
            np.array(preds["shadda"]),
        )

if __name__ == "__main__":

    import yaml
    config_path = "configs/dd/config_d2.yaml"
    model_path = "models/tashkeela-d2.pt"
    with open(config_path, 'r', encoding="utf-8") as file:
        config = yaml.load(file, Loader=yaml.FullLoader)

    data_utils = DatasetUtils(config)
    vocab_size = len(data_utils.letter_list)
    word_embeddings = data_utils.embeddings

    model = DiacritizerD2(config, device='cpu')
    model.build(word_embeddings, vocab_size)
    model.load_state_dict(T.load(model_path, map_location=T.device('cpu'))["state_dict"])
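
Usage note (not part of the commit): both diacritizers rely on the same word-level dropout trick during training, where word ids are zeroed with probability sent_dropout_p so that the dropped positions fall back to the padding embedding and the model must lean on character context. A minimal, self-contained sketch with made-up shapes and probability:

import torch as T

p = 0.2                                 # stand-in for sent_dropout_p
sents = T.randint(1, 100, (2, 5))       # [b ts] word ids; id 0 is reserved for padding
keep = T.bernoulli(T.full(sents.shape, 1.0 - p)).long()
sents_do = sents * keep                 # dropped positions become id 0
word_embs = T.randn(100, 300)           # [vocab dw]; dw=300 matches the sent_lstm input size
wembs = word_embs[sents_do]             # [b ts 300]; dropped words look up the padding row
print(wembs.shape)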
model_partial.py
ADDED
@@ -0,0 +1,348 @@
from typing import NamedTuple
import yaml
from tqdm import tqdm
import numpy as np

import torch as T
from torch import nn
from torch.nn import functional as F
from diac_utils import flat_2_3head

from model_dd import DiacritizerD2

class Readout(nn.Module):
    def __init__(
        self,
        in_size: int,
        out_size: int,
    ):
        super().__init__()
        self.W1 = nn.Linear(in_size, in_size)
        self.W2 = nn.Linear(in_size, out_size)

    def forward(self, x: T.Tensor):
        z = self.W1(x)
        z = T.tanh(z)
        z = self.W2(x)
        return z

class WordDD_LSTM(nn.Module):
    def __init__(
        self,
        feature_size: int,
        num_classes: int = 13,
        return_logits: bool = True,
    ):
        super().__init__()
        self.feature_size = feature_size
        self.num_classes = num_classes
        self.return_logits = return_logits
        self.cell = nn.LSTM(feature_size)
        self.head = Readout(feature_size, num_classes)

    def forward(self, x: T.Tensor):
        #^ x: [b tc dc]
        z = self.cell(x)
        #^ z: [b tc @dc]
        y = self.head(z)
        #^ y: [b tc Classes]
        yhat = y
        if not self.return_logits:
            yhat = F.softmax(yhat, dim=1)
            #^ yhat: [b tc @Classes]
        return yhat

class PartialDiacOutput(NamedTuple):
    preds_hard: T.Tensor
    preds_ctxt_logit: T.Tensor
    preds_base_logit: T.Tensor


class PartialDD(nn.Module):
    def __init__(
        self,
        config: dict,
        # feature_size: int,
        # confidence_threshold: float,
        d2=False
    ):
        super().__init__()
        self._built = False
        self.no_diac_id = 0
        self._dummy = nn.Parameter(T.ones(1, 1))

        self.config = config
        self.sentence_diac = DiacritizerD2(self.config)
        self._use_d2 = d2  #< referenced by predict_partial

        self.eval()

    @property
    def device(self):
        return self._dummy.device

    @property
    def tokenizer(self):
        return self.sentence_diac.tokenizer

    def load_state_dict(
        self,
        state_dict: dict
    ):
        self.sentence_diac.load_state_dict(state_dict)

    def _slim_batch(
        self,
        toke_ids: T.Tensor,
        char_ids: T.Tensor,
        diac_ids: T.Tensor,
        subword_lengths: T.Tensor,
    ):
        #^ toke_ids: [b tt]
        #^ char_ids: [b tw tc]
        #^ diac_ids: [b tw tc "13"]
        #^ subword_lengths: [b tw]
        token_nonpad_mask = toke_ids.ne(self.tokenizer.pad_token_id)
        Ttoken = token_nonpad_mask.sum(1).max()
        toke_ids = toke_ids[:, :Ttoken]

        char_nonpad_mask = char_ids.ne(0)
        Tword = char_nonpad_mask.any(2).sum(1).max()
        Tchar = char_nonpad_mask.sum(2).max()
        char_ids = char_ids[:, :Tword, :Tchar]
        diac_ids = diac_ids[:, :Tword, :Tchar]
        subword_lengths = subword_lengths[:, :Tword]

        return toke_ids, char_ids, diac_ids, subword_lengths

    def word_diac(
        self,
        toke_ids: T.Tensor,
        char_ids: T.Tensor,
        diac_ids: T.Tensor,
        subword_lengths: T.Tensor,
        *,
        shape: tuple = None,
    ):
        if shape is None:
            toke_ids, char_ids, diac_ids, subword_lengths = self._slim_batch(
                toke_ids, char_ids, diac_ids, subword_lengths
            )
        else:
            Nb, Tw, Tc = shape
            toke_ids = toke_ids[:, :]
            char_ids = char_ids[:, :Tw, :Tc]
            diac_ids = diac_ids[:, :Tw, :Tc, :]
            subword_lengths = subword_lengths[:, :Tw]
        Nb, Tw, Tc = char_ids.shape
        # Tw = min(Tw, word_ids.shape[1])
        #^ toke_ids: [b tt]
        #^ char_ids: [b tw tc]
        # wids_flat = word_ids[:, Tw].reshape(Nb * Tw, 1)
        # cids_flat = char_ids[:, Tw].reshape(Nb * Tw, 1, Tc)
        # z = self.sentence_diac(wids_flat, cids_flat)

        sent_word_strides = subword_lengths.cumsum(1)
        assert tuple(subword_lengths.shape) == (Nb, Tw), f"{subword_lengths.shape} != {(Nb, Tw)=}"
        max_tokens_per_word: int = subword_lengths.max().int().item()
        word_x = T.zeros(Nb, Tw, max_tokens_per_word).to(toke_ids)
        for i_b in range(toke_ids.shape[0]):
            sent_i = toke_ids[i_b]
            start_iw = 0
            for i_word, end_iw in enumerate(sent_word_strides[i_b]):
                if end_iw == start_iw: break
                word = sent_i[start_iw:end_iw]
                word_x[i_b, i_word, 0 : end_iw - start_iw] = word
                start_iw = end_iw
        #^ word_x: [b tw tt]
        word_x = word_x.reshape(Nb * Tw, max_tokens_per_word)
        cids_flat = char_ids.reshape(Nb * Tw, 1, Tc)
        word_lengths = subword_lengths.reshape(Nb * Tw, 1)

        z = self.sentence_diac(
            word_x,
            cids_flat,
            diac_ids.reshape(Nb*Tw, Tc, -1),
            subword_lengths=word_lengths,
        )
        # Nc = z.shape[-1]
        #^ z: [b*tw, 1, tc, "13"]
        z = z.reshape(Nb, Tw, Tc, -1)
        return z

    def forward(
        self,
        word_ids: T.Tensor,
        char_ids: T.Tensor,
        _labels: T.Tensor,
        # ground_truth: T.Tensor,
        # padding_mask: T.BoolTensor,
        *,
        eval_only: str = None,
        subword_lengths: T.Tensor,
        return_extra: bool = False
    ):
        # assert self._built and not self.training
        assert not self.training
        #^ word_ids: [b tw]
        #^ char_ids: [b tw tc]
        #^ ground_truth: [b tw tc]

        padding_mask = char_ids.eq(0)
        #^ padding_mask: [b tw tc]

        if True or eval_only != 'base':
            y_ctxt = self.sentence_diac(
                word_ids,
                char_ids,
                _labels,
                subword_lengths=subword_lengths,
            )
            out_shape = y_ctxt.shape[:-1]
        else:
            out_shape = self.sentence_diac._slim_batch_size(
                word_ids,
                char_ids,
                _labels,
                subword_lengths,
            )[1].shape
        #^ y_ctxt: [b tw tc "13"]
        if eval_only == 'ctxt':
            return y_ctxt.argmax(-1)

        y_base = self.word_diac(
            word_ids,
            char_ids,
            _labels,
            subword_lengths,
            shape=out_shape
        )
        #^ y_base: [b tw tc "13"]
        if eval_only == 'base':
            return y_base.argmax(-1)

        ypred_ctxt = y_ctxt.argmax(-1)
        ypred_base = y_base.argmax(-1)
        #^ ypred: [b tw tc]

        # Maybe for eval
        # ypred_ctxt[~((ypred_base == ground_truth) & (~padding_mask))] = self.no_diac_id
        # return ypred_ctxt
        ypred_ctxt[(padding_mask) | (ypred_base == ypred_ctxt)] = self.no_diac_id
        if not return_extra:
            return ypred_ctxt
        else:
            return PartialDiacOutput(ypred_ctxt, y_ctxt, y_base)

    def step(self, xt, yt, mask=None):
        raise NotImplementedError
        xt[1] = xt[1].to(self.device)
        xt[2] = xt[2].to(self.device)

        yt = yt.to(self.device)
        #^ yt: [b ts tw]

        diac, _ = self(*xt)  # xt: (word_ids, char_ids, _labels)
        loss = self.closs(diac.view(-1, self.num_classes), yt.view(-1))

        return loss

    def predict_partial(
        self,
        dataloader,
        return_extra=False,
        eval_only: str = None,
    ):
        training = self.training
        self.eval()

        preds = {
            'haraka': [],
            'shadda': [],
            'tanween': [],
            'diacs': [],
            'y_ctxt': [],
            'y_base': [],
        }
        print("> Predicting...")
        for i_batch, (inputs, _, subword_lengths) in enumerate(tqdm(dataloader)):
            #^ inputs: [toke_ids, char_ids, diac_ids]
            inputs[0] = inputs[0].to(self.device)  #< toke_ids
            inputs[1] = inputs[1].to(self.device)  #< char_ids
            # inputs[2] = inputs[2].to(self.device)  #< diac_ids

            if self._use_d2:
                subword_lengths = T.ones_like(inputs[0])
                subword_lengths[inputs[0] == 0] = 0

            with T.no_grad():
                output = self(
                    *inputs,
                    subword_lengths=subword_lengths,
                    return_extra=return_extra,
                    eval_only=eval_only,
                )

            if return_extra:
                assert isinstance(output, PartialDiacOutput)
                marks = output.preds_hard
                preds['diacs'].extend(list(marks.detach().cpu().numpy()))
                preds['y_ctxt'].extend(list(output.preds_ctxt_logit.detach().cpu().numpy()))
                preds['y_base'].extend(list(output.preds_base_logit.detach().cpu().numpy()))
            else:
                assert isinstance(output, T.Tensor)
                marks = output
                preds['diacs'].extend(list(marks.detach().cpu().numpy()))
            #^ marks: [b ts tw]

            haraka, tanween, shadda = flat_2_3head(marks)

            preds['haraka'].extend(haraka)
            preds['tanween'].extend(tanween)
            preds['shadda'].extend(shadda)

        self.train(training)
        return {
            'diacritics': (
                #! FIXME! Due to batch slimming, output diacritics may need padding.
                np.array(preds['haraka']),
                np.array(preds["tanween"]),
                np.array(preds["shadda"]),
            ),
            'other': (  # Would be empty when !return_extra
                preds['y_ctxt'],
                preds['y_base'],
                preds['diacs'],
            )
        }

    def predict(self, dataloader):
        training = self.training
        self.eval()

        preds = {'haraka': [], 'shadda': [], 'tanween': []}
        print("> Predicting...")
        for inputs, _ in tqdm(dataloader, total=len(dataloader)):
            inputs[0] = inputs[0].to(self.device)
            inputs[1] = inputs[1].to(self.device)
            output = self(*inputs)

            marks = output
            #^ marks: [b ts tw]

            haraka, tanween, shadda = flat_2_3head(marks)

            preds['haraka'].extend(haraka)
            preds['tanween'].extend(tanween)
            preds['shadda'].extend(shadda)

        self.train(training)
        return (
            np.array(preds['haraka']),
            np.array(preds["tanween"]),
            np.array(preds["shadda"]),
        )
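
Usage note (not part of the commit): the heart of PartialDD.forward is the selection rule at the end, which keeps the sentence-level (contextual) prediction only where it disagrees with the word-level baseline and resets padded or agreeing positions to the no-diacritic class. A toy sketch with invented class ids:

import torch as T

no_diac_id = 0
ypred_ctxt = T.tensor([[2, 3, 1, 4]])                  # argmax of the contextual model
ypred_base = T.tensor([[2, 1, 1, 4]])                  # argmax of the word-only baseline
padding_mask = T.tensor([[False, False, False, True]])

partial = ypred_ctxt.clone()
partial[padding_mask | (ypred_base == ypred_ctxt)] = no_diac_id
print(partial)                                         # tensor([[0, 3, 0, 0]])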
predict.py
ADDED
@@ -0,0 +1,170 @@
from typing import Iterable, Union, Tuple
from collections import Counter

import argparse
import os

import yaml
from pyarabic.araby import tokenize, strip_tatweel
from tqdm import tqdm

import numpy as np
import torch as T
from torch.utils.data import DataLoader

from diac_utils import HARAKAT_MAP, shakkel_char, diac_ids_of_line
from model_partial import PartialDD
from data_utils import DatasetUtils
from dataloader import DataRetriever
from segment import segment

class Predictor:
    def __init__(self, config, text):

        self.data_utils = DatasetUtils(config)
        vocab_size = len(self.data_utils.letter_list)
        word_embeddings = self.data_utils.embeddings

        stride = config["segment"]["stride"]
        window = config["segment"]["window"]
        min_window = config["segment"]["min-window"]

        segments, mapping = segment([text], stride, window, min_window)

        mapping_lines = []
        for sent_idx, seg_idx, word_idx, char_idx in mapping:
            mapping_lines += [f"{sent_idx}, {seg_idx}, {word_idx}, {char_idx}"]

        self.mapping = self.data_utils.load_mapping_v3_from_list(mapping_lines)
        self.original_lines = [text]
        self.segments = segments

        self.device = T.device(
            config['predictor'].get('device', 'cuda:0')
            if T.cuda.is_available() else 'cpu'
        )

        self.model = PartialDD(config, d2=True)
        self.model.sentence_diac.build(word_embeddings, vocab_size)
        state_dict = T.load(config["paths"]["load"], map_location=T.device(self.device))['state_dict']
        self.model.load_state_dict(state_dict)
        self.model.to(self.device)
        self.model.eval()

        self.data_loader = DataLoader(
            DataRetriever(self.data_utils, segments),
            batch_size=config["predictor"].get("batch-size", 32),
            shuffle=False,
            num_workers=config['loader'].get('num-workers', 0),
        )

class PredictTri(Predictor):
    def __init__(self, config, text):
        super().__init__(config, text)
        self.diacritics = {
            "FATHA": 1,
            "KASRA": 2,
            "DAMMA": 3,
            "SUKUN": 4
        }
        self.votes: Union[Counter[int], Counter[bool]] = Counter()

    def count_votes(
        self,
        things: Union[Iterable[int], Iterable[bool]]
    ):
        self.votes.clear()
        self.votes.update(things)
        return self.votes.most_common(1)[0][0]

    def predict_majority_vote(self):
        y_gen_diac, y_gen_tanween, y_gen_shadda = self.model.predict(self.data_loader)
        diacritized_lines, _ = self.coalesce_votes_by_majority(y_gen_diac, y_gen_tanween, y_gen_shadda)
        return diacritized_lines

    def predict_majority_vote_context_contrastive(self, overwrite_cache=False):
        assert isinstance(self.model, PartialDD)
        if not os.path.exists("dataset/cache/y_gen_diac.npy") or overwrite_cache:
            if not os.path.exists("dataset/cache"):
                os.mkdir("dataset/cache")
            # segment_outputs = self.model.predict_partial(self.data_loader, return_extra=True)
            segment_outputs = self.model.predict_partial(self.data_loader, return_extra=False, eval_only='ctxt')
            T.save(segment_outputs, "dataset/cache/cache.pt")
        else:
            segment_outputs = T.load("dataset/cache/cache.pt")

        y_gen_diac, y_gen_tanween, y_gen_shadda = segment_outputs['diacritics']
        diacritized_lines, extra_for_lines = self.coalesce_votes_by_majority(
            y_gen_diac, y_gen_tanween, y_gen_shadda,
        )
        extra_out = {
            'line_data': {
                **extra_for_lines,
            },
            'segment_data': {
                **segment_outputs,
                # 'logits': segment_outputs['logits'],
            }
        }
        return diacritized_lines, extra_out

    def coalesce_votes_by_majority(
        self,
        y_gen_diac: np.ndarray,
        y_gen_tanween: np.ndarray,
        y_gen_shadda: np.ndarray,
    ):
        prepped_lines_og = [' '.join(tokenize(strip_tatweel(line))) for line in self.original_lines]
        max_line_chars = max(len(line) for line in prepped_lines_og)
        diacritics_pred = np.full((len(self.original_lines), max_line_chars), fill_value=-1, dtype=int)

        count_processed_sents = 0
        do_break = False
        diacritized_lines = []
        for sent_idx, line in enumerate(tqdm(prepped_lines_og)):
            count_processed_sents = sent_idx + 1
            line = line.strip()
            diacritized_line = ""
            for char_idx, char in enumerate(line):
                diacritized_line += char
                char_vote_diacritic = []
                # ? This is the voting part
                if sent_idx not in self.mapping:
                    continue

                mapping_s_i = self.mapping[sent_idx]
                for seg_idx in mapping_s_i:
                    if self.data_utils.debug and seg_idx >= 256:
                        do_break = True
                        break

                    mapping_g_i = mapping_s_i[seg_idx]
                    for t_idx in mapping_g_i:

                        mapping_t_i = mapping_g_i[t_idx]
                        if char_idx in mapping_t_i:
                            c_idx = mapping_t_i.index(char_idx)
                            output_idx = np.s_[seg_idx, t_idx, c_idx]
                            diac_h3 = (y_gen_diac[output_idx], y_gen_tanween[output_idx], y_gen_shadda[output_idx])
                            diac_char_i = HARAKAT_MAP.index(diac_h3)
                            if c_idx < 13 and diac_char_i != 0:
                                char_vote_diacritic.append(diac_char_i)

                if do_break:
                    break
                if len(char_vote_diacritic) > 0:
                    char_mv_diac = self.count_votes(char_vote_diacritic)
                    diacritized_line += shakkel_char(*HARAKAT_MAP[char_mv_diac])
                    diacritics_pred[sent_idx, char_idx] = char_mv_diac
                else:
                    diacritics_pred[sent_idx, char_idx] = 0
            if do_break:
                break

            diacritized_lines += [diacritized_line.strip()]

        print(f'[INFO] Cutting stats from {len(diacritics_pred)} to {count_processed_sents}')
        extra = {
            'diac_pred': diacritics_pred[:count_processed_sents],
        }
        return diacritized_lines, extra
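
Usage note (not part of the commit): because segments overlap, several windows can predict a diacritic for the same character; coalesce_votes_by_majority resolves each character with the same Counter-based majority vote as count_votes. A toy illustration with hypothetical class ids:

from collections import Counter

votes_per_char = [[3], [3, 3, 1], [1, 2]]   # votes gathered from overlapping segments
resolved = [Counter(v).most_common(1)[0][0] for v in votes_per_char if v]
print(resolved)                             # [3, 3, 1]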
segment.py
ADDED
@@ -0,0 +1,89 @@
import argparse
import yaml
import os
import pickle as pkl

from tqdm import tqdm
from pyarabic.araby import tokenize, strip_tashkeel, strip_tatweel

def export(path, text):
    with open(path, 'w', encoding="utf-8") as fout:
        fout.write('\n'.join(text))

def segment(lines, stride, window_sz, min_window_sz):
    segments, mapping = [], []
    real_seg_idx = 0

    for sent_idx, line in tqdm(enumerate(lines), total=len(lines)):
        line: str = strip_tatweel(line)
        line = line.strip()
        tokens = tokenize(line)
        if len(tokens) == 0: continue
        if tokens[-1] == '\n': tokens = tokens[:-1]
        seg_idx, idx = 0, 0
        while idx < len(tokens):
            window = tokens[idx:idx+window_sz]
            if window_sz == -1: window = tokens
            if len(window) < min_window_sz and seg_idx != 0: break

            segment = ' '.join(window)
            segments += [segment]
            char_offset = len(strip_tashkeel(' '.join(tokens[:idx])))

            if seg_idx > 0:
                char_offset += 1

            seg_tokens = tokenize(strip_tashkeel(segment))

            j = 0
            for st_idx, st in enumerate(seg_tokens):
                for _ in range(len(st)):
                    mapping += [(sent_idx, real_seg_idx, st_idx, j+char_offset)]
                    j += 1
                j += 1

            real_seg_idx += 1
            seg_idx += 1

            if stride == -1: break

            idx += (window_sz if stride >= window_sz else stride)

    return segments, mapping

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Sentence Breaker')
    parser.add_argument('-c', '--config', type=str,
                        default="config.yaml", help='Run Configs')
    parser.add_argument('-d', '--data_dir', type=str,
                        default=None, help='Override for data path')
    args = parser.parse_args()

    with open(args.config, 'r', encoding="utf-8") as file:
        config = yaml.load(file, Loader=yaml.FullLoader)

    BASE_PATH = args.data_dir or config["paths"].get("base")

    stride = config["segment"]["stride"]
    window = config["segment"]["window"]
    min_window = config["segment"]["min-window"]
    export_map = config["segment"]["export-map"]

    for fpath in tqdm(config["segment"]["files"]):
        FILE_PATH = os.path.join(BASE_PATH, fpath)
        SAVE_PATH = os.path.join(BASE_PATH, fpath[:-4] + f"-{stride}-{window}.txt")
        MAP_PATH = os.path.join(BASE_PATH, fpath[:-4] + f"-{stride}-{window}.map")

        with open(FILE_PATH, 'r', encoding="utf-8") as fin:
            lines = fin.readlines()

        segments, mapping = segment(lines, stride, window, min_window)

        with open(SAVE_PATH, 'w', encoding="utf-8") as fout:
            fout.write('\n'.join(segments))

        if not export_map: continue

        with open(MAP_PATH, 'w', encoding="utf-8") as fout:
            for sent_idx, seg_idx, word_idx, char_idx in mapping:
                fout.write(f"{sent_idx}, {seg_idx}, {word_idx}, {char_idx}\n")
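
Usage note (not part of the commit): a hypothetical call to segment(); the stride/window values and the Latin example text are only for illustration, the pipeline applies it to Arabic lines read from the configured files.

from segment import segment

lines = ["this is a small example line"]
segments, mapping = segment(lines, stride=2, window_sz=3, min_window_sz=1)
print(segments)       # expected: ['this is a', 'a small example', 'example line']
print(mapping[:3])    # (sent_idx, seg_idx, word_idx, char_idx) for the first characters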