RMSnow's picture
add backend inference and inferface output
0883aa1
from typing import Optional
import torch
from torch import nn
from modules.wenet_extractor.utils.common import get_activation
class TransducerJoint(torch.nn.Module):
    """Joint network that fuses encoder and predictor outputs.

    The two inputs are optionally projected to ``join_dim``, broadcast
    against each other and added, optionally passed through one more
    ``join_dim -> join_dim`` linear layer, then through the activation and
    a final linear layer that maps ``join_dim`` to ``voca_size``.
    """

    def __init__(
        self,
        voca_size: int,
        enc_output_size: int,
        pred_output_size: int,
        join_dim: int,
        prejoin_linear: bool = True,
        postjoin_linear: bool = False,
        joint_mode: str = "add",
        activation: str = "tanh",
    ):
        """Build the joint network.

        Args:
            voca_size: Output size of the final linear layer.
            enc_output_size: Feature size of the encoder output.
            pred_output_size: Feature size of the predictor output.
            join_dim: Feature size in which the two inputs are added.
            prejoin_linear: If True, project both inputs to ``join_dim``
                before adding them.
            postjoin_linear: If True, apply a ``join_dim -> join_dim``
                linear layer to the sum before the activation.
            joint_mode: How the inputs are combined; only ``"add"`` is
                accepted.
            activation: Activation name handed to ``get_activation``.

        Raises:
            AssertionError: If ``joint_mode`` is not ``"add"``, or if both
                linear options are disabled while the three feature sizes
                are not all equal.
        """
        # TODO(Mddct): concat in future
        assert joint_mode in ("add",)
        super().__init__()

        # NOTE: the attribute name is misspelled, but forward() reads it
        # under this exact name, so it is kept unchanged.
        self.activatoin = get_activation(activation)

        self.prejoin_linear = prejoin_linear
        self.postjoin_linear = postjoin_linear
        self.joint_mode = joint_mode

        # With no projection anywhere, the raw inputs are added directly
        # and fed to ffn_out, so all three sizes have to agree.
        if not (prejoin_linear or postjoin_linear):
            assert enc_output_size == pred_output_size == join_dim

        # torchscript compatibility: the optional layers always exist as
        # attributes and hold None when they are disabled.
        self.enc_ffn: Optional[nn.Linear] = None
        self.pred_ffn: Optional[nn.Linear] = None
        self.post_ffn: Optional[nn.Linear] = None

        if prejoin_linear:
            self.enc_ffn = nn.Linear(enc_output_size, join_dim)
            self.pred_ffn = nn.Linear(pred_output_size, join_dim)

        if postjoin_linear:
            self.post_ffn = nn.Linear(join_dim, join_dim)

        self.ffn_out = nn.Linear(join_dim, voca_size)

    def forward(self, enc_out: torch.Tensor, pred_out: torch.Tensor):
        """Combine encoder and predictor outputs into joint outputs.

        Args:
            enc_out (torch.Tensor): [B, T, E]
            pred_out (torch.Tensor): [B, U, P]

        Return:
            torch.Tensor: [B, T, U, V], where V is ``voca_size``.
        """
        enc_proj = enc_out
        pred_proj = pred_out
        if (
            self.prejoin_linear
            and self.enc_ffn is not None
            and self.pred_ffn is not None
        ):
            # Bring both inputs to the shared join_dim (D).
            enc_proj = self.enc_ffn(enc_out)  # [B,T,E] -> [B,T,D]
            pred_proj = self.pred_ffn(pred_out)  # [B,U,P] -> [B,U,D]

        # TODO(Mddct): concat joint
        # joint_mode is read but not otherwise used: only "add" exists.
        _ = self.joint_mode

        # [B,T,1,D] + [B,1,U,D] broadcasts to [B,T,U,D].
        joined = enc_proj.unsqueeze(2) + pred_proj.unsqueeze(1)

        if self.postjoin_linear and self.post_ffn is not None:
            joined = self.post_ffn(joined)

        # Activation, then projection to the output size: [B,T,U,V].
        return self.ffn_out(self.activatoin(joined))