# uetasr/model.py
import os
from functools import lru_cache
from typing import Optional, Union

# Force CPU-only inference; CUDA_VISIBLE_DEVICES should be set before TensorFlow
# initialises its devices.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import tensorflow as tf
from huggingface_hub import hf_hub_download
from hyperpyyaml import load_hyperpyyaml

from decode import get_searcher


def _get_checkpoint_filename(
repo_id: str,
filename: str,
    local_dir: Optional[str] = None,
local_dir_use_symlinks: Union[bool, str] = "auto",
subfolder: str = "checkpoints"
) -> str:
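    """Download a checkpoint file from the Hugging Face Hub and return its local path."""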
model_filename = hf_hub_download(
repo_id=repo_id,
filename=filename,
subfolder=subfolder,
local_dir=local_dir,
local_dir_use_symlinks=local_dir_use_symlinks,
)
    return model_filename


def _get_bpe_model_filename(
repo_id: str,
filename: str,
    local_dir: Optional[str] = None,
local_dir_use_symlinks: Union[bool, str] = "auto",
subfolder: str = "vocabs"
) -> str:
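    """Download a subword vocabulary file from the Hub repo and return its local path."""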
bpe_model_filename = hf_hub_download(
repo_id=repo_id,
filename=filename,
subfolder=subfolder,
local_dir=local_dir,
local_dir_use_symlinks=local_dir_use_symlinks,
)
    return bpe_model_filename


@lru_cache(maxsize=1)
def _get_conformer_pre_trained_model(
    repo_id: str,
    checkpoint_dir: str = "checkpoints"
):
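    """Download the pretrained Conformer transducer (checkpoint, subword model and
    HyperPyYAML config) from the Hugging Face Hub, build its sub-modules and
    restore the averaged checkpoint weights.
    """
    # A TensorFlow checkpoint consists of an index file and a data shard;
    # fetch both next to this module.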
for postfix in ["index", "data-00000-of-00001"]:
tmp = _get_checkpoint_filename(
repo_id=repo_id,
filename="avg_top5_27-32.ckpt.{}".format(postfix),
subfolder=checkpoint_dir,
local_dir=os.path.dirname(__file__), # noqa
local_dir_use_symlinks=True,
)
print(tmp)
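    # Fetch the subword model and vocabulary files used by the text encoder.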
for postfix in ["model", "vocab"]:
tmp = _get_bpe_model_filename(
repo_id=repo_id,
filename="subword_vietnamese_500.{}".format(postfix),
local_dir=os.path.dirname(__file__), # noqa
local_dir_use_symlinks=True,
)
print(tmp)
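    # Download the HyperPyYAML config; loading it instantiates every sub-module
    # (featurizer, encoder, decoder, jointer, text encoder) and the full model.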
config_path = hf_hub_download(
repo_id=repo_id,
filename="config.yaml",
local_dir=os.path.dirname(__file__), # noqa
local_dir_use_symlinks=True,
)
with open(config_path, "r") as f:
config = load_hyperpyyaml(f)
encoder_model = config["encoder_model"]
text_encoder = config["text_encoder"]
jointer = config["jointer_model"]
decoder = config["decoder_model"]
# searcher = config["decoder"]
model = config["model"]
audio_encoder = config["audio_encoder"]
    # Restore the averaged checkpoint from where it was downloaded, next to this module.
    ckpt_prefix = os.path.join(
        os.path.dirname(__file__), checkpoint_dir, "avg_top5_27-32.ckpt"
    )
    model.load_weights(ckpt_prefix).expect_partial()
    return audio_encoder, encoder_model, jointer, decoder, text_encoder, model


def read_audio(in_filename: str):
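    """Decode a mono WAV file into a float waveform of shape (1, num_samples)."""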
audio = tf.io.read_file(in_filename)
audio = tf.audio.decode_wav(audio)[0]
audio = tf.expand_dims(tf.squeeze(audio, axis=-1), axis=0)
    return audio


class UETASRModel:
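    """Pretrained UETASR transducer bundled with a label searcher for
    single-utterance transcription of WAV files.
    """
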
def __init__(
self,
repo_id: str,
decoding_method: str,
beam_size: int,
max_symbols_per_step: int,
):
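        # Download (or reuse the cached) pretrained sub-modules, then build the
        # searcher that decodes labels from the encoder outputs.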
        (self.featurizer, self.encoder_model, jointer, decoder, text_encoder,
         self.model) = _get_conformer_pre_trained_model(repo_id)
self.searcher = get_searcher(
decoding_method,
decoder,
jointer,
text_encoder,
beam_size,
max_symbols_per_step,
        )

    def predict(self, in_filename: str):
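        """Transcribe a single WAV file and return the decoded text."""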
inputs = read_audio(in_filename)
features = self.featurizer(inputs)
features = self.model.cmvn(features) if self.model.use_cmvn else features
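        # Padding mask over the whole utterance, shaped (batch, 1, num_frames).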
        num_frames = tf.shape(features)[1]
        mask = tf.sequence_mask([num_frames], maxlen=num_frames)
        mask = tf.expand_dims(mask, axis=1)
encoder_outputs, encoder_masks = self.encoder_model(
features, mask, training=False)
encoder_mask = tf.squeeze(encoder_masks, axis=1)
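        # Number of valid encoder frames per utterance, as expected by the searcher.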
features_length = tf.math.reduce_sum(
tf.cast(encoder_mask, tf.int32),
axis=1
)
outputs = self.searcher.infer(encoder_outputs, features_length)
outputs = tf.squeeze(outputs)
outputs = tf.compat.as_str_any(outputs.numpy())
return outputs
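

if __name__ == "__main__":
    # Minimal usage sketch. The repo id, decoding method and audio file below are
    # placeholders rather than values shipped with this module; substitute a real
    # UETASR checkpoint repo on the Hugging Face Hub and a mono WAV recording.
    asr = UETASRModel(
        repo_id="thanhtvt/uetasr-conformer",  # hypothetical repo id
        decoding_method="beam_search",        # must be a method supported by decode.get_searcher
        beam_size=5,
        max_symbols_per_step=5,
    )
    print(asr.predict("sample.wav"))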