sanchit-gandhi HF staff commited on Sep 30, 2022

Commit

6e60fe4

•

1 Parent(s): 080ad7c

Add scripts and weights

Browse files

Files changed (17) hide show

README.md +39 -0
config.json +109 -0
flax_model.msgpack +3 -0
get_ctc_tokenizer.py +102 -0
models/__init__.py +6 -0
models/configuration_bart.py +183 -0
models/configuration_speech_encoder_decoder.py +121 -0
models/configuration_wav2vec2.py +344 -0
models/modeling_flax_bart.py +816 -0
models/modeling_flax_speech_encoder_decoder.py +1245 -0
models/modeling_flax_wav2vec2.py +975 -0
preprocessor_config.json +10 -0
run_flax_speech_recognition_ctc.py +1398 -0
run_tedlium.sh +27 -0
special_tokens_map.json +6 -0
tokenizer_config.json +12 -0
vocab.json +34 -0

README.md ADDED Viewed

	@@ -0,0 +1,39 @@

+---
+language:
+- en
+tags:
+- esc
+datasets:
+- tedlium
+---
+To reproduce this run, first call `get_ctc_tokenizer.py` to train the CTC tokenizer and then execute the following command to train the CTC system:
+```python
+#!/usr/bin/env bash
+python run_flax_speech_recognition_ctc.py \
+        --model_name_or_path="esc/wav2vec2-pretrained" \
+        --tokenizer_name="wav2vec2-ctc-tedlium-tokenizer" \
+        --dataset_name="esc/esc-datasets" \
+        --dataset_config_name="tedlium" \
+        --output_dir="./" \
+        --wandb_project="wav2vec2-ctc" \
+        --wandb_name="wav2vec2-ctc-tedlium" \
+        --max_steps="50000" \
+        --save_steps="10000" \
+        --eval_steps="10000" \
+        --learning_rate="3e-4" \
+        --logging_steps="25" \
+        --warmup_steps="5000" \
+        --preprocessing_num_workers="1" \
+        --hidden_dropout="0.2" \
+        --activation_dropout="0.2" \
+        --feat_proj_dropout="0.2" \
+        --do_train \
+        --do_eval \
+        --do_predict \
+        --overwrite_output_dir \
+        --gradient_checkpointing \
+        --freeze_feature_encoder \
+        --push_to_hub \
+        --use_auth_token
+```

config.json ADDED Viewed

	@@ -0,0 +1,109 @@

+{
+  "activation_dropout": 0.2,
+  "adapter_kernel_size": 3,
+  "adapter_stride": 2,
+  "add_adapter": false,
+  "apply_spec_augment": true,
+  "architectures": [
+    "Wav2Vec2ForCTC"
+  ],
+  "attention_dropout": 0.1,
+  "bos_token_id": 1,
+  "classifier_proj_size": 256,
+  "codevector_dim": 768,
+  "contrastive_logits_temperature": 0.1,
+  "conv_bias": true,
+  "conv_dim": [
+    512,
+    512,
+    512,
+    512,
+    512,
+    512,
+    512
+  ],
+  "conv_kernel": [
+    10,
+    3,
+    3,
+    3,
+    3,
+    2,
+    2
+  ],
+  "conv_stride": [
+    5,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2
+  ],
+  "ctc_loss_reduction": "sum",
+  "ctc_zero_infinity": false,
+  "diversity_loss_weight": 0.1,
+  "do_stable_layer_norm": true,
+  "eos_token_id": 2,
+  "feat_extract_activation": "gelu",
+  "feat_extract_dropout": 0.0,
+  "feat_extract_norm": "layer",
+  "feat_proj_dropout": 0.2,
+  "feat_quantizer_dropout": 0.0,
+  "final_dropout": 0.0,
+  "fuse_matmuls": false,
+  "gradient_checkpointing": true,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.2,
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "layerdrop": 0.0,
+  "mask_feature_length": 10,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0.0,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.1,
+  "model_type": "wav2vec2",
+  "num_adapter_layers": 3,
+  "num_attention_heads": 16,
+  "num_codevector_groups": 2,
+  "num_codevectors_per_group": 320,
+  "num_conv_pos_embedding_groups": 16,
+  "num_conv_pos_embeddings": 128,
+  "num_feat_extract_layers": 7,
+  "num_hidden_layers": 24,
+  "num_negatives": 100,
+  "output_hidden_size": 1024,
+  "pad_token_id": 0,
+  "proj_codevector_dim": 768,
+  "tdnn_dilation": [
+    1,
+    2,
+    3,
+    1,
+    1
+  ],
+  "tdnn_dim": [
+    512,
+    512,
+    512,
+    512,
+    1500
+  ],
+  "tdnn_kernel": [
+    5,
+    3,
+    3,
+    1,
+    1
+  ],
+  "transformers_version": "4.21.0.dev0",
+  "use_scan": true,
+  "use_weighted_layer_sum": false,
+  "vocab_size": 32,
+  "xvector_output_dim": 512
+}

flax_model.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fcfeb02537a755eff6e041ef612a85e67c80ccc473dfe55d781b92d643f6d724
+size 1261888150

get_ctc_tokenizer.py ADDED Viewed

	@@ -0,0 +1,102 @@

+#!/usr/bin/env python3
+from datasets import load_dataset
+from collections import Counter
+import json
+import os
+import tempfile
+from transformers import Wav2Vec2CTCTokenizer
+# which dataset
+dataset_name = "tedlium"
+# which split -> we should only use train to train our tokenizer
+split = "train"
+# in case the dataset requires access
+use_auth_token = True
+# name of tok to upload to the Hub
+tokenizer_name = f"wav2vec2-ctc-{dataset_name}-tokenizer"
+# FIX the cutoff freq for all datasets -> an entirely dataset-agnostic approach
+cutoff_freq = 0.01
+dataset = load_dataset(
+    "esc/esc-datasets",
+    dataset_name,
+    split=split,
+    use_auth_token=use_auth_token,
+)
+# remove all data that is unnecessary to save RAM
+dataset = dataset.remove_columns(list(set(dataset.column_names) - {"text"}))
+# define function to see stats about letters and to create vocab
+def create_vocabulary_from_data(dataset, word_delimiter_token="|", cutoff_freq=0.0):
+    def extract_all_chars(batch):
+        all_text = " ".join(batch["text"])
+        count_chars_dict = Counter(list(all_text))
+        # sort by freq
+        count_chars_dict = sorted(count_chars_dict.items(), key=lambda item: (-item[1], item[0]))
+        # retrieve dict, freq
+        vocab, freqs = zip(*count_chars_dict)
+        return {"vocab": list(vocab), "freqs": list(freqs)}
+    dataset = dataset.map(
+        extract_all_chars,
+        batched=True,
+        batch_size=-1,
+        remove_columns=dataset.column_names,
+    )
+    vocab, freqs = dataset["vocab"], dataset["freqs"]
+    total_num_chars = sum(freqs)
+    chars_to_remove = []
+    print("Character Occurences")
+    print(f"Total characters in dataset: {total_num_chars}")
+    print(50 * "-")
+    print(f"{'Char'.rjust(5)} | {'Total occ'.rjust(10)} | {'% of total occ'.rjust(20)} |")
+    print(50 * "-")
+    for char, freq in zip(vocab, freqs):
+        freq_in_percent = freq / total_num_chars * 100
+        print(f"{char.rjust(5)} | {str(freq).rjust(10)} | {str(round(freq_in_percent, 3)).rjust(20)} |")
+        if freq_in_percent < cutoff_freq:
+            chars_to_remove.append(char)
+    print(50 * "-")
+    vocab = list(set(vocab) - set(chars_to_remove))
+    # Wav2Vec2CTC Tokenizers always have those as the first tokens (important for CTC)
+    vocab = ["<pad>", "<s>", "</s>", "<unk>"] + vocab
+    # create json dict
+    vocab_dict = {v: k for k, v in enumerate(list(vocab))}
+    # replace white space with delimiter token
+    if word_delimiter_token is not None:
+        vocab_dict[word_delimiter_token] = vocab_dict[" "]
+        del vocab_dict[" "]
+    return vocab_dict
+# Note that the functions accepts the following important args
+# --cutoff_freq
+# => This is very important! Lots of datasets will contain "wrong" characters in the training set, e.g.
+# characters that just occur a couple of times.
+# By default, the CTC vocab creation would just add them to the vocab even if their occurance is neglectible # compared to the "super frequent" letters. We can see such characters as "errors" or irrelevant in the
+# dataset, so that we should delete them from the vocab. During training, they would then just be classified
+# unknown <unk> tokens which the model can handle.
+# In this script, we deploy a mechanism to remove all chars whose freq in % is below a certain threshold.
+# We FIX this threshold for all datasets (i.e. dataset-agnostic)
+vocab_dict = create_vocabulary_from_data(dataset, cutoff_freq=cutoff_freq)
+# save vocab dict to be loaded into tokenizer
+with tempfile.TemporaryDirectory() as tmp:
+    with open(os.path.join(tmp, "vocab.json"), "w") as file:
+        json.dump(vocab_dict, file)
+    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(tmp)
+# push tokenizer to the Hub
+tokenizer.push_to_hub(tokenizer_name)

models/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from models.configuration_bart import BartConfig
+from models.configuration_wav2vec2 import Wav2Vec2Config
+from models.configuration_speech_encoder_decoder import SpeechEncoderDecoderConfig
+from models.modeling_flax_wav2vec2 import FlaxWav2Vec2Model, FlaxWav2Vec2Module, FlaxWav2Vec2ForCTC, FlaxWav2Vec2ForCTCModule
+from models.modeling_flax_bart import FlaxBartForCausalLM, FlaxBartForCausalLMModule
+from models.modeling_flax_speech_encoder_decoder import FlaxSpeechEncoderDecoderModel

models/configuration_bart.py ADDED Viewed

	@@ -0,0 +1,183 @@

+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" BART model configuration"""
+import warnings
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+BART_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/config.json",
+    # See all BART models at https://huggingface.co/models?filter=bart
+}
+class BartConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`BartModel`]. It is used to instantiate a BART
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the BART
+    [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50265):
+            Vocabulary size of the BART model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`BartModel`] or [`TFBartModel`].
+        d_model (`int`, *optional*, defaults to 1024):
+            Dimensionality of the layers and the pooler layer.
+        encoder_layers (`int`, *optional*, defaults to 12):
+            Number of encoder layers.
+        decoder_layers (`int`, *optional*, defaults to 12):
+            Number of decoder layers.
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for classifier.
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        encoder_layerdrop: (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
+            for more details.
+        decoder_layerdrop: (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
+            for more details.
+        scale_embedding (`bool`, *optional*, defaults to `False`):
+            Scale embeddings by diving by sqrt(d_model).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        num_labels: (`int`, *optional*, defaults to 3):
+            The number of labels to use in [`BartForSequenceClassification`].
+        forced_eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+            `eos_token_id`.
+        use_scan (`bool`, *optional*, defaults to `False`):
+            Whether or not to use nn.scan in the Flax Bart attention layers.
+    Example:
+    ```python
+    >>> from transformers import BartModel, BartConfig
+    >>> # Initializing a BART facebook/bart-large style configuration
+    >>> configuration = BartConfig()
+    >>> # Initializing a model from the facebook/bart-large style configuration
+    >>> model = BartModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "bart"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
+    def __init__(
+        self,
+        vocab_size=50265,
+        max_position_embeddings=1024,
+        encoder_layers=12,
+        encoder_ffn_dim=4096,
+        encoder_attention_heads=16,
+        decoder_layers=12,
+        decoder_ffn_dim=4096,
+        decoder_attention_heads=16,
+        encoder_layerdrop=0.0,
+        decoder_layerdrop=0.0,
+        activation_function="gelu",
+        d_model=1024,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        classifier_dropout=0.0,
+        scale_embedding=False,
+        use_cache=True,
+        use_scan=False,
+        fuse_matmuls=False,
+        num_labels=3,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        is_encoder_decoder=True,
+        decoder_start_token_id=2,
+        forced_eos_token_id=2,
+        **kwargs
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.classifier_dropout = classifier_dropout
+        self.use_cache = use_cache
+        self.use_scan = use_scan
+        self.fuse_matmuls = fuse_matmuls
+        self.num_hidden_layers = encoder_layers
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+        super().__init__(
+            num_labels=num_labels,
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            forced_eos_token_id=forced_eos_token_id,
+            **kwargs,
+        )
+        # ensure backward compatibility for BART CNN models
+        if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
+            self.forced_bos_token_id = self.bos_token_id
+            warnings.warn(
+                f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
+                "The config can simply be saved and uploaded again to be fixed."
+            )

models/configuration_speech_encoder_decoder.py ADDED Viewed

	@@ -0,0 +1,121 @@

+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+from models.configuration_wav2vec2 import Wav2Vec2Config
+from models.configuration_bart import BartConfig
+from transformers import AutoConfig
+logger = logging.get_logger(__name__)
+class SpeechEncoderDecoderConfig(PretrainedConfig):
+    r"""
+    [`SpeechEncoderDecoderConfig`] is the configuration class to store the configuration of a
+    [`SpeechEncoderDecoderModel`]. It is used to instantiate an Encoder Decoder model according to the specified
+    arguments, defining the encoder and decoder configs.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        kwargs (*optional*):
+            Dictionary of keyword arguments. Notably:
+                - **encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines
+                  the encoder config.
+                - **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines
+                  the decoder config.
+    Examples:
+    ```python
+    >>> from transformers import BertConfig, Wav2Vec2Config, SpeechEncoderDecoderConfig, SpeechEncoderDecoderModel
+    >>> # Initializing a Wav2Vec2 & BERT style configuration
+    >>> config_encoder = Wav2Vec2Config()
+    >>> config_decoder = BertConfig()
+    >>> config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
+    >>> # Initializing a Wav2Vec2Bert model from a Wav2Vec2 & bert-base-uncased style configurations
+    >>> model = SpeechEncoderDecoderModel(config=config)
+    >>> # Accessing the model configuration
+    >>> config_encoder = model.config.encoder
+    >>> config_decoder = model.config.decoder
+    >>> # set decoder config to causal lm
+    >>> config_decoder.is_decoder = True
+    >>> config_decoder.add_cross_attention = True
+    >>> # Saving the model, including its configuration
+    >>> model.save_pretrained("my-model")
+    >>> # loading model and config from pretrained folder
+    >>> encoder_decoder_config = SpeechEncoderDecoderConfig.from_pretrained("my-model")
+    >>> model = SpeechEncoderDecoderModel.from_pretrained("my-model", config=encoder_decoder_config)
+    ```"""
+    model_type = "speech-encoder-decoder"
+    is_composition = True
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        if "encoder" not in kwargs or "decoder" not in kwargs:
+            raise ValueError(
+                f"A configuraton of type {self.model_type} cannot be instantiated because not both `encoder` and `decoder` sub-configurations are passed, but only {kwargs}"
+            )
+        encoder_config = kwargs.pop("encoder")
+        decoder_config = kwargs.pop("decoder")
+        # TODO: Load configs from AutoConfig (as done in Transformers 🤗)
+        self.encoder = Wav2Vec2Config(**encoder_config)
+        self.decoder = BartConfig(**decoder_config)
+        self.is_encoder_decoder = True
+    @classmethod
+    def from_encoder_decoder_configs(
+        cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
+    ) -> PretrainedConfig:
+        r"""
+        Instantiate a [`SpeechEncoderDecoderConfig`] (or a derived class) from a pre-trained encoder model
+        configuration and decoder model configuration.
+        Returns:
+            [`SpeechEncoderDecoderConfig`]: An instance of a configuration object
+        """
+        logger.info("Setting `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
+        decoder_config.is_decoder = True
+        decoder_config.add_cross_attention = True
+        return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict(), **kwargs)
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default *to_dict()* from *PretrainedConfig*.
+        Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        output = copy.deepcopy(self.__dict__)
+        output["encoder"] = self.encoder.to_dict()
+        output["decoder"] = self.decoder.to_dict()
+        output["model_type"] = self.__class__.model_type
+        return output

models/configuration_wav2vec2.py ADDED Viewed

	@@ -0,0 +1,344 @@

+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Wav2Vec2 model configuration"""
+import functools
+import operator
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/config.json",
+    # See all Wav2Vec2 models at https://huggingface.co/models?filter=wav2vec2
+}
+class Wav2Vec2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Wav2Vec2Model`]. It is used to instantiate an
+    Wav2Vec2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Wav2Vec2
+    [facebook/wav2vec2-base-960h](https://huggingface.co/facebook/wav2vec2-base-960h) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32):
+            Vocabulary size of the Wav2Vec2 model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`Wav2Vec2Model`] or [`TFWav2Vec2Model`]. Vocabulary size of the
+            model. Defines the different tokens that can be represented by the *inputs_ids* passed to the forward
+            method of [`Wav2Vec2Model`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        final_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the final projection layer of [`Wav2Vec2ForCTC`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        feat_extract_norm (`str`, *optional*, defaults to `"group"`):
+            The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
+            normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
+            convolutional layers.
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for output of the feature encoder.
+        feat_extract_activation (`str, `optional`, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the 1D convolutional layers of the feature
+            extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probabilitiy for quantized feature encoder states.
+        conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
+            A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
+            feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
+        conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
+            A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+        conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+            *conv_dim*.
+        conv_bias (`bool`, *optional*, defaults to `False`):
+            Whether the 1D convolutional layers have a bias.
+        num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
+            Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
+            embeddings layer.
+        num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
+            Number of groups of 1D convolutional positional embeddings layer.
+        do_stable_layer_norm (`bool`, *optional*, defaults to `False`):
+            Whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is
+            True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
+            False` corresponds to applying layer norm after the attention layer.
+        apply_spec_augment (`bool`, *optional*, defaults to `True`):
+            Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
+            [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
+            Recognition](https://arxiv.org/abs/1904.08779).
+        mask_time_prob (`float`, *optional*, defaults to 0.05):
+            Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
+            procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
+            reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
+            masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
+            actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_time_length (`int`, *optional*, defaults to 10):
+            Length of vector span along the time axis.
+        mask_time_min_masks (`int`, *optional*, defaults to 2),:
+            The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step,
+            irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
+            mask_time_min_masks''
+        mask_feature_prob (`float`, *optional*, defaults to 0.0):
+            Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
+            masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
+            the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
+            span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
+            may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
+            True`.
+        mask_feature_length (`int`, *optional*, defaults to 10):
+            Length of vector span along the feature axis.
+        mask_feature_min_masks (`int`, *optional*, defaults to 0),:
+            The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+            step, irrespectively of `mask_feature_prob`. Only relevant if
+            ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
+        num_codevectors_per_group (`int`, *optional*, defaults to 320):
+            Number of entries in each quantization codebook (group).
+        num_codevector_groups (`int`, *optional*, defaults to 2):
+            Number of codevector groups for product codevector quantization.
+        contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
+            The temperature *kappa* in the contrastive loss.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probabilitiy for the output of the feature encoder that's used by the quantizer.
+        num_negatives (`int`, *optional*, defaults to 100):
+            Number of negative samples for the contrastive loss.
+        codevector_dim (`int`, *optional*, defaults to 256):
+            Dimensionality of the quantized feature vectors.
+        proj_codevector_dim (`int`, *optional*, defaults to 256):
+            Dimensionality of the final projection of both the quantized and the transformer features.
+        diversity_loss_weight (`int`, *optional*, defaults to 0.1):
+            The weight of the codebook diversity loss component.
+        ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
+            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+            instance of [`Wav2Vec2ForCTC`].
+        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
+            occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
+            of [`Wav2Vec2ForCTC`].
+        use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
+            Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
+            instance of [`Wav2Vec2ForSequenceClassification`].
+        classifier_proj_size (`int`, *optional*, defaults to 256):
+            Dimensionality of the projection before token mean-pooling for classification.
+        tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
+            A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
+            module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
+        tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
+            *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
+        tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
+            A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
+            *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
+        xvector_output_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of the *XVector* embedding vectors.
+        add_adapter (`bool`, *optional*, defaults to `False`):
+            Whether a convolutional network should be stacked on top of the Wav2Vec2 Encoder. Can be very useful for
+            warm-starting Wav2Vec2 for SpeechEncoderDecoder models.
+        adapter_kernel_size (`int`, *optional*, defaults to 3):
+            Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        adapter_stride (`int`, *optional*, defaults to 2):
+            Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        num_adapter_layers (`int`, *optional*, defaults to 3):
+            Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is
+            True`.
+        output_hidden_size (`int`, *optional*):
+            Dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant
+            if `add_adapter is True`.
+        use_scan (`bool`, *optional*, defaults to `False`):
+            Whether or not to use nn.scan in the Flax Wav2Vec2 transformer layers.
+    Example:
+    ```python
+    >>> from transformers import Wav2Vec2Model, Wav2Vec2Config
+    >>> # Initializing a Wav2Vec2 facebook/wav2vec2-base-960h style configuration
+    >>> configuration = Wav2Vec2Config()
+    >>> # Initializing a model from the facebook/wav2vec2-base-960h style configuration
+    >>> model = Wav2Vec2Model(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "wav2vec2"
+    def __init__(
+        self,
+        vocab_size=32,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout=0.1,
+        activation_dropout=0.1,
+        attention_dropout=0.1,
+        feat_proj_dropout=0.0,
+        feat_quantizer_dropout=0.0,
+        final_dropout=0.1,
+        layerdrop=0.1,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        feat_extract_norm="group",
+        feat_extract_activation="gelu",
+        conv_dim=(512, 512, 512, 512, 512, 512, 512),
+        conv_stride=(5, 2, 2, 2, 2, 2, 2),
+        conv_kernel=(10, 3, 3, 3, 3, 2, 2),
+        conv_bias=False,
+        num_conv_pos_embeddings=128,
+        num_conv_pos_embedding_groups=16,
+        do_stable_layer_norm=False,
+        apply_spec_augment=True,
+        mask_time_prob=0.05,
+        mask_time_length=10,
+        mask_time_min_masks=2,
+        mask_feature_prob=0.0,
+        mask_feature_length=10,
+        mask_feature_min_masks=0,
+        num_codevectors_per_group=320,
+        num_codevector_groups=2,
+        contrastive_logits_temperature=0.1,
+        num_negatives=100,
+        codevector_dim=256,
+        proj_codevector_dim=256,
+        diversity_loss_weight=0.1,
+        ctc_loss_reduction="sum",
+        ctc_zero_infinity=False,
+        use_weighted_layer_sum=False,
+        classifier_proj_size=256,
+        tdnn_dim=(512, 512, 512, 512, 1500),
+        tdnn_kernel=(5, 3, 3, 1, 1),
+        tdnn_dilation=(1, 2, 3, 1, 1),
+        xvector_output_dim=512,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        add_adapter=False,
+        adapter_kernel_size=3,
+        adapter_stride=2,
+        num_adapter_layers=3,
+        output_hidden_size=None,
+        use_scan=False,
+        fuse_matmuls=False,
+        **kwargs
+    ):
+        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
+        self.hidden_size = hidden_size
+        self.feat_extract_norm = feat_extract_norm
+        self.feat_extract_activation = feat_extract_activation
+        self.conv_dim = list(conv_dim)
+        self.conv_stride = list(conv_stride)
+        self.conv_kernel = list(conv_kernel)
+        self.conv_bias = conv_bias
+        self.num_conv_pos_embeddings = num_conv_pos_embeddings
+        self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
+        self.num_feat_extract_layers = len(self.conv_dim)
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.num_attention_heads = num_attention_heads
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.feat_proj_dropout = feat_proj_dropout
+        self.final_dropout = final_dropout
+        self.layerdrop = layerdrop
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_range = initializer_range
+        self.vocab_size = vocab_size
+        self.do_stable_layer_norm = do_stable_layer_norm
+        self.use_weighted_layer_sum = use_weighted_layer_sum
+        self.use_scan = use_scan
+        self.fuse_matmuls = fuse_matmuls
+        if (
+            (len(self.conv_stride) != self.num_feat_extract_layers)
+            or (len(self.conv_kernel) != self.num_feat_extract_layers)
+            or (len(self.conv_dim) != self.num_feat_extract_layers)
+        ):
+            raise ValueError(
+                "Configuration for convolutional layers is incorrect. "
+                "It is required that `len(config.conv_dim)` == `len(config.conv_stride)` == `len(config.conv_kernel)`, "
+                f"but is `len(config.conv_dim) = {len(self.conv_dim)}`, `len(config.conv_stride) "
+                f"= {len(self.conv_stride)}`, `len(config.conv_kernel) = {len(self.conv_kernel)}`."
+            )
+        # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779
+        self.apply_spec_augment = apply_spec_augment
+        self.mask_time_prob = mask_time_prob
+        self.mask_time_length = mask_time_length
+        self.mask_time_min_masks = mask_time_min_masks
+        self.mask_feature_prob = mask_feature_prob
+        self.mask_feature_length = mask_feature_length
+        self.mask_feature_min_masks = mask_feature_min_masks
+        # parameters for pretraining with codevector quantized representations
+        self.num_codevectors_per_group = num_codevectors_per_group
+        self.num_codevector_groups = num_codevector_groups
+        self.contrastive_logits_temperature = contrastive_logits_temperature
+        self.feat_quantizer_dropout = feat_quantizer_dropout
+        self.num_negatives = num_negatives
+        self.codevector_dim = codevector_dim
+        self.proj_codevector_dim = proj_codevector_dim
+        self.diversity_loss_weight = diversity_loss_weight
+        # ctc loss
+        self.ctc_loss_reduction = ctc_loss_reduction
+        self.ctc_zero_infinity = ctc_zero_infinity
+        # adapter
+        self.add_adapter = add_adapter
+        self.adapter_kernel_size = adapter_kernel_size
+        self.adapter_stride = adapter_stride
+        self.num_adapter_layers = num_adapter_layers
+        self.output_hidden_size = output_hidden_size or hidden_size
+        # SequenceClassification-specific parameter. Feel free to ignore for other classes.
+        self.classifier_proj_size = classifier_proj_size
+        # XVector-specific parameters. Feel free to ignore for other classes.
+        self.tdnn_dim = list(tdnn_dim)
+        self.tdnn_kernel = list(tdnn_kernel)
+        self.tdnn_dilation = list(tdnn_dilation)
+        self.xvector_output_dim = xvector_output_dim
+    @property
+    def inputs_to_logits_ratio(self):
+        return functools.reduce(operator.mul, self.conv_stride, 1)

models/modeling_flax_bart.py ADDED Viewed

	@@ -0,0 +1,816 @@

+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and The Google Flax Team Authors And The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Flax Bart model."""
+import math
+import random
+from functools import partial
+from typing import Optional, Tuple
+import numpy as np
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict, unfreeze
+from flax.linen import combine_masks, make_causal_mask
+from flax.linen import partitioning as nn_partitioning
+from flax.linen.attention import dot_product_attention_weights
+from jax import lax
+from jax.random import PRNGKey
+from transformers.modeling_flax_outputs import (
+    FlaxBaseModelOutputWithPastAndCrossAttentions,
+    FlaxCausalLMOutputWithCrossAttentions,
+)
+from transformers.modeling_flax_utils import ACT2FN, FlaxPreTrainedModel
+from models import BartConfig
+scan_with_axes = nn_partitioning.scan_with_axes
+remat = nn_partitioning.remat
+def shift_tokens_right(input_ids: np.array, pad_token_id: int, decoder_start_token_id: int) -> np.ndarray:
+    """
+    Shift input ids one token to the right.
+    """
+    shifted_input_ids = np.zeros_like(input_ids)
+    shifted_input_ids[:, 1:] = input_ids[:, :-1]
+    shifted_input_ids[:, 0] = decoder_start_token_id
+    shifted_input_ids = np.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids)
+    return shifted_input_ids
+class FlaxBartAttention(nn.Module):
+    config: BartConfig
+    embed_dim: int
+    num_heads: int
+    dropout: float = 0.0
+    causal: bool = False
+    bias: bool = True
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    def setup(self) -> None:
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        dense = partial(
+            nn.Dense,
+            self.embed_dim,
+            use_bias=self.bias,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense()
+        self.fused_proj = nn.Dense(
+            self.embed_dim * 3,
+            use_bias=self.bias,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.fused_key_value = nn.Dense(
+            self.embed_dim * 2,
+            use_bias=self.bias,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.out_proj = dense()
+        self.dropout_layer = nn.Dropout(rate=self.dropout)
+        if self.causal:
+            self.causal_mask = make_causal_mask(
+                jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
+            )
+    def _split_heads(self, hidden_states):
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
+    def _merge_heads(self, hidden_states):
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
+    @nn.compact
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
+        """
+        This function takes projected key, value states from a single input token and concatenates the states to cached
+        states from previous steps. This function is slighly adapted from the official Flax repository:
+        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
+        """
+        # detect if we're initializing by absence of existing cache data.
+        is_initialized = self.has_variable("cache", "cached_key")
+        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
+        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
+        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+        if is_initialized:
+            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
+            # update key, value caches with our new 1d spatial slices
+            cur_index = cache_index.value
+            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
+            key = lax.dynamic_update_slice(cached_key.value, key, indices)
+            value = lax.dynamic_update_slice(cached_value.value, value, indices)
+            cached_key.value = key
+            cached_value.value = value
+            num_updated_cache_vectors = query.shape[1]
+            cache_index.value = cache_index.value + num_updated_cache_vectors
+            # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements.
+            pad_mask = jnp.broadcast_to(
+                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
+                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
+            )
+            attention_mask = combine_masks(pad_mask, attention_mask)
+        return key, value, attention_mask
+    def __call__(
+        self,
+        hidden_states: jnp.ndarray,
+        key_value_states: Optional[jnp.ndarray] = None,
+        attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+    ) -> Tuple[jnp.ndarray]:
+        """Input shape: Batch x Time x Channel"""
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        batch_size = hidden_states.shape[0]
+        if self.config.fuse_matmuls:
+            # get key, value proj
+            if is_cross_attention:
+                # get query proj
+                query_states = self.q_proj(hidden_states)
+                # cross_attentions
+                attention_states = self.fused_key_value(key_value_states)
+                key_states, value_states = jnp.split(attention_states, 2, axis=-1)
+            else:
+                attention_states = self.fused_proj(hidden_states)
+                query_states, key_states, value_states = jnp.split(attention_states, 3, axis=-1)
+        else:
+            # get query proj
+            query_states = self.q_proj(hidden_states)
+            # get key, value proj
+            if is_cross_attention:
+                # cross_attentions
+                key_states = self.k_proj(key_value_states)
+                value_states = self.v_proj(key_value_states)
+            else:
+                # self_attention
+                key_states = self.k_proj(hidden_states)
+                value_states = self.v_proj(hidden_states)
+        query_states = self._split_heads(query_states)
+        key_states = self._split_heads(key_states)
+        value_states = self._split_heads(value_states)
+        # handle cache prepare causal attention mask
+        if self.causal:
+            query_length, key_length = query_states.shape[1], key_states.shape[1]
+            if self.has_variable("cache", "cached_key"):
+                mask_shift = self.variables["cache"]["cache_index"]
+                max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
+                causal_mask = lax.dynamic_slice(
+                    self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
+                )
+            else:
+                causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+            causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+        # combine masks if needed
+        if attention_mask is not None and self.causal:
+            attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
+            attention_mask = combine_masks(attention_mask, causal_mask)
+        elif self.causal:
+            attention_mask = causal_mask
+        elif attention_mask is not None:
+            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
+        # During fast autoregressive decoding, we feed one position at a time,
+        # and cache the keys and values step by step.
+        if self.causal and (self.has_variable("cache", "cached_key") or init_cache):
+            key_states, value_states, attention_mask = self._concatenate_to_cache(
+                key_states, value_states, query_states, attention_mask
+            )
+        # Convert the boolean attention mask to an attention bias.
+        if attention_mask is not None:
+            # attention mask in the form of attention bias
+            attention_bias = lax.select(
+                attention_mask > 0,
+                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+                jnp.full(attention_mask.shape, float("-inf")).astype(self.dtype),
+            )
+        else:
+            attention_bias = None
+        dropout_rng = None
+        if not deterministic and self.dropout > 0.0:
+            dropout_rng = self.make_rng("dropout")
+        attn_weights = dot_product_attention_weights(
+            query_states,
+            key_states,
+            bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.dropout,
+            broadcast_dropout=True,
+            deterministic=deterministic,
+            dtype=self.dtype,
+            precision=None,
+        )
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
+        attn_output = self._merge_heads(attn_output)
+        attn_output = self.out_proj(attn_output)
+        return attn_output, attn_weights
+class FlaxBartDecoderLayer(nn.Module):
+    config: BartConfig
+    dtype: jnp.dtype = jnp.float32
+    def setup(self) -> None:
+        self.embed_dim = self.config.d_model
+        self.self_attn = FlaxBartAttention(
+            config=self.config,
+            embed_dim=self.embed_dim,
+            num_heads=self.config.decoder_attention_heads,
+            dropout=self.config.attention_dropout,
+            causal=True,
+            dtype=self.dtype,
+        )
+        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
+        self.activation_fn = ACT2FN[self.config.activation_function]
+        self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
+        self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
+        self.encoder_attn = FlaxBartAttention(
+            config=self.config,
+            embed_dim=self.embed_dim,
+            num_heads=self.config.decoder_attention_heads,
+            dropout=self.config.attention_dropout,
+            dtype=self.dtype,
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
+        self.fc1 = nn.Dense(
+            self.config.encoder_ffn_dim,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.fc2 = nn.Dense(
+            self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std)
+        )
+        self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
+    def __call__(
+        self,
+        hidden_states: jnp.ndarray,
+        attention_mask: jnp.ndarray,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        output_attentions: bool = True,
+        deterministic: bool = True,
+    ) -> Tuple[jnp.ndarray]:
+        if self.config.use_scan:
+            hidden_states = hidden_states[0]
+        residual = hidden_states
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states, attention_mask=attention_mask, init_cache=init_cache
+        )
+        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        # Cross-Attention Block
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+            hidden_states, cross_attn_weights = self.encoder_attn(
+                hidden_states=hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+            )
+            hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+            hidden_states = residual + hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+        if self.config.use_scan:
+            outputs = (outputs, None)
+        return outputs
+class FlaxBartDecoderLayerCollection(nn.Module):
+    config: BartConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    @nn.compact
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+        num_decoder_layers = self.config.decoder_layers
+        BlockDecoderLayer = (
+            remat(
+                FlaxBartDecoderLayer,
+                static_argnums=(4, 5, 6),
+                prevent_cse=not self.config.use_scan,
+            )
+            if self.config.gradient_checkpointing
+            else FlaxBartDecoderLayer
+        )
+        if self.config.use_scan:
+            # since all decoder layers are the same, we use nn.scan directly
+            assert not output_attentions, "cannot use `scan` with `output_attentions` set to `True`"
+            assert not output_hidden_states, "cannot use `scan` with `output_hidden_states` set to `True`"
+            hidden_states = (hidden_states,)
+            # TODO: add layerdrop in checkpointed scan (note: default value for layerdrop in config is zero)
+            hidden_states, _ = scan_with_axes(
+                BlockDecoderLayer,
+                variable_axes={"params": 0, "cache": 0},
+                split_rngs={"params": True, "dropout": True},
+                in_axes=(nn.broadcast, nn.broadcast, nn.broadcast, nn.broadcast, nn.broadcast, nn.broadcast),
+                length=num_decoder_layers,
+            )(self.config, dtype=self.dtype, name="FlaxBartDecoderLayers")(
+                hidden_states,
+                attention_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                init_cache,
+                output_attentions,
+                deterministic,
+            )
+            hidden_states = hidden_states[0]
+        else:
+            for layer in range(num_decoder_layers):
+                if output_hidden_states:
+                    all_hidden_states += (hidden_states,)
+                # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+                dropout_probability = random.uniform(0, 1)
+                if not deterministic and (dropout_probability < self.config.decoder_layerdrop):
+                    layer_outputs = (None, None, None)
+                else:
+                    layer_outputs = BlockDecoderLayer(self.config, dtype=self.dtype, name=str(layer),)(
+                        hidden_states,
+                        attention_mask,
+                        encoder_hidden_states,
+                        encoder_attention_mask,
+                        init_cache,
+                        output_attentions,
+                        deterministic,
+                    )
+                hidden_states = layer_outputs[0]
+                if output_attentions:
+                    all_self_attns += (layer_outputs[1],)
+                    if encoder_hidden_states is not None:
+                        all_cross_attentions += (layer_outputs[2],)
+            # add hidden states from the last decoder layer
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+        outputs = [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions]
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+        return FlaxBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+class FlaxBartDecoder(nn.Module):
+    config: BartConfig
+    embed_tokens: nn.Embed
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    def setup(self):
+        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
+        embed_dim = self.config.d_model
+        self.padding_idx = self.config.pad_token_id
+        self.max_target_positions = self.config.max_position_embeddings
+        self.embed_scale = math.sqrt(self.config.d_model) if self.config.scale_embedding else 1.0
+        # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2
+        # and adjust num_embeddings appropriately. Other models don't have this hack
+        self.offset = 2
+        self.embed_positions = nn.Embed(
+            self.config.max_position_embeddings + self.offset,
+            embed_dim,
+            embedding_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.layers = FlaxBartDecoderLayerCollection(self.config, self.dtype)
+        self.layernorm_embedding = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+        deterministic: bool = True,
+    ):
+        input_shape = input_ids.shape
+        input_ids = input_ids.reshape(-1, input_shape[-1])
+        inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
+        # embed positions
+        positions = self.embed_positions(position_ids + self.offset)
+        hidden_states = inputs_embeds + positions
+        hidden_states = self.layernorm_embedding(hidden_states)
+        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+        outputs = self.layers(
+            hidden_states,
+            attention_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        if not return_dict:
+            return outputs
+        return FlaxBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=outputs.last_hidden_state,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+class FlaxBartDecoderPreTrainedModel(FlaxPreTrainedModel):
+    config_class = BartConfig
+    base_model_prefix: str = "model"
+    module_class: nn.Module = None
+    def __init__(
+        self,
+        config: BartConfig,
+        input_shape: Tuple[int] = (1, 1),
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs
+    ):
+        config.is_decoder = True
+        config.is_encoder_decoder = False
+        module = self.module_class(config=config, dtype=dtype, **kwargs)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple) -> FrozenDict:
+        # init input tensors
+        input_ids = jnp.zeros(input_shape, dtype="i4")
+        attention_mask = jnp.ones_like(input_ids)
+        batch_size, sequence_length = input_ids.shape
+        position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+        encoder_hidden_states = jnp.zeros(input_shape + (self.config.d_model,))
+        encoder_attention_mask = attention_mask
+        module_init_outputs = self.module.init(
+            rngs,
+            input_ids,
+            attention_mask,
+            position_ids,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            return_dict=False,
+        )
+        return module_init_outputs["params"]
+    def init_cache(self, batch_size, max_length):
+        r"""
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+        """
+        # init input variables to retrieve cache
+        input_ids = jnp.ones((batch_size, max_length), dtype="i4")
+        attention_mask = jnp.ones_like(input_ids, dtype="i4")
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
+        )
+        return unfreeze(init_variables["cache"])
+    def __call__(
+        self,
+        input_ids: jnp.ndarray,
+        attention_mask: Optional[jnp.ndarray] = None,
+        position_ids: Optional[jnp.ndarray] = None,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: dict = None,
+        past_key_values: dict = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        """
+        Args:
+        input_ids (`jnp.ndarray` of shape `(target_batch_size, target_sequence_length)`):
+            Indices of decoder input sequence tokens in the vocabulary.
+            Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+            For translation and summarization training, `decoder_input_ids` should be provided. If no
+            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
+            for denoising pre-training following the paper.
+        attention_mask (`jnp.ndarray` of shape `(target_batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+            If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the
+            paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+        position_ids (`numpy.ndarray` of shape `(target_batch_size, sequence_length)`, *optional*):
+            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
+            range `[0, config.max_position_embeddings - 1]`.
+        encoder_hidden_states (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
+            A sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+        encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
+            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
+            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        if encoder_hidden_states is not None and encoder_attention_mask is None:
+            batch_size, sequence_length = encoder_hidden_states.shape[:2]
+            encoder_attention_mask = jnp.ones((batch_size, sequence_length))
+        # prepare decoder inputs
+        if attention_mask is None:
+            attention_mask = jnp.ones_like(input_ids)
+        if position_ids is None:
+            batch_size, sequence_length = input_ids.shape
+            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+        # Handle any PRNG if needed
+        rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
+        inputs = {"params": params or self.params}
+        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed
+        # down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be
+        # changed by FlaxBartAttention module
+        if past_key_values:
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+        outputs = self.module.apply(
+            inputs,
+            input_ids=jnp.array(input_ids, dtype="i4"),
+            attention_mask=jnp.array(attention_mask, dtype="i4"),
+            position_ids=jnp.array(position_ids, dtype="i4"),
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+            mutable=mutable,
+        )
+        # add updated cache to model output
+        if past_key_values is not None and return_dict:
+            outputs, past_key_values = outputs
+            outputs["past_key_values"] = unfreeze(past_key_values["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            outputs, past_key_values = outputs
+            outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
+        return outputs
+class FlaxBartDecoderWrapper(nn.Module):
+    """
+    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
+    used in combination with the [`EncoderDecoderModel`] framework.
+    """
+    config: BartConfig
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        embed_dim = self.config.d_model
+        embed_tokens = nn.Embed(
+            self.config.vocab_size,
+            embed_dim,
+            embedding_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.decoder = FlaxBartDecoder(config=self.config, embed_tokens=embed_tokens, dtype=self.dtype)
+    def __call__(self, *args, **kwargs):
+        return self.decoder(*args, **kwargs)
+class FlaxBartForCausalLMModule(nn.Module):
+    """Bart Decoder Module with a language modeling head on top (linear layer with weights tied to the input embeddings)
+    e.g. for autoregressive tasks.
+    """
+    config: BartConfig
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.model = FlaxBartDecoderWrapper(config=self.config, dtype=self.dtype)
+        self.lm_head = nn.Dense(
+            self.config.vocab_size,
+            use_bias=False,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+        deterministic: bool = True,
+    ):
+        outputs = self.model(
+            input_ids,
+            attention_mask,
+            position_ids,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        if self.config.tie_word_embeddings:
+            shared_embedding = self.model.variables["params"]["decoder"]["embed_tokens"]["embedding"]
+            lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
+        else:
+            lm_logits = self.lm_head(hidden_states)
+        if not return_dict:
+            return (lm_logits,) + outputs[1:]
+        return FlaxCausalLMOutputWithCrossAttentions(
+            logits=lm_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+class FlaxBartForCausalLM(FlaxBartDecoderPreTrainedModel):
+    """Bart Decoder Model with a language modeling head on top (linear layer with weights tied to the input embeddings)
+    e.g. for autoregressive tasks.
+    """
+    module_class = FlaxBartForCausalLMModule
+    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jnp.DeviceArray] = None):
+        # initializing the cache
+        batch_size, seq_length = input_ids.shape
+        past_key_values = self.init_cache(batch_size, max_length)
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since the decoder uses a causal mask, those positions are masked anyway.
+        # Thus, we can create a single static attention_mask here, which is more efficient for compilation
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if attention_mask is not None:
+            position_ids = attention_mask.cumsum(axis=-1) - 1
+            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
+        else:
+            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
+        return {
+            "past_key_values": past_key_values,
+            "attention_mask": extended_attention_mask,
+            "position_ids": position_ids,
+        }
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):
+        model_kwargs["past_key_values"] = model_outputs.past_key_values
+        model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
+        return model_kwargs

models/modeling_flax_speech_encoder_decoder.py ADDED Viewed

	@@ -0,0 +1,1245 @@

+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Classes to support Flax Speech-Encoder-Decoder architectures"""
+import os
+from functools import partial
+from typing import Optional, Tuple, Union, Dict
+import flax
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict, unfreeze
+from jax import lax
+from jax.random import PRNGKey
+import numpy as np
+from transformers.modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput
+from transformers.modeling_flax_utils import FlaxPreTrainedModel
+from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings, ModelOutput
+from transformers.generation_flax_utils import FlaxLogitsProcessorList
+from models import (
+    FlaxWav2Vec2Model,
+    FlaxWav2Vec2Module,
+    FlaxBartForCausalLM,
+    FlaxBartForCausalLMModule,
+    BartConfig,
+    Wav2Vec2Config,
+    SpeechEncoderDecoderConfig,
+)
+logger = logging.get_logger(__name__)
+_CONFIG_FOR_DOC = "SpeechEncoderDecoderConfig"
+SPEECH_ENCODER_DECODER_START_DOCSTRING = r"""
+    This class can be used to initialize a speech-sequence-to-text-sequence model with any pretrained speech
+    autoencoding model as the encoder and any pretrained text autoregressive model as the decoder. The encoder is
+    loaded via [`~AutoModel.from_pretrained`] function and the decoder is loaded via
+    [`~AutoModelForCausalLM.from_pretrained`] function. Cross-attention layers are automatically added to the decoder
+    and should be fine-tuned on a downstream generative task, like summarization.
+    The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
+    tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation
+    Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
+    Zhou, Wei Li, Peter J. Liu.
+    Additionally, in [Large-Scale Self- and Semi-Supervised Learning for Speech
+    Translation](https://arxiv.org/abs/2104.06678) it is shown how leveraging large pretrained speech models for speech
+    translation yields a significant performance improvement.
+    After such an Speech-Encoder Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other
+    models (see the examples for more information).
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a Flax Linen
+    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
+    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
+    Parameters:
+        config ([`SpeechEncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+            `jax.numpy.bfloat16` (on TPUs).
+            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+            specified all the computation will be performed with the given `dtype`.
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.**
+            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+            [`~FlaxPreTrainedModel.to_bf16`].
+"""
+SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
+    Args:
+        inputs (`jnp.ndarray` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*):
+            Float values of input raw speech waveform or speech features. Values can be obtained by loading a *.flac*
+            or *.wav* audio file into an array of type *List[float]* or a *numpy.ndarray*, *e.g.* via the soundfile
+            library (*pip install soundfile*). To prepare the array into *inputs*, either the [`Wav2Vec2Processor`] or
+            [`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type
+            *torch.FloatTensor*.
+        attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+        decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            [What are input IDs?](../glossary#input-ids)
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+            For sequence to sequence training, `decoder_input_ids` should be provided. `decoder_input_ids` should be
+            created outside of the model by shifting the `labels` to the right, replacing -100 by the `pad_token_id`
+            and prepending them with the `decoder_start_token_id`.
+        decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
+            range `[0, config.decoder.max_position_embeddings - 1]`.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            If set to `True`, the model will return a [`~utils.FlaxSeq2SeqLMOutput`] instead of a plain tuple.
+"""
+SPEECH_ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING = r"""
+    Args:
+        inputs (`jnp.ndarray` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*):
+            Float values of input raw speech waveform or speech features. Values can be obtained by loading a *.flac*
+            or *.wav* audio file into an array of type *List[float]* or a *numpy.ndarray*, *e.g.* via the soundfile
+            library (*pip install soundfile*). To prepare the array into *inputs*, either the [`Wav2Vec2Processor`] or
+            [`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type
+            *torch.FloatTensor*.
+        attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            If set to `True`, the model will return a [`~utils.FlaxBaseModelOutput`] instead of a plain tuple.
+"""
+SPEECH_ENCODER_DECODER_DECODE_INPUTS_DOCSTRING = r"""
+    Args:
+        decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+            For sequence to sequence training, `decoder_input_ids` should be provided. `decoder_input_ids` should be
+            created outside of the model by shifting the `labels` to the right, replacing -100 by the `pad_token_id`
+            and prepending them with the `decoder_start_token_id`.
+        encoder_outputs (`tuple(tuple(jnp.ndarray)`):
+            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+        encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+        decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
+            range `[0, config.decoder.max_position_embeddings - 1]`.
+        past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
+            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
+            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            If set to `True`, the model will return a [`~utils.FlaxCausalLMOutputWithCrossAttentions`] instead of a
+            plain tuple.
+"""
+@flax.struct.dataclass
+class FlaxBeamSearchOutput(ModelOutput):
+    """
+    Flax Base class for outputs of decoder-only generation models using greedy search.
+    Args:
+        sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
+            The generated sequences.
+        scores (`jnp.ndarray` of shape `(batch_size,)`):
+            The scores (log probabilites) of the generated sequences.
+    """
+    sequences: jnp.ndarray = None
+    scores: jnp.ndarray = None
+@flax.struct.dataclass
+class BeamSearchState:
+    cur_len: jnp.ndarray
+    running_sequences: jnp.ndarray
+    running_scores: jnp.ndarray
+    sequences: jnp.ndarray
+    scores: jnp.ndarray
+    is_sent_finished: jnp.ndarray
+    model_kwargs: Dict[str, jnp.ndarray]
+class FlaxSpeechEncoderDecoderModule(nn.Module):
+    config: SpeechEncoderDecoderConfig
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        encoder_config = self.config.encoder
+        decoder_config = self.config.decoder
+        # TODO: configure FlaxAutoModel mappings (required when trialling different encoder-decoder combinations)
+        encoder_module = FlaxWav2Vec2Module
+        decoder_module = FlaxBartForCausalLMModule
+        self.encoder = encoder_module(encoder_config, dtype=self.dtype)
+        self.decoder = decoder_module(decoder_config, dtype=self.dtype)
+        # encoder outputs might need to be projected to different dimension for decoder
+        if (
+            self.encoder.config.hidden_size != self.decoder.config.hidden_size
+            and self.decoder.config.cross_attention_hidden_size is None
+        ):
+            self.enc_to_dec_proj = nn.Dense(
+                self.decoder.config.hidden_size,
+                kernel_init=jax.nn.initializers.normal(self.decoder.config.initializer_range),
+                dtype=self.dtype,
+            )
+        else:
+            self.enc_to_dec_proj = None
+    def _get_feat_extract_output_lengths(
+        self, input_lengths: Union[jnp.ndarray, int], add_adapter: Optional[bool] = None
+    ):
+        """
+        Computes the output length of the convolutional layers
+        """
+        add_adapter = self.config.encoder.add_adapter if add_adapter is None else add_adapter
+        def _conv_out_length(input_length, kernel_size, stride):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return (input_length - kernel_size) // stride + 1
+        for kernel_size, stride in zip(self.config.encoder.conv_kernel, self.config.encoder.conv_stride):
+            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
+        if add_adapter:
+            for _ in range(self.config.encoder.num_adapter_layers):
+                input_lengths = _conv_out_length(input_lengths, 1, self.config.encoder.adapter_stride)
+        return input_lengths
+    def _get_encoder_module(self):
+        return self.encoder
+    def _get_projection_module(self):
+        return self.enc_to_dec_proj
+    def _get_decoder_module(self):
+        return self.decoder
+    def __call__(
+        self,
+        inputs,
+        attention_mask,
+        decoder_input_ids,
+        decoder_attention_mask,
+        decoder_position_ids,
+        encoder_outputs=None,
+        extract_features=None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        output_features: bool = False,
+        return_dict: bool = True,
+        deterministic: bool = True,
+        freeze_feature_encoder: bool = False,
+    ):
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                inputs,
+                attention_mask=attention_mask,
+                extract_features=extract_features,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                output_features=output_features,
+                return_dict=return_dict,
+                deterministic=deterministic,
+                freeze_feature_encoder=freeze_feature_encoder,
+            )
+        if output_features:
+            return encoder_outputs
+        encoder_hidden_states = encoder_outputs[0]
+        # optionally project encoder_hidden_states
+        if self.enc_to_dec_proj is not None:
+            encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)
+        # compute correct encoder attention mask
+        if attention_mask is not None:
+            encoder_attention_mask = self.encoder._get_feature_vector_attention_mask(
+                encoder_hidden_states.shape[1], attention_mask
+            )
+        else:
+            encoder_attention_mask = None
+        # flax script modeling_flax_wav2vec2.py
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            position_ids=decoder_position_ids,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=deterministic,
+        )
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+        return FlaxSeq2SeqLMOutput(
+            logits=decoder_outputs.logits,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_hidden_states,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+@add_start_docstrings(SPEECH_ENCODER_DECODER_START_DOCSTRING)
+class FlaxSpeechEncoderDecoderModel(FlaxPreTrainedModel):
+    r"""
+    [`FlaxSpeechEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture
+    with the module (flax.nn.Module) of one of the base model classes of the library as encoder module and another one
+    as decoder module when created with the :meth*~transformers.FlaxAutoModel.from_pretrained* class method for the
+    encoder and :meth*~transformers.FlaxAutoModelForCausalLM.from_pretrained* class method for the decoder.
+    """
+    config_class = SpeechEncoderDecoderConfig
+    base_model_prefix: str = "speech_encoder_decoder"
+    module_class = FlaxSpeechEncoderDecoderModule
+    def __init__(
+        self,
+        config: SpeechEncoderDecoderConfig,
+        input_shape: Optional[Tuple] = None,
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs
+    ):
+        if not _do_init:
+            raise ValueError(
+                "`FlaxSpeechEncoderDecoderModel` cannot be created without initializing, `_do_init` must be `True`."
+            )
+        if config.decoder.cross_attention_hidden_size is not None:
+            # Raise ValueError or option to project enc to dec hidden_size (eg EncAdapterLayer)
+            if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size:
+                raise ValueError(
+                    "If `cross_attention_hidden_size` is specified in the decoder's configuration, "
+                    "it has to be equal to the encoder's `hidden_size`. "
+                    f"Got {config.decoder.cross_attention_hidden_size} for `config.decoder.cross_attention_hidden_size` "
+                    f"and {config.encoder.hidden_size} for `config.encoder.hidden_size`."
+                )
+        # make sure input & output embeddings are not tied
+        config.tie_word_embeddings = False
+        module = self.module_class(config=config, dtype=dtype, **kwargs)
+        if input_shape is None:
+            # speech encoders almost always downsample the sequence length dimension
+            encoder_input_length = 1024
+            decoder_input_length = module._get_feat_extract_output_lengths(encoder_input_length)
+            input_shape = ((1, encoder_input_length), (1, decoder_input_length))
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple) -> FrozenDict:
+        encoder_input_shape, decoder_input_shape = input_shape
+        # init input DeviceArrays
+        inputs = jnp.zeros(encoder_input_shape, dtype="f4")
+        attention_mask = jnp.ones_like(inputs, dtype="i4")
+        decoder_input_ids = jnp.zeros(decoder_input_shape, dtype="i4")
+        decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+        batch_size, sequence_length = inputs.shape
+        decoder_batch_size, decoder_sequence_length = decoder_input_ids.shape
+        if not decoder_batch_size == batch_size:
+            raise ValueError(
+                f"The inputs of encoder and decoder should have the same batch size, but got {batch_size} for encoder and {decoder_batch_size} for decoder."
+            )
+        decoder_position_ids = jnp.broadcast_to(
+            jnp.arange(decoder_sequence_length)[None, :], (decoder_batch_size, decoder_sequence_length)
+        )
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+        return self.module.init(
+            rngs,
+            inputs,
+            attention_mask,
+            decoder_input_ids,
+            decoder_attention_mask,
+            decoder_position_ids,
+        )["params"]
+    def init_cache(self, batch_size, max_length, encoder_outputs):
+        r"""
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+            encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
+                `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+                `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*)
+                is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+                cross-attention of the decoder.
+        """
+        # init input variables to retrieve cache
+        decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")
+        decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+        decoder_position_ids = jnp.broadcast_to(
+            jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape
+        )
+        def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
+            decoder_module = module._get_decoder_module()
+            return decoder_module(
+                input_ids=decoder_input_ids,
+                attention_mask=decoder_attention_mask,
+                position_ids=decoder_position_ids,
+                **kwargs,
+            )
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0),
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            decoder_position_ids=decoder_position_ids,
+            encoder_hidden_states=encoder_outputs[0],
+            init_cache=True,
+            method=_decoder_forward,  # we only need to call the decoder to init the cache
+        )
+        return unfreeze(init_variables["cache"])
+    def _get_feat_extract_output_lengths(
+        self, input_lengths: Union[jnp.ndarray, int], add_adapter: Optional[bool] = None
+    ):
+        return self.module._get_feat_extract_output_lengths(input_lengths, add_adapter=add_adapter)
+    @add_start_docstrings(SPEECH_ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=_CONFIG_FOR_DOC)
+    def encode(
+        self,
+        inputs: jnp.ndarray,
+        attention_mask: Optional[jnp.ndarray] = None,
+        extract_features: Optional[jnp.ndarray] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_features: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        freeze_feature_encoder: bool = False,
+        params: dict = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        r"""
+        Returns:
+        Example:
+        ```python
+        >>> from transformers import FlaxSpeechEncoderDecoderModel
+        >>> # initialize a wav2vec2-2-bart from pretrained wav2vec2 and bart models. Note that the cross-attention layers will be randomly initialized
+        >>> model = FlaxSpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
+        ...     "facebook/wav2vec2-large-lv60", "facebook/bart-large"
+        ... )
+        >>> inputs = jnp.ones((2, 5000), dtype=jnp.float32)
+        >>> encoder_outputs = model.encode(inputs)
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        if attention_mask is None:
+            attention_mask = jnp.ones_like(inputs, dtype="i4")
+        if extract_features is not None:
+            extract_features = jnp.array(extract_features, dtype="f4")
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+        def _encoder_forward(module, inputs, attention_mask, **kwargs):
+            encode_module = module._get_encoder_module()
+            return encode_module(inputs, attention_mask, **kwargs)
+        outputs = self.module.apply(
+            {"params": params or self.params},
+            inputs=jnp.array(inputs, dtype="f4"),
+            attention_mask=jnp.array(attention_mask, dtype="i4"),
+            extract_features=extract_features,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            output_features=output_features,
+            return_dict=return_dict,
+            deterministic=not train,
+            freeze_feature_encoder=freeze_feature_encoder,
+            rngs=rngs,
+            method=_encoder_forward,
+        )
+        if return_dict and not output_features:
+            outputs = FlaxBaseModelOutput(
+                last_hidden_state=outputs.last_hidden_state,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            )
+        return outputs
+    @add_start_docstrings(SPEECH_ENCODER_DECODER_DECODE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
+    def decode(
+        self,
+        decoder_input_ids,
+        encoder_outputs,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_position_ids: Optional[jnp.ndarray] = None,
+        past_key_values: dict = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: dict = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        r"""
+        Returns:
+        Example:
+        ```python
+        >>> from transformers import FlaxSpeechEncoderDecoderModel
+        >>> import jax.numpy as jnp
+        >>> # initialize a wav2vec2-2-bart from pretrained wav2vec2 and bart models. Note that the cross-attention layers will be randomly initialized
+        >>> model = FlaxSpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
+        ...     "facebook/wav2vec2-large-lv60", "facebook/bart-large"
+        ... )
+        >>> inputs = jnp.ones((2, 5000), dtype=jnp.float32)
+        >>> encoder_outputs = model.encode(inputs)
+        >>> decoder_start_token_id = model.config.decoder.bos_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> logits = outputs.logits
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        encoder_hidden_states = encoder_outputs[0]
+        if encoder_attention_mask is None:
+            batch_size, sequence_length = encoder_hidden_states.shape[:2]
+            encoder_attention_mask = jnp.ones((batch_size, sequence_length))
+        batch_size, sequence_length = decoder_input_ids.shape
+        if decoder_attention_mask is None:
+            decoder_attention_mask = jnp.ones((batch_size, sequence_length))
+        if decoder_position_ids is None:
+            if past_key_values is not None:
+                raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")
+            decoder_position_ids = jnp.broadcast_to(
+                jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+            )
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+        params = {"params": params or self.params}
+        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be
+        # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that
+        # it can be changed by FlaxBartAttention module
+        if past_key_values:
+            params["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+        def _decoder_forward(
+            module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, encoder_hidden_states, **kwargs
+        ):
+            projection_module = module._get_projection_module()
+            decoder_module = module._get_decoder_module()
+            # optionally project encoder_hidden_states
+            if projection_module is not None:
+                encoder_hidden_states = projection_module(encoder_hidden_states)
+            return decoder_module(
+                decoder_input_ids,
+                decoder_attention_mask,
+                decoder_position_ids,
+                encoder_hidden_states,
+                **kwargs,
+            )
+        outputs = self.module.apply(
+            params,
+            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+            mutable=mutable,
+            method=_decoder_forward,
+        )
+        # add updated cache to model output
+        if past_key_values is not None and return_dict:
+            outputs, past = outputs
+            outputs["past_key_values"] = unfreeze(past["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            outputs, past = outputs
+            outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]
+        return outputs
+    @add_start_docstrings_to_model_forward(SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+    def __call__(
+        self,
+        inputs: jnp.ndarray,
+        attention_mask: Optional[jnp.ndarray] = None,
+        extract_features: Optional[jnp.ndarray] = None,
+        decoder_input_ids: Optional[jnp.ndarray] = None,
+        decoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_position_ids: Optional[jnp.ndarray] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_features: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        freeze_feature_encoder: bool = False,
+        params: dict = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        r"""
+        Returns:
+        Examples:
+        ```python
+        >>> from transformers import FlaxSpeechEncoderDecoderModel, BartTokenizer
+        >>> # load a fine-tuned wav2vec2-2-bart model
+        >>> model = FlaxSpeechEncoderDecoderModel.from_pretrained("patrickvonplaten/wav2vec2-2-bart-large")
+        >>> # load output tokenizer
+        >>> tokenizer_output = BartTokenizer.from_pretrained("facebook/bart-large")
+        >>> inputs = jnp.ones((2, 5000), dtype=jnp.float32)
+        >>> # use bart's special bos, pad and eos tokens
+        >>> model.config.decoder_start_token_id = model.decoder.config.bos_token_id
+        >>> model.config.pad_token_id = model.decoder.config.pad_token_id
+        >>> model.config.eos_token_id = model.decoder.config.eos_token_id
+        >>> outputs = model.generate(inputs)
+        # Assert something? More interesting input? dtype correct?
+        ```
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        # prepare encoder inputs
+        if attention_mask is None:
+            attention_mask = jnp.ones_like(inputs, dtype="i4")
+        if extract_features is not None:
+            inputs = None  # we can omit passing the inputs to the model to save memory
+            extract_features = jnp.array(extract_features, dtype="f4")
+        else:
+            inputs = jnp.array(inputs, dtype="f4")
+        # prepare decoder inputs
+        if decoder_input_ids is None:
+            raise ValueError(
+                "`decoder_input_ids` cannot be `None`. For sequence to sequence training, `decoder_position_ids` must be specified as an input argument."
+            )
+        if decoder_attention_mask is None:
+            decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+        if decoder_position_ids is None:
+            batch_size, sequence_length = decoder_input_ids.shape
+            decoder_position_ids = jnp.broadcast_to(
+                jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+            )
+        # Handle any PRNG if needed
+        rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
+        return self.module.apply(
+            {"params": params or self.params},
+            inputs=inputs,
+            attention_mask=jnp.array(attention_mask, dtype="i4"),
+            extract_features=extract_features,
+            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            output_features=output_features,
+            return_dict=return_dict,
+            deterministic=not train,
+            freeze_feature_encoder=freeze_feature_encoder,
+            rngs=rngs,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        decoder_input_ids,
+        max_length,
+        attention_mask: Optional[jnp.DeviceArray] = None,
+        decoder_attention_mask: Optional[jnp.DeviceArray] = None,
+        encoder_outputs=None,
+        **kwargs
+    ):
+        # initializing the cache
+        batch_size, seq_length = decoder_input_ids.shape
+        past_key_values = self.init_cache(batch_size, max_length, encoder_outputs)
+        # Note that usually one would have to put 0's in the attention_mask for x > input.shape[-1] and x < cache_length.
+        # But since the decoder uses a causal mask, those positions are masked anyways.
+        # Thus we can create a single static attention_mask here, which is more efficient for compilation
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if decoder_attention_mask is not None:
+            decoder_position_ids = decoder_attention_mask.cumsum(axis=-1) - 1
+            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0))
+        else:
+            decoder_position_ids = jnp.broadcast_to(
+                jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)
+            )
+        return {
+            "past_key_values": past_key_values,
+            "encoder_outputs": encoder_outputs,
+            "encoder_attention_mask": attention_mask,
+            "decoder_attention_mask": extended_attention_mask,
+            "decoder_position_ids": decoder_position_ids,
+        }
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):
+        model_kwargs["past_key_values"] = model_outputs.past_key_values
+        model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1
+        return model_kwargs
+    @classmethod
+    def from_encoder_decoder_pretrained(
+        cls,
+        encoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
+        decoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
+        *model_args,
+        **kwargs
+    ) -> FlaxPreTrainedModel:
+        r"""
+        Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model
+        checkpoints.
+        Params:
+            encoder_pretrained_model_name_or_path (`Union[str, os.PathLike]`, *optional*):
+                Information necessary to initiate the encoder. Can be either:
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
+                      user or organization name, like `dbmdz/bert-base-german-cased`.
+                    - A path to a *directory* containing model weights saved using
+                      [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+            decoder_pretrained_model_name_or_path (`Union[str, os.PathLike]`, *optional*, defaults to `None`):
+                Information necessary to initiate the decoder. Can be either:
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
+                      user or organization name, like `dbmdz/bert-base-german-cased`.
+                    - A path to a *directory* containing model weights saved using
+                      [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+            model_args (remaining positional arguments, *optional*):
+                All remaning positional arguments will be passed to the underlying model's `__init__` method.
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
+                `output_attentions=True`).
+                - To update the encoder configuration, use the prefix *encoder_* for each configuration parameter.
+                - To update the decoder configuration, use the prefix *decoder_* for each configuration parameter.
+                - To update the parent model configuration, do not use a prefix for each configuration parameter.
+                Behaves differently depending on whether a `config` is provided or automatically loaded.
+        Example:
+        ```python
+        >>> from transformers import FlaxSpeechEncoderDecoderModel
+        >>> # initialize a wav2vec2-2-bart from pretrained wav2vec2 and bart models. Note that the cross-attention layers will be randomly initialized
+        >>> model = FlaxSpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
+        ...     "facebook/wav2vec2-large-lv60", "facebook/bart-large"
+        ... )
+        >>> # saving model after fine-tuning
+        >>> model.save_pretrained("./wav2vec2-2-bart-large")
+        >>> # load fine-tuned model
+        >>> model = FlaxSpeechEncoderDecoderModel.from_pretrained("./wav2vec2-2-bart-large")
+        ```"""
+        kwargs_encoder = {
+            argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
+        }
+        kwargs_decoder = {
+            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
+        }
+        # remove encoder, decoder kwargs from kwargs
+        for key in kwargs_encoder.keys():
+            del kwargs["encoder_" + key]
+        for key in kwargs_decoder.keys():
+            del kwargs["decoder_" + key]
+        # Load and initialize the encoder and decoder
+        # The distinction between encoder and decoder at the model level is made
+        # by the value of the flag `is_decoder` that we need to set correctly.
+        encoder = kwargs_encoder.pop("model", None)
+        if encoder is None:
+            if encoder_pretrained_model_name_or_path is None:
+                raise ValueError(
+                    "If `encoder_model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has "
+                    "to be defined."
+                )
+            if "config" not in kwargs_encoder:
+                # TODO: AutoConfig .from_pretrained
+                encoder_config, kwargs_encoder = Wav2Vec2Config.from_pretrained(
+                    encoder_pretrained_model_name_or_path, **kwargs_encoder, return_unused_kwargs=True
+                )
+                if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True:
+                    logger.info(
+                        f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model "
+                        "from a decoder model. Cross-attention and casual mask are disabled."
+                    )
+                    encoder_config.is_decoder = False
+                    encoder_config.add_cross_attention = False
+                kwargs_encoder["config"] = encoder_config
+            # TODO: FlaxAutoModel .from_pretrained
+            encoder = FlaxWav2Vec2Model.from_pretrained(
+                encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder
+            )
+        decoder = kwargs_decoder.pop("model", None)
+        if decoder is None:
+            if decoder_pretrained_model_name_or_path is None:
+                raise ValueError(
+                    "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has "
+                    "to be defined."
+                )
+            if "config" not in kwargs_decoder:
+                # TODO: AutoConfig .from_pretrained
+                decoder_config, kwargs_decoder = BartConfig.from_pretrained(
+                    decoder_pretrained_model_name_or_path, **kwargs_decoder, return_unused_kwargs=True
+                )
+                if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False:
+                    logger.info(
+                        f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. "
+                        f"Cross attention layers are added to {decoder_pretrained_model_name_or_path} "
+                        f"and randomly initialized if {decoder_pretrained_model_name_or_path}'s architecture allows for "
+                        "cross attention layers."
+                    )
+                    decoder_config.is_decoder = True
+                    decoder_config.add_cross_attention = True
+                kwargs_decoder["config"] = decoder_config
+            if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False:
+                logger.warning(
+                    f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. "
+                    f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, "
+                    "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` "
+                    "passed to `.from_encoder_decoder_pretrained(...)` are set to `True` or do not pass a "
+                    "`decoder_config` to `.from_encoder_decoder_pretrained(...)`"
+                )
+            # TODO: FlaxAutoModelForCausalLM .from_pretrained
+            decoder = FlaxBartForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
+        # instantiate config with corresponding kwargs
+        dtype = kwargs.pop("dtype", jnp.float32)
+        config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs)
+        # make sure input & output word embeddings are not tied
+        config.tie_word_embeddings = False
+        # init model
+        model = cls(config, dtype=dtype)
+        model.params["encoder"] = encoder.params
+        model.params["decoder"] = decoder.params
+        return model
+    def _beam_search(
+        self,
+        input_ids: None,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        length_penalty: Optional[float] = None,
+        early_stopping: Optional[bool] = None,
+        logits_processor: Optional[FlaxLogitsProcessorList] = None,
+        trace: bool = True,
+        params: Optional[Dict[str, jnp.ndarray]] = None,
+        model_kwargs: Optional[Dict[str, jnp.ndarray]] = None,
+    ):
+        """
+        This beam search function is heavily inspired by Flax's official example:
+        https://github.com/google/flax/blob/master/examples/wmt/train.py#L254
+        """
+        def flatten_beam_dim(tensor):
+            """Flattens the first two dimensions of a non-scalar array."""
+            # ignore scalars (e.g. cache index)
+            if tensor.ndim == 0 or tensor.ndim == 1:
+                return tensor
+            elif tensor.ndim == 6:
+                return tensor.reshape(tensor.shape[:1] + (tensor.shape[1] * tensor.shape[2],) + tensor.shape[3:])
+            return tensor.reshape((tensor.shape[0] * tensor.shape[1],) + tensor.shape[2:])
+        def unflatten_beam_dim(tensor, batch_size, num_beams):
+            """Unflattens the first, flat batch*beam dimension of a non-scalar array."""
+            # ignore scalars (e.g. cache index)
+            if tensor.ndim == 0 or tensor.ndim == 1:
+                return tensor
+            if tensor.ndim == 5:
+                return tensor.reshape(tensor.shape[:1] + (batch_size, num_beams) + tensor.shape[2:])
+            return tensor.reshape((batch_size, num_beams) + tensor.shape[1:])
+        def gather_beams(nested, beam_indices, batch_size, new_num_beams):
+            """
+            Gathers the beam slices indexed by beam_indices into new beam array.
+            """
+            batch_indices = jnp.reshape(
+                jnp.arange(batch_size * new_num_beams) // new_num_beams, (batch_size, new_num_beams)
+            )
+            def gather_fn(tensor):
+                # ignore scalars (e.g. cache index)
+                if tensor.ndim == 0 or tensor.ndim == 1:
+                    return tensor
+                if tensor.ndim == 6:
+                    return tensor[:, batch_indices, beam_indices]
+                return tensor[batch_indices, beam_indices]
+            return jax.tree_map(gather_fn, nested)
+        # init values
+        max_length = max_length if max_length is not None else self.config.max_length
+        pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
+        eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
+        length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty
+        early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping
+        batch_size, num_beams, cur_len = input_ids.shape
+        eos_token_id = jnp.array(eos_token_id)
+        pad_token_id = jnp.array(pad_token_id)
+        cur_len = jnp.array(cur_len)
+        # per batch,beam-item holding current token in loop.
+        sequences = jnp.full((batch_size, num_beams, max_length), pad_token_id, dtype=jnp.int32)
+        running_sequences = jnp.full((batch_size, num_beams, max_length), pad_token_id, dtype=jnp.int32)
+        running_sequences = lax.dynamic_update_slice(sequences, input_ids, (0, 0, 0))
+        # per batch,beam-item state bit indicating if sentence has finished.
+        is_sent_finished = jnp.zeros((batch_size, num_beams), dtype=jnp.bool_)
+        # per batch,beam-item score, logprobs
+        running_scores = jnp.tile(jnp.array([0.0] + [np.array(-1.0e7)] * (num_beams - 1)), [batch_size, 1])
+        scores = jnp.ones((batch_size, num_beams)) * np.array(-1.0e7)
+        # For Seq2Seq generation, we only need to use the decoder instead of the whole model in generation loop
+        # and pass it the `encoder_outputs`, which are part of the `model_kwargs`.
+        model = self.decode if self.config.is_encoder_decoder else self
+        # flatten beam dim
+        if "encoder_outputs" in model_kwargs:
+            model_kwargs["encoder_outputs"]["last_hidden_state"] = flatten_beam_dim(
+                model_kwargs["encoder_outputs"]["last_hidden_state"]
+            )
+        if "attention_mask" in model_kwargs:
+            model_kwargs["attention_mask"] = flatten_beam_dim(model_kwargs["attention_mask"])
+        # initialize model specific kwargs
+        model_kwargs = self.prepare_inputs_for_generation(flatten_beam_dim(input_ids), max_length, **model_kwargs)
+        # initialize state
+        state = BeamSearchState(
+            cur_len=cur_len,
+            running_sequences=running_sequences,
+            running_scores=running_scores,
+            sequences=sequences,
+            scores=scores,
+            is_sent_finished=is_sent_finished,
+            model_kwargs=model_kwargs,
+        )
+        def beam_search_cond_fn(state):
+            """beam search state termination condition fn."""
+            # 1. is less than max length?
+            not_max_length_yet = state.cur_len < max_length
+            # 2. can the new beams still improve?
+            best_running_score = state.running_scores[:, -1:] / (max_length**length_penalty)
+            worst_finished_score = jnp.where(
+                state.is_sent_finished, jnp.min(state.scores, axis=1, keepdims=True), np.array(-1.0e7)
+            )
+            improvement_still_possible = jnp.all(worst_finished_score < best_running_score)
+            # 3. is there still a beam that has not finished?
+            still_open_beam = ~(jnp.all(state.is_sent_finished) & early_stopping)
+            return not_max_length_yet & still_open_beam & improvement_still_possible
+        def beam_search_body_fn(state, input_ids_length=1):
+            """beam search state update fn."""
+            # 1. Forward current tokens
+            # Collect the current position slice along length to feed the fast
+            # autoregressive decoder model.  Flatten the beam dimension into batch
+            # dimension for feeding into the model.
+            # unflatten beam dimension
+            # Unflatten beam dimension in attention cache arrays
+            input_token = flatten_beam_dim(
+                lax.dynamic_slice(
+                    state.running_sequences,
+                    (0, 0, state.cur_len - input_ids_length),
+                    (batch_size, num_beams, input_ids_length),
+                )
+            )
+            model_outputs = model(input_token, params=params, **state.model_kwargs)
+            logits = unflatten_beam_dim(model_outputs.logits[:, -1], batch_size, num_beams)
+            cache = jax.tree_map(
+                lambda tensor: unflatten_beam_dim(tensor, batch_size, num_beams), model_outputs.past_key_values
+            )
+            # adapt logits for FlaxMarianMTModel
+            logits = self._adapt_logits_for_beam_search(logits)
+            # 2. Compute log probs
+            # get log probabilities from logits,
+            # process logits with processors (*e.g.* min_length, ...), and
+            # add new logprobs to existing running logprobs scores.
+            log_probs = jax.nn.log_softmax(logits)
+            log_probs = logits_processor(
+                flatten_beam_dim(running_sequences), flatten_beam_dim(log_probs), state.cur_len
+            )
+            log_probs = unflatten_beam_dim(log_probs, batch_size, num_beams)
+            log_probs = log_probs + jnp.expand_dims(state.running_scores, axis=2)
+            vocab_size = log_probs.shape[2]
+            log_probs = log_probs.reshape((batch_size, num_beams * vocab_size))
+            # 3. Retrieve top-K
+            # Each item in batch has num_beams * vocab_size candidate sequences.
+            # For each item, get the top 2*k candidates with the highest log-
+            # probabilities. We gather the top 2*K beams here so that even if the best
+            # K sequences reach EOS simultaneously, we have another K sequences
+            # remaining to continue the live beam search.
+            # Gather the top 2*K scores from _all_ beams.
+            # Gather 2*k top beams.
+            # Recover the beam index by floor division.
+            # Recover token id by modulo division and expand Id array for broadcasting.
+            # Update sequences for the 2*K top-k new sequences.
+            beams_to_keep = 2 * num_beams
+            topk_log_probs, topk_indices = lax.top_k(log_probs, k=beams_to_keep)
+            topk_beam_indices = topk_indices // vocab_size
+            topk_running_sequences = gather_beams(
+                state.running_sequences, topk_beam_indices, batch_size, beams_to_keep
+            )
+            topk_ids = jnp.expand_dims(topk_indices % vocab_size, axis=2)
+            topk_sequences = lax.dynamic_update_slice(topk_running_sequences, topk_ids, (0, 0, state.cur_len))
+            # 4. Check which sequences have ended
+            # Update current sequences:
+            # Did any of these sequences reach an end marker?
+            # To prevent these just finished sequences from being added to the current sequences
+            # set of active beam search sequences, set their log probs to a very large
+            # negative value.
+            did_topk_just_finished = topk_sequences[:, :, state.cur_len] == eos_token_id
+            running_topk_log_probs = topk_log_probs + did_topk_just_finished * np.array(-1.0e7)
+            # 5. Get running sequences scores for next
+            # Determine the top k beam indices (from top 2*k beams) from log probs
+            # and gather top k beams (from top 2*k beams).
+            next_topk_indices = jnp.flip(lax.top_k(running_topk_log_probs, k=num_beams)[1], axis=1)
+            next_running_sequences, next_running_scores = gather_beams(
+                [topk_sequences, running_topk_log_probs], next_topk_indices, batch_size, num_beams
+            )
+            # 6. Process topk logits
+            # Further process log probs:
+            # - add length penalty
+            # - make sure no scores can be added anymore if beam is full
+            # - make sure still running sequences cannot be chosen as finalized beam
+            topk_log_probs = topk_log_probs / (state.cur_len**length_penalty)
+            beams_in_batch_are_full = (
+                jnp.broadcast_to(state.is_sent_finished.all(axis=-1, keepdims=True), did_topk_just_finished.shape)
+                & early_stopping
+            )
+            add_penalty = ~did_topk_just_finished | beams_in_batch_are_full
+            topk_log_probs += add_penalty * np.array(-1.0e7)
+            # 7. Get scores, sequences, is sentence finished for next.
+            # Combine sequences, scores, and flags along the beam dimension and compare
+            # new finished sequence scores to existing finished scores and select the
+            # best from the new set of beams
+            merged_sequences = jnp.concatenate([state.sequences, topk_sequences], axis=1)
+            merged_scores = jnp.concatenate([state.scores, topk_log_probs], axis=1)
+            merged_is_sent_finished = jnp.concatenate([state.is_sent_finished, did_topk_just_finished], axis=1)
+            topk_merged_indices = jnp.flip(lax.top_k(merged_scores, k=num_beams)[1], axis=1)
+            next_sequences, next_scores, next_is_sent_finished = gather_beams(
+                [merged_sequences, merged_scores, merged_is_sent_finished], topk_merged_indices, batch_size, num_beams
+            )
+            # 8. Update model kwargs.
+            # Determine the top k beam indices from the original set of all beams.
+            # With these, gather the top k beam-associated caches.
+            next_running_indices = gather_beams(topk_beam_indices, next_topk_indices, batch_size, num_beams)
+            next_cache = gather_beams(cache, next_running_indices, batch_size, num_beams)
+            model_outputs["past_key_values"] = jax.tree_map(lambda x: flatten_beam_dim(x), next_cache)
+            next_model_kwargs = self.update_inputs_for_generation(model_outputs, state.model_kwargs)
+            return BeamSearchState(
+                cur_len=state.cur_len + 1,
+                running_scores=next_running_scores,
+                running_sequences=next_running_sequences,
+                scores=next_scores,
+                sequences=next_sequences,
+                is_sent_finished=next_is_sent_finished,
+                model_kwargs=next_model_kwargs,
+            )
+        # The very first prompt often has sequence length > 1, so run outside of `lax.while_loop` to comply with TPU
+        if input_ids.shape[-1] > 1:
+            state = partial(beam_search_body_fn, input_ids_length=input_ids.shape[-1])(state)
+        if not trace:
+            state = self._run_loop_in_debug(beam_search_cond_fn, beam_search_body_fn, state)
+        else:
+            state = lax.while_loop(beam_search_cond_fn, beam_search_body_fn, state)
+        # Account for the edge-case where there are no finished sequences for a
+        # particular batch item. If so, return running sequences for that batch item.
+        none_finished = jnp.any(state.is_sent_finished, axis=1)
+        sequences = jnp.where(none_finished[:, None, None], state.sequences, state.running_sequences)
+        scores = jnp.where(none_finished[:, None], state.scores, state.running_scores)
+        # return all beams for each batch and the best score
+        sequences = sequences[:, :]
+        scores = scores[:, -1]
+        return FlaxBeamSearchOutput(sequences=sequences, scores=scores)

models/modeling_flax_wav2vec2.py ADDED Viewed

	@@ -0,0 +1,975 @@

+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Flax Wav2Vec2 model."""
+from functools import partial
+from typing import Optional, Tuple, Union
+import flax
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict
+from flax.linen import partitioning as nn_partitioning
+from flax.linen.attention import dot_product_attention_weights
+from jax import lax
+from transformers.modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
+from transformers.modeling_flax_utils import ACT2FN, FlaxPreTrainedModel
+from transformers.utils import ModelOutput
+from models import Wav2Vec2Config
+scan_with_axes = nn_partitioning.scan_with_axes
+remat = nn_partitioning.remat
+@flax.struct.dataclass
+class FlaxWav2Vec2BaseModelOutput(ModelOutput):
+    """
+    Output type of [`FlaxWav2Vec2BaseModelOutput`], with potential hidden states and attentions.
+    Args:
+        last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        extract_features (`jnp.ndarray` of shape `(batch_size, sequence_length, last_conv_dim)`):
+            Sequence of extracted feature vectors of the last convolutional layer of the model with `last_conv_dim`
+            being the dimension of the last convolutional layer.
+        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+    last_hidden_state: jnp.ndarray = None
+    extract_features: jnp.ndarray = None
+    hidden_states: Optional[Tuple[jnp.ndarray]] = None
+    attentions: Optional[Tuple[jnp.ndarray]] = None
+WAV_2_VEC_2_START_DOCSTRING = r"""
+    Wav2Vec2 was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech
+    Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael
+    Auli.
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a Flax Linen
+    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
+    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
+    Finally, this model supports inherent JAX features such as:
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+    Parameters:
+        config ([`Wav2Vec2Config`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+            `jax.numpy.bfloat16` (on TPUs).
+            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+            specified all the computation will be performed with the given `dtype`.
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.**
+            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+            [`~FlaxPreTrainedModel.to_bf16`].
+"""
+WAV_2_VEC_2_INPUTS_DOCSTRING = r"""
+    Args:
+        input_values (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
+            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
+            into an array of type *List[float]* or a *numpy.ndarray*, *e.g.* via the soundfile library (*pip install
+            soundfile*). To prepare the array into *input_values*, the [`Wav2Vec2Processor`] should be used for padding
+            and conversion into a tensor of type *jnp.ndarray*. See [`Wav2Vec2Processor.__call__`] for details.
+        attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
+            1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask) .. warning:: `attention_mask` should only be passed
+            if the corresponding processor has `config.return_attention_mask == True`. For all models whose processor
+            has `config.return_attention_mask == False`, such as
+            [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), `attention_mask` should **not** be
+            passed to avoid degraded performance when doing batched inference. For such models `input_values` should
+            simply be padded with 0 and passed without `attention_mask`. Be aware that these models also yield slightly
+            different results depending on whether `input_values` is padded or not.
+        mask_time_indices (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
+            masked extracted features in *config.proj_codevector_dim* space.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+class FlaxWav2Vec2LayerNormConvLayer(nn.Module):
+    config: Wav2Vec2Config
+    layer_id: int = 0
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.in_conv_dim = self.config.conv_dim[self.layer_id] if self.layer_id > 0 else 1
+        self.out_conv_dim = self.config.conv_dim[self.layer_id]
+        self.conv = nn.Conv(
+            features=self.config.conv_dim[self.layer_id],
+            kernel_size=(self.config.conv_kernel[self.layer_id],),
+            strides=(self.config.conv_stride[self.layer_id],),
+            use_bias=self.config.conv_bias,
+            kernel_init=jax.nn.initializers.he_normal(),
+            padding="VALID",
+            dtype=self.dtype,
+        )
+        self.layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+        self.activation = ACT2FN[self.config.feat_extract_activation]
+    def __call__(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+class FlaxConvWithWeightNorm(nn.Module):
+    config: Wav2Vec2Config
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.conv = nn.Conv(
+            features=self.config.hidden_size,
+            kernel_size=(self.config.num_conv_pos_embeddings,),
+            kernel_init=jax.nn.initializers.he_normal(),
+            padding="VALID",
+            feature_group_count=self.config.num_conv_pos_embedding_groups,
+            dtype=self.dtype,
+        )
+        weight_shape = (
+            self.conv.features,
+            self.conv.features // self.conv.feature_group_count,
+            self.conv.kernel_size[0],
+        )
+        self.weight_v = self.param("weight_v", jax.nn.initializers.he_normal(), weight_shape)
+        self.weight_g = self.param("weight_g", lambda _: jnp.linalg.norm(self.weight_v, axis=(0, 1))[None, None, :])
+        self.bias = self.param("bias", jax.nn.initializers.zeros, (self.conv.features,))
+        self.prev_padding = self.conv.kernel_size[0] // 2
+    def _get_normed_weights(self):
+        weight_v_norm = jnp.linalg.norm(self.weight_v, axis=(0, 1))[None, None, :]
+        normed_weight_v = jnp.divide(self.weight_v, weight_v_norm)
+        normed_kernel = jnp.multiply(normed_weight_v, self.weight_g)
+        return normed_kernel
+    def __call__(self, hidden_states):
+        kernel = self._get_normed_weights()
+        hidden_states = jnp.pad(hidden_states, ((0, 0), (self.prev_padding, self.prev_padding), (0, 0)))
+        hidden_states = self.conv.apply({"params": {"kernel": kernel.T, "bias": self.bias}}, hidden_states)
+        return hidden_states
+class FlaxWav2Vec2PositionalConvEmbedding(nn.Module):
+    config: Wav2Vec2Config
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.conv = FlaxConvWithWeightNorm(self.config, dtype=self.dtype)
+        self.activation = ACT2FN[self.config.feat_extract_activation]
+        self.num_pad_remove = 1 if self.config.num_conv_pos_embeddings % 2 == 0 else 0
+    def __call__(self, hidden_states):
+        hidden_states = hidden_states.transpose((0, 1, 2))
+        hidden_states = self.conv(hidden_states)
+        if self.num_pad_remove > 0:
+            hidden_states = hidden_states[:, : -self.num_pad_remove, :]
+        hidden_states = self.activation(hidden_states)
+        hidden_states = hidden_states.transpose((0, 1, 2))
+        return hidden_states
+class FlaxConvLayersCollection(nn.Module):
+    config: Wav2Vec2Config
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        if self.config.feat_extract_norm == "layer":
+            # note that we can't use scan on the conv layers as they differ on a layer-by-layer basis
+            BlockLayer = remat(FlaxWav2Vec2LayerNormConvLayer) if self.config.gradient_checkpointing else FlaxWav2Vec2LayerNormConvLayer
+            self.layers = [
+                BlockLayer(self.config, layer_id=i, name=str(i), dtype=self.dtype)
+                for i in range(self.config.num_feat_extract_layers)
+            ]
+        elif self.config.feat_extract_norm == "group":
+            raise NotImplementedError("At the moment only ``config.feat_extact_norm == 'layer'`` is supported")
+        else:
+            raise ValueError(
+                f"`config.feat_extract_norm` is {self.config.feat_extract_norm}, but has to be one of ['group', 'layer']"
+            )
+    def __call__(self, hidden_states):
+        for i, conv_layer in enumerate(self.layers):
+            hidden_states = conv_layer(hidden_states)
+        return hidden_states
+class FlaxWav2Vec2FeatureEncoder(nn.Module):
+    """Construct the features from raw audio waveform"""
+    config: Wav2Vec2Config
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.conv_layers = FlaxConvLayersCollection(self.config, dtype=self.dtype)
+    def __call__(self, input_values, freeze_feature_encoder=False):
+        hidden_states = input_values[:, :, None]
+        hidden_states = self.conv_layers(hidden_states)
+        if freeze_feature_encoder:
+            hidden_states = jax.lax.stop_gradient(hidden_states)
+        return hidden_states
+class FlaxWav2Vec2FeatureProjection(nn.Module):
+    config: Wav2Vec2Config
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+        self.projection = nn.Dense(
+            self.config.hidden_size,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.dropout = nn.Dropout(rate=self.config.feat_proj_dropout)
+    def __call__(self, hidden_states, deterministic=True):
+        norm_hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.projection(norm_hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        return hidden_states, norm_hidden_states
+class FlaxWav2Vec2Attention(nn.Module):
+    config: Wav2Vec2Config
+    embed_dim: int
+    num_heads: int
+    dropout: float = 0.0
+    bias: bool = True
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    def setup(self) -> None:
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
+            )
+        dense = partial(
+            nn.Dense,
+            self.embed_dim,
+            use_bias=self.bias,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense()
+        self.fused_proj = nn.Dense(
+            self.embed_dim * 3,
+            use_bias=self.bias,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        self.out_proj = dense()
+        self.dropout_layer = nn.Dropout(rate=self.dropout)
+    def _split_heads(self, hidden_states):
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
+    def _merge_heads(self, hidden_states):
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
+    def __call__(
+        self,
+        hidden_states: jnp.ndarray,
+        key_value_states: Optional[jnp.ndarray] = None,
+        attention_mask: Optional[jnp.ndarray] = None,
+        deterministic: bool = True,
+    ) -> Tuple[jnp.ndarray]:
+        """Input shape: Batch x Time x Channel"""
+        if self.config.fuse_matmuls:
+            attention_states = self.fused_proj(hidden_states)
+            query_states, key_states, value_states = jnp.split(attention_states, 3, axis=-1)
+        else:
+            # get query proj
+            query_states = self.q_proj(hidden_states)
+            key_states = self.k_proj(hidden_states)
+            value_states = self.v_proj(hidden_states)
+        query_states = self._split_heads(query_states)
+        key_states = self._split_heads(key_states)
+        value_states = self._split_heads(value_states)
+        if attention_mask is not None:
+            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
+        # Convert the boolean attention mask to an attention bias.
+        if attention_mask is not None:
+            # attention mask in the form of attention bias
+            attention_bias = lax.select(
+                attention_mask > 0,
+                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+                jnp.full(attention_mask.shape, float("-inf")).astype(self.dtype),
+            )
+        else:
+            attention_bias = None
+        dropout_rng = None
+        if not deterministic and self.dropout > 0.0:
+            dropout_rng = self.make_rng("dropout")
+        attn_weights = dot_product_attention_weights(
+            query_states,
+            key_states,
+            bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.dropout,
+            broadcast_dropout=True,
+            deterministic=deterministic,
+            dtype=self.dtype,
+            precision=None,
+        )
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
+        attn_output = self._merge_heads(attn_output)
+        attn_output = self.out_proj(attn_output)
+        return attn_output, attn_weights
+class FlaxWav2Vec2FeedForward(nn.Module):
+    config: Wav2Vec2Config
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.intermediate_dropout = nn.Dropout(rate=self.config.activation_dropout)
+        self.intermediate_dense = nn.Dense(
+            self.config.intermediate_size,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        if isinstance(self.config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[self.config.hidden_act]
+        else:
+            self.intermediate_act_fn = self.config.hidden_act
+        self.output_dense = nn.Dense(
+            self.config.hidden_size,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.output_dropout = nn.Dropout(rate=self.config.hidden_dropout)
+    def __call__(self, hidden_states, deterministic=True):
+        hidden_states = self.intermediate_dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        hidden_states = self.intermediate_dropout(hidden_states, deterministic=deterministic)
+        hidden_states = self.output_dense(hidden_states)
+        hidden_states = self.output_dropout(hidden_states, deterministic=deterministic)
+        return hidden_states
+class FlaxWav2Vec2EncoderLayerStableLayerNorm(nn.Module):
+    config: Wav2Vec2Config
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.attention = FlaxWav2Vec2Attention(
+            config=self.config,
+            embed_dim=self.config.hidden_size,
+            num_heads=self.config.num_attention_heads,
+            dropout=self.config.attention_dropout,
+            dtype=self.dtype,
+        )
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout)
+        self.layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+        self.feed_forward = FlaxWav2Vec2FeedForward(self.config, dtype=self.dtype)
+        self.final_layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+    def __call__(self, hidden_states, attention_mask=None, deterministic=True, output_attentions=False):
+        if self.config.use_scan:
+            hidden_states = hidden_states[0]
+        attn_residual = hidden_states
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states, attn_weights = self.attention(
+            hidden_states, attention_mask=attention_mask, deterministic=deterministic
+        )
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        hidden_states = attn_residual + hidden_states
+        hidden_states = hidden_states + self.feed_forward(
+            self.final_layer_norm(hidden_states), deterministic=deterministic
+        )
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (attn_weights,)
+        if self.config.use_scan:
+            outputs = (outputs, None)
+        return outputs
+class FlaxWav2Vec2EncoderLayerStableLayerNormCollection(nn.Module):
+    config: Wav2Vec2Config
+    dtype: jnp.dtype = jnp.float32
+    @nn.compact
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        num_layers = self.config.num_hidden_layers
+        BlockEncoderLayer = (
+            remat(
+                FlaxWav2Vec2EncoderLayerStableLayerNorm,
+                static_argnums=(2, 3),
+                prevent_cse=not self.config.use_scan,
+            )
+            if self.config.gradient_checkpointing
+            else FlaxWav2Vec2EncoderLayerStableLayerNorm
+        )
+        if self.config.use_scan:
+            # since all decoder layers are the same, we use nn.scan directly
+            assert not output_attentions, "cannot use `scan` with `output_attentions` set to `True`"
+            assert not output_hidden_states, "cannot use `scan` with `output_hidden_states` set to `True`"
+            hidden_states = (hidden_states,)
+            hidden_states, _ = scan_with_axes(
+                BlockEncoderLayer,
+                variable_axes={"params": 0, "cache": 0},
+                split_rngs={"params": True, "dropout": True},
+                in_axes=(nn.broadcast, nn.broadcast, nn.broadcast),
+                length=num_layers,
+            )(self.config, dtype=self.dtype, name="FlaxWav2Vec2EncoderLayers",)(
+                hidden_states, attention_mask, deterministic, output_attentions
+            )
+            hidden_states = hidden_states[0]
+        else:
+            for layer in range(num_layers):
+                if output_hidden_states:
+                    all_hidden_states += (hidden_states,)
+                layer_outputs = BlockEncoderLayer(
+                    self.config,
+                    dtype=self.dtype,
+                    name=str(layer),
+                )(hidden_states, attention_mask, deterministic, output_attentions)
+                hidden_states = layer_outputs[0]
+                if output_attentions:
+                    all_attentions += (layer_outputs[1],)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+        outputs = (hidden_states, all_hidden_states, all_attentions)
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+        return FlaxBaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
+        )
+class FlaxWav2Vec2StableLayerNormEncoder(nn.Module):
+    config: Wav2Vec2Config
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.pos_conv_embed = FlaxWav2Vec2PositionalConvEmbedding(self.config, dtype=self.dtype)
+        self.layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout)
+        self.layers = FlaxWav2Vec2EncoderLayerStableLayerNormCollection(self.config, dtype=self.dtype)
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        deterministic=True,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        if attention_mask is not None:
+            # make sure padded tokens are not attended to
+            hidden_states = jnp.where(
+                jnp.broadcast_to(attention_mask[:, :, None], hidden_states.shape), hidden_states, 0
+            )
+        position_embeddings = self.pos_conv_embed(hidden_states)
+        hidden_states = hidden_states + position_embeddings
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        outputs = self.layers(
+            hidden_states,
+            attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        last_hidden_state = self.layer_norm(outputs[0])
+        # update the last element in `hidden_states` after applying `layernorm` above
+        hidden_states = None
+        if output_hidden_states:
+            hidden_states = outputs[1]
+            hidden_states = hidden_states[:-1] + (last_hidden_state,)
+        if not return_dict:
+            outputs = (last_hidden_state, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:])
+            return tuple(v for v in outputs if v is not None)
+        return FlaxBaseModelOutput(
+            last_hidden_state=last_hidden_state, hidden_states=hidden_states, attentions=outputs.attentions
+        )
+class FlaxWav2Vec2Adapter(nn.Module):
+    config: Wav2Vec2Config
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        # hidden_states require down-projection if feature dims don't match
+        if self.config.output_hidden_size != self.config.hidden_size:
+            self.proj = nn.Dense(
+                self.config.output_hidden_size,
+                kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+                dtype=self.dtype,
+            )
+            self.proj_layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+        else:
+            self.proj = self.proj_layer_norm = None
+        self.layers = FlaxWav2Vec2AdapterLayersCollection(self.config, dtype=self.dtype)
+    def __call__(self, hidden_states, deterministic=True):
+        # down-project hidden_states if required
+        if self.proj is not None and self.proj_layer_norm is not None:
+            hidden_states = self.proj(hidden_states)
+            hidden_states = self.proj_layer_norm(hidden_states)
+        hidden_states = self.layers(hidden_states)
+        return hidden_states
+class FlaxWav2Vec2AdapterLayer(nn.Module):
+    config: Wav2Vec2Config
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.conv = nn.Conv(
+            features=2 * self.config.output_hidden_size,
+            kernel_size=(self.config.adapter_kernel_size,),
+            strides=(self.config.adapter_stride,),
+            padding=((1, 1),),
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            dtype=self.dtype,
+        )
+    def __call__(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+        hidden_states = nn.glu(hidden_states, axis=2)
+        return hidden_states
+class FlaxWav2Vec2AdapterLayersCollection(nn.Module):
+    config: Wav2Vec2Config
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        BlockAdapterLayer = remat(FlaxWav2Vec2AdapterLayer) if self.config.gradient_checkpointing else FlaxWav2Vec2AdapterLayer
+        self.layers = [
+            BlockAdapterLayer(self.config, name=str(i), dtype=self.dtype)
+            for i in range(self.config.num_adapter_layers)
+        ]
+    def __call__(self, hidden_states):
+        for conv_layer in self.layers:
+            hidden_states = conv_layer(hidden_states)
+        return hidden_states
+class FlaxWav2Vec2PreTrainedModel(FlaxPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+    config_class = Wav2Vec2Config
+    base_model_prefix: str = "wav2vec2"
+    main_input_name = "input_values"
+    module_class: nn.Module = None
+    def __init__(
+        self,
+        config: Wav2Vec2Config,
+        input_shape: Tuple = (1, 1024),
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):
+        module = self.module_class(config=config, dtype=dtype, **kwargs)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple) -> FrozenDict:
+        # init input tensors
+        input_values = jnp.zeros(input_shape, dtype="i4")
+        attention_mask = jnp.ones_like(input_values)
+        params_rng, dropout_rng = jax.random.split(rng, 2)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+        return self.module.init(rngs, input_values, attention_mask, return_dict=False)["params"]
+    def __call__(
+        self,
+        input_values,
+        attention_mask=None,
+        mask_time_indices=None,
+        extract_features=None,
+        params: dict = None,
+        dropout_rng: jax.random.PRNGKey = None,
+        train: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_features: Optional[bool] = None,
+        freeze_feature_encoder: bool = False,
+        return_dict: Optional[bool] = None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        if attention_mask is None:
+            batch_size, sequence_length = input_values.shape
+            attention_mask = jnp.ones((batch_size, sequence_length))
+        if extract_features is not None:
+            extract_features = jnp.array(extract_features, dtype="f4")
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+        inputs = {"params": params or self.params}
+        return self.module.apply(
+            inputs,
+            jnp.array(input_values, dtype="f4"),
+            jnp.array(attention_mask, dtype="i4"),
+            mask_time_indices,
+            extract_features,
+            not train,
+            output_attentions,
+            output_hidden_states,
+            output_features,
+            freeze_feature_encoder,
+            return_dict,
+            rngs=rngs,
+        )
+    def _get_feat_extract_output_lengths(
+        self, input_lengths: Union[jnp.ndarray, int], add_adapter: Optional[bool] = None
+    ):
+        return self.module._get_feat_extract_output_lengths(input_lengths, add_adapter=add_adapter)
+    def _get_feature_vector_attention_mask(
+        self, feature_vector_length: int, attention_mask: jnp.ndarray, add_adapter=None
+    ):
+        return self.module._get_feature_vector_attention_mask(feature_vector_length, attention_mask, add_adapter=add_adapter)
+class FlaxWav2Vec2Module(nn.Module):
+    config: Wav2Vec2Config
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.feature_extractor = FlaxWav2Vec2FeatureEncoder(self.config, dtype=self.dtype)
+        self.feature_projection = FlaxWav2Vec2FeatureProjection(self.config, dtype=self.dtype)
+        self.masked_spec_embed = self.param(
+            "masked_spec_embed", jax.nn.initializers.uniform(), (self.config.hidden_size,)
+        )
+        if self.config.do_stable_layer_norm:
+            self.encoder = FlaxWav2Vec2StableLayerNormEncoder(self.config, dtype=self.dtype)
+        else:
+            raise NotImplementedError("``config.do_stable_layer_norm is False`` is currently not supported.")
+        self.adapter = FlaxWav2Vec2Adapter(self.config, dtype=self.dtype) if self.config.add_adapter else None
+    def __call__(
+        self,
+        input_values,
+        attention_mask=None,
+        mask_time_indices=None,
+        extract_features=None,
+        deterministic=True,
+        output_attentions=None,
+        output_hidden_states=None,
+        output_features=False,
+        freeze_feature_encoder=False,
+        return_dict=None,
+    ):
+        # forward pass through the feature extractor if features not specified
+        if extract_features is None:
+            extract_features = self.feature_extractor(input_values, freeze_feature_encoder=freeze_feature_encoder)
+        if output_features:
+            return extract_features
+        # make sure that no loss is computed on padded inputs
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(
+                extract_features.shape[1], attention_mask, add_adapter=False
+            )
+        hidden_states, extract_features = self.feature_projection(extract_features, deterministic=deterministic)
+        if mask_time_indices is not None:  # apply SpecAugment along time axis with given indices
+            hidden_states = jnp.where(
+                jnp.broadcast_to(mask_time_indices[:, :, None], hidden_states.shape),
+                jnp.broadcast_to(self.masked_spec_embed[None, None, :], hidden_states.shape),
+                hidden_states,
+            )
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = encoder_outputs[0]
+        if self.adapter is not None:
+            hidden_states = self.adapter(hidden_states)
+        if not return_dict:
+            return (hidden_states, extract_features) + encoder_outputs[1:]
+        return FlaxWav2Vec2BaseModelOutput(
+            last_hidden_state=hidden_states,
+            extract_features=extract_features,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+    def _get_feat_extract_output_lengths(
+        self, input_lengths: Union[jnp.ndarray, int], add_adapter: Optional[bool] = None
+    ):
+        """
+        Computes the output length of the convolutional layers
+        """
+        add_adapter = self.config.add_adapter if add_adapter is None else add_adapter
+        def _conv_out_length(input_length, kernel_size, stride):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return (input_length - kernel_size) // stride + 1
+        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
+            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
+        if add_adapter:
+            for _ in range(self.config.num_adapter_layers):
+                input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)
+        return input_lengths
+    def _get_feature_vector_attention_mask(
+        self, feature_vector_length: int, attention_mask: jnp.ndarray, add_adapter=None
+    ):
+        # Effectively attention_mask.sum(-1), but not inplace to be able to run
+        # on inference mode.
+        non_padded_lengths = attention_mask.cumsum(axis=-1)[:, -1]
+        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
+        batch_size = attention_mask.shape[0]
+        attention_mask = jnp.zeros((batch_size, feature_vector_length), dtype=attention_mask.dtype)
+        # these two operations makes sure that all values
+        # before the output lengths indices are attended to
+        attention_mask = attention_mask.at[jnp.arange(attention_mask.shape[0]), output_lengths - 1].set(1)
+        attention_mask = jnp.flip(jnp.flip(attention_mask, -1).cumsum(-1), -1).astype("bool")
+        return attention_mask
+class FlaxWav2Vec2Model(FlaxWav2Vec2PreTrainedModel):
+    module_class = FlaxWav2Vec2Module
+class FlaxWav2Vec2ForCTCModule(nn.Module):
+    config: Wav2Vec2Config
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.wav2vec2 = FlaxWav2Vec2Module(self.config, dtype=self.dtype)
+        self.dropout = nn.Dropout(rate=self.config.final_dropout)
+        self.lm_head = nn.Dense(
+            self.config.vocab_size,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            dtype=self.dtype,
+        )
+    def __call__(
+        self,
+        input_values,
+        attention_mask=None,
+        mask_time_indices=None,
+        extract_features=None,
+        deterministic=True,
+        output_attentions=None,
+        output_hidden_states=None,
+        output_features=False,
+        freeze_feature_encoder=False,
+        return_dict=None,
+    ):
+        outputs = self.wav2vec2(
+            input_values,
+            attention_mask=attention_mask,
+            mask_time_indices=mask_time_indices,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            freeze_feature_encoder=freeze_feature_encoder,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        logits = self.lm_head(hidden_states)
+        if not return_dict:
+            return (logits,) + outputs[2:]
+        return FlaxCausalLMOutput(logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
+    def _get_feat_extract_output_lengths(
+        self, input_lengths: Union[jnp.ndarray, int], add_adapter: Optional[bool] = None
+    ):
+        """
+        Computes the output length of the convolutional layers
+        """
+        add_adapter = self.config.add_adapter if add_adapter is None else add_adapter
+        def _conv_out_length(input_length, kernel_size, stride):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return (input_length - kernel_size) // stride + 1
+        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
+            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
+        if add_adapter:
+            for _ in range(self.config.num_adapter_layers):
+                input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)
+        return input_lengths
+    def _get_feature_vector_attention_mask(
+        self, feature_vector_length: int, attention_mask: jnp.ndarray, add_adapter=None
+    ):
+        # Effectively attention_mask.sum(-1), but not inplace to be able to run
+        # on inference mode.
+        non_padded_lengths = attention_mask.cumsum(axis=-1)[:, -1]
+        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
+        batch_size = attention_mask.shape[0]
+        attention_mask = jnp.zeros((batch_size, feature_vector_length), dtype=attention_mask.dtype)
+        # these two operations makes sure that all values
+        # before the output lengths indices are attended to
+        attention_mask = attention_mask.at[jnp.arange(attention_mask.shape[0]), output_lengths - 1].set(1)
+        attention_mask = jnp.flip(jnp.flip(attention_mask, -1).cumsum(-1), -1).astype("bool")
+        return attention_mask
+class FlaxWav2Vec2ForCTC(FlaxWav2Vec2PreTrainedModel):
+    module_class = FlaxWav2Vec2ForCTCModule

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "do_normalize": true,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "processor_class": "Wav2Vec2Processor",
+  "return_attention_mask": true,
+  "sampling_rate": 16000
+}

run_flax_speech_recognition_ctc.py ADDED Viewed

	@@ -0,0 +1,1398 @@

+#!/usr/bin/env python
+# coding=utf-8
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the Flax library models for connectionist temporal classification (CTC) speech recognition.
+"""
+# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
+import logging
+import math
+import os
+import sys
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Union
+import datasets
+import numpy as np
+from datasets import DatasetDict, load_dataset, load_metric
+from tqdm import tqdm
+import flax
+import jax
+import jax.numpy as jnp
+import optax
+import transformers
+import wandb as wandb
+from flax import core, jax_utils, struct, traverse_util
+from flax.jax_utils import unreplicate, pad_shard_unpad
+from flax.training.common_utils import get_metrics, shard, shard_prng_key
+from huggingface_hub import Repository
+from models import Wav2Vec2Config, FlaxWav2Vec2ForCTC
+from optax._src import linear_algebra
+from transformers import (
+    AutoFeatureExtractor,
+    AutoProcessor,
+    AutoTokenizer,
+    HfArgumentParser,
+    TrainingArguments,
+    is_tensorboard_available,
+)
+from transformers.file_utils import get_full_repo_name
+from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+check_min_version("4.17.0.dev0")
+require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
+logger = logging.getLogger(__name__)
+@flax.struct.dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+    """
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    feature_extractor_name: Optional[str] = field(
+        default=None, metadata={"help": "feature extractor name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    use_auth_token: bool = field(
+        default=False,
+        metadata={
+            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+            "with private models)."
+        },
+    )
+    freeze_feature_encoder: bool = field(
+        default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
+    )
+    activation_dropout: float = field(
+        default=0.1,
+        metadata={
+            "help": "The hidden activation dropout probability in the embeddings, encoder, and pooler."
+        },
+    )
+    hidden_dropout: float = field(
+        default=0.1,
+        metadata={
+            "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
+        },
+    )
+    feat_proj_dropout: float = field(
+        default=0.0,
+        metadata={
+            "help": "The feat proj dropout probability for feature encoder representations."
+        },
+    )
+    mask_time_prob: float = field(
+        default=0.1,
+        metadata={
+            "help": "The spec aug dropout probability for feature encoder representations."
+        },
+    )
+@flax.struct.dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+    dataset_name: str = field(
+        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    text_column: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
+    )
+    dataset_cache_dir: Optional[str] = field(
+        default=None, metadata={"help": "Path to cache directory for saving and loading datasets"}
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+            "value if set."
+        },
+    )
+    max_test_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of test examples to this "
+            "value if set."
+        },
+    )
+    audio_column_name: str = field(
+        default="audio",
+        metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
+    )
+    text_column_name: str = field(
+        default="text",
+        metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
+    )
+    max_duration_in_seconds: float = field(
+        default=20.0,
+        metadata={
+            "help": "Filter audio files in the training set that are longer than `max_duration_in_seconds` seconds"
+        },
+    )
+    min_duration_in_seconds: float = field(
+        default=0.0, metadata={"help": "Filter audio files in the training set that are shorter than `min_duration_in_seconds` seconds"}
+    )
+    max_label_length: Optional[int] = field(
+        default=512,
+        metadata={
+            "help": "The minimum total sequence length for target text after tokenization. Sequences shorter "
+            "than this will be filtered."
+        },
+    )
+    min_label_length: Optional[int] = field(
+        default=0,
+        metadata={
+            "help": "The minimum total sequence length for target text after tokenization. Sequences shorter "
+            "than this will be filtered."
+        },
+    )
+    max_eval_duration_in_seconds: float = field(
+        default=None,
+        metadata={
+            "help": "Filter audio files in the eval/test set that are longer than `max_duration_in_seconds` seconds"
+        },
+    )
+    pad_input_to_multiple_of: Optional[int] = field(
+        default=32000,
+        metadata={
+            "help": "If set will pad the input sequence to a multiple of the provided value. "
+            "This is important to avoid triggering recompilations on TPU."
+        },
+    )
+    pad_target_to_multiple_of: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "If set will pad the target sequence to a multiple of the provided value. "
+            "This is important to avoid triggering recompilations on TPU."
+        },
+    )
+    preprocessing_only: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to only do data preprocessing and skip training. "
+            "This is especially useful when data preprocessing errors out in distributed training due to timeout. "
+            "In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` "
+            "so that the cached datasets can consequently be loaded in distributed training"
+        },
+    )
+    train_split_name: str = field(
+        default="train",
+        metadata={
+            "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
+        },
+    )
+    eval_split_name: str = field(
+        default="validation",
+        metadata={
+            "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
+        },
+    )
+    wandb_project: str = field(
+        default="flax-speech-recognition-ctc",
+        metadata={"help": "The name of the wandb project."},
+    )
+    wandb_name: str = field(
+        default=None,
+        metadata={"help": "The name of the wandb run."},
+    )
+    wandb_job_type: str = field(
+        default="CTC",
+        metadata={"help": "The name of the wandb job type."},
+    )
+    test_split_name: str = field(
+        default="test",
+        metadata={"help": "The name of the test data set split to use (via the datasets library). Defaults to 'test'"},
+    )
+# @flax.struct.dataclass
+@dataclass
+class FlaxTrainingArguments(TrainingArguments):
+    precision: str = field(
+        default="full",
+        metadata={
+            "help": "Whether to enable mixed-precision training. If true, the optimizer is stored in half-precision (bfloat16) and computations are executed in half-precision"
+            "**Note that this only specifies the dtype of the computation and optimizer state. It does not influence the dtype of model parameters.**"
+        },
+    )
+    matmul_precision: str = field(
+        default="default",
+        metadata={
+            "help": "Default floating-point precision of internal computations used in TPU matrix multiplications and convolutions. "
+            "This configuration option controls the default precision for JAX operations that take an optional precision argument (e.g. `lax.conv_general_dilated` and `lax.dot`). "
+            "This configuration option does not change the behaviours of such calls with explicit precision arguments; "
+            "it only changes the behaviors of calls with no such argument provided. "
+            "One of `['highest', 'float32', 'high', 'bfloat16_3x', 'default', 'bfloat16', 'fastest', None]`."
+        },
+    )
+    multisteps: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to use Optax MultiSteps for gradient accumulation. If `False` (default) and `gradient_accumulation_steps > 1`, "
+            "a custom gradient accumulation implementation will be employed."
+        },
+    )
+def to_fp32(t):
+    return jax.tree_map(lambda x: x.astype(jnp.float32) if x.dtype == jnp.bfloat16 else x, t)
+def to_bf16(t):
+    return jax.tree_map(lambda x: x.astype(jnp.bfloat16) if x.dtype == jnp.float32 else x, t)
+class MixedPrecisionTrainState(struct.PyTreeNode):
+    """Train state for use with a single Optax optimizer.
+    Adapted from flax train_state https://github.com/google/flax/blob/main/flax/training/train_state.py
+    Synopsis::
+        state = TrainState.create(
+            apply_fn=model.apply,
+            params=variables['params'],
+            tx=tx)
+        grad_fn = jax.grad(make_loss_fn(state.apply_fn))
+        for batch in data:
+          grads = grad_fn(state.params, batch)
+          state = state.apply_gradients(grads=grads)
+    Args:
+      step: Counter starts at 0 and is incremented by every call to
+        `.apply_gradients()`.
+      apply_fn: Usually set to `model.apply()`. Kept in this dataclass for
+        convenience to have a shorter params list for the `train_step()` function
+        in your training loop.
+      params: The parameters to be updated by `tx` and used by `apply_fn`.
+      tx: An Optax gradient transformation.
+      opt_state: The state for `tx`.
+      dropout_rng: PRNG key for stochastic operations.
+      bf16: Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training.
+    """
+    step: int
+    apply_fn: Callable = struct.field(pytree_node=False)
+    get_attention_mask_fn: Callable = struct.field(pytree_node=False)
+    params: core.FrozenDict[str, Any]
+    tx: optax.GradientTransformation = struct.field(pytree_node=False)
+    opt_state: optax.OptState
+    dropout_rng: jnp.ndarray
+    max_grad_norm: Optional[float] = 1.0
+    def apply_gradients(self, *, grads, to_dtype, **kwargs):
+        """Updates `step`, `params`, `opt_state` and `**kwargs` in return value.
+        Note that internally this function calls `.tx.update()` followed by a call
+        to `optax.apply_updates()` to update `params` and `opt_state`.
+        Args:
+          grads: Gradients that have the same pytree structure as `.params`.
+          **kwargs: Additional dataclass attributes that should be `.replace()`-ed.
+        Returns:
+          An updated instance of `self` with `step` incremented by one, `params`
+          and `opt_state` updated by applying `grads`, and additional attributes
+          replaced as specified by `kwargs`.
+        """
+        # clip gradients by global l2 norm
+        casted_max_grad_norm = to_dtype(self.max_grad_norm)
+        g_norm = linear_algebra.global_norm(grads)
+        g_norm = jnp.maximum(casted_max_grad_norm, g_norm)
+        grads = jax.tree_map(lambda t: (t / g_norm) * casted_max_grad_norm, grads)
+        # perform update step in fp32 and subsequently downcast optimizer states if mixed precision training
+        # grads and opt_state in bf16 (need to upcast), params in fp32 (leave as is)
+        updates, new_opt_state = self.tx.update(to_fp32(grads), to_fp32(self.opt_state), self.params)
+        new_params = optax.apply_updates(self.params, updates)
+        return self.replace(
+            step=self.step + 1,
+            params=new_params,
+            opt_state=to_dtype(new_opt_state),
+            **kwargs,
+        )
+    @classmethod
+    def create(cls, *, apply_fn, params, tx, to_dtype, **kwargs):
+        """Creates a new instance with `step=0` and initialized `opt_state`."""
+        # downcast optimizer state to bf16 if mixed-precision training
+        opt_state = tx.init(to_dtype(params)) if tx is not None else None
+        return cls(
+            step=0,
+            apply_fn=apply_fn,
+            params=params,
+            tx=tx,
+            opt_state=opt_state,
+            **kwargs,
+        )
+    def replicate(self):
+        return jax_utils.replicate(self).replace(dropout_rng=shard_prng_key(self.dropout_rng))
+@flax.struct.dataclass
+class FlaxDataCollatorSpeechSeq2SeqWithPadding:
+    """
+    Data collator that will dynamically pad the inputs received.
+    Args:
+        processor ([`Wav2Vec2Processor`])
+            The processor used for proccessing the data.
+        decoder_start_token_id (:obj: `int`)
+            The begin-of-sentence of the decoder.
+        input_padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+            Select a strategy to pad the returned input sequences (according to the model's padding side and padding index)
+            among:
+            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+              sequence if provided).
+            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+              maximum acceptable input length for the model if that argument is not provided.
+            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+              different lengths).
+        target_padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+            Select a strategy to pad the returned target sequences (according to the model's padding side and padding index).
+            See above for details.
+        max_input_length (:obj:`float`, `optional`):
+            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
+        pad_input_to_multiple_of (:obj:`int`, `optional`):
+            If set will pad the input sequence to a multiple of the provided value.
+            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+            7.5 (Volta).
+        pad_target_to_multiple_of (:obj:`int`, `optional`):
+            If set will pad the target sequence to a multiple of the provided value.
+            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+            7.5 (Volta).
+    """
+    processor: Any
+    input_padding: Union[bool, str] = "longest"
+    label_padding: Union[bool, str] = "max_length"
+    pad_input_to_multiple_of: Optional[int] = None
+    pad_to_multiple_of_label: Optional[int] = None
+    max_input_length: Optional[float] = None
+    max_label_length: Optional[float] = None
+    def __call__(self, features: List[Dict[str, Union[List[int], np.ndarray]]]) -> Dict[str, np.ndarray]:
+        # split inputs and labels since they have to be of different lengths and need
+        # different padding methods
+        input_features = [{"input_values": feature["input_values"]} for feature in features]
+        label_features = [{"input_ids": feature["labels"]} for feature in features]
+        # reformat list to dict and set to pytorch format
+        batch = self.processor.feature_extractor.pad(
+            input_features,
+            max_length=self.max_input_length,
+            padding=self.input_padding,
+            pad_to_multiple_of=self.pad_input_to_multiple_of,
+            return_tensors="np",
+        )
+        labels_batch = self.processor.tokenizer.pad(
+            label_features,
+            max_length=self.max_label_length,
+            padding=self.label_padding,
+            pad_to_multiple_of=self.pad_to_multiple_of_label,
+            return_tensors="np",
+        )
+        labels = labels_batch["input_ids"]
+        labels = np.ma.array(labels, mask=np.not_equal(labels_batch.attention_mask, 1))
+        labels = labels.filled(fill_value=-100)
+        batch["labels"] = labels
+        return batch
+def get_grouped_indices(
+    dataset, batch_size: int, rng: Optional[List[int]] = None, mega_batch_mult: Optional[int] = None
+) -> np.array:
+    """
+    Adapted from the `get_length_grouped_indices` function in the PyTorch Trainer utils file (https://github.com/huggingface/transformers/blob/main/src/transformers/trainer_pt_utils.py#L486)
+    Function that returns a list of indices in which each slice of `batch_size` consecutive indices correspond to elements of similar
+    lengths. To do this, the indices are:
+    - randomly permuted (if a JAX rng is specified)
+    - grouped in mega-batches of size `mega_batch_mult * batch_size`
+    - sorted by length in each mega-batch
+    The result is the concatenation of all mega-batches, with the batch of `batch_size` containing the element of
+    maximum length placed first, so that an OOM happens sooner rather than later.
+    """
+    lengths = dataset["input_length"]
+    # Default for mega_batch_mult: 50 or the number to get 4 megabatches, whichever is smaller.
+    if mega_batch_mult is None:
+        mega_batch_mult = min(len(lengths) // (batch_size * 4), 50)
+        # Just in case, for tiny datasets
+        if mega_batch_mult == 0:
+            mega_batch_mult = 1
+    # We need to use JAX for the random permutation as the PRNG key will be set based on the seed outside of the sampler.
+    num_samples = len(lengths)
+    indices = jax.random.permutation(rng, np.arange(num_samples)) if rng is not None else np.arange(num_samples)
+    megabatch_size = mega_batch_mult * batch_size
+    megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)]
+    megabatches = [list(sorted(megabatch, key=lambda i: lengths[i], reverse=True)) for megabatch in megabatches]
+    # The rest is to get the biggest batch first.
+    # Since each megabatch is sorted by descending length, the longest element is the first
+    megabatch_maximums = [lengths[megabatch[0]] for megabatch in megabatches]
+    max_idx = np.argmax(megabatch_maximums).item()
+    # Switch to put the longest batch in first position
+    # (note that this is different to the PT grouped sampler in which we only put the longest element in the first position, and not its batch)
+    megabatches[0], megabatches[max_idx] = megabatches[max_idx], megabatches[0]
+    megabatches = np.array([i for megabatch in megabatches for i in megabatch])
+    return megabatches
+def generate_batch_splits(samples_idx: np.ndarray, batch_size: int, drop_last=True) -> np.ndarray:
+    """Generate batches of data for a specified batch size from sample indices. If the dataset size is not divisible by
+    the batch size and `drop_last` is `True`, the last incomplete batch is dropped. Else, it is returned."""
+    num_samples = len(samples_idx)
+    if drop_last:
+        samples_to_remove = num_samples % batch_size
+        if samples_to_remove != 0:
+            samples_idx = samples_idx[:-samples_to_remove]
+        sections_split = num_samples // batch_size
+        samples_idx = samples_idx.reshape((sections_split, batch_size))
+    else:
+        sections_split = math.ceil(num_samples / batch_size)
+        samples_idx = np.array_split(samples_idx, sections_split)
+    return samples_idx
+def write_train_metric(summary_writer, train_metrics, train_time, step):
+    summary_writer.scalar("train_time", train_time, step)
+    train_metrics = get_metrics(train_metrics)
+    for key, vals in train_metrics.items():
+        tag = f"train_{key}"
+        for i, val in enumerate(vals):
+            summary_writer.scalar(tag, val, step - len(vals) + i + 1)
+def write_eval_metric(summary_writer, eval_metrics, step, pred_str=None):
+    for metric_name, value in eval_metrics.items():
+        summary_writer.scalar(f"eval_{metric_name}", value, step)
+    if pred_str is not None:
+        # write output actual predictions for debugging
+        summary_writer.text("eval_predictions", "\n".join(pred_str), step)
+def write_wandb_log(metrics, step, prefix=None):
+    if jax.process_index() == 0:
+        log_metrics = {}
+        for k, v in metrics.items():
+            if "layer" in k:
+                log_metrics[f"{k}/"] = v
+            elif prefix is not None:
+                log_metrics[f"{prefix}/{k}"] = v
+            else:
+                log_metrics[k] = v
+        wandb.log(log_metrics, step)
+def write_wandb_pred(pred_str, label_str, step, final_step=False, prefix="eval"):
+    if jax.process_index() == 0:
+        # convert str data to a wandb compatible format
+        str_data = [[label_str[i], pred_str[i]] for i in range(len(pred_str))]
+        if not final_step:
+            # we'll log the first 50 predictions for each intermediate epoch
+            wandb.log(
+                {
+                    f"{prefix}/step_{int(step / 1000)}k": wandb.Table(
+                        columns=["label_str", "pred_str"], data=str_data[:50]
+                    )
+                },
+                step,
+            )
+        else:
+            # we'll log all predictions for the last epoch
+            wandb.log(
+                {
+                    f"{prefix}/step_{int(step / 1000)}k_all": wandb.Table(
+                        columns=["label_str", "pred_str"], data=str_data
+                    )
+                },
+                step,
+            )
+def create_learning_rate_fn(
+    num_train_steps: int, num_warmup_steps: int, learning_rate: float
+) -> Callable[[int], jnp.array]:
+    """Returns a linear warmup, linear_decay learning rate function."""
+    warmup_fn = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
+    decay_fn = optax.linear_schedule(
+        init_value=learning_rate, end_value=0, transition_steps=num_train_steps - num_warmup_steps
+    )
+    schedule_fn = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps])
+    return schedule_fn
+def ctc_loss(
+    logits,
+    logits_attention_mask,
+    labels,
+    blank_id,
+    loss_reduction="mean",
+    output_emission_dict=False,
+    log_epsilon=-100000.0,
+):
+    """Computes CTC loss.
+    This function performs forward computation over an FSA with `N * 2` states
+    where `N` is the max number of labels. The states are split into two groups:
+    Phi states and emission states. a phi-state accepts repetition of
+    phi (blank)-symbols and transits to emission state when the correct label is
+    observed. An emission state accepts repetition of the label and transits to
+    the next phi states at any time (so called epsilon-transition).
+    Below, `B` denotes the batch size, `T` denotes the time steps in `logits`,
+    and `N` denotes the time steps in `labels`.
+    Args:
+      logits: (B, T, K)-array containing log-probabilities of each class.
+      logitpaddings: (B, T)-array. Padding indicators for `logits`.
+      labels: (B, N)-array containing reference integer labels.
+      labelpaddings: (B, N)-array. Padding indicators for `labels`. Currently,
+        `labels` must be right-padded, i.e. each row of `labelpaddings` must be
+        repetition of zeroes, followed by repetition of ones.
+      blank_id: Id for blank token.
+      loss_reduction: one of "mean", "sum", "default"
+        - "none": no reduction is applied.
+        - "mean": output loss will be divided by target lengths and then the
+          mean over the batch is taken.
+        - "sum": output loss are summed over batch
+      output_emission_dict: whether to output additional information about the emission probs
+    Returns:
+      A pair of `(per_seq_loss, aux)`.
+      per_seq_loss:
+        (B,)-array containing loss values for each sequence in the batch.
+      aux: Dictionary containing interim variables used for computing losses.
+        aux['logalpha_phi']: (T, B, N+1)-array. Log-forward-probabilities of each
+          phi-state corresponding to the n-th label.
+        aux['logalpha_emit']: (T, B, N)-array. Log-forward-probabilities of each
+          emission-state corresponding to the n-th label.
+        aux['logprobs_phi']: (T, B, 1)-array. Probability of the phi-symbol
+          corresponding to each time frame.
+        aux['logprobs_emit']: (T, B, N)-array. Probability of the n-th label
+          corresponding to each time frame.
+    """
+    # label paddings are indicated by -100
+    labelpaddings = labels < 0
+    # logit paddings are the inverse of attention_mask
+    logitpaddings = ~logits_attention_mask
+    # Copied from https://github.com/tensorflow/lingvo/blob/master/lingvo/jax/layers/ctc_objectives.py
+    batchsize, unused_maxinputlen, num_classes = logits.shape
+    batchsize_, maxlabellen = labels.shape
+    logprobs = jax.nn.log_softmax(logits)
+    labellens = maxlabellen - jnp.sum(labelpaddings, axis=1).astype(jnp.int32)
+    # repeat[b, n] == 1.0 when label[b, n] == label[b, n+1].
+    repeat = (labels[:, :-1] == labels[:, 1:]).astype(jnp.float32)
+    repeat = jnp.pad(repeat, ((0, 0), (0, 1)))
+    logprobs_phi = logprobs[:, :, blank_id : blank_id + 1]  # [B, T, 1]
+    logprobs_phi = jnp.transpose(logprobs_phi, (1, 0, 2))  # [T, B, 1]
+    one_hot = jax.nn.one_hot(labels, num_classes=num_classes)  # [B, N, K]
+    logprobs_emit = jnp.einsum("btk,bnk->btn", logprobs, one_hot)
+    logprobs_emit = jnp.transpose(logprobs_emit, (1, 0, 2))  # [T, B, N]
+    logalpha_phi_init = jnp.ones((batchsize, maxlabellen + 1)) * log_epsilon  # [B, N]
+    logalpha_phi_init = logalpha_phi_init.at[:, 0].set(0.0)
+    logalpha_emit_init = jnp.ones((batchsize, maxlabellen)) * log_epsilon  # [B, N]
+    def loop_body(prev, x):
+        prev_phi, prev_emit = prev
+        # emit-to-phi epsilon transition, except if the next label is repetition
+        prev_phi_orig = prev_phi
+        prev_phi = prev_phi.at[:, 1:].set(jnp.logaddexp(prev_phi[:, 1:], prev_emit + log_epsilon * repeat))
+        logprob_emit, logprob_phi, pad = x
+        # phi-to-emit transition
+        next_emit = jnp.logaddexp(prev_phi[:, :-1] + logprob_emit, prev_emit + logprob_emit)
+        # self-loop transition
+        next_phi = prev_phi + logprob_phi
+        # emit-to-phi blank transition only when the next label is repetition
+        next_phi = next_phi.at[:, 1:].set(
+            jnp.logaddexp(next_phi[:, 1:], prev_emit + logprob_phi + log_epsilon * (1.0 - repeat))
+        )
+        pad = pad.reshape((batchsize, 1))
+        next_emit = pad * prev_emit + (1.0 - pad) * next_emit
+        next_phi = pad * prev_phi_orig + (1.0 - pad) * next_phi
+        return (next_phi, next_emit), (next_phi, next_emit)
+    xs = (logprobs_emit, logprobs_phi, logitpaddings.transpose((1, 0)))
+    _, (logalpha_phi, logalpha_emit) = jax.lax.scan(loop_body, (logalpha_phi_init, logalpha_emit_init), xs)
+    # last row needs to be updated with the last epsilon transition
+    logalpha_phi_last = logalpha_phi[-1].at[:, 1:].set(jnp.logaddexp(logalpha_phi[-1, :, 1:], logalpha_emit[-1]))
+    logalpha_phi = logalpha_phi.at[-1].set(logalpha_phi_last)
+    # extract per_seq_loss
+    one_hot = jax.nn.one_hot(labellens, num_classes=maxlabellen + 1)  # [B, N+1]
+    per_seq_loss = -jnp.einsum("bn,bn->b", logalpha_phi_last, one_hot)
+    if loss_reduction == "mean":
+        target_lengths = labelpaddings.shape[-1] - labelpaddings.sum(axis=-1)
+        loss = (per_seq_loss / target_lengths).mean()
+    elif loss_reduction == "sum":
+        loss = per_seq_loss.sum()
+    else:
+        loss = per_seq_loss
+    if not output_emission_dict:
+        return loss
+    return loss, {
+        "logalpha_phi": logalpha_phi,
+        "logalpha_emit": logalpha_emit,
+        "logprobs_phi": logprobs_phi,
+        "logprobs_emit": logprobs_emit,
+    }
+def main():
+    # 1. Parse input arguments
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, FlaxTrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    # 2. Setup logging
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    # Set the verbosity to info of the Transformers logger.
+    # We only want one process per machine to log things on the screen.
+    logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
+    if jax.process_index() == 0:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+    # Set up wandb run
+    if jax.process_index() == 0:
+        wandb.init(project=data_args.wandb_project, name=data_args.wandb_name, job_type=data_args.wandb_job_type)
+    logger.info("Training/evaluation parameters %s", training_args)
+    # Set the default TPU matmul precision and display the number of devices
+    jax.config.update("jax_default_matmul_precision", training_args.matmul_precision)
+    logger.info(f"JAX devices: {jax.device_count()}, matmul precision: {training_args.matmul_precision}")
+    # 4. Load dataset
+    raw_datasets = DatasetDict()
+    if training_args.do_train:
+        raw_datasets["train"] = load_dataset(
+            data_args.dataset_name,
+            data_args.dataset_config_name,
+            split=data_args.train_split_name,
+            cache_dir=data_args.dataset_cache_dir,
+            use_auth_token=True if model_args.use_auth_token else None,
+        )
+    if training_args.do_eval:
+        raw_datasets["eval"] = load_dataset(
+            data_args.dataset_name,
+            data_args.dataset_config_name,
+            split=data_args.eval_split_name,
+            cache_dir=data_args.dataset_cache_dir,
+            use_auth_token=True if model_args.use_auth_token else None,
+        )
+    if training_args.do_predict:
+        test_split = data_args.test_split_name.split("+")
+        for split in test_split:
+            raw_datasets[split] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=split,
+                cache_dir=data_args.dataset_cache_dir,
+                use_auth_token=True if model_args.use_auth_token else None,
+            )
+    if not training_args.do_train and not training_args.do_eval and not training_args.do_predict:
+        raise ValueError(
+            "Cannot not train, not do evaluation and not do prediction. At least one of "
+            "training, evaluation or prediction has to be done."
+        )
+    # if not training, there is no need to run multiple epochs
+    if not training_args.do_train:
+        training_args.num_train_epochs = 1
+    if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names:
+        raise ValueError(
+            f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
+            "Make sure to set `--audio_column_name` to the correct audio column - one of "
+            f"{', '.join(next(iter(raw_datasets.values())).column_names)}."
+        )
+    if data_args.text_column_name not in next(iter(raw_datasets.values())).column_names:
+        raise ValueError(
+            f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
+            "Make sure to set `--text_column_name` to the correct text column - one of "
+            f"{', '.join(next(iter(raw_datasets.values())).column_names)}."
+        )
+    # 5. Load pretrained model, tokenizer, and feature extractor
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    config = Wav2Vec2Config.from_pretrained(
+        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+    )
+    feature_extractor = AutoFeatureExtractor.from_pretrained(
+        model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+    )
+    # update config according to training args, model args, and tokenizer attributes
+    config.update(
+        {
+            "gradient_checkpointing": training_args.gradient_checkpointing,
+            "activation_dropout": model_args.activation_dropout,
+            "hidden_dropout": model_args.hidden_dropout,
+            "feat_proj_dropout": model_args.feat_proj_dropout,
+            "mask_time_prob": model_args.mask_time_prob,
+            "vocab_size": tokenizer.vocab_size,
+        }
+    )
+    if training_args.precision == "full_mixed":
+        dtype = jnp.bfloat16
+        training_args.mixed_precision = True
+    elif training_args.precision == "half_mixed":
+        dtype = jnp.bfloat16
+        training_args.mixed_precision = False
+    else:
+        dtype = jnp.float32
+        training_args.mixed_precision = False
+    model = FlaxWav2Vec2ForCTC.from_pretrained(
+        model_args.model_name_or_path,
+        config=config,
+        dtype=dtype,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+    )
+    # 6. Resample speech dataset ALWAYS
+    raw_datasets = raw_datasets.cast_column(
+        data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
+    )
+    # 7. Preprocessing the datasets.
+    # We need to read the audio files as arrays and tokenize the targets.
+    max_input_length = int(data_args.max_duration_in_seconds * feature_extractor.sampling_rate)
+    min_input_length = int(data_args.min_duration_in_seconds * feature_extractor.sampling_rate)
+    max_eval_input_length = int(data_args.max_eval_duration_in_seconds * feature_extractor.sampling_rate) if data_args.max_eval_duration_in_seconds else None
+    max_target_length = data_args.max_label_length
+    min_target_length = data_args.min_label_length
+    pad_input_to_multiple_of = data_args.pad_input_to_multiple_of
+    audio_column_name = data_args.audio_column_name
+    num_workers = data_args.preprocessing_num_workers
+    text_column_name = data_args.text_column_name
+    model_input_name = feature_extractor.model_input_names[0]
+    if training_args.do_train and data_args.max_train_samples is not None:
+        raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
+    if training_args.do_eval and data_args.max_eval_samples is not None:
+        raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
+    if training_args.do_predict and data_args.max_test_samples is not None:
+        for split in test_split:
+            raw_datasets[split] = raw_datasets[split].select(range(data_args.max_eval_samples))
+    def prepare_dataset(batch):
+        # Pre-process audio
+        sample = batch[audio_column_name]
+        # normalise audio (mean, std) to (0, 1)
+        inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
+        # process audio length
+        batch[model_input_name] = inputs.input_values[0]
+        batch["input_length"] = len(batch["input_values"])
+        input_str = batch[text_column_name]
+        batch["labels"] = tokenizer(input_str).input_ids
+        batch["labels_length"] = len(batch["labels"])
+        return batch
+    vectorized_datasets = raw_datasets.map(
+        prepare_dataset,
+        remove_columns=next(iter(raw_datasets.values())).column_names,
+        num_proc=num_workers,
+        desc="preprocess dataset",
+    )
+    # filter training data with inputs longer than max_input_length
+    def is_audio_in_length_range(length):
+        return min_input_length < length < max_input_length
+    if training_args.do_train:
+        vectorized_datasets["train"] = vectorized_datasets["train"].filter(
+            is_audio_in_length_range,
+            num_proc=num_workers,
+            input_columns=["input_length"],
+        )
+    # filter data with targets shorter than min_target_length or longer than max_target_length
+    def is_labels_in_length_range(length):
+        return min_target_length < length < max_target_length
+    if training_args.do_train:
+        vectorized_datasets["train"] = vectorized_datasets["train"].filter(
+            is_labels_in_length_range,
+            num_proc=num_workers,
+            input_columns=["labels_length"],
+        )
+    if max_eval_input_length is not None:
+        # filter training data with inputs longer than max_input_length
+        def is_eval_audio_in_length_range(length):
+            return min_input_length < length < max_eval_input_length
+        if training_args.do_eval:
+            vectorized_datasets["eval"] = vectorized_datasets["eval"].filter(
+                is_eval_audio_in_length_range,
+                num_proc=num_workers,
+                input_columns=["input_length"],
+            )
+        if training_args.do_predict:
+            for split in test_split:
+                vectorized_datasets[split] = vectorized_datasets[split].filter(
+                    is_eval_audio_in_length_range,
+                    num_proc=num_workers,
+                    input_columns=["input_length"],
+                )
+    # for large datasets it is advised to run the preprocessing on a
+    # single machine first with `args.preprocessing_only` since there will mostly likely
+    # be a timeout when running the script in distributed mode.
+    # In a second step `args.preprocessing_only` can then be set to `False` to load the
+    # cached dataset
+    if data_args.preprocessing_only:
+        cache = {k: v.cache_files for k, v in vectorized_datasets.items()}
+        logger.info(f"Data preprocessing finished. Files cached at {cache}.")
+        return
+    # 8. Load Metrics
+    wer_metric = load_metric("wer")
+    cer_metric = load_metric("cer")
+    def compute_metrics(pred_ids: List[List[int]], label_ids: List[List[int]]):
+        padded_ids = np.where(np.asarray(label_ids) == -100, tokenizer.pad_token_id, np.asarray(label_ids))
+        pred_str = tokenizer.batch_decode(pred_ids)
+        # we do not want to group tokens when computing the metrics
+        label_str = tokenizer.batch_decode(padded_ids, group_tokens=False)
+        wer = wer_metric.compute(predictions=pred_str, references=label_str)
+        cer = cer_metric.compute(predictions=pred_str, references=label_str)
+        return {"wer": wer, "cer": cer}, pred_str, label_str
+    # 9. save feature extractor, tokenizer and config
+    feature_extractor.save_pretrained(training_args.output_dir)
+    tokenizer.save_pretrained(training_args.output_dir)
+    config.save_pretrained(training_args.output_dir)
+    processor = AutoProcessor.from_pretrained(training_args.output_dir)
+    data_collator = FlaxDataCollatorSpeechSeq2SeqWithPadding(
+        processor=processor,
+        input_padding="longest",
+        pad_input_to_multiple_of=pad_input_to_multiple_of,
+        max_label_length=data_args.max_label_length,
+    )
+    # Enable tensorboard only on the master node
+    has_tensorboard = is_tensorboard_available()
+    if has_tensorboard and jax.process_index() == 0:
+        try:
+            from flax.metrics.tensorboard import SummaryWriter
+            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
+        except ImportError as ie:
+            has_tensorboard = False
+            logger.warning(
+                f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
+            )
+    else:
+        logger.warning(
+            "Unable to display metrics through TensorBoard because the package is not installed: "
+            "Please run `pip install tensorboard` to enable."
+        )
+    # 10. Handle the repository creation
+    if training_args.push_to_hub:
+        with open(os.path.join(training_args.output_dir, ".gitattributes"), "r+") as f:
+            git_lfs_extensions = f.read()
+            if "*.wandb" not in git_lfs_extensions:
+                f.write("*.wandb filter=lfs diff=lfs merge=lfs -text")
+        if training_args.hub_model_id is None:
+            repo_name = get_full_repo_name(
+                Path(training_args.output_dir).absolute().name, token=training_args.hub_token
+            )
+        else:
+            repo_name = training_args.hub_model_id
+        repo = Repository(training_args.output_dir, clone_from=repo_name)
+    # 11. Initialize our training
+    rng = jax.random.PRNGKey(training_args.seed)
+    rng, dropout_rng = jax.random.split(rng)
+    # Store some constants
+    max_steps = int(training_args.max_steps)
+    gradient_accumulation_steps = int(training_args.gradient_accumulation_steps)
+    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
+    batch_size_per_update = train_batch_size * gradient_accumulation_steps
+    per_device_eval_batch_size = int(training_args.per_device_eval_batch_size)
+    eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
+    to_dtype = to_bf16 if training_args.mixed_precision else to_fp32
+    if training_args.do_train:
+        num_train_samples = len(vectorized_datasets["train"])
+        steps_per_epoch = num_train_samples // batch_size_per_update
+        if max_steps > 0:
+            num_epochs = -(training_args.max_steps // -steps_per_epoch)
+            total_train_steps = max_steps
+        else:
+            num_epochs = int(training_args.num_train_epochs)
+            total_train_steps = steps_per_epoch * num_epochs
+        # Create learning rate schedule
+        # Create learning rate schedule
+        linear_decay_lr_schedule_fn = create_learning_rate_fn(
+            total_train_steps,
+            training_args.warmup_steps,
+            training_args.learning_rate,
+        )
+        # We use Optax's "masking" functionality to not apply weight decay
+        # to bias and LayerNorm scale parameters. decay_mask_fn returns a
+        # mask boolean with the same structure as the parameters.
+        # The mask is True for parameters that should be decayed.
+        # Note that this mask is specifically adapted for FlaxWav2Vec2 and FlaxBart.
+        # For FlaxT5, one should correct the layer norm parameter naming
+        # accordingly - see `run_t5_mlm_flax.py` e.g.
+        def decay_mask_fn(params):
+            flat_params = traverse_util.flatten_dict(params)
+            layer_norm_params = [
+                (name, "scale")
+                for name in ["layer_norm", "self_attn_layer_norm", "layernorm_embedding", "final_layer_norm"]
+            ]
+            flat_mask = {path: (path[-1] != "bias" and path[-2:] not in layer_norm_params) for path in flat_params}
+            return traverse_util.unflatten_dict(flat_mask)
+        if training_args.adafactor:
+            # Create Adafactor optimizer
+            optim = optax.adafactor(
+                learning_rate=linear_decay_lr_schedule_fn,
+                dtype_momentum=jnp.bfloat16 if training_args.mixed_precision else jnp.float32,
+                weight_decay_rate=training_args.weight_decay,
+                weight_decay_mask=decay_mask_fn,
+            )
+        else:
+            # Create AdamW optimizer
+            optim = optax.adamw(
+                learning_rate=linear_decay_lr_schedule_fn,
+                b1=training_args.adam_beta1,
+                b2=training_args.adam_beta2,
+                eps=training_args.adam_epsilon,
+                weight_decay=training_args.weight_decay,
+                mask=decay_mask_fn,
+            )
+        # Optax MultiSteps for gradient accumulation. We'll only call this optimizer transformation if gradient accumulation is required (i.e. gradient accumulation steps > 1)
+        if training_args.multisteps and gradient_accumulation_steps > 1:
+            optim = optax.MultiSteps(optim, gradient_accumulation_steps, use_grad_mean=False)
+    else:
+        num_epochs = 0
+        total_train_steps = 0
+        num_train_samples = 0
+        optim = None
+    # Setup train state
+    state = MixedPrecisionTrainState.create(
+        apply_fn=model.__call__,
+        get_attention_mask_fn=model._get_feature_vector_attention_mask,
+        params=model.params,
+        tx=optim,
+        to_dtype=to_dtype,
+        dropout_rng=dropout_rng,
+        max_grad_norm=training_args.max_grad_norm,
+    )
+    # Replicate the train state on each device
+    state = state.replicate()
+    blank_id = model.config.pad_token_id
+    # Define gradient update step fn
+    def train_step(state, batch):
+        # only one single rng per grad step, with or without accumulation, as the graph should be identical over one effective training batch
+        dropout_rng, new_dropout_rng = jax.random.split(state.dropout_rng)
+        def compute_loss(params, minibatch):
+            labels = minibatch.pop("labels")
+            logits = state.apply_fn(
+                **minibatch,
+                params=params,
+                dropout_rng=dropout_rng,
+                freeze_feature_encoder=model_args.freeze_feature_encoder,
+                train=True,
+            )[0]
+            logits_mask = state.get_attention_mask_fn(logits.shape[1], batch["attention_mask"])
+            loss = ctc_loss(logits, logits_mask, labels, blank_id, loss_reduction="mean")
+            return loss
+        grad_fn = jax.value_and_grad(compute_loss)
+        if gradient_accumulation_steps == 1 or training_args.multisteps:
+            loss, grad = grad_fn(to_dtype(state.params), batch)
+        # Custom gradient accumulation
+        else:
+            # add a first dimension over gradient_accumulation_steps for minibatch slices
+            batch = jax.tree_map(
+                lambda x: x.reshape(
+                    gradient_accumulation_steps, training_args.per_device_train_batch_size, *x.shape[1::]
+                ),
+                batch,
+            )
+            def accum_minibatch_step(accum_grad, minibatch):
+                # compute loss, num labels and grad over minibatch and accumulate
+                loss, grad = grad_fn(to_dtype(state.params), minibatch)
+                return jax.tree_map(jnp.add, accum_grad, grad), loss
+            # create an initial state for accumulating losses, num labels and gradients
+            init_grad = jax.tree_map(jnp.zeros_like, to_dtype(state.params))
+            # loop accum minibatch step over the number of gradient accumulation steps
+            grad, loss = jax.lax.scan(accum_minibatch_step, init_grad, batch)
+        # update state
+        new_state = state.apply_gradients(
+            grads=grad,
+            dropout_rng=new_dropout_rng,
+            to_dtype=to_dtype,
+        )
+        # compute gradient norms over all layers and globally for detailed monitoring
+        layer_grad_norm = jax.tree_map(jnp.linalg.norm, grad)
+        logs = {
+            "layer_grad_norm": layer_grad_norm,
+            "grad_norm": jnp.linalg.norm(jax.tree_util.tree_leaves(layer_grad_norm)),
+        }
+        # compute parameter norms over all layers and globally for detailed monitoring
+        layer_param_norm = jax.tree_map(jnp.linalg.norm, new_state.params)
+        logs["layer_param_norm"] = layer_param_norm
+        logs["param_norm"] = jnp.linalg.norm(jax.tree_util.tree_leaves(layer_param_norm))
+        metrics = {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}
+        metrics.update(logs)
+        metrics = jax.lax.pmean(metrics, axis_name="batch")
+        # metrics = to_fp32(metrics)
+        return new_state, metrics
+    # Define eval fn
+    def eval_step(params, batch):
+        labels = batch.pop("labels")
+        logits = model(**batch, params=params, train=False)[0]
+        logits_mask = model._get_feature_vector_attention_mask(logits.shape[1], batch["attention_mask"])
+        loss = ctc_loss(logits, logits_mask, labels, blank_id, loss_reduction="mean")
+        pred_ids = jnp.argmax(logits, axis=-1)
+        # summarize metrics
+        metrics = {"loss": loss}
+        metrics = jax.lax.pmean(metrics, axis_name="batch")
+        # metrics = to_fp32(metrics)
+        return metrics, pred_ids
+    # Create parallel version of the train and eval step
+    if training_args.do_train:
+        p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
+    if training_args.do_eval or training_args.do_predict:
+        p_eval_step = jax.pmap(eval_step, "batch")
+    def run_evaluation(step, final_step=False):
+        if training_args.do_eval:
+            # ======================== Evaluating ==============================
+            eval_metrics = []
+            eval_preds = []
+            eval_labels = []
+            # Generate eval set by sequentially sampling indices from the eval dataset and grouping by length
+            eval_samples_idx = get_grouped_indices(vectorized_datasets["eval"], eval_batch_size)
+            eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size, drop_last=False)
+            for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+                samples = [vectorized_datasets["eval"][int(idx)] for idx in batch_idx]
+                batch = data_collator(samples)
+                labels = batch["labels"]
+                try:
+                    metrics, pred_ids = pad_shard_unpad(p_eval_step)(state.params, batch.data, min_device_batch=per_device_eval_batch_size)
+                except TypeError:
+                    continue
+                eval_preds.extend(jax.device_get(pred_ids.reshape(-1, pred_ids.shape[-1])))
+                eval_metrics.append(metrics)
+                eval_labels.extend(labels)
+            # normalize eval metrics
+            eval_metrics = get_metrics(eval_metrics)
+            eval_metrics = jax.tree_map(jnp.mean, eval_metrics)
+            eval_metrics = to_fp32(eval_metrics)
+            # always run compute metrics
+            error_rate_metric, pred_str, label_str = compute_metrics(eval_preds, eval_labels)
+            eval_metrics.update(error_rate_metric)
+            error_rate_desc = " ".join([f"Eval {key}: {value} |" for key, value in error_rate_metric.items()])
+            # Print metrics and update progress bar
+            desc = f"Step... ({step}/{total_train_steps} | Eval Loss: {eval_metrics['loss']} | {error_rate_desc})"
+            epochs.write(desc)
+            epochs.desc = desc
+            # Save metrics
+            write_wandb_log(eval_metrics, step, prefix="eval")
+            write_wandb_pred(pred_str, label_str, step, final_step=final_step)
+            # if has_tensorboard and jax.process_index() == 0:
+            # write_eval_metric(summary_writer, eval_metrics, step, pred_str=pred_str)
+    def save_checkpoint(step):
+        # save and push checkpoint to the hub
+        if jax.process_index() == 0:
+            params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+            model.save_pretrained(training_args.output_dir, params=params)
+            tokenizer.save_pretrained(training_args.output_dir)
+            if training_args.push_to_hub:
+                repo.push_to_hub(commit_message=f"{wandb.run.id}: saving weights and logs of step {int(step / 1000)}k", blocking=False)
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {num_train_samples}")
+    logger.info(f"  Num Epochs = {num_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
+    logger.info(f"  Num gradient accumulation steps = {gradient_accumulation_steps}")
+    logger.info(f"  Total train batch size (w. parallel & distributed) = {batch_size_per_update}")
+    logger.info(f"  Total optimization steps = {total_train_steps}")
+    logger.info(f"  Gradient checkpointing: {config.gradient_checkpointing}")
+    logger.info(f"  Use scan: {config.use_scan}")
+    logger.info(f"  Fuse matmuls: {config.fuse_matmuls}")
+    train_time = cur_step = 0
+    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+    for epoch in epochs:
+        if training_args.do_train:
+            # ======================== Training ================================
+            train_start = time.time()
+            # Create sampling rng
+            rng, input_rng = jax.random.split(rng)
+            # Generate an epoch by randomly shuffling sampling indices from the train dataset and grouping by length
+            train_samples_idx = get_grouped_indices(vectorized_datasets["train"], batch_size_per_update, input_rng)
+            train_batch_idx = generate_batch_splits(train_samples_idx, batch_size_per_update)
+            # Gather the indices for creating the batch and do a training step
+            for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1), 1):
+                samples = [vectorized_datasets["train"][int(idx)] for idx in batch_idx]
+                batch = data_collator(samples)
+                batch = shard(batch.data)
+                try:
+                    state, train_metric = p_train_step(state, batch)
+                except TypeError as e:
+                    logger.warning("Encountered following error: \n", e)
+                cur_step = epoch * (num_train_samples // batch_size_per_update) + step
+                if cur_step % training_args.logging_steps == 0:
+                    # Save metrics
+                    train_metric = unreplicate(train_metric)
+                    train_time += time.time() - train_start
+                    # need to upcast all device arrays to fp32 for wandb logging (jnp.bfloat16 not supported) -> do this here OR in train_step
+                    write_wandb_log(to_fp32(train_metric), cur_step, prefix="train")
+                    # we won't log to tensorboard for now (it is fiddly logging param and grad norms on a layer-by-layer basis)
+                    # if has_tensorboard and jax.process_index() == 0:
+                    # write_train_metric(summary_writer, train_metrics, train_time, cur_step)
+                    epochs.write(
+                        f"Step... ({cur_step} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']}, Gradient Norm: {train_metric['grad_norm']})"
+                    )
+                if cur_step % total_train_steps == 0:
+                    break
+                if training_args.eval_steps and cur_step % training_args.eval_steps == 0:
+                    run_evaluation(cur_step, final_step=False)
+                if cur_step % training_args.save_steps == 0:
+                    save_checkpoint(cur_step)
+            if training_args.eval_steps == 0 and (epoch + 1) != num_epochs:
+                # run evaluation at the end of the epoch if eval steps are not specified
+                run_evaluation(cur_step, final_step=False)
+                save_checkpoint(cur_step)
+    if training_args.do_train:
+        save_checkpoint(cur_step)
+    cur_step = max_steps if max_steps > 0 else cur_step  # set step to max steps so that eval happens in alignment with training
+    if training_args.do_eval:
+        run_evaluation(cur_step, final_step=True)
+    # TODO: collapse 'do_predict' into the run_evaluation function
+    if training_args.do_predict:
+        for split in test_split:
+            # ======================== Evaluating ==============================
+            eval_metrics = []
+            eval_preds = []
+            eval_labels = []
+            # Generate eval set by sequentially sampling indices from the test dataset and grouping by length
+            eval_samples_idx = get_grouped_indices(vectorized_datasets[split], eval_batch_size)
+            eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size, drop_last=False)
+            for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc=f"Predicting {split}...", position=2)):
+                samples = [vectorized_datasets[split][int(idx)] for idx in batch_idx]
+                batch = data_collator(samples)
+                labels = batch["labels"]
+                metrics, pred_ids = pad_shard_unpad(p_eval_step)(state.params, batch.data, min_device_batch=per_device_eval_batch_size)
+                eval_preds.extend(jax.device_get(pred_ids.reshape(-1, pred_ids.shape[-1])))
+                eval_metrics.append(metrics)
+                eval_labels.extend(labels)
+            # normalize eval metrics
+            eval_metrics = get_metrics(eval_metrics)
+            eval_metrics = jax.tree_map(jnp.mean, eval_metrics)
+            eval_metrics = to_fp32(eval_metrics)
+            # always run compute metrics
+            error_rate_metric, pred_str, label_str = compute_metrics(eval_preds, eval_labels)
+            eval_metrics.update(error_rate_metric)
+            error_rate_desc = " ".join([f"Eval {key}: {value} |" for key, value in error_rate_metric.items()])
+            # Print metrics and update progress bar
+            desc = f"Step... ({cur_step}/{total_train_steps} | Eval Loss: {eval_metrics['loss']} | {error_rate_desc})"
+            epochs.write(desc)
+            epochs.desc = desc
+            # Save metrics
+            write_wandb_log(eval_metrics, cur_step, prefix=split)
+            write_wandb_pred(pred_str, label_str, cur_step, final_step=True, prefix=split)
+            # if has_tensorboard and jax.process_index() == 0:
+            # write_eval_metric(summary_writer, eval_metrics, cur_step, pred_str=pred_str)
+if __name__ == "__main__":
+    main()

run_tedlium.sh ADDED Viewed

	@@ -0,0 +1,27 @@

+#!/usr/bin/env bash
+python run_flax_speech_recognition_ctc.py \
+        --model_name_or_path="esc/wav2vec2-pretrained" \
+        --tokenizer_name="wav2vec2-ctc-tedlium-tokenizer" \
+        --dataset_name="esc/esc-datasets" \
+        --dataset_config_name="tedlium" \
+        --output_dir="./" \
+        --wandb_project="wav2vec2-ctc" \
+        --wandb_name="wav2vec2-ctc-tedlium" \
+        --max_steps="50000" \
+        --save_steps="10000" \
+        --eval_steps="10000" \
+        --learning_rate="3e-4" \
+        --logging_steps="25" \
+        --warmup_steps="5000" \
+        --preprocessing_num_workers="1" \
+        --hidden_dropout="0.2" \
+        --activation_dropout="0.2" \
+        --feat_proj_dropout="0.2" \
+        --do_train \
+        --do_eval \
+        --do_predict \
+        --overwrite_output_dir \
+        --gradient_checkpointing \
+        --freeze_feature_encoder \
+        --push_to_hub \
+        --use_auth_token

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "bos_token": "<s>",
+  "do_lower_case": false,
+  "eos_token": "</s>",
+  "name_or_path": "sanchit-gandhi/wav2vec2-ctc-tedlium-black-box-tokenizer",
+  "pad_token": "<pad>",
+  "replace_word_delimiter_char": " ",
+  "special_tokens_map_file": null,
+  "tokenizer_class": "Wav2Vec2CTCTokenizer",
+  "unk_token": "<unk>",
+  "word_delimiter_token": "|"
+}

vocab.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "'": 14,
+  "</s>": 2,
+  "<pad>": 0,
+  "<s>": 1,
+  "<unk>": 3,
+  "a": 15,
+  "b": 11,
+  "c": 6,
+  "d": 31,
+  "e": 16,
+  "f": 22,
+  "g": 23,
+  "h": 30,
+  "i": 25,
+  "j": 18,
+  "k": 26,
+  "l": 24,
+  "m": 5,
+  "n": 13,
+  "o": 7,
+  "p": 28,
+  "q": 21,
+  "r": 17,
+  "s": 27,
+  "t": 10,
+  "u": 8,
+  "v": 20,
+  "w": 19,
+  "x": 29,
+  "y": 12,
+  "z": 4,
+  "|": 9
+}