Commit 
							
							·
						
						79a5416
	
1
								Parent(s):
							
							26b6a6c
								
add model
Browse files- README.md +54 -0
- alphabet.json +1 -0
- config.json +108 -0
- eval.py +146 -0
- language_model/attrs.json +1 -0
- language_model/lm.binary +3 -0
- language_model/unigrams.txt +3 -0
- log_mozilla-foundation_common_voice_8_0_nl_test_predictions.txt +0 -0
- log_mozilla-foundation_common_voice_8_0_nl_test_predictions_greedy.txt +0 -0
- log_mozilla-foundation_common_voice_8_0_nl_test_targets.txt +0 -0
- log_mozilla-foundation_common_voice_8_0_nl_test_targets_greedy.txt +0 -0
- mozilla-foundation_common_voice_8_0_nl_test_eval_results.txt +2 -0
- mozilla-foundation_common_voice_8_0_nl_test_eval_results_greedy.txt +2 -0
- preprocessor_config.json +10 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +1 -0
- vocab.json +1 -0
    	
        README.md
    ADDED
    
    | @@ -0,0 +1,54 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            ---
         | 
| 2 | 
            +
            language:
         | 
| 3 | 
            +
            - nl
         | 
| 4 | 
            +
            license: apache-2.0
         | 
| 5 | 
            +
            tags:
         | 
| 6 | 
            +
            - automatic-speech-recognition
         | 
| 7 | 
            +
            - mozilla-foundation/common_voice_8_0
         | 
| 8 | 
            +
            - nl
         | 
| 9 | 
            +
            - robust-speech-event
         | 
| 10 | 
            +
            datasets:
         | 
| 11 | 
            +
            - mozilla-foundation/common_voice_8_0
         | 
| 12 | 
            +
            model-index:
         | 
| 13 | 
            +
            - name: XLS-R-1B - Dutch
         | 
| 14 | 
            +
              results:
         | 
| 15 | 
            +
              - task: 
         | 
| 16 | 
            +
                  name: Automatic Speech Recognition 
         | 
| 17 | 
            +
                  type: automatic-speech-recognition
         | 
| 18 | 
            +
                dataset:
         | 
| 19 | 
            +
                  name: Common Voice 8
         | 
| 20 | 
            +
                  type: mozilla-foundation/common_voice_8_0
         | 
| 21 | 
            +
                  args: nl
         | 
| 22 | 
            +
                metrics:
         | 
| 23 | 
            +
                   - name: Test WER
         | 
| 24 | 
            +
                     type: wer
         | 
| 25 | 
            +
                     value: 10.64
         | 
| 26 | 
            +
                   - name: Test CER
         | 
| 27 | 
            +
                     type: cer
         | 
| 28 | 
            +
                     value: 3.15
         | 
| 29 | 
            +
                   - name: Test WER (+LM)
         | 
| 30 | 
            +
                     type: wer
         | 
| 31 | 
            +
                     value: 8.50
         | 
| 32 | 
            +
                   - name: Test CER (+LM)
         | 
| 33 | 
            +
                     type: cer
         | 
| 34 | 
            +
                     value: 2.75
         | 
| 35 | 
            +
            ---
         | 
| 36 | 
            +
             | 
| 37 | 
            +
            # XLS-R-1B-DUTCH
         | 
| 38 | 
            +
             | 
| 39 | 
            +
            This model is a fine-tuned version of [facebook/wav2vec2-xls-r-1b](https://huggingface.co/facebook/wav2vec2-xls-r-1b) on the MOZILLA-FOUNDATION/COMMON_VOICE_8_0 - NL dataset.
         | 
| 40 | 
            +
             | 
| 41 | 
            +
             | 
| 42 | 
            +
            ## Evaluation Commands
         | 
| 43 | 
            +
             | 
| 44 | 
            +
            1. To evaluate on `mozilla-foundation/common_voice_8_0` with split `test`
         | 
| 45 | 
            +
             | 
| 46 | 
            +
            ```bash
         | 
| 47 | 
            +
            python eval.py --model_id jonatasgrosman/wav2vec2-xls-r-1b-dutch --dataset mozilla-foundation/common_voice_8_0 --config nl --split test
         | 
| 48 | 
            +
            ```
         | 
| 49 | 
            +
             | 
| 50 | 
            +
            2. To evaluate on `speech-recognition-community-v2/dev_data`
         | 
| 51 | 
            +
             | 
| 52 | 
            +
            ```bash
         | 
| 53 | 
            +
            python eval.py --model_id jonatasgrosman/wav2vec2-xls-r-1b-dutch --dataset speech-recognition-community-v2/dev_data --config nl --split validation --chunk_length_s 5.0 --stride_length_s 1.0
         | 
| 54 | 
            +
            ```
         | 
    	
        alphabet.json
    ADDED
    
    | @@ -0,0 +1 @@ | |
|  | 
|  | |
| 1 | 
            +
            {"labels": ["", "<s>", "</s>", "\u2047", " ", "'", "-", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "\u00e0", "\u00e2", "\u00e8", "\u00e9", "\u00ea", "\u00eb", "\u00ee", "\u00ef", "\u00f4", "\u00fb"], "is_bpe": false}
         | 
    	
        config.json
    ADDED
    
    | @@ -0,0 +1,108 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "_name_or_path": "facebook/wav2vec2-xls-r-1b",
         | 
| 3 | 
            +
              "activation_dropout": 0.05,
         | 
| 4 | 
            +
              "adapter_kernel_size": 3,
         | 
| 5 | 
            +
              "adapter_stride": 2,
         | 
| 6 | 
            +
              "add_adapter": false,
         | 
| 7 | 
            +
              "apply_spec_augment": true,
         | 
| 8 | 
            +
              "architectures": [
         | 
| 9 | 
            +
                "Wav2Vec2ForCTC"
         | 
| 10 | 
            +
              ],
         | 
| 11 | 
            +
              "attention_dropout": 0.05,
         | 
| 12 | 
            +
              "bos_token_id": 1,
         | 
| 13 | 
            +
              "classifier_proj_size": 256,
         | 
| 14 | 
            +
              "codevector_dim": 1024,
         | 
| 15 | 
            +
              "contrastive_logits_temperature": 0.1,
         | 
| 16 | 
            +
              "conv_bias": true,
         | 
| 17 | 
            +
              "conv_dim": [
         | 
| 18 | 
            +
                512,
         | 
| 19 | 
            +
                512,
         | 
| 20 | 
            +
                512,
         | 
| 21 | 
            +
                512,
         | 
| 22 | 
            +
                512,
         | 
| 23 | 
            +
                512,
         | 
| 24 | 
            +
                512
         | 
| 25 | 
            +
              ],
         | 
| 26 | 
            +
              "conv_kernel": [
         | 
| 27 | 
            +
                10,
         | 
| 28 | 
            +
                3,
         | 
| 29 | 
            +
                3,
         | 
| 30 | 
            +
                3,
         | 
| 31 | 
            +
                3,
         | 
| 32 | 
            +
                2,
         | 
| 33 | 
            +
                2
         | 
| 34 | 
            +
              ],
         | 
| 35 | 
            +
              "conv_stride": [
         | 
| 36 | 
            +
                5,
         | 
| 37 | 
            +
                2,
         | 
| 38 | 
            +
                2,
         | 
| 39 | 
            +
                2,
         | 
| 40 | 
            +
                2,
         | 
| 41 | 
            +
                2,
         | 
| 42 | 
            +
                2
         | 
| 43 | 
            +
              ],
         | 
| 44 | 
            +
              "ctc_loss_reduction": "mean",
         | 
| 45 | 
            +
              "ctc_zero_infinity": false,
         | 
| 46 | 
            +
              "diversity_loss_weight": 0.1,
         | 
| 47 | 
            +
              "do_stable_layer_norm": true,
         | 
| 48 | 
            +
              "eos_token_id": 2,
         | 
| 49 | 
            +
              "feat_extract_activation": "gelu",
         | 
| 50 | 
            +
              "feat_extract_dropout": 0.0,
         | 
| 51 | 
            +
              "feat_extract_norm": "layer",
         | 
| 52 | 
            +
              "feat_proj_dropout": 0.05,
         | 
| 53 | 
            +
              "feat_quantizer_dropout": 0.0,
         | 
| 54 | 
            +
              "final_dropout": 0.05,
         | 
| 55 | 
            +
              "gradient_checkpointing": false,
         | 
| 56 | 
            +
              "hidden_act": "gelu",
         | 
| 57 | 
            +
              "hidden_dropout": 0.05,
         | 
| 58 | 
            +
              "hidden_size": 1280,
         | 
| 59 | 
            +
              "initializer_range": 0.02,
         | 
| 60 | 
            +
              "intermediate_size": 5120,
         | 
| 61 | 
            +
              "layer_norm_eps": 1e-05,
         | 
| 62 | 
            +
              "layerdrop": 0.05,
         | 
| 63 | 
            +
              "mask_feature_length": 10,
         | 
| 64 | 
            +
              "mask_feature_min_masks": 0,
         | 
| 65 | 
            +
              "mask_feature_prob": 0.0,
         | 
| 66 | 
            +
              "mask_time_length": 10,
         | 
| 67 | 
            +
              "mask_time_min_masks": 2,
         | 
| 68 | 
            +
              "mask_time_prob": 0.05,
         | 
| 69 | 
            +
              "model_type": "wav2vec2",
         | 
| 70 | 
            +
              "num_adapter_layers": 3,
         | 
| 71 | 
            +
              "num_attention_heads": 16,
         | 
| 72 | 
            +
              "num_codevector_groups": 2,
         | 
| 73 | 
            +
              "num_codevectors_per_group": 320,
         | 
| 74 | 
            +
              "num_conv_pos_embedding_groups": 16,
         | 
| 75 | 
            +
              "num_conv_pos_embeddings": 128,
         | 
| 76 | 
            +
              "num_feat_extract_layers": 7,
         | 
| 77 | 
            +
              "num_hidden_layers": 48,
         | 
| 78 | 
            +
              "num_negatives": 100,
         | 
| 79 | 
            +
              "output_hidden_size": 1280,
         | 
| 80 | 
            +
              "pad_token_id": 0,
         | 
| 81 | 
            +
              "proj_codevector_dim": 1024,
         | 
| 82 | 
            +
              "tdnn_dilation": [
         | 
| 83 | 
            +
                1,
         | 
| 84 | 
            +
                2,
         | 
| 85 | 
            +
                3,
         | 
| 86 | 
            +
                1,
         | 
| 87 | 
            +
                1
         | 
| 88 | 
            +
              ],
         | 
| 89 | 
            +
              "tdnn_dim": [
         | 
| 90 | 
            +
                512,
         | 
| 91 | 
            +
                512,
         | 
| 92 | 
            +
                512,
         | 
| 93 | 
            +
                512,
         | 
| 94 | 
            +
                1500
         | 
| 95 | 
            +
              ],
         | 
| 96 | 
            +
              "tdnn_kernel": [
         | 
| 97 | 
            +
                5,
         | 
| 98 | 
            +
                3,
         | 
| 99 | 
            +
                3,
         | 
| 100 | 
            +
                1,
         | 
| 101 | 
            +
                1
         | 
| 102 | 
            +
              ],
         | 
| 103 | 
            +
              "torch_dtype": "float32",
         | 
| 104 | 
            +
              "transformers_version": "4.16.0.dev0",
         | 
| 105 | 
            +
              "use_weighted_layer_sum": false,
         | 
| 106 | 
            +
              "vocab_size": 43,
         | 
| 107 | 
            +
              "xvector_output_dim": 512
         | 
| 108 | 
            +
            }
         | 
    	
        eval.py
    ADDED
    
    | @@ -0,0 +1,146 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            #!/usr/bin/env python3
         | 
| 2 | 
            +
            from datasets import load_dataset, load_metric, Audio, Dataset
         | 
| 3 | 
            +
            from transformers import pipeline, AutoFeatureExtractor, AutoTokenizer
         | 
| 4 | 
            +
            import re
         | 
| 5 | 
            +
            import torch
         | 
| 6 | 
            +
            import argparse
         | 
| 7 | 
            +
            from typing import Dict
         | 
| 8 | 
            +
             | 
| 9 | 
            +
             | 
| 10 | 
            +
            def log_results(result: Dataset, args: argparse.Namespace):
                """Compute WER/CER over ``result`` and write them to disk.

                Args:
                    result: Dataset exposing a ``"target"`` and a ``"prediction"`` column
                        (both are read by column name below).
                    args: Parsed command-line namespace. The attributes ``dataset``,
                        ``config``, ``split`` and ``log_outputs`` are read.

                Side effects:
                    * Prints the metrics to stdout.
                    * Writes ``<dataset_id>_eval_results.txt``.
                    * When ``args.log_outputs`` is truthy, also writes
                      ``log_<dataset_id>_predictions.txt`` and
                      ``log_<dataset_id>_targets.txt`` with one index line followed by
                      one text line per example.
                """
                log_outputs = args.log_outputs
                dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])

                # load metrics
                wer = load_metric("wer")
                cer = load_metric("cer")

                # compute metrics
                targets = result["target"]
                predictions = result["prediction"]
                wer_result = wer.compute(references=targets, predictions=predictions)
                cer_result = cer.compute(references=targets, predictions=predictions)

                # print & log results
                result_str = (
                    f"WER: {wer_result}\n"
                    f"CER: {cer_result}"
                )
                print(result_str)

                # Explicit UTF-8 so the output does not depend on the platform's locale.
                with open(f"{dataset_id}_eval_results.txt", "w", encoding="utf-8") as f:
                    f.write(result_str)

                # `--log_outputs` is a store_true flag, i.e. always a bool and never None,
                # so test its truthiness; `is not None` made the flag a no-op.
                if log_outputs:
                    pred_file = f"log_{dataset_id}_predictions.txt"
                    target_file = f"log_{dataset_id}_targets.txt"

                    with open(pred_file, "w", encoding="utf-8") as p, \
                            open(target_file, "w", encoding="utf-8") as t:
                        # One index line followed by one text line per example.
                        for i, (prediction, target) in enumerate(zip(predictions, targets)):
                            p.write(f"{i}" + "\n")
                            p.write(prediction + "\n")
                            t.write(f"{i}" + "\n")
                            t.write(target + "\n")
         | 
| 49 | 
            +
             | 
| 50 | 
            +
             | 
| 51 | 
            +
            def normalize_text(text: str, invalid_chars_regex: str, to_lower: bool) -> str:
                """Normalize a reference transcript before scoring.

                DO ADAPT FOR YOUR USE CASE.

                Args:
                    text: Raw transcript.
                    invalid_chars_regex: Regular expression matching every character that
                        must be dropped; each match is replaced by a single space.
                    to_lower: If true the text is lower-cased, otherwise it is upper-cased.

                Returns:
                    The case-folded text with invalid characters removed, runs of
                    whitespace collapsed to one space, and leading/trailing whitespace
                    stripped.
                """
                text = text.lower() if to_lower else text.upper()

                text = re.sub(invalid_chars_regex, " ", text)

                # Raw string: "\s" in a normal literal is an invalid escape sequence.
                text = re.sub(r"\s+", " ", text).strip()

                return text
         | 
| 61 | 
            +
             | 
| 62 | 
            +
             | 
| 63 | 
            +
            def main(args):
                """Run the ASR pipeline over a dataset split and log WER/CER.

                Args:
                    args: Parsed command-line namespace. Reads ``dataset``, ``config``,
                        ``split``, ``model_id``, ``device``, ``chunk_length_s`` and
                        ``stride_length_s``; ``device`` is overwritten in place when it
                        is ``None``. The namespace is then passed on to ``log_results``.
                """
                # load dataset
                dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)

                # for testing: only process the first ten examples
                # dataset = dataset.select(range(10))

                # load processor
                feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
                sampling_rate = feature_extractor.sampling_rate

                # resample audio to the rate the feature extractor reports
                dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

                # load eval pipeline; without an explicit device use GPU 0 when available
                if args.device is None:
                    args.device = 0 if torch.cuda.is_available() else -1
                asr = pipeline("automatic-speech-recognition", model=args.model_id, device=args.device)

                # build normalizer config from the tokenizer vocabulary
                tokenizer = AutoTokenizer.from_pretrained(args.model_id)
                tokens = list(tokenizer.convert_ids_to_tokens(range(0, tokenizer.vocab_size)))
                special_tokens = [
                    tokenizer.pad_token, tokenizer.word_delimiter_token,
                    tokenizer.unk_token, tokenizer.bos_token,
                    tokenizer.eos_token,
                ]
                non_special_tokens = [x for x in tokens if x not in special_tokens]

                # Everything that is neither whitespace nor a vocabulary character is
                # invalid. Sorting makes the pattern deterministic; the raw f-string
                # keeps "\s" from being an invalid string escape.
                allowed_chars = re.escape("".join(sorted(set(non_special_tokens))))
                invalid_chars_regex = rf"[^\s{allowed_chars}]"

                # Lower-case the targets if the vocabulary contains any lower-case letter,
                # otherwise upper-case them.
                normalize_to_lower = any(
                    token.isalpha() and token.islower() for token in non_special_tokens
                )

                # map function to decode audio
                def map_to_pred(batch, args=args, asr=asr, invalid_chars_regex=invalid_chars_regex, normalize_to_lower=normalize_to_lower):
                    prediction = asr(batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s)

                    batch["prediction"] = prediction["text"]
                    # NOTE(review): assumes the dataset exposes the transcript in a
                    # "sentence" column - confirm for datasets other than Common Voice.
                    batch["target"] = normalize_text(batch["sentence"], invalid_chars_regex, normalize_to_lower)
                    return batch

                # run inference on all examples
                result = dataset.map(map_to_pred, remove_columns=dataset.column_names)

                # compute and log_results
                # do not change function below
                log_results(result, args)
         | 
| 112 | 
            +
             | 
| 113 | 
            +
             | 
| 114 | 
            +
            if __name__ == "__main__":
                # Command-line entry point: parse the evaluation options and run `main`.
                parser = argparse.ArgumentParser()

                parser.add_argument(
                    "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
                )
                parser.add_argument(
                    "--dataset", type=str, required=True, help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets"
                )
                parser.add_argument(
                    "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'`  for Common Voice"
                )
                parser.add_argument(
                    "--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`"
                )
                parser.add_argument(
                    "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to None. For long audio files a good value would be 5.0 seconds."
                )
                parser.add_argument(
                    "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to None. For long audio files a good value would be 1.0 seconds."
                )
                parser.add_argument(
                    "--log_outputs", action='store_true', help="If defined, write outputs to log file for analysis."
                )
                # The default is None, which `main` resolves to GPU 0 when CUDA is
                # available and to CPU (-1) otherwise; the help text states that.
                parser.add_argument(
                    "--device",
                    type=int,
                    default=None,
                    help="The device to run the pipeline on. -1 for CPU, 0 for the first GPU and so on. "
                         "If omitted, the first GPU is used when CUDA is available, otherwise CPU.",
                )
                args = parser.parse_args()

                main(args)
         | 
    	
        language_model/attrs.json
    ADDED
    
    | @@ -0,0 +1 @@ | |
|  | 
|  | |
| 1 | 
            +
            {"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}
         | 
    	
        language_model/lm.binary
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:d64e9f981412d81e0d86965ed6d574e797320a9a6603432e75136b46456f4c17
         | 
| 3 | 
            +
            size 212018701
         | 
    	
        language_model/unigrams.txt
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:6b9299acc35056cd86a2df6ce40019d54ab8e6358ebb6d17def96b7509853d21
         | 
| 3 | 
            +
            size 2256432
         | 
    	
        log_mozilla-foundation_common_voice_8_0_nl_test_predictions.txt
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        log_mozilla-foundation_common_voice_8_0_nl_test_predictions_greedy.txt
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        log_mozilla-foundation_common_voice_8_0_nl_test_targets.txt
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        log_mozilla-foundation_common_voice_8_0_nl_test_targets_greedy.txt
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        mozilla-foundation_common_voice_8_0_nl_test_eval_results.txt
    ADDED
    
    | @@ -0,0 +1,2 @@ | |
|  | |
|  | 
|  | |
| 1 | 
            +
            WER: 0.08502335170614718
         | 
| 2 | 
            +
            CER: 0.027540302344623565
         | 
    	
        mozilla-foundation_common_voice_8_0_nl_test_eval_results_greedy.txt
    ADDED
    
    | @@ -0,0 +1,2 @@ | |
|  | |
|  | 
|  | |
| 1 | 
            +
            WER: 0.10638700719485
         | 
| 2 | 
            +
            CER: 0.03152695480678647
         | 
    	
        preprocessor_config.json
    ADDED
    
    | @@ -0,0 +1,10 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "do_normalize": true,
         | 
| 3 | 
            +
              "feature_extractor_type": "Wav2Vec2FeatureExtractor",
         | 
| 4 | 
            +
              "feature_size": 1,
         | 
| 5 | 
            +
              "padding_side": "right",
         | 
| 6 | 
            +
              "padding_value": 0,
         | 
| 7 | 
            +
              "processor_class": "Wav2Vec2ProcessorWithLM",
         | 
| 8 | 
            +
              "return_attention_mask": true,
         | 
| 9 | 
            +
              "sampling_rate": 16000
         | 
| 10 | 
            +
            }
         | 
    	
        pytorch_model.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:8820100d2702c412ddd6be79f8c1197152b4f761d83536d9326bf907742fb0ee
         | 
| 3 | 
            +
            size 3850533041
         | 
    	
        special_tokens_map.json
    ADDED
    
    | @@ -0,0 +1 @@ | |
|  | 
|  | |
| 1 | 
            +
            {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
         | 
    	
        vocab.json
    ADDED
    
    | @@ -0,0 +1 @@ | |
|  | 
|  | |
| 1 | 
            +
            {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "'": 5, "-": 6, "a": 7, "b": 8, "c": 9, "d": 10, "e": 11, "f": 12, "g": 13, "h": 14, "i": 15, "j": 16, "k": 17, "l": 18, "m": 19, "n": 20, "o": 21, "p": 22, "q": 23, "r": 24, "s": 25, "t": 26, "u": 27, "v": 28, "w": 29, "x": 30, "y": 31, "z": 32, "à": 33, "â": 34, "è": 35, "é": 36, "ê": 37, "ë": 38, "î": 39, "ï": 40, "ô": 41, "û": 42}
         | 
