Fix character whitelist
- README.md (+3 -3)
- tokenizer_config.json (+8 -1)

README.md CHANGED
@@ -23,7 +23,7 @@ model-index:
     metrics:
       - name: Test WER
         type: wer
-        value:
+        value: 16.25
 ---
 
 # Wav2Vec2-Large-XLSR-53-Dutch
@@ -87,7 +87,7 @@ processor = Wav2Vec2Processor.from_pretrained("wietsedv/wav2vec2-large-xlsr-53-f
 model = Wav2Vec2ForCTC.from_pretrained("wietsedv/wav2vec2-large-xlsr-53-frisian")
 model.to("cuda")
 
-chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”]'
+chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\'\“\%\‘\”]'
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
 
 # Preprocessing the datasets.
@@ -117,7 +117,7 @@ result = test_dataset.map(evaluate, batched=True, batch_size=8)
 print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 ```
 
-**Test Result**:
+**Test Result**: 16.25 %
 
 
 ## Training
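For context, the regex change adds the straight apostrophe to the blacklist that the README's evaluation snippet strips from reference sentences before computing WER. Below is a minimal sketch of that normalization step, assuming the usual pattern from these model cards; the helper name `normalize_sentence` and the sample string are illustrative, not part of the commit.

```python
import re

# Updated blacklist from the README diff above: the straight apostrophe (\')
# is now stripped from transcriptions along with the other punctuation marks.
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\'\“\%\‘\”]'

def normalize_sentence(sentence: str) -> str:
    # Illustrative helper (not from the commit): remove blacklisted
    # characters and lowercase, mirroring the README's preprocessing step.
    return re.sub(chars_to_ignore_regex, "", sentence).lower()

print(normalize_sentence('Hello, "world", it\'s fine!'))
# -> hello world its fine
```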
    	
tokenizer_config.json CHANGED
@@ -1 +1,8 @@
-{
+{
+    "unk_token": "<unk>",
+    "bos_token": "<s>",
+    "eos_token": "</s>",
+    "pad_token": "<pad>",
+    "do_lower_case": true,
+    "word_delimiter_token": "|"
+}
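The added keys are standard `Wav2Vec2CTCTokenizer` settings. As a hedged illustration (not part of the commit) of how the expanded config is picked up, assuming the repository id used in the README:

```python
from transformers import Wav2Vec2Processor

# from_pretrained reads tokenizer_config.json, so the special tokens and
# do_lower_case defined above take effect for downstream users.
processor = Wav2Vec2Processor.from_pretrained("wietsedv/wav2vec2-large-xlsr-53-frisian")
tokenizer = processor.tokenizer

print(tokenizer.unk_token, tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token)
# <unk> <s> </s> <pad>
print(tokenizer.word_delimiter_token)  # |
print(tokenizer.do_lower_case)         # True
```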