valhalla committed on
Commit
a7472a9
1 Parent(s): b439595

update processor and readme

Browse files
Files changed (4) hide show
  1. README.md +7 -5
  2. config.json +3 -2
  3. preprocessor_config.json +3 -3
  4. pytorch_model.bin +2 -2
README.md CHANGED
@@ -35,16 +35,18 @@ transcripts by passing the speech features to the model.
35
  *Note: The `Speech2TextProcessor` object uses [torchaudio](https://github.com/pytorch/audio) to extract the
36
  filter bank features. Make sure to install the `torchaudio` package before running this example.*
37
 
38
- To install `torchaudio` run `pip install torchaudio`
 
 
39
 
40
 
41
  ```python
42
  import torch
43
- from transformers import Speech2TextProcessor, Speech2TextTransformerForConditionalGeneration
44
  from datasets import load_dataset
45
  import soundfile as sf
46
 
47
- model = Speech2TextTransformerForConditionalGeneration.from_pretrained("facebook/s2t-large-librispeech-asr")
48
  processor = Speech2TextProcessor.from_pretrained("facebook/s2t-large-librispeech-asr")
49
 
50
  def map_to_array(batch):
@@ -76,13 +78,13 @@ The following script shows how to evaluate this model on the [LibriSpeech](https
76
 
77
  ```python
78
  from datasets import load_dataset, load_metric
79
- from transformers import Speech2TextTransformerForConditionalGeneration, Speech2TextProcessor
80
  import soundfile as sf
81
 
82
  librispeech_eval = load_dataset("librispeech_asr", "clean", split="test") # change to "other" for other test dataset
83
  wer = load_metric("wer")
84
 
85
- model = Speech2TextTransformerForConditionalGeneration.from_pretrained("facebook/s2t-large-librispeech-asr").to("cuda")
86
  processor = Speech2TextProcessor.from_pretrained("facebook/s2t-large-librispeech-asr", do_upper_case=True)
87
 
88
  def map_to_array(batch):
 
35
  *Note: The `Speech2TextProcessor` object uses [torchaudio](https://github.com/pytorch/audio) to extract the
36
  filter bank features. Make sure to install the `torchaudio` package before running this example.*
37
 
38
+ You could either install those as extra speech dependencies with
39
+ `pip install transformers"[speech, sentencepiece]"` or install the packages separately
40
+ with `pip install torchaudio sentencepiece`.
41
 
42
 
43
  ```python
44
  import torch
45
+ from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
46
  from datasets import load_dataset
47
  import soundfile as sf
48
 
49
+ model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-large-librispeech-asr")
50
  processor = Speech2TextProcessor.from_pretrained("facebook/s2t-large-librispeech-asr")
51
 
52
  def map_to_array(batch):
 
78
 
79
  ```python
80
  from datasets import load_dataset, load_metric
81
+ from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
82
  import soundfile as sf
83
 
84
  librispeech_eval = load_dataset("librispeech_asr", "clean", split="test") # change to "other" for other test dataset
85
  wer = load_metric("wer")
86
 
87
+ model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-large-librispeech-asr").to("cuda")
88
  processor = Speech2TextProcessor.from_pretrained("facebook/s2t-large-librispeech-asr", do_upper_case=True)
89
 
90
  def map_to_array(batch):
config.json CHANGED
@@ -1,8 +1,9 @@
1
  {
 
2
  "activation_dropout": 0.2,
3
  "activation_function": "relu",
4
  "architectures": [
5
- "Speech2TextTransformerForConditionalGeneration"
6
  ],
7
  "attention_dropout": 0.2,
8
  "bos_token_id": 0,
@@ -33,7 +34,7 @@
33
  "max_length": 200,
34
  "max_source_positions": 6000,
35
  "max_target_positions": 1024,
36
- "model_type": "speech_to_text_transformer",
37
  "num_beams": 5,
38
  "num_conv_layers": 2,
39
  "num_hidden_layers": 12,
 
1
  {
2
+ "_name_or_path": "hf_models_fb/s2t-large-librispeech-asr/",
3
  "activation_dropout": 0.2,
4
  "activation_function": "relu",
5
  "architectures": [
6
+ "Speech2TextForConditionalGeneration"
7
  ],
8
  "attention_dropout": 0.2,
9
  "bos_token_id": 0,
 
34
  "max_length": 200,
35
  "max_source_positions": 6000,
36
  "max_target_positions": 1024,
37
+ "model_type": "speech_to_text",
38
  "num_beams": 5,
39
  "num_conv_layers": 2,
40
  "num_hidden_layers": 12,
preprocessor_config.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "do_normalize": true,
3
  "feature_size": 80,
4
- "norm_means": true,
5
- "norm_vars": true,
6
  "num_mel_bins": 80,
7
  "padding_side": "right",
8
  "padding_value": 0.0,
 
1
  {
2
+ "do_ceptral_normalize": true,
3
  "feature_size": 80,
4
+ "normalize_means": true,
5
+ "normalize_vars": true,
6
  "num_mel_bins": 80,
7
  "padding_side": "right",
8
  "padding_value": 0.0,
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02f8a1dd9fbebf969266e155a5a9df274f24bf82f544d000ec4cc55ca3ebda9a
3
- size 1071473998
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55e3aa76d71b3792f1a0a055316c0c205e0697a44778d4a6e6af4fc9994fd93c
3
+ size 1071459644