Automatic Speech Recognition
Transformers
Safetensors
Japanese
whisper
audio
hf-asr-leaderboard
Eval Results
Inference Endpoints
asahi417 committed on
Commit
ca97597
·
verified ·
1 Parent(s): 612332b

Update README.md

Browse files

Fix the code snippets to use the correct sampling rate.

Files changed (1) hide show
  1. README.md +13 -11
README.md CHANGED
@@ -124,7 +124,7 @@ class to transcribe short-form audio files (< 30-seconds) as follows:
124
  ```python
125
  import torch
126
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
127
- from datasets import load_dataset
128
 
129
  # config
130
  model_id = "kotoba-tech/kotoba-whisper-v1.0"
@@ -145,8 +145,9 @@ pipe = pipeline(
145
  device=device,
146
  )
147
 
148
- # load sample audio
149
- dataset = load_dataset("japanese-asr/ja_asr.common_voice_8_0", split="test")
 
150
  sample = dataset[0]["audio"]
151
 
152
  # run inference
@@ -154,7 +155,7 @@ result = pipe(sample)
154
  print(result["text"])
155
  ```
156
 
157
- - To transcribe a local audio file, simply pass the path to your audio file when you call the pipeline:
158
  ```diff
159
  - result = pipe(sample)
160
  + result = pipe("audio.mp3")
@@ -205,7 +206,8 @@ pipe = pipeline(
205
  )
206
 
207
  # load sample audio (concatenate instances to create a long audio clip)
208
- dataset = load_dataset("japanese-asr/ja_asr.common_voice_8_0", split="test")
 
209
  sample = {"array": np.concatenate([i["array"] for i in dataset[:20]["audio"]]), "sampling_rate": dataset[0]['audio']['sampling_rate'], "path": "tmp"}
210
 
211
  # run inference
@@ -247,7 +249,8 @@ pipe = pipeline(
247
  )
248
 
249
  # load sample audio (concatenate instances to create a long audio clip)
250
- dataset = load_dataset("japanese-asr/ja_asr.common_voice_8_0", split="test")
 
251
  sample = {"array": np.concatenate([i["array"] for i in dataset[:20]["audio"]]), "sampling_rate": dataset[0]['audio']['sampling_rate'], "path": "tmp"}
252
 
253
  # run inference
@@ -318,14 +321,14 @@ Evaluation can then be run end-to-end with the following example:
318
 
319
  ```python
320
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
321
- from datasets import load_dataset, features
322
  from evaluate import load
323
  import torch
324
  from tqdm import tqdm
325
 
326
  # config
327
  model_id = "kotoba-tech/kotoba-whisper-v1.0"
328
- dataset_name = "japanese-asr/ja_asr.common_voice_8_0"
329
  torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
330
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
331
  audio_column = 'audio'
@@ -339,8 +342,7 @@ processor = AutoProcessor.from_pretrained(model_id)
339
 
340
  # load the dataset and resample the audio to 16kHz
341
  dataset = load_dataset(dataset_name, split="test")
342
- dataset = dataset.cast_column(audio_column, features.Audio(sampling_rate=processor.feature_extractor.sampling_rate))
343
- dataset = dataset.select([0, 1, 2, 3, 4, 5, 6])
344
 
345
  # preprocess and batch the dataset
346
 
@@ -379,7 +381,7 @@ The huggingface links to the major Japanese ASR datasets for evaluation are summ
379
  For example, to evaluate the model on JSUT Basic5000, change the `dataset_name`:
380
 
381
  ```diff
382
- - dataset_name = "japanese-asr/ja_asr.common_voice_8_0"
383
  + dataset_name = "japanese-asr/ja_asr.jsut_basic5000"
384
  ```
385
 
 
124
  ```python
125
  import torch
126
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
127
+ from datasets import load_dataset, Audio
128
 
129
  # config
130
  model_id = "kotoba-tech/kotoba-whisper-v1.0"
 
145
  device=device,
146
  )
147
 
148
+ # load sample audio & downsample to 16kHz
149
+ dataset = load_dataset("japanese-asr/ja_asr.reazonspeech_test", split="test")
150
+ dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
151
  sample = dataset[0]["audio"]
152
 
153
  # run inference
 
155
  print(result["text"])
156
  ```
157
 
158
+ - To transcribe a local audio file, simply pass the path to your audio file when you call the pipeline (make sure the audio is sampled at 16kHz):
159
  ```diff
160
  - result = pipe(sample)
161
  + result = pipe("audio.mp3")
 
206
  )
207
 
208
  # load sample audio (concatenate instances to create a long audio clip)
209
+ dataset = load_dataset("japanese-asr/ja_asr.reazonspeech_test", split="test")
210
+ dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
211
  sample = {"array": np.concatenate([i["array"] for i in dataset[:20]["audio"]]), "sampling_rate": dataset[0]['audio']['sampling_rate'], "path": "tmp"}
212
 
213
  # run inference
 
249
  )
250
 
251
  # load sample audio (concatenate instances to create a long audio clip)
252
+ dataset = load_dataset("japanese-asr/ja_asr.reazonspeech_test", split="test")
253
+ dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
254
  sample = {"array": np.concatenate([i["array"] for i in dataset[:20]["audio"]]), "sampling_rate": dataset[0]['audio']['sampling_rate'], "path": "tmp"}
255
 
256
  # run inference
 
321
 
322
  ```python
323
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
324
+ from datasets import load_dataset, Audio
325
  from evaluate import load
326
  import torch
327
  from tqdm import tqdm
328
 
329
  # config
330
  model_id = "kotoba-tech/kotoba-whisper-v1.0"
331
+ dataset_name = "japanese-asr/ja_asr.reazonspeech_test"
332
  torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
333
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
334
  audio_column = 'audio'
 
342
 
343
  # load the dataset and resample the audio to 16kHz
344
  dataset = load_dataset(dataset_name, split="test")
345
+ dataset = dataset.cast_column(audio_column, Audio(sampling_rate=processor.feature_extractor.sampling_rate))
 
346
 
347
  # preprocess and batch the dataset
348
 
 
381
  For example, to evaluate the model on JSUT Basic5000, change the `dataset_name`:
382
 
383
  ```diff
384
+ - dataset_name = "japanese-asr/ja_asr.reazonspeech_test"
385
  + dataset_name = "japanese-asr/ja_asr.jsut_basic5000"
386
  ```
387