readme
Browse files
README.md
CHANGED
@@ -109,12 +109,11 @@ it inherits the benefit of the improved latency compared to [openai/whisper-larg
|
|
109 |
|
110 |
## Transformers Usage
|
111 |
Kotoba-Whisper is supported in the Hugging Face 🤗 Transformers library from version 4.39 onwards. To run the model, first
|
112 |
-
install the latest version of Transformers.
|
113 |
-
from the Hugging Face Hub:
|
114 |
|
115 |
```bash
|
116 |
pip install --upgrade pip
|
117 |
-
pip install --upgrade transformers accelerate
|
118 |
```
|
119 |
|
120 |
### Short-Form Transcription
|
@@ -124,7 +123,7 @@ class to transcribe short-form audio files (< 30 seconds) as follows:
|
|
124 |
```python
|
125 |
import torch
|
126 |
from transformers import pipeline
|
127 |
-
from datasets import load_dataset
|
128 |
|
129 |
# config
|
130 |
model_id = "kotoba-tech/kotoba-whisper-v1.0"
|
@@ -142,7 +141,7 @@ pipe = pipeline(
|
|
142 |
model_kwargs=model_kwargs
|
143 |
)
|
144 |
|
145 |
-
# load sample audio
|
146 |
dataset = load_dataset("japanese-asr/ja_asr.reazonspeech_test", split="test")
|
147 |
sample = dataset[0]["audio"]
|
148 |
|
@@ -222,7 +221,7 @@ Kotoba-Whisper can generate transcriptions with prompting as follows:
|
|
222 |
import re
|
223 |
import torch
|
224 |
from transformers import pipeline
|
225 |
-
from datasets import load_dataset
|
226 |
|
227 |
# config
|
228 |
model_id = "kotoba-tech/kotoba-whisper-v1.0"
|
@@ -240,7 +239,7 @@ pipe = pipeline(
|
|
240 |
model_kwargs=model_kwargs
|
241 |
)
|
242 |
|
243 |
-
# load sample audio
|
244 |
dataset = load_dataset("japanese-asr/ja_asr.reazonspeech_test", split="test")
|
245 |
|
246 |
# --- Without prompt ---
|
@@ -297,11 +296,9 @@ pip install --upgrade transformers datasets[audio] evaluate jiwer
|
|
297 |
Evaluation can then be run end-to-end with the following example:
|
298 |
|
299 |
```python
|
300 |
-
from tqdm import tqdm
|
301 |
-
|
302 |
import torch
|
303 |
from transformers import pipeline
|
304 |
-
from datasets import load_dataset
|
305 |
from evaluate import load
|
306 |
|
307 |
# model config
|
@@ -312,7 +309,6 @@ model_kwargs = {"attn_implementation": "sdpa"} if torch.cuda.is_available() else
|
|
312 |
generate_kwargs = {"language": "japanese", "task": "transcribe"}
|
313 |
|
314 |
# data config
|
315 |
-
generate_kwargs = {"language": "japanese", "task": "transcribe"}
|
316 |
dataset_name = "japanese-asr/ja_asr.reazonspeech_test"
|
317 |
audio_column = 'audio'
|
318 |
text_column = 'transcription'
|
|
|
109 |
|
110 |
## Transformers Usage
|
111 |
Kotoba-Whisper is supported in the Hugging Face 🤗 Transformers library from version 4.39 onwards. To run the model, first
|
112 |
+
install the latest version of Transformers.
|
|
|
113 |
|
114 |
```bash
|
115 |
pip install --upgrade pip
|
116 |
+
pip install --upgrade transformers accelerate
|
117 |
```
|
118 |
|
119 |
### Short-Form Transcription
|
|
|
123 |
```python
|
124 |
import torch
|
125 |
from transformers import pipeline
|
126 |
+
from datasets import load_dataset
|
127 |
|
128 |
# config
|
129 |
model_id = "kotoba-tech/kotoba-whisper-v1.0"
|
|
|
141 |
model_kwargs=model_kwargs
|
142 |
)
|
143 |
|
144 |
+
# load sample audio
|
145 |
dataset = load_dataset("japanese-asr/ja_asr.reazonspeech_test", split="test")
|
146 |
sample = dataset[0]["audio"]
|
147 |
|
|
|
221 |
import re
|
222 |
import torch
|
223 |
from transformers import pipeline
|
224 |
+
from datasets import load_dataset
|
225 |
|
226 |
# config
|
227 |
model_id = "kotoba-tech/kotoba-whisper-v1.0"
|
|
|
239 |
model_kwargs=model_kwargs
|
240 |
)
|
241 |
|
242 |
+
# load sample audio
|
243 |
dataset = load_dataset("japanese-asr/ja_asr.reazonspeech_test", split="test")
|
244 |
|
245 |
# --- Without prompt ---
|
|
|
296 |
Evaluation can then be run end-to-end with the following example:
|
297 |
|
298 |
```python
|
|
|
|
|
299 |
import torch
|
300 |
from transformers import pipeline
|
301 |
+
from datasets import load_dataset
|
302 |
from evaluate import load
|
303 |
|
304 |
# model config
|
|
|
309 |
generate_kwargs = {"language": "japanese", "task": "transcribe"}
|
310 |
|
311 |
# data config
|
|
|
312 |
dataset_name = "japanese-asr/ja_asr.reazonspeech_test"
|
313 |
audio_column = 'audio'
|
314 |
text_column = 'transcription'
|