Update README.md
README.md CHANGED
@@ -258,6 +258,41 @@ result = pipe(sample)
print(result["text"])
```

### Transcription with Prompt
Kotoba-whisper can generate transcriptions with prompting, as shown below:

```python
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
from datasets import load_dataset, Audio

# config
model_id = "kotoba-tech/kotoba-whisper-v1.0"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# load model
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)

# load sample audio & downsample to 16kHz
dataset = load_dataset("japanese-asr/ja_asr.reazonspeech_test", split="test")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
input_features = processor(dataset[10]["audio"]["array"], sampling_rate=16000, return_tensors="pt").input_features
input_features = input_features.to(device, torch_dtype)  # match the model's device and dtype

# --- Without prompt ---
output_without_prompt = model.generate(input_features)
print(processor.decode(output_without_prompt[0]))
# <|startoftranscript|><|ko|><|transcribe|><|notimestamps|>81歳、力強い走りに変わってきました。<|endoftext|>

# --- With prompt ---: let's change `81` to `91` (the prompt "91歳" means "91 years old")
prompt_ids = processor.get_prompt_ids("91歳", return_tensors="pt")
output_with_prompt = model.generate(input_features, prompt_ids=prompt_ids)
print(processor.decode(output_with_prompt[0]))
# <|startofprev|> 91歳<|startoftranscript|><|ko|><|transcribe|><|notimestamps|> あっぶったで、スルガさん、91歳、力強い走りに変わってきました。<|endoftext|>
```

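As the decoded output shows, the prompt is injected as previous-context tokens between `<|startofprev|>` and `<|startoftranscript|>`, nudging the decoder toward the prompted wording. If you prefer the `pipeline` API from the earlier examples, the prompt can plausibly be supplied through `generate_kwargs`, which the ASR pipeline forwards to `model.generate`; the sketch below assumes that pass-through behaviour and reuses the same sample:

```python
import torch
from transformers import pipeline, AutoProcessor
from datasets import load_dataset, Audio

# config
model_id = "kotoba-tech/kotoba-whisper-v1.0"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# build the pipeline and the prompt ids
pipe = pipeline("automatic-speech-recognition", model=model_id, torch_dtype=torch_dtype, device=device)
processor = AutoProcessor.from_pretrained(model_id)
prompt_ids = processor.get_prompt_ids("91歳", return_tensors="pt")

# same sample as above, resampled to 16kHz
dataset = load_dataset("japanese-asr/ja_asr.reazonspeech_test", split="test")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# assumption: `generate_kwargs` is handed straight to `model.generate`
result = pipe(dataset[10]["audio"], generate_kwargs={"prompt_ids": prompt_ids})
print(result["text"])
```

If your installed `transformers` version does not forward `prompt_ids` this way, fall back to the explicit `model.generate` call shown above.
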
### Additional Speed & Memory Improvements
You can apply additional speed and memory improvements to further reduce inference time and VRAM requirements. These optimisations primarily target the attention kernel, swapping it from an eager implementation to a more efficient flash attention version.
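
For example, recent `transformers` releases let you select the attention kernel at load time via the standard `attn_implementation` argument; the following is a minimal sketch of that pattern, not a recipe specific to this checkpoint:

```python
import torch
from transformers import AutoModelForSpeechSeq2Seq

# swap the eager attention kernel for PyTorch's scaled dot-product attention (SDPA)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "kotoba-tech/kotoba-whisper-v1.0",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    attn_implementation="sdpa",  # "flash_attention_2" also works if flash-attn is installed
)
```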