Spaces:

kotoba-speech
/

kotoba-whisper-demo

Running on Zero

asahi417 committed on Apr 21, 2024

Commit

da4f293

verified ·

1 Parent(s): 9768dae

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -9,16 +9,27 @@ import tempfile
 import os
 MODEL_NAME = "kotoba-tech/kotoba-whisper-v1.0"
-BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
 YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files
-device = 0 if torch.cuda.is_available() else "cpu"
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_NAME,
-    chunk_length_s=15,
     device=device,
 )
@@ -26,7 +37,8 @@ pipe = pipeline(
 def transcribe(inputs):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
-    return pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"}, return_timestamps=True)["text"]
 def _return_yt_html_embed(yt_url):
@@ -68,7 +80,8 @@ def yt_transcribe(yt_url, max_filesize=75.0):
             inputs = f.read()
     inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
-    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"}, return_timestamps=True)["text"]
     return html_embed_str, text

 import os
 MODEL_NAME = "kotoba-tech/kotoba-whisper-v1.0"
+BATCH_SIZE = 16
+CHUNK_LENGTH_S = 15
 FILE_LIMIT_MB = 1000
 YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files
+if torch.cuda.is_available():
+    torch_dtype = torch.bfloat16
+    device = "cuda:0"
+    model_kwargs = {'attn_implementation': 'sdpa'}
+else:
+    torch_dtype = torch.float32
+    device = "cpu"
+    model_kwargs = {}
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_NAME,
+    chunk_length_s=CHUNK_LENGTH_S,
+    torch_dtype=torch_dtype,
     device=device,
+    model_kwargs=model_kwargs
 )
 def transcribe(inputs):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
+    generate_kwargs = {"language": "japanese", "task": "transcribe"}
+    return pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs=generate_kwargs)["text"]
 def _return_yt_html_embed(yt_url):
             inputs = f.read()
     inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
+    generate_kwargs = {"language": "japanese", "task": "transcribe"}
+    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs=generate_kwargs)["text"]
     return html_embed_str, text