Short form transcription - Does distil-medium.en only transcribe for max 30 seconds of a video/audio?
Hi,
I am just testing this new model. I used openai's whisper for transcribing an audio (~ 4 mins length) and it transcribes the whole audio.
I used short form transcription using distil-medium.en using the code shown on the model card page but it only transcribes the first 30 seconds. Why is that so?
Here's the code:
import os
import argparse
import whisper
import time
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from writeToJSON import createJSON
def openai_transcript(vid_path):
model = whisper.load_model('medium.en', device = "cpu")
print(f"Transcribing {vid_path} using openai whisper...")
start_time = time.time()
result = model.transcribe(vid_path)
end_time = time.time()
return result["text"], f"{end_time - start_time:.2f} seconds"
# print(f"{vid_path} using openai whisper took => {end_time - start_time:.2f} seconds")
def distil_whisper_transcript(vid_path):
device = "cpu"
torch_dtype = torch.float32
model_id = "distil-whisper/distil-medium.en"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
model_id, torch_dtype=torch_dtype, use_safetensors=True #low_cpu_mem_usage=True,
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)
pipe = pipeline(
"automatic-speech-recognition",
model=model,
tokenizer=processor.tokenizer,
feature_extractor=processor.feature_extractor,
# max_new_tokens=128,
torch_dtype=torch_dtype,
device=device,
)
print(f"Transcribing {vid_path} using distil-whisper...")
start_time = time.time()
result = pipe(vid_path)
end_time = time.time()
return result["text"], f"{end_time - start_time:.2f} seconds"
# print(f"{vid_path} using distil-whisper took => {end_time - start_time:.2f} seconds")
if name == "main":
# Create an argument parser
parser = argparse.ArgumentParser(description="Transcription-summarization pipeline")
# Define expected command-line arguments
parser.add_argument('--vid_folder', type=str, help='Enter the video file path')
# Parse the command-line arguments
args = parser.parse_args()
vid_folder = args.vid_folder
for vid in os.listdir(vid_folder):
vid_path = os.path.join(vid_folder, vid)
# Transcribe using openai whisper medium.en
transcript_openai, time_openai = openai_transcript(vid_path)
# Transcribe using distil-whisper medium.en
transcript_distil, time_distil = distil_whisper_transcript(vid_path)
createJSON(vid_path, transcript_openai, time_openai, transcript_distil, time_distil, "output.json")
Hey @anuragrawal - by 'short-form' audio we refer to audio segments less than 30s. To transcribe 'long-form' audio (>30s) please see the example usage: https://huggingface.co/distil-whisper/distil-medium.en#long-form-transcription