# Manual smoke test: load one audio clip, run it through a pywhispercpp
# model, and print every returned segment with its start/end timestamps.

from pywhispercpp.model import Model
import config
import soundfile
from pywhispercpp.utils import to_timestamp

# Clip under test. Alternative input used previously:
#   f"{config.ASSERT_DIR}/jfk.flac"
AUDIO_PATH = "test/6_before_cut_56640.wav"

# NOTE(review): despite the name, `mel` holds whatever soundfile.read returns
# as its first value (the decoded samples), not a mel spectrogram. The second
# return value is discarded.
# NOTE(review): presumably the model expects 16 kHz mono input -- nothing here
# verifies that; confirm against the clip and the pywhispercpp docs.
mel, _ = soundfile.read(AUDIO_PATH)

models_dir = config.MODEL_DIR.as_posix()

model = Model(
    model=config.WHISPER_MODEL,
    models_dir=models_dir,
    n_threads=4,
    # Silence the library's own console output; this script prints results.
    print_realtime=False,
    print_progress=False,
    print_timestamps=False,
    translate=False,
    temperature=0.0,
    no_context=True,
)

# Sanity check of the loaded array, e.g. (160000,) float64.
print(mel.shape, mel.dtype)

# Alternative settings tried previously (kept here for reference):
#   initial_prompt="" / 'The following is an English sentence.'
#   initial_prompt="以下是简体中文句子。" together with language='zh'
#   (the Chinese prompt reads roughly "The following is a sentence in
#   Simplified Chinese.")
# NOTE(review): token_timestamps=True with max_len=1 presumably forces very
# short (near word-level) segments -- confirm against the whisper.cpp params.
segments = model.transcribe(
    mel,
    language="en",
    token_timestamps=True,
    max_len=1,
)

for segment in segments:
    start = to_timestamp(segment.t0)
    end = to_timestamp(segment.t1)
    print(start, end, segment.text)