getting an error trying to extract word-level timestamps
#182
by
SphinxKing
- opened
```
File "c:\Users\Adham\Desktop\Coding stuff\TwitchPy\whisper_openai.py", line 87, in <module>
main()
File "c:\Users\Adham\Desktop\Coding stuff\TwitchPy\whisper_openai.py", line 81, in main
print(whisp.audio_to_text(filename, "word"))
File "c:\Users\Adham\Desktop\Coding stuff\TwitchPy\whisper_openai.py", line 54, in audio_to_text
result = self.pipe(audio_file, return_timestamps="word")
File "C:\Users\Adham\Desktop\Coding stuff\TwitchPy\twitch-obs\lib\site-packages\transformers\pipelines\automatic_speech_recognition.py", line 283, in __call__
return super().__call__(inputs, **kwargs)
File "C:\Users\Adham\Desktop\Coding stuff\TwitchPy\twitch-obs\lib\site-packages\transformers\pipelines\base.py", line 1360, in __call__
return next(
File "C:\Users\Adham\Desktop\Coding stuff\TwitchPy\twitch-obs\lib\site-packages\transformers\pipelines\pt_utils.py", line 124, in __next__
item = next(self.iterator)
File "C:\Users\Adham\Desktop\Coding stuff\TwitchPy\twitch-obs\lib\site-packages\transformers\pipelines\pt_utils.py", line 269, in __next__
processed = self.infer(next(self.iterator), **self.params)
File "C:\Users\Adham\Desktop\Coding stuff\TwitchPy\twitch-obs\lib\site-packages\transformers\pipelines\base.py", line 1275, in forward
model_outputs = self._forward(model_inputs, **forward_params)
File "C:\Users\Adham\Desktop\Coding stuff\TwitchPy\twitch-obs\lib\site-packages\transformers\pipelines\automatic_speech_recognition.py", line 521, in _forward
tokens = self.model.generate(
File "C:\Users\Adham\Desktop\Coding stuff\TwitchPy\twitch-obs\lib\site-packages\transformers\models\whisper\generation_whisper.py", line 774, in generate
) = self.generate_with_fallback(
File "C:\Users\Adham\Desktop\Coding stuff\TwitchPy\twitch-obs\lib\site-packages\transformers\models\whisper\generation_whisper.py", line 965, in generate_with_fallback
seek_sequences, seek_outputs = self._postprocess_outputs(
File "C:\Users\Adham\Desktop\Coding stuff\TwitchPy\twitch-obs\lib\site-packages\transformers\models\whisper\generation_whisper.py", line 1067, in _postprocess_outputs
seek_outputs["token_timestamps"] = self._extract_token_timestamps(
File "C:\Users\Adham\Desktop\Coding stuff\TwitchPy\twitch-obs\lib\site-packages\transformers\models\whisper\generation_whisper.py", line 315, in _extract_token_timestamps
matrix = weights[batch_idx, ..., : num_frames[batch_idx] // 2]
IndexError: index 0 is out of bounds for axis 0 with size 0```
This is the error log that I get from the following code snippet:
```python
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
class WhisperManager():
    """Transcribe audio files with a Hugging Face Whisper ASR pipeline.

    Uses Whisper on HuggingFace: https://huggingface.co/openai/whisper-large-v3
    Need to make sure you've installed torch with CUDA support, rather than
    just default torch:
    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
    I tried a lot but could not get Flash Attention 2 to install. It would
    speed up performance but isn't necessary.
    """

    def __init__(self):
        """Load the model and processor and build ``self.pipe``.

        Runs on ``cuda:0`` in float16 when CUDA is available, otherwise on
        the CPU in float32.
        """
        cuda_available = torch.cuda.is_available()
        print(cuda_available)  # True if CUDA is available
        if cuda_available:
            # Only query the GPU name when a GPU exists; the call raises on
            # CPU-only machines, which would defeat the CPU fallback below.
            print(torch.cuda.get_device_name(0))  # e.g. "NVIDIA GeForce RTX 4070 Ti"

        device = "cuda:0" if cuda_available else "cpu"
        torch_dtype = torch.float16 if cuda_available else torch.float32
        model_id = "openai/whisper-large-v3"

        torch.backends.cuda.enable_mem_efficient_sdp(True)

        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
            attn_implementation="eager",
        )
        model.to(device)
        model.generation_config.is_multilingual = False
        model.generation_config.language = "en"

        processor = AutoProcessor.from_pretrained(model_id)

        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            max_new_tokens=256,
            chunk_length_s=30,
            batch_size=16,
            torch_dtype=torch_dtype,
            device=device,
        )

    # Converts an audio file into transcribed text. Can also provide timestamps.
    # wav and mp3 files appear to take the same amount of time to process.
    # With test files, word timestamps took 3.5-4 seconds, sentence timestamps
    # took 2.2 seconds, no timestamps took 1.9-2 seconds.
    def audio_to_text(self, audio_file, timestamps=None):
        """Transcribe ``audio_file``, optionally with timestamps.

        Args:
            audio_file: Input handed unchanged to ``self.pipe``.
            timestamps: ``None`` for plain text, ``"sentence"`` for
                sentence-level timestamps, or ``"word"`` for word-level
                timestamps.

        Returns:
            The transcribed text (``str``) when ``timestamps`` is ``None``.
            Otherwise a list of dictionaries, one per sentence/word, shaped
            like ``{'text': 'here is my speech', 'start_time': 11.58,
            'end_time': 14.74}``. An unrecognised ``timestamps`` value yields
            an empty list without running the pipeline.
        """
        if timestamps is None:
            # If they didn't want the timestamps, then just return the text.
            return self.pipe(audio_file, return_timestamps=False)["text"]

        if timestamps == "sentence":
            result = self.pipe(audio_file, return_timestamps=True)
        elif timestamps == "word":
            # NOTE(review): this is the call that raised the reported
            # IndexError inside transformers' _extract_token_timestamps.
            # Possibly related to batched chunking (batch_size=16) in
            # transformers 4.49.0 -- TODO confirm by retrying with
            # batch_size=1 or a different transformers release.
            result = self.pipe(audio_file, return_timestamps="word")
        else:
            # Unknown mode: there is nothing to transcribe into chunks, so
            # return an empty list instead of failing on a missing 'chunks' key.
            return []

        # Reformat the pipeline chunks so that they are more intuitive to
        # work with: flat start/end keys instead of a (start, end) tuple.
        return [
            {
                'text': chunk['text'],
                'start_time': chunk['timestamp'][0],
                'end_time': chunk['timestamp'][1],
            }
            for chunk in result['chunks']
        ]
```
Environment: Python 3.9.2, transformers 4.49.0, torch/torchaudio 2.6.0, and torchvision 0.21 on Windows 10. All of these packages are the CUDA 12.6 builds, even though I currently have CUDA 12.8 installed on my machine.