# fish-speech-1 / tools / whisper_asr.py
# Page metadata: PoTaTo721 -- "Update to V1.5" -- b2eb230
"""
Transcribe every audio file in one folder, writing a copy of each audio file
and a matching .lab transcript into another folder.
e.g.
Directory structure:
--pre_data_root
----SP_1
------01.wav
------02.wav
------......
----SP_2
------01.wav
------02.wav
------......
Use
python tools/whisper_asr.py --audio-dir pre_data_root/SP_1 --save-dir data/SP_1
to transcribe the first speaker.
Use
python tools/whisper_asr.py --audio-dir pre_data_root/SP_2 --save-dir data/SP_2
to transcribe the second speaker.
Note: Be aware of your audio sample rate; the --sample-rate option defaults to 44100 (44.1 kHz).
"""
import re
from pathlib import Path
import click
import soundfile as sf
from faster_whisper import WhisperModel
from loguru import logger
from pydub import AudioSegment
from tqdm import tqdm
from tools.file import AUDIO_EXTENSIONS, list_files
@click.command()
@click.option("--model-size", default="large-v3", help="Size of the Whisper model")
@click.option(
    "--compute-type",
    default="float16",
    help="Computation Precision of the Whisper model [float16 / int8_float16 / int8]",
)
@click.option("--audio-dir", required=True, help="Directory containing audio files")
@click.option(
    "--save-dir", required=True, help="Directory to save processed audio files"
)
@click.option(
    "--sample-rate",
    default=44100,
    type=int,
    help="Output sample rate, default to input sample rate",
)
@click.option("--device", default="cuda", help="Device to use [cuda / cpu]")
@click.option("--language", default="auto", help="Language of the transcription")
@click.option("--initial-prompt", default=None, help="Initial prompt for transcribing")
def main(
    model_size,
    compute_type,
    audio_dir,
    save_dir,
    sample_rate,
    device,
    language,
    initial_prompt,
):
    """Transcribe every audio file under --audio-dir with Faster Whisper.

    Each input file is re-exported under --save-dir at the same relative
    path, next to a .lab file holding its transcript. Files for which the
    model yields no segments are skipped with a warning.
    """
    # The docstring above doubles as the command description in `--help`;
    # the individual options are documented by their `help=` strings.
    #
    # NOTE(review): `sample_rate` is accepted but never referenced in this
    # function -- confirm whether the exported audio was meant to be resampled.
    logger.info("Loading / Downloading Faster Whisper model...")
    model = WhisperModel(
        model_size,
        device=device,
        compute_type=compute_type,
        download_root="faster_whisper",
    )
    logger.info("Model loaded.")

    save_path = Path(save_dir)
    save_path.mkdir(parents=True, exist_ok=True)

    audio_files = list_files(
        path=audio_dir, extensions=AUDIO_EXTENSIONS, recursive=True
    )
    for file_path in tqdm(audio_files, desc="Processing audio file"):
        file_stem = file_path.stem
        file_suffix = file_path.suffix

        # Mirror the input directory layout below the save directory.
        rel_path = Path(file_path).relative_to(audio_dir)
        output_dir = save_path / rel_path.parent
        output_dir.mkdir(parents=True, exist_ok=True)

        audio = AudioSegment.from_file(file_path)

        segments, info = model.transcribe(
            file_path,
            beam_size=5,
            # "auto" means: let the model detect the language itself.
            language=None if language == "auto" else language,
            initial_prompt=initial_prompt,
        )

        print(
            "Detected language '%s' with probability %f"
            % (info.language, info.language_probability)
        )
        print("Total len(ms): ", len(audio))

        # None means "no segment seen yet"; it is distinct from an empty
        # string, which a segment with empty text would produce.
        whole_text = None
        for segment in segments:
            text = segment.text
            print(
                "Segment %03d [%.2fs -> %.2fs] %s"
                % (segment.id, segment.start, segment.end, text)
            )
            if not whole_text:
                whole_text = text
            else:
                whole_text += ", " + text

        if whole_text is None:
            # Previously this case crashed with a TypeError (None + "."),
            # aborting the whole batch because of a single silent clip.
            logger.warning(f"No segments transcribed for {file_path}, skipping.")
            continue

        whole_text += "."

        audio_save_path = output_dir / f"{file_stem}{file_suffix}"
        # The export format is taken from the extension without its dot.
        audio.export(audio_save_path, format=file_suffix[1:])
        print(f"Exported {audio_save_path}")

        transcript_save_path = output_dir / f"{file_stem}.lab"
        with open(
            transcript_save_path,
            "w",
            encoding="utf-8",
        ) as f:
            f.write(whole_text)
if __name__ == "__main__":
    # `main` is a click command: calling it parses the command line, runs the
    # transcription, and (in click's default standalone mode) exits the
    # process itself, so no statement placed after this call would ever run.
    # The unreachable experiment code that used to follow has been removed.
    main()