"""Speech tokenizer class.

Copyright PolyAI Limited.
"""
import logging | |
import os | |
import numpy as np | |
import torch | |
import torchaudio | |
from speechtokenizer import SpeechTokenizer as ST | |
from modules.tokenizer import BaseTokenizer | |
class SpeechTokenizer(BaseTokenizer):
    """Encode audio files into semantic and acoustic token arrays.

    Wraps a ``speechtokenizer.SpeechTokenizer`` model. The model is loaded
    once in ``__init__``, moved to CUDA when available (CPU otherwise) and
    switched to eval mode.
    """

    # Files longer than this many seconds are skipped by ``encode_file``
    # because encoding them can cause CUDA OOM.
    MAX_DURATION_SEC = 60

    def __init__(self, config_path: str, ckpt_path: str):
        """Load the SpeechTokenizer model.

        Args:
            config_path: Model config path, forwarded as the first argument
                of ``ST.load_from_checkpoint``.
            ckpt_path: Checkpoint path, forwarded as the second argument.
        """
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.model = ST.load_from_checkpoint(
            config_path, ckpt_path).to(self.device)
        self.model.eval()

    def encode_file(
            self, folder_path: str, destination_folder: str, filename: str):
        """Encode one audio file and save its tokens as ``.npy`` files.

        The first quantizer's codes are written to
        ``<destination_folder>/semantic/<stem>.npy`` and the remaining
        quantizers' codes to ``<destination_folder>/acoustic/<stem>.npy``,
        where ``<stem>`` is ``filename`` without its extension.

        Nothing is written when both outputs already exist, or when the
        audio is longer than ``MAX_DURATION_SEC`` seconds.

        Args:
            folder_path: Directory containing the audio file.
            destination_folder: Root directory for the two output folders.
            filename: Name of the audio file inside ``folder_path``.
        """
        npy_name = os.path.splitext(filename)[0] + ".npy"
        semantic_path = os.path.join(
            destination_folder, "semantic", npy_name)
        acoustic_path = os.path.join(
            destination_folder, "acoustic", npy_name)

        # Both outputs already present: nothing to do.
        if os.path.exists(semantic_path) and os.path.exists(acoustic_path):
            return

        self._create_subfolders(destination_folder=destination_folder)

        file_path = os.path.join(folder_path, filename)
        wav_info = torchaudio.info(file_path)
        wav_dur_sec = wav_info.num_frames / wav_info.sample_rate
        if wav_dur_sec > self.MAX_DURATION_SEC:
            logging.info(
                "Skipping %s is too long: %.3f sec, can cause CUDA OOM",
                file_path, wav_dur_sec,
            )
            return

        wav, sr = torchaudio.load(file_path)
        # Keep only the first channel of multi-channel audio, as the
        # upstream SpeechTokenizer usage example does before encoding.
        if wav.shape[0] > 1:
            wav = wav[:1, :]
        if sr != self.model.sample_rate:
            logging.warning(
                "Wav sample rate %(wav_sr)s does not match the model "
                "sampling rate %(model_sr)s. Resampling audio",
                {"wav_sr": sr, "model_sr": self.model.sample_rate},
            )
            wav = torchaudio.functional.resample(
                wav, sr, self.model.sample_rate)

        # Add a leading batch dimension and move to the model's device.
        wav = wav.unsqueeze(0).to(self.device)

        # Extract discrete codes from SpeechTokenizer
        with torch.no_grad():
            codes = self.model.encode(wav)  # codes: (n_q, B, T)

        semantic_tokens = codes[0, 0, :]
        acoustic_tokens = codes[1:, 0, :]

        # Save the encodings as .npy
        np.save(acoustic_path, acoustic_tokens.cpu().numpy())
        np.save(semantic_path, semantic_tokens.cpu().numpy())

    @staticmethod
    def _create_subfolders(destination_folder: str):
        """Create the ``acoustic`` and ``semantic`` output folders.

        Declared static so that ``self._create_subfolders(
        destination_folder=...)`` does not bind the instance to
        ``destination_folder``.

        Args:
            destination_folder: Root directory that receives both folders.
        """
        for subfolder in ("acoustic", "semantic"):
            # exist_ok avoids a race between an existence check and creation.
            os.makedirs(
                os.path.join(destination_folder, subfolder), exist_ok=True)