Spaces:
Running
Running
| import os | |
| import sys | |
| from random import randint | |
| from typing import List, Optional, Set, Union | |
| from tortoise.utils.audio import get_voices, load_audio, load_voices | |
| from tortoise.utils.text import split_and_recombine_text | |
def get_all_voices(extra_voice_dirs_str: str = ""):
    """Return (sorted available voice names, list of extra voice directories).

    `extra_voice_dirs_str` is a comma-separated list of directories; an empty
    string means no extra directories.
    """
    if extra_voice_dirs_str:
        dirs = extra_voice_dirs_str.split(",")
    else:
        dirs = []
    return sorted(get_voices(dirs)), dirs
def parse_voice_str(voice_str: str, all_voices: List[str]):
    """Parse a comma-separated voice spec into groups of voice names.

    "all" selects every known voice; "&" inside one entry combines several
    voices into a single group. Raises ValueError for any name that is
    neither a known voice nor the literal "random".
    """
    if voice_str == "all":
        chosen = all_voices
    else:
        chosen = voice_str.split(",")
    # Splitting on "&" yields a one-element list when "&" is absent,
    # so every entry becomes a group uniformly.
    groups = [entry.split("&") for entry in chosen]
    for group in groups:
        for name in group:
            if name == "random" or name in all_voices:
                continue
            raise ValueError(
                f"voice {name} not available, use --list-voices to see available voices."
            )
    return groups
def voice_loader(selected_voices: list, extra_voice_dirs: List[str]):
    """Lazily load each selected voice group.

    Yields a tuple of the group's voice names followed by whatever
    `load_voices` returns for that group (conditioning data — see
    tortoise.utils.audio.load_voices).
    """
    for group in selected_voices:
        loaded = load_voices(group, extra_voice_dirs)
        yield (group, *loaded)
def parse_multiarg_text(text: List[str]):
    """Join CLI text arguments with spaces, or fall back to reading stdin.

    The result is stripped of leading/trailing whitespace either way.
    """
    if text:
        joined = " ".join(text)
    else:
        joined = sys.stdin.read()
    return joined.strip()
def split_text(text: str, text_split: str):
    """Split `text` into speakable chunks.

    `text_split` is either empty (library defaults) or a
    "desired_length,max_length" pair of ints. Raises ValueError when
    desired_length exceeds max_length, or when no chunks result.
    """
    if not text_split:
        pieces = split_and_recombine_text(text)
    else:
        desired_length, max_length = [int(part) for part in text_split.split(",")]
        if desired_length > max_length:
            raise ValueError(
                f"--text-split: desired_length ({desired_length}) must be <= max_length ({max_length})"
            )
        pieces = split_and_recombine_text(text, desired_length, max_length)

    if not pieces:
        raise ValueError("no text provided")
    return pieces
def validate_output_dir(output_dir: str, selected_voices: list, candidates: int):
    """Create `output_dir` if given, or verify it is not required.

    Without an output directory, multiple voices or multiple candidates
    would have nowhere distinct to be written, so those combinations
    raise ValueError. Returns `output_dir` unchanged.
    """
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    else:
        if len(selected_voices) > 1:
            # BUG FIX: messages previously ended with a stray '"' character
            # ( --output-dir" ) left over from a quoting mix-up.
            raise ValueError("cannot have multiple voices without --output-dir")
        if candidates > 1:
            raise ValueError("cannot have multiple candidates without --output-dir")
    return output_dir
def check_pydub(play: bool):
    """Return the pydub module when playback is requested, else None.

    Raises RuntimeError with install instructions if pydub is missing.
    """
    if not play:
        return None
    try:
        import pydub
        import pydub.playback
    except ImportError:
        raise RuntimeError(
            '--play requires pydub to be installed, which can be done with "pip install pydub"'
        )
    return pydub
def get_seed(seed: Optional[int]):
    """Return `seed` unchanged, or a fresh random 32-bit seed when None."""
    if seed is not None:
        return seed
    return randint(0, 2**32 - 1)
| from pathlib import Path | |
| from typing import Any, Callable | |
| import torch | |
| import torchaudio | |
def run_and_save_tts(
    call_tts,
    text,
    output_dir: Path,
    return_deterministic_state,
    return_filepaths=False,
    voicefixer=True,
):
    """Run TTS on `text` and save each candidate wav under `output_dir`.

    When `return_deterministic_state` is set, `call_tts` is expected to
    return (generation, debug_state) and the debug state is written to
    dbg.pt. Candidate i is saved as "<i>.wav". Returns the list of saved
    file paths when `return_filepaths`, otherwise the list of CPU tensors.
    """
    output_dir.mkdir(exist_ok=True)
    if return_deterministic_state:
        gen, dbg = call_tts(text)
        torch.save(dbg, output_dir / "dbg.pt")
    else:
        gen = call_tts(text)

    candidates = gen if isinstance(gen, list) else [gen]
    candidates = [c.squeeze(0).cpu() for c in candidates]
    paths = []
    for idx, waveform in enumerate(candidates):
        target = output_dir / f"{idx}.wav"
        paths.append(target)
        save_gen_with_voicefix(waveform, target, squeeze=False, voicefixer=voicefixer)
    return paths if return_filepaths else candidates
def _fragment_sort_key(path: Path):
    """Sort wav fragments numerically when the stem is an int, else lexically."""
    try:
        return (0, int(path.stem), "")
    except ValueError:
        return (1, 0, path.stem)


def infer_on_texts(
    call_tts: Callable[[str], Any],
    texts: List[str],
    output_dir: Union[str, Path],
    return_deterministic_state: bool,
    lines_to_regen: Set[int],
    logger=print,
    return_filepaths=False,
    voicefixer=True,
):
    """Run TTS per text chunk, caching per-chunk wavs, then concatenate.

    Each chunk's candidates are stored under "<output_dir>/<chunk_idx>/".
    Chunks not listed in `lines_to_regen` are reloaded from disk when
    present. Candidate i of every chunk is concatenated along the time
    axis into "combined-<i>.wav". Returns the combined file paths when
    `return_filepaths`, otherwise the combined tensors.
    """
    audio_chunks = []
    base_p = Path(output_dir)
    base_p.mkdir(exist_ok=True)
    for text_idx, text in enumerate(texts):
        line_p = base_p / f"{text_idx}"
        line_p.mkdir(exist_ok=True)

        if text_idx not in lines_to_regen:
            # BUG FIX: Path.glob yields files in arbitrary filesystem order;
            # fragments are saved as "<candidate>.wav", so sort numerically by
            # stem to keep candidate i aligned across chunks when concatenating.
            files = sorted(line_p.glob("*.wav"), key=_fragment_sort_key)
            if files:
                logger(f"loading existing audio fragments for [{text_idx}]")
                audio_chunks.append([load_audio(str(f), 24000) for f in files])
                continue
            else:
                logger(f"no existing audio fragment for [{text_idx}]")

        logger(f"generating audio for text {text_idx}: {text}")
        audio_chunks.append(
            run_and_save_tts(
                call_tts,
                text,
                line_p,
                return_deterministic_state,
                voicefixer=voicefixer,
            )
        )

    fnames = []
    results = []
    for i in range(len(audio_chunks[0])):
        resultant = torch.cat([c[i] for c in audio_chunks], dim=-1)
        fnames.append(base_p / f"combined-{i}.wav")
        save_gen_with_voicefix(
            resultant, fnames[-1], squeeze=False, voicefixer=False
        )  # do not run fix on combined!!
        results.append(resultant)
    return fnames if return_filepaths else results
# Module-level side effect: VoiceFixer is instantiated eagerly at import
# time. NOTE(review): this presumably loads/downloads restoration models up
# front — confirm the startup cost before importing this module casually.
from voicefixer import VoiceFixer

vfixer = VoiceFixer()
def save_gen_with_voicefix(g, fpath, squeeze=True, voicefixer=True):
    """Write waveform `g` to `fpath` as a 24 kHz wav, optionally restoring it.

    When `voicefixer` is set, the file is restored in place via the
    module-level VoiceFixer instance. NOTE(review): restore is invoked with
    cuda=True — assumes a CUDA device is available; confirm for CPU-only
    deployments.
    """
    waveform = g.squeeze(0).cpu() if squeeze else g
    torchaudio.save(fpath, waveform, 24000, format="wav")
    if not voicefixer:
        return
    vfixer.restore(
        input=fpath,
        output=fpath,
        cuda=True,
        mode=0,
        # your_vocoder_func = convert_mel_to_wav # TODO test if integration with unvinet improves things
    )