piper-tts / piper /voice.py
DLI-SLQ's picture
Upload 9 files
35f6708
raw
history blame
5.65 kB
import json
import logging
import wave
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Optional, Union
import numpy as np
import onnxruntime
from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run
from .config import PhonemeType, PiperConfig
from .const import BOS, EOS, PAD
from .util import audio_float_to_int16
_LOGGER = logging.getLogger(__name__)
@dataclass
class PiperVoice:
session: onnxruntime.InferenceSession
config: PiperConfig
@staticmethod
def load(
model_path: Union[str, Path],
config_path: Optional[Union[str, Path]] = None,
use_cuda: bool = False,
) -> "PiperVoice":
"""Load an ONNX model and config."""
if config_path is None:
config_path = f"{model_path}.json"
with open(config_path, "r", encoding="utf-8") as config_file:
config_dict = json.load(config_file)
return PiperVoice(
config=PiperConfig.from_dict(config_dict),
session=onnxruntime.InferenceSession(
str(model_path),
sess_options=onnxruntime.SessionOptions(),
providers=["CPUExecutionProvider"]
if not use_cuda
else ["CUDAExecutionProvider"],
),
)
def phonemize(self, text: str) -> List[List[str]]:
"""Text to phonemes grouped by sentence."""
if self.config.phoneme_type == PhonemeType.ESPEAK:
if self.config.espeak_voice == "ar":
# Arabic diacritization
# https://github.com/mush42/libtashkeel/
text = tashkeel_run(text)
return phonemize_espeak(text, self.config.espeak_voice)
if self.config.phoneme_type == PhonemeType.TEXT:
return phonemize_codepoints(text)
raise ValueError(f"Unexpected phoneme type: {self.config.phoneme_type}")
def phonemes_to_ids(self, phonemes: List[str]) -> List[int]:
"""Phonemes to ids."""
id_map = self.config.phoneme_id_map
ids: List[int] = list(id_map[BOS])
for phoneme in phonemes:
if phoneme not in id_map:
_LOGGER.warning("Missing phoneme from id map: %s", phoneme)
continue
ids.extend(id_map[phoneme])
ids.extend(id_map[PAD])
ids.extend(id_map[EOS])
return ids
def synthesize(
self,
text: str,
wav_file: wave.Wave_write,
speaker_id: Optional[int] = None,
length_scale: Optional[float] = None,
noise_scale: Optional[float] = None,
noise_w: Optional[float] = None,
sentence_silence: float = 0.0,
):
"""Synthesize WAV audio from text."""
wav_file.setframerate(self.config.sample_rate)
wav_file.setsampwidth(2) # 16-bit
wav_file.setnchannels(1) # mono
for audio_bytes in self.synthesize_stream_raw(
text,
speaker_id=speaker_id,
length_scale=length_scale,
noise_scale=noise_scale,
noise_w=noise_w,
sentence_silence=sentence_silence,
):
wav_file.writeframes(audio_bytes)
def synthesize_stream_raw(
self,
text: str,
speaker_id: Optional[int] = None,
length_scale: Optional[float] = None,
noise_scale: Optional[float] = None,
noise_w: Optional[float] = None,
sentence_silence: float = 0.0,
) -> Iterable[bytes]:
"""Synthesize raw audio per sentence from text."""
sentence_phonemes = self.phonemize(text)
# 16-bit mono
num_silence_samples = int(sentence_silence * self.config.sample_rate)
silence_bytes = bytes(num_silence_samples * 2)
for phonemes in sentence_phonemes:
phoneme_ids = self.phonemes_to_ids(phonemes)
yield self.synthesize_ids_to_raw(
phoneme_ids,
speaker_id=speaker_id,
length_scale=length_scale,
noise_scale=noise_scale,
noise_w=noise_w,
) + silence_bytes
def synthesize_ids_to_raw(
self,
phoneme_ids: List[int],
speaker_id: Optional[int] = None,
length_scale: Optional[float] = None,
noise_scale: Optional[float] = None,
noise_w: Optional[float] = None,
) -> bytes:
"""Synthesize raw audio from phoneme ids."""
if length_scale is None:
length_scale = self.config.length_scale
if noise_scale is None:
noise_scale = self.config.noise_scale
if noise_w is None:
noise_w = self.config.noise_w
phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
scales = np.array(
[noise_scale, length_scale, noise_w],
dtype=np.float32,
)
if (self.config.num_speakers > 1) and (speaker_id is None):
# Default speaker
speaker_id = 0
sid = None
if speaker_id is not None:
sid = np.array([speaker_id], dtype=np.int64)
# Synthesize through Onnx
audio = self.session.run(
None,
{
"input": phoneme_ids_array,
"input_lengths": phoneme_ids_lengths,
"scales": scales,
"sid": sid,
},
)[0].squeeze((0, 1))
audio = audio_float_to_int16(audio.squeeze())
return audio.tobytes()