import base64
import io
import os
import re
import tempfile
from typing import Any, Dict, Union

import noisereduce as nr
import numpy as np
import pyewts
import requests
import soundfile as sf
import torch
from num2tib.core import convert
from num2tib.core import convert2text
from pydub import AudioSegment
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
def increase_volume_without_distortion(audio_data, sample_rate, target_dBFS):
    """Normalize mono audio to ``target_dBFS`` and return it as int16 samples.

    Args:
        audio_data: One-dimensional NumPy array of audio samples. Integer
            arrays are treated as raw PCM of that width. Floating-point
            arrays are converted to 16-bit PCM first (assumes the floats are
            normalized to [-1.0, 1.0]; values outside are clipped).
        sample_rate: Sampling rate of ``audio_data`` in Hz.
        target_dBFS: Desired average loudness in dBFS.

    Returns:
        ``np.ndarray`` of dtype ``int16`` holding the gain-adjusted samples.
        Empty or all-zero input is returned unchanged (as int16).
    """
    samples = np.asarray(audio_data)

    if np.issubdtype(samples.dtype, np.floating):
        # pydub interprets the byte buffer as signed-integer PCM. Handing it
        # the raw bytes of a float array would reinterpret the float bit
        # patterns as integers, so convert to real 16-bit PCM first.
        samples = (np.clip(samples, -1.0, 1.0) * 32767.0).astype(np.int16)

    if not samples.any():
        # Empty or silent audio has a dBFS of -inf, which would turn into an
        # infinite gain below. There is nothing to amplify, so return as is.
        return samples.astype(np.int16)

    audio_segment = AudioSegment(
        samples.tobytes(),
        frame_rate=sample_rate,
        sample_width=samples.dtype.itemsize,
        channels=1,
    )
    # Gain (in dB) that moves the current loudness onto the target.
    change_in_dBFS = target_dBFS - audio_segment.dBFS
    normalized_audio = audio_segment.apply_gain(change_in_dBFS)
    return np.array(normalized_audio.get_array_of_samples()).astype(np.int16)
# Module-wide pyewts instance; its toWylie() method is applied to the request
# text before the text is cleaned up and synthesized.
converter = pyewts.pyewts()
def download_file(url, destination, timeout=60):
    """Download ``url`` and store the response body at ``destination``.

    Args:
        url: Address of the resource to fetch.
        destination: Path of the file to write; an existing file is
            overwritten.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            check runs before ``destination`` is opened, so an error page is
            never saved in place of the expected file.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    with open(destination, 'wb') as file:
        file.write(response.content)
# Runs at import time: fetch the speaker-embedding file into the working
# directory under the name that the speaker_embeddings table refers to.
download_file(
    url='https://huggingface.co/openpecha/speecht5-tts-01/resolve/main/female_2.npy',
    destination='female_2.npy',
)
def replace_numbers_with_convert(sentence, wylie=True):
    """Substitute every number in ``sentence`` with ``convert``'s output.

    A number is a run of digits, optionally followed by a dot and further
    digits. Each match is handed to ``num2tib.core.convert`` together with
    the ``wylie`` flag, and whatever that call returns replaces the match.

    Args:
        sentence: Text to scan for numbers.
        wylie: Forwarded unchanged as the second argument of ``convert``.

    Returns:
        ``sentence`` with each number replaced; text without digits is
        returned unchanged.
    """
    number_re = re.compile(r'\d+(\.\d+)?')
    return number_re.sub(lambda found: convert(found.group(0), wylie), sentence)
def cleanup_text(inputs):
    """Return ``inputs`` with every rule from ``replacements`` applied.

    The module-level ``replacements`` list holds ``(source, target)`` pairs.
    They are applied one after another, in list order, each replacing all
    occurrences of ``source`` in the text produced by the previous rule.
    """
    cleaned = inputs
    for pair in replacements:
        old, new = pair
        cleaned = cleaned.replace(old, new)
    return cleaned
# Maps a speaker display name to the .npy file that stores its embedding.
speaker_embeddings = {
    "Lhasa(female)": "female_2.npy",
}

# Ordered (source, target) substitutions applied by cleanup_text().
replacements = [
    ('_', '_'),  # no-op as written; kept so the rule list stays unchanged
    ('*', 'v'),
    ('`', ';'),
    ('~', ','),
    ('+', ','),
    ('\\', ';'),
    ('|', ';'),
    # Box-drawing characters are removed. These two entries previously held
    # the mojibake of the characters (their UTF-8 bytes decoded as cp1252),
    # which does not match the characters themselves. Escapes are used so the
    # literals survive any future re-encoding of this file.
    ('\u255a', ''),  # U+255A BOX DRAWINGS DOUBLE UP AND RIGHT
    ('\u2557', ''),  # U+2557 BOX DRAWINGS DOUBLE DOWN AND LEFT
]
class EndpointHandler():
    """Callable request handler that turns input text into base64 WAV audio.

    Construction loads the SpeechT5 processor, the text-to-speech model, the
    HiFi-GAN vocoder and the speaker embedding once, so each request only
    runs inference. Calling the instance with a request payload returns the
    synthesized speech.
    """

    SAMPLE_RATE = 16000  # Hz; used for noise reduction, normalization and the WAV file
    TARGET_DBFS = -20.0  # loudness the generated speech is normalized to
    SPEAKER = "Lhasa(female)"  # key into the module-level speaker_embeddings table

    def __init__(self, path=""):
        """Load the models and the speaker embedding.

        Args:
            path: Accepted for interface compatibility; not used, because the
                model identifiers are fixed below.
        """
        # Prefer the GPU but keep working on CPU-only hosts.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = SpeechT5Processor.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model = SpeechT5ForTextToSpeech.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model.to(self.device)
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        self.vocoder.to(self.device)
        # Loaded once here instead of being re-read from disk on every request.
        self.speaker_embedding = torch.tensor(
            np.load(speaker_embeddings[self.SPEAKER])
        ).to(self.device)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Union[int, str]]:
        """Synthesize speech for the text found under ``data["inputs"]``.

        Args:
            data: Request payload. The ``"inputs"`` entry is removed from the
                dict and used as the text to speak.

        Returns:
            A dict with ``"sample_rate"`` (int, Hz) and ``"audio_base64"``
            (str): a base64-encoded 24-bit PCM WAV file. For blank input
            ``"audio_base64"`` is the empty string.

        Raises:
            ValueError: If the payload has no string under ``"inputs"``.
        """
        text = data.pop("inputs", data)
        if not isinstance(text, str):
            raise ValueError('request payload must contain a string under the "inputs" key')
        if not text.strip():
            # Nothing to synthesize: answer with the regular response shape
            # and no audio, rather than a differently-typed value.
            return {"sample_rate": self.SAMPLE_RATE, "audio_base64": ""}

        # Text normalization: transliterate, apply the character
        # substitutions, then replace the numbers.
        text = converter.toWylie(text)
        text = cleanup_text(text)
        text = replace_numbers_with_convert(text)

        input_ids = self.processor(text=text, return_tensors="pt")["input_ids"]
        # Truncate to the number of text positions the model is configured for.
        input_ids = input_ids[..., :self.model.config.max_text_positions]

        speech = self.model.generate_speech(
            input_ids.to(self.device),
            self.speaker_embedding,
            vocoder=self.vocoder,
        )
        speech = nr.reduce_noise(y=speech.to('cpu'), sr=self.SAMPLE_RATE)
        if isinstance(speech, torch.Tensor):
            speech = speech.numpy()
        speech = increase_volume_without_distortion(speech, self.SAMPLE_RATE, self.TARGET_DBFS)

        # Encode the WAV in memory: no temporary file that could be left
        # behind when writing fails.
        wav_buffer = io.BytesIO()
        sf.write(wav_buffer, speech, self.SAMPLE_RATE, subtype='PCM_24', format='WAV')
        audio_base64 = base64.b64encode(wav_buffer.getvalue()).decode("utf-8")
        return {
            "sample_rate": self.SAMPLE_RATE,
            "audio_base64": audio_base64,
        }