File size: 4,290 Bytes
e3092d1 75700af 1b84f28 4f18e92 e3092d1 1b84f28 4d2be3f 75700af 0db7ad9 1b84f28 4d2be3f 1b84f28 e3092d1 37e5267 1b84f28 37e5267 1b84f28 61bebd5 37e5267 1b84f28 61bebd5 1b84f28 0db7ad9 4f18e92 75700af 5f5648d 4f18e92 75700af 4f18e92 75700af 4f18e92 e3092d1 4f18e92 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
from typing import Dict, Any,Union
import tempfile
import numpy as np
import torch
import pyewts
import noisereduce as nr
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from num2tib.core import convert
from num2tib.core import convert2text
import soundfile as sf
import base64
import re
import requests
import os
from pydub import AudioSegment
def increase_volume_without_distortion(audio_data, sample_rate, target_dBFS):
    """Normalize mono audio to a target loudness and return it as int16 samples.

    Args:
        audio_data: Array-like of mono samples. Integer arrays are handed to
            pydub as raw PCM; floating-point arrays are assumed to be in
            [-1.0, 1.0] and are converted to 16-bit PCM first.
        sample_rate: Sampling rate of ``audio_data`` in Hz.
        target_dBFS: Desired loudness of the result, in dBFS.

    Returns:
        numpy.ndarray of dtype int16 holding the gain-adjusted samples.
    """
    samples = np.asarray(audio_data)
    if np.issubdtype(samples.dtype, np.floating):
        # pydub interprets the buffer as integer PCM, so float audio must be
        # scaled to 16-bit before its bytes are passed in.
        samples = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

    # Create an AudioSegment from raw audio data
    audio_segment = AudioSegment(
        samples.tobytes(),
        frame_rate=sample_rate,
        sample_width=samples.dtype.itemsize,
        channels=1,  # or 2 for stereo
    )

    current_dBFS = audio_segment.dBFS
    if current_dBFS == float("-inf"):
        # Empty or completely silent audio has no finite gain that reaches the
        # target level; return it unchanged rather than applying infinite gain.
        return samples.astype(np.int16)

    # Normalize the audio level
    normalized_audio = audio_segment.apply_gain(target_dBFS - current_dBFS)

    # Convert the AudioSegment back to a numpy array
    return np.array(normalized_audio.get_array_of_samples()).astype(np.int16)
# Shared pyewts converter instance; EndpointHandler.__call__ calls its
# toWylie() method on the incoming request text.
converter = pyewts.pyewts()
def download_file(url, destination):
    """Download ``url`` and write the response body to ``destination``.

    Args:
        url: Address of the file to fetch.
        destination: Local path the downloaded bytes are written to
            (opened in binary write mode, so an existing file is replaced).

    Raises:
        requests.RequestException: If the request fails or the server
            answers with an HTTP error status.
    """
    response = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of saving an error page to disk.
    response.raise_for_status()
    with open(destination, 'wb') as file:
        file.write(response.content)
# Example usage:
# Runs at import time and targets 'female_2.npy', the file name listed in
# speaker_embeddings below.
# NOTE(review): the URL argument is an empty string here, so this request
# presumably cannot succeed as written - restore the real source URL.
download_file('', 'female_2.npy')
def replace_numbers_with_convert(sentence, wylie=True):
    """Replace every number in ``sentence`` with the output of ``convert``.

    A number is a run of digits optionally followed by a decimal part
    (``12`` or ``3.14``). Each match is passed to ``num2tib``'s ``convert``
    together with the ``wylie`` flag, and the returned string is substituted
    in place.

    Args:
        sentence: Text that may contain numbers.
        wylie: Forwarded unchanged as the second argument of ``convert``
            (presumably selects Wylie output - confirm against num2tib).

    Returns:
        ``sentence`` with each numeric match replaced.
    """
    pattern = r'\d+(\.\d+)?'

    def replace(match):
        # match.group() is the full matched number, including any decimals.
        return convert(match.group(), wylie)

    return re.sub(pattern, replace, sentence)
def cleanup_text(inputs):
    """Apply every (old, new) pair from ``replacements`` to ``inputs``.

    The pairs are applied one after another, in list order, using
    ``str.replace``; the fully substituted string is returned.
    """
    cleaned = inputs
    for pair in replacements:
        old, new = pair
        cleaned = cleaned.replace(old, new)
    return cleaned
# Maps a speaker label to the .npy file holding that speaker's embedding.
# EndpointHandler.__call__ loads the 'Lhasa(female)' entry with np.load.
speaker_embeddings = {
    "Lhasa(female)": "female_2.npy",
}
# (old, new) character substitutions applied in order by cleanup_text().
# NOTE(review): the first pair maps '_' to itself, which is a no-op as
# written - confirm whether a different target character was intended.
replacements = [
    ('_', '_'),
    ('*', 'v'),
    ('`', ';'),
    ('~', ','),
    ('+', ','),
    ('\\', ';'),
    ('|', ';'),
]
class EndpointHandler():
    """Callable handler that turns request text into base64-encoded WAV audio.

    The text is transliterated with the module-level pyewts ``converter``,
    synthesized with a SpeechT5 model and HiFi-GAN vocoder, denoised,
    loudness-normalized and returned as a 16 kHz WAV file encoded in base64.
    """

    # Sample rate (Hz) used for denoising, normalization and the WAV output.
    SAMPLE_RATE = 16000

    def __init__(self, path=""):
        """Load the processor, acoustic model and vocoder.

        Args:
            path: Accepted for interface compatibility; not used, because the
                checkpoints are loaded by their hub identifiers.
        """
        # Use the GPU when one is present, otherwise fall back to the CPU so
        # the handler can still be constructed on machines without CUDA.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # load the model
        self.processor = SpeechT5Processor.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b")
        self.model = SpeechT5ForTextToSpeech.from_pretrained("TenzinGayche/TTS_run3_ep20_174k_b").to(self.device)
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(self.device)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Union[int, str]]:
        """Synthesize speech for the text found under ``data["inputs"]``.

        Args:
            data: Request payload. The ``"inputs"`` key is popped from it and
                must hold the text to speak.

        Returns:
            A dict with ``"sample_rate"`` (int, Hz) and ``"audio_base64"``
            (str, a base64-encoded WAV file). For empty or whitespace-only
            text ``"audio_base64"`` is an empty string.

        Raises:
            ValueError: If the value under ``"inputs"`` is not a string.
        """
        text = data.pop("inputs", data)
        if not isinstance(text, str):
            raise ValueError('"inputs" must be a string containing the text to synthesize')

        # process input
        if not text.strip():
            # Nothing to speak: keep the declared dict shape instead of
            # returning a (rate, array) tuple.
            return {"sample_rate": self.SAMPLE_RATE, "audio_base64": ""}

        text = converter.toWylie(text)
        # NOTE(review): cleanup_text() and replace_numbers_with_convert() are
        # defined in this module but are not applied here - confirm whether
        # the text should pass through them before tokenization.
        inputs = self.processor(text=text, return_tensors="pt")
        input_ids = inputs["input_ids"]
        # Truncate to the longest sequence the model is configured for.
        input_ids = input_ids[..., :self.model.config.max_text_positions]

        # NOTE(review): assumes the stored embedding already has the shape
        # generate_speech expects - confirm against the .npy file.
        speaker_embedding = np.load(speaker_embeddings['Lhasa(female)'])
        speaker_embedding = torch.tensor(speaker_embedding)

        speech = self.model.generate_speech(
            input_ids.to(self.device),
            speaker_embedding.to(self.device),
            vocoder=self.vocoder,
        )
        # Denoising runs on the CPU on a plain numpy waveform.
        speech = nr.reduce_noise(y=speech.detach().cpu().numpy(), sr=self.SAMPLE_RATE)
        if isinstance(speech, torch.Tensor):
            speech = speech.numpy()

        # Increase volume without distortion
        target_dBFS = -20.0  # Adjust the value according to your requirement
        speech = increase_volume_without_distortion(speech, self.SAMPLE_RATE, target_dBFS)

        # Create a unique temporary WAV file; only its path is needed, so the
        # handle is closed right away and soundfile reopens it by name.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_wav_file:
            temp_wav_path = temp_wav_file.name

        try:
            sf.write(temp_wav_path, speech, self.SAMPLE_RATE, 'PCM_24')  # Use sf.write to write the WAV file

            # Read the WAV file and encode it as base64
            with open(temp_wav_path, "rb") as wav_file:
                audio_base64 = base64.b64encode(wav_file.read()).decode("utf-8")
        finally:
            # Clean up the temporary WAV file, even if writing/reading failed
            os.remove(temp_wav_path)

        return {
            "sample_rate": self.SAMPLE_RATE,
            "audio_base64": audio_base64,  # Base64-encoded audio data
        }