# Spaces: Running
# (Status banner captured from the hosting page together with this file; it is not program code.)
# --- Bootstrap: standard-library imports, import path, logger levels ---
import json
import logging
import os
import pdb
import re
import sys

# Append the current working directory and its "GPT_SoVITS" sub-directory to
# the module search path (presumably so the project-local imports further
# down the file resolve -- confirm against the repository layout).
now_dir = os.getcwd()
sys.path.append(now_dir)
sys.path.append(os.path.join(now_dir, "GPT_SoVITS"))

# Raise these third-party loggers to ERROR so lower-severity records are dropped.
for _logger_name in (
    "markdown_it",
    "urllib3",
    "httpcore",
    "httpx",
    "asyncio",
    "charset_normalizer",
    "torchaudio._extension",
):
    logging.getLogger(_logger_name).setLevel(logging.ERROR)

# Imported only after the logger levels above are set, matching the original
# statement order.
import torch
# --- Device, precision and locale selection ---

# When _CUDA_VISIBLE_DEVICES is set, copy it over CUDA_VISIBLE_DEVICES.
if "_CUDA_VISIBLE_DEVICES" in os.environ:
    os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]

# NOTE(review): eval() executes whatever expression the `is_half` environment
# variable contains. That is only acceptable while the process environment is
# trusted. It is kept unchanged so existing values keep working; consider
# ast.literal_eval or an explicit "true"/"false" parse instead.
is_half = eval(os.environ.get("is_half", "True"))

from TTS_infer_pack.TTS import TTS, TTS_Config

# Make sure this is set even when the inference UI is launched directly.
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

# Default device: CUDA when available, otherwise CPU with half precision off.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
    is_half = False

# Optional overrides are read from "config.json" in the parent of this file's
# directory. (The original comment here read "get the model folder path".)
config_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "config.json")

# Defined before the config branch so that the later
# I18nAuto(locale_language, ...) call does not raise NameError when
# config.json is absent. None is also what a locale of "auto" resolves to.
locale_language = None

if os.path.exists(config_path):
    with open(config_path, 'r', encoding='utf-8') as f:
        _config = json.load(f)

    # "auto" (or a missing key) keeps the value detected above.
    if _config.get("device", "auto") != "auto":
        device = _config["device"]
        if device == "cpu":
            is_half = False
    # An explicit half_precision setting is applied last, so it overrides the
    # CPU default set just above.
    if _config.get("half_precision", "auto") != "auto":
        is_half = _config["half_precision"].lower() == "true"
    locale_language = str(_config.get("locale", "auto"))
    locale_language = None if locale_language.lower() == "auto" else locale_language

print(f"device: {device}, is_half: {is_half}")
from tools.i18n.i18n import I18nAuto

# I18nAuto receives the configured locale and the locale directory, which is
# "i18n/locale" under the parent of this file's directory.
i18n = I18nAuto(
    locale_language,
    os.path.join(os.path.dirname(os.path.dirname(__file__)), "i18n/locale"),
)
# Maps a language label (Chinese UI label or language code) to the language
# code handed to the TTS pipeline. The codes also map to themselves, so
# callers may pass either form.
dict_language = {
    "中文": "all_zh",  # treat the whole text as Chinese
    "英文": "en",  # treat the whole text as English (unchanged)
    "日文": "all_ja",  # treat the whole text as Japanese
    "中英混合": "zh",  # mixed Chinese/English recognition (unchanged)
    "日英混合": "ja",  # mixed Japanese/English recognition (unchanged)
    "多语种混合": "auto",  # multilingual: split the text and detect each language
    "auto": "auto",
    "zh": "zh",
    "en": "en",
    "ja": "ja",
    "all_zh": "all_zh",
    "all_ja": "all_ja",
}
# Create the TTS configuration from an empty path argument, then apply the
# device and precision selected earlier before building the pipeline.
tts_config = TTS_Config("")
tts_config.device = device
tts_config.is_half = is_half
tts_pipline = TTS(tts_config)  # (sic) spelling kept: the name is used elsewhere in the file

# Weight paths as reported by the configuration object.
gpt_path = tts_config.t2s_weights_path
sovits_path = tts_config.vits_weights_path
def _resolve_language(label):
    """Map a language label or code to a pipeline code, defaulting to "auto"."""
    if not isinstance(label, str):
        return "auto"
    return dict_language.get(label.lower(), "auto")


def inference(text, text_lang,
              ref_audio_path, prompt_text,
              prompt_lang, top_k,
              top_p, temperature,
              text_split_method, batch_size,
              speed_factor, ref_text_free,
              split_bucket,
              return_fragment,
              seed
              ):
    """Run the module-level TTS pipeline and return whatever it returns.

    ``text_lang`` and ``prompt_lang`` are lower-cased and looked up in
    ``dict_language``. Each one falls back to ``"auto"`` on its own when it
    is unknown or not a string, so one bad value no longer resets the other.
    The previous code used a bare ``except`` that reset both languages to
    ``"auto"`` on any error.

    ``prompt_text`` is replaced by an empty string when ``ref_text_free`` is
    truthy. ``batch_size`` is converted with ``int`` and ``speed_factor``
    with ``float``; all remaining arguments are passed through unchanged.

    Returns:
        The result of ``tts_pipline.run(inputs)``.
    """
    inputs = {
        "text": text,
        "text_lang": _resolve_language(text_lang),
        "ref_audio_path": ref_audio_path,
        "prompt_text": "" if ref_text_free else prompt_text,
        "prompt_lang": _resolve_language(prompt_lang),
        "top_k": top_k,
        "top_p": top_p,
        "temperature": temperature,
        "text_split_method": text_split_method,
        "batch_size": int(batch_size),
        "speed_factor": float(speed_factor),
        "split_bucket": split_bucket,
        "return_fragment": return_fragment,
        "seed": seed,
    }
    return tts_pipline.run(inputs)
# from https://github.com/RVC-Boss/GPT-SoVITS/pull/448
import io
import tempfile
import wave

from pydub import AudioSegment

# from https://huggingface.co/spaces/coqui/voice-chat-with-mistral/blob/main/app.py
def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=32000):
    """Return a WAV header followed by ``frame_input``.

    This should be the first chunk of a streamed WAV file. Later chunks
    should not carry a header, otherwise artifacts are heard at the start of
    each chunk.

    Args:
        frame_input: Raw audio frames appended after the header.
        channels: Number of audio channels written to the header.
        sample_width: Sample width in bytes.
        sample_rate: Frame rate in Hz.

    Returns:
        bytes: The WAV header followed by ``frame_input``.
    """
    wav_buf = io.BytesIO()
    with wave.open(wav_buf, "wb") as vfout:
        vfout.setnchannels(channels)
        vfout.setsampwidth(sample_width)
        vfout.setframerate(sample_rate)
        vfout.writeframes(frame_input)
    # The buffer was opened by this function, not by the wave module, so it
    # is still open here and its full contents can be read directly.
    return wav_buf.getvalue()
def get_streaming_tts_wav(params):
    """Stream synthesized audio as a WAV header followed by raw chunks.

    ``params`` is expanded as keyword arguments to ``inference``. The first
    item yielded is the header from ``wave_header_chunk()`` with its default
    arguments. After that, every non-``None`` chunk produced by the pipeline
    is yielded as raw bytes; ``None`` chunks are reported with ``print`` and
    skipped.

    The original also held a constant ``byte_stream = True`` switch whose
    other branch did nothing, plus a commented-out sketch that exported each
    chunk to a temporary WAV file with pydub. Both were unreachable and have
    been removed.

    NOTE(review): the header always uses the default sample rate, and the
    per-chunk sample rate yielded by the pipeline is ignored. Confirm that
    the two match.

    Yields:
        bytes: The WAV header, then the bytes of each audio chunk.
    """
    chunks = inference(**params)

    yield wave_header_chunk()
    for _sr, chunk in chunks:
        if chunk is None:
            print("None chunk")
            continue
        yield chunk.tobytes()