Spaces:

innnky
/

vits-nyaru

Running

App Files Files Community

rcell committed on Aug 18, 2022

Commit

d950ac3

1 Parent(s): ac38b6b

complete app.py

Browse files

Files changed (1) hide show

app.py +75 -4

app.py CHANGED Viewed

@@ -1,7 +1,78 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()

 import gradio as gr
+import os
+os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')  # build the monotonic_align extension in place via the shell before the imports below run; the exit status is ignored, so a failed build is not detected here
+import json
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.utils.data import DataLoader
+import commons
+import utils
+from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
+from models import SynthesizerTrn
+from text.symbols import symbols
+from text import text_to_sequence
+from scipy.io.wavfile import write
+def get_text(text, hps):
+    text_norm = text_to_sequence(text, hps.data.text_cleaners)
+    if hps.data.add_blank:
+        text_norm = commons.intersperse(text_norm, 0)
+    text_norm = torch.LongTensor(text_norm)
+    # print(text_norm.shape)
+    return text_norm
+hps_ms = utils.get_hparams_from_file("/configs/japanese_base.json")  # hyperparameters used to build the synthesizer; NOTE(review): the path is anchored at the filesystem root -- confirm "/configs" really lives there
+hps = utils.get_hparams_from_file("/configs/japanese_base.json")  # second load of the same file; hps supplies hop_length below and the sampling rate returned by tts
+net_g_ms = SynthesizerTrn(  # construct the synthesizer network from the loaded hyperparameters
+    len(symbols),  # number of entries in the symbol table
+    hps_ms.data.filter_length // 2 + 1,  # presumably the number of spectrogram frequency bins -- TODO confirm
+    hps_ms.train.segment_size // hps.data.hop_length,  # training segment size divided by the hop length
+    n_speakers=hps_ms.data.n_speakers,
+    **hps_ms.model)  # remaining model settings are forwarded as keyword arguments
+def jtts(spkid, text):
+    sid = torch.LongTensor([spkid])  # speaker identity
+    stn_tst = get_text(text, hps_ms)
+    with torch.no_grad():
+        x_tst = stn_tst.unsqueeze(0)
+        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
+        # print(stn_tst.size())
+        audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][
+            0, 0].data.float().numpy()
+    return
+_ = utils.load_checkpoint("/output.pth", net_g_ms, None)  # load the checkpoint into net_g_ms and discard the return value; the None third argument is presumably "no optimizer" -- TODO confirm; NOTE(review): path is anchored at the filesystem root
+def tts(text):
+    sid = torch.LongTensor([2])  # speaker identity
+    stn_tst = get_text(text, hps_ms)
+    with torch.no_grad():
+        x_tst = stn_tst.unsqueeze(0)
+        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
+        # print(stn_tst.size())
+        audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][
+            0, 0].data.float().numpy()
+    return "成功", (hps.data.sampling_rate, audio)
+app = gr.Blocks()  # top-level container for the demo UI
+with app:  # components created inside this block belong to app, in creation order
+    tts_input1 = gr.TextArea(label="请输入日语文本", value="こんにちは。")  # text to synthesize; the label is Chinese for "Please enter Japanese text"
+    # tts_input2 = gr.Dropdown(label="Speaker", choices=hps.speakers, type="index", value=hps.speakers[0])
+    tts_submit = gr.Button("Generate", variant="primary")  # button that triggers synthesis
+    tts_output1 = gr.Textbox(label="Output Message")  # shows the status string returned by tts
+    tts_output2 = gr.Audio(label="Output Audio")  # plays the (sampling_rate, audio) pair returned by tts
+    tts_submit.click(tts, [tts_input1], [tts_output1, tts_output2])  # on click: call tts with the text and route its two return values to the outputs
+    app.launch()  # start serving; note this call sits inside the with-block