Flux9665 committed on
Commit
185fc75
•
1 Parent(s): 52d3547

implement zero GPU compatible solution (hopefully)

Browse files
Files changed (2) hide show
  1. app.py +78 -3
  2. requirements.txt +0 -1
app.py CHANGED
@@ -1,11 +1,86 @@
1
  import os
2
- import torch
3
 
4
- os.system("git clone --branch v3.1 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
 
5
  os.system("mv toucan_codebase/* .")
6
 
7
  from run_model_downloader import download_models
8
- from run_GUI_demo import TTSWebUI
9
 
10
  download_models()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  TTSWebUI(gpu_id="cuda" if torch.cuda.is_available() else "cpu")
 
1
  import os
2
+ import spaces
3
 
4
+
5
+ os.system("git clone --branch v2.5 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
6
  os.system("mv toucan_codebase/* .")
7
 
8
  from run_model_downloader import download_models
 
9
 
10
  download_models()
11
+
12
+ import gradio as gr
13
+ import torch.cuda
14
+ from InferenceInterfaces.ControllableInterface import ControllableInterface
15
+ from Preprocessing.multilinguality.SimilaritySolver import load_json_from_path
16
+ from Utility.utils import float2pcm
17
+
18
+
19
class TTSWebUI:
    """Gradio web front end that wraps a ``ControllableInterface`` for speech synthesis.

    Constructing an instance builds the UI and launches it immediately.
    """

    def __init__(self, gpu_id="cpu", title="Controllable Text-to-Speech for over 7000 Languages", article="", available_artificial_voices=1000, path_to_iso_list="Preprocessing/multilinguality/iso_to_fullname.json"):
        """Build the synthesis backend and the Gradio interface, then launch the interface.

        Args:
            gpu_id: Device identifier forwarded to ``ControllableInterface``.
            title: Title shown on the Gradio page.
            article: Article text forwarded to ``gr.Interface``.
            available_artificial_voices: Forwarded to ``ControllableInterface`` and
                used as the upper bound of the voice-seed slider.
            path_to_iso_list: Path of the JSON file that maps ISO codes to language names.
        """
        iso_to_name = load_json_from_path(path_to_iso_list)
        # Each dropdown entry ends in "(<iso code>)"; read() relies on that suffix.
        language_choices = [f"{full_name} Text ({iso_code})" for iso_code, full_name in iso_to_name.items()]

        self.controllable_ui = ControllableInterface(
            gpu_id=gpu_id,
            available_artificial_voices=available_artificial_voices,
        )

        # Components are created in the same order as the parameters of read().
        prompt_box = gr.Textbox(
            lines=2,
            placeholder="write what you want the synthesis to read here...",
            value="The woods are lovely, dark and deep, but I have promises to keep, and miles to go, before I sleep.",
            label="Text input",
        )
        language_dropdown = gr.Dropdown(
            language_choices,
            type="value",
            value='English Text (eng)',
            label="Select the Language of the Text (type on your keyboard to find it quickly)",
        )
        seed_slider = gr.Slider(
            minimum=0,
            maximum=available_artificial_voices,
            step=1,
            value=279,
            label="Random Seed for the artificial Voice",
        )

        # (minimum, maximum, initial value, label) for the sliders that share step=0.1.
        float_slider_specs = (
            (0.0, 0.8, 0.7, "Prosody Creativity"),
            (0.7, 1.3, 1.0, "Duration Scale"),
            (0.5, 1.5, 1.0, "Pitch Variance Scale"),
            (0.5, 1.5, 1.0, "Energy Variance Scale"),
            (-10.0, 10.0, 0.0, "Femininity / Masculinity"),
            (-10.0, 10.0, 0.0, "Voice Depth"),
        )
        float_sliders = [
            gr.Slider(minimum=low, maximum=high, step=0.1, value=start, label=caption)
            for low, high, start, caption in float_slider_specs
        ]

        input_components = [prompt_box, language_dropdown, seed_slider, *float_sliders]
        output_components = [
            gr.Audio(type="numpy", label="Speech"),
            gr.Image(label="Visualization"),
        ]

        self.iface = gr.Interface(
            fn=self.read,
            inputs=input_components,
            outputs=output_components,
            title=title,
            theme="default",
            allow_flagging="never",
            article=article,
        )
        self.iface.launch()

    @spaces.GPU  # NOTE(review): presumably requests a ZeroGPU device for each call - confirm against the `spaces` docs
    def read(self,
             prompt,
             language,
             voice_seed,
             prosody_creativity,
             duration_scaling_factor,
             pitch_variance_scale,
             energy_variance_scale,
             emb1,
             emb2
             ):
        """Synthesize ``prompt`` and return the audio together with a figure.

        Args:
            prompt: Text to be read.
            language: Dropdown entry of the form ``"<name> Text (<iso code>)"``.
            voice_seed: Value of the voice-seed slider.
            prosody_creativity: Value of the "Prosody Creativity" slider.
            duration_scaling_factor: Value of the "Duration Scale" slider.
            pitch_variance_scale: Value of the "Pitch Variance Scale" slider.
            energy_variance_scale: Value of the "Energy Variance Scale" slider.
            emb1: Value of the "Femininity / Masculinity" slider.
            emb2: Value of the "Voice Depth" slider.

        Returns:
            ``((sample_rate, audio), figure)`` where ``audio`` is the waveform
            after ``float2pcm``, matching the Audio and Image outputs.
        """
        # Take the code between the parentheses of the last space-separated token.
        iso_code = language.split(" ")[-1].split("(")[1].split(")")[0]

        backend_args = (
            prompt,
            iso_code,  # passed twice: the backend takes two language arguments
            iso_code,
            voice_seed,
            prosody_creativity,
            duration_scaling_factor,
            1.,  # fixed value; meaning is defined by ControllableInterface.read
            pitch_variance_scale,
            energy_variance_scale,
            emb1,
            emb2,
            0.,  # the remaining positional values are fixed as well and not
            0.,  # exposed in this UI; see ControllableInterface.read
            0.,
            0.,
            -24.,
        )
        sample_rate, waveform, figure = self.controllable_ui.read(*backend_args)
        return (sample_rate, float2pcm(waveform)), figure
84
+
85
+
86
  TTSWebUI(gpu_id="cuda" if torch.cuda.is_available() else "cpu")
requirements.txt CHANGED
@@ -2,7 +2,6 @@ torch_complex~=0.4.3
2
  tqdm~=4.64.1
3
  scipy~=1.9.3
4
  librosa~=0.9.2
5
- scikit-learn~=1.1.3
6
  praat-parselmouth~=0.4.2
7
  torch~=2.3.0
8
  numpy~=1.23.4
 
2
  tqdm~=4.64.1
3
  scipy~=1.9.3
4
  librosa~=0.9.2
 
5
  praat-parselmouth~=0.4.2
6
  torch~=2.3.0
7
  numpy~=1.23.4