skytnt committed on
Commit
8c486cf
·
1 Parent(s): c580b60

add device argument

Browse files
Files changed (2) hide show
  1. app.py +14 -12
  2. text/cleaners.py +25 -21
app.py CHANGED
@@ -62,9 +62,9 @@ def create_tts_fn(model, hps, speaker_ids):
62
  speaker_id = speaker_ids[speaker]
63
  stn_tst = get_text(text, hps, is_symbol)
64
  with no_grad():
65
- x_tst = stn_tst.unsqueeze(0)
66
- x_tst_lengths = LongTensor([stn_tst.size(0)])
67
- sid = LongTensor([speaker_id])
68
  audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
69
  length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
70
  del stn_tst, x_tst, x_tst_lengths, sid
@@ -94,10 +94,10 @@ def create_vc_fn(model, hps, speaker_ids):
94
  y = y.unsqueeze(0)
95
  spec = spectrogram_torch(y, hps.data.filter_length,
96
  hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
97
- center=False)
98
- spec_lengths = LongTensor([spec.size(-1)])
99
- sid_src = LongTensor([original_speaker_id])
100
- sid_tgt = LongTensor([target_speaker_id])
101
  audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
102
  0, 0].data.cpu().float().numpy()
103
  del y, spec, spec_lengths, sid_src, sid_tgt
@@ -125,10 +125,10 @@ def create_soft_vc_fn(model, hps, speaker_ids):
125
  if sampling_rate != 16000:
126
  audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
127
  with torch.inference_mode():
128
- units = hubert.units(torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0))
129
  with no_grad():
130
- unit_lengths = LongTensor([units.size(1)])
131
- sid = LongTensor([target_speaker_id])
132
  audio = model.infer(units, unit_lengths, sid=sid, noise_scale=.667,
133
  noise_scale_w=0.8)[0][0, 0].data.cpu().float().numpy()
134
  del units, unit_lengths, sid
@@ -147,9 +147,11 @@ def create_to_symbol_fn(hps):
147
 
148
  if __name__ == '__main__':
149
  parser = argparse.ArgumentParser()
 
150
  parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
151
  args = parser.parse_args()
152
 
 
153
  models_tts = []
154
  models_vc = []
155
  models_soft_vc = []
@@ -171,7 +173,7 @@ if __name__ == '__main__':
171
  n_speakers=hps.data.n_speakers,
172
  **hps.model)
173
  utils.load_checkpoint(model_path, model, None)
174
- model.eval()
175
  speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
176
  speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]
177
 
@@ -184,7 +186,7 @@ if __name__ == '__main__':
184
  elif t == "soft-vits-vc":
185
  models_soft_vc.append((name, cover_path, speakers, create_soft_vc_fn(model, hps, speaker_ids)))
186
 
187
- hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True)
188
 
189
  app = gr.Blocks()
190
 
 
62
  speaker_id = speaker_ids[speaker]
63
  stn_tst = get_text(text, hps, is_symbol)
64
  with no_grad():
65
+ x_tst = stn_tst.unsqueeze(0).to(device)
66
+ x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
67
+ sid = LongTensor([speaker_id]).to(device)
68
  audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
69
  length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
70
  del stn_tst, x_tst, x_tst_lengths, sid
 
94
  y = y.unsqueeze(0)
95
  spec = spectrogram_torch(y, hps.data.filter_length,
96
  hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
97
+ center=False).to(device)
98
+ spec_lengths = LongTensor([spec.size(-1)]).to(device)
99
+ sid_src = LongTensor([original_speaker_id]).to(device)
100
+ sid_tgt = LongTensor([target_speaker_id]).to(device)
101
  audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
102
  0, 0].data.cpu().float().numpy()
103
  del y, spec, spec_lengths, sid_src, sid_tgt
 
125
  if sampling_rate != 16000:
126
  audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
127
  with torch.inference_mode():
128
+ units = hubert.units(torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0).to(device))
129
  with no_grad():
130
+ unit_lengths = LongTensor([units.size(1)]).to(device)
131
+ sid = LongTensor([target_speaker_id]).to(device)
132
  audio = model.infer(units, unit_lengths, sid=sid, noise_scale=.667,
133
  noise_scale_w=0.8)[0][0, 0].data.cpu().float().numpy()
134
  del units, unit_lengths, sid
 
147
 
148
  if __name__ == '__main__':
149
  parser = argparse.ArgumentParser()
150
+ parser.add_argument('--device', type=str, default='cpu')
151
  parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
152
  args = parser.parse_args()
153
 
154
+ device = torch.device(args.device)
155
  models_tts = []
156
  models_vc = []
157
  models_soft_vc = []
 
173
  n_speakers=hps.data.n_speakers,
174
  **hps.model)
175
  utils.load_checkpoint(model_path, model, None)
176
+ model.eval().to(device)
177
  speaker_ids = [sid for sid, name in enumerate(hps.speakers) if name != "None"]
178
  speakers = [name for sid, name in enumerate(hps.speakers) if name != "None"]
179
 
 
186
  elif t == "soft-vits-vc":
187
  models_soft_vc.append((name, cover_path, speakers, create_soft_vc_fn(model, hps, speaker_ids)))
188
 
189
+ hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).to(device)
190
 
191
  app = gr.Blocks()
192
 
text/cleaners.py CHANGED
@@ -1,4 +1,7 @@
1
  import re
 
 
 
2
 
3
 
4
  def japanese_cleaners(text):
@@ -36,9 +39,9 @@ def zh_ja_mixture_cleaners(text):
36
  from text.mandarin import chinese_to_romaji
37
  from text.japanese import japanese_to_romaji_with_accent
38
  text = re.sub(r'\[ZH\](.*?)\[ZH\]',
39
- lambda x: chinese_to_romaji(x.group(1))+' ', text)
40
  text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent(
41
- x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text)
42
  text = re.sub(r'\s+$', '', text)
43
  text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
44
  return text
@@ -58,15 +61,15 @@ def cjks_cleaners(text):
58
  from text.sanskrit import devanagari_to_ipa
59
  from text.english import english_to_lazy_ipa
60
  text = re.sub(r'\[ZH\](.*?)\[ZH\]',
61
- lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text)
62
  text = re.sub(r'\[JA\](.*?)\[JA\]',
63
- lambda x: japanese_to_ipa(x.group(1))+' ', text)
64
  text = re.sub(r'\[KO\](.*?)\[KO\]',
65
- lambda x: korean_to_lazy_ipa(x.group(1))+' ', text)
66
  text = re.sub(r'\[SA\](.*?)\[SA\]',
67
- lambda x: devanagari_to_ipa(x.group(1))+' ', text)
68
  text = re.sub(r'\[EN\](.*?)\[EN\]',
69
- lambda x: english_to_lazy_ipa(x.group(1))+' ', text)
70
  text = re.sub(r'\s+$', '', text)
71
  text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
72
  return text
@@ -78,13 +81,13 @@ def cjke_cleaners(text):
78
  from text.korean import korean_to_ipa
79
  from text.english import english_to_ipa2
80
  text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
81
- 'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' ', text)
82
  text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
83
- 'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')+' ', text)
84
  text = re.sub(r'\[KO\](.*?)\[KO\]',
85
- lambda x: korean_to_ipa(x.group(1))+' ', text)
86
  text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
87
- 'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' ', text)
88
  text = re.sub(r'\s+$', '', text)
89
  text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
90
  return text
@@ -96,13 +99,13 @@ def cjke_cleaners2(text):
96
  from text.korean import korean_to_ipa
97
  from text.english import english_to_ipa2
98
  text = re.sub(r'\[ZH\](.*?)\[ZH\]',
99
- lambda x: chinese_to_ipa(x.group(1))+' ', text)
100
  text = re.sub(r'\[JA\](.*?)\[JA\]',
101
- lambda x: japanese_to_ipa2(x.group(1))+' ', text)
102
  text = re.sub(r'\[KO\](.*?)\[KO\]',
103
- lambda x: korean_to_ipa(x.group(1))+' ', text)
104
  text = re.sub(r'\[EN\](.*?)\[EN\]',
105
- lambda x: english_to_ipa2(x.group(1))+' ', text)
106
  text = re.sub(r'\s+$', '', text)
107
  text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
108
  return text
@@ -130,17 +133,18 @@ def chinese_dialect_cleaners(text):
130
  from text.english import english_to_lazy_ipa2
131
  from text.ngu_dialect import ngu_dialect_to_ipa
132
  text = re.sub(r'\[ZH\](.*?)\[ZH\]',
133
- lambda x: chinese_to_ipa2(x.group(1))+' ', text)
134
  text = re.sub(r'\[JA\](.*?)\[JA\]',
135
- lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text)
136
  text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
137
- '˧˧˦').replace('6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e')+' ', text)
 
138
  text = re.sub(r'\[GD\](.*?)\[GD\]',
139
- lambda x: cantonese_to_ipa(x.group(1))+' ', text)
140
  text = re.sub(r'\[EN\](.*?)\[EN\]',
141
- lambda x: english_to_lazy_ipa2(x.group(1))+' ', text)
142
  text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group(
143
- 1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ')+' ', text)
144
  text = re.sub(r'\s+$', '', text)
145
  text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
146
  return text
 
1
  import re
2
+ import pyopenjtalk
3
+
4
+ pyopenjtalk._lazy_init()
5
 
6
 
7
  def japanese_cleaners(text):
 
39
  from text.mandarin import chinese_to_romaji
40
  from text.japanese import japanese_to_romaji_with_accent
41
  text = re.sub(r'\[ZH\](.*?)\[ZH\]',
42
+ lambda x: chinese_to_romaji(x.group(1)) + ' ', text)
43
  text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent(
44
+ x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…') + ' ', text)
45
  text = re.sub(r'\s+$', '', text)
46
  text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
47
  return text
 
61
  from text.sanskrit import devanagari_to_ipa
62
  from text.english import english_to_lazy_ipa
63
  text = re.sub(r'\[ZH\](.*?)\[ZH\]',
64
+ lambda x: chinese_to_lazy_ipa(x.group(1)) + ' ', text)
65
  text = re.sub(r'\[JA\](.*?)\[JA\]',
66
+ lambda x: japanese_to_ipa(x.group(1)) + ' ', text)
67
  text = re.sub(r'\[KO\](.*?)\[KO\]',
68
+ lambda x: korean_to_lazy_ipa(x.group(1)) + ' ', text)
69
  text = re.sub(r'\[SA\](.*?)\[SA\]',
70
+ lambda x: devanagari_to_ipa(x.group(1)) + ' ', text)
71
  text = re.sub(r'\[EN\](.*?)\[EN\]',
72
+ lambda x: english_to_lazy_ipa(x.group(1)) + ' ', text)
73
  text = re.sub(r'\s+$', '', text)
74
  text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
75
  return text
 
81
  from text.korean import korean_to_ipa
82
  from text.english import english_to_ipa2
83
  text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
84
+ 'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn') + ' ', text)
85
  text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
86
+ 'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz') + ' ', text)
87
  text = re.sub(r'\[KO\](.*?)\[KO\]',
88
+ lambda x: korean_to_ipa(x.group(1)) + ' ', text)
89
  text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
90
+ 'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u') + ' ', text)
91
  text = re.sub(r'\s+$', '', text)
92
  text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
93
  return text
 
99
  from text.korean import korean_to_ipa
100
  from text.english import english_to_ipa2
101
  text = re.sub(r'\[ZH\](.*?)\[ZH\]',
102
+ lambda x: chinese_to_ipa(x.group(1)) + ' ', text)
103
  text = re.sub(r'\[JA\](.*?)\[JA\]',
104
+ lambda x: japanese_to_ipa2(x.group(1)) + ' ', text)
105
  text = re.sub(r'\[KO\](.*?)\[KO\]',
106
+ lambda x: korean_to_ipa(x.group(1)) + ' ', text)
107
  text = re.sub(r'\[EN\](.*?)\[EN\]',
108
+ lambda x: english_to_ipa2(x.group(1)) + ' ', text)
109
  text = re.sub(r'\s+$', '', text)
110
  text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
111
  return text
 
133
  from text.english import english_to_lazy_ipa2
134
  from text.ngu_dialect import ngu_dialect_to_ipa
135
  text = re.sub(r'\[ZH\](.*?)\[ZH\]',
136
+ lambda x: chinese_to_ipa2(x.group(1)) + ' ', text)
137
  text = re.sub(r'\[JA\](.*?)\[JA\]',
138
+ lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ') + ' ', text)
139
  text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
140
+ '˧˧˦').replace(
141
+ '6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e') + ' ', text)
142
  text = re.sub(r'\[GD\](.*?)\[GD\]',
143
+ lambda x: cantonese_to_ipa(x.group(1)) + ' ', text)
144
  text = re.sub(r'\[EN\](.*?)\[EN\]',
145
+ lambda x: english_to_lazy_ipa2(x.group(1)) + ' ', text)
146
  text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group(
147
+ 1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ') + ' ', text)
148
  text = re.sub(r'\s+$', '', text)
149
  text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
150
  return text