OpenSound committed on
Commit
a5f8270
·
verified ·
1 Parent(s): d6547b1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -73
app.py CHANGED
@@ -33,44 +33,44 @@ MODELS_PATH = os.getenv("MODELS_PATH", "./pretrained_models")
33
  os.makedirs(MODELS_PATH, exist_ok=True)
34
  device = "cuda" if torch.cuda.is_available() else "cpu"
35
 
36
- # if not os.path.exists(os.path.join(MODELS_PATH, "wmencodec.th")):
37
- # # download wmencodec
38
- # url = "https://huggingface.co/westbrook/SSR-Speech-English/resolve/main/wmencodec.th"
39
- # filename = os.path.join(MODELS_PATH, "wmencodec.th")
40
- # response = requests.get(url, stream=True)
41
- # response.raise_for_status()
42
- # with open(filename, "wb") as file:
43
- # for chunk in response.iter_content(chunk_size=8192):
44
- # file.write(chunk)
45
- # print(f"File downloaded to: {filename}")
46
- # else:
47
- # print("wmencodec model found")
48
-
49
- # if not os.path.exists(os.path.join(MODELS_PATH, "English.pth")):
50
- # # download english model
51
- # url = "https://huggingface.co/westbrook/SSR-Speech-English/resolve/main/English.pth"
52
- # filename = os.path.join(MODELS_PATH, "English.pth")
53
- # response = requests.get(url, stream=True)
54
- # response.raise_for_status()
55
- # with open(filename, "wb") as file:
56
- # for chunk in response.iter_content(chunk_size=8192):
57
- # file.write(chunk)
58
- # print(f"File downloaded to: {filename}")
59
- # else:
60
- # print("english model found")
61
-
62
- # if not os.path.exists(os.path.join(MODELS_PATH, "Mandarin.pth")):
63
- # # download mandarin model
64
- # url = "https://huggingface.co/westbrook/SSR-Speech-Mandarin/resolve/main/Mandarin.pth"
65
- # filename = os.path.join(MODELS_PATH, "Mandarin.pth")
66
- # response = requests.get(url, stream=True)
67
- # response.raise_for_status()
68
- # with open(filename, "wb") as file:
69
- # for chunk in response.iter_content(chunk_size=8192):
70
- # file.write(chunk)
71
- # print(f"File downloaded to: {filename}")
72
- # else:
73
- # print("mandarin model found")
74
 
75
  def get_random_string():
76
  return "".join(str(uuid.uuid4()).split("-"))
@@ -132,39 +132,39 @@ from whisperx import align as align_func
132
  text_tokenizer_en = TextTokenizer(backend="espeak")
133
  text_tokenizer_zh = TextTokenizer(backend="espeak", language='cmn')
134
 
135
- # ssrspeech_fn_en = f"{MODELS_PATH}/English.pth"
136
- # ckpt_en = torch.load(ssrspeech_fn_en)
137
- # model_en = ssr.SSR_Speech(ckpt_en["config"])
138
- # model_en.load_state_dict(ckpt_en["model"])
139
- # config_en = model_en.args
140
- # phn2num_en = ckpt_en["phn2num"]
141
- # model_en.to(device)
142
-
143
- # ssrspeech_fn_zh = f"{MODELS_PATH}/Mandarin.pth"
144
- # ckpt_zh = torch.load(ssrspeech_fn_zh)
145
- # model_zh = ssr.SSR_Speech(ckpt_zh["config"])
146
- # model_zh.load_state_dict(ckpt_zh["model"])
147
- # config_zh = model_zh.args
148
- # phn2num_zh = ckpt_zh["phn2num"]
149
- # model_zh.to(device)
150
-
151
- # encodec_fn = f"{MODELS_PATH}/wmencodec.th"
152
-
153
- # ssrspeech_model_en = {
154
- # "config": config_en,
155
- # "phn2num": phn2num_en,
156
- # "model": model_en,
157
- # "text_tokenizer": text_tokenizer_en,
158
- # "audio_tokenizer": AudioTokenizer(signature=encodec_fn)
159
- # }
160
-
161
- # ssrspeech_model_zh = {
162
- # "config": config_zh,
163
- # "phn2num": phn2num_zh,
164
- # "model": model_zh,
165
- # "text_tokenizer": text_tokenizer_zh,
166
- # "audio_tokenizer": AudioTokenizer(signature=encodec_fn)
167
- # }
168
 
169
 
170
  def get_transcribe_state(segments):
@@ -468,8 +468,6 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
468
 
469
  [orig_transcript, segments, _, _] = transcribe_zh(audio_path)
470
 
471
- print(orig_transcript)
472
-
473
  converter = opencc.OpenCC('t2s')
474
  orig_transcript = converter.convert(orig_transcript)
475
  transcribe_state,_ = align_zh(traditional_to_simplified(segments), audio_path)
 
33
  os.makedirs(MODELS_PATH, exist_ok=True)
34
  device = "cuda" if torch.cuda.is_available() else "cpu"
35
 
36
+ if not os.path.exists(os.path.join(MODELS_PATH, "wmencodec.th")):
37
+ # download wmencodec
38
+ url = "https://huggingface.co/westbrook/SSR-Speech-English/resolve/main/wmencodec.th"
39
+ filename = os.path.join(MODELS_PATH, "wmencodec.th")
40
+ response = requests.get(url, stream=True)
41
+ response.raise_for_status()
42
+ with open(filename, "wb") as file:
43
+ for chunk in response.iter_content(chunk_size=8192):
44
+ file.write(chunk)
45
+ print(f"File downloaded to: {filename}")
46
+ else:
47
+ print("wmencodec model found")
48
+
49
+ if not os.path.exists(os.path.join(MODELS_PATH, "English.pth")):
50
+ # download english model
51
+ url = "https://huggingface.co/westbrook/SSR-Speech-English/resolve/main/English.pth"
52
+ filename = os.path.join(MODELS_PATH, "English.pth")
53
+ response = requests.get(url, stream=True)
54
+ response.raise_for_status()
55
+ with open(filename, "wb") as file:
56
+ for chunk in response.iter_content(chunk_size=8192):
57
+ file.write(chunk)
58
+ print(f"File downloaded to: {filename}")
59
+ else:
60
+ print("english model found")
61
+
62
+ if not os.path.exists(os.path.join(MODELS_PATH, "Mandarin.pth")):
63
+ # download mandarin model
64
+ url = "https://huggingface.co/westbrook/SSR-Speech-Mandarin/resolve/main/Mandarin.pth"
65
+ filename = os.path.join(MODELS_PATH, "Mandarin.pth")
66
+ response = requests.get(url, stream=True)
67
+ response.raise_for_status()
68
+ with open(filename, "wb") as file:
69
+ for chunk in response.iter_content(chunk_size=8192):
70
+ file.write(chunk)
71
+ print(f"File downloaded to: {filename}")
72
+ else:
73
+ print("mandarin model found")
74
 
75
  def get_random_string():
76
  return "".join(str(uuid.uuid4()).split("-"))
 
132
  text_tokenizer_en = TextTokenizer(backend="espeak")
133
  text_tokenizer_zh = TextTokenizer(backend="espeak", language='cmn')
134
 
135
+ ssrspeech_fn_en = f"{MODELS_PATH}/English.pth"
136
+ ckpt_en = torch.load(ssrspeech_fn_en)
137
+ model_en = ssr.SSR_Speech(ckpt_en["config"])
138
+ model_en.load_state_dict(ckpt_en["model"])
139
+ config_en = model_en.args
140
+ phn2num_en = ckpt_en["phn2num"]
141
+ model_en.to(device)
142
+
143
+ ssrspeech_fn_zh = f"{MODELS_PATH}/Mandarin.pth"
144
+ ckpt_zh = torch.load(ssrspeech_fn_zh)
145
+ model_zh = ssr.SSR_Speech(ckpt_zh["config"])
146
+ model_zh.load_state_dict(ckpt_zh["model"])
147
+ config_zh = model_zh.args
148
+ phn2num_zh = ckpt_zh["phn2num"]
149
+ model_zh.to(device)
150
+
151
+ encodec_fn = f"{MODELS_PATH}/wmencodec.th"
152
+
153
+ ssrspeech_model_en = {
154
+ "config": config_en,
155
+ "phn2num": phn2num_en,
156
+ "model": model_en,
157
+ "text_tokenizer": text_tokenizer_en,
158
+ "audio_tokenizer": AudioTokenizer(signature=encodec_fn)
159
+ }
160
+
161
+ ssrspeech_model_zh = {
162
+ "config": config_zh,
163
+ "phn2num": phn2num_zh,
164
+ "model": model_zh,
165
+ "text_tokenizer": text_tokenizer_zh,
166
+ "audio_tokenizer": AudioTokenizer(signature=encodec_fn)
167
+ }
168
 
169
 
170
  def get_transcribe_state(segments):
 
468
 
469
  [orig_transcript, segments, _, _] = transcribe_zh(audio_path)
470
 
 
 
471
  converter = opencc.OpenCC('t2s')
472
  orig_transcript = converter.convert(orig_transcript)
473
  transcribe_state,_ = align_zh(traditional_to_simplified(segments), audio_path)