Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -33,44 +33,44 @@ MODELS_PATH = os.getenv("MODELS_PATH", "./pretrained_models")
|
|
33 |
os.makedirs(MODELS_PATH, exist_ok=True)
|
34 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
35 |
|
36 |
-
|
37 |
-
#
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
#
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
#
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
|
75 |
def get_random_string():
|
76 |
return "".join(str(uuid.uuid4()).split("-"))
|
@@ -132,39 +132,39 @@ from whisperx import align as align_func
|
|
132 |
text_tokenizer_en = TextTokenizer(backend="espeak")
|
133 |
text_tokenizer_zh = TextTokenizer(backend="espeak", language='cmn')
|
134 |
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
|
169 |
|
170 |
def get_transcribe_state(segments):
|
@@ -468,8 +468,6 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
|
468 |
|
469 |
[orig_transcript, segments, _, _] = transcribe_zh(audio_path)
|
470 |
|
471 |
-
print(orig_transcript)
|
472 |
-
|
473 |
converter = opencc.OpenCC('t2s')
|
474 |
orig_transcript = converter.convert(orig_transcript)
|
475 |
transcribe_state,_ = align_zh(traditional_to_simplified(segments), audio_path)
|
|
|
33 |
os.makedirs(MODELS_PATH, exist_ok=True)
device = "cuda" if torch.cuda.is_available() else "cpu"


def _download_if_missing(url, filename, label):
    """Download *url* to *filename* unless the file is already cached.

    Streams the response in 8 KiB chunks so multi-GB checkpoints never
    have to fit in memory; prints one line either way so the Space logs
    show what happened on cold start.

    Raises requests.HTTPError on a non-2xx response.
    """
    if os.path.exists(filename):
        print(f"{label} found")
        return
    # timeout guards against a hung connection stalling Space startup;
    # for stream=True it bounds connect time and gaps between chunks.
    response = requests.get(url, stream=True, timeout=60)
    response.raise_for_status()
    with open(filename, "wb") as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)
    # NOTE: the scraped source printed a literal "(unknown)" here — a
    # garbled f-string placeholder; report the actual destination path.
    print(f"File downloaded to: {filename}")


# Fetch the watermarked EnCodec codec and both language checkpoints on
# first run; later runs find them cached under MODELS_PATH.
_download_if_missing(
    "https://huggingface.co/westbrook/SSR-Speech-English/resolve/main/wmencodec.th",
    os.path.join(MODELS_PATH, "wmencodec.th"),
    "wmencodec model",
)
_download_if_missing(
    "https://huggingface.co/westbrook/SSR-Speech-English/resolve/main/English.pth",
    os.path.join(MODELS_PATH, "English.pth"),
    "english model",
)
_download_if_missing(
    "https://huggingface.co/westbrook/SSR-Speech-Mandarin/resolve/main/Mandarin.pth",
    os.path.join(MODELS_PATH, "Mandarin.pth"),
    "mandarin model",
)
|
74 |
|
75 |
def get_random_string():
    """Return a random 32-character lowercase hex string (a dash-less UUID4)."""
    # uuid4().hex is exactly str(uuid4()) with the dashes removed.
    return uuid.uuid4().hex
|
|
|
132 |
text_tokenizer_en = TextTokenizer(backend="espeak")
text_tokenizer_zh = TextTokenizer(backend="espeak", language='cmn')

# Watermarked EnCodec checkpoint shared by both language models.
encodec_fn = f"{MODELS_PATH}/wmencodec.th"


def _load_ssr_checkpoint(ckpt_path):
    """Load one SSR-Speech checkpoint and move the model to *device*.

    Returns (model, config, phn2num). The checkpoint dict is released
    when this function returns instead of lingering at module level.
    """
    # map_location lets a CUDA-saved checkpoint load on a CPU-only host.
    ckpt = torch.load(ckpt_path, map_location=device)
    model = ssr.SSR_Speech(ckpt["config"])
    model.load_state_dict(ckpt["model"])
    model.to(device)
    return model, model.args, ckpt["phn2num"]


model_en, config_en, phn2num_en = _load_ssr_checkpoint(f"{MODELS_PATH}/English.pth")
model_zh, config_zh, phn2num_zh = _load_ssr_checkpoint(f"{MODELS_PATH}/Mandarin.pth")

# Inference bundles consumed by the run_* handlers below.
ssrspeech_model_en = {
    "config": config_en,
    "phn2num": phn2num_en,
    "model": model_en,
    "text_tokenizer": text_tokenizer_en,
    "audio_tokenizer": AudioTokenizer(signature=encodec_fn),
}

ssrspeech_model_zh = {
    "config": config_zh,
    "phn2num": phn2num_zh,
    "model": model_zh,
    "text_tokenizer": text_tokenizer_zh,
    "audio_tokenizer": AudioTokenizer(signature=encodec_fn),
}
|
168 |
|
169 |
|
170 |
def get_transcribe_state(segments):
|
|
|
468 |
|
469 |
[orig_transcript, segments, _, _] = transcribe_zh(audio_path)
|
470 |
|
|
|
|
|
471 |
converter = opencc.OpenCC('t2s')
|
472 |
orig_transcript = converter.convert(orig_transcript)
|
473 |
transcribe_state,_ = align_zh(traditional_to_simplified(segments), audio_path)
|