Spaces:

OpenSound
/

SSR-Speech

Running on Zero

App Files Community

OpenSound committed on Dec 23, 2024

Commit

9c238e8

verified ·

1 Parent(s): fc6da2b

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -73

app.py CHANGED Viewed

@@ -26,51 +26,50 @@ import spaces
 import nltk
 nltk.download('punkt')
-```
 DEMO_PATH = os.getenv("DEMO_PATH", "./demo")
 TMP_PATH = os.getenv("TMP_PATH", "./demo/temp")
 MODELS_PATH = os.getenv("MODELS_PATH", "./pretrained_models")
 os.makedirs(MODELS_PATH, exist_ok=True)
 device = "cuda" if torch.cuda.is_available() else "cpu"
-if not os.path.exists(os.path.join(MODELS_PATH, "wmencodec.th")):
-    # download wmencodec
-    url = "https://huggingface.co/westbrook/SSR-Speech-English/resolve/main/wmencodec.th"
-    filename = os.path.join(MODELS_PATH, "wmencodec.th")
-    response = requests.get(url, stream=True)
-    response.raise_for_status()
-    with open(filename, "wb") as file:
-        for chunk in response.iter_content(chunk_size=8192):
-            file.write(chunk)
-    print(f"File downloaded to: {filename}")
-else:
-    print("wmencodec model found")
-if not os.path.exists(os.path.join(MODELS_PATH, "English.pth")):
-    # download english model
-    url = "https://huggingface.co/westbrook/SSR-Speech-English/resolve/main/English.pth"
-    filename = os.path.join(MODELS_PATH, "English.pth")
-    response = requests.get(url, stream=True)
-    response.raise_for_status()
-    with open(filename, "wb") as file:
-        for chunk in response.iter_content(chunk_size=8192):
-            file.write(chunk)
-    print(f"File downloaded to: {filename}")
-else:
-    print("english model found")
-if not os.path.exists(os.path.join(MODELS_PATH, "Mandarin.pth")):
-    # download mandarin model
-    url = "https://huggingface.co/westbrook/SSR-Speech-Mandarin/resolve/main/Mandarin.pth"
-    filename = os.path.join(MODELS_PATH, "Mandarin.pth")
-    response = requests.get(url, stream=True)
-    response.raise_for_status()
-    with open(filename, "wb") as file:
-        for chunk in response.iter_content(chunk_size=8192):
-            file.write(chunk)
-    print(f"File downloaded to: {filename}")
-else:
-    print("mandarin model found")
 def get_random_string():
     return "".join(str(uuid.uuid4()).split("-"))
@@ -132,40 +131,39 @@ from whisperx import align as align_func
 text_tokenizer_en = TextTokenizer(backend="espeak")
 text_tokenizer_zh = TextTokenizer(backend="espeak", language='cmn')
-ssrspeech_fn_en = f"{MODELS_PATH}/English.pth"
-ckpt_en = torch.load(ssrspeech_fn_en)
-model_en = ssr.SSR_Speech(ckpt_en["config"])
-model_en.load_state_dict(ckpt_en["model"])
-config_en = model_en.args
-phn2num_en = ckpt_en["phn2num"]
-model_en.to(device)
-ssrspeech_fn_zh = f"{MODELS_PATH}/Mandarin.pth"
-ckpt_zh = torch.load(ssrspeech_fn_zh)
-model_zh = ssr.SSR_Speech(ckpt_zh["config"])
-model_zh.load_state_dict(ckpt_zh["model"])
-config_zh = model_zh.args
-phn2num_zh = ckpt_zh["phn2num"]
-model_zh.to(device)
-encodec_fn = f"{MODELS_PATH}/wmencodec.th"
-ssrspeech_model_en = {
-    "config": config_en,
-    "phn2num": phn2num_en,
-    "model": model_en,
-    "text_tokenizer": text_tokenizer_en,
-    "audio_tokenizer": AudioTokenizer(signature=encodec_fn)
-}
-ssrspeech_model_zh = {
-    "config": config_zh,
-    "phn2num": phn2num_zh,
-    "model": model_zh,
-    "text_tokenizer": text_tokenizer_zh,
-    "audio_tokenizer": AudioTokenizer(signature=encodec_fn)
-}
-```
 def get_transcribe_state(segments):

 import nltk
 nltk.download('punkt')
 DEMO_PATH = os.getenv("DEMO_PATH", "./demo")
 TMP_PATH = os.getenv("TMP_PATH", "./demo/temp")
 MODELS_PATH = os.getenv("MODELS_PATH", "./pretrained_models")
 os.makedirs(MODELS_PATH, exist_ok=True)
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# if not os.path.exists(os.path.join(MODELS_PATH, "wmencodec.th")):
+#     # download wmencodec
+#     url = "https://huggingface.co/westbrook/SSR-Speech-English/resolve/main/wmencodec.th"
+#     filename = os.path.join(MODELS_PATH, "wmencodec.th")
+#     response = requests.get(url, stream=True)
+#     response.raise_for_status()
+#     with open(filename, "wb") as file:
+#         for chunk in response.iter_content(chunk_size=8192):
+#             file.write(chunk)
+#     print(f"File downloaded to: {filename}")
+# else:
+#     print("wmencodec model found")
+# if not os.path.exists(os.path.join(MODELS_PATH, "English.pth")):
+#     # download english model
+#     url = "https://huggingface.co/westbrook/SSR-Speech-English/resolve/main/English.pth"
+#     filename = os.path.join(MODELS_PATH, "English.pth")
+#     response = requests.get(url, stream=True)
+#     response.raise_for_status()
+#     with open(filename, "wb") as file:
+#         for chunk in response.iter_content(chunk_size=8192):
+#             file.write(chunk)
+#     print(f"File downloaded to: {filename}")
+# else:
+#     print("english model found")
+# if not os.path.exists(os.path.join(MODELS_PATH, "Mandarin.pth")):
+#     # download mandarin model
+#     url = "https://huggingface.co/westbrook/SSR-Speech-Mandarin/resolve/main/Mandarin.pth"
+#     filename = os.path.join(MODELS_PATH, "Mandarin.pth")
+#     response = requests.get(url, stream=True)
+#     response.raise_for_status()
+#     with open(filename, "wb") as file:
+#         for chunk in response.iter_content(chunk_size=8192):
+#             file.write(chunk)
+#     print(f"File downloaded to: {filename}")
+# else:
+#     print("mandarin model found")
 def get_random_string():
     return "".join(str(uuid.uuid4()).split("-"))
 text_tokenizer_en = TextTokenizer(backend="espeak")
 text_tokenizer_zh = TextTokenizer(backend="espeak", language='cmn')
+# ssrspeech_fn_en = f"{MODELS_PATH}/English.pth"
+# ckpt_en = torch.load(ssrspeech_fn_en)
+# model_en = ssr.SSR_Speech(ckpt_en["config"])
+# model_en.load_state_dict(ckpt_en["model"])
+# config_en = model_en.args
+# phn2num_en = ckpt_en["phn2num"]
+# model_en.to(device)
+# ssrspeech_fn_zh = f"{MODELS_PATH}/Mandarin.pth"
+# ckpt_zh = torch.load(ssrspeech_fn_zh)
+# model_zh = ssr.SSR_Speech(ckpt_zh["config"])
+# model_zh.load_state_dict(ckpt_zh["model"])
+# config_zh = model_zh.args
+# phn2num_zh = ckpt_zh["phn2num"]
+# model_zh.to(device)
+# encodec_fn = f"{MODELS_PATH}/wmencodec.th"
+# ssrspeech_model_en = {
+#     "config": config_en,
+#     "phn2num": phn2num_en,
+#     "model": model_en,
+#     "text_tokenizer": text_tokenizer_en,
+#     "audio_tokenizer": AudioTokenizer(signature=encodec_fn)
+# }
+# ssrspeech_model_zh = {
+#     "config": config_zh,
+#     "phn2num": phn2num_zh,
+#     "model": model_zh,
+#     "text_tokenizer": text_tokenizer_zh,
+#     "audio_tokenizer": AudioTokenizer(signature=encodec_fn)
+# }
 def get_transcribe_state(segments):