Spaces:
Running
Running
File size: 3,872 Bytes
802e11f 3e667ed 802e11f 5293275 802e11f 3e667ed 802e11f 9c7fd5c 3ebe5da 802e11f b5433b4 802e11f b5433b4 802e11f 3e667ed 1124846 9e5e755 1124846 802e11f 5293275 802e11f b5433b4 802e11f 1124846 3e667ed 9e5e755 3e667ed 802e11f 3e667ed 802e11f 0b2bb7e 802e11f 3e667ed b5433b4 802e11f 3e667ed 802e11f 3e667ed 3ebe5da 3e667ed 9079195 802e11f d192051 802e11f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
"""
Copyright 2023 Balacoon
contains implementation
for Revoice request
"""
import os
import asyncio
import base64
import hashlib
import json
import ssl
import time
from typing import Tuple, Union
import numpy as np
import resampy
import websockets
def prepare_audio(audio: Tuple[int, np.ndarray]) -> Union[np.ndarray, None]:
    """
    ensures that audio is in int16 format, 16khz mono

    Parameters
    ----------
    audio: Tuple[int, np.ndarray]
        pair of (sampling rate, samples). Samples may be int32 or
        float32/float64; other dtypes are only cast to int16 at the end.
        A 2-D array is treated as stereo when one of its dimensions equals 2.

    Returns
    -------
    wav: np.ndarray or None
        1-D int16 array at 16khz, or None when the input
        can not be reduced to a single channel.
    """
    sr, wav = audio
    # peak is only meaningful for non-empty input; np.max raises on empty arrays
    has_samples = wav.size > 0

    # ensure proper type
    if wav.dtype == np.int32:
        # widen before abs() so that INT32_MIN does not overflow
        max_val = np.max(np.abs(wav.astype(np.int64))) if has_samples else 0
        # values beyond the int16 range mean full-scale int32, scale it down;
        # otherwise samples are already int16-ranged and are kept as is
        mult = (32767.0 / 2**31) if max_val > 32768 else 1.0
        wav = (wav.astype(np.float32) * mult).astype(np.int16)
    elif wav.dtype == np.float32 or wav.dtype == np.float64:
        max_val = np.max(np.abs(wav)) if has_samples else 0.0
        # floats within [-1, 1] are normalized audio, otherwise
        # they are assumed to be int16-ranged already
        mult = 32767.0 if max_val <= 1.0 else 1.0
        wav = (wav * mult).astype(np.int16)

    if wav.ndim == 2:
        # average channels; elif is required because after the first
        # averaging the array is 1-D and has no second dimension
        if wav.shape[0] == 2:
            wav = np.mean(wav, axis=0, keepdims=False)
        elif wav.shape[1] == 2:
            wav = np.mean(wav, axis=1, keepdims=False)
    if wav.ndim != 1:
        return None

    # ensure proper sampling rate
    if sr != 16000 and has_samples:
        # np.float alias was removed from numpy, use explicit float64
        wav = (wav / 32768.0).astype(np.float64)
        wav = resampy.resample(wav, sr, 16000)
        # resampling may overshoot the [-1, 1] range, clip to avoid wrap-around
        wav = np.clip(wav * 32768.0, -32768, 32767)

    # channel averaging / resampling produce floats, cast back to int16
    return wav.astype(np.int16, copy=False)
def create_signature(api_secret: str) -> str:
    """
    builds the signature that is sent along with the request
    in order to authenticate it.

    The signature is a hex-encoded SHA-256 digest of the api secret
    concatenated with a coarse time stamp.
    """
    # time.time() is in seconds, so this value changes once per 1000 seconds.
    # NOTE(review): presumably the service derives the same value on its side -
    # confirm before touching the divisor.
    time_bucket = int(time.time() / 1000)
    payload = "".join((api_secret, str(time_bucket)))
    digest = hashlib.sha256()
    digest.update(payload.encode())
    return digest.hexdigest()
async def async_service_request(source_str: str, source: np.ndarray, target: np.ndarray, api_key: str, api_secret: str) -> Union[np.ndarray, None]:
    """
    sends a single request to the service over a websocket
    and collects the reply.

    Parameters
    ----------
    source_str: str
        text source; when non-empty it is sent as "source_str"
        and takes precedence over `source`
    source: np.ndarray
        audio source; its raw bytes are sent base64-encoded as "source"
        when `source_str` is empty
    target: np.ndarray
        target audio; its raw bytes are sent base64-encoded as "target"
    api_key: str
        sent as is in the request
    api_secret: str
        used to create the request signature

    Returns
    -------
    result: np.ndarray or None
        concatenation of all received chunks interpreted as int16,
        or None if inputs are missing or nothing was received.

    Raises
    ------
    KeyError
        if the "endpoint" environment variable is not set
    """
    if target is None or len(target) == 0:
        return None

    # validate the source before opening a connection,
    # so that invalid input does not cost a TLS websocket handshake
    if source_str and len(source_str) > 0:
        source_field = ("source_str", source_str)
    elif source is not None and len(source) > 0:
        source_field = ("source", base64.b64encode(source.tobytes()).decode("utf-8"))
    else:
        return None

    ssl_context = ssl.create_default_context()
    result_lst = []
    async with websockets.connect(
        os.environ["endpoint"], close_timeout=1024, ssl=ssl_context
    ) as websocket:
        request_dict = {
            "target": base64.b64encode(target.tobytes()).decode("utf-8"),
            "api_key": api_key,
            "signature": create_signature(api_secret),
        }
        field_name, field_value = source_field
        request_dict[field_name] = field_value
        await websocket.send(json.dumps(request_dict))

        # read reply: keep receiving until the peer closes the connection
        # or stays silent for 30 seconds; whatever arrived so far is kept
        while True:
            try:
                data = await asyncio.wait_for(websocket.recv(), timeout=30)
            except (websockets.exceptions.ConnectionClosed, asyncio.TimeoutError):
                break
            result_lst.append(np.frombuffer(data, dtype="int16"))

    return np.concatenate(result_lst) if result_lst else None
def service_request(
    source_str: str, source_audio: Tuple[int, np.ndarray], target_audio: Tuple[int, np.ndarray],
    api_key: str, api_secret: str,
) -> Union[Tuple[int, np.ndarray], None]:
    """
    prepares audio (has to be 16khz mono)
    and runs request to a voice conversion service

    Parameters
    ----------
    source_str: str
        text source, at most 256 characters; may be None or empty
        if `source_audio` is provided
    source_audio: Tuple[int, np.ndarray]
        (sampling rate, samples) of the source, shorter than 60 seconds;
        may be None if `source_str` is provided
    target_audio: Tuple[int, np.ndarray]
        (sampling rate, samples) of the target, shorter than 30 seconds
    api_key: str
        passed to the request as is
    api_secret: str
        passed to the request as is

    Returns
    -------
    result: Tuple[int, np.ndarray] or None
        (16000, samples) with the reply of the service; samples is None
        if the service returned nothing. None is returned instead of
        the tuple when inputs are missing or exceed the limits.
    """
    sample_rate = 16000

    # target is mandatory; guard it the same way as the source,
    # otherwise unpacking None inside prepare_audio raises
    if target_audio is None:
        return None
    tgt = prepare_audio(target_audio)
    if tgt is None:
        return None

    src = None
    if source_audio is not None:
        src = prepare_audio(source_audio)
    # an empty string is as good as no text at all
    if not source_str and src is None:
        return None

    if len(tgt) >= 30 * sample_rate:
        # too long
        return None
    if src is not None and len(src) >= 60 * sample_rate:
        return None
    if source_str is not None and len(source_str) > 256:
        return None

    res = asyncio.run(async_service_request(source_str, src, tgt, api_key, api_secret))
    return sample_rate, res
|