File size: 3,872 Bytes
802e11f
 
 
 
3e667ed
802e11f
 
5293275
802e11f
 
 
 
 
 
3e667ed
802e11f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9c7fd5c
 
3ebe5da
 
 
 
 
 
 
 
802e11f
 
 
 
 
 
 
 
b5433b4
802e11f
 
 
 
 
b5433b4
802e11f
 
 
 
3e667ed
1124846
9e5e755
1124846
 
802e11f
 
 
5293275
802e11f
 
 
b5433b4
 
802e11f
1124846
3e667ed
9e5e755
3e667ed
 
 
802e11f
 
 
 
 
 
 
3e667ed
802e11f
 
 
0b2bb7e
 
802e11f
 
 
 
 
 
3e667ed
 
b5433b4
802e11f
 
 
 
 
3e667ed
 
 
802e11f
3e667ed
3ebe5da
3e667ed
 
 
 
 
 
 
 
9079195
802e11f
d192051
802e11f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""
Copyright 2023 Balacoon

contains implementation
for Revoice request
"""

import os
import asyncio
import base64
import hashlib
import json
import ssl
import time
from typing import Tuple, Union

import numpy as np
import resampy
import websockets


def prepare_audio(audio: Tuple[int, np.ndarray]) -> Union[np.ndarray, None]:
    """
    ensures that audio is in int16 format, 16khz mono

    Parameters
    ----------
    audio: Tuple[int, np.ndarray]
        pair of (sampling rate, samples). Samples are either 1d (mono)
        or 2d with one dimension equal to 2 (stereo, any channel order).
        int32, float32 and float64 samples are converted to int16;
        other dtypes are left as they are unless resampling is needed.

    Returns
    -------
    wav: np.ndarray or None
        1d array of samples at 16khz, or None if the input
        can't be reduced to a single channel.
    """
    sr, wav = audio
    int16_min, int16_max = -32768, 32767

    # ensure proper type
    if wav.dtype == np.int32:
        # go through float first: np.abs on int32 overflows for -2**31
        as_float = wav.astype(np.float32)
        peak = float(np.max(np.abs(as_float))) if as_float.size else 0.0
        # full-range int32 is scaled down, int16-range values are kept as is
        mult = (32767.0 / 2**31) if peak > 32768 else 1.0
        wav = np.clip(as_float * mult, int16_min, int16_max).astype(np.int16)
    elif wav.dtype in (np.float32, np.float64):
        peak = float(np.max(np.abs(wav))) if wav.size else 0.0
        # floats within [-1, 1] are treated as normalized audio,
        # anything louder is assumed to be already in int16 scale
        mult = 32767.0 if peak <= 1.0 else 1.0
        wav = np.clip(wav * mult, int16_min, int16_max).astype(np.int16)

    if wav.ndim == 2:
        # average channels; only one axis is reduced, since after
        # the reduction the array is 1d and has no second axis
        if wav.shape[0] == 2:
            channel_axis = 0
        elif wav.shape[1] == 2:
            channel_axis = 1
        else:
            channel_axis = None
        if channel_axis is not None:
            # np.mean returns float64, cast back to keep the sample format
            wav = np.mean(wav, axis=channel_axis).astype(wav.dtype)

    if wav.ndim != 1:
        return None

    if wav.size == 0:
        # nothing to resample
        return wav

    # ensure proper sampling rate
    if sr != 16000:
        # np.float alias was removed from numpy, float64 is what it meant
        as_float = wav.astype(np.float64) / 32768.0
        resampled = resampy.resample(as_float, sr, 16000)
        # resampling may overshoot full scale, clip to avoid wrap-around
        wav = np.clip(resampled * 32768.0, int16_min, int16_max).astype(np.int16)
    return wav


def create_signature(api_secret: str) -> str:
    """
    helper function that builds the signature
    required to authenticate the request.

    The signature is a hex-encoded SHA-256 digest of the api secret
    followed by a coarse time stamp: current unix time in seconds
    divided by 1000 and truncated to an integer.
    """
    time_bucket = int(time.time() / 1000)
    hasher = hashlib.sha256()
    hasher.update((api_secret + str(time_bucket)).encode())
    return hasher.hexdigest()


async def async_service_request(source_str: str, source: np.ndarray, target: np.ndarray, api_key: str, api_secret: str) -> Union[np.ndarray, None]:
    """
    sends a request to the service over a websocket
    and collects the audio that is streamed back.

    Parameters
    ----------
    source_str: str
        text to use as a source; takes precedence over `source` when not empty
    source: np.ndarray
        source samples, used only when `source_str` is empty or None
    target: np.ndarray
        target samples, mandatory
    api_key: str
        key sent along with the request
    api_secret: str
        secret used to sign the request, see `create_signature`

    Returns
    -------
    result: np.ndarray or None
        received chunks interpreted as int16 and concatenated.
        None if inputs are missing or nothing was received.

    The websocket url is read from the `endpoint` environment variable.
    """
    if target is None or len(target) == 0:
        return None

    # validate the source before opening a connection,
    # so that an invalid request doesn't touch the network at all
    use_text = bool(source_str)
    use_audio = source is not None and len(source) > 0
    if not use_text and not use_audio:
        return None

    request_dict = {
        "target": base64.b64encode(target.tobytes()).decode("utf-8"),
        "api_key": api_key,
        "signature": create_signature(api_secret),
    }
    if use_text:
        request_dict["source_str"] = source_str
    else:
        request_dict["source"] = base64.b64encode(source.tobytes()).decode("utf-8")
    request = json.dumps(request_dict)

    ssl_context = ssl.create_default_context()

    result_lst = []
    async with websockets.connect(
        os.environ["endpoint"], close_timeout=1024, ssl=ssl_context
    ) as websocket:
        await websocket.send(request)

        # read reply until the server closes the connection
        # or stays silent for 30 seconds
        while True:
            try:
                data = await asyncio.wait_for(websocket.recv(), timeout=30)
            except (websockets.exceptions.ConnectionClosed, asyncio.TimeoutError):
                break
            result_lst.append(np.frombuffer(data, dtype="int16"))

    return np.concatenate(result_lst) if result_lst else None


def service_request(
    source_str: str, source_audio: Tuple[int, np.ndarray], target_audio: Tuple[int, np.ndarray],
    api_key: str, api_secret: str,
) -> Union[Tuple[int, np.ndarray], None]:
    """
    prepares audio (has to be 16khz mono)
    and runs request to a voice conversion service

    Parameters
    ----------
    source_str: str
        text source, at most 256 characters; may be None or empty
        if `source_audio` is given
    source_audio: Tuple[int, np.ndarray]
        (sampling rate, samples) of the source, shorter than 60 seconds;
        may be None if `source_str` is given
    target_audio: Tuple[int, np.ndarray]
        (sampling rate, samples) of the target, shorter than 30 seconds
    api_key: str
        key passed to the service
    api_secret: str
        secret used to sign the request

    Returns
    -------
    result: Tuple[int, np.ndarray] or None
        (16000, samples) received from the service.
        None if inputs are missing, invalid or too long,
        or if the service returned no audio.
    """
    sampling_rate = 16000
    max_target_samples = 30 * sampling_rate
    max_source_samples = 60 * sampling_rate
    max_text_len = 256

    # cheap checks first, before any audio processing
    if target_audio is None:
        return None
    if not source_str and source_audio is None:
        return None
    if source_str is not None and len(source_str) > max_text_len:
        return None

    tgt = prepare_audio(target_audio)
    if tgt is None or len(tgt) >= max_target_samples:
        # unusable or too long
        return None

    src = None
    if source_audio is not None:
        src = prepare_audio(source_audio)
    if not source_str and src is None:
        # source audio was given but could not be prepared
        return None
    if src is not None and len(src) >= max_source_samples:
        return None

    res = asyncio.run(async_service_request(source_str, src, tgt, api_key, api_secret))
    if res is None:
        # report a failed request the same way as invalid input
        return None
    return sampling_rate, res