File size: 2,909 Bytes
802e11f
 
 
 
 
 
 
5293275
802e11f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5293275
802e11f
 
 
 
 
 
 
 
5293275
802e11f
 
 
 
5293275
802e11f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9079195
 
 
802e11f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
"""
Copyright 2023 Balacoon

contains implementation
for voice conversion request
"""

import asyncio
import base64
import hashlib
import json
import os
import ssl
import time
from typing import Optional, Tuple

import numpy as np
import resampy
import websockets


def prepare_audio(audio: Tuple[int, np.ndarray]) -> np.ndarray:
    """
    converts audio samples to int16 and resamples them to 16khz.

    NOTE: no channel downmixing is done here,
    the caller is expected to pass mono audio.

    Parameters
    ----------
    audio: Tuple[int, np.ndarray]
        pair of (sampling rate, samples). int32, float32 and float64
        samples are converted to int16; samples of any other dtype
        are left untouched by the type conversion step.

    Returns
    -------
    wav: np.ndarray
        samples at 16khz sampling rate
    """
    int16_min, int16_max = -32768, 32767
    sr, wav = audio
    # ensure proper type
    if wav.dtype == np.int32:
        # np.max fails on zero-size arrays, so peak of empty audio is taken as 0
        max_val = np.max(np.abs(wav)) if wav.size else 0
        # scale down only if samples actually use the int32 range
        mult = (32767.0 / 2**31) if max_val > 32768 else 1.0
        wav = wav.astype(np.float32) * mult
        wav = np.clip(wav, int16_min, int16_max).astype(np.int16)
    elif wav.dtype == np.float32 or wav.dtype == np.float64:
        max_val = np.max(np.abs(wav)) if wav.size else 0.0
        # floats within [-1, 1] are treated as normalized audio,
        # anything louder is treated as already being in int16 scale
        mult = 32767.0 if max_val <= 1.0 else 1.0
        # clip so out-of-range values saturate instead of wrapping around
        wav = np.clip(wav * mult, int16_min, int16_max).astype(np.int16)

    # ensure proper sampling rate
    if sr != 16000:
        # `np.float` alias was removed from numpy, use explicit float64
        wav = (wav / 32768.0).astype(np.float64)
        wav = resampy.resample(wav, sr, 16000)
        # resampling may overshoot [-1, 1), saturate before casting back
        wav = np.clip(wav * 32768.0, int16_min, int16_max).astype(np.int16)
    return wav


def create_signature(api_secret: Optional[str] = None) -> str:
    """
    helper function that creates signature,
    required to authenticate the request

    Parameters
    ----------
    api_secret: Optional[str]
        secret to sign the request with. when omitted,
        it is read from the "api_secret" environment variable.

    Returns
    -------
    signature: str
        hex digest of sha256 computed over the secret
        concatenated with the current time stamp

    Raises
    ------
    KeyError
        if the secret is not passed and
        "api_secret" environment variable is not set
    """
    if api_secret is None:
        api_secret = os.environ["api_secret"]
    # NOTE(review): time.time() is in seconds, so dividing by 1000 produces
    # a stamp that changes every 1000 seconds. kept as is, since it has to
    # match what the service computes - confirm against the server side.
    int_time = int(time.time() / 1000)
    signature_input = (api_secret + str(int_time)).encode()
    return hashlib.sha256(signature_input).hexdigest()


async def async_service_request(
    source: np.ndarray, target: np.ndarray
) -> Optional[np.ndarray]:
    """
    sends source and target audio to the voice conversion service
    over a websocket and collects the audio sent back.

    connection parameters are read from environment variables:
    "endpoint" (websocket address), "api_key" and "api_secret".

    Parameters
    ----------
    source: np.ndarray
        samples of the source audio, sent as base64 of raw bytes
    target: np.ndarray
        samples of the target audio, sent as base64 of raw bytes

    Returns
    -------
    result: Optional[np.ndarray]
        int16 samples concatenated from all received messages,
        or None if the service closed connection without sending anything
    """
    ssl_context = ssl.create_default_context()

    async with websockets.connect(
        os.environ["endpoint"], close_timeout=1024, ssl=ssl_context
    ) as websocket:
        request_dict = {
            "source": base64.b64encode(source.tobytes()).decode("utf-8"),
            "target": base64.b64encode(target.tobytes()).decode("utf-8"),
            "api_key": os.environ["api_key"],
            # the secret is passed explicitly, the helper requires it
            "signature": create_signature(os.environ["api_secret"]),
        }
        request = json.dumps(request_dict)
        await websocket.send(request)

        # read reply: the service streams chunks until it closes connection
        result_lst = []
        while True:
            try:
                data = await websocket.recv()
            except websockets.exceptions.ConnectionClosed:
                # closed connection marks the end of the reply
                break
            # check for the end marker before trying to decode the chunk
            if data is None:
                break
            result_lst.append(np.frombuffer(data, dtype="int16"))
        result = np.concatenate(result_lst) if result_lst else None
        return result


def vc_service_request(
    source_audio: Tuple[int, np.ndarray], target_audio: Tuple[int, np.ndarray]
) -> Optional[Tuple[int, np.ndarray]]:
    """
    prepares audio (has to be 16khz mono)
    and runs request to a voice conversion service

    Parameters
    ----------
    source_audio: Tuple[int, np.ndarray]
        pair of (sampling rate, samples) of the source audio
    target_audio: Tuple[int, np.ndarray]
        pair of (sampling rate, samples) of the target audio

    Returns
    -------
    result: Optional[Tuple[int, np.ndarray]]
        pair of (sampling rate, samples) with the service reply.
        None if the input is too long (60 seconds or more of source,
        30 seconds or more of target) or if the service returned no audio.
    """
    sample_rate = 16000
    max_source_sec, max_target_sec = 60, 30

    src = prepare_audio(source_audio)
    tgt = prepare_audio(target_audio)
    if len(src) >= max_source_sec * sample_rate or len(tgt) >= max_target_sec * sample_rate:
        # input is way too long, dont return anything
        return None

    res = asyncio.run(async_service_request(src, tgt))
    if res is None:
        # service closed connection without sending audio,
        # dont hand out a (rate, None) pair
        return None
    return sample_rate, res