clementruhm committed on
Commit
802e11f
1 Parent(s): 8f9b0ff

Initial commit of voice conversion demo

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
README.md CHANGED
@@ -1,12 +1,12 @@
1
  ---
2
  title: Voice Conversion Service
3
- emoji: 🐠
4
  colorFrom: blue
5
- colorTo: blue
6
  sdk: gradio
7
  sdk_version: 3.32.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Voice Conversion Service
3
+ emoji: 💬
4
  colorFrom: blue
5
+ colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 3.32.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ Interactive demo for Voice Conversion service by Balacoon.
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright 2023 Balacoon
3
+
4
+ Voice Conversion service interactive demo
5
+ """
6
+
7
+ import glob
8
+ import logging
9
+ import os
10
+
11
+ import gradio as gr
12
+
13
+ from vc_service_request import vc_service_request
14
+
15
+ script_dir = os.path.dirname(os.path.abspath(__file__))
16
+
17
+
18
def main():
    """
    builds the Gradio interface of the voice conversion demo and launches it.
    Layout: a source column (microphone or uploaded file), a target column
    (uploaded file or one of the bundled reference samples), a "Convert"
    button and an audio player for the result.
    """
    logging.basicConfig(level=logging.INFO)

    with gr.Blocks() as demo:
        # NOTE(review): the line that follows "get in touch with us at" is
        # absent in the reviewed copy of this file - restore the contact
        # address from the original before merging.
        gr.Markdown(
            """
            <h1 align="center">Balacoon🦝 Voice Conversion</h1>

            Welcome to the live demo of Balacoon's Voice Conversion service.
            Check out our [website](https://balacoon.com/demo/#voice-conversion)
            to learn more.
            Voice Conversion allows you to transform your own voice
            into the voice of another person using just a single sample.
            For optimal results, we recommend using clean audio files in English.

            Here's how it works:

            1. Begin by recording your voice.
            2. Select an audio sample that represents the target voice you want to convert to.
            3. Click the "Convert" button and listen to the result!

            If you are interested to plug in Voice Conversion
            service into your own application, don't hesitate to get in touch with us at
            """
        )

        with gr.Row():
            with gr.Column(variant="panel"):
                src_audio_mic = gr.Audio(source="microphone", label="Record your voice")
                src_audio_file = gr.Audio(
                    source="upload", label="Or upload audio to convert"
                )

            with gr.Column(variant="panel"):
                tgt_audio_file = gr.Audio(
                    source="upload", label="Select audio with target voice"
                )
                # glob returns paths in arbitrary order; sort them so the
                # examples are listed in the same order on every start
                tgt_examples_paths = sorted(
                    glob.glob(os.path.join(script_dir, "references", "*.wav"))
                )
                gr.Examples(
                    tgt_examples_paths,
                    inputs=[tgt_audio_file],
                )

        with gr.Row():
            convert_btn = gr.Button("Convert")
        with gr.Row():
            result_audio = gr.Audio()

        def voice_conversion(src_from_mic_, src_from_file_, tgt_from_file_):
            """
            helper function which picks the source audio (the microphone
            recording takes precedence over the uploaded file) and runs
            the conversion. Returns nothing if source or target is missing.
            """
            src_ = src_from_mic_ or src_from_file_
            tgt_ = tgt_from_file_
            if not src_ or not tgt_:
                logging.warning("source or target are not provided")
                return None
            return vc_service_request(src_, tgt_)

        convert_btn.click(
            voice_conversion,
            inputs=[src_audio_mic, src_audio_file, tgt_audio_file],
            outputs=result_audio,
        )

    # requests are queued and processed one at a time
    demo.queue(concurrency_count=1).launch()
91
+
92
+
93
+ if __name__ == "__main__":
94
+ main()
references/cate_blanchett.wav ADDED
Binary file (401 kB). View file
 
references/george_clooney.wav ADDED
Binary file (343 kB). View file
 
references/james_earl_jones.wav ADDED
Binary file (402 kB). View file
 
references/kratos.wav ADDED
Binary file (411 kB). View file
 
references/meryl_streep.wav ADDED
Binary file (398 kB). View file
 
references/mike_rowe.wav ADDED
Binary file (453 kB). View file
 
references/nikole_kidman.wav ADDED
Binary file (439 kB). View file
 
references/sam_elliott.wav ADDED
Binary file (461 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ numpy==1.23.2
2
+ resampy==0.4.2
3
+ streamlit==1.22.0
4
+ websockets==10.3
setup.cfg ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [flake8]
2
+ max_complexity=10
3
+ per-file-ignores=__init__.py:F401,F403
4
+ ignore = E203,W503
5
+ max-line-length=119
6
+
7
+ [isort]
8
+ profile=black
9
+ line_length=119
10
+
11
+ [mypy]
12
+ ignore_missing_imports = True
vc_service_request.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright 2023 Balacoon
3
+
4
+ contains implementation
5
+ for voice conversion request
6
+ """
7
+
8
import asyncio
import base64
import hashlib
import json
import logging
import os
import ssl
import time
from typing import Optional, Tuple

import numpy as np
import resampy
import streamlit as st
import websockets
20
+
21
+
22
def _to_int16(samples: np.ndarray) -> np.ndarray:
    """
    clips samples to the int16 range and casts them, so that
    out-of-range values saturate instead of wrapping around
    """
    return np.clip(samples, -32768, 32767).astype(np.int16)


def prepare_audio(audio: Tuple[int, np.ndarray]) -> np.ndarray:
    """
    ensures that audio is in int16 format, 16khz mono

    audio is a (sampling_rate, samples) pair. int32 and float32/float64
    samples are converted to int16; other dtypes are left as they are.
    Multi-channel audio is averaged into a single channel and anything
    that is not sampled at 16khz is resampled.
    """
    sr, wav = audio
    wav = np.asarray(wav)

    # ensure proper type
    if wav.dtype == np.int32:
        # widen first: abs() of the smallest int32 overflows in int32
        as_float = wav.astype(np.float64)
        max_val = np.max(np.abs(as_float)) if as_float.size else 0.0
        # only rescale when the values actually exceed the 16-bit range
        mult = (32767.0 / 2**31) if max_val > 32768 else 1.0
        wav = _to_int16(as_float * mult)
    elif wav.dtype == np.float32 or wav.dtype == np.float64:
        max_val = np.max(np.abs(wav)) if wav.size else 0.0
        # floats within [-1, 1] are treated as normalized audio
        mult = 32767.0 if max_val <= 1.0 else 1.0
        wav = _to_int16(wav * mult)

    # ensure single channel
    # assumes (samples, channels) layout for multi-channel input - TODO confirm
    if wav.ndim > 1:
        wav = _to_int16(wav.astype(np.float32).mean(axis=-1))

    # ensure proper sampling rate
    if sr != 16000:
        # np.float alias was removed from numpy, use the explicit type
        normalized = wav.astype(np.float64) / 32768.0
        resampled = resampy.resample(normalized, sr, 16000)
        # resampling may overshoot [-1, 1], so saturate when going back to int16
        wav = _to_int16(resampled * 32768.0)
    return wav
42
+
43
+
44
def create_signature(api_secret: str) -> str:
    """
    helper function that creates signature,
    required to authenticate the request.
    The signature is a sha256 hex digest of the api secret
    concatenated with a coarse timestamp.
    """
    # time.time() is in seconds, so the timestamp changes once per 1000 seconds.
    # NOTE(review): confirm this is the granularity the service expects
    int_time = int(time.time() / 1000)
    signature_input = (api_secret + str(int_time)).encode()
    return hashlib.sha256(signature_input).hexdigest()


async def async_service_request(
    source: np.ndarray, target: np.ndarray
) -> Optional[np.ndarray]:
    """
    sends source and target audio to the voice conversion service
    over a websocket and collects the converted audio.
    Returns int16 samples, or None if the service closed
    the connection without sending any audio.
    Raises RuntimeError if the service replies with a text message.
    """
    # the streamlit module itself is not subscriptable, so configuration is read
    # from the environment. NOTE(review): assumes that "endpoint", "api_key" and
    # "api_secret" are exposed to the app as environment variables - confirm
    endpoint = os.environ["endpoint"]
    api_key = os.environ["api_key"]
    api_secret = os.environ["api_secret"]

    ssl_context = ssl.create_default_context()

    async with websockets.connect(
        endpoint, close_timeout=1024, ssl=ssl_context
    ) as websocket:
        request_dict = {
            "source": base64.b64encode(source.tobytes()).decode("utf-8"),
            "target": base64.b64encode(target.tobytes()).decode("utf-8"),
            "api_key": api_key,
            "signature": create_signature(api_secret),
        }
        request = json.dumps(request_dict)
        await websocket.send(request)

        # read reply: audio arrives in chunks until the connection is closed
        result_lst = []
        while True:
            try:
                data = await websocket.recv()
            except websockets.exceptions.ConnectionClosed:
                break
            if isinstance(data, str):
                # NOTE(review): assumes audio is always sent as binary frames,
                # so a text frame is reported instead of being parsed as samples
                raise RuntimeError(
                    f"unexpected text reply from voice conversion service: {data}"
                )
            result_lst.append(np.frombuffer(data, dtype="int16"))
    result = np.concatenate(result_lst) if result_lst else None
    return result
82
+
83
+
84
def vc_service_request(
    source_audio: Tuple[int, np.ndarray], target_audio: Tuple[int, np.ndarray]
) -> Optional[Tuple[int, np.ndarray]]:
    """
    prepares audio (has to be 16khz mono)
    and runs request to a voice conversion service.
    Both arguments are (sampling_rate, samples) pairs.
    Returns (16000, converted_samples), or None
    if the service did not return any audio.
    """
    src = prepare_audio(source_audio)
    tgt = prepare_audio(target_audio)

    res = asyncio.run(async_service_request(src, tgt))
    if res is None:
        # a (rate, None) pair is not playable audio, so report "no result" instead
        logging.warning("voice conversion service returned no audio")
        return None
    return 16000, res