File size: 2,646 Bytes
99cda37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4e56594
99cda37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6f585d
4e56594
 
99cda37
 
 
44f0dfc
4e56594
44f0dfc
99cda37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from pathlib import Path

import gradio as gr
import pickle
import torchaudio
import torch
from speechbrain.inference.speaker import EncoderClassifier
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps, collect_chunks

# Pretrained ECAPA-TDNN speaker-embedding model (SpeechBrain); its embeddings
# are the input features for the gender and height predictors below.
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")

# NOTE(review): pickle.load can execute arbitrary code from the file — these
# pickles must ship with the app / come from a trusted source; never load
# user-supplied pickles here.
with open("gender_classifier.pickle", "rb") as file:
    gender_clf = pickle.load(file)

# Height regressors, one per gender class (file suffix 1 = male, 0 = female —
# presumably matching gender_clf's label encoding; TODO confirm).
with open("height_estimator_1.pickle", "rb") as file:
    male_clf = pickle.load(file)

with open("height_estimator_0.pickle", "rb") as file:
    female_clf = pickle.load(file)

# Markdown article rendered below the demo UI.
article_md = Path("Description.md")
# Message returned to the user when no usable audio/speech is available.
error_message = "No speech detected or signal too short!"


def read_markdown_file(file_path):
    """Return the full contents of a UTF-8 text file.

    Args:
        file_path: Path to the file (``str`` or ``pathlib.Path``).

    Returns:
        The file contents as a single string.
    """
    # Path.read_text opens, reads, and closes the file in one call.
    return Path(file_path).read_text(encoding="utf-8")


def metric_to_imperial(height):
    """Format a height given in centimetres as a feet'inches" string.

    The height is first rounded to the nearest whole inch, then split
    into feet and the remaining inches (e.g. 180 cm -> 5'11").
    """
    total_inches = round(height / 2.54)
    feet = int(total_inches / 12)
    remainder = total_inches % 12
    return f"{feet}'{remainder}\""


def get_speech(wav):
    """Return only the speech portions of a waveform, as detected by Silero VAD.

    Args:
        wav: 1-D audio tensor as returned by ``read_audio``.

    Returns:
        The concatenated speech chunks (empty when no speech was detected).
    """
    # Cache the VAD model on the function so repeated calls (one per UI
    # request) don't reload the model from disk every time.
    if not hasattr(get_speech, "_model"):
        get_speech._model = load_silero_vad()
    speech_timestamps = get_speech_timestamps(wav, get_speech._model)
    return collect_chunks(speech_timestamps, wav)


def estimate_height(gender, vad, filepath, imperial):
    """Predict a speaker's gender and height from an audio recording.

    Args:
        gender: "Detect", "Male", "Female" or None; "Detect"/None runs the
            gender classifier on the speaker embedding.
        vad: If True, run voice-activity detection and keep only speech.
        filepath: Path to the audio file, or None when no audio was supplied.
        imperial: If True, format the height in feet/inches, otherwise in cm.

    Returns:
        A string such as ``Male 5'11"`` or ``Female 165 cm``, or the module's
        ``error_message`` when no usable speech signal is available.
    """
    if filepath is None:
        return error_message

    signal = read_audio(filepath)
    if vad:
        signal = get_speech(signal)
    # Guard both an empty/too-short recording and VAD stripping everything;
    # previously this check only ran when VAD was enabled.
    if len(signal) < 1:
        return error_message

    embedding = torch.squeeze(classifier.encode_batch(signal), 0)

    if gender == "Detect" or gender is None:
        # predict() returns an array; extract the scalar so the truthiness
        # checks below are unambiguous (1 = male, 0 = female).
        gender = int(gender_clf.predict(embedding)[0])
    else:
        gender = 1 if gender == "Male" else 0

    height_estimator = male_clf if gender else female_clf
    height = height_estimator.predict(embedding)[0]

    if imperial:
        height = metric_to_imperial(height)
    else:
        height = f"{round(height)} cm"

    return f"{'Male' if gender else 'Female'} {height}"


# Glass theme for the Gradio UI.
theme = gr.themes.Glass()

with gr.Blocks(theme=theme) as demo:
    # Single-page interface wiring estimate_height to the four inputs
    # (gender choice, VAD toggle, audio file, unit toggle).
    gr.Interface(
        fn=estimate_height, inputs=[
            gr.Radio(["Detect", "Male", "Female"], label="Gender of a speaker", value="Detect"),
            gr.Checkbox(label="VAD", info="If there is a lot of silence in your audio, maybe try using VAD"),
            gr.Audio(label="Audio", type="filepath"),
            gr.Checkbox(label="Imperial units")
        ],
        outputs=[gr.Label(label="Prediction")],
        title="Speaker height estimator",
        description="Demo of estimator trained using [HeightCeleb](https://github.com/stachu86/HeightCeleb) dataset",
        allow_flagging="never",
        article=read_markdown_file(article_md)
    )
# NOTE(review): the first positional argument to launch() is passed as a bare
# False — which parameter it binds to depends on the installed Gradio version;
# a keyword argument would be safer. Confirm against the deployed version.
demo.launch(False, debug=True)