|
from pathlib import Path |
|
|
|
import gradio as gr |
|
import pickle |
|
import torchaudio |
|
import torch |
|
from speechbrain.inference.speaker import EncoderClassifier |
|
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps, collect_chunks |
|
|
|
# Speaker-embedding extractor (ECAPA-TDNN trained on VoxCeleb).
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")


def _load_pickle(path):
    """Deserialize a pickled model from *path*."""
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Gender classifier plus one height regressor per gender
# (1 = male, 0 = female — matches the estimator filenames).
gender_clf = _load_pickle("gender_classifier.pickle")
male_clf = _load_pickle("height_estimator_1.pickle")
female_clf = _load_pickle("height_estimator_0.pickle")

# Markdown shown in the Gradio "article" section.
article_md = Path("Description.md")
# Shared message for empty / speech-less audio input.
error_message = "No speech detected or signal too short!"
|
|
|
|
|
def read_markdown_file(file_path):
    """Return the full text of *file_path* decoded as UTF-8.

    Accepts a ``str`` or ``Path``. Uses ``Path.read_text`` instead of a
    manual open/read pair — same behavior, idiomatic pathlib.
    """
    return Path(file_path).read_text(encoding="utf-8")
|
|
|
|
|
def metric_to_imperial(height):
    """Format a height in centimetres as a feet-and-inches string, e.g. 6'0\"."""
    total_inches = round(height / 2.54)
    # int(x / 12) (not //) kept to mirror the original truncation semantics.
    feet = int(total_inches / 12)
    leftover = total_inches % 12
    return "{0}'{1}\"".format(feet, leftover)
|
|
|
|
|
# Lazily-initialized Silero VAD model, shared across calls.
_vad_model = None


def get_speech(wav):
    """Return only the speech portions of *wav* using Silero VAD.

    The VAD model is loaded once on first use and cached — the original
    reloaded it on every invocation, which is wasteful for repeated requests.

    Returns the concatenated speech chunks (empty if no speech was detected).
    """
    global _vad_model
    if _vad_model is None:
        _vad_model = load_silero_vad()
    speech_timestamps = get_speech_timestamps(wav, _vad_model)
    return collect_chunks(speech_timestamps, wav)
|
|
|
|
|
def estimate_height(gender, vad, filepath, imperial):
    """Predict a speaker's gender and height from an audio file.

    Parameters
    ----------
    gender : str or None
        "Detect", "Male" or "Female" (radio choice); "Detect"/None triggers
        automatic classification from the embedding.
    vad : bool
        If True, strip non-speech with Silero VAD before embedding.
    filepath : str or None
        Path to the uploaded audio file (Gradio `type="filepath"`).
    imperial : bool
        If True, format height as feet'inches", otherwise as "<n> cm".

    Returns
    -------
    str
        "<Male|Female> <height>" on success, or the shared error message.
    """
    if filepath is None:
        return error_message
    signal = read_audio(filepath)
    if vad:
        signal = get_speech(signal)
    if len(signal) < 1:
        return error_message

    # Batch dimension is squeezed away so the classifier sees a 2-D matrix.
    embedding = torch.squeeze(classifier.encode_batch(signal), 0)
    if gender == "Detect" or gender is None:
        # predict() returns an array-like; take the scalar explicitly —
        # truth-testing a bare one-element array is fragile/deprecated in NumPy.
        gender = int(gender_clf.predict(embedding)[0])
    else:
        gender = 1 if gender == "Male" else 0

    # 1 = male, 0 = female (matches the pickled estimator naming).
    height_estimator = male_clf if gender else female_clf
    height = height_estimator.predict(embedding)[0]

    if imperial:
        height = metric_to_imperial(height)
    else:
        height = f"{round(height)} cm"

    return f"{'Male' if gender else 'Female'} {height}"
|
|
|
|
|
# Translucent "Glass" theme applied to the whole app.
theme = gr.themes.Glass()

with gr.Blocks(theme=theme) as demo:
    gr.Interface(
        # Input order must match estimate_height(gender, vad, filepath, imperial).
        fn=estimate_height, inputs=[
            gr.Radio(["Detect", "Male", "Female"], label="Gender of a speaker", value="Detect"),
            gr.Checkbox(label="VAD", info="If there is a lot of silence in your audio, maybe try using VAD"),
            gr.Audio(label="Audio", type="filepath"),
            gr.Checkbox(label="Imperial units")
        ],
        outputs=[gr.Label(label="Prediction")],
        title="Speaker height estimator",
        description="Demo of estimator trained using [HeightCeleb](https://github.com/stachu86/HeightCeleb) dataset",
        allow_flagging="never",
        # Long-form markdown rendered below the interface.
        article=read_markdown_file(article_md)
    )

# NOTE(review): the positional False binds to launch()'s first parameter
# (`inline` in current Gradio, not `share`) — confirm this is intentional;
# a keyword argument would make the intent explicit.
demo.launch(False, debug=True)