import gradio as gr
import torch
from nemo.collections.asr.models import EncDecSpeakerLabelModel


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

STYLE = """
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" integrity="sha256-YvdLHPgkqJ8DVUxjjnGVlMMJtNimJ6dYkowFFvp4kKs=" crossorigin="anonymous">
"""
OUTPUT_OK = (
    STYLE
    + """
    <div class="container">
        <div class="row"><h1 style="text-align: center">The provided samples are from</h1></div>
        <div class="row"><h1 class="text-success" style="text-align: center">the Same Speaker!!!</h1></div>
        <div class="row"><h1 class="display-1 text-success" style="text-align: center">similarity score: {:.1f}%</h1></div>
        <div class="row"><small style="text-align: center">(Similarity score must be at least 80% to be considered the same speaker)</small></div>
    </div>
"""
)
OUTPUT_FAIL = (
    STYLE
    + """
    <div class="container">
        <div class="row"><h1 style="text-align: center">The provided samples are from</h1></div>
        <div class="row"><h1 class="text-danger" style="text-align: center">Different Speakers!!!</h1></div>
        <div class="row"><h1 class="display-1 text-danger" style="text-align: center">similarity score: {:.1f}%</h1></div>
        <div class="row"><small style="text-align: center">(Similarity score must be at least 80% to be considered the same speaker)</small></div>
    </div>
"""
)

THRESHOLD = 0.80  # minimum normalized similarity (in [0, 1]) required to call two samples the same speaker

# Load the pretrained TitaNet-Large speaker verification model
model_name = "nvidia/speakerverification_en_titanet_large"
model = EncDecSpeakerLabelModel.from_pretrained(model_name).to(device)


def compare_samples(path1, path2):
    """Compare two audio files and return an HTML verdict with a similarity score."""
    if not (path1 and path2):
        return '<b style="color:red">ERROR: Please record audio for *both* speakers!</b>'

    # Extract a speaker embedding for each recording
    embs1 = model.get_embedding(path1).squeeze()
    embs2 = model.get_embedding(path2).squeeze()

    # Length-normalize the embeddings
    X = embs1 / torch.linalg.norm(embs1)
    Y = embs2 / torch.linalg.norm(embs2)

    # Cosine similarity, mapped from [-1, 1] to [0, 1]
    similarity_score = torch.dot(X, Y) / ((torch.dot(X, X) * torch.dot(Y, Y)) ** 0.5)
    similarity_score = (similarity_score + 1) / 2

    # Verdict: same speaker if the score clears the threshold
    if similarity_score >= THRESHOLD:
        return OUTPUT_OK.format(similarity_score * 100)
    else:
        return OUTPUT_FAIL.format(similarity_score * 100)


inputs = [
    gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #1"),
    gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #2"),
]

upload_inputs = [
    gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Speaker #1"),
    gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Speaker #2"),
]

description = (
    "This demo analyzes two speech recordings and determines whether they were spoken by the same person.\n"
    "Try it with your own voice!"
)
article = (
    "<p style='text-align: center'>"
    "<a href='https://huggingface.co/nvidia/speakerverification_en_titanet_large' target='_blank'>πŸŽ™οΈ Learn more about TitaNet model</a> | "
    "<a href='https://arxiv.org/pdf/2110.04410.pdf' target='_blank'>πŸ“š TitaNet paper</a> | "
    "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>πŸ§‘β€πŸ’» Repository</a>"
    "</p>"
)
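# Example audio pairs: the first two rows pair clips from the same speaker; the last two mix different speakers.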
examples = [
    ["data/id10270_5r0dWxy17C8-00001.wav", "data/id10270_5r0dWxy17C8-00002.wav"],
    ["data/id10271_1gtz-CUIygI-00001.wav", "data/id10271_1gtz-CUIygI-00002.wav"],
    ["data/id10270_5r0dWxy17C8-00001.wav", "data/id10271_1gtz-CUIygI-00001.wav"],
    ["data/id10270_5r0dWxy17C8-00002.wav", "data/id10271_1gtz-CUIygI-00002.wav"],
]

microphone_interface = gr.Interface(
    fn=compare_samples,
    inputs=inputs,
    outputs=gr.outputs.HTML(label=""),
    title="Speaker Verification with TitaNet Embeddings",
    description=description,
    article=article,
    layout="horizontal",
    theme="huggingface",
    allow_flagging=False,
    live=False,
    examples=examples,
)

upload_interface = gr.Interface(
    fn=compare_samples,
    inputs=upload_inputs,
    outputs=gr.outputs.HTML(label=""),
    title="Speaker Verification with TitaNet Embeddings",
    description=description,
    article=article,
    layout="horizontal",
    theme="huggingface",
    allow_flagging=False,
    live=False,
    examples=examples,
)

# Combine the microphone and file-upload interfaces into one tabbed app
demo = gr.TabbedInterface([microphone_interface, upload_interface], ["Microphone", "Upload File"])

demo.launch(enable_queue=True)