Resund committed on
Commit
34b94e6
·
1 Parent(s): dcbba11

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -9
app.py CHANGED
@@ -1,10 +1,20 @@
 
1
  import gradio as gr
2
  import torch
3
- from torchaudio.sox_effects import apply_effects_file
 
 
 
4
  from transformers import AutoFeatureExtractor, AutoModelForAudioXVector
5
 
6
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
7
 
 
 
 
 
 
 
8
  STYLE = """
9
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" integrity="sha256-YvdLHPgkqJ8DVUxjjnGVlMMJtNimJ6dYkowFFvp4kKs=" crossorigin="anonymous">
10
  """
@@ -44,7 +54,7 @@ EFFECTS = [
44
 
45
  THRESHOLD = 0.85
46
 
47
- model_name = "microsoft/unispeech-sat-base-plus-sv"
48
  feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
49
  model = AutoModelForAudioXVector.from_pretrained(model_name).to(device)
50
  cosine_sim = torch.nn.CosineSimilarity(dim=-1)
@@ -53,9 +63,12 @@ cosine_sim = torch.nn.CosineSimilarity(dim=-1)
53
  def similarity_fn(path1, path2):
54
  if not (path1 and path2):
55
  return '<b style="color:red">ERROR: Please record audio for *both* speakers!</b>'
56
-
57
- wav1, _ = apply_effects_file(path1, EFFECTS)
58
- wav2, _ = apply_effects_file(path2, EFFECTS)
 
 
 
59
  print(wav1.shape, wav2.shape)
60
 
61
  input1 = feature_extractor(wav1.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
@@ -89,8 +102,8 @@ description = (
89
  )
90
  article = (
91
  "<p style='text-align: center'>"
92
- "<a href='https://huggingface.co/microsoft/unispeech-sat-large-sv' target='_blank'>πŸŽ™οΈ Learn more about UniSpeech-SAT</a> | "
93
- "<a href='https://arxiv.org/abs/2110.05752' target='_blank'>πŸ“š UniSpeech-SAT paper</a> | "
94
  "<a href='https://www.danielpovey.com/files/2018_icassp_xvectors.pdf' target='_blank'>πŸ“š X-Vector paper</a>"
95
  "</p>"
96
  )
@@ -103,7 +116,7 @@ interface = gr.Interface(
103
  fn=similarity_fn,
104
  inputs=inputs,
105
  outputs=output,
106
- title="Voice Authentication with UniSpeech-SAT + X-Vectors",
107
  description=description,
108
  article=article,
109
  layout="horizontal",
@@ -112,4 +125,4 @@ interface = gr.Interface(
112
  live=False,
113
  examples=examples,
114
  )
115
- interface.launch(enable_queue=True)
 
1
+ import os
2
  import gradio as gr
3
  import torch
4
+ import pydub
5
+ import torchaudio
6
+ from torchaudio.sox_effects import apply_effects_tensor
7
+ import numpy as np
8
  from transformers import AutoFeatureExtractor, AutoModelForAudioXVector
9
 
10
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
11
 
12
def load_audio(file_name):
    """Decode an audio file into a mono float32 waveform.

    Args:
        file_name: Path (or file-like object) handed to
            ``pydub.AudioSegment.from_file``.

    Returns:
        A ``(samples, frame_rate)`` tuple. ``samples`` is a 1-D
        ``np.float32`` array scaled by the integer full-scale value of the
        file's sample width; ``frame_rate`` is the rate reported by pydub.
    """
    audio = pydub.AudioSegment.from_file(file_name)
    samples = np.array(audio.get_array_of_samples(), dtype=np.float32)

    if audio.channels > 1:
        # pydub serialises multi-channel audio as interleaved samples
        # (L, R, L, R, ...). Returning that flat array as-is would make the
        # caller treat it as a mono signal with twice as many frames, so
        # average the channels into a single mono track instead.
        samples = samples.reshape(-1, audio.channels).mean(axis=1)

    # Signed PCM full scale: 2 ** (bits_per_sample - 1).
    full_scale = 1 << (8 * audio.sample_width - 1)
    return (samples / full_scale).astype(np.float32), audio.frame_rate
17
+
18
  STYLE = """
19
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" integrity="sha256-YvdLHPgkqJ8DVUxjjnGVlMMJtNimJ6dYkowFFvp4kKs=" crossorigin="anonymous">
20
  """
 
54
 
55
  THRESHOLD = 0.85
56
 
57
+ model_name = "microsoft/wavlm-base-plus-sv"
58
  feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
59
  model = AutoModelForAudioXVector.from_pretrained(model_name).to(device)
60
  cosine_sim = torch.nn.CosineSimilarity(dim=-1)
 
63
  def similarity_fn(path1, path2):
64
  if not (path1 and path2):
65
  return '<b style="color:red">ERROR: Please record audio for *both* speakers!</b>'
66
+
67
+ wav1, sr1 = load_audio(path1)
68
+ print(wav1, wav1.shape, wav1.dtype)
69
+ wav1, _ = apply_effects_tensor(torch.tensor(wav1).unsqueeze(0), sr1, EFFECTS)
70
+ wav2, sr2 = load_audio(path2)
71
+ wav2, _ = apply_effects_tensor(torch.tensor(wav2).unsqueeze(0), sr2, EFFECTS)
72
  print(wav1.shape, wav2.shape)
73
 
74
  input1 = feature_extractor(wav1.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
 
102
  )
103
  article = (
104
  "<p style='text-align: center'>"
105
+ "<a href='https://huggingface.co/microsoft/wavlm-base-plus-sv' target='_blank'>πŸŽ™οΈ Learn more about WavLM</a> | "
106
+ "<a href='https://arxiv.org/abs/2110.13900' target='_blank'>πŸ“š WavLM paper</a> | "
107
  "<a href='https://www.danielpovey.com/files/2018_icassp_xvectors.pdf' target='_blank'>πŸ“š X-Vector paper</a>"
108
  "</p>"
109
  )
 
116
  fn=similarity_fn,
117
  inputs=inputs,
118
  outputs=output,
119
+ title="Voice Authentication with WavLM + X-Vectors",
120
  description=description,
121
  article=article,
122
  layout="horizontal",
 
125
  live=False,
126
  examples=examples,
127
  )
128
+ interface.launch(enable_queue=True)