Thanish committed
Commit
075b664
1 Parent(s): 1706c64

Update README.md

Files changed (1)
  1. README.md +18 -18
README.md CHANGED
@@ -28,7 +28,7 @@ model-index:

# Wav2Vec2-Large-XLSR-53-Tamil

- Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Tamil using the [Common Voice](https://huggingface.co/datasets/common_voice), ... and ... dataset.
+ Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Tamil using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
When using this model, make sure that your speech input is sampled at 16kHz.

## Usage
@@ -47,13 +47,13 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000)
# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
- \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
- \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
- \treturn batch
+     speech_array, sampling_rate = torchaudio.load(batch["path"])
+     batch["speech"] = resampler(speech_array).squeeze().numpy()
+     return batch
test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
with torch.no_grad():
- \tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+     logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
predicted_ids = torch.argmax(logits, dim=-1)
print("Prediction:", processor.batch_decode(predicted_ids))
print("Reference:", test_dataset["sentence"][:2])
@@ -76,25 +76,25 @@ wer = load_metric("wer")
processor = Wav2Vec2Processor.from_pretrained("{model_id}") # TODO: replace {model_id} with your model id. The model id consists of {your_username}/{your_modelname}, *e.g.* `elgeish/wav2vec2-large-xlsr-53-arabic`
model = Wav2Vec2ForCTC.from_pretrained("{model_id}") # TODO: replace {model_id} with your model id. The model id consists of {your_username}/{your_modelname}, *e.g.* `elgeish/wav2vec2-large-xlsr-53-arabic`
model.to("cuda")
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]' # TODO: adapt this list to include all special characters you removed from the data
+ chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“]' # TODO: adapt this list to include all special characters you removed from the data
resampler = torchaudio.transforms.Resample(48_000, 16_000)
# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
- \tbatch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
- \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
- \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
- \treturn batch
+     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
+     speech_array, sampling_rate = torchaudio.load(batch["path"])
+     batch["speech"] = resampler(speech_array).squeeze().numpy()
+     return batch
test_dataset = test_dataset.map(speech_file_to_array_fn)
# Preprocessing the datasets.
# We need to read the audio files as arrays
def evaluate(batch):
- \tinputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
- \twith torch.no_grad():
- \t\tlogits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
- \tpred_ids = torch.argmax(logits, dim=-1)
- \tbatch["pred_strings"] = processor.batch_decode(pred_ids)
- \treturn batch
+     inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+     with torch.no_grad():
+         logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
+     pred_ids = torch.argmax(logits, dim=-1)
+     batch["pred_strings"] = processor.batch_decode(pred_ids)
+     return batch
result = test_dataset.map(evaluate, batched=True, batch_size=8)
print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
```
@@ -104,6 +104,6 @@ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"],

## Training

- The Common Voice `train`, `validation`, and ... datasets were used for training as well as ... and ... # TODO: adapt to state all the datasets that were used for training.
+ The Common Voice `train` and `validation` datasets were used for training.

- The script used for training can be found [here](...) # TODO: fill in a link to your training script here. If you trained your model in a colab, simply fill in the link here. If you trained the model locally, it would be great if you could upload the training script on github and paste the link here.
+ The script used for training can be found [here](https://colab.research.google.com/drive/1PC2SjxpcWMQ2qmRw21NbP38wtQQUa5os#scrollTo=YKBZdqqJG9Tv).
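The usage and evaluation snippets above hard-code a 48 kHz → 16 kHz resample because Common Voice clips ship at 48 kHz. Audio from other sources still needs 16 kHz input; below is a minimal sketch (assuming only the same `torchaudio` dependency as the card; `load_as_16khz` is a hypothetical helper, not part of this repository) that resamples whatever rate a file actually has:

```python
import torchaudio

def load_as_16khz(path):
    # Hypothetical helper, not from the model card: load a clip and
    # resample it only when its native rate differs from 16 kHz.
    speech_array, sampling_rate = torchaudio.load(path)
    if sampling_rate != 16_000:
        resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)
        speech_array = resampler(speech_array)
    # Drop the size-1 channel dimension of a mono clip and return a
    # NumPy array, matching what `speech_file_to_array_fn` produces.
    return speech_array.squeeze().numpy()
```

The returned array can then be passed to `processor(..., sampling_rate=16_000, return_tensors="pt", padding=True)` exactly as in the usage example.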
 
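As a quick sanity check of the metric reported above, a toy example (assuming the same `datasets` version the card imports `load_metric` from) shows how `wer.compute` scores a prediction: one substituted word out of a four-word reference gives a WER of 0.25.

```python
from datasets import load_metric

wer = load_metric("wer")
# One substitution ("up" -> "down") over a four-word reference -> 0.25.
print(wer.compute(predictions=["the cat sat down"],
                  references=["the cat sat up"]))
```

Multiplying by 100, as the evaluation snippet does, reports this as a percentage.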