Muhammad Nafish Zaldinanda committed on
Commit
f1d3793
1 Parent(s): 6480238

Training in progress, step 1000

XLSR.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,117 @@
+ {
+   "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
+   "activation_dropout": 0.0,
+   "adapter_attn_dim": null,
+   "adapter_kernel_size": 3,
+   "adapter_stride": 2,
+   "add_adapter": false,
+   "apply_spec_augment": true,
+   "architectures": [
+     "Wav2Vec2ForCTC"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "classifier_proj_size": 256,
+   "codevector_dim": 768,
+   "contrastive_logits_temperature": 0.1,
+   "conv_bias": true,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "mean",
+   "ctc_zero_infinity": false,
+   "diversity_loss_weight": 0.1,
+   "do_stable_layer_norm": true,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "layer",
+   "feat_proj_dropout": 0.0,
+   "feat_quantizer_dropout": 0.0,
+   "final_dropout": 0.0,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_channel_length": 10,
+   "mask_channel_min_space": 1,
+   "mask_channel_other": 0.0,
+   "mask_channel_prob": 0.0,
+   "mask_channel_selection": "static",
+   "mask_feature_length": 10,
+   "mask_feature_min_masks": 0,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_masks": 2,
+   "mask_time_min_space": 1,
+   "mask_time_other": 0.0,
+   "mask_time_prob": 0.05,
+   "mask_time_selection": "static",
+   "model_type": "wav2vec2",
+   "num_adapter_layers": 3,
+   "num_attention_heads": 16,
+   "num_codevector_groups": 2,
+   "num_codevectors_per_group": 320,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 24,
+   "num_negatives": 100,
+   "output_hidden_size": 1024,
+   "pad_token_id": 28,
+   "proj_codevector_dim": 768,
+   "tdnn_dilation": [
+     1,
+     2,
+     3,
+     1,
+     1
+   ],
+   "tdnn_dim": [
+     512,
+     512,
+     512,
+     512,
+     1500
+   ],
+   "tdnn_kernel": [
+     5,
+     3,
+     3,
+     1,
+     1
+   ],
+   "torch_dtype": "float32",
+   "transformers_version": "4.35.2",
+   "use_weighted_layer_sum": false,
+   "vocab_size": 31,
+   "xvector_output_dim": 512
+ }
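
This config is the facebook/wav2vec2-large-xlsr-53 architecture with a 31-token CTC head on top. A minimal sketch of sanity-checking the committed config with transformers; the "." path is a placeholder for a local clone of this repo, not something the commit specifies:

    from transformers import Wav2Vec2Config, Wav2Vec2ForCTC

    # Load the committed config; "." is a hypothetical local checkout of this repo.
    config = Wav2Vec2Config.from_pretrained(".")
    assert config.vocab_size == 31    # 29 vocab.json entries plus <s>/</s> added tokens
    assert config.pad_token_id == 28  # [PAD] doubles as the CTC blank

    # Instantiate the CTC model this config describes (weights come from model.safetensors).
    model = Wav2Vec2ForCTC.from_pretrained(".")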
inference.ipynb ADDED
@@ -0,0 +1 @@
+ {"cells":[{"cell_type":"code","execution_count":1,"metadata":{"id":"Rfsio4paNmF-","executionInfo":{"status":"ok","timestamp":1704990774869,"user_tz":-420,"elapsed":61531,"user":{"displayName":"Atmatech Dev","userId":"08685432457477166628"}}},"outputs":[],"source":["%%capture\n","!pip install datasets==1.4.1\n","!pip install transformers\n","!pip install torchaudio\n","!pip install librosa\n","!pip install jiwer\n","!pip install pydub==0.25.1"]},{"cell_type":"code","execution_count":2,"metadata":{"id":"UW_rtVdGdoEt","executionInfo":{"status":"ok","timestamp":1704990805764,"user_tz":-420,"elapsed":5203,"user":{"displayName":"Atmatech Dev","userId":"08685432457477166628"}}},"outputs":[],"source":["import os, sys\n","import torch\n","import torchaudio\n","import librosa\n","import numpy as np\n","from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC\n","from pydub import AudioSegment"]},{"cell_type":"code","source":["from google.colab import drive\n","drive.mount('/content/drive/')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"WXr3fpZqXdw4","executionInfo":{"status":"ok","timestamp":1704990835065,"user_tz":-420,"elapsed":27776,"user":{"displayName":"Atmatech Dev","userId":"08685432457477166628"}},"outputId":"9b19a117-1f73-4c7d-970d-45f5ae657575"},"execution_count":3,"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/drive/\n"]}]},{"cell_type":"code","source":["use_device = torch.device(\"cpu\")\n","model = Wav2Vec2ForCTC.from_pretrained(\"/content/drive/MyDrive/machine-learning/speech-to-text/huggingface/pytorch/finetuning/dataset-comparison/wav2vec2-xlsr/checkpoint-141200\").to(use_device)\n","processor = Wav2Vec2Processor.from_pretrained(\"/content/drive/MyDrive/machine-learning/speech-to-text/huggingface/pytorch/finetuning/dataset-comparison/wav2vec2-xlsr/processor\")"],"metadata":{"id":"iNIfh73xWXAm"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["audio_path = \"/content/drive/MyDrive/machine-learning/speech-to-text/data/predict/id/testing_1.wav\"\n","filename = os.path.basename(audio_path)\n","audio_filename = filename.split(\".\")[0]\n","audio_format = filename.split(\".\")[1]\n","audio = AudioSegment.from_file_using_temporary_files(audio_path, format=audio_format)\n","audio_filename = str(\"/content/\") + str(audio_filename) + \".mp3\"\n","audio.export(audio_filename, format=\"mp3\", parameters=[\"-ac\", \"1\", \"-ar\", \"16000\"])"],"metadata":{"id":"TDqul7nYWUvb"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["def resample(speech_array, sampling_rate):\n"," if sampling_rate == 48000:\n"," speech_array = librosa.resample(np.asarray(speech_array), 48_000, 16_000)\n","\n"," elif sampling_rate == 44100:\n"," speech_array = librosa.resample(np.asarray(speech_array), 44100, 16_000)\n","\n"," elif sampling_rate == 32000:\n"," speech_array = librosa.resample(np.asarray(speech_array), 32000, 16_000)\n"," return speech_array"],"metadata":{"id":"LuPSy0CVYBXU"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["speech_array, sampling_rate = torchaudio.load(audio_filename)\n","speech_array = resample(speech_array[0].numpy(), sampling_rate)\n","input_values = processor(speech_array, sampling_rate=16_000).input_values[0]\n","input_dict = processor(input_values, return_tensors=\"pt\", padding=True)"],"metadata":{"id":"7jzbDTW0YGun"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["logits = model(input_dict.input_values.to(use_device)).logits\n","pred_ids = torch.argmax(logits, dim=-1)[0]\n","text = processor.decode(pred_ids)\n","print(text)"],"metadata":{"id":"EM3SNKTQYJtW"},"execution_count":null,"outputs":[]},{"cell_type":"code","execution_count":null,"metadata":{"id":"p_zvrwOfN-9Y"},"outputs":[],"source":["import torch\n","from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor\n","from torchaudio.transforms import Resample\n","from IPython.display import Audio\n","from pydub import AudioSegment\n","\n","# Replace with the path to your fine-tuned model\n","fine_tuned_model_path = \"path_to_your_fine_tuned_model\"\n","\n","# Load the processor and fine-tuned model\n","processor = Wav2Vec2Processor.from_pretrained(fine_tuned_model_path)\n","model = Wav2Vec2ForCTC.from_pretrained(fine_tuned_model_path)\n","\n","# Load the audio file in mp3 format\n","mp3_audio_file_path = \"path_to_your_audio_file.mp3\"\n","audio = AudioSegment.from_file(mp3_audio_file_path, format=\"mp3\")\n","\n","# Resample the audio if needed\n","waveform, sample_rate = torch.tensor(audio.raw_data).to(torch.float32), audio.frame_rate\n","if sample_rate != processor.feature_extractor.sampling_rate:\n"," resampler = Resample(orig_freq=sample_rate, new_freq=processor.feature_extractor.sampling_rate)\n"," waveform = resampler(waveform)\n","\n","# Tokenize and transcribe\n","input_values = processor(waveform.numpy(), return_tensors=\"pt\", padding=\"longest\").input_values\n","with torch.no_grad():\n"," logits = model(input_values).logits\n"," predicted_ids = torch.argmax(logits, dim=-1)\n","\n","# Decode the predicted transcription\n","transcription = processor.batch_decode(predicted_ids)[0]\n","print(\"Predicted Transcription:\", transcription)\n","\n","# Play the original mp3 audio\n","Audio(mp3_audio_file_path)\n"]}],"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyMj/sdpm6gj8SN2GzFFGg8D"},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0}
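
Two caveats on the notebook's code: its resample helper calls librosa.resample with positional rates, an API removed in librosa 0.10 (keyword arguments are now required), and the last cell builds a tensor directly from pydub's raw_data bytes, which does not yield valid float audio. A corrected minimal inference sketch under those constraints; the checkpoint directory and audio file name are placeholders, not paths fixed by this commit:

    import torch
    import librosa
    from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

    ckpt = "path/to/checkpoint"  # hypothetical; e.g. the checkpoint-141200 folder from the notebook
    processor = Wav2Vec2Processor.from_pretrained(ckpt)
    model = Wav2Vec2ForCTC.from_pretrained(ckpt).eval()

    # librosa decodes common formats and resamples to 16 kHz mono in one call
    # (librosa >= 0.10 requires keyword arguments for sample rates).
    speech, sr = librosa.load("audio.wav", sr=16_000, mono=True)

    inputs = processor(speech, sampling_rate=sr, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    pred_ids = torch.argmax(logits, dim=-1)
    print(processor.batch_decode(pred_ids)[0])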
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9505669872b1d8ba1d49509324521dbec46592d3c89852177343dec7db1f6d71
+ size 1261934580
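
These three lines are a Git LFS pointer, not the weights themselves: LFS fetches the real ~1.26 GB file whose SHA-256 must equal the oid above. A quick integrity check of a downloaded copy (the local file name is an assumption):

    import hashlib

    def sha256_of(path: str, chunk: int = 1 << 20) -> str:
        # Stream the file in 1 MiB chunks so a 1.26 GB model fits in memory.
        h = hashlib.sha256()
        with open(path, "rb") as f:
            while block := f.read(chunk):
                h.update(block)
        return h.hexdigest()

    # Should print the oid from the pointer if LFS fetched the file intact.
    print(sha256_of("model.safetensors"))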
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "processor_class": "Wav2Vec2Processor",
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
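
This feature extractor config declares per-utterance normalization of raw 16 kHz mono waveforms (feature_size 1), right-padded with 0.0 and accompanied by an attention mask. A sketch of what it does to a waveform, again assuming a local checkout at ".":

    import numpy as np
    from transformers import Wav2Vec2FeatureExtractor

    fe = Wav2Vec2FeatureExtractor.from_pretrained(".")  # "." is a placeholder for this repo

    # One second of synthetic 16 kHz audio; do_normalize standardizes each utterance.
    wave = np.random.randn(16_000).astype(np.float32)
    batch = fe(wave, sampling_rate=16_000, return_tensors="np", padding=True)
    print(batch.input_values.shape)   # (1, 16000)
    print(batch.input_values.mean())  # ~0 after normalization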
processor/added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "</s>": 30,
+   "<s>": 29
+ }
processor/preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "processor_class": "Wav2Vec2Processor",
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
processor/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "pad_token": "[PAD]",
+   "unk_token": "[UNK]"
+ }
processor/tokenizer_config.json ADDED
@@ -0,0 +1,48 @@
+ {
+   "added_tokens_decoder": {
+     "27": {
+       "content": "[UNK]",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": false
+     },
+     "28": {
+       "content": "[PAD]",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": false
+     },
+     "29": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "30": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "do_lower_case": false,
+   "eos_token": "</s>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "processor_class": "Wav2Vec2Processor",
+   "replace_word_delimiter_char": " ",
+   "target_lang": null,
+   "tokenizer_class": "Wav2Vec2CTCTokenizer",
+   "unk_token": "[UNK]",
+   "word_delimiter_token": "|"
+ }
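
The tokenizer is a character-level Wav2Vec2CTCTokenizer: "|" is the word delimiter that decodes back to spaces, and [PAD] (id 28, matching pad_token_id in config.json) doubles as the CTC blank. A usage sketch loading it from the processor/ folder of this commit; the round-trip text is an arbitrary example:

    from transformers import Wav2Vec2CTCTokenizer

    tok = Wav2Vec2CTCTokenizer.from_pretrained("processor")

    ids = tok("halo dunia").input_ids
    print(ids)              # per-character ids; the space is encoded as "|" (id 24)
    # decode() merges consecutive repeated ids (CTC-style); safe here since
    # no character repeats back-to-back in "halo dunia".
    print(tok.decode(ids))  # "halo dunia"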
processor/vocab.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "[PAD]": 28,
+   "[UNK]": 27,
+   "a": 2,
+   "b": 16,
+   "c": 23,
+   "d": 21,
+   "e": 15,
+   "f": 12,
+   "g": 4,
+   "h": 7,
+   "i": 18,
+   "j": 8,
+   "k": 6,
+   "l": 19,
+   "m": 10,
+   "n": 5,
+   "o": 25,
+   "p": 9,
+   "q": 17,
+   "r": 13,
+   "s": 26,
+   "t": 1,
+   "u": 22,
+   "v": 11,
+   "w": 20,
+   "x": 3,
+   "y": 0,
+   "z": 14,
+   "|": 24
+ }
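
These 29 entries are what the CTC head's logit dimensions index into (plus <s>/</s> from added_tokens.json, giving the vocab_size of 31 in config.json). A minimal sketch of the greedy CTC decoding the processor performs internally: collapse consecutive repeats, drop the [PAD] blank, then map "|" back to spaces. The example id sequence is made up for illustration:

    import json
    from itertools import groupby

    with open("processor/vocab.json") as f:
        vocab = json.load(f)
    id2char = {i: c for c, i in vocab.items()}

    def greedy_ctc_decode(ids, blank_id=28):            # 28 == [PAD], the CTC blank
        collapsed = [i for i, _ in groupby(ids)]        # merge consecutive repeats
        chars = [id2char[i] for i in collapsed if i != blank_id]
        return "".join(chars).replace("|", " ")         # "|" is the word delimiter

    # Frame-level argmax ids that spell "ya" (y=0, a=2, blank between repeats):
    print(greedy_ctc_decode([0, 0, 28, 2, 2]))  # -> "ya"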
runs/Jan11_16-21-53_5b0d62e2a2dd/events.out.tfevents.1704990137.5b0d62e2a2dd.533.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9b71fdaa5d469dad8c279c7be05e790a0376d295a8bf817b3729eb4fbb9a606c
+ size 12784
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b12eda2eeacdd6eded1b650123f4b19c02d6860b56a6a3ad9341f20b7178e090
+ size 4792
vocab.json ADDED
@@ -0,0 +1 @@
+ {"y": 0, "t": 1, "a": 2, "x": 3, "g": 4, "n": 5, "k": 6, "h": 7, "j": 8, "p": 9, "m": 10, "v": 11, "f": 12, "r": 13, "z": 14, "e": 15, "b": 16, "q": 17, "i": 18, "l": 19, "w": 20, "d": 21, "u": 22, "c": 23, "o": 25, "s": 26, "|": 24, "[UNK]": 27, "[PAD]": 28}
wav2vec2-large-xlsr-indonesian.ipynb ADDED
The diff for this file is too large to render. See raw diff