Muhammad Nafish Zaldinanda committed
Commit f1d3793
Parent(s): 6480238

Training in progress, step 1000

Browse files
- XLSR.ipynb +0 -0
- config.json +117 -0
- inference.ipynb +1 -0
- model.safetensors +3 -0
- preprocessor_config.json +10 -0
- processor/added_tokens.json +4 -0
- processor/preprocessor_config.json +10 -0
- processor/special_tokens_map.json +6 -0
- processor/tokenizer_config.json +48 -0
- processor/vocab.json +31 -0
- runs/Jan11_16-21-53_5b0d62e2a2dd/events.out.tfevents.1704990137.5b0d62e2a2dd.533.0 +3 -0
- training_args.bin +3 -0
- vocab.json +1 -0
- wav2vec2-large-xlsr-indonesian.ipynb +0 -0
XLSR.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
config.json
ADDED
@@ -0,0 +1,117 @@
+ {
+   "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
+   "activation_dropout": 0.0,
+   "adapter_attn_dim": null,
+   "adapter_kernel_size": 3,
+   "adapter_stride": 2,
+   "add_adapter": false,
+   "apply_spec_augment": true,
+   "architectures": [
+     "Wav2Vec2ForCTC"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "classifier_proj_size": 256,
+   "codevector_dim": 768,
+   "contrastive_logits_temperature": 0.1,
+   "conv_bias": true,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "mean",
+   "ctc_zero_infinity": false,
+   "diversity_loss_weight": 0.1,
+   "do_stable_layer_norm": true,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "layer",
+   "feat_proj_dropout": 0.0,
+   "feat_quantizer_dropout": 0.0,
+   "final_dropout": 0.0,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_channel_length": 10,
+   "mask_channel_min_space": 1,
+   "mask_channel_other": 0.0,
+   "mask_channel_prob": 0.0,
+   "mask_channel_selection": "static",
+   "mask_feature_length": 10,
+   "mask_feature_min_masks": 0,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_masks": 2,
+   "mask_time_min_space": 1,
+   "mask_time_other": 0.0,
+   "mask_time_prob": 0.05,
+   "mask_time_selection": "static",
+   "model_type": "wav2vec2",
+   "num_adapter_layers": 3,
+   "num_attention_heads": 16,
+   "num_codevector_groups": 2,
+   "num_codevectors_per_group": 320,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 24,
+   "num_negatives": 100,
+   "output_hidden_size": 1024,
+   "pad_token_id": 28,
+   "proj_codevector_dim": 768,
+   "tdnn_dilation": [
+     1,
+     2,
+     3,
+     1,
+     1
+   ],
+   "tdnn_dim": [
+     512,
+     512,
+     512,
+     512,
+     1500
+   ],
+   "tdnn_kernel": [
+     5,
+     3,
+     3,
+     1,
+     1
+   ],
+   "torch_dtype": "float32",
+   "transformers_version": "4.35.2",
+   "use_weighted_layer_sum": false,
+   "vocab_size": 31,
+   "xvector_output_dim": 512
+ }
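For reference, this config.json is what `transformers` parses when the checkpoint is loaded; a minimal sketch (the repository path below is a placeholder, not part of this commit):

```python
# Minimal sketch: load the checkpoint described by config.json.
# "path/to/this/repo" is a placeholder for a local clone or a Hub repo id.
from transformers import Wav2Vec2Config, Wav2Vec2ForCTC

config = Wav2Vec2Config.from_pretrained("path/to/this/repo")   # parses config.json
model = Wav2Vec2ForCTC.from_pretrained("path/to/this/repo")    # loads model.safetensors

# The CTC head is sized by vocab_size (31) from this config.
print(model.lm_head.out_features == config.vocab_size)
```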
inference.ipynb
ADDED
@@ -0,0 +1 @@
+ {"cells":[{"cell_type":"code","execution_count":1,"metadata":{"id":"Rfsio4paNmF-","executionInfo":{"status":"ok","timestamp":1704990774869,"user_tz":-420,"elapsed":61531,"user":{"displayName":"Atmatech Dev","userId":"08685432457477166628"}}},"outputs":[],"source":["%%capture\n","!pip install datasets==1.4.1\n","!pip install transformers\n","!pip install torchaudio\n","!pip install librosa\n","!pip install jiwer\n","!pip install pydub==0.25.1"]},{"cell_type":"code","execution_count":2,"metadata":{"id":"UW_rtVdGdoEt","executionInfo":{"status":"ok","timestamp":1704990805764,"user_tz":-420,"elapsed":5203,"user":{"displayName":"Atmatech Dev","userId":"08685432457477166628"}}},"outputs":[],"source":["import os, sys\n","import torch\n","import torchaudio\n","import librosa\n","import numpy as np\n","from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC\n","from pydub import AudioSegment"]},{"cell_type":"code","source":["from google.colab import drive\n","drive.mount('/content/drive/')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"WXr3fpZqXdw4","executionInfo":{"status":"ok","timestamp":1704990835065,"user_tz":-420,"elapsed":27776,"user":{"displayName":"Atmatech Dev","userId":"08685432457477166628"}},"outputId":"9b19a117-1f73-4c7d-970d-45f5ae657575"},"execution_count":3,"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/drive/\n"]}]},{"cell_type":"code","source":["use_device = torch.device(\"cpu\")\n","model = Wav2Vec2ForCTC.from_pretrained(\"/content/drive/MyDrive/machine-learning/speech-to-text/huggingface/pytorch/finetuning/dataset-comparison/wav2vec2-xlsr/checkpoint-141200\").to(use_device)\n","processor = Wav2Vec2Processor.from_pretrained(\"/content/drive/MyDrive/machine-learning/speech-to-text/huggingface/pytorch/finetuning/dataset-comparison/wav2vec2-xlsr/processor\")"],"metadata":{"id":"iNIfh73xWXAm"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["audio_path = \"/content/drive/MyDrive/machine-learning/speech-to-text/data/predict/id/testing_1.wav\"\n","filename = os.path.basename(audio_path)\n","audio_filename = filename.split(\".\")[0]\n","audio_format = filename.split(\".\")[1]\n","audio = AudioSegment.from_file_using_temporary_files(audio_path, format=audio_format)\n","audio_filename = str(\"/content/\") + str(audio_filename) + \".mp3\"\n","audio.export(audio_filename, format=\"mp3\", parameters=[\"-ac\", \"1\", \"-ar\", \"16000\"])"],"metadata":{"id":"TDqul7nYWUvb"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["def resample(speech_array, sampling_rate):\n"," if sampling_rate == 48000:\n"," speech_array = librosa.resample(np.asarray(speech_array), 48_000, 16_000)\n","\n"," elif sampling_rate == 44100:\n"," speech_array = librosa.resample(np.asarray(speech_array), 44100, 16_000)\n","\n"," elif sampling_rate == 32000:\n"," speech_array = librosa.resample(np.asarray(speech_array), 32000, 16_000)\n"," return speech_array"],"metadata":{"id":"LuPSy0CVYBXU"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["speech_array, sampling_rate = torchaudio.load(audio_filename)\n","speech_array = resample(speech_array[0].numpy(), sampling_rate)\n","input_values = processor(speech_array, sampling_rate=16_000).input_values[0]\n","input_dict = processor(input_values, return_tensors=\"pt\", padding=True)"],"metadata":{"id":"7jzbDTW0YGun"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["logits = model(input_dict.input_values.to(use_device)).logits\n","pred_ids = torch.argmax(logits, dim=-1)[0]\n","text = processor.decode(pred_ids)\n","print(text)"],"metadata":{"id":"EM3SNKTQYJtW"},"execution_count":null,"outputs":[]},{"cell_type":"code","execution_count":null,"metadata":{"id":"p_zvrwOfN-9Y"},"outputs":[],"source":["import torch\n","from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor\n","from torchaudio.transforms import Resample\n","from IPython.display import Audio\n","from pydub import AudioSegment\n","\n","# Replace with the path to the fine-tuned model\n","fine_tuned_model_path = \"path_to_your_fine_tuned_model\"\n","\n","# Load the processor and fine-tuned model\n","processor = Wav2Vec2Processor.from_pretrained(fine_tuned_model_path)\n","model = Wav2Vec2ForCTC.from_pretrained(fine_tuned_model_path)\n","\n","# Load the audio file in mp3 format\n","mp3_audio_file_path = \"path_to_your_audio_file.mp3\"\n","audio = AudioSegment.from_file(mp3_audio_file_path, format=\"mp3\")\n","\n","# Resample the audio if needed\n","waveform, sample_rate = torch.tensor(audio.raw_data).to(torch.float32), audio.frame_rate\n","if sample_rate != processor.feature_extractor.sampling_rate:\n"," resampler = Resample(orig_freq=sample_rate, new_freq=processor.feature_extractor.sampling_rate)\n"," waveform = resampler(waveform)\n","\n","# Tokenize and transcribe\n","input_values = processor(waveform.numpy(), return_tensors=\"pt\", padding=\"longest\").input_values\n","with torch.no_grad():\n"," logits = model(input_values).logits\n"," predicted_ids = torch.argmax(logits, dim=-1)\n","\n","# Decode the predicted transcription\n","transcription = processor.batch_decode(predicted_ids)[0]\n","print(\"Predicted Transcription:\", transcription)\n","\n","# Play the original mp3 audio\n","Audio(mp3_audio_file_path)\n"]}],"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyMj/sdpm6gj8SN2GzFFGg8D"},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0}
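The notebook above mounts Google Drive, converts the input audio to 16 kHz mono, and decodes it with a fine-tuned checkpoint. A condensed sketch of the same flow (all paths are placeholders; assumes transformers and torchaudio are installed):

```python
# Condensed sketch of the inference flow in inference.ipynb.
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

model_dir = "path/to/checkpoint"      # placeholder: a saved checkpoint directory
processor_dir = "path/to/processor"   # placeholder: the processor/ folder in this commit

processor = Wav2Vec2Processor.from_pretrained(processor_dir)
model = Wav2Vec2ForCTC.from_pretrained(model_dir).eval()

# Load the audio and resample to the 16 kHz rate expected by the feature extractor.
waveform, sr = torchaudio.load("path/to/audio.wav")   # placeholder
if sr != 16_000:
    waveform = torchaudio.functional.resample(waveform, sr, 16_000)

inputs = processor(waveform[0].numpy(), sampling_rate=16_000, return_tensors="pt")
with torch.no_grad():
    logits = model(inputs.input_values).logits

pred_ids = torch.argmax(logits, dim=-1)[0]
print(processor.decode(pred_ids))
```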
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9505669872b1d8ba1d49509324521dbec46592d3c89852177343dec7db1f6d71
+ size 1261934580
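The three lines above are a Git LFS pointer rather than the weights themselves; the ~1.26 GB safetensors file is fetched with `git lfs pull`, after which it can be inspected directly. A sketch, assuming the safetensors package is available:

```python
# Sketch: after `git lfs pull`, load the weights and list a few tensors.
from safetensors.torch import load_file

state_dict = load_file("model.safetensors")
for name in list(state_dict)[:5]:
    print(name, tuple(state_dict[name].shape))
```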
preprocessor_config.json
ADDED
@@ -0,0 +1,10 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "processor_class": "Wav2Vec2Processor",
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
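This file configures the feature-extractor half of the processor: 16 kHz input, per-utterance normalization, and an attention mask for padded batches. A small sketch (the path is a placeholder):

```python
# Sketch: the feature extractor described by preprocessor_config.json.
from transformers import Wav2Vec2FeatureExtractor

fe = Wav2Vec2FeatureExtractor.from_pretrained("path/to/this/repo")
print(fe.sampling_rate, fe.do_normalize, fe.return_attention_mask)  # 16000 True True
```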
processor/added_tokens.json
ADDED
@@ -0,0 +1,4 @@
+ {
+   "</s>": 30,
+   "<s>": 29
+ }
processor/preprocessor_config.json
ADDED
@@ -0,0 +1,10 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "processor_class": "Wav2Vec2Processor",
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
processor/special_tokens_map.json
ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "pad_token": "[PAD]",
+   "unk_token": "[UNK]"
+ }
processor/tokenizer_config.json
ADDED
@@ -0,0 +1,48 @@
+ {
+   "added_tokens_decoder": {
+     "27": {
+       "content": "[UNK]",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": false
+     },
+     "28": {
+       "content": "[PAD]",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": true,
+       "single_word": false,
+       "special": false
+     },
+     "29": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "30": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "do_lower_case": false,
+   "eos_token": "</s>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "processor_class": "Wav2Vec2Processor",
+   "replace_word_delimiter_char": " ",
+   "target_lang": null,
+   "tokenizer_class": "Wav2Vec2CTCTokenizer",
+   "unk_token": "[UNK]",
+   "word_delimiter_token": "|"
+ }
processor/vocab.json
ADDED
@@ -0,0 +1,31 @@
+ {
+   "[PAD]": 28,
+   "[UNK]": 27,
+   "a": 2,
+   "b": 16,
+   "c": 23,
+   "d": 21,
+   "e": 15,
+   "f": 12,
+   "g": 4,
+   "h": 7,
+   "i": 18,
+   "j": 8,
+   "k": 6,
+   "l": 19,
+   "m": 10,
+   "n": 5,
+   "o": 25,
+   "p": 9,
+   "q": 17,
+   "r": 13,
+   "s": 26,
+   "t": 1,
+   "u": 22,
+   "v": 11,
+   "w": 20,
+   "x": 3,
+   "y": 0,
+   "z": 14,
+   "|": 24
+ }
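This is the character-level CTC vocabulary: 26 letters, `|` as the word delimiter, `[UNK]` and `[PAD]`, plus the `<s>`/`</s>` added tokens, giving the vocab_size of 31 in config.json. A sketch showing how a tokenizer can be rebuilt from it (the example tokens are arbitrary):

```python
# Sketch: build the CTC tokenizer from processor/vocab.json.
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer(
    "processor/vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
print(tokenizer.convert_tokens_to_ids(["h", "a", "l", "o"]))  # [7, 2, 19, 25]
```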
runs/Jan11_16-21-53_5b0d62e2a2dd/events.out.tfevents.1704990137.5b0d62e2a2dd.533.0
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9b71fdaa5d469dad8c279c7be05e790a0376d295a8bf817b3729eb4fbb9a606c
+ size 12784
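This TensorBoard event file (also behind an LFS pointer) holds the scalars logged during training; they can be read back programmatically, for example (a sketch using TensorBoard's event reader; the tag names in the comment are assumptions):

```python
# Sketch: read logged scalars from the tfevents file (after git lfs pull).
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

acc = EventAccumulator("runs/Jan11_16-21-53_5b0d62e2a2dd")
acc.Reload()
print(acc.Tags()["scalars"])  # e.g. ["train/loss", "train/learning_rate"]
```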
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b12eda2eeacdd6eded1b650123f4b19c02d6860b56a6a3ad9341f20b7178e090
+ size 4792
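training_args.bin is the pickled `TrainingArguments` object saved by the `Trainer`; after fetching it through Git LFS it can be inspected, e.g. (a sketch; recent PyTorch versions require `weights_only=False` to unpickle arbitrary objects):

```python
# Sketch: inspect the pickled TrainingArguments (after git lfs pull).
import torch

args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.per_device_train_batch_size, args.num_train_epochs)
```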
vocab.json
ADDED
@@ -0,0 +1 @@
+ {"y": 0, "t": 1, "a": 2, "x": 3, "g": 4, "n": 5, "k": 6, "h": 7, "j": 8, "p": 9, "m": 10, "v": 11, "f": 12, "r": 13, "z": 14, "e": 15, "b": 16, "q": 17, "i": 18, "l": 19, "w": 20, "d": 21, "u": 22, "c": 23, "o": 25, "s": 26, "|": 24, "[UNK]": 27, "[PAD]": 28}
wav2vec2-large-xlsr-indonesian.ipynb
ADDED
The diff for this file is too large to render.
See raw diff