alifalhasan committed on
Commit
8e41ab0
1 Parent(s): 8a9887f

[Task] Model Training


[Description] Added model training code
[Author]

@alifalhasan

README.md CHANGED
@@ -8,4 +8,16 @@ sdk_version: 4.21.0
  app_file: app.py
  pinned: false
  license: mit
- ---
+ ---
+
+ # Setup and Requirements
+
+ **1. Clone the Translate repo:**
+ ```
+ git clone https://huggingface.co/spaces/alifalhasan/arabic2english
+ ```
+ **2. Install requirements:**
+ ```
+ pip install -r requirements.txt
+ python -m spacy download en_core_web_sm
+ ```
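
Not part of this commit, but the natural next step after the two setup commands above: training can presumably be launched from the repository root with the script added below (the exact invocation and the `PYTHONPATH` tweak are assumptions based on this commit's layout; `models/arabic2english.pt` is where `src/train/train.py` saves its output):

```
PYTHONPATH=src/data_processing python src/train/train.py
```
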
data/arabic2english.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1 +1,7 @@
- gradio
+ gradio
+ torch
+ torchtext
+ spacy
+ transformers
+ nltk
+ pandas
src/data_processing/__init__.py ADDED
File without changes
src/data_processing/data_processing.py ADDED
@@ -0,0 +1,94 @@
+ import os
+ import re
+ import spacy
+ import random
+ import pandas as pd
+
+ from spacy.lang.ar import Arabic
+ from torchtext.legacy import data
+ from spacy.tokenizer import Tokenizer
+
+ # Load the tab-separated English/Arabic sentence pairs
+ script_directory = os.path.dirname(os.path.abspath(__file__))
+ df = pd.read_csv(
+     os.path.join(script_directory, "../../data/arabic2english.txt"),
+     delimiter="\t",
+     names=["eng", "ar"],
+ )
+
+ # Load the English spaCy pipeline and a plain Arabic tokenizer
+ spacy_eng = spacy.load("en_core_web_sm")
+ ar = Arabic()
+ ar_tokenizer = Tokenizer(ar.vocab)
+
+
+ # Tokenizer functions
+ def engTokenizer(text):
+     return [word.text for word in spacy_eng.tokenizer(text)]
+
+
+ def arTokenizer(text):
+     return [
+         word.text
+         for word in ar_tokenizer(
+             re.sub(r"\s+", " ", re.sub(r"[\.\'\"\n+]", " ", text)).strip()
+         )
+     ]
+
+
+ # Fields for source (English) and target (Arabic) data
+ SOURCE = data.Field(
+     tokenize=engTokenizer,  # Custom tokenizer for English
+     init_token="<sos>",  # Start-of-sentence token
+     eos_token="<eos>",  # End-of-sentence token
+     batch_first=False,
+ )
+ TARGET = data.Field(
+     tokenize=arTokenizer,  # Custom tokenizer for Arabic
+     init_token="بداية",  # Arabic start-of-sentence token (matches translate.py)
+     eos_token="نهاية",  # Arabic end-of-sentence token (matches translate.py)
+     tokenizer_language="ar",  # Specify language for tokenization
+     batch_first=False,
+ )
+
+
+ class TextDataset(data.Dataset):
+     def __init__(self, df, src_field, target_field, is_test=False):
+         """
+         Initializes a TextDataset.
+
+         Args:
+             df: A Pandas DataFrame containing text data.
+             src_field: The Field object for the source language.
+             target_field: The Field object for the target language.
+             is_test: A boolean indicating whether this is a test dataset.
+         """
+         fields = [("eng", src_field), ("ar", target_field)]
+         samples = []
+         for i, row in df.iterrows():
+             eng = row.eng
+             ar = row.ar
+             samples.append(data.Example.fromlist([eng, ar], fields))
+
+         super().__init__(samples, fields)
+
+     def __len__(self):
+         """Returns the number of examples in the dataset."""
+         return len(self.examples)
+
+     def __getitem__(self, idx):
+         """Returns the example at the given index."""
+         return self.examples[idx]
+
+
+ # TextDataset instance
+ torchdataset = TextDataset(df, SOURCE, TARGET)
+
+ # Split the dataset into training and validation sets
+ train_data, valid_data = torchdataset.split(
+     split_ratio=0.8, random_state=random.seed(42)
+ )
+
+ # Build vocabulary for source and target fields
+ SOURCE.build_vocab(train_data, min_freq=2)
+ TARGET.build_vocab(train_data, min_freq=2)
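
Editor's note (not part of the commit): the `Field`s and split datasets defined above are consumed through `torchtext`'s bucket iterators, which pad each batch into `(seq_len, batch)` LongTensors. A minimal sketch, assuming `src/data_processing` is on `PYTHONPATH`:

```
# Minimal sketch, assuming src/data_processing is importable
from torchtext.legacy import data
from data_processing import SOURCE, TARGET, train_data, valid_data

print(len(train_data), len(valid_data))      # 80/20 split of the corpus
print(len(SOURCE.vocab), len(TARGET.vocab))  # vocabulary sizes with min_freq=2
print(SOURCE.vocab.stoi["<pad>"])            # padding index later used by train.py

# BucketIterator groups similar-length sentences and pads each batch
iterator = data.BucketIterator(train_data, batch_size=4, sort_key=lambda x: len(x.eng))
batch = next(iter(iterator))
print(batch.eng.shape, batch.ar.shape)       # (src_len, 4) and (trg_len, 4), batch_first=False
```
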
src/train/__init__.py ADDED
File without changes
src/train/train.py ADDED
@@ -0,0 +1,105 @@
+ import os
+ import torch
+ import numpy as np
+
+ from torch import nn, optim
+ from torchtext.legacy import data
+
+ # These imports assume src/train and src/data_processing are both on PYTHONPATH
+ from transformer import Transformer
+ from data_processing import SOURCE, TARGET, train_data, valid_data
+
+ """Hyperparameters"""
+ # Training
+ BATCH_SIZE = 16
+ learning_rate = 0.001
+ num_epochs = 20
+
+ # Model
+ num_heads = 8
+ num_encoder_layers = 3
+ num_decoder_layers = 3
+ pad_idx = SOURCE.vocab.stoi["<pad>"]
+ max_len = 230
+ dropout = 0.4
+ embedding_size = 256
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Create iterators for the Transformer
+ train_iter, valid_iter = data.BucketIterator.splits(
+     (train_data, valid_data),
+     batch_size=BATCH_SIZE,
+     sort_key=lambda x: len(x.eng),  # Sort by sentence length
+     device=device,
+     shuffle=True,
+ )
+
+ # Get vocabulary sizes
+ src_vocab_size = len(SOURCE.vocab)
+ trg_vocab_size = len(TARGET.vocab)
+
+ # Initialize the Transformer model
+ model = Transformer(
+     embedding_size,
+     src_vocab_size,
+     trg_vocab_size,
+     pad_idx,
+     num_heads=num_heads,
+     num_encoder_layers=num_encoder_layers,
+     num_decoder_layers=num_decoder_layers,
+     forward_expansion=2,  # feed-forward dim = forward_expansion * embedding_size
+     dropout=dropout,
+     max_len=max_len,
+     device=device,
+ ).to(device)
+
+ train_loss = []
+ validation_loss = []
+
+ optimizer = optim.Adam(model.parameters(), lr=learning_rate)
+
+ criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
+
+ for epoch in range(num_epochs):
+     step_loss = []
+     model.train()
+
+     for batch in train_iter:
+         input_data = batch.eng.to(device)
+         target = batch.ar.to(device)
+
+         # Teacher forcing: feed the target shifted right (drop its last token)
+         output = model(input_data, target[:-1])
+
+         optimizer.zero_grad()
+         output = output.reshape(-1, trg_vocab_size)  # Reshape for loss calculation
+         target = target[1:].reshape(-1)  # Drop the start token so targets align with predictions
+         loss = criterion(output, target)
+         loss.backward()
+         optimizer.step()
+
+         step_loss.append(loss.item())
+
+     train_loss.append(np.mean(step_loss))
+     print(f"Epoch {epoch} | Train Cross Entropy Loss: {np.mean(step_loss)}")
+
+     with torch.inference_mode():
+         step_valid_loss = []
+         model.eval()
+
+         for batch in valid_iter:
+             input_sentence = batch.eng.to(device)
+             target = batch.ar.to(device)
+
+             output = model(input_sentence, target[:-1])
+             output = output.reshape(-1, trg_vocab_size)
+             target = target[1:].reshape(-1)
+
+             loss = criterion(output, target)
+             step_valid_loss.append(loss.item())
+
+     validation_loss.append(np.mean(step_valid_loss))
+     print(
+         f"Epoch {epoch} | Validation Cross Entropy Loss: {np.mean(step_valid_loss)}"
+     )
+
+ # Save the full model (not just the state_dict) for later translation
+ script_directory = os.path.dirname(os.path.abspath(__file__))
+ os.makedirs(os.path.join(script_directory, "../../models"), exist_ok=True)
+ torch.save(model, os.path.join(script_directory, "../../models/arabic2english.pt"))
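
Editor's note (not part of the commit): because the script saves the whole module with `torch.save(model, ...)` rather than a `state_dict`, reloading it requires the `Transformer` class to be importable when unpickling. A minimal sketch, assuming the paths from this commit:

```
import torch

# transformer.py must be importable when unpickling (e.g. src/train on PYTHONPATH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torch.load("models/arabic2english.pt", map_location=device)
model.eval()
```

Saving a `state_dict` and re-instantiating the model would avoid that import coupling, but the whole-module form is what `translate.py` below consumes.
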
src/train/transformer.py ADDED
@@ -0,0 +1,117 @@
+ import torch
+ from torch import nn
+
+
+ class Transformer(nn.Module):
+     """
+     Transformer model for sequence-to-sequence tasks.
+     """
+
+     def __init__(
+         self,
+         embedding_size,
+         source_vocab_size,
+         target_vocab_size,
+         source_pad_idx,
+         num_heads,
+         num_encoder_layers,
+         num_decoder_layers,
+         forward_expansion,
+         dropout,
+         max_len,
+         device,
+     ):
+         """
+         Initializes the Transformer model.
+
+         Args:
+             embedding_size: Size of the embeddings.
+             source_vocab_size: Size of the source vocabulary.
+             target_vocab_size: Size of the target vocabulary.
+             source_pad_idx: Index of the padding token in the source vocabulary.
+             num_heads: Number of attention heads.
+             num_encoder_layers: Number of encoder layers.
+             num_decoder_layers: Number of decoder layers.
+             forward_expansion: Expansion factor for the feed-forward dimension.
+             dropout: Dropout probability.
+             max_len: Maximum sequence length.
+             device: Device on which the target mask is created.
+         """
+
+         super().__init__()
+
+         # Embedding layers for source and target tokens
+         self.src_embeddings = nn.Embedding(source_vocab_size, embedding_size)
+         self.trg_embeddings = nn.Embedding(target_vocab_size, embedding_size)
+
+         # Learned positional encodings, shaped (max_len, 1, embedding_size) so they
+         # broadcast over the batch dimension of (seq_len, batch, embedding_size) inputs
+         self.positional_encodings = nn.Parameter(
+             torch.zeros(max_len, 1, embedding_size)
+         )
+
+         # Transformer encoder-decoder
+         self.transformer = nn.Transformer(
+             d_model=embedding_size,
+             nhead=num_heads,
+             num_encoder_layers=num_encoder_layers,
+             num_decoder_layers=num_decoder_layers,
+             dim_feedforward=forward_expansion * embedding_size,
+             dropout=dropout,
+         )
+
+         # Output layer for target vocabulary
+         self.fc_out = nn.Linear(embedding_size, target_vocab_size)
+
+         # Dropout for regularization
+         self.dropout = nn.Dropout(dropout)
+
+         # Source padding index and device
+         self.src_pad_idx = source_pad_idx
+         self.device = device
+
+     def make_src_mask(self, src):
+         """
+         Creates a mask for padding tokens in the source sequence.
+         """
+
+         src_mask = src.transpose(0, 1) == self.src_pad_idx
+         return src_mask
+
+     def forward(self, src, trg):
+         """
+         Forward pass of the Transformer model.
+         """
+
+         src_seq_length, N = src.shape
+         trg_seq_length, N = trg.shape
+
+         # Add positional encodings to embeddings
+         embed_src = self.dropout(
+             self.src_embeddings(src) + self.positional_encodings[:src_seq_length, :, :]
+         )
+         embed_trg = self.dropout(
+             self.trg_embeddings(trg) + self.positional_encodings[:trg_seq_length, :, :]
+         )
+
+         # Create masks for source padding and target sequence
+         src_padding_mask = self.make_src_mask(src)
+         trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
+             self.device
+         )
+
+         # Pass input through transformer encoder-decoder
+         out = self.transformer(
+             embed_src,
+             embed_trg,
+             src_key_padding_mask=src_padding_mask,
+             tgt_mask=trg_mask,
+         )
+
+         # Apply output layer
+         out = self.fc_out(out)
+         return out
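
Editor's note, a shape sanity check (not part of the commit): the model takes `(seq_len, batch)` token-index tensors and returns `(trg_len, batch, target_vocab_size)` logits. The sizes below are made up; the constructor signature is the one defined above.

```
import torch
from transformer import Transformer

device = torch.device("cpu")
model = Transformer(
    embedding_size=256,
    source_vocab_size=1000,
    target_vocab_size=1200,
    source_pad_idx=1,
    num_heads=8,
    num_encoder_layers=3,
    num_decoder_layers=3,
    forward_expansion=2,
    dropout=0.4,
    max_len=230,
    device=device,
)

src = torch.randint(0, 1000, (12, 4))  # (src_len, batch) of source token indices
trg = torch.randint(0, 1200, (9, 4))   # (trg_len, batch) of target token indices
out = model(src, trg)
print(out.shape)                       # torch.Size([9, 4, 1200])
```
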
src/translation/__init__.py ADDED
File without changes
src/translation/translate.py ADDED
@@ -0,0 +1,25 @@
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+ def translate_sentence(model, sentence, srcField, targetField, srcTokenizer):
+     """Greedy decoding: repeatedly feed the growing target prefix back through the model."""
+     model.eval()
+     processed_sentence = srcField.process([srcTokenizer(sentence)]).to(device)
+     trg = ["بداية"]  # Target-side start-of-sentence token
+
+     with torch.inference_mode():
+         for _ in range(60):  # Cap the output length at 60 tokens
+             trg_indices = [targetField.vocab.stoi[word] for word in trg]
+             trg_tensor = torch.LongTensor(trg_indices).unsqueeze(1).to(device)
+             outputs = model(processed_sentence, trg_tensor)
+
+             next_word = targetField.vocab.itos[outputs.argmax(2)[-1, 0].item()]
+             if next_word == "<unk>":
+                 continue
+             trg.append(next_word)
+             if next_word == "نهاية":  # Target-side end-of-sentence token
+                 break
+
+     # Drop the start and end tokens, skipping any unknowns
+     return " ".join([word for word in trg if word != "<unk>"][1:-1])
+
+
+ if __name__ == "__main__":
+     # Quick check: load the saved model and the fields built in data_processing.py
+     # (assumes src/data_processing and src/train are importable, e.g. on PYTHONPATH)
+     import os
+     from data_processing import SOURCE, TARGET, engTokenizer
+
+     script_directory = os.path.dirname(os.path.abspath(__file__))
+     model = torch.load(
+         os.path.join(script_directory, "../../models/arabic2english.pt"),
+         map_location=device,
+     )
+
+     print("I'm at home ->", translate_sentence(model, "I'm at home", SOURCE, TARGET, engTokenizer))
+     print("I'm alone ->", translate_sentence(model, "I'm alone", SOURCE, TARGET, engTokenizer))