alifalhasan committed • Commit 8e41ab0
Parent(s): 8a9887f

[Task] Model Training
[Description] Added model training code
[Author] @alifalhasan
- README.md +13 -1
- data/arabic2english.txt +0 -0
- requirements.txt +7 -1
- src/data_processing/__init__.py +0 -0
- src/data_processing/data_processing.py +94 -0
- src/train/__init__.py +0 -0
- src/train/train.py +105 -0
- src/train/transformer.py +117 -0
- src/translation/__init__.py +0 -0
- src/translation/translate.py +25 -0
README.md
CHANGED
````diff
@@ -8,4 +8,16 @@ sdk_version: 4.21.0
 app_file: app.py
 pinned: false
 license: mit
----
+---
+
+# Setup and Requirements
+
+**1. Clone the Translate repo:**
+```
+git clone https://huggingface.co/spaces/alifalhasan/arabic2english
+```
+**2. Install requirements:**
+```
+pip install -r requirements.txt
+python -m spacy download en_core_web_sm
+```
````
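As a quick check that the setup steps above worked, the snippet below (a minimal sketch, not part of the commit; it assumes the requirements and the `en_core_web_sm` model are installed) loads the spaCy model and prints the installed torchtext version — the training code in this commit imports `torchtext.legacy`, which is only present in older torchtext releases.

```python
# Sanity check for the setup steps above (illustrative only, not part of the commit).
import spacy
import torchtext

nlp = spacy.load("en_core_web_sm")             # fails if the model download step was skipped
print([t.text for t in nlp("Hello, world!")])  # -> ['Hello', ',', 'world', '!']
print(torchtext.__version__)                   # torchtext.legacy exists only in older releases
```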
data/arabic2english.txt
ADDED
The diff for this file is too large to render.
See raw diff
requirements.txt
CHANGED
```diff
@@ -1 +1,7 @@
-gradio
+gradio
+torch
+torchtext
+spacy
+transformers
+nltk
+pandas
```
src/data_processing/__init__.py
ADDED
File without changes
src/data_processing/data_processing.py
ADDED
```python
import os
import re
import spacy
import random
import pandas as pd

from spacy.lang.ar import Arabic
from torchtext.legacy import data
from spacy.tokenizer import Tokenizer

# Load data
script_directory = os.path.dirname(os.path.abspath(__file__))
df = pd.read_csv(
    os.path.join(script_directory, "../../data/arabic2english.txt"),
    delimiter="\t",
    names=["eng", "ar"],
)

# Load English and Arabic language models from spaCy
spacy_eng = spacy.load("en_core_web_sm")
ar = Arabic()
ar_tokenizer = Tokenizer(ar.vocab)


# Tokenizer functions
def engTokenizer(text):
    return [word.text for word in spacy_eng.tokenizer(text)]


def arTokenizer(text):
    return [
        word.text
        for word in ar_tokenizer(
            re.sub(r"\s+", " ", re.sub(r"[\.\'\"\n+]", " ", text)).strip()
        )
    ]


# Fields for source (English) and target (Arabic) data
SOURCE = data.Field(
    tokenize=engTokenizer,  # Custom tokenizer for English
    init_token="<sos>",  # Start-of-sentence token
    eos_token="<eos>",  # End-of-sentence token
    batch_first=False,
)
TARGET = data.Field(
    tokenize=arTokenizer,  # Custom tokenizer for Arabic
    init_token="ببدأ",  # Arabic start-of-sentence token
    eos_token="نهها",  # Arabic end-of-sentence token
    tokenizer_language="ar",  # Specify language for tokenization
    batch_first=False,
)


class TextDataset(data.Dataset):
    def __init__(self, df, src_field, target_field, is_test=False):
        """
        Initializes a TextDataset.

        Args:
            df: A Pandas DataFrame containing text data.
            src_field: The Field object for the source language.
            target_field: The Field object for the target language.
            is_test: A boolean indicating whether this is a test dataset.
        """
        fields = [("eng", src_field), ("ar", target_field)]
        samples = []
        for i, row in df.iterrows():
            eng = row.eng
            ar = row.ar
            samples.append(data.Example.fromlist([eng, ar], fields))

        super().__init__(samples, fields)

    def __len__(self):
        """Returns the number of samples in the dataset."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Returns the sample at the given index."""
        return self.samples[idx]


# TextDataset instance
torchdataset = TextDataset(df, SOURCE, TARGET)

# Split the dataset into training and validation sets
train_data, valid_data = torchdataset.split(
    split_ratio=0.8, random_state=random.seed(42)
)

# Build vocabulary for source and target fields
SOURCE.build_vocab(train_data, min_freq=2)
TARGET.build_vocab(train_data, min_freq=2)
```
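A minimal usage sketch for the module above (not part of the commit): it assumes `data/arabic2english.txt` is present and a torchtext release with `torchtext.legacy` is installed, since importing the module runs the data loading and vocabulary building at import time. The sample sentences are illustrative.

```python
# Illustrative only; importing data_processing executes the loading/vocab code above.
from data_processing import engTokenizer, arTokenizer, SOURCE, TARGET, train_data, valid_data

print(engTokenizer("How are you?"))  # spaCy word-level tokens, e.g. ['How', 'are', 'you', '?']
print(arTokenizer("كيف حالك"))       # whitespace tokens after the punctuation cleanup

# Vocabulary statistics built from the 80% training split with min_freq=2
print(len(SOURCE.vocab), len(TARGET.vocab))
print(SOURCE.vocab.stoi["<pad>"], SOURCE.vocab.stoi["<sos>"])
print(len(train_data.examples), len(valid_data.examples))
```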
src/train/__init__.py
ADDED
File without changes
src/train/train.py
ADDED
```python
import os
import torch
import numpy as np

from torch import nn, optim
from torchtext.legacy import data
from transformer import Transformer
from data_processing import SOURCE, TARGET, train_data, valid_data

"""Hyperparameters"""
# Training
BATCH_SIZE = 16
learning_rate = 0.001
num_epochs = 20

# Model
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
pad_idx = SOURCE.vocab.stoi["<pad>"]
max_len = 230
dropout = 0.4
embedding_size = 256

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create iterators for Transformer
train_iter, valid_iter = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.eng),  # Sort by sentence length
    device=device,
    shuffle=True,
)

# Get vocabulary sizes
src_vocab_size = len(SOURCE.vocab)
trg_vocab_size = len(TARGET.vocab)

# Initialize Transformer model
model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    pad_idx,
    num_heads=num_heads,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    forward_expansion=2 * embedding_size,
    dropout=dropout,
    max_len=max_len,
    device=device,
).to(device)

train_loss = []
validation_loss = []

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

for epoch in range(num_epochs):
    step_loss = []
    model.train()

    for batch in train_iter:
        input_data = batch.eng.to(device)
        target = batch.ar.to(device)

        output = model(input_data, target[:-1])

        optimizer.zero_grad()
        output = output.reshape(-1, trg_vocab_size)  # Reshape for loss calculation
        target = target[1:].reshape(-1)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        step_loss.append(loss.item())

    train_loss.append(np.mean(step_loss))
    print(f"Epoch {epoch} | Train Cross Entropy Loss: {np.mean(step_loss)}")

    with torch.inference_mode():
        step_valid_loss = []
        model.eval()

        for batch in valid_iter:
            input_sentence = batch.eng.to(device)
            target = batch.ar.to(device)

            output = model(input_sentence, target[:-1])
            output = output.reshape(-1, trg_vocab_size)
            target = target[1:].reshape(-1)

            loss = criterion(output, target)
            step_valid_loss.append(loss.item())

    validation_loss.append(np.mean(step_valid_loss))
    print(
        f"Epoch {epoch} | Validation Cross Entropy Loss: {np.mean(step_valid_loss)}"
    )

script_directory = os.path.dirname(os.path.abspath(__file__))
torch.save(model, os.path.join(script_directory, "../../models/arabic2english.pt"))
```
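The loss step above feeds the decoder `target[:-1]` and scores the output against `target[1:]`, flattening both so `CrossEntropyLoss` sees one row of logits per target token while padding positions are ignored. A toy-shape sketch of that alignment (all sizes are made up for illustration, not values from the commit):

```python
# Toy tensors only; mirrors the reshape/shift used in the training loop above.
import torch
from torch import nn

trg_vocab_size, pad_idx = 50, 1
trg_len, batch = 7, 4

target = torch.randint(0, trg_vocab_size, (trg_len, batch))  # (trg_len, batch)
logits = torch.randn(trg_len - 1, batch, trg_vocab_size)     # stand-in for model(src, target[:-1])

criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
loss = criterion(
    logits.reshape(-1, trg_vocab_size),  # ((trg_len - 1) * batch, vocab)
    target[1:].reshape(-1),              # ((trg_len - 1) * batch,) -- shifted one step ahead
)
print(loss.item())
```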
src/train/transformer.py
ADDED
```python
import torch
from torch import nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class Transformer(nn.Module):
    """
    Transformer model for sequence-to-sequence tasks.
    """

    def __init__(
        self,
        embedding_size,
        source_vocab_size,
        target_vocab_size,
        source_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
    ):
        """
        Initializes the Transformer model.

        Args:
            embedding_size: Size of the embeddings.
            source_vocab_size: Size of the source vocabulary.
            target_vocab_size: Size of the target vocabulary.
            source_pad_idx: Index of the padding token in the source vocabulary.
            num_heads: Number of attention heads.
            num_encoder_layers: Number of encoder layers.
            num_decoder_layers: Number of decoder layers.
            forward_expansion: Factor for expanding the model dimensionality.
            dropout: Dropout probability.
            max_len: Maximum sequence length.
        """

        super().__init__()

        # Embedding layers for source and target tokens
        self.src_embeddings = nn.Embedding(source_vocab_size, embedding_size)
        self.trg_embeddings = nn.Embedding(target_vocab_size, embedding_size)

        # Positional encodings for source and target sequences
        self.positional_encodings = nn.Parameter(
            torch.zeros(1, max_len, embedding_size)
        )

        # Transformer encoder-decoder
        self.transformer = nn.Transformer(
            d_model=embedding_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=forward_expansion * embedding_size,
            dropout=dropout,
        )

        # Output layer for target vocabulary
        self.fc_out = nn.Linear(embedding_size, target_vocab_size)

        # Dropout for regularization
        self.dropout = nn.Dropout(dropout)

        # Source padding index
        self.src_pad_idx = source_pad_idx

    def make_src_mask(self, src):
        """
        Creates a mask for padding tokens in the source sequence.
        """

        src_mask = src.transpose(0, 1) == self.src_pad_idx
        return src_mask

    def forward(self, src, trg):
        """
        Forward pass of the Transformer model.
        """

        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape

        # Add positional encodings to embeddings
        embed_src = self.dropout(
            (
                self.src_embeddings(src)
                + self.positional_encodings[:, :src_seq_length, :]
            )
        )
        embed_trg = self.dropout(
            (
                self.trg_embeddings(trg)
                + self.positional_encodings[:, :trg_seq_length, :]
            )
        )

        # Create masks for source padding and target sequence
        src_padding_mask = self.make_src_mask(src)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
            device
        )

        # Pass input through transformer encoder-decoder
        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )

        # Apply output layer
        out = self.fc_out(out)
        return out
```
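`make_src_mask` above produces the boolean `(batch, src_len)` padding mask that `nn.Transformer` expects for `src_key_padding_mask` (True at padding positions), while the causal target mask is the standard upper-triangular `-inf` matrix. A small standalone sketch of both conventions with toy tensors (not taken from the commit):

```python
# Toy tensors; illustrates the two masks used in forward() above.
import torch

pad_idx = 1
# (src_len, batch) token ids, as a batch_first=False Field would produce
src = torch.tensor([[5, 7],
                    [6, 1],
                    [1, 1]])

src_key_padding_mask = src.transpose(0, 1) == pad_idx  # (batch, src_len), True where padded
print(src_key_padding_mask)
# tensor([[False, False,  True],
#         [False,  True,  True]])

# Equivalent of generate_square_subsequent_mask(3): -inf above the diagonal, 0 elsewhere
trg_mask = torch.triu(torch.full((3, 3), float("-inf")), diagonal=1)
print(trg_mask)
```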
src/translation/__init__.py
ADDED
File without changes
src/translation/translate.py
ADDED
```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

def translate_sentence(model, sentence, srcField, targetField, srcTokenizer):
    model.eval()
    processed_sentence = srcField.process([srcTokenizer(sentence)]).to(device)
    trg = ["بداية"]

    for _ in range(60):
        trg_indecies = [targetField.vocab.stoi[word] for word in trg]
        trg_tensor = torch.LongTensor(trg_indecies).unsqueeze(1).to(device)
        outputs = model(processed_sentence, trg_tensor)

        if targetField.vocab.itos[outputs.argmax(2)[-1:].item()] == "<unk>":
            continue
        trg.append(targetField.vocab.itos[outputs.argmax(2)[-1:].item()])
        if targetField.vocab.itos[outputs.argmax(2)[-1:].item()] == "نهاية":
            break
    return " ".join([word for word in trg if word != "<unk>"][1:-1])


if __name__ == '__main__':
    print("I'm home -> {}", translate_sentence(model, "I'm at home", SRC, TRG, engTokenizer))
    print("I'm alone -> {}", translate_sentence(model, "I'm alone", SRC, TRG, engTokenizer))
```
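The `__main__` block above calls `translate_sentence` with `model`, `SRC`, `TRG`, and `engTokenizer`, none of which are defined or imported in this file. A hedged sketch of how it could be wired together using the other modules added in this commit (the module paths, saved-model location, and glue code are assumptions, not part of the commit):

```python
# Hypothetical glue code; assumes training has produced models/arabic2english.pt
# and that the data_processing and translate modules shown above are importable.
import torch

from data_processing import SOURCE, TARGET, engTokenizer
from translate import translate_sentence

device = "cuda" if torch.cuda.is_available() else "cpu"

# train.py saved the whole module with torch.save(model, ...), so the Transformer class
# must be importable under the same module path when unpickling.
model = torch.load("models/arabic2english.pt", map_location=device)

print(translate_sentence(model, "I'm at home", SOURCE, TARGET, engTokenizer))
```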