Spaces:
Sleeping
Sleeping
alifalhasan
committed on
Commit
•
b1c38c2
1
Parent(s):
9a4dd2c
[Task] Minor Update
Browse files
[Description] Added comments and fixed the bug in requirements.txt
[Author]
@alifalhasan
- app.py +11 -11
- requirements.txt +3 -2
- src/data_processing/data_processing.py +56 -0
- src/train/train.py +26 -20
- src/train/transformer.py +28 -9
- src/translation/translate.py +32 -14
app.py
CHANGED
@@ -2,23 +2,23 @@ import gradio as gr
|
|
2 |
|
3 |
from src.translation.translate import translate
|
4 |
|
5 |
-
LANGS = ["arabic", "english"]
|
6 |
|
7 |
if __name__ == "__main__":
|
8 |
# Create the Gradio interface
|
9 |
iface = gr.Interface(
|
10 |
-
fn=translate,
|
11 |
inputs=[
|
12 |
-
gr.components.Textbox(label="Text"),
|
13 |
-
gr.components.Dropdown(label="Source Language", choices=LANGS),
|
14 |
-
gr.components.Dropdown(label="Target Language", choices=LANGS),
|
15 |
],
|
16 |
-
outputs=["text"],
|
17 |
-
examples=[["I'm ready", "english", "arabic"]],
|
18 |
-
cache_examples=False,
|
19 |
-
title="arabic2english",
|
20 |
-
description="This is a translator app for arabic and english. Currently supports only english to arabic."
|
21 |
)
|
22 |
|
23 |
# Launch the interface
|
24 |
-
iface.launch(share=True)
|
|
|
2 |
|
3 |
from src.translation.translate import translate
|
4 |
|
5 |
+
LANGS = ["arabic", "english"] # Define a list of supported languages
|
6 |
|
7 |
if __name__ == "__main__":
|
8 |
# Create the Gradio interface
|
9 |
iface = gr.Interface(
|
10 |
+
fn=translate, # Specify the translation function as the main function
|
11 |
inputs=[
|
12 |
+
gr.components.Textbox(label="Text"), # Add a textbox input for entering text
|
13 |
+
gr.components.Dropdown(label="Source Language", choices=LANGS), # Add a dropdown for selecting source language
|
14 |
+
gr.components.Dropdown(label="Target Language", choices=LANGS), # Add a dropdown for selecting target language
|
15 |
],
|
16 |
+
outputs=["text"], # Define the output type as text
|
17 |
+
examples=[["I'm ready", "english", "arabic"]], # Provide an example input for demonstration
|
18 |
+
cache_examples=False, # Disable caching of examples
|
19 |
+
title="arabic2english", # Set the title of the interface
|
20 |
+
description="This is a translator app for arabic and english. Currently supports only english to arabic." # Add a description of the interface
|
21 |
)
|
22 |
|
23 |
# Launch the interface
|
24 |
+
iface.launch(share=True) # Launch the interface and enable sharing
|
requirements.txt
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
gradio
|
2 |
torch>=1.6
|
3 |
torchtext==0.6
|
4 |
-
spacy
|
5 |
transformers
|
6 |
nltk
|
7 |
-
pandas
|
|
|
|
|
|
1 |
gradio
|
2 |
torch>=1.6
|
3 |
torchtext==0.6
|
|
|
4 |
transformers
|
5 |
nltk
|
6 |
+
pandas
|
7 |
+
spacy
|
8 |
+
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
|
src/data_processing/data_processing.py
CHANGED
@@ -7,23 +7,46 @@ from torchtext import data
|
|
7 |
from spacy.lang.ar import Arabic
|
8 |
from spacy.tokenizer import Tokenizer
|
9 |
|
|
|
10 |
df = pd.read_csv(
|
11 |
"data/arabic2english.txt",
|
12 |
delimiter="\t",
|
13 |
names=["eng", "ar"],
|
14 |
)
|
15 |
|
|
|
16 |
spacy_eng = spacy.load("en_core_web_sm")
|
17 |
|
|
|
18 |
arab = Arabic()
|
|
|
|
|
19 |
ar_Tokenizer = Tokenizer(arab.vocab)
|
20 |
|
21 |
|
22 |
def engTokenizer(text):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
return [word.text for word in spacy_eng.tokenizer(text)]
|
24 |
|
25 |
|
26 |
def arTokenizer(sentence):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
return [
|
28 |
word.text
|
29 |
for word in ar_Tokenizer(
|
@@ -32,6 +55,7 @@ def arTokenizer(sentence):
|
|
32 |
]
|
33 |
|
34 |
|
|
|
35 |
SRC = data.Field(
|
36 |
tokenize=engTokenizer, batch_first=False, init_token="<sos>", eos_token="<eos>"
|
37 |
)
|
@@ -45,6 +69,20 @@ TRG = data.Field(
|
|
45 |
|
46 |
|
47 |
class TextDataset(data.Dataset):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
def __init__(self, df, src_field, target_field, is_test=False, **kwargs):
|
50 |
fields = [("eng", src_field), ("ar", target_field)]
|
@@ -57,17 +95,35 @@ class TextDataset(data.Dataset):
|
|
57 |
super().__init__(samples, fields, **kwargs)
|
58 |
|
59 |
def __len__(self):
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
return len(self.samples)
|
61 |
|
62 |
def __getitem__(self, idx):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
return self.samples[idx]
|
64 |
|
65 |
|
|
|
66 |
torchdataset = TextDataset(df, SRC, TRG)
|
67 |
|
|
|
68 |
train_data, valid_data = torchdataset.split(
|
69 |
split_ratio=0.8, random_state=random.seed(32)
|
70 |
)
|
71 |
|
|
|
72 |
SRC.build_vocab(train_data, min_freq=2)
|
73 |
TRG.build_vocab(train_data, min_freq=2)
|
|
|
7 |
from spacy.lang.ar import Arabic
|
8 |
from spacy.tokenizer import Tokenizer
|
9 |
|
10 |
+
# Reading data into a pandas DataFrame
|
11 |
df = pd.read_csv(
|
12 |
"data/arabic2english.txt",
|
13 |
delimiter="\t",
|
14 |
names=["eng", "ar"],
|
15 |
)
|
16 |
|
17 |
+
# Loading English language model from spaCy
|
18 |
spacy_eng = spacy.load("en_core_web_sm")
|
19 |
|
20 |
+
# Creating an instance of Arabic language model from spaCy
|
21 |
arab = Arabic()
|
22 |
+
|
23 |
+
# Creating a tokenizer for Arabic text using the Arabic language model
|
24 |
ar_Tokenizer = Tokenizer(arab.vocab)
|
25 |
|
26 |
|
27 |
def engTokenizer(text):
|
28 |
+
"""
|
29 |
+
Tokenizes English text using spaCy tokenizer.
|
30 |
+
|
31 |
+
Args:
|
32 |
+
text (str): The input English text.
|
33 |
+
|
34 |
+
Returns:
|
35 |
+
list: List of tokens.
|
36 |
+
"""
|
37 |
return [word.text for word in spacy_eng.tokenizer(text)]
|
38 |
|
39 |
|
40 |
def arTokenizer(sentence):
|
41 |
+
"""
|
42 |
+
Tokenizes Arabic sentence using spaCy tokenizer.
|
43 |
+
|
44 |
+
Args:
|
45 |
+
sentence (str): The input Arabic sentence.
|
46 |
+
|
47 |
+
Returns:
|
48 |
+
list: List of tokens.
|
49 |
+
"""
|
50 |
return [
|
51 |
word.text
|
52 |
for word in ar_Tokenizer(
|
|
|
55 |
]
|
56 |
|
57 |
|
58 |
+
# Defining fields for source and target languages using torchtext
|
59 |
SRC = data.Field(
|
60 |
tokenize=engTokenizer, batch_first=False, init_token="<sos>", eos_token="<eos>"
|
61 |
)
|
|
|
69 |
|
70 |
|
71 |
class TextDataset(data.Dataset):
|
72 |
+
"""
|
73 |
+
Custom dataset class for text data.
|
74 |
+
|
75 |
+
Args:
|
76 |
+
df (pandas.DataFrame): DataFrame containing source and target language data.
|
77 |
+
src_field (torchtext.data.Field): Field for source language.
|
78 |
+
target_field (torchtext.data.Field): Field for target language.
|
79 |
+
is_test (bool): Flag indicating if the dataset is for testing.
|
80 |
+
|
81 |
+
Attributes:
|
82 |
+
fields (list): List of tuples containing field names and corresponding Field objects.
|
83 |
+
samples (list): List of data examples.
|
84 |
+
|
85 |
+
"""
|
86 |
|
87 |
def __init__(self, df, src_field, target_field, is_test=False, **kwargs):
|
88 |
fields = [("eng", src_field), ("ar", target_field)]
|
|
|
95 |
super().__init__(samples, fields, **kwargs)
|
96 |
|
97 |
def __len__(self):
|
98 |
+
"""
|
99 |
+
Get the number of samples in the dataset.
|
100 |
+
|
101 |
+
Returns:
|
102 |
+
int: Number of samples.
|
103 |
+
"""
|
104 |
return len(self.samples)
|
105 |
|
106 |
def __getitem__(self, idx):
|
107 |
+
"""
|
108 |
+
Get a sample from the dataset.
|
109 |
+
|
110 |
+
Args:
|
111 |
+
idx (int): Index of the sample.
|
112 |
+
|
113 |
+
Returns:
|
114 |
+
torchtext.data.Example: Sample at the specified index.
|
115 |
+
"""
|
116 |
return self.samples[idx]
|
117 |
|
118 |
|
119 |
+
# Creating a TextDataset instance
|
120 |
torchdataset = TextDataset(df, SRC, TRG)
|
121 |
|
122 |
+
# Splitting the dataset into training and validation sets
|
123 |
train_data, valid_data = torchdataset.split(
|
124 |
split_ratio=0.8, random_state=random.seed(32)
|
125 |
)
|
126 |
|
127 |
+
# Building vocabularies for source and target languages
|
128 |
SRC.build_vocab(train_data, min_freq=2)
|
129 |
TRG.build_vocab(train_data, min_freq=2)
|
src/train/train.py
CHANGED
@@ -7,7 +7,6 @@ from torchtext import data
|
|
7 |
from transformer import Transformer
|
8 |
|
9 |
import sys
|
10 |
-
|
11 |
sys.path.append(os.path.abspath("src/data_processing/"))
|
12 |
from data_processing import (
|
13 |
SRC,
|
@@ -16,11 +15,13 @@ from data_processing import (
|
|
16 |
valid_data,
|
17 |
)
|
18 |
|
|
|
19 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
20 |
|
21 |
"""Hyperparameters"""
|
22 |
BATCH_SIZE = 16
|
23 |
|
|
|
24 |
train_iter, valid_iter = data.BucketIterator.splits(
|
25 |
(train_data, valid_data),
|
26 |
batch_size=BATCH_SIZE,
|
@@ -30,12 +31,12 @@ train_iter, valid_iter = data.BucketIterator.splits(
|
|
30 |
device=device,
|
31 |
shuffle=True,
|
32 |
)
|
33 |
-
load_model = False
|
34 |
-
save_model = True
|
35 |
|
|
|
36 |
num_epochs = 30
|
37 |
learning_rate = 0.0001
|
38 |
|
|
|
39 |
num_heads = 8
|
40 |
num_encoder_layers = 3
|
41 |
num_decoder_layers = 3
|
@@ -45,14 +46,14 @@ dropout = 0.4
|
|
45 |
embedding_size = 256
|
46 |
src_pad_idx = SRC.vocab.stoi["<pad>"]
|
47 |
|
48 |
-
|
49 |
src_vocab_size = len(SRC.vocab)
|
50 |
-
print("Size of
|
51 |
|
52 |
trg_vocab_size = len(TRG.vocab)
|
53 |
-
print("Size of
|
54 |
-
|
55 |
|
|
|
56 |
model = Transformer(
|
57 |
embedding_size,
|
58 |
src_vocab_size,
|
@@ -66,38 +67,43 @@ model = Transformer(
|
|
66 |
device=device,
|
67 |
).to(device)
|
68 |
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
|
|
|
73 |
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
|
74 |
|
|
|
75 |
pad_idx = SRC.vocab.stoi["<pad>"]
|
76 |
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
|
|
|
|
|
77 |
for epoch in range(num_epochs):
|
78 |
stepLoss = []
|
79 |
-
model.train()
|
80 |
for batch in train_iter:
|
81 |
input_data = batch.eng.to(device)
|
82 |
target = batch.ar.to(device)
|
83 |
|
84 |
-
output = model(input_data, target[:-1])
|
85 |
-
optimizer.zero_grad()
|
86 |
-
|
87 |
output = output.reshape(-1, trg_vocab_size)
|
88 |
target = target[1:].reshape(-1)
|
89 |
|
90 |
-
loss = criterion(output, target)
|
91 |
-
loss.backward()
|
|
|
92 |
|
93 |
-
optimizer.step()
|
94 |
stepLoss.append(loss.item())
|
95 |
|
96 |
-
|
97 |
print(" Epoch {} | Train Cross Entropy Loss: ".format(epoch), np.mean(stepLoss))
|
|
|
|
|
98 |
with torch.inference_mode():
|
99 |
stepValidLoss = []
|
100 |
-
model.eval()
|
101 |
for i, batch in enumerate(valid_iter):
|
102 |
input_sentence = batch.eng.to(device)
|
103 |
target = batch.ar.to(device)
|
@@ -109,7 +115,7 @@ for epoch in range(num_epochs):
|
|
109 |
|
110 |
stepValidLoss.append(loss.item())
|
111 |
|
112 |
-
|
113 |
print(
|
114 |
" Epoch {} | Validation Cross Entropy Loss: ".format(epoch),
|
115 |
np.mean(stepValidLoss),
|
|
|
7 |
from transformer import Transformer
|
8 |
|
9 |
import sys
|
|
|
10 |
sys.path.append(os.path.abspath("src/data_processing/"))
|
11 |
from data_processing import (
|
12 |
SRC,
|
|
|
15 |
valid_data,
|
16 |
)
|
17 |
|
18 |
+
# Setting the device
|
19 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
20 |
|
21 |
"""Hyperparameters"""
|
22 |
BATCH_SIZE = 16
|
23 |
|
24 |
+
# Creating data iterators for training and validation sets
|
25 |
train_iter, valid_iter = data.BucketIterator.splits(
|
26 |
(train_data, valid_data),
|
27 |
batch_size=BATCH_SIZE,
|
|
|
31 |
device=device,
|
32 |
shuffle=True,
|
33 |
)
|
|
|
|
|
34 |
|
35 |
+
# Training parameters
|
36 |
num_epochs = 30
|
37 |
learning_rate = 0.0001
|
38 |
|
39 |
+
# Transformer model hyperparameters
|
40 |
num_heads = 8
|
41 |
num_encoder_layers = 3
|
42 |
num_decoder_layers = 3
|
|
|
46 |
embedding_size = 256
|
47 |
src_pad_idx = SRC.vocab.stoi["<pad>"]
|
48 |
|
49 |
+
# Vocabulary sizes
|
50 |
src_vocab_size = len(SRC.vocab)
|
51 |
+
print("Size of English vocabulary:", src_vocab_size)
|
52 |
|
53 |
trg_vocab_size = len(TRG.vocab)
|
54 |
+
print("Size of Arabic vocabulary:", trg_vocab_size)
|
|
|
55 |
|
56 |
+
# Creating the Transformer model
|
57 |
model = Transformer(
|
58 |
embedding_size,
|
59 |
src_vocab_size,
|
|
|
67 |
device=device,
|
68 |
).to(device)
|
69 |
|
70 |
+
# Lists to track training and validation losses
|
71 |
+
train_loss = []
|
72 |
+
validation_loss = []
|
73 |
|
74 |
+
# Optimizer definition
|
75 |
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
|
76 |
|
77 |
+
# Criterion for loss calculation
|
78 |
pad_idx = SRC.vocab.stoi["<pad>"]
|
79 |
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
|
80 |
+
|
81 |
+
# Main training loop
|
82 |
for epoch in range(num_epochs):
|
83 |
stepLoss = []
|
84 |
+
model.train() # Set the model to training mode
|
85 |
for batch in train_iter:
|
86 |
input_data = batch.eng.to(device)
|
87 |
target = batch.ar.to(device)
|
88 |
|
89 |
+
output = model(input_data, target[:-1]) # Forward pass
|
90 |
+
optimizer.zero_grad() # Zero the gradients
|
|
|
91 |
output = output.reshape(-1, trg_vocab_size)
|
92 |
target = target[1:].reshape(-1)
|
93 |
|
94 |
+
loss = criterion(output, target) # Calculate the loss
|
95 |
+
loss.backward() # Backpropagation
|
96 |
+
optimizer.step() # Update the parameters
|
97 |
|
|
|
98 |
stepLoss.append(loss.item())
|
99 |
|
100 |
+
train_loss.append(np.mean(stepLoss))
|
101 |
print(" Epoch {} | Train Cross Entropy Loss: ".format(epoch), np.mean(stepLoss))
|
102 |
+
|
103 |
+
# Validation loop
|
104 |
with torch.inference_mode():
|
105 |
stepValidLoss = []
|
106 |
+
model.eval() # Set the model to evaluation mode
|
107 |
for i, batch in enumerate(valid_iter):
|
108 |
input_sentence = batch.eng.to(device)
|
109 |
target = batch.ar.to(device)
|
|
|
115 |
|
116 |
stepValidLoss.append(loss.item())
|
117 |
|
118 |
+
validation_loss.append(np.mean(stepValidLoss))
|
119 |
print(
|
120 |
" Epoch {} | Validation Cross Entropy Loss: ".format(epoch),
|
121 |
np.mean(stepValidLoss),
|
src/train/transformer.py
CHANGED
@@ -33,67 +33,86 @@ class Transformer(nn.Module):
|
|
33 |
num_decoder_layers: Number of decoder layers.
|
34 |
dropout: Dropout probability.
|
35 |
max_len: Maximum sequence length.
|
|
|
36 |
"""
|
37 |
|
38 |
super(Transformer, self).__init__()
|
|
|
39 |
self.src_embeddings = nn.Embedding(src_vocab_size, embedding_size)
|
40 |
self.src_positional_embeddings = nn.Embedding(max_len, embedding_size)
|
41 |
self.trg_embeddings = nn.Embedding(trg_vocab_size, embedding_size)
|
42 |
self.trg_positional_embeddings = nn.Embedding(max_len, embedding_size)
|
43 |
self.device = device
|
|
|
44 |
self.transformer = nn.Transformer(
|
45 |
embedding_size,
|
46 |
num_heads,
|
47 |
num_encoder_layers,
|
48 |
num_decoder_layers,
|
49 |
)
|
50 |
-
|
51 |
self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
|
52 |
self.dropout = nn.Dropout(dropout)
|
53 |
self.src_pad_idx = src_pad_idx
|
54 |
|
55 |
def make_src_mask(self, src):
|
56 |
-
|
|
|
|
|
|
|
|
|
57 |
|
|
|
|
|
|
|
|
|
58 |
return src_mask.to(self.device)
|
59 |
|
60 |
def forward(self, src, trg):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
src_seq_length, S = src.shape
|
62 |
trg_seq_length, S = trg.shape
|
63 |
-
#
|
64 |
src_positions = (
|
65 |
torch.arange(0, src_seq_length)
|
66 |
.unsqueeze(1)
|
67 |
.expand(src_seq_length, S)
|
68 |
.to(self.device)
|
69 |
)
|
70 |
-
|
71 |
trg_positions = (
|
72 |
torch.arange(0, trg_seq_length)
|
73 |
.unsqueeze(1)
|
74 |
.expand(trg_seq_length, S)
|
75 |
.to(self.device)
|
76 |
)
|
77 |
-
|
78 |
embed_src = self.dropout(
|
79 |
(self.src_embeddings(src) + self.src_positional_embeddings(src_positions))
|
80 |
)
|
81 |
-
|
82 |
embed_trg = self.dropout(
|
83 |
(self.trg_embeddings(trg) + self.trg_positional_embeddings(trg_positions))
|
84 |
)
|
85 |
-
|
86 |
src_padding_mask = self.make_src_mask(src)
|
87 |
trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
|
88 |
self.device
|
89 |
)
|
90 |
-
|
91 |
out = self.transformer(
|
92 |
embed_src,
|
93 |
embed_trg,
|
94 |
src_key_padding_mask=src_padding_mask,
|
95 |
tgt_mask=trg_mask,
|
96 |
)
|
|
|
97 |
out = self.fc_out(out)
|
98 |
-
|
99 |
return out
|
|
|
33 |
num_decoder_layers: Number of decoder layers.
|
34 |
dropout: Dropout probability.
|
35 |
max_len: Maximum sequence length.
|
36 |
+
device: Device to place tensors on.
|
37 |
"""
|
38 |
|
39 |
super(Transformer, self).__init__()
|
40 |
+
# Embeddings for source and target sequences
|
41 |
self.src_embeddings = nn.Embedding(src_vocab_size, embedding_size)
|
42 |
self.src_positional_embeddings = nn.Embedding(max_len, embedding_size)
|
43 |
self.trg_embeddings = nn.Embedding(trg_vocab_size, embedding_size)
|
44 |
self.trg_positional_embeddings = nn.Embedding(max_len, embedding_size)
|
45 |
self.device = device
|
46 |
+
# Transformer layer
|
47 |
self.transformer = nn.Transformer(
|
48 |
embedding_size,
|
49 |
num_heads,
|
50 |
num_encoder_layers,
|
51 |
num_decoder_layers,
|
52 |
)
|
53 |
+
# Final fully connected layer
|
54 |
self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
|
55 |
self.dropout = nn.Dropout(dropout)
|
56 |
self.src_pad_idx = src_pad_idx
|
57 |
|
58 |
def make_src_mask(self, src):
|
59 |
+
"""
|
60 |
+
Creates a mask to ignore padding tokens in the source sequence.
|
61 |
+
|
62 |
+
Args:
|
63 |
+
src: Source sequence tensor.
|
64 |
|
65 |
+
Returns:
|
66 |
+
src_mask: Mask tensor.
|
67 |
+
"""
|
68 |
+
src_mask = src.transpose(0, 1) == self.src_pad_idx
|
69 |
return src_mask.to(self.device)
|
70 |
|
71 |
def forward(self, src, trg):
|
72 |
+
"""
|
73 |
+
Forward pass of the Transformer model.
|
74 |
+
|
75 |
+
Args:
|
76 |
+
src: Source sequence tensor.
|
77 |
+
trg: Target sequence tensor.
|
78 |
+
|
79 |
+
Returns:
|
80 |
+
out: Output tensor.
|
81 |
+
"""
|
82 |
src_seq_length, S = src.shape
|
83 |
trg_seq_length, S = trg.shape
|
84 |
+
# Generate position indices for source and target sequences
|
85 |
src_positions = (
|
86 |
torch.arange(0, src_seq_length)
|
87 |
.unsqueeze(1)
|
88 |
.expand(src_seq_length, S)
|
89 |
.to(self.device)
|
90 |
)
|
|
|
91 |
trg_positions = (
|
92 |
torch.arange(0, trg_seq_length)
|
93 |
.unsqueeze(1)
|
94 |
.expand(trg_seq_length, S)
|
95 |
.to(self.device)
|
96 |
)
|
97 |
+
# Apply embeddings and dropout for source and target sequences
|
98 |
embed_src = self.dropout(
|
99 |
(self.src_embeddings(src) + self.src_positional_embeddings(src_positions))
|
100 |
)
|
|
|
101 |
embed_trg = self.dropout(
|
102 |
(self.trg_embeddings(trg) + self.trg_positional_embeddings(trg_positions))
|
103 |
)
|
104 |
+
# Generate masks for source padding and target sequences
|
105 |
src_padding_mask = self.make_src_mask(src)
|
106 |
trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
|
107 |
self.device
|
108 |
)
|
109 |
+
# Forward pass through Transformer
|
110 |
out = self.transformer(
|
111 |
embed_src,
|
112 |
embed_trg,
|
113 |
src_key_padding_mask=src_padding_mask,
|
114 |
tgt_mask=trg_mask,
|
115 |
)
|
116 |
+
# Apply final fully connected layer
|
117 |
out = self.fc_out(out)
|
|
|
118 |
return out
|
src/translation/translate.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
-
import torch
|
2 |
import os
|
3 |
import sys
|
|
|
4 |
|
5 |
sys.path.append(os.path.abspath("src/train/"))
|
6 |
sys.path.append(os.path.abspath("src/data_processing/"))
|
@@ -10,6 +10,7 @@ from data_processing import SRC, TRG, arTokenizer, engTokenizer
|
|
10 |
|
11 |
device = "cpu"
|
12 |
|
|
|
13 |
num_heads = 8
|
14 |
num_encoder_layers = 3
|
15 |
num_decoder_layers = 3
|
@@ -17,11 +18,12 @@ max_len = 230
|
|
17 |
dropout = 0.4
|
18 |
embedding_size = 256
|
19 |
|
|
|
20 |
src_pad_idx = SRC.vocab.stoi["<pad>"]
|
21 |
src_vocab_size = len(SRC.vocab)
|
22 |
trg_vocab_size = len(TRG.vocab)
|
23 |
|
24 |
-
# Initialize model with hyperparameters
|
25 |
model = Transformer(
|
26 |
embedding_size,
|
27 |
src_vocab_size,
|
@@ -35,31 +37,47 @@ model = Transformer(
|
|
35 |
device=device,
|
36 |
).to(device)
|
37 |
|
38 |
-
# Load the saved model
|
39 |
model.load_state_dict(torch.load("models/arabic2english.pt", map_location=device))
|
40 |
|
41 |
|
42 |
def translate(sentence, srcField, targetField):
|
43 |
-
"""
|
44 |
-
model.
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
|
|
51 |
for _ in range(max_len):
|
52 |
trg_tensor = (
|
53 |
torch.tensor([targetField.vocab.stoi[word] for word in trg])
|
54 |
.unsqueeze(1)
|
55 |
.to(device)
|
56 |
)
|
57 |
-
outputs = model(processed_sentence, trg_tensor)
|
58 |
|
|
|
59 |
pred_token = targetField.vocab.itos[outputs.argmax(2)[-1:].item()]
|
60 |
-
if pred_token != "<unk>":
|
61 |
trg.append(pred_token)
|
62 |
-
if pred_token == "نهاية":
|
63 |
break
|
64 |
|
65 |
-
return " ".join(
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import sys
|
3 |
+
import torch
|
4 |
|
5 |
sys.path.append(os.path.abspath("src/train/"))
|
6 |
sys.path.append(os.path.abspath("src/data_processing/"))
|
|
|
10 |
|
11 |
device = "cpu"
|
12 |
|
13 |
+
# Define model hyperparameters
|
14 |
num_heads = 8
|
15 |
num_encoder_layers = 3
|
16 |
num_decoder_layers = 3
|
|
|
18 |
dropout = 0.4
|
19 |
embedding_size = 256
|
20 |
|
21 |
+
# Define vocabulary sizes and padding index
|
22 |
src_pad_idx = SRC.vocab.stoi["<pad>"]
|
23 |
src_vocab_size = len(SRC.vocab)
|
24 |
trg_vocab_size = len(TRG.vocab)
|
25 |
|
26 |
+
# Initialize model with specified hyperparameters
|
27 |
model = Transformer(
|
28 |
embedding_size,
|
29 |
src_vocab_size,
|
|
|
37 |
device=device,
|
38 |
).to(device)
|
39 |
|
40 |
+
# Load the saved model parameters
|
41 |
model.load_state_dict(torch.load("models/arabic2english.pt", map_location=device))
|
42 |
|
43 |
|
44 |
def translate(sentence, srcField, targetField):
|
45 |
+
"""
|
46 |
+
Translates an English sentence to Arabic using the Transformer model.
|
47 |
+
|
48 |
+
Args:
|
49 |
+
sentence (str): Input English sentence to be translated.
|
50 |
+
srcField: Source language field.
|
51 |
+
targetField: Target language field.
|
52 |
+
|
53 |
+
Returns:
|
54 |
+
str: Translated Arabic sentence.
|
55 |
+
"""
|
56 |
+
model.eval() # Set model to evaluation mode
|
57 |
+
srcTokenizer = engTokenizer # Initialize source tokenizer
|
58 |
+
srcField = SRC # Set source language field to English
|
59 |
+
targetField = TRG # Set target language field to Arabic
|
60 |
+
processed_sentence = srcField.process([srcTokenizer(sentence)]).to(
|
61 |
+
device
|
62 |
+
) # Process input sentence
|
63 |
+
trg = ["بداية"] # Initialize target sentence with start token
|
64 |
|
65 |
+
# Generate translation
|
66 |
for _ in range(max_len):
|
67 |
trg_tensor = (
|
68 |
torch.tensor([targetField.vocab.stoi[word] for word in trg])
|
69 |
.unsqueeze(1)
|
70 |
.to(device)
|
71 |
)
|
72 |
+
outputs = model(processed_sentence, trg_tensor) # Generate output predictions
|
73 |
|
74 |
+
# Determine predicted token
|
75 |
pred_token = targetField.vocab.itos[outputs.argmax(2)[-1:].item()]
|
76 |
+
if pred_token != "<unk>": # Exclude unknown tokens
|
77 |
trg.append(pred_token)
|
78 |
+
if pred_token == "نهاية": # Stop translation at end token
|
79 |
break
|
80 |
|
81 |
+
return " ".join(
|
82 |
+
[word for word in trg if word != "<unk>"][1:-1]
|
83 |
+
) # Return translated sentence
|