alifalhasan committed
Commit 9a4dd2c • 1 Parent(s): 8e41ab0

[Task] Model Deployment
[Description] Completed model training and deployment.
[Author] @alifalhasan
- app.py +24 -0
- models/arabic2english.pt +3 -0
- requirements.txt +2 -2
- src/__pycache__/__init__.cpython-311.pyc +0 -0
- src/data_processing/__pycache__/data_processing.cpython-311.pyc +0 -0
- src/data_processing/data_processing.py +22 -43
- src/train/__pycache__/transformer.cpython-311.pyc +0 -0
- src/train/train.py +66 -50
- src/train/transformer.py +41 -59
- src/translation/__pycache__/__init__.cpython-311.pyc +0 -0
- src/translation/__pycache__/translate.cpython-311.pyc +0 -0
- src/translation/translate.py +56 -16
app.py
CHANGED
@@ -0,0 +1,24 @@
import gradio as gr

from src.translation.translate import translate

LANGS = ["arabic", "english"]

if __name__ == "__main__":
    # Create the Gradio interface
    iface = gr.Interface(
        fn=translate,
        inputs=[
            gr.components.Textbox(label="Text"),
            gr.components.Dropdown(label="Source Language", choices=LANGS),
            gr.components.Dropdown(label="Target Language", choices=LANGS),
        ],
        outputs=["text"],
        examples=[["I'm ready", "english", "arabic"]],
        cache_examples=False,
        title="arabic2english",
        description="This is a translator app for arabic and english. Currently supports only english to arabic."
    )

    # Launch the interface
    iface.launch(share=True)
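Gradio passes the textbox value and the two dropdown selections positionally to translate, so the same call can be made without the UI. A minimal sketch, assuming the repository root is the working directory and the LFS checkpoint has been pulled:

    # Sketch: call the translation function directly, mirroring the Gradio example above.
    from src.translation.translate import translate

    # The last two arguments mirror the dropdowns; translate() itself fixes the
    # direction to English -> Arabic regardless of what is passed here.
    print(translate("I'm ready", "english", "arabic"))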
models/arabic2english.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d8c3e17038c7e05e86f3e7abad08ed8d9b0270b8cb70b5ee259e4df74bd321a2
size 93639810
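This entry is a Git LFS pointer rather than the weights themselves; the ~94 MB checkpoint is resolved from LFS storage. A small sketch for verifying a pulled file against the recorded oid (assumes the real object has already been fetched, for example via git lfs pull):

    # Sketch: verify a downloaded LFS object against the oid in the pointer file.
    import hashlib

    EXPECTED = "d8c3e17038c7e05e86f3e7abad08ed8d9b0270b8cb70b5ee259e4df74bd321a2"

    with open("models/arabic2english.pt", "rb") as f:
        digest = hashlib.sha256(f.read()).hexdigest()

    print(digest == EXPECTED)  # True if the download is intact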
requirements.txt
CHANGED
@@ -1,6 +1,6 @@
 gradio
-torch
-torchtext
+torch>=1.6
+torchtext==0.6
 spacy
 transformers
 nltk
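The torchtext pin is the substantive change here: 0.6 still exposes the classic Field/Example/BucketIterator API directly under torchtext.data, which the updated imports below rely on, whereas later releases moved that API to torchtext.legacy (the old import path in data_processing.py) and eventually removed it. A quick sanity check, as a sketch:

    # Sketch: confirm the pinned torchtext still provides the classic data API
    # that the rest of this commit imports from torchtext.data.
    import torchtext
    from torchtext.data import Field, Example, BucketIterator  # present in 0.6

    print(torchtext.__version__)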
src/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (188 Bytes)

src/data_processing/__pycache__/data_processing.cpython-311.pyc
ADDED
Binary file (3.98 kB)
src/data_processing/data_processing.py
CHANGED
import re
import spacy
import random
import pandas as pd

from torchtext import data
from spacy.lang.ar import Arabic
from spacy.tokenizer import Tokenizer

df = pd.read_csv(
    "data/arabic2english.txt",
    delimiter="\t",
    names=["eng", "ar"],
)

spacy_eng = spacy.load("en_core_web_sm")

arab = Arabic()
ar_Tokenizer = Tokenizer(arab.vocab)


def engTokenizer(text):
    return [word.text for word in spacy_eng.tokenizer(text)]


def arTokenizer(sentence):
    return [
        word.text
        for word in ar_Tokenizer(
            re.sub(r"\s+", " ", re.sub(r"[\.\'\"\n+]", " ", sentence)).strip()
        )
    ]


SRC = data.Field(
    tokenize=engTokenizer, batch_first=False, init_token="<sos>", eos_token="<eos>"
)
TRG = data.Field(
    tokenize=arTokenizer,
    batch_first=False,
    tokenizer_language="ar",
    init_token="بداية",
    eos_token="نهاية",
)


class TextDataset(data.Dataset):

    def __init__(self, df, src_field, target_field, is_test=False, **kwargs):
        fields = [("eng", src_field), ("ar", target_field)]
        samples = []
        for i, row in df.iterrows():
            eng = row.eng
            ar = row.ar
            samples.append(data.Example.fromlist([eng, ar], fields))

        super().__init__(samples, fields, **kwargs)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]


torchdataset = TextDataset(df, SRC, TRG)

train_data, valid_data = torchdataset.split(
    split_ratio=0.8, random_state=random.seed(32)
)

SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)
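For orientation, a small sketch of what the tokenizers and fields above produce. Importing the module runs the whole pipeline (CSV load, split, vocab build), so the exact token indices depend on that data:

    # Sketch: poke at the tokenizers and built vocabularies.
    # Assumes src/data_processing/ is on sys.path (as train.py arranges) and that
    # data/arabic2english.txt is present, since importing runs the full pipeline.
    from data_processing import SRC, TRG, engTokenizer, arTokenizer

    print(engTokenizer("I'm ready."))   # spaCy tokens, e.g. ['I', "'m", 'ready', '.']
    print(arTokenizer("أنا جاهز."))     # the '.' is stripped by the regexes, then whitespace-split

    print(SRC.vocab.stoi["<pad>"])      # padding index later fed to the model as src_pad_idx
    batch = SRC.process([engTokenizer("I'm ready")])
    print(batch.shape)                  # (num_tokens + 2, 1): <sos> + tokens + <eos>, batch of one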
src/train/__pycache__/transformer.cpython-311.pyc
ADDED
Binary file (4.4 kB)
src/train/train.py
CHANGED
@@ -3,103 +3,119 @@ import torch
import numpy as np

from torch import nn, optim
from torchtext import data
from transformer import Transformer

import sys

sys.path.append(os.path.abspath("src/data_processing/"))
from data_processing import (
    SRC,
    TRG,
    train_data,
    valid_data,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

"""Hyperparameters"""
BATCH_SIZE = 16

train_iter, valid_iter = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    sort=None,
    sort_within_batch=False,
    sort_key=lambda x: len(x.eng),
    device=device,
    shuffle=True,
)
load_model = False
save_model = True

num_epochs = 30
learning_rate = 0.0001

num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3

max_len = 230
dropout = 0.4
embedding_size = 256
src_pad_idx = SRC.vocab.stoi["<pad>"]

src_vocab_size = len(SRC.vocab)
print("Size of english vocabulary:", src_vocab_size)

trg_vocab_size = len(TRG.vocab)
print("Size of arabic vocabulary:", trg_vocab_size)

model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    dropout,
    max_len,
    device=device,
).to(device)

loss_track = []
loss_validation_track = []

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = SRC.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
for epoch in range(num_epochs):
    stepLoss = []
    model.train()
    for batch in train_iter:
        input_data = batch.eng.to(device)
        target = batch.ar.to(device)

        output = model(input_data, target[:-1])
        optimizer.zero_grad()

        output = output.reshape(-1, trg_vocab_size)
        target = target[1:].reshape(-1)

        loss = criterion(output, target)
        loss.backward()

        optimizer.step()
        stepLoss.append(loss.item())

    loss_track.append(np.mean(stepLoss))
    print(" Epoch {} | Train Cross Entropy Loss: ".format(epoch), np.mean(stepLoss))
    with torch.inference_mode():
        stepValidLoss = []
        model.eval()
        for i, batch in enumerate(valid_iter):
            input_sentence = batch.eng.to(device)
            target = batch.ar.to(device)
            optimizer.zero_grad()
            output = model(input_sentence, target[:-1])
            output = output.reshape(-1, trg_vocab_size)
            target = target[1:].reshape(-1)
            loss = criterion(output, target)

            stepValidLoss.append(loss.item())

    loss_validation_track.append(np.mean(stepValidLoss))
    print(
        " Epoch {} | Validation Cross Entropy Loss: ".format(epoch),
        np.mean(stepValidLoss),
    )

# Save the model
script_directory = os.path.dirname(os.path.abspath(__file__))
model = model.to('cpu')
torch.save(model.state_dict(), os.path.join(script_directory, "../../models/arabic2english.pt"))
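The target[:-1] / target[1:] pair in both loops is the standard teacher-forcing shift: the decoder is fed the sequence up to (but not including) the last token, and the loss is computed against the sequence shifted left by one. A toy illustration with made-up indices:

    # Toy illustration of the target[:-1] / target[1:] shift used above.
    # Suppose a target sentence is <sos> w1 w2 w3 <eos>, stored as (seq_len, batch=1).
    import torch

    target = torch.tensor([[2], [10], [11], [12], [3]])  # made-up ids for <sos>, w1..w3, <eos>

    decoder_input = target[:-1]  # <sos> w1 w2 w3  -> fed to the model
    labels = target[1:]          # w1 w2 w3 <eos>  -> compared against the predictions
    print(decoder_input.squeeze(1).tolist(), labels.squeeze(1).tolist())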
src/train/transformer.py
CHANGED
import torch
from torch import nn


class Transformer(nn.Module):
    """
    Transformer model for sequence-to-sequence translation.
    """

    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        dropout,
        max_len,
        device,
    ):
        """
        Initializes the Transformer model.

        Args:
            embedding_size: Size of the embeddings.
            src_vocab_size: Size of the source vocabulary.
            trg_vocab_size: Size of the target vocabulary.
            src_pad_idx: Index of the padding token in the source vocabulary.
            num_heads: Number of attention heads.
            num_encoder_layers: Number of encoder layers.
            num_decoder_layers: Number of decoder layers.
            dropout: Dropout probability.
            max_len: Maximum sequence length.
        """

        super(Transformer, self).__init__()
        self.src_embeddings = nn.Embedding(src_vocab_size, embedding_size)
        self.src_positional_embeddings = nn.Embedding(max_len, embedding_size)
        self.trg_embeddings = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_positional_embeddings = nn.Embedding(max_len, embedding_size)
        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
        )

        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        src_mask = src.transpose(0, 1) == self.src_pad_idx

        return src_mask.to(self.device)

    def forward(self, src, trg):
        src_seq_length, S = src.shape
        trg_seq_length, S = trg.shape

        # Position indices (0..seq_len-1), broadcast across the batch, for the
        # learned positional embeddings
        src_positions = (
            torch.arange(0, src_seq_length)
            .unsqueeze(1)
            .expand(src_seq_length, S)
            .to(self.device)
        )

        trg_positions = (
            torch.arange(0, trg_seq_length)
            .unsqueeze(1)
            .expand(trg_seq_length, S)
            .to(self.device)
        )

        embed_src = self.dropout(
            (self.src_embeddings(src) + self.src_positional_embeddings(src_positions))
        )

        embed_trg = self.dropout(
            (self.trg_embeddings(trg) + self.trg_positional_embeddings(trg_positions))
        )

        src_padding_mask = self.make_src_mask(src)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
            self.device
        )

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )

        out = self.fc_out(out)

        return out
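As a rough shape check of the module above (a sketch with made-up vocabulary sizes and padding index; it only confirms that the expected (seq_len, batch) layout flows through):

    # Sketch: shape check of the Transformer wrapper.
    import torch
    from transformer import Transformer  # assumes src/train/ is on sys.path

    device = "cpu"
    model = Transformer(
        256,      # embedding_size
        100,      # src_vocab_size (made up)
        120,      # trg_vocab_size (made up)
        1,        # src_pad_idx (made up)
        8, 3, 3,  # num_heads, num_encoder_layers, num_decoder_layers
        0.4,      # dropout
        230,      # max_len
        device,
    ).to(device)

    src = torch.randint(2, 100, (12, 4))  # (src_seq_len, batch) of token ids
    trg = torch.randint(2, 120, (9, 4))   # (trg_seq_len, batch) of token ids
    out = model(src, trg)
    print(out.shape)  # torch.Size([9, 4, 120]): (trg_seq_len, batch, trg_vocab_size)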
src/translation/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (200 Bytes)

src/translation/__pycache__/translate.cpython-311.pyc
ADDED
Binary file (3.43 kB)
src/translation/translate.py
CHANGED
import torch
import os
import sys

sys.path.append(os.path.abspath("src/train/"))
sys.path.append(os.path.abspath("src/data_processing/"))

from transformer import Transformer
from data_processing import SRC, TRG, arTokenizer, engTokenizer

device = "cpu"

num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
max_len = 230
dropout = 0.4
embedding_size = 256

src_pad_idx = SRC.vocab.stoi["<pad>"]
src_vocab_size = len(SRC.vocab)
trg_vocab_size = len(TRG.vocab)

# Initialize model with hyperparameters
model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    dropout,
    max_len,
    device=device,
).to(device)

# Load the saved model
model.load_state_dict(torch.load("models/arabic2english.pt", map_location=device))


def translate(sentence, srcField, targetField):
    """Translates an English sentence to Arabic using the model."""
    model.eval()
    # The Gradio dropdowns pass language names here; translation is fixed to
    # English -> Arabic, so the fields are overridden with SRC/TRG.
    srcTokenizer = engTokenizer
    srcField = SRC
    targetField = TRG
    processed_sentence = srcField.process([srcTokenizer(sentence)]).to(device)
    trg = ["بداية"]

    for _ in range(max_len):
        trg_tensor = (
            torch.tensor([targetField.vocab.stoi[word] for word in trg])
            .unsqueeze(1)
            .to(device)
        )
        outputs = model(processed_sentence, trg_tensor)

        pred_token = targetField.vocab.itos[outputs.argmax(2)[-1:].item()]
        if pred_token != "<unk>":
            trg.append(pred_token)
        if pred_token == "نهاية":
            break

    return " ".join([word for word in trg if word != "<unk>"][1:-1])
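The loop above is greedy decoding: the model is re-run on the growing target prefix and the highest-scoring token is appended until the end marker appears; the final line then drops the start/end markers and any <unk> tokens. A toy illustration of that post-processing (the Arabic words are made up for the example, not model output):

    # Toy illustration of the final join/slice; not real model output.
    trg = ["بداية", "أنا", "جاهز", "نهاية"]  # start marker, two words, end marker
    print(" ".join([word for word in trg if word != "<unk>"][1:-1]))  # -> "أنا جاهز"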