alifalhasan committed on
Commit
9a4dd2c
1 Parent(s): 8e41ab0

[Task] Model Deployment

Browse files

[Description] Completed model training and deployment.
[Author]

@alifalhasan

app.py CHANGED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr

from src.translation.translate import translate

# Languages offered by both dropdown menus in the UI.
LANGS = ["arabic", "english"]


def _build_interface():
    """Assemble the Gradio Interface around the translate() callback."""
    return gr.Interface(
        fn=translate,
        inputs=[
            gr.components.Textbox(label="Text"),
            gr.components.Dropdown(label="Source Language", choices=LANGS),
            gr.components.Dropdown(label="Target Language", choices=LANGS),
        ],
        outputs=["text"],
        examples=[["I'm ready", "english", "arabic"]],
        cache_examples=False,
        title="arabic2english",
        description="This is a translator app for arabic and english. Currently supports only english to arabic."
    )


if __name__ == "__main__":
    # Build and launch the interface with a public share link.
    _build_interface().launch(share=True)
models/arabic2english.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8c3e17038c7e05e86f3e7abad08ed8d9b0270b8cb70b5ee259e4df74bd321a2
3
+ size 93639810
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  gradio
2
- torch
3
- torchtext
4
  spacy
5
  transformers
6
  nltk
 
1
  gradio
2
+ torch>=1.6
3
+ torchtext==0.6
4
  spacy
5
  transformers
6
  nltk
src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (188 Bytes). View file
 
src/data_processing/__pycache__/data_processing.cpython-311.pyc ADDED
Binary file (3.98 kB). View file
 
src/data_processing/data_processing.py CHANGED
@@ -1,68 +1,52 @@
1
- import os
2
  import re
3
  import spacy
4
  import random
5
  import pandas as pd
6
 
 
7
  from spacy.lang.ar import Arabic
8
- from torchtext.legacy import data
9
  from spacy.tokenizer import Tokenizer
10
 
11
- # Load data
12
- script_directory = os.path.dirname(os.path.abspath(__file__))
13
  df = pd.read_csv(
14
- os.path.join(script_directory, "../../data/arabic2english.txt"),
15
  delimiter="\t",
16
  names=["eng", "ar"],
17
  )
18
 
19
- # Load English and Arabic language models from spaCy
20
  spacy_eng = spacy.load("en_core_web_sm")
21
- ar = Arabic()
22
- ar_tokenizer = Tokenizer(ar.vocab)
 
23
 
24
 
25
- # Tokenizer functions
26
  def engTokenizer(text):
27
  return [word.text for word in spacy_eng.tokenizer(text)]
28
 
29
 
30
- def arTokenizer(text):
31
  return [
32
  word.text
33
- for word in ar_tokenizer(
34
- re.sub(r"\s+", " ", re.sub(r"[\.\'\"\n+]", " ", text)).strip()
35
  )
36
  ]
37
 
38
 
39
- # Fields for source (English) and target (Arabic) data
40
- SOURCE = data.Field(
41
- tokenize=engTokenizer, # Custom tokenizer for English
42
- init_token="<sos>", # Start-of-sentence token
43
- eos_token="<eos>", # End-of-sentence token
44
- batch_first=False,
45
  )
46
- TARGET = data.Field(
47
- tokenize=arTokenizer, # Custom tokenizer for Arabic
48
- init_token="ببدأ", # Arabic start-of-sentence token
49
- eos_token="نهها", # Arabic end-of-sentence token
50
- tokenizer_language="ar", # Specify language for tokenization
51
  batch_first=False,
 
 
 
52
  )
53
 
54
 
55
  class TextDataset(data.Dataset):
56
- def __init__(self, df, src_field, target_field, is_test=False):
57
- """
58
- Initializes a TextDataset.
59
-
60
- Args:
61
- df: A Pandas DataFrame containing text data.
62
- src_field: The Field object for the source language.
63
- target_field: The Field object for the target language.
64
- is_test: A boolean indicating whether this is a test dataset.
65
- """
66
  fields = [("eng", src_field), ("ar", target_field)]
67
  samples = []
68
  for i, row in df.iterrows():
@@ -70,25 +54,20 @@ class TextDataset(data.Dataset):
70
  ar = row.ar
71
  samples.append(data.Example.fromlist([eng, ar], fields))
72
 
73
- super().__init__(samples, fields)
74
 
75
  def __len__(self):
76
- """Returns the number of samples in the dataset."""
77
  return len(self.samples)
78
 
79
  def __getitem__(self, idx):
80
- """Returns the sample at the given index."""
81
  return self.samples[idx]
82
 
83
 
84
- # TextDataset instance
85
- torchdataset = TextDataset(df, SOURCE, TARGET)
86
 
87
- # Split the dataset into training and validation sets
88
  train_data, valid_data = torchdataset.split(
89
- split_ratio=0.8, random_state=random.seed(42)
90
  )
91
 
92
- # Build vocabulary for source and target fields
93
- SOURCE.build_vocab(train_data, min_freq=2)
94
- TARGET.build_vocab(train_data, min_freq=2)
 
 
import re
import spacy
import random
import pandas as pd

from torchtext import data
from spacy.lang.ar import Arabic
from spacy.tokenizer import Tokenizer

# Parallel corpus: one tab-separated (english, arabic) sentence pair per line.
df = pd.read_csv(
    "data/arabic2english.txt",
    delimiter="\t",
    names=["eng", "ar"],
)

# English tokenization comes from spaCy's small English pipeline.
spacy_eng = spacy.load("en_core_web_sm")

# Arabic tokenization uses a blank spaCy Arabic tokenizer.
arab = Arabic()
ar_Tokenizer = Tokenizer(arab.vocab)


def engTokenizer(text):
    """Tokenize an English sentence into a list of token strings."""
    return [word.text for word in spacy_eng.tokenizer(text)]


def arTokenizer(sentence):
    """Tokenize an Arabic sentence.

    Periods, quotes, and newlines are replaced by spaces and runs of
    whitespace are collapsed before tokenizing.
    """
    return [
        word.text
        for word in ar_Tokenizer(
            re.sub(r"\s+", " ", re.sub(r"[\.\'\"\n+]", " ", sentence)).strip()
        )
    ]


# Source (English) field: custom tokenizer plus <sos>/<eos> markers.
SRC = data.Field(
    tokenize=engTokenizer, batch_first=False, init_token="<sos>", eos_token="<eos>"
)
# Target (Arabic) field: Arabic start/end sentence markers.
TRG = data.Field(
    tokenize=arTokenizer,
    batch_first=False,
    tokenizer_language="ar",
    init_token="بداية",
    eos_token="نهاية",
)


class TextDataset(data.Dataset):
    """torchtext Dataset built from the (eng, ar) columns of a DataFrame."""

    def __init__(self, df, src_field, target_field, is_test=False, **kwargs):
        fields = [("eng", src_field), ("ar", target_field)]
        samples = []
        for i, row in df.iterrows():
            eng = row.eng
            ar = row.ar
            samples.append(data.Example.fromlist([eng, ar], fields))

        super().__init__(samples, fields, **kwargs)

    def __len__(self):
        # Fix: data.Dataset.__init__ stores the examples as self.examples;
        # the attribute self.samples never exists, so the previous body
        # raised AttributeError on every call.
        return len(self.examples)

    def __getitem__(self, idx):
        # Same fix as __len__: index the inherited self.examples list.
        return self.examples[idx]


torchdataset = TextDataset(df, SRC, TRG)

# random.seed(32) seeds the global RNG (it returns None, so split() falls
# back to the module RNG state), making the 80/20 split reproducible.
train_data, valid_data = torchdataset.split(
    split_ratio=0.8, random_state=random.seed(32)
)

# Only tokens seen at least twice in the training split enter the vocab.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)
 
src/train/__pycache__/transformer.cpython-311.pyc ADDED
Binary file (4.4 kB). View file
 
src/train/train.py CHANGED
@@ -3,103 +3,119 @@ import torch
3
  import numpy as np
4
 
5
  from torch import nn, optim
6
- from torchtext.legacy import data
7
  from transformer import Transformer
8
- from data_processing import SOURCE, TARGET, train_data, valid_data
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  """Hyperparameters"""
11
- # Training
12
  BATCH_SIZE = 16
13
- learning_rate = 0.001
14
- num_epochs = 20
15
 
16
- # Model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  num_heads = 8
18
  num_encoder_layers = 3
19
  num_decoder_layers = 3
20
- pad_idx = SOURCE.vocab.stoi["<pad>"]
21
  max_len = 230
22
  dropout = 0.4
23
  embedding_size = 256
 
24
 
25
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
26
 
27
- # Create iterators for Transformer
28
- train_iter, valid_iter = data.BucketIterator.splits(
29
- (train_data, valid_data),
30
- batch_size=BATCH_SIZE,
31
- sort_key=lambda x: len(x.eng), # Sort by sentence length
32
- device=device,
33
- shuffle=True,
34
- )
35
 
36
- # Get vocabulary sizes
37
- src_vocab_size = len(SOURCE.vocab)
38
- trg_vocab_size = len(TARGET.vocab)
39
 
40
- # Initialize Transformer model
41
  model = Transformer(
42
  embedding_size,
43
  src_vocab_size,
44
  trg_vocab_size,
45
- pad_idx,
46
- num_heads=num_heads,
47
- num_encoder_layers=num_encoder_layers,
48
- num_decoder_layers=num_decoder_layers,
49
- forward_expansion=2 * embedding_size,
50
- dropout=dropout,
51
- max_len=max_len,
52
  device=device,
53
  ).to(device)
54
 
55
- train_loss = []
56
- validation_loss = []
 
57
 
58
  optimizer = optim.Adam(model.parameters(), lr=learning_rate)
59
 
 
60
  criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
61
-
62
  for epoch in range(num_epochs):
63
- step_loss = []
64
  model.train()
65
-
66
  for batch in train_iter:
67
  input_data = batch.eng.to(device)
68
  target = batch.ar.to(device)
69
 
70
  output = model(input_data, target[:-1])
71
-
72
  optimizer.zero_grad()
73
- output = output.reshape(-1, trg_vocab_size) # Reshape for loss calculation
 
74
  target = target[1:].reshape(-1)
 
75
  loss = criterion(output, target)
76
  loss.backward()
77
- optimizer.step()
78
 
79
- step_loss.append(loss.item())
80
-
81
- train_loss.append(np.mean(step_loss))
82
- print(f"Epoch {epoch} | Train Cross Entropy Loss: {np.mean(step_loss)}")
83
 
 
 
84
  with torch.inference_mode():
85
- step_valid_loss = []
86
  model.eval()
87
-
88
- for batch in valid_iter:
89
  input_sentence = batch.eng.to(device)
90
  target = batch.ar.to(device)
91
-
92
  output = model(input_sentence, target[:-1])
93
  output = output.reshape(-1, trg_vocab_size)
94
  target = target[1:].reshape(-1)
95
-
96
  loss = criterion(output, target)
97
- step_valid_loss.append(loss.item())
98
 
99
- validation_loss.append(np.mean(step_valid_loss))
100
- print(
101
- f"Epoch {epoch} | Validation Cross Entropy Loss: {np.mean(step_valid_loss)}"
102
- )
 
 
 
103
 
 
104
  script_directory = os.path.dirname(os.path.abspath(__file__))
105
- torch.save(model, os.path.join(script_directory, "../../models/arabic2english.pt"))
 
 
import numpy as np

from torch import nn, optim
from torchtext import data
from transformer import Transformer

import sys

# Make the preprocessing module (fields + dataset splits) importable.
sys.path.append(os.path.abspath("src/data_processing/"))
from data_processing import (
    SRC,
    TRG,
    train_data,
    valid_data,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

"""Hyperparameters"""
BATCH_SIZE = 16

# Bucket sentences of similar length together to minimize padding per batch.
train_iter, valid_iter = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    sort=None,
    sort_within_batch=False,
    sort_key=lambda x: len(x.eng),
    device=device,
    shuffle=True,
)
load_model = False
save_model = True

num_epochs = 30
learning_rate = 0.0001

num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3

max_len = 230
dropout = 0.4
embedding_size = 256
src_pad_idx = SRC.vocab.stoi["<pad>"]

src_vocab_size = len(SRC.vocab)
print("Size of english vocabulary:", src_vocab_size)

trg_vocab_size = len(TRG.vocab)
print("Size of arabic vocabulary:", trg_vocab_size)

model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    dropout,
    max_len,
    device=device,
).to(device)

loss_track = []
loss_validation_track = []

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Fix: reuse the pad index computed above instead of a second vocab lookup.
pad_idx = src_pad_idx
# Padding positions contribute nothing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
for epoch in range(num_epochs):
    stepLoss = []
    model.train()
    for batch in train_iter:
        input_data = batch.eng.to(device)
        target = batch.ar.to(device)

        # Teacher forcing: decoder sees the target shifted right (last token
        # dropped) and is trained to predict the target shifted left.
        output = model(input_data, target[:-1])
        optimizer.zero_grad()

        # Flatten (seq, batch, vocab) -> (seq*batch, vocab) for the loss.
        output = output.reshape(-1, trg_vocab_size)
        target = target[1:].reshape(-1)

        loss = criterion(output, target)
        loss.backward()

        optimizer.step()
        stepLoss.append(loss.item())

    loss_track.append(np.mean(stepLoss))
    print(" Epoch {} | Train Cross Entropy Loss: ".format(epoch), np.mean(stepLoss))
    with torch.inference_mode():
        stepValidLoss = []
        model.eval()
        for i, batch in enumerate(valid_iter):
            input_sentence = batch.eng.to(device)
            target = batch.ar.to(device)
            # Fix: removed the stray optimizer.zero_grad() that sat here —
            # no gradients exist under inference_mode, so the call was dead
            # and misleading.
            output = model(input_sentence, target[:-1])
            output = output.reshape(-1, trg_vocab_size)
            target = target[1:].reshape(-1)
            loss = criterion(output, target)

            stepValidLoss.append(loss.item())

    loss_validation_track.append(np.mean(stepValidLoss))
    print(
        " Epoch {} | Validation Cross Entropy Loss: ".format(epoch),
        np.mean(stepValidLoss),
    )

# Save the model weights on CPU so the checkpoint loads on any device.
script_directory = os.path.dirname(os.path.abspath(__file__))
model = model.to('cpu')
torch.save(model.state_dict(), os.path.join(script_directory, "../../models/arabic2english.pt"))
src/train/transformer.py CHANGED
@@ -1,8 +1,6 @@
1
  import torch
2
  from torch import nn
3
 
4
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
5
-
6
 
7
  class Transformer(nn.Module):
8
  """
@@ -12,106 +10,90 @@ class Transformer(nn.Module):
12
  def __init__(
13
  self,
14
  embedding_size,
15
- source_vocab_size,
16
- target_vocab_size,
17
- source_pad_idx,
18
  num_heads,
19
  num_encoder_layers,
20
  num_decoder_layers,
21
- forward_expansion,
22
  dropout,
23
  max_len,
 
24
  ):
25
  """
26
  Initializes the Transformer model.
27
 
28
  Args:
29
  embedding_size: Size of the embeddings.
30
- source_vocab_size: Size of the source vocabulary.
31
- target_vocab_size: Size of the target vocabulary.
32
- source_pad_idx: Index of the padding token in the source vocabulary.
33
  num_heads: Number of attention heads.
34
  num_encoder_layers: Number of encoder layers.
35
  num_decoder_layers: Number of decoder layers.
36
- forward_expansion: Factor for expanding the model dimensionality.
37
  dropout: Dropout probability.
38
  max_len: Maximum sequence length.
39
  """
40
 
41
- super().__init__()
42
-
43
- # Embedding layers for source and target tokens
44
- self.src_embeddings = nn.Embedding(source_vocab_size, embedding_size)
45
- self.trg_embeddings = nn.Embedding(target_vocab_size, embedding_size)
46
-
47
- # Positional encodings for source and target sequences
48
- self.positional_encodings = nn.Parameter(
49
- torch.zeros(1, max_len, embedding_size)
50
- )
51
-
52
- # Transformer encoder-decoder
53
  self.transformer = nn.Transformer(
54
- d_model=embedding_size,
55
- nhead=num_heads,
56
- num_encoder_layers=num_encoder_layers,
57
- num_decoder_layers=num_decoder_layers,
58
- dim_feedforward=forward_expansion * embedding_size,
59
- dropout=dropout,
60
  )
61
 
62
- # Output layer for target vocabulary
63
- self.fc_out = nn.Linear(embedding_size, target_vocab_size)
64
-
65
- # Dropout for regularization
66
  self.dropout = nn.Dropout(dropout)
67
-
68
- # Source padding index
69
- self.src_pad_idx = source_pad_idx
70
 
71
  def make_src_mask(self, src):
72
- """
73
- Creates a mask for padding tokens in the source sequence.
74
- """
75
-
76
  src_mask = src.transpose(0, 1) == self.src_pad_idx
77
- return src_mask
 
78
 
79
  def forward(self, src, trg):
80
- """
81
- Forward pass of the Transformer model.
82
- """
 
 
 
 
 
 
83
 
84
- src_seq_length, N = src.shape
85
- trg_seq_length, N = trg.shape
 
 
 
 
86
 
87
- # Add positional encodings to embeddings
88
  embed_src = self.dropout(
89
- (
90
- self.src_embeddings(src)
91
- + self.positional_encodings[:, :src_seq_length, :]
92
- )
93
  )
 
94
  embed_trg = self.dropout(
95
- (
96
- self.trg_embeddings(trg)
97
- + self.positional_encodings[:, :trg_seq_length, :]
98
- )
99
  )
100
 
101
- # Create masks for source padding and target sequence
102
  src_padding_mask = self.make_src_mask(src)
103
  trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
104
- device
105
  )
106
 
107
- # Pass input through transformer encoder-decoder
108
  out = self.transformer(
109
  embed_src,
110
  embed_trg,
111
  src_key_padding_mask=src_padding_mask,
112
  tgt_mask=trg_mask,
113
  )
114
-
115
- # Apply output layer
116
  out = self.fc_out(out)
 
117
  return out
 
import torch
from torch import nn


class Transformer(nn.Module):
    """
    Sequence-to-sequence Transformer that maps a source token sequence to
    target-vocabulary logits, using learned positional embeddings.
    """

    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        dropout,
        max_len,
        device,
    ):
        """
        Initializes the Transformer model.

        Args:
            embedding_size: Size of the embeddings.
            src_vocab_size: Size of the source vocabulary.
            trg_vocab_size: Size of the target vocabulary.
            src_pad_idx: Index of the padding token in the source vocabulary.
            num_heads: Number of attention heads.
            num_encoder_layers: Number of encoder layers.
            num_decoder_layers: Number of decoder layers.
            dropout: Dropout probability.
            max_len: Maximum sequence length.
            device: Device on which masks and position indices are placed.
        """

        super(Transformer, self).__init__()
        # NOTE: submodules are registered in the same order as the original
        # so global-RNG parameter initialization is reproduced exactly.
        self.src_embeddings = nn.Embedding(src_vocab_size, embedding_size)
        self.src_positional_embeddings = nn.Embedding(max_len, embedding_size)
        self.trg_embeddings = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_positional_embeddings = nn.Embedding(max_len, embedding_size)
        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
        )
        # Projects decoder states onto target-vocabulary logits.
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        # True where the source token is padding; shape (batch, src_len)
        # as expected by nn.Transformer's src_key_padding_mask.
        pad_positions = src.transpose(0, 1) == self.src_pad_idx
        return pad_positions.to(self.device)

    def forward(self, src, trg):
        src_len, batch = src.shape
        trg_len, batch = trg.shape

        # Position indices 0..len-1, replicated across the batch dimension.
        src_positions = (
            torch.arange(0, src_len)
            .unsqueeze(1)
            .expand(src_len, batch)
            .to(self.device)
        )
        trg_positions = (
            torch.arange(0, trg_len)
            .unsqueeze(1)
            .expand(trg_len, batch)
            .to(self.device)
        )

        # Token embedding + learned positional embedding, then dropout.
        embed_src = self.dropout(
            self.src_embeddings(src) + self.src_positional_embeddings(src_positions)
        )
        embed_trg = self.dropout(
            self.trg_embeddings(trg) + self.trg_positional_embeddings(trg_positions)
        )

        src_padding_mask = self.make_src_mask(src)
        # Causal mask: each target position attends only to earlier positions.
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_len).to(
            self.device
        )

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        return self.fc_out(out)
src/translation/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (200 Bytes). View file
 
src/translation/__pycache__/translate.cpython-311.pyc ADDED
Binary file (3.43 kB). View file
 
src/translation/translate.py CHANGED
@@ -1,25 +1,65 @@
1
  import torch
 
 
2
 
3
- device = "cuda" if torch.cuda.is_available() else "cpu"
 
4
 
5
- def translate_sentence(model,sentence,srcField,targetField,srcTokenizer):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  model.eval()
 
 
 
7
  processed_sentence = srcField.process([srcTokenizer(sentence)]).to(device)
8
  trg = ["بداية"]
9
 
10
- for _ in range(60):
11
- trg_indecies = [targetField.vocab.stoi[word] for word in trg]
12
- trg_tensor = torch.LongTensor(trg_indecies).unsqueeze(1).to(device)
13
- outputs = model(processed_sentence,trg_tensor)
14
-
15
- if targetField.vocab.itos[outputs.argmax(2)[-1:].item()] == "<unk>":
16
- continue
17
- trg.append(targetField.vocab.itos[outputs.argmax(2)[-1:].item()])
18
- if targetField.vocab.itos[outputs.argmax(2)[-1:].item()] == "نهاية":
19
- break
20
- return " ".join([word for word in trg if word != "<unk>"][1:-1])
21
 
 
 
 
 
 
22
 
23
- if __name__ == '__main__':
24
- print("I'm home -> {}",translate_sentence(model,"I'm at home" ,SRC,TRG,engTokenizer))
25
- print("I'm alone -> {}",translate_sentence(model,"I'm alone" ,SRC,TRG,engTokenizer))
 
import torch
import os
import sys

# Make the sibling model/preprocessing modules importable.
sys.path.append(os.path.abspath("src/train/"))
sys.path.append(os.path.abspath("src/data_processing/"))

from transformer import Transformer
from data_processing import SRC, TRG, arTokenizer, engTokenizer

# Inference runs on CPU so the app works without a GPU.
device = "cpu"

# Must match the hyperparameters used in src/train/train.py, or the
# saved state_dict will not load.
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
max_len = 230
dropout = 0.4
embedding_size = 256

src_pad_idx = SRC.vocab.stoi["<pad>"]
src_vocab_size = len(SRC.vocab)
trg_vocab_size = len(TRG.vocab)

# Initialize model with hyperparameters
model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    dropout,
    max_len,
    device=device,
).to(device)

# Load the saved model weights
model.load_state_dict(torch.load("models/arabic2english.pt", map_location=device))


def translate(sentence, srcField, targetField):
    """Greedily translate an English sentence into Arabic.

    Args:
        sentence: English source text.
        srcField: ignored — the Gradio UI passes the source-language name
            here; only english→arabic is supported, so SRC is always used.
        targetField: ignored — likewise always overridden with TRG.

    Returns:
        The decoded Arabic sentence as a single space-joined string.
    """
    model.eval()
    # Only english→arabic is supported, so the field arguments are
    # deliberately overridden (the UI passes language names, not fields).
    srcTokenizer = engTokenizer
    srcField = SRC
    targetField = TRG
    processed_sentence = srcField.process([srcTokenizer(sentence)]).to(device)
    trg = ["بداية"]  # start-of-sentence token

    for _ in range(max_len):
        trg_tensor = (
            torch.tensor([targetField.vocab.stoi[word] for word in trg])
            .unsqueeze(1)
            .to(device)
        )
        outputs = model(processed_sentence, trg_tensor)

        # Greedy decoding: keep the most likely next token, skipping <unk>.
        pred_token = targetField.vocab.itos[outputs.argmax(2)[-1:].item()]
        if pred_token != "<unk>":
            trg.append(pred_token)
        if pred_token == "نهاية":  # end-of-sentence token
            break

    # Drop the start token; drop the end token only when decoding actually
    # produced one. (The old `[1:-1]` slice always dropped the final element,
    # losing the last real word whenever max_len was reached without an eos.)
    words = trg[1:]
    if words and words[-1] == "نهاية":
        words.pop()
    return " ".join(words)