import random
import re

import pandas as pd
import spacy
from spacy.lang.ar import Arabic
from spacy.tokenizer import Tokenizer

# NOTE: this uses the legacy torchtext API (torchtext < 0.9,
# or torchtext.legacy.data in torchtext 0.9-0.11).
from torchtext import data

# Reading the parallel corpus into a pandas DataFrame
df = pd.read_csv(
    "data/arabic2english.txt",
    delimiter="\t",
    names=["eng", "ar"],
)

# Loading the English language model from spaCy
spacy_eng = spacy.load("en_core_web_sm")

# Creating an instance of the Arabic language class from spaCy
arab = Arabic()

# Creating a tokenizer for Arabic text using the Arabic vocabulary
ar_Tokenizer = Tokenizer(arab.vocab)


def engTokenizer(text):
    """
    Tokenizes English text using the spaCy tokenizer.

    Args:
        text (str): The input English text.

    Returns:
        list: List of tokens.
    """
    return [word.text for word in spacy_eng.tokenizer(text)]


def arTokenizer(sentence):
    """
    Tokenizes an Arabic sentence using the spaCy tokenizer,
    after stripping periods, quotes, plus signs, and extra whitespace.

    Args:
        sentence (str): The input Arabic sentence.

    Returns:
        list: List of tokens.
    """
    return [
        word.text
        for word in ar_Tokenizer(
            re.sub(r"\s+", " ", re.sub(r"[\.\'\"\n+]", " ", sentence)).strip()
        )
    ]


# Defining fields for the source and target languages using torchtext.
# The SRC start/end tokens are assumed to be "<sos>"/"<eos>"; the originals
# appear to have been stripped as HTML tags.
SRC = data.Field(
    tokenize=engTokenizer,
    batch_first=False,
    init_token="<sos>",
    eos_token="<eos>",
)
TRG = data.Field(
    tokenize=arTokenizer,
    batch_first=False,
    init_token="بداية",  # Arabic start-of-sentence token
    eos_token="نهاية",  # Arabic end-of-sentence token
)


class TextDataset(data.Dataset):
    """
    Custom dataset class for text data.

    Args:
        df (pandas.DataFrame): DataFrame containing source and target language data.
        src_field (torchtext.data.Field): Field for the source language.
        target_field (torchtext.data.Field): Field for the target language.
        is_test (bool): Flag indicating whether the dataset is for testing.

    Attributes:
        fields (list): List of (field name, Field object) tuples.
        examples (list): List of data examples, populated by the parent class.
    """

    def __init__(self, df, src_field, target_field, is_test=False, **kwargs):
        fields = [("eng", src_field), ("ar", target_field)]
        samples = []
        for _, row in df.iterrows():
            samples.append(data.Example.fromlist([row.eng, row.ar], fields))
        super().__init__(samples, fields, **kwargs)

    def __len__(self):
        """
        Get the number of samples in the dataset.

        Returns:
            int: Number of samples.
        """
        # The parent class stores the examples as self.examples;
        # there is no self.samples attribute.
        return len(self.examples)

    def __getitem__(self, idx):
        """
        Get a sample from the dataset.

        Args:
            idx (int): Index of the sample.

        Returns:
            torchtext.data.Example: Sample at the specified index.
        """
        return self.examples[idx]


# Creating a TextDataset instance
torchdataset = TextDataset(df, SRC, TRG)

# Splitting the dataset into training and validation sets.
# Seed the global RNG and pass its state so the 80/20 split is reproducible
# (random.seed() itself returns None, so it cannot be passed directly).
random.seed(32)
train_data, valid_data = torchdataset.split(
    split_ratio=0.8, random_state=random.getstate()
)

# Building vocabularies for the source and target languages;
# tokens occurring fewer than 2 times are mapped to <unk>
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)
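
# --- Usage sketch (not part of the preprocessing above) ---
# A minimal illustration of how the split datasets are typically consumed
# with the legacy torchtext API: BucketIterator groups sentences of similar
# length to reduce padding. The batch size and sort key are assumptions.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=64,
    sort_key=lambda ex: len(ex.eng),  # bucket by English sentence length
    sort_within_batch=True,
    device=device,
)

# With batch_first=False, each batch attribute is a LongTensor of shape
# [sequence_length, batch_size].
for batch in train_iterator:
    src_tensor = batch.eng  # numericalized English sentences
    trg_tensor = batch.ar   # numericalized Arabic sentences
    break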