import re
import spacy
import random
import pandas as pd
from torchtext import data  # note: this API is torchtext.legacy.data in torchtext 0.9-0.11 and was removed in 0.12
from spacy.lang.ar import Arabic
from spacy.tokenizer import Tokenizer
# Reading the parallel corpus into a pandas DataFrame
df = pd.read_csv(
    "data/arabic2english.txt",
    delimiter="\t",
    names=["eng", "ar"],
)
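
# Illustrative peek at the corpus; this assumes each line of
# arabic2english.txt holds one "english<TAB>arabic" pair.
print(df.head())
print(f"{len(df)} sentence pairs loaded")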
# Loading the English language model from spaCy
spacy_eng = spacy.load("en_core_web_sm")
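# Note: the small English model must be installed before the load above, e.g.:
#   python -m spacy download en_core_web_sm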
# Creating a blank Arabic language instance from spaCy
arab = Arabic()
# Creating a tokenizer for Arabic text; with no affix rules supplied,
# this tokenizer simply splits on whitespace
ar_Tokenizer = Tokenizer(arab.vocab)
def engTokenizer(text):
    """
    Tokenizes English text using the spaCy tokenizer.

    Args:
        text (str): The input English text.

    Returns:
        list: List of tokens.
    """
    return [word.text for word in spacy_eng.tokenizer(text)]
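
# Quick illustrative check of the English tokenizer; exact tokens can vary
# slightly across spaCy versions.
print(engTokenizer("How are you?"))  # expected: ['How', 'are', 'you', '?']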
def arTokenizer(sentence):
    """
    Tokenizes an Arabic sentence after light punctuation cleanup.

    Args:
        sentence (str): The input Arabic sentence.

    Returns:
        list: List of tokens.
    """
    return [
        word.text
        for word in ar_Tokenizer(
            re.sub(r"\s+", " ", re.sub(r"[\.\'\"\n+]", " ", sentence)).strip()
        )
    ]
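
# Quick illustrative check of the Arabic tokenizer: quotes and periods are
# replaced by spaces first, then the whitespace tokenizer splits the rest.
print(arTokenizer('"مرحبا بالعالم."'))  # expected: ['مرحبا', 'بالعالم'] ("Hello world")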
# Defining torchtext fields for the source and target languages
SRC = data.Field(
    tokenize=engTokenizer, batch_first=False, init_token="<sos>", eos_token="<eos>"
)
TRG = data.Field(
    tokenize=arTokenizer,
    batch_first=False,
    # tokenizer_language only applies when tokenize="spacy"; it is ignored
    # here because a custom tokenizer callable is supplied
    tokenizer_language="ar",
    init_token="بداية",  # Arabic for "start", used as the start-of-sentence token
    eos_token="نهاية",  # Arabic for "end", used as the end-of-sentence token
)
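
# Field.preprocess runs the configured tokenizer, so the source pipeline can
# be sanity-checked directly (illustrative):
print(SRC.preprocess("Good morning!"))  # expected: ['Good', 'morning', '!']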
class TextDataset(data.Dataset):
    """
    Custom dataset class for parallel text data.

    Args:
        df (pandas.DataFrame): DataFrame containing source and target language data.
        src_field (torchtext.data.Field): Field for the source language.
        target_field (torchtext.data.Field): Field for the target language.
        is_test (bool): Flag indicating if the dataset is for testing (currently unused).

    Attributes:
        fields (dict): Mapping of field names to Field objects, set by the parent class.
        examples (list): List of data examples, set by the parent class.
    """

    def __init__(self, df, src_field, target_field, is_test=False, **kwargs):
        fields = [("eng", src_field), ("ar", target_field)]
        samples = []
        for _, row in df.iterrows():
            eng = row.eng
            ar = row.ar
            samples.append(data.Example.fromlist([eng, ar], fields))
        super().__init__(samples, fields, **kwargs)

    def __len__(self):
        """
        Get the number of samples in the dataset.

        Returns:
            int: Number of samples.
        """
        # The parent class stores the examples in self.examples;
        # self.samples does not exist and would raise an AttributeError.
        return len(self.examples)

    def __getitem__(self, idx):
        """
        Get a sample from the dataset.

        Args:
            idx (int): Index of the sample.

        Returns:
            torchtext.data.Example: Sample at the specified index.
        """
        return self.examples[idx]
# Creating a TextDataset instance
torchdataset = TextDataset(df, SRC, TRG)

# Splitting the dataset into training and validation sets.
# split() expects a full random state (random.getstate()), not the None
# returned by random.seed(), so seed first and pass the state explicitly.
random.seed(32)
train_data, valid_data = torchdataset.split(
    split_ratio=0.8, random_state=random.getstate()
)
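
# Sanity check of the 80/20 split (illustrative):
print(f"Training pairs:   {len(train_data)}")
print(f"Validation pairs: {len(valid_data)}")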
# Building vocabularies for the source and target languages; tokens seen
# fewer than min_freq times in the training data are mapped to <unk>
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)
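
# Inspecting the vocabularies built above (attribute names follow the legacy
# torchtext Vocab API):
print(f"English vocab size: {len(SRC.vocab)}")
print(f"Arabic vocab size:  {len(TRG.vocab)}")
print(SRC.vocab.itos[:6])  # specials such as <unk>, <pad>, <sos>, <eos> come first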