# arabic2english/src/data_processing/data_processing.py
import re
import spacy
import random
import pandas as pd
from torchtext import data  # legacy API: torchtext<=0.8, or torchtext.legacy in 0.9-0.11
from spacy.lang.ar import Arabic
from spacy.tokenizer import Tokenizer
# Reading the tab-separated English-Arabic sentence pairs into a pandas DataFrame
df = pd.read_csv(
"data/arabic2english.txt",
delimiter="\t",
names=["eng", "ar"],
)
# Loading English language model from spaCy
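# (install the model first if needed: python -m spacy download en_core_web_sm)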
spacy_eng = spacy.load("en_core_web_sm")
# Creating a blank Arabic language object from spaCy (no trained pipeline is
# needed for tokenization)
arab = Arabic()
# Creating a tokenizer for Arabic text; a bare spaCy Tokenizer with no rules
# splits on whitespace only
ar_Tokenizer = Tokenizer(arab.vocab)
def engTokenizer(text):
"""
Tokenizes English text using spaCy tokenizer.
Args:
text (str): The input English text.
Returns:
list: List of tokens.
"""
return [word.text for word in spacy_eng.tokenizer(text)]
def arTokenizer(sentence):
"""
Tokenizes Arabic sentence using spaCy tokenizer.
Args:
sentence (str): The input Arabic sentence.
Returns:
list: List of tokens.
"""
    # Strip periods, quotes, plus signs, and newlines, collapse runs of
    # whitespace, then tokenize the cleaned sentence
    cleaned = re.sub(r"\s+", " ", re.sub(r"[\.\'\"\n+]", " ", sentence)).strip()
    return [word.text for word in ar_Tokenizer(cleaned)]
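# Quick illustration of the two tokenizers (outputs are indicative; the exact
# English tokens depend on the installed spaCy model version):
#   engTokenizer("Hello, world!")  ->  ["Hello", ",", "world", "!"]
#   arTokenizer("مرحبا بالعالم.")  ->  ["مرحبا", "بالعالم"]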
# Defining fields for source and target languages using torchtext
SRC = data.Field(
    tokenize=engTokenizer, batch_first=False, init_token="<sos>", eos_token="<eos>"
)
# The Arabic start/end-of-sentence markers are the words "بداية" ("beginning")
# and "نهاية" ("end")
TRG = data.Field(
    tokenize=arTokenizer,
    batch_first=False,
    init_token="بداية",
    eos_token="نهاية",
)
class TextDataset(data.Dataset):
"""
Custom dataset class for text data.
Args:
df (pandas.DataFrame): DataFrame containing source and target language data.
src_field (torchtext.data.Field): Field for source language.
target_field (torchtext.data.Field): Field for target language.
is_test (bool): Flag indicating if the dataset is for testing.
Attributes:
fields (list): List of tuples containing field names and corresponding Field objects.
samples (list): List of data examples.
"""
def __init__(self, df, src_field, target_field, is_test=False, **kwargs):
fields = [("eng", src_field), ("ar", target_field)]
samples = []
        # Each DataFrame row becomes a torchtext Example with "eng" and "ar" fields
        for _, row in df.iterrows():
            samples.append(data.Example.fromlist([row.eng, row.ar], fields))
super().__init__(samples, fields, **kwargs)
    def __len__(self):
        """
        Get the number of samples in the dataset.
        Returns:
            int: Number of samples.
        """
        # torchtext's Dataset stores the constructor's examples in self.examples
        return len(self.examples)

    def __getitem__(self, idx):
        """
        Get a sample from the dataset.
        Args:
            idx (int): Index of the sample.
        Returns:
            torchtext.data.Example: Sample at the specified index.
        """
        return self.examples[idx]
# Creating a TextDataset instance
torchdataset = TextDataset(df, SRC, TRG)
# Splitting the dataset into training and validation sets; seed the RNG first
# and pass its state so the 80/20 split is reproducible
random.seed(32)
train_data, valid_data = torchdataset.split(
    split_ratio=0.8, random_state=random.getstate()
)
# Building vocabularies for source and target languages
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)
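
# A minimal sketch of downstream batching with the legacy torchtext
# BucketIterator; batch_size=32 is an illustrative choice. BucketIterator
# groups examples of similar source length to minimize padding per batch.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=32,
    sort_key=lambda ex: len(ex.eng),
    sort_within_batch=True,
)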