#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 20 07:48:54 2023
@author: peter
"""
import numpy
import pandas
import tokenizers
class CorpusLoader(object):
def __init__(self,path,
tokenizer,
text_inputs,
text_outputs,
label=None):
"""
Creates the Corpus Loader
Parameters
----------
path : str
Path to load dataset from
        tokenizer : tokenizers.Tokenizer
            Tokenizer used to encode the text columns
text_inputs : list[str]
Columns of the dataset to add to the inputs
        text_outputs : dict[str,tuple[str,str]]
The columns of the dataset to add to the outputs. The key is the name
of the column in the original dataset, the first element of the tuple
is the name that the column prefixed with '<s>' will have in the
inputs, and the second element of the tuple is the name that the column
suffixed with '</s>' will have in the outputs
label : str, optional
A column of numerical labels to add to the outputs. The default is None.
Returns
-------
None.
"""
data = pandas.read_csv(path)
self.n_rows = data.shape[0]
self.text_inputs = text_inputs
self.text_outputs = text_outputs
self.label = label
self.rng = numpy.random.default_rng()
columns = list(set(self.text_inputs)|set(self.text_outputs.keys()))
        # Tokenize every column used by the inputs or outputs in one batch per
        # column; the '<s>'/'</s>' markers are added per sample in __iter__
        tokenized = {column:tokenizer.encode_batch(data[column].astype(str).tolist(),
                                                   add_special_tokens=False)
                     for column in columns}
        if self.label is not None:
            # Numerical labels pass through untokenized; take positional
            # values so that row i lines up with the tokenized columns
            tokenized[self.label] = data[self.label].to_numpy()
            columns.append(self.label)
self.dataset = [{column:tokenized[column][i]
for column in columns}
for i in range(self.n_rows)]
        # Encode the document start/end markers on their own; special tokens
        # are suppressed so that only the marker token itself is produced
        self.start_doc = tokenizer.encode('<s>',add_special_tokens=False)
        self.end_doc = tokenizer.encode('</s>',add_special_tokens=False)
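        # self.dataset is now a list of per-row dicts mapping each column name
        # to its tokenizers.Encoding (the label column, if present, stays
        # numerical); '<s>' and '</s>' are attached per sample in __iter__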
def __len__(self):
"""
The length of the corpus
Returns
-------
int
The number of samples
"""
return self.n_rows
def __iter__(self):
"""
Generates samples in a random order
Yields
------
        X : dict
            Inputs for the model
        Y : dict
            Outputs for the model
"""
        self.rng.shuffle(self.dataset)
        for row in self.dataset:
            X = {}
            Y = {}
            for column in self.text_inputs:
                X[column] = row[column]
            for (column,(x_name,y_name)) in self.text_outputs.items():
                # Prefix the decoder input with '<s>' and suffix the target
                # output with '</s>'
                X[x_name] = tokenizers.Encoding.merge([self.start_doc,row[column]])
                Y[y_name] = tokenizers.Encoding.merge([row[column],self.end_doc])
            if self.label is not None:
                Y[self.label] = row[self.label]
            yield (X,Y)
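    # Example of what __iter__ yields (hypothetical column names): given
    # text_inputs=['question'] and
    # text_outputs={'answer': ('answer_in', 'answer_out')}, each sample is
    #     X = {'question': <Encoding>, 'answer_in': <Encoding of '<s> ...'>}
    #     Y = {'answer_out': <Encoding of '... </s>'>}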
    def max_lengths(self):
        """
        The maximum sequence length of each column, for padding purposes
        Returns
        -------
        dict[str,int]
            Maximum number of tokens in each input and output column
        """
        result = {column:max(len(row[column])
                             for row in self.dataset)
                  for column in self.text_inputs}
        for (column,(x_name,y_name)) in self.text_outputs.items():
            n = result[column] if column in result else max(len(row[column])
                                                            for row in self.dataset)
            # Allow one extra position for the '<s>' or '</s>' marker
            result[x_name] = n+1
            result[y_name] = n+1
        return result
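
if __name__ == '__main__':
    # Minimal usage sketch, not part of the loader itself. The tokenizer
    # identifier, CSV path and column names below are assumptions for
    # illustration only.
    tok = tokenizers.Tokenizer.from_pretrained('roberta-base')
    loader = CorpusLoader('corpus.csv',
                          tok,
                          text_inputs=['question'],
                          text_outputs={'answer': ('answer_in', 'answer_out')},
                          label='score')
    print(len(loader), 'samples, padded lengths:', loader.max_lengths())
    for (X, Y) in loader:
        # X['answer_in'] begins with '<s>'; Y['answer_out'] ends with '</s>'
        print({name: len(encoding) for (name, encoding) in X.items()})
        break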