Spaces:
Build error
Build error
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
""" | |
Created on Wed Sep 20 07:48:54 2023
@author: peter
""" | |
import numpy | |
import pandas | |
import tokenizers | |
class CorpusLoader:
    """
    Tokenized corpus that yields shuffled ``(inputs, outputs)`` samples.

    The CSV file is read and tokenized once, at construction time. Each
    iteration over the loader reshuffles the stored rows in place and
    yields one ``(X, Y)`` pair of dictionaries per row.
    """

    def __init__(self, path,
                 tokenizer,
                 text_inputs,
                 text_outputs,
                 label=None):
        """
        Creates the Corpus Loader

        Parameters
        ----------
        path : str
            Path to load dataset from (read with ``pandas.read_csv``)
        tokenizer : object
            Tokenizer providing ``encode`` and ``encode_batch``; its results
            are later combined with ``tokenizers.Encoding.merge``, so it is
            presumably a ``tokenizers.Tokenizer`` -- confirm against caller.
        text_inputs : list[str]
            Columns of the dataset to add to the inputs
        text_outputs : dict[str,tuple[str]]
            The columns of the dataset to add to the outputs. The key is the name
            of the column in the original dataset, the first element of the tuple
            is the name that the column prefixed with '<s>' will have in the
            inputs, and the second element of the tuple is the name that the column
            suffixed with '</s>' will have in the outputs
        label : str, optional
            A column of numerical labels to add to the outputs. The default is None.

        Returns
        -------
        None.
        """
        data = pandas.read_csv(path)
        self.n_rows = data.shape[0]
        self.text_inputs = text_inputs
        self.text_outputs = text_outputs
        self.label = label
        self.rng = numpy.random.default_rng()
        # A column may be both an input and an output source; tokenize it once.
        columns = list(set(self.text_inputs) | set(self.text_outputs.keys()))
        # encode_batch is given a plain list (not a pandas Series), with every
        # cell coerced through tokenizers.TextInputSequence as before.
        tokenized = {
            column: tokenizer.encode_batch(
                [tokenizers.TextInputSequence(text) for text in data[column]],
                add_special_tokens=False,
            )
            for column in columns
        }
        if self.label is not None:
            # Labels are stored untokenized, exactly as read from the CSV.
            tokenized[self.label] = data[self.label]
            columns.append(self.label)
        # Row-major view: one dict per CSV row, keyed by column name.
        self.dataset = [{column: tokenized[column][i]
                         for column in columns}
                        for i in range(self.n_rows)]
        # Encode the document markers the same way as the column text (no
        # automatically added special tokens), so that a tokenizer with a
        # post-processor does not wrap the markers in further special tokens.
        # max_lengths() relies on each marker adding exactly one token.
        self.start_doc = tokenizer.encode('<s>', add_special_tokens=False)
        self.end_doc = tokenizer.encode('</s>', add_special_tokens=False)

    def __len__(self):
        """
        The length of the corpus

        Returns
        -------
        int
            The number of samples
        """
        return self.n_rows

    def __iter__(self):
        """
        Generates samples in a random order

        ``self.dataset`` is shuffled in place before the first sample is
        produced, so every pass over the loader uses a new order.

        Yields
        ------
        X : dict
            Inputs for model: every column of ``text_inputs``, plus for each
            entry of ``text_outputs`` the start marker merged with the column
        Y : dict
            outputs for model: for each entry of ``text_outputs`` the column
            merged with the end marker, plus the label column when one is set
        """
        self.rng.shuffle(self.dataset)
        for row in self.dataset:
            X = {}
            Y = {}
            for column in self.text_inputs:
                X[column] = row[column]
            for (column, (x_name, y_name)) in self.text_outputs.items():
                # Input side starts with '<s>', output side ends with '</s>'.
                X[x_name] = tokenizers.Encoding.merge([self.start_doc, row[column]])
                Y[y_name] = tokenizers.Encoding.merge([row[column], self.end_doc])
            if self.label is not None:
                Y[self.label] = row[self.label]
            yield (X, Y)

    def max_lengths(self):
        """
        The longest sequence length of every model input and output

        Returns
        -------
        dict[str,int]
            For each column of ``text_inputs``, the largest ``len`` of that
            column over the dataset. For each entry of ``text_outputs``, both
            generated names map to the largest ``len`` of the source column
            plus one, accounting for the added start / end marker. An empty
            dataset gives 0 for input columns and 1 for generated names.
        """
        def longest(column):
            # default=0 keeps an empty dataset from raising ValueError.
            return max((len(row[column]) for row in self.dataset), default=0)

        result = {column: longest(column) for column in self.text_inputs}
        for (column, (inside, outside)) in self.text_outputs.items():
            # Reuse the length already computed when the column is also an input.
            n = result[column] if column in result else longest(column)
            result[inside] = n + 1
            result[outside] = n + 1
        return result