import json
import os
import numpy as np
from tqdm import tqdm
from tensorflow.keras.utils import Sequence
from lstm_chem.utils.smiles_tokenizer2 import SmilesTokenizer


class DataLoader(Sequence):
    """Keras Sequence that serves one-hot encoded SMILES batches for
    next-token prediction ('train', 'valid' or 'finetune')."""

    def __init__(self, config, data_type='train'):
        self.config = config
        self.data_type = data_type
        assert self.data_type in ['train', 'valid', 'finetune']

        self.max_len = 0
        if self.data_type == 'train':
            self.smiles = self._load(self.config.data_filename)
        elif self.data_type == 'finetune':
            self.smiles = self._load(self.config.finetune_data_filename)
        else:
            # 'valid' loads nothing here; its SMILES are expected to come
            # from an existing 'train' loader (e.g. a shallow copy)
            pass

        self.st = SmilesTokenizer()
        self.one_hot_dict = self.st.one_hot_dict

        self.tokenized_smiles = self._tokenize(self.smiles)

        if self.data_type in ['train', 'valid']:
            # shuffle indices once and carve off the validation split
            self.idx = np.arange(len(self.tokenized_smiles))
            self.valid_size = int(
                np.ceil(
                    len(self.tokenized_smiles) * self.config.validation_split))
            np.random.seed(self.config.seed)
            np.random.shuffle(self.idx)

    def _set_data(self):
        # select the subset of tokenized SMILES belonging to this data_type
        if self.data_type == 'train':
            ret = [
                self.tokenized_smiles[self.idx[i]]
                for i in self.idx[self.valid_size:]
            ]
        elif self.data_type == 'valid':
            ret = [
                self.tokenized_smiles[self.idx[i]]
                for i in self.idx[:self.valid_size]
            ]
        else:
            ret = self.tokenized_smiles
        return ret

    def _load(self, data_filename):
        # read one SMILES string per line, optionally truncated to data_length
        length = self.config.data_length
        print('loading SMILES...')
        with open(data_filename) as f:
            smiles = [s.rstrip() for s in f]
        if length != 0:
            smiles = smiles[:length]
        print('done.')
        return smiles

    def _tokenize(self, smiles):
        assert isinstance(smiles, list)
        print('tokenizing SMILES...')
        tokenized_smiles = [self.st.tokenize(smi) for smi in tqdm(smiles)]
        if self.data_type == 'train':
            # record the longest tokenized SMILES on the config so the model
            # input length can be derived from it later
            for tokenized_smi in tokenized_smiles:
                length = len(tokenized_smi)
                if self.max_len < length:
                    self.max_len = length
            self.config.train_smi_max_len = self.max_len
        print('done.')
        return tokenized_smiles

    def __len__(self):
        # number of batches per epoch
        target_tokenized_smiles = self._set_data()
        if self.data_type in ['train', 'valid']:
            ret = int(
                np.ceil(
                    len(target_tokenized_smiles) /
                    float(self.config.batch_size)))
        else:
            ret = int(
                np.ceil(
                    len(target_tokenized_smiles) /
                    float(self.config.finetune_batch_size)))
        return ret

    def __getitem__(self, idx):
        target_tokenized_smiles = self._set_data()
        if self.data_type in ['train', 'valid']:
            data = target_tokenized_smiles[idx *
                                           self.config.batch_size:(idx + 1) *
                                           self.config.batch_size]
        else:
            data = target_tokenized_smiles[idx *
                                           self.config.finetune_batch_size:
                                           (idx + 1) *
                                           self.config.finetune_batch_size]
        data = self._padding(data)

        # X is the padded sequence without its last token, y without its
        # first, so y is X shifted by one position (next-token targets)
        self.X, self.y = [], []
        for tp_smi in data:
            X = [self.one_hot_dict[symbol] for symbol in tp_smi[:-1]]
            self.X.append(X)
            y = [self.one_hot_dict[symbol] for symbol in tp_smi[1:]]
            self.y.append(y)

        self.X = np.array(self.X, dtype=np.float32)
        self.y = np.array(self.y, dtype=np.float32)

        # return self.X, self.y, [None]
        return self.X, self.y

    def _pad(self, tokenized_smi):
        # 'G' marks the start, 'E' the end, and 'A' pads every sequence to
        # the same length (max_len + 2 tokens)
        return ['G'] + tokenized_smi + ['E'] + [
            'A' for _ in range(self.max_len - len(tokenized_smi))
        ]

    def _padding(self, data):
        padded_smiles = [self._pad(t_smi) for t_smi in data]
        return padded_smiles
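

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module). It assumes a config object
# exposing the attributes referenced above -- data_filename, data_length,
# validation_split, seed, batch_size -- with hypothetical example values;
# substitute the project's real config loader and paths. Each batch then has
# shape (batch_size, max_len + 1, vocab_size).
#
#     from types import SimpleNamespace
#     config = SimpleNamespace(data_filename='dataset.smi',  # hypothetical path
#                              data_length=0,
#                              validation_split=0.1,
#                              seed=71,
#                              batch_size=256)
#     train_dl = DataLoader(config, data_type='train')
#     X, y = train_dl[0]  # X: one-hot inputs, y: X shifted by one token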