Spaces:
Runtime error
Runtime error
File size: 639 Bytes
1fc0c38 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
import codecs
from SmilesPE.learner import *
import pandas as pd
import argparse
# Defaults reproduce the values this script originally hard-coded.
DEFAULT_NUM_SYMBOLS = 30000
DEFAULT_MIN_FREQUENCY = 2000
# Column of the input CSV that holds the SMILES strings used for training.
SMILES_COLUMN = 'canonical_smiles'


def build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the tokenizer training script.

    The two positional arguments are unchanged from the original script.
    The optional flags expose values that used to be hard-coded; their
    defaults keep the previous behaviour when they are omitted.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(description='Train SmilesPE Tokenizer.')
    parser.add_argument('dataset_file_path', type=str, help='Path to the dataset file')
    parser.add_argument('output_file_path', type=str, help='Path to file containing trained tokenizer weights')
    parser.add_argument(
        '--num-symbols',
        type=int,
        default=DEFAULT_NUM_SYMBOLS,
        help='Symbol count passed to learn_SPE (default: %(default)s)',
    )
    parser.add_argument(
        '--min-frequency',
        type=int,
        default=DEFAULT_MIN_FREQUENCY,
        help='min_frequency passed to learn_SPE (default: %(default)s)',
    )
    return parser


def main(argv=None) -> None:
    """Read the dataset CSV and train a SmilesPE tokenizer from it.

    Args:
        argv: Argument strings to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Raises:
        KeyError: If the CSV has no ``canonical_smiles`` column.
    """
    args = build_arg_parser().parse_args(argv)

    # Load and validate the input before touching the output path, so a bad
    # dataset does not create or truncate the output file.
    df = pd.read_csv(args.dataset_file_path)
    smiles = df[SMILES_COLUMN].tolist()

    # The context manager guarantees the file is flushed and closed even if
    # training raises; the explicit encoding keeps the output independent of
    # the platform's locale default.
    with open(args.output_file_path, 'w', encoding='utf-8') as output:
        learn_SPE(
            smiles,
            output,
            args.num_symbols,
            min_frequency=args.min_frequency,
            augmentation=1,
            verbose=True,
            total_symbols=True,
        )


if __name__ == '__main__':
    main()