# pubmedul2_small / tokenizer.py
# %pip install sentencepiece
# %pip install datasets
# %pip install seqio
import glob
import os
import unicodedata
from random import sample

import datasets
import nltk
from tqdm import tqdm

def sample_and_make_tempfile(sentences_dir, num_files):
    """Sample num_files of the one-sentence-per-line files in sentences_dir
    and combine them into a single temporary file."""

    sentence_files = glob.glob(sentences_dir + "/*.txt")

    # sample num_files
    sampled_files = sample(sentence_files, num_files)

    print("sampled files:")
    print(sampled_files)

    # read all the lines from the sampled files into a list
    all_lines = []
    for filename in sampled_files:
        with open(filename) as f:
            lines = f.read().splitlines()
            all_lines.extend(lines)

    print("number of lines sampled:", len(all_lines))

    # combine into a single file and save
    tempfile_path = os.path.join("text", "temp.txt")
    with open(tempfile_path, "w") as f:
        for sentence in tqdm(all_lines):
            # strip surrounding whitespace and newlines
            line = sentence.strip()
            # do not save empty lines
            if line:
                f.write(line + '\n')

    print("Wrote to ", tempfile_path)

    return tempfile_path

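
# Example usage of sample_and_make_tempfile (kept commented out: the pipeline
# below uses combine_files instead, and num_files=2 is only an illustrative value):
# temp_path = sample_and_make_tempfile("text/sentences", num_files=2)
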

def chunks(sentences, n, tot_len):
    """Yield successive n-sized chunks from sentences."""
    for i in range(0, tot_len, n):
        end_i = min(len(sentences), i + n)
        # slicing a Hugging Face Dataset returns a dict of columns;
        # keep only the "text" column for this chunk
        yield sentences[i:end_i]["text"]


def make_sentence_files(dataset, chunksize=5600000, data_dir='text/sentences'):
    """Write one-sentence-per-line files, chunksize sentences per file."""

    # make sure the data dir exists
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # use a simple regex for sentence tokenizing
    sent_detector = nltk.RegexpTokenizer(u'[^ !?。]*[!?。.\n]')

    # loop over the chunks
    for chunk_ind, sentence_chunk in enumerate(chunks(dataset, chunksize, len(dataset))):

        # new file for each chunk
        filename = "sent_{}.txt".format(chunk_ind)
        filepath = os.path.join(data_dir, filename)

        print("writing to ", filepath)

        with open(filepath, "w") as f:
            for sentence in tqdm(sentence_chunk):
                # remove newlines
                line = sentence.strip()
                # unicode-normalize Japanese spaces etc.
                line = unicodedata.normalize('NFKC', line)
                # tokenize into sentences
                sentences = sent_detector.tokenize(line)
                # do not save empty items
                if sentences != []:
                    f.writelines(s + '\n' for s in sentences)


def combine_files(output_file, *files):
    """
    Combines the contents of multiple text files into a single file.

    :param output_file: Path to the output file.
    :param files: Paths to the files to be combined.
    :return: Total number of lines in the combined file.
    """
    total_lines = 0

    with open(output_file, 'w') as outfile:
        for file in files:
            with open(file, 'r') as infile:
                lines = infile.readlines()
                total_lines += len(lines)
                outfile.writelines(lines)
                # Add a newline for separation (optional)
                outfile.write('\n')

    return total_lines


# make sentence files from the Hugging Face dataset
dataset_bio = datasets.load_dataset("Siddharth63/biological_dataset")
make_sentence_files(dataset_bio["train"])

# combine sentence files into a single training file (45 million sentences)
output_file_path = "text/final_file.txt"
files_to_combine = glob.glob("text/sentences/*.txt")
files_to_combine = files_to_combine[:2]
total_lines = combine_files(output_file_path, *files_to_combine)

# Train the SentencePiece tokenizer on the 45 million sentences
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input="text/final_file.txt", model_prefix='spiece', vocab_size=32000, character_coverage=1.0,
    pad_id=0, unk_id=2, eos_id=1, bos_id=-1,
    user_defined_symbols=['[NLU]', '[NLG]', '[S2S]'],
    train_extremely_large_corpus=True,
    num_threads=90, input_sentence_size=45000000, shuffle_input_sentence=True)
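
# Quick sanity check of the trained tokenizer. This is a sketch that assumes
# training has finished and written spiece.model to the working directory;
# the example sentence is only illustrative.
sp = spm.SentencePieceProcessor(model_file="spiece.model")
print("vocab size:", sp.get_piece_size())
print("pad/eos/unk ids:", sp.pad_id(), sp.eos_id(), sp.unk_id())
print("[NLU] id:", sp.piece_to_id("[NLU]"))
print(sp.encode("Protein kinase inhibitors were evaluated in clinical trials.", out_type=str))
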
# Add 100 extra tokens to the model
from seqio import SentencePieceVocabulary
import os
import tensorflow as tf
from sentencepiece import SentencePieceProcessor, sentencepiece_model_pb2

def add_100extra(vocab: SentencePieceVocabulary, out_dir: str):
    """Write the vocabulary's serialized SentencePiece model and a
    piece<TAB>score listing to out_dir."""
    tf.io.gfile.makedirs(out_dir)
    tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.model'), 'w').write(vocab.sp_model)

    model = sentencepiece_model_pb2.ModelProto.FromString(vocab.sp_model)
    tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.vocab'), 'w').write(
        '\n'.join(f'{p.piece}\t{p.score}' for p in model.pieces)
    )

# vocab = t5.data.get_default_vocabulary()
# out_dir = "../vocabulary/cc_all.32000.100extra"
#
# add_100extra(vocab, out_dir)
#
# vocab = seqio.SentencePieceVocabulary("../vocabulary/nedd.32000/spiece.model", extra_ids=100)
# out_dir = "../vocabulary/nedd.32000.100extra"
# add_100extra(vocab, out_dir)
#
# vocab = seqio.SentencePieceVocabulary("../vocabulary/nedd.32000/spiece.model", extra_ids=128)
# out_dir = "../vocabulary/nedd.32000.128extra"
# add_100extra(vocab, out_dir)
#
vocab = SentencePieceVocabulary("/Users/sdeshpande/Desktop/Challenges/patents/spiece_45.model", extra_ids=100)
out_dir = "conv"
add_100extra(vocab, out_dir)
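
# Rough check of the export. A sketch under the assumption that seqio's
# vocab_size reports the base SentencePiece pieces plus the 100 extra_ids,
# while the serialized model itself keeps only the base pieces.
print("seqio vocab size (incl. extra ids):", vocab.vocab_size)
exported = sentencepiece_model_pb2.ModelProto.FromString(
    tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.model'), 'rb').read())
print("pieces in exported model:", len(exported.pieces))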