import glob
import os
import unicodedata
from random import sample

import datasets
import nltk
from tqdm import tqdm


def sample_and_make_tempfile(sentences_dir, num_files):
    """Sample num_files of the sentence-per-line files in sentences_dir
    and write their combined contents to a single temp file."""

    sentence_files = glob.glob(sentences_dir + "/*.txt")
    sampled_files = sample(sentence_files, num_files)

    print("sampled files:")
    print(sampled_files)

    # Read every sampled file into memory, one sentence per element.
    all_lines = []
    for filename in sampled_files:
        with open(filename) as f:
            lines = f.read().splitlines()
        all_lines.extend(lines)

    print("number of lines sampled:", len(all_lines))

    tempfile_path = os.path.join("text", "temp.txt")
    with open(tempfile_path, "w") as f:
        for sentence in tqdm(all_lines):
            line = sentence.strip()
            if line:
                f.write(line + '\n')

    print("Wrote to", tempfile_path)
    return tempfile_path
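
# Example usage (a sketch, assuming sentence files already exist under text/sentences/
# and that only a subsample is wanted for quick tokenizer experiments):
# temp_path = sample_and_make_tempfile("text/sentences", num_files=2)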


def chunks(sentences, n, tot_len):
    """Yield successive n-sized chunks from sentences."""
    for i in range(0, tot_len, n):
        end_i = min(len(sentences), i + n)
        # Slicing a Hugging Face Dataset returns a dict of column lists,
        # so select the "text" column to get the raw strings.
        yield sentences[i:end_i]["text"]


def make_sentence_files(dataset, chunksize=5600000, data_dir='text/sentences'):
    """Write the dataset out as sentence-per-line files,
    chunksize dataset entries per file."""

    # Make sure the output directory exists.
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # Regexp-based sentence splitter that recognizes ASCII and CJK-style terminators.
    sent_detector = nltk.RegexpTokenizer(u'[^ !?。]*[!?。.\n]')

    for chunk_ind, sentence_chunk in enumerate(chunks(dataset, chunksize, len(dataset))):

        filename = "sent_{}.txt".format(chunk_ind)
        filepath = os.path.join(data_dir, filename)

        print("writing to ", filepath)

        with open(filepath, "w") as f:
            for sentence in tqdm(sentence_chunk):
                line = sentence.strip()
                # Normalize to NFKC so the tokenizer sees canonical unicode forms.
                line = unicodedata.normalize('NFKC', line)

                sentences = sent_detector.tokenize(line)
                if sentences != []:
                    f.writelines(s + '\n' for s in sentences)


def combine_files(output_file, *files):
    """
    Combines the contents of multiple text files into a single file.

    :param output_file: Path to the output file.
    :param files: Paths to the files to be combined.
    :return: Total number of lines in the combined file.
    """
    total_lines = 0

    with open(output_file, 'w') as outfile:
        for file in files:
            with open(file, 'r') as infile:
                lines = infile.readlines()
                total_lines += len(lines)
                outfile.writelines(lines)

            # Separate the contents of consecutive files with a newline.
            outfile.write('\n')

    return total_lines


# Download the biological dataset and split it into sentence-per-line files.
dataset_bio = datasets.load_dataset("Siddharth63/biological_dataset")
make_sentence_files(dataset_bio["train"])

# Combine the first two sentence files into the corpus used for tokenizer training.
output_file_path = "text/final_file.txt"
files_to_combine = glob.glob("text/sentences/*.txt")
files_to_combine = files_to_combine[:2]
total_lines = combine_files(output_file_path, *files_to_combine)

import sentencepiece as spm

# Train a 32k-vocabulary SentencePiece model on the combined corpus.
spm.SentencePieceTrainer.train(
    input="text/final_file.txt",
    model_prefix='spiece',
    vocab_size=32000,
    character_coverage=1.0,
    pad_id=0,
    unk_id=2,
    eos_id=1,
    bos_id=-1,
    user_defined_symbols=['[NLU]', '[NLG]', '[S2S]'],
    train_extremely_large_corpus=True,
    num_threads=90,
    input_sentence_size=45000000,
    shuffle_input_sentence=True,
)
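
# Optional sanity check (a minimal sketch, assuming training above produced
# ./spiece.model): reload the trained model and tokenize a sample sentence.
sp = spm.SentencePieceProcessor(model_file="spiece.model")
print(sp.get_piece_size())
print(sp.encode("The protein binds to the receptor.", out_type=str))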


from seqio import SentencePieceVocabulary
import tensorflow as tf
from sentencepiece import SentencePieceProcessor, sentencepiece_model_pb2


def add_100extra(vocab: SentencePieceVocabulary, out_dir: str):
    """Write the vocabulary (including its extra ids) out as spiece.model
    and spiece.vocab files under out_dir."""
    tf.io.gfile.makedirs(out_dir)

    # vocab.sp_model is the serialized model proto (bytes), so write in binary mode.
    with tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.model'), 'wb') as f:
        f.write(vocab.sp_model)

    # Also dump a human-readable piece/score listing.
    model = sentencepiece_model_pb2.ModelProto.FromString(vocab.sp_model)
    with tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.vocab'), 'w') as f:
        f.write('\n'.join(f'{p.piece}\t{p.score}' for p in model.pieces))


vocab = SentencePieceVocabulary("/Users/sdeshpande/Desktop/Challenges/patents/spiece_45.model", extra_ids=100)
out_dir = "conv"
add_100extra(vocab, out_dir)
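
# Optional check (a sketch): the converted model under conv/ should now report
# the original piece count plus the 100 extra ids appended by seqio.
sp_check = SentencePieceProcessor(model_file=os.path.join(out_dir, "spiece.model"))
print(sp_check.get_piece_size())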