import glob
import os
import unicodedata
from random import sample

import datasets
import nltk
from tqdm import tqdm


def sample_and_make_tempfile(sentences_dir, num_files):
    """Sample num_files of the sentence-per-line files in sentences_dir
    and write their combined contents to a single temp file."""

    sentence_files = glob.glob(sentences_dir + "/*.txt")
    sampled_files = sample(sentence_files, num_files)

    print("sampled files:")
    print(sampled_files)

    # Read every sampled file into memory, one sentence per element.
    all_lines = []
    for filename in sampled_files:
        with open(filename) as f:
            lines = f.read().splitlines()
        all_lines.extend(lines)

    print("number of lines sampled:", len(all_lines))

    tempfile_path = os.path.join("text", "temp.txt")
    with open(tempfile_path, "w") as f:
        for sentence in tqdm(all_lines):
            line = sentence.strip()
            if line:
                f.write(line + '\n')

    print("Wrote to", tempfile_path)
    return tempfile_path
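
# Example usage (a sketch, assuming sentence files already exist under text/sentences/
# and that only a subsample is wanted for quick tokenizer experiments):
# temp_path = sample_and_make_tempfile("text/sentences", num_files=2)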


def chunks(sentences, n, tot_len):
    """Yield successive n-sized chunks from sentences."""
    for i in range(0, tot_len, n):
        end_i = min(len(sentences), i + n)
        # Slicing a Hugging Face Dataset returns a dict of column lists,
        # so select the "text" column to get the raw strings.
        yield sentences[i:end_i]["text"]


def make_sentence_files(dataset, chunksize=5600000, data_dir='text/sentences'):
    """Write the dataset out as sentence-per-line files,
    chunksize dataset entries per file."""

    # Make sure the output directory exists.
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # Regexp-based sentence splitter that recognizes ASCII and CJK-style terminators.
    sent_detector = nltk.RegexpTokenizer(u'[^ !?。]*[!?。.\n]')

    for chunk_ind, sentence_chunk in enumerate(chunks(dataset, chunksize, len(dataset))):

        filename = "sent_{}.txt".format(chunk_ind)
        filepath = os.path.join(data_dir, filename)

        print("writing to ", filepath)

        with open(filepath, "w") as f:
            for sentence in tqdm(sentence_chunk):
                line = sentence.strip()
                # Normalize to NFKC so the tokenizer sees canonical unicode forms.
                line = unicodedata.normalize('NFKC', line)

                sentences = sent_detector.tokenize(line)
                if sentences != []:
                    f.writelines(s + '\n' for s in sentences)


def combine_files(output_file, *files):
    """
    Combines the contents of multiple text files into a single file.

    :param output_file: Path to the output file.
    :param files: Paths to the files to be combined.
    :return: Total number of lines in the combined file.
    """
    total_lines = 0

    with open(output_file, 'w') as outfile:
        for file in files:
            with open(file, 'r') as infile:
                lines = infile.readlines()
                total_lines += len(lines)
                outfile.writelines(lines)

            # Separate the contents of consecutive files with a newline.
            outfile.write('\n')

    return total_lines


# Download the biological dataset and split it into sentence-per-line files.
dataset_bio = datasets.load_dataset("Siddharth63/biological_dataset")
make_sentence_files(dataset_bio["train"])

# Combine the first two sentence files into the corpus used for tokenizer training.
output_file_path = "text/final_file.txt"
files_to_combine = glob.glob("text/sentences/*.txt")
files_to_combine = files_to_combine[:2]
total_lines = combine_files(output_file_path, *files_to_combine)

import sentencepiece as spm

# Train a 32k-vocabulary SentencePiece model on the combined corpus.
spm.SentencePieceTrainer.train(
    input="text/final_file.txt",
    model_prefix='spiece',
    vocab_size=32000,
    character_coverage=1.0,
    pad_id=0,
    unk_id=2,
    eos_id=1,
    bos_id=-1,
    user_defined_symbols=['[NLU]', '[NLG]', '[S2S]'],
    train_extremely_large_corpus=True,
    num_threads=90,
    input_sentence_size=45000000,
    shuffle_input_sentence=True,
)
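
# Optional sanity check (a minimal sketch, assuming training above produced
# ./spiece.model): reload the trained model and tokenize a sample sentence.
sp = spm.SentencePieceProcessor(model_file="spiece.model")
print(sp.get_piece_size())
print(sp.encode("The protein binds to the receptor.", out_type=str))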


from seqio import SentencePieceVocabulary
import tensorflow as tf
from sentencepiece import SentencePieceProcessor, sentencepiece_model_pb2


def add_100extra(vocab: SentencePieceVocabulary, out_dir: str):
    """Write the vocabulary (including its extra ids) out as spiece.model
    and spiece.vocab files under out_dir."""
    tf.io.gfile.makedirs(out_dir)

    # vocab.sp_model is the serialized model proto (bytes), so write in binary mode.
    with tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.model'), 'wb') as f:
        f.write(vocab.sp_model)

    # Also dump a human-readable piece/score listing.
    model = sentencepiece_model_pb2.ModelProto.FromString(vocab.sp_model)
    with tf.io.gfile.GFile(os.path.join(out_dir, 'spiece.vocab'), 'w') as f:
        f.write('\n'.join(f'{p.piece}\t{p.score}' for p in model.pieces))


vocab = SentencePieceVocabulary("/Users/sdeshpande/Desktop/Challenges/patents/spiece_45.model", extra_ids=100)
out_dir = "conv"
add_100extra(vocab, out_dir)
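
# Optional check (a sketch): the converted model under conv/ should now report
# the original piece count plus the 100 extra ids appended by seqio.
sp_check = SentencePieceProcessor(model_file=os.path.join(out_dir, "spiece.model"))
print(sp_check.get_piece_size())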