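"""Paraphrase the sampled texts in a detection dataset, sentence by sentence.

Two paraphrasers are provided: a T5-based model (default:
Vamsi/T5_Paraphrase_Paws) and a random baseline that swaps one adjacent
word pair in sentences longer than 20 words. The rewritten samples are
saved alongside the untouched originals for downstream comparison.
"""
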
import argparse
import random

import nltk
import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from data_builder import load_data, save_data
from model import from_pretrained


class T5Paraphraser:
    """Sentence-level paraphraser backed by a seq2seq T5 model."""

    def __init__(self, args):
        self.device = args.device
        self.tokenizer = from_pretrained(AutoTokenizer, args.t5_model_name, {}, args.cache_dir)
        self.model = from_pretrained(AutoModelForSeq2SeqLM, args.t5_model_name, {}, args.cache_dir)
        self.model = self.model.to(args.device)
        self.model.eval()

    def paraphrase(self, sents):
        # Prompt format follows the Vamsi/T5_Paraphrase_Paws model card.
        parabatch = ["paraphrase: " + sent + " </s>" for sent in sents]
        encoding = self.tokenizer(parabatch, padding=True, return_tensors="pt")
        input_ids = encoding["input_ids"].to(self.device)
        attention_masks = encoding["attention_mask"].to(self.device)
        outputs = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length=256,
            do_sample=True,
            top_k=200,
            top_p=0.95,
            early_stopping=True,  # only relevant for beam search; harmless with sampling
            num_return_sequences=1,
        )
        assert len(sents) == len(outputs)
        results = []
        for output, sent in zip(outputs, sents):
            line = self.tokenizer.decode(output, skip_special_tokens=True,
                                         clean_up_tokenization_spaces=True)
            line = line.strip()
            # Fall back to the original sentence if decoding yields nothing.
            results.append(line if len(line) > 0 else sent)
        return results


class RandomParaphraser:
    """Cheap baseline: swap one random adjacent word pair in long sentences."""

    def __init__(self, args):
        self.device = args.device

    def paraphrase(self, sents):
        results = []
        for sent in sents:
            words = sent.split()
            # Only perturb reasonably long sentences; short ones pass through unchanged.
            if len(words) > 20:
                idx = random.randint(0, len(words) - 2)
                words[idx], words[idx + 1] = words[idx + 1], words[idx]
            results.append(' '.join(words))
        return results
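

# Both paraphrasers expose the same paraphrase(sents) -> list[str] interface,
# so generate_data below can use them interchangeably, e.g.:
#   paraphraser = RandomParaphraser(args)
#   rewritten = paraphraser.paraphrase(["The quick brown fox jumps over the lazy dog."])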


def generate_data(args):
    data = load_data(args.dataset_file)
    originals = data['original']
    samples = data['sampled']
    print(f"Total number of samples: {len(samples)}")
    print(f"Average number of words: {np.mean([len(x.split()) for x in samples])}")

    if args.paraphraser == 'random':
        print('Using random paraphraser.')
        paraphraser = RandomParaphraser(args)
    else:
        print(f'Loading model {args.t5_model_name}...')
        paraphraser = T5Paraphraser(args)

    new_samples = []
    for sample in tqdm(samples):
        lines = sample.split('\n')
        new_lines = []
        for line in lines:
            line = line.strip()
            if len(line) == 0:
                new_lines.append(line)
            else:
                # Paraphrase sentence by sentence, then rejoin into one line.
                sents = nltk.sent_tokenize(line)
                new_sents = paraphraser.paraphrase(sents)
                new_lines.append(' '.join(new_sents))
        new_samples.append('\n'.join(new_lines))

    new_data = {'original': originals, 'sampled': new_samples}
    save_data(args.output_file, args, new_data)
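

# Example invocation (assuming this file is saved as paraphrasing.py; paths
# below are the script defaults, adjust to your setup):
#   python paraphrasing.py --dataset_file ./exp_test/data/xsum_gpt2 \
#       --output_file ./exp_test/results/xsum_gpt2 --paraphraser t5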
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_file', type=str, default="./exp_test/results/xsum_gpt2")
    parser.add_argument('--dataset', type=str, default="xsum")
    parser.add_argument('--dataset_file', type=str, default="./exp_test/data/xsum_gpt2")
    parser.add_argument('--t5_model_name', type=str, default="Vamsi/T5_Paraphrase_Paws")
    parser.add_argument('--paraphraser', type=str, default="t5", choices=["t5", "random"])
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--device', type=str, default="cuda")
    parser.add_argument('--cache_dir', type=str, default="../cache")
    args = parser.parse_args()

    # Seed every RNG in use: random drives RandomParaphraser, torch drives sampling.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    nltk.download('punkt')
    generate_data(args)