import random

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import numpy as np
import nltk
from tqdm import tqdm
from data_builder import load_data, save_data
from model import from_pretrained

class T5Paraphraser:
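    """Paraphrases sentences with a seq2seq T5 model (default: Vamsi/T5_Paraphrase_Paws)."""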
    def __init__(self, args):
        self.device = args.device
        self.tokenizer = from_pretrained(AutoTokenizer, args.t5_model_name, {}, args.cache_dir)
        self.model = from_pretrained(AutoModelForSeq2SeqLM, args.t5_model_name, {}, args.cache_dir)
        self.model = self.model.to(args.device)
        self.model.eval()

    def paraphrase(self, sents):
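        """Paraphrase a batch of sentences; fall back to the original sentence if decoding yields an empty string."""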
        parabatch = ["paraphrase: " + sent + " </s>" for sent in sents]
        encoding = self.tokenizer(parabatch, padding=True, return_tensors="pt")
        input_ids, attention_masks = encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device)
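        # Sample one paraphrase per input sentence with top-k / nucleus sampling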
        outputs = self.model.generate(
            input_ids=input_ids, attention_mask=attention_masks,
            max_length=256,
            do_sample=True,
            top_k=200,
            top_p=0.95,
            early_stopping=True,
            num_return_sequences=1
        )
        assert len(sents) == len(outputs)
        results = []
        for output, sent in zip(outputs, sents):
            line = self.tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            line = line.strip()
            line = line if len(line) > 0 else sent
            results.append(line)
        return results

class RandomParaphraser:
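    """Baseline paraphraser: swaps one random pair of adjacent words in sentences longer than 20 words."""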
    def __init__(self, args):
        self.device = args.device

    def paraphrase(self, sents):
        results = []
        for sent in sents:
            words = sent.split()
            if len(words) > 20:
                idx = random.randint(0, len(words) - 2)
                words[idx], words[idx+1] = words[idx+1], words[idx]
            results.append(' '.join(words))
        return results

def generate_data(args):
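    """Load a dataset, paraphrase each 'sampled' text sentence by sentence, and save it with the untouched originals."""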
    data = load_data(args.dataset_file)
    originals = data['original']
    samples = data['sampled']
    print(f"Total number of samples: {len(samples)}")
    print(f"Average number of words: {np.mean([len(x.split()) for x in samples])}")

    if args.paraphraser == 'random':
        print(f'Using random paraphraser.')
        paraphraser = RandomParaphraser(args)
    else:
        print(f'Loading model {args.t5_model_name}...')
        paraphraser = T5Paraphraser(args)

    new_samples = []
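    # Process each sample line by line so paragraph breaks and empty lines are preserved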
    for sample in tqdm(samples):
        lines = sample.split('\n')
        new_lines = []
        for line in lines:
            line = line.strip()
            if len(line) == 0:
                new_lines.append(line)
            else:
                sents = nltk.sent_tokenize(line)
                new_sents = paraphraser.paraphrase(sents)
                new_lines.append(' '.join(new_sents))
        new_samples.append('\n'.join(new_lines))

    new_data = {'original': originals, 'sampled': new_samples}
    save_data(args.output_file, args, new_data)


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_file', type=str, default="./exp_test/results/xsum_gpt2")
    parser.add_argument('--dataset', type=str, default="xsum")
    parser.add_argument('--dataset_file', type=str, default="./exp_test/data/xsum_gpt2")
    parser.add_argument('--t5_model_name', type=str, default="Vamsi/T5_Paraphrase_Paws")
    parser.add_argument('--paraphraser', type=str, default="t5", choices=["t5", "random"])
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--device', type=str, default="cuda")
    parser.add_argument('--cache_dir', type=str, default="../cache")
    args = parser.parse_args()

    # Seed all RNGs in use: random drives RandomParaphraser, torch drives sampling in generate
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    nltk.download('punkt')  # sentence tokenizer models used by nltk.sent_tokenize

    generate_data(args)