In [1]:
#!wget https://raw.githubusercontent.com/youyuge34/Poems_generator_Keras/master/dataset/poetry.txt
!git clone https://github.com/Werneror/Poetry.git

Cloning into 'Poetry'...
remote: Enumerating objects: 135, done.[K
remote: Total 135 (delta 0), reused 0 (delta 0), pack-reused 135[K
Receiving objects: 100% (135/135), 123.55 MiB | 12.33 MiB/s, done.
Resolving deltas: 100% (77/77), done.
Updating files: 100% (39/39), done.


In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import TrainingArguments, Trainer

In [3]:
data = None
for (dirpath, dirnames, filenames) in os.walk("Poetry"):
    for filename in filenames:
        if filename.endswith("csv"):
            cur_data = pd.read_csv(f"Poetry/{filename}")
            if data is None:
                data = cur_data
            else:
                data = pd.concat([data, cur_data])
data.head()

Unnamed: 0,题目,朝代,作者,内容
0,彭生行,明,何景明,岷峨山根江水坼，万里波涛混吴越。倾湖倒海不可量，仰看一线青天上。郁蓝秀色盘三巴，间产锦石兼丹...
1,黄河篇,明,何景明,黄河昆崙源，九曲与天通。银汉贯箕尾，左盘日月宫。奔流下龙门，喷薄沙海风。三山万里倚穷发，鳖极...
2,三清山人歌,明,何景明,山人佩剑冠远游，腰间鞶囊垂虎头，七星照耀金银钩。东行策杖指卢霍，逝将沧海寻丹丘。三清西南龙虎...
3,昔游篇,明,何景明,三星烂夜河汉流，觞行瑟作中堂幽。李君勿叹息，薛君且停讴。英英孟夫子，听我当筵歌昔游。昔游少年...
4,赠商三,明,何景明,去冬雪雨留蓟门，开筵谑浪倒金樽。今春灯火到长安，过门不肯回银鞍。燕山花隔平山柳，马上东风几回首。


In [4]:
import re

def verse_length(verses):
    return len(verses[0])

def verse_heads(verses):
    verse_heads = [verse[0] for verse in verses]
    return "".join(verse_heads)

def split_poem(poem):
    return [verse for verse in re.split("，|。", poem) if len(verse)]
    
def is_correct_length(poem, max_length, min_length):
    return len(poem) < max_length and len(poem) > min_length
    
def is_equal_length(verses):
    verse_lengths = [len(verse) for verse in verses]
    for length in verse_lengths:
        if length != verse_lengths[0]:
            return False
    return True    

In [5]:
data = data[~data["内容"].isna()]
data['verses'] = [split_poem(poem) for poem in data['内容']]
data['equal_verse_lengths'] = [is_equal_length(verses) for verses in data['verses']]
data['meet_length_requirements'] = [is_correct_length(poem, 100, 20) for poem in data['内容']]
valid_poems = data[data['equal_verse_lengths'] & data['meet_length_requirements']]
valid_poems['verse_lengths'] = [verse_length(verses) for verses in valid_poems['verses']]
valid_poems['verse_heads'] = [verse_heads(verses) for verses in valid_poems['verses']]
valid_poems = valid_poems[valid_poems['verse_lengths'] < 10]
print(f"Number of valid poems: {len(valid_poems)}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Number of valid poems: 617674


In [6]:
valid_poems.head()

Unnamed: 0,题目,朝代,作者,内容,verses,equal_verse_lengths,meet_length_requirements,verse_lengths,verse_heads
4,赠商三,明,何景明,去冬雪雨留蓟门，开筵谑浪倒金樽。今春灯火到长安，过门不肯回银鞍。燕山花隔平山柳，马上东风几回首。,"[去冬雪雨留蓟门, 开筵谑浪倒金樽, 今春灯火到长安, 过门不肯回银鞍, 燕山花隔平山柳, ...",True,True,7,去开今过燕马
14,送叶生还闽中兼怀郑继之,明,何景明,叶生行吟燕中市，葛巾麻鞋岁将晚。两都为客今始归，五岳寻仙不辞远。江南画舸春柳低，海上茅堂白云...,"[叶生行吟燕中市, 葛巾麻鞋岁将晚, 两都为客今始归, 五岳寻仙不辞远, 江南画舸春柳低, ...",True,True,7,叶葛两五江海谷为
15,送林利正同知之潮阳,明,何景明,忆在成均共携手，泉山门下相知久。万里恩情若父兄，十年道义惭师友。君才岂孤一第名，佩刀今作岭南...,"[忆在成均共携手, 泉山门下相知久, 万里恩情若父兄, 十年道义惭师友, 君才岂孤一第名, ...",True,True,7,忆泉万十君佩挂伐燕相过道
16,金陵歌送李先生,明,何景明,李公为舅有吕甥，甥舅四海皆知名。吕君关西昨日去，公自金陵来复行。金陵江水无断绝，金陵之山高巀...,"[李公为舅有吕甥, 甥舅四海皆知名, 吕君关西昨日去, 公自金陵来复行, 金陵江水无断绝, ...",True,True,7,李甥吕公金金龙星白清燕
21,延津歌送韩令,明,何景明,延津寇过馀少男，延津县令莫停骖。双凫直向黄河北，一雁先飞清卫南。黄河岸边不种麦，浊浪滔天多贾...,"[延津寇过馀少男, 延津县令莫停骖, 双凫直向黄河北, 一雁先飞清卫南, 黄河岸边不种麦, ...",True,True,7,延延双一黄浊城县


In [7]:
import torch, json

class CharTokenizer:
    def __init__(self, corpus=None, vocab=None):
        if vocab is not None:
            self.vocab = vocab
        elif corpus is not None:
            self.vocab = self._build_vocab(corpus)
        else:
            raise Exception("Either corpus or vocab has to be supplied")
        self.id2vocab = [char for char, index in sorted(self.vocab.items(), key=lambda item: item[1])]
        
    def _tokenize(self, text):
        return list(text)
        
    def __call__(self, prompt, text=None, add_eos_token=False):
        token_ids = [self.vocab.get(token, 0) for token in self._tokenize(prompt)]
        if text is not None:
            text_token_ids = [self.vocab.get(token, 0) for token in self._tokenize(text)]
            token_ids = token_ids + [self.vocab["<bos>"]] + text_token_ids
        if add_eos_token:
            token_ids = token_ids + [self.vocab["<eos>"]]
        input_ids_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)
        attention_masks = torch.ones_like(input_ids_tensor)
        return {"input_ids": input_ids_tensor, "attention_mask": attention_masks}
        
    def _build_vocab(self, corpus):
        vocab = {"<pad>": 0}
        for verse_lengths in range(3, 10):
            vocab[str(verse_lengths)] = len(vocab)
        for doc in corpus:
            chars = self._tokenize(doc)
            for char in chars:
                if char not in vocab:
                    vocab[char] = len(vocab)
        vocab["<bos>"] = len(vocab)
        vocab["<eos>"] = len(vocab)
        return vocab
    
    def decode(self, token_ids):
        chars = [self.id2vocab[token_id] for token_id in token_ids.flatten().tolist()]
        filtered_chars = [char for char in chars if char not in ["<eos>", "<bos>", "<pad>"]]
        return "".join(filtered_chars)
    
    def save(self, filepath):
        with open(filepath, "w") as f:
            json.dump(self.vocab, f)
    
    @classmethod
    def load(cls, filepath):
        with open(filepath) as f:
            vocab = json.load(f)
        return cls(vocab=vocab)

In [8]:
tokenizer = CharTokenizer(valid_poems['内容'])
tokenizer.save("/kaggle/working/tokenizer.json")

In [9]:
tokenized_dataset = [tokenizer(prompt = str(length) + heads, text=poem, add_eos_token=True) for poem, length, heads in zip(valid_poems['内容'],
                                                                                                             valid_poems['verse_lengths'],
                                                                                                             valid_poems['verse_heads'])]
train_dataset, val_dataset = train_test_split(tokenized_dataset, test_size=0.02, random_state=1234)
max_lengths = max([tokenized["input_ids"].size(1) for tokenized in tokenized_dataset])
print(max_lengths)

123


In [10]:
PAD_TOKEN_ID = 0

def collate_fn(batch_inputs):
    seq_lengths = [i["input_ids"].size(1) for i in batch_inputs]
    max_length = max(seq_lengths)
    input_ids = torch.full((len(batch_inputs), max_length), PAD_TOKEN_ID, dtype=torch.long)
    attention_mask = torch.full((len(batch_inputs), max_length), 0, dtype=torch.long)
    for idx, inputs in enumerate(batch_inputs):
        input_ids[idx, :seq_lengths[idx]] = inputs["input_ids"]
        attention_mask[idx, :seq_lengths[idx]] = 1
    labels = input_ids.clone()
    labels[labels == PAD_TOKEN_ID] = -100
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

In [11]:
config = GPT2Config(vocab_size = len(tokenizer.vocab),
                    n_positions = max_lengths,
                    n_embd = 768,
                    n_layer = 6,
                    n_head = 12,
                    eos_token_id=tokenizer.vocab["<eos>"],
                    bos_token_id=tokenizer.vocab["<bos>"])
model = GPT2LMHeadModel(config)
num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {num_parameters}")

Number of trainable parameters: 50873088


In [12]:
from transformers import EarlyStoppingCallback
training_args = TrainingArguments(
    output_dir="results",
    eval_steps=2000,
    save_steps=2000,
    evaluation_strategy="steps",
    learning_rate=3e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    save_total_limit=2,
    num_train_epochs=8,
    fp16=True,
    report_to="none",
    dataloader_num_workers=2,
    group_by_length=True,
    metric_for_best_model = 'loss',
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=1)]
)

Using amp half precision backend


In [13]:
# n_embd = 768, n_layer = 12, n_head = 12, 58k steps, 93.4 M parameters, train loss 3.150600, val loss 3.163932
# n_embd = 768, n_layer = 6, n_head = 12,  steps, 50.9 M parameters, train loss , val loss 
# n_embd = 256, n_layer = 4, n_head = 8, steps, 5.94M parameters, train loss 3.374200, val loss 3.339147
# n_embd = 128, n_layer = 2, n_head = 4, 54k steps, 1.78M parameters, train loss 3.819500, val loss 3.694196
trainer.train()

***** Running training *****
  Num examples = 605320
  Num Epochs = 8
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 151336


Step,Training Loss,Validation Loss
2000,4.3677,4.235631
4000,3.9533,3.883913
6000,3.7907,3.730361
8000,3.6995,3.639758
10000,3.6265,3.58157
12000,3.5758,3.529477
14000,3.5395,3.490788
16000,3.5061,3.457211
18000,3.4711,3.42791
20000,3.4117,3.404946


***** Running Evaluation *****
  Num examples = 12354
  Batch size = 64
Saving model checkpoint to results/checkpoint-2000
Configuration saved in results/checkpoint-2000/config.json
Model weights saved in results/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 12354
  Batch size = 64
Saving model checkpoint to results/checkpoint-4000
Configuration saved in results/checkpoint-4000/config.json
Model weights saved in results/checkpoint-4000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 12354
  Batch size = 64
Saving model checkpoint to results/checkpoint-6000
Configuration saved in results/checkpoint-6000/config.json
Model weights saved in results/checkpoint-6000/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-2000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 12354
  Batch size = 64
Saving model checkpoint to results/checkpoint-8000
Configuration saved in results/checkpoint-8000/config.j

TrainOutput(global_step=58000, training_loss=3.448922660038389, metrics={'train_runtime': 13970.1599, 'train_samples_per_second': 346.636, 'train_steps_per_second': 10.833, 'total_flos': 5.124009885990912e+16, 'train_loss': 3.448922660038389, 'epoch': 3.07})

In [14]:
def generation(prompt, length):
    tokens = tokenizer(prompt=str(length) + prompt)
    output_ids = model.generate(tokens['input_ids'].to("cuda"),
                                   do_sample=True, 
                                   top_k=50,
                                   top_p=0.95,
                                   max_length=100)
    decoded_verse = tokenizer.decode(output_ids)[5:]
    return decoded_verse

In [15]:
generation("花好月圆", length=5)

Setting `pad_token_id` to `eos_token_id`:10741 for open-end generation.


'花明水在溪，好在波上得。月光忽在溪，圆明了不蚀。'

In [16]:
generation("下楼吃饭", length=7)

Setting `pad_token_id` to `eos_token_id`:10741 for open-end generation.


'下山来访小园中，楼阁清幽景物同。吃吃僧斋分数宿，饭松茶灶有馀功。'

In [17]:
generation("今晚加班", length=7)

Setting `pad_token_id` to `eos_token_id`:10741 for open-end generation.


'大深无坐今夕分明是别年，晚陪花下醉清眠。加餐我自能高咏，班列君应似谪仙。大地星河连太皞，深宵星斗下华躔。无言独向閒庭静，坐对西南又一天。'

In [18]:
generation("加班内卷", length=7)

Setting `pad_token_id` to `eos_token_id`:10741 for open-end generation.


'加餐未暇望天颜，班列群仙戏綵幡。内史赐花频赐宴，卷帘先为看朝元。'

In [19]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session