seonglae committed on Nov 22, 2023

Commit

a9082f6

1 Parent(s): edd7f9f

Training in progress, step 500

Browse files

Files changed (20) hide show

config.json +60 -0
embedding.py +29 -0
eval.py +84 -0
model.safetensors +3 -0
reader.py +36 -0
runs/Nov22_13-17-25_workspace-yt5idya013hw-0/events.out.tfevents.1700659046.workspace-yt5idya013hw-0.37370.0 +3 -0
runs/Nov22_13-18-19_workspace-yt5idya013hw-0/events.out.tfevents.1700659101.workspace-yt5idya013hw-0.38133.0 +3 -0
runs/Nov22_13-19-06_workspace-yt5idya013hw-0/events.out.tfevents.1700659147.workspace-yt5idya013hw-0.39864.0 +3 -0
runs/Nov22_13-20-04_workspace-yt5idya013hw-0/events.out.tfevents.1700659206.workspace-yt5idya013hw-0.42035.0 +3 -0
runs/Nov22_13-20-53_workspace-yt5idya013hw-0/events.out.tfevents.1700659254.workspace-yt5idya013hw-0.43999.0 +3 -0
runs/Nov22_13-21-31_workspace-yt5idya013hw-0/events.out.tfevents.1700659293.workspace-yt5idya013hw-0.45721.0 +3 -0
runs/Nov22_13-22-38_workspace-yt5idya013hw-0/events.out.tfevents.1700659361.workspace-yt5idya013hw-0.48336.0 +3 -0
runs/Nov22_13-34-19_workspace-yt5idya013hw-0/events.out.tfevents.1700660060.workspace-yt5idya013hw-0.67705.0 +3 -0
special_tokens_map.json +110 -0
summarizer.py +24 -0
tokenizer.json +0 -0
tokenizer_config.json +967 -0
train.py +65 -0
training_args.bin +3 -0
utils.py +57 -0

config.json ADDED Viewed

	@@ -0,0 +1,60 @@

+{
+  "_name_or_path": "google/pegasus-x-base",
+  "activation_dropout": 0.1,
+  "activation_function": "relu",
+  "add_bias_logits": false,
+  "add_final_layer_norm": true,
+  "architectures": [
+    "PegasusXForConditionalGeneration"
+  ],
+  "attention_dropout": 0.1,
+  "block_size": 512,
+  "bos_token_id": 0,
+  "classif_dropout": 0.0,
+  "classifier_dropout": 0.0,
+  "d_model": 768,
+  "decoder_attention_heads": 12,
+  "decoder_ffn_dim": 3072,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 12,
+  "decoder_start_token_id": 0,
+  "dropout": 0.1,
+  "encoder_attention_heads": 12,
+  "encoder_ffn_dim": 3072,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 12,
+  "eos_token_id": 1,
+  "extra_pos_embeddings": 1,
+  "force_bos_token_to_be_generated": false,
+  "forced_eos_token_id": 1,
+  "gradient_checkpointing": false,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2"
+  },
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2
+  },
+  "length_penalty": 0.8,
+  "max_length": 16384,
+  "max_position_embeddings": 16384,
+  "model_type": "pegasus_x",
+  "normalize_before": true,
+  "normalize_embedding": false,
+  "num_beams": 8,
+  "num_global_tokens": 128,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "scale_embedding": true,
+  "stagger_local_blocks": true,
+  "static_position_embeddings": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.35.2",
+  "use_cache": true,
+  "vocab_size": 96103
+}

embedding.py ADDED Viewed

	@@ -0,0 +1,29 @@

+from typing import List, Dict
+import torch
+from torch import Tensor
+import torch.nn.functional as F
+from transformers import AutoTokenizer, AutoModel
+def average_pool(last_hidden_states: Tensor,
+                 attention_mask: Tensor) -> Tensor:
+  last_hidden = last_hidden_states.masked_fill(
+      ~attention_mask[..., None].bool(), 0.0)
+  return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+def encode_hf(input_texts: List[str], model_id: str = 'thenlper/gte-small',
+              prefix: str = ''):
+  tokenizer = AutoTokenizer.from_pretrained(model_id)
+  model = AutoModel.from_pretrained(model_id).to('cuda')
+  input_texts = [prefix + input_text for input_text in input_texts]
+  # Tokenize the input texts
+  batch_dict = tokenizer(input_texts, padding=True,
+                         truncation=True, return_tensors='pt').to('cuda')
+  outputs = model(**batch_dict)
+  embeddings = average_pool(outputs.last_hidden_state,
+                            batch_dict['attention_mask'])
+  # normalize embeddings
+  embeddings = F.normalize(embeddings)
+  return embeddings

eval.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import re
+import string
+import unicodedata
+from evaluate import evaluator, QuestionAnsweringEvaluator
+from datasets import load_dataset
+def evaluate_dataset(id: str, subset: str, metric: str = 'squad_v2',
+                     question_col: str = 'question', context_col: str = 'retrieved', predict_col: str = 'predicted',
+                     id_col: str = 'question', label_col: str = 'answer', labeling: bool = True):
+  referee: QuestionAnsweringEvaluator = evaluator("question-answering")
+  referee.PIPELINE_KWARGS["handle_impossible_answer"] = True
+  # Dataset
+  dataset = load_dataset(id, subset)
+  dataset_list = list(dataset['train'])
+  metric_input, qa = referee.prepare_data(
+      dataset['train'], question_col, context_col, id_col, label_col)
+  # References
+  if labeling:
+    for i, reference in enumerate(metric_input['references']):
+      starts = [qa['context'][i].find(answer)
+                for answer in reference['answers']]
+      reference['answers'] = {
+          'answer_start': starts, 'text': reference['answers']}
+  # Prediction
+  metric_input['predictions'] = []
+  for row in dataset_list:
+    result = {
+        'prediction_text': row[predict_col], 'id': row[id_col]}
+    if metric == 'squad_v2':
+      result['no_answer_probability'] = 0.
+    metric_input['predictions'].append(result)
+  metric_module = referee.prepare_metric(metric)
+  results = referee.compute_metric(metric_module, metric_inputs=metric_input)
+  return results
+def evaluate_dataset_manual(id: str, subset: str):
+  dataset = load_dataset(id, subset)
+  dataset_list = list(dataset['train'])
+  for row in dataset_list:
+    row['score'] = max([regex_match_score(row['predicted'], answer)
+                       for answer in row['answer']])
+  score = sum([row['score'] for row in dataset_list]) / len(dataset_list)
+  return score
+def normalize_answer(s):
+  """Normalize answer."""
+  s = unicodedata.normalize("NFD", s)
+  def remove_articles(text):
+    return re.sub(r"\b(a|an|the)\b", " ", text)
+  def white_space_fix(text):
+    return " ".join(text.split())
+  def remove_punc(text):
+    exclude = set(string.punctuation)
+    return "".join(ch for ch in text if ch not in exclude)
+  def lower(text):
+    return text.lower()
+  return white_space_fix(remove_articles(remove_punc(lower(s))))
+def exact_match_score(prediction, ground_truth):
+  return normalize_answer(prediction) == normalize_answer(ground_truth)
+def regex_match_score(prediction, ground_truth):
+  try:
+    regex = re.compile(ground_truth,
+                       flags=re.IGNORECASE + re.UNICODE + re.MULTILINE)
+    return regex.match(prediction) is not None
+  except re.error:
+    return False

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3362987f673771c822ed03171d5e8ad806008c8f7057b4ed4ad893506208bd8b
+size 1089213696

reader.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from typing import TypedDict, List, Dict
+from re import sub
+import torch
+import numpy as np
+from transformers import AutoTokenizer, AutoModelForQuestionAnswering, DPRReaderTokenizer, DPRReader, logging
+from transformers import QuestionAnsweringPipeline
+# Value passed as max_answer_len to QuestionAnsweringPipeline in ask_reader.
+max_answer_len = 8
+# Set the transformers logger verbosity to error level at import time.
+logging.set_verbosity_error()
+class AnswerInfo(TypedDict):
+  """One answer record as returned by ask_reader."""
+  # NOTE(review): presumably the pipeline's confidence for the span — confirm.
+  score: float
+  # NOTE(review): presumably start/end offsets of the answer within the
+  # context — confirm against the pipeline documentation.
+  start: int
+  end: int
+  # Answer text; ask_reader strips the characters . ( ) " ' , from it.
+  answer: str
+@torch.inference_mode()
+def ask_reader(tokenizer: AutoTokenizer, model: AutoModelForQuestionAnswering,
+               questions: List[str], ctxs: List[str]) -> List[AnswerInfo]:
+  with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
+    pipeline = QuestionAnsweringPipeline(
+        model=model, tokenizer=tokenizer, device='cuda', max_answer_len=max_answer_len)
+    answer_infos: List[AnswerInfo] = pipeline(
+        question=questions, context=ctxs)
+  for answer_info in answer_infos:
+    answer_info['answer'] = sub(r'[.\(\)"\',]', '', answer_info['answer'])
+  return answer_infos
+def get_reader(model_id="mrm8488/longformer-base-4096-finetuned-squadv2"):
+  tokenizer = DPRReaderTokenizer.from_pretrained(model_id)
+  model = DPRReader.from_pretrained(model_id).to(0)
+  return tokenizer, model

runs/Nov22_13-17-25_workspace-yt5idya013hw-0/events.out.tfevents.1700659046.workspace-yt5idya013hw-0.37370.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15b5883324cca5bf9205d794de47d7dc248390a83a14caed3ef39369e01e11d5
+size 5134

runs/Nov22_13-18-19_workspace-yt5idya013hw-0/events.out.tfevents.1700659101.workspace-yt5idya013hw-0.38133.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6e9925734784a947ec34d7a401f25b29de941f4ec0f0a8d818519b55e0e7cb43
+size 5134

runs/Nov22_13-19-06_workspace-yt5idya013hw-0/events.out.tfevents.1700659147.workspace-yt5idya013hw-0.39864.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e00909093fc8ce8d5aebb7c0cc56c60bb9e102f191c85fe000b68ad68fff7b33
+size 5134

runs/Nov22_13-20-04_workspace-yt5idya013hw-0/events.out.tfevents.1700659206.workspace-yt5idya013hw-0.42035.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a7654a3da47dda91d122a6906099f39841bb090fb755d488709dbe4af6e42594
+size 5133

runs/Nov22_13-20-53_workspace-yt5idya013hw-0/events.out.tfevents.1700659254.workspace-yt5idya013hw-0.43999.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a282925fc0143388c39c4033a201f80618014ff219aaa55937728537b44048c4
+size 5132

runs/Nov22_13-21-31_workspace-yt5idya013hw-0/events.out.tfevents.1700659293.workspace-yt5idya013hw-0.45721.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:efee85fdb76a6a0dd25ae9f7f4aa292a3fd203f6b403ee80f1cb58e432acba15
+size 5132

runs/Nov22_13-22-38_workspace-yt5idya013hw-0/events.out.tfevents.1700659361.workspace-yt5idya013hw-0.48336.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:75c449e08bda4a6ae2fce4fbb586fb91d45ec3905e87451017be5b5c5b53a3c0
+size 5132

runs/Nov22_13-34-19_workspace-yt5idya013hw-0/events.out.tfevents.1700660060.workspace-yt5idya013hw-0.67705.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6adab62f08cee5a5217d202f30df47170a004d6364d8b8bb7e7c2b58a3ab9ee7
+size 5289

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,110 @@

+{
+  "additional_special_tokens": [
+    "<mask_1>",
+    "<unk_2>",
+    "<unk_3>",
+    "<unk_4>",
+    "<unk_5>",
+    "<unk_6>",
+    "<unk_7>",
+    "<unk_8>",
+    "<unk_9>",
+    "<unk_10>",
+    "<unk_11>",
+    "<unk_12>",
+    "<unk_13>",
+    "<unk_14>",
+    "<unk_15>",
+    "<unk_16>",
+    "<unk_17>",
+    "<unk_18>",
+    "<unk_19>",
+    "<unk_20>",
+    "<unk_21>",
+    "<unk_22>",
+    "<unk_23>",
+    "<unk_24>",
+    "<unk_25>",
+    "<unk_26>",
+    "<unk_27>",
+    "<unk_28>",
+    "<unk_29>",
+    "<unk_30>",
+    "<unk_31>",
+    "<unk_32>",
+    "<unk_33>",
+    "<unk_34>",
+    "<unk_35>",
+    "<unk_36>",
+    "<unk_37>",
+    "<unk_38>",
+    "<unk_39>",
+    "<unk_40>",
+    "<unk_41>",
+    "<unk_42>",
+    "<unk_43>",
+    "<unk_44>",
+    "<unk_45>",
+    "<unk_46>",
+    "<unk_47>",
+    "<unk_48>",
+    "<unk_49>",
+    "<unk_50>",
+    "<unk_51>",
+    "<unk_52>",
+    "<unk_53>",
+    "<unk_54>",
+    "<unk_55>",
+    "<unk_56>",
+    "<unk_57>",
+    "<unk_58>",
+    "<unk_59>",
+    "<unk_60>",
+    "<unk_61>",
+    "<unk_62>",
+    "<unk_63>",
+    "<unk_64>",
+    "<unk_65>",
+    "<unk_66>",
+    "<unk_67>",
+    "<unk_68>",
+    "<unk_69>",
+    "<unk_70>",
+    "<unk_71>",
+    "<unk_72>",
+    "<unk_73>",
+    "<unk_74>",
+    "<unk_75>",
+    "<unk_76>",
+    "<unk_77>",
+    "<unk_78>",
+    "<unk_79>",
+    "<unk_80>",
+    "<unk_81>",
+    "<unk_82>",
+    "<unk_83>",
+    "<unk_84>",
+    "<unk_85>",
+    "<unk_86>",
+    "<unk_87>",
+    "<unk_88>",
+    "<unk_89>",
+    "<unk_90>",
+    "<unk_91>",
+    "<unk_92>",
+    "<unk_93>",
+    "<unk_94>",
+    "<unk_95>",
+    "<unk_96>",
+    "<unk_97>",
+    "<unk_98>",
+    "<unk_99>",
+    "<unk_100>",
+    "<unk_101>",
+    "<unk_102>"
+  ],
+  "eos_token": "</s>",
+  "mask_token": "<mask_2>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

summarizer.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from typing import List, Tuple
+from transformers import AutoTokenizer, BartForConditionalGeneration, BartTokenizerFast
+import torch
+def summarize_text(tokenizer: BartTokenizerFast, model: BartForConditionalGeneration,
+                   input_texts: List[str]):
+  inputs = tokenizer(input_texts, padding=True,
+                     return_tensors='pt', truncation=True).to(1)
+  with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
+    summary_ids = model.generate(inputs["input_ids"])
+  summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True,
+                                     clean_up_tokenization_spaces=False, batch_size=len(input_texts))
+  return summaries
+def get_summarizer(model_id="ccdv/lsg-bart-base-4096-multinews") -> Tuple[BartTokenizerFast, BartForConditionalGeneration]:
+  tokenizer = BartTokenizerFast.from_pretrained(model_id)
+  model = BartForConditionalGeneration.from_pretrained(model_id).to(1)
+  model = torch.compile(model)
+  return tokenizer, model
+# OpenAI reader

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,967 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<mask_1>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<mask_2>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<unk_2>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<unk_3>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<unk_4>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<unk_5>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<unk_6>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<unk_7>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<unk_8>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<unk_9>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<unk_10>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<unk_11>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<unk_12>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<unk_13>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<unk_14>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<unk_15>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<unk_16>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "19": {
+      "content": "<unk_17>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "20": {
+      "content": "<unk_18>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "21": {
+      "content": "<unk_19>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "22": {
+      "content": "<unk_20>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "23": {
+      "content": "<unk_21>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "24": {
+      "content": "<unk_22>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "25": {
+      "content": "<unk_23>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "26": {
+      "content": "<unk_24>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "27": {
+      "content": "<unk_25>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "28": {
+      "content": "<unk_26>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "29": {
+      "content": "<unk_27>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "30": {
+      "content": "<unk_28>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "31": {
+      "content": "<unk_29>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32": {
+      "content": "<unk_30>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "33": {
+      "content": "<unk_31>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "34": {
+      "content": "<unk_32>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "35": {
+      "content": "<unk_33>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "36": {
+      "content": "<unk_34>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "37": {
+      "content": "<unk_35>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "38": {
+      "content": "<unk_36>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "39": {
+      "content": "<unk_37>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "40": {
+      "content": "<unk_38>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "41": {
+      "content": "<unk_39>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "42": {
+      "content": "<unk_40>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "43": {
+      "content": "<unk_41>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "44": {
+      "content": "<unk_42>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "45": {
+      "content": "<unk_43>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "46": {
+      "content": "<unk_44>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "47": {
+      "content": "<unk_45>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "48": {
+      "content": "<unk_46>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49": {
+      "content": "<unk_47>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50": {
+      "content": "<unk_48>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "51": {
+      "content": "<unk_49>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "52": {
+      "content": "<unk_50>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "53": {
+      "content": "<unk_51>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "54": {
+      "content": "<unk_52>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "55": {
+      "content": "<unk_53>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "56": {
+      "content": "<unk_54>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "57": {
+      "content": "<unk_55>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "58": {
+      "content": "<unk_56>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "59": {
+      "content": "<unk_57>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "60": {
+      "content": "<unk_58>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "61": {
+      "content": "<unk_59>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "62": {
+      "content": "<unk_60>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "63": {
+      "content": "<unk_61>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "64": {
+      "content": "<unk_62>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "65": {
+      "content": "<unk_63>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "66": {
+      "content": "<unk_64>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "67": {
+      "content": "<unk_65>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "68": {
+      "content": "<unk_66>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "69": {
+      "content": "<unk_67>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "70": {
+      "content": "<unk_68>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "71": {
+      "content": "<unk_69>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "72": {
+      "content": "<unk_70>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "73": {
+      "content": "<unk_71>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "74": {
+      "content": "<unk_72>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "75": {
+      "content": "<unk_73>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "76": {
+      "content": "<unk_74>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "77": {
+      "content": "<unk_75>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "78": {
+      "content": "<unk_76>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "79": {
+      "content": "<unk_77>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "80": {
+      "content": "<unk_78>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "81": {
+      "content": "<unk_79>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "82": {
+      "content": "<unk_80>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "83": {
+      "content": "<unk_81>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "84": {
+      "content": "<unk_82>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "85": {
+      "content": "<unk_83>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "86": {
+      "content": "<unk_84>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "87": {
+      "content": "<unk_85>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "88": {
+      "content": "<unk_86>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "89": {
+      "content": "<unk_87>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "90": {
+      "content": "<unk_88>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "91": {
+      "content": "<unk_89>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "92": {
+      "content": "<unk_90>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "93": {
+      "content": "<unk_91>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "94": {
+      "content": "<unk_92>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "95": {
+      "content": "<unk_93>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "96": {
+      "content": "<unk_94>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "97": {
+      "content": "<unk_95>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "98": {
+      "content": "<unk_96>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "99": {
+      "content": "<unk_97>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "<unk_98>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "<unk_99>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "<unk_100>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "<unk_101>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "104": {
+      "content": "<unk_102>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "105": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<mask_1>",
+    "<unk_2>",
+    "<unk_3>",
+    "<unk_4>",
+    "<unk_5>",
+    "<unk_6>",
+    "<unk_7>",
+    "<unk_8>",
+    "<unk_9>",
+    "<unk_10>",
+    "<unk_11>",
+    "<unk_12>",
+    "<unk_13>",
+    "<unk_14>",
+    "<unk_15>",
+    "<unk_16>",
+    "<unk_17>",
+    "<unk_18>",
+    "<unk_19>",
+    "<unk_20>",
+    "<unk_21>",
+    "<unk_22>",
+    "<unk_23>",
+    "<unk_24>",
+    "<unk_25>",
+    "<unk_26>",
+    "<unk_27>",
+    "<unk_28>",
+    "<unk_29>",
+    "<unk_30>",
+    "<unk_31>",
+    "<unk_32>",
+    "<unk_33>",
+    "<unk_34>",
+    "<unk_35>",
+    "<unk_36>",
+    "<unk_37>",
+    "<unk_38>",
+    "<unk_39>",
+    "<unk_40>",
+    "<unk_41>",
+    "<unk_42>",
+    "<unk_43>",
+    "<unk_44>",
+    "<unk_45>",
+    "<unk_46>",
+    "<unk_47>",
+    "<unk_48>",
+    "<unk_49>",
+    "<unk_50>",
+    "<unk_51>",
+    "<unk_52>",
+    "<unk_53>",
+    "<unk_54>",
+    "<unk_55>",
+    "<unk_56>",
+    "<unk_57>",
+    "<unk_58>",
+    "<unk_59>",
+    "<unk_60>",
+    "<unk_61>",
+    "<unk_62>",
+    "<unk_63>",
+    "<unk_64>",
+    "<unk_65>",
+    "<unk_66>",
+    "<unk_67>",
+    "<unk_68>",
+    "<unk_69>",
+    "<unk_70>",
+    "<unk_71>",
+    "<unk_72>",
+    "<unk_73>",
+    "<unk_74>",
+    "<unk_75>",
+    "<unk_76>",
+    "<unk_77>",
+    "<unk_78>",
+    "<unk_79>",
+    "<unk_80>",
+    "<unk_81>",
+    "<unk_82>",
+    "<unk_83>",
+    "<unk_84>",
+    "<unk_85>",
+    "<unk_86>",
+    "<unk_87>",
+    "<unk_88>",
+    "<unk_89>",
+    "<unk_90>",
+    "<unk_91>",
+    "<unk_92>",
+    "<unk_93>",
+    "<unk_94>",
+    "<unk_95>",
+    "<unk_96>",
+    "<unk_97>",
+    "<unk_98>",
+    "<unk_99>",
+    "<unk_100>",
+    "<unk_101>",
+    "<unk_102>"
+  ],
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "</s>",
+  "full_tokenizer_file": null,
+  "mask_token": "<mask_2>",
+  "mask_token_sent": "<mask_1>",
+  "model_max_length": 1024,
+  "offset": 103,
+  "pad_token": "<pad>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "PegasusTokenizer",
+  "unk_token": "<unk>"
+}

train.py ADDED Viewed

	@@ -0,0 +1,65 @@

+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
+import numpy as np
+import torch
+from huggingface_hub import login
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
def preprocesser(tokenizer):
  """Build a batched preprocessing function bound to ``tokenizer``.

  The returned callable takes a batch ``examples`` with parallel lists under
  ``question_text``, ``document_text`` and ``summarization_text``. Each model
  input is the question, a newline, then the document. The tokenized
  summaries are stored under the ``labels`` key of the returned encoding.
  """
  def preprocess_function(examples):
    # Pair every question with its document directly rather than by index.
    inputs = [f"{question}\n{document}" for question, document
              in zip(examples["question_text"], examples["document_text"])]
    model_inputs = tokenizer(inputs, truncation=True)
    labels = tokenizer(
        text_target=examples["summarization_text"], truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
  return preprocess_function
def training(output='resrer', dataset_id='seonglae/resrer-nq', checkpoint='google/pegasus-x-base',
             owner='seonglae', token=None):
  """Fine-tune a seq2seq checkpoint on a question/document summarization dataset.

  Args:
    output: Trainer output directory; also the repository name of the final push.
    dataset_id: Dataset identifier passed to ``load_dataset``; its ``train``
      split is loaded and divided 80/20 into train and test parts.
    checkpoint: Pretrained identifier used for both the tokenizer and the model.
    owner: Hub namespace for the final push (``{owner}/{output}``).
    token: Hub access token. When given it is used to log in before training
      and to push the tokenizer and model afterwards; when ``None`` both the
      login and the explicit push are skipped.
  """
  if token is not None:
    login(token=token)

  # Load model
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  # Load dataset
  dataset = load_dataset(dataset_id, split='train')
  splited_dataset = dataset.train_test_split(test_size=0.2)
  tokenized_dataset = splited_dataset.map(
      preprocesser(tokenizer), batched=True)
  print(tokenized_dataset["train"][0])

  # Train
  model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
  # The collator needs the model object itself, not the checkpoint name, so it
  # can use the model's helpers when building decoder inputs from the labels.
  data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
  training_args = Seq2SeqTrainingArguments(
      output_dir=output,
      evaluation_strategy="epoch",
      learning_rate=2e-5,
      per_device_train_batch_size=2,
      optim='adamw_hf',
      weight_decay=0.01,
      save_total_limit=3,
      num_train_epochs=4,
      push_to_hub=True,
  )
  trainer = Seq2SeqTrainer(
      model=model,
      args=training_args,
      train_dataset=tokenized_dataset["train"],
      eval_dataset=tokenized_dataset["test"],
      tokenizer=tokenizer,
      data_collator=data_collator,
  )
  trainer.train()

  # Push
  if token is not None:
    tokenizer.push_to_hub(f"{owner}/{output}", token=token)
    model.push_to_hub(f"{owner}/{output}", token=token)

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:685886fd200260152b29db695575ac8dd0381c487c29a959991bdb5afe90a216
+size 4728

utils.py ADDED Viewed

	@@ -0,0 +1,57 @@

+from typing import TypedDict, List
+from tiktoken import Encoding
class Row(TypedDict):
  """Document record with an identifier, a title, a URL and a text body."""
  id: str     # record identifier
  title: str  # title string
  url: str    # URL string
  text: str   # text content
def _starts_with_space(encoder, token):
  """Return True when the single ``token`` decodes to bytes beginning with a space."""
  return encoder.decode_single_token_bytes(token).startswith(b' ')


def split_token(encoder: Encoding, rows: List[Row], input_texts: List[str], split: int = 512) -> List[Row]:
  """Split each document into passages of roughly ``split`` tokens.

  Every text is tokenized with ``encoder.encode_batch``. Full windows of
  ``split`` tokens are taken from the start of the document, and one final
  window covers the last ``split`` tokens (it may overlap the window before
  it). Each window is widened so it does not cut a word: tokens are appended
  until the next token that starts with a space, and tokens are prepended up
  to and including the closest previous token that starts with a space.

  Args:
    encoder: Tokenizer providing ``encode_batch``, ``decode`` and
      ``decode_single_token_bytes``.
    rows: Source records, parallel to ``input_texts``.
    input_texts: Texts to split, one per row.
    split: Window size in tokens; must be at least 1.

  Returns:
    One record per passage. ``id`` is the row id followed by ``_`` and the
    passage index, ``title`` and ``url`` are copied from the row, and ``text``
    is the decoded passage. Documents that encode to no tokens yield nothing.

  Raises:
    ValueError: If ``split`` is smaller than 1.
  """
  if split < 1:
    raise ValueError('split must be a positive number of tokens')
  dict_list: List[Row] = []
  # Batch documents
  for row, token_ids in zip(rows, encoder.encode_batch(input_texts)):
    total = len(token_ids)
    if total == 0:
      # Nothing to decode; indexing into an empty token list would fail.
      continue
    passages_count = (total - 1) // split
    # Passages from start, followed by the passage from end
    windows = [(index * split, (index + 1) * split) for index in range(passages_count)]
    windows.append((max(total - split, 0), total))
    for index, (start, end) in enumerate(windows):
      # Append tokens until meet whitespace
      while end < total and not _starts_with_space(encoder, token_ids[end]):
        end += 1
      # Unshift tokens until meet whitespace (that token is included)
      while start > 0 and not _starts_with_space(encoder, token_ids[start]):
        start -= 1
      dict_list.append({'id': f"{row['id']}_{index}", 'title': row['title'], 'url': row['url'],
                        'text': encoder.decode(token_ids[start:end])})
  return dict_list