Commit 414714e · Ivan Tan committed · Parent: 1e70b5c

Init repo for TIABotV2

.gitattributes CHANGED
@@ -32,3 +32,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ model-sagemaker-5epochs/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
+ train.csv filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
- title: TIABotV2
- emoji: 🐒
- colorFrom: purple
- colorTo: purple
+ title: TIABot
+ emoji: 🚀
+ colorFrom: green
+ colorTo: indigo
  sdk: gradio
- sdk_version: 3.9
+ sdk_version: 3.1.1
  app_file: app.py
  pinned: false
  license: mit
app.py ADDED
@@ -0,0 +1,253 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+
+ # In[10]:
+
+ import pandas as pd
+ import os
+ import torch
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
+ from transformers.optimization import Adafactor
+ import time
+ import warnings
+ import random
+ warnings.filterwarnings('ignore')
+
+ import re
+
+ def strip_html(text):
+     # Also strips model special tokens such as <pad> and </s>,
+     # which match the same tag pattern.
+     return re.sub('<[^<]+?>', '', text)
+
+
+ # In[5]:
+
+ train_columns = ['round_amount', 'round_date', 'stage', 'investee',
+                  'investee_description', 'investee_country', 'investee_region',
+                  'investee_subregion', 'investee_vertical', 'investee_industry',
+                  'investor_list', 'previous_investors', 'prior_funding']
+ train = pd.read_csv("train.csv")
+
+
+ # In[6]:
+
+ train.publication_timestamp = pd.to_datetime(train.publication_timestamp)
+
+
+ # In[7]:
+
+ input_text = train[train_columns].to_dict(orient='records')
+ train_df = train[['title']].rename(columns={'title': 'target_text'})
+ train_df['input_text'] = input_text
+ train_df['prefix'] = 'tia'
+ train_df.input_text = train_df.input_text.astype(str)
+
+
+ # In[8]:
+
+ if torch.cuda.is_available():
+     dev = torch.device("cuda:0")
+     print("Running on the GPU")
+ else:
+     dev = torch.device("cpu")
+     print("Running on the CPU")
+
+
+ # In[ ]:
+
+ tokenizer = T5Tokenizer.from_pretrained('google/t5-v1_1-base')
+ model = T5ForConditionalGeneration.from_pretrained('model-sagemaker-5epochs/', local_files_only=True)
+ # Moving the model to the device (GPU/CPU)
+ model.to(dev)
+
+
+ # In[12]:
+
+ vi_table = train[['investee_industry', 'investee_vertical']].drop_duplicates()
+
+
+ # In[13]:
+
+ def update_industry(value):
+     verticals = list(vi_table[vi_table['investee_industry'] == value]['investee_vertical'].values)
+     return verticals[0]
+
+ def update_vertical(value):
+     industries = list(vi_table[vi_table['investee_vertical'] == value]['investee_industry'].values)
+     return industries[0]
+
+
+ # In[ ]:
+
+ update_industry('Green')
+
+
+ # In[ ]:
+
+ update_vertical('Clean tech')
+
+
+ # In[ ]:
+
+ import gradio as gr
+
+
+ # In[ ]:
+
+ num_return_sequences = 5
+
+
+ # In[ ]:
+
+ def generate_headline(stage, investee_country, investee_subregion, investee_region,
+                       investee_vertical, investee_industry,
+                       round_amount, investee, investee_description, investor_list,
+                       previous_investors, other_values):
+
+     full_df = other_values.set_index("key").T
+
+     full_df['stage'] = stage
+     full_df['investee_country'] = investee_country
+     full_df['investee_subregion'] = investee_subregion
+     full_df['investee_region'] = investee_region
+     full_df['investee_vertical'] = investee_vertical
+     full_df['investee_industry'] = investee_industry
+     full_df['round_amount'] = str(float(round_amount))
+     full_df['investee'] = investee
+     full_df['investee_description'] = investee_description
+     full_df['investor_list'] = investor_list
+     full_df['previous_investors'] = previous_investors
+
+     random_set = full_df[['round_amount', 'round_date', 'stage', 'investee',
+                           'investee_description', 'investee_country', 'investee_region',
+                           'investee_subregion', 'investee_vertical', 'investee_industry',
+                           'investor_list', 'previous_investors', 'prior_funding']].to_json(orient="records")
+
+     input_ids = tokenizer.encode(f"tia: {{{random_set}}}", return_tensors="pt")  # Batch size 1
+     input_ids = input_ids.to(dev)
+     text_outputs = model.generate(inputs=input_ids, do_sample=True,
+                                   num_beams=2,
+                                   num_return_sequences=num_return_sequences,
+                                   repetition_penalty=5.0)
+     outputs = [strip_html(tokenizer.decode(o)) for o in text_outputs]
+     return "\n".join(outputs)
+
+
+ # In[ ]:
+
+ other_columns = ['round_date', 'prior_funding']
+
+
+ # In[ ]:
+
+ train.sample(1)[other_columns].T.reset_index().values
+
+
+ # In[ ]:
+
+ print(train.query("investee == 'NOSH'")['title'].head(1).T)
+ train.query("investee == 'NOSH'")[train_columns].head(1).T
+
+
+ # In[ ]:
+
+ fake_data = {
+     "round_amount": 1000000.0,
+     "round_date": "2018-09-26",
+     "stage": "Pre-series A",
+     "investee": "NOSH",
+     "investee_description": "NOSH makes and delivers ready-to-eat meals in Hong Kong.",
+     "investee_country": "Hong Kong",
+     "investee_region": "Asia",
+     "investee_subregion": "Eastern Asia",
+     "investee_vertical": "Food tech",
+     "investee_industry": "Restaurants & Food",
+     "investor_list": ["Alibaba Entrepreneurs Fund (阿里巴巴创业者基金)"],
+     "previous_investors": "",
+     "prior_funding": 1000000.0
+ }
+
+
+ # In[ ]:
+
+ pd.DataFrame([fake_data]).T
+
+
+ # In[ ]:
+
+ demo = gr.Blocks()
+
+ random_sample = train[train_columns].sample(1)
+ random_sample = pd.DataFrame([fake_data])  # overrides the random sample with the fixed example
+
+ stage = gr.Dropdown(label="stage", choices=list(train[train_columns].stage.unique()))
+ investee_country = gr.Dropdown(label="investee_country", choices=list(train[train_columns].investee_country.unique()),
+                                value=random_sample.investee_country.values[0])
+ investee_subregion = gr.Dropdown(label="investee_subregion", choices=list(train[train_columns].investee_subregion.unique()),
+                                  value=random_sample.investee_subregion.values[0])
+ investee_region = gr.Dropdown(label="investee_region", choices=list(train[train_columns].investee_region.unique()),
+                               value=random_sample.investee_region.values[0])
+ investee_vertical = gr.Dropdown(label="investee_vertical", choices=list(train[train_columns].investee_vertical.unique()),
+                                 value=random_sample.investee_vertical.values[0])
+ investee_industry = gr.Dropdown(label="investee_industry", choices=list(train[train_columns].investee_industry.unique()),
+                                 value=random_sample.investee_industry.values[0])
+
+ if pd.isnull(random_sample.round_amount.values[0]):
+     rand_amount = 0
+ else:
+     rand_amount = random_sample.round_amount.values[0]
+
+ round_amount = gr.Slider(label="round_amount", minimum=100000, maximum=200000000,
+                          value=rand_amount,
+                          step=100000)
+
+ investee = gr.Textbox(label="investee", value=random_sample.investee.values[0])
+ investee_description = gr.Textbox(label="investee_description",
+                                   value=random_sample.investee_description.values[0])
+ investor_list = gr.Textbox(label="investor_list",
+                            value=random_sample.investor_list.values[0])
+ previous_investors = gr.Textbox(label="previous_investors",
+                                 value=random_sample.previous_investors.values[0])
+ other_values = gr.Dataframe(
+     headers=['key', 'value'],
+     value=[['round_date', random_sample.round_date.values[0]],
+            ['prior_funding', random_sample.prior_funding.values[0]]]
+ )
+ out = gr.Textbox(max_lines=num_return_sequences)
+
+ with demo:
+     gr.Markdown("Enter funding data to generate news headline.")
+
+     inputs = [stage, investee_country, investee_subregion, investee_region,
+               investee_vertical, investee_industry,
+               round_amount, investee, investee_description, investor_list,
+               previous_investors, other_values]
+
+     investee_industry.change(fn=update_industry, inputs=investee_industry, outputs=investee_vertical)
+     investee_vertical.change(fn=update_vertical, inputs=investee_vertical, outputs=investee_industry)
+     gr.Interface(fn=generate_headline, inputs=inputs, outputs=out,
+                  description="Enter funding data to generate news headline.",
+                  live=True)
+
+ demo.launch()
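
Note: the prompt generate_headline builds is the training prefix tia: wrapped around a JSON record of the funding fields. Below is a minimal sketch of the same inference path without the Gradio UI; the record values are invented for illustration, not taken from train.csv.

# Minimal sketch of app.py's inference path, outside Gradio.
# Uses the committed weights in model-sagemaker-5epochs/; the record
# below is an invented example.
import json
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained('google/t5-v1_1-base')
model = T5ForConditionalGeneration.from_pretrained('model-sagemaker-5epochs/', local_files_only=True).eval()

record = [{"round_amount": "1000000.0", "stage": "Pre-series A",
           "investee": "NOSH", "investee_country": "Hong Kong"}]
prompt = f"tia: {{{json.dumps(record)}}}"  # same "tia: {...}" shape as generate_headline

input_ids = tokenizer.encode(prompt, return_tensors="pt")
with torch.no_grad():
    outputs = model.generate(inputs=input_ids, do_sample=True, num_beams=2,
                             num_return_sequences=5, repetition_penalty=5.0)
for o in outputs:
    print(tokenizer.decode(o, skip_special_tokens=True))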
model-sagemaker-5epochs/code/.ipynb_checkpoints/inference-checkpoint.py ADDED
@@ -0,0 +1,50 @@
+ # This is the script that will be used in the inference container
+ import json
+ import torch
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ def model_fn(model_dir):
+     """
+     Load the model and tokenizer for inference
+     """
+     tokenizer = AutoTokenizer.from_pretrained(model_dir)
+     model = AutoModelForSeq2SeqLM.from_pretrained(model_dir).to(device).eval()
+
+     return {"model": model, "tokenizer": tokenizer}
+
+
+ def predict_fn(input_data, model_dict):
+     """
+     Make a prediction with the model
+     """
+     text = input_data.pop("inputs")
+     parameters_list = input_data.pop("parameters_list", None)
+
+     tokenizer = model_dict["tokenizer"]
+     model = model_dict["model"]
+
+     # Parameters may or may not be passed
+     input_ids = tokenizer(
+         text, truncation=True, padding="longest", return_tensors="pt"
+     ).input_ids.to(device)
+
+     if parameters_list:
+         predictions = []
+         for parameters in parameters_list:
+             output = model.generate(input_ids, **parameters)
+             predictions.append(tokenizer.batch_decode(output, skip_special_tokens=True))
+     else:
+         output = model.generate(input_ids)
+         predictions = tokenizer.batch_decode(output, skip_special_tokens=True)
+
+     return predictions
+
+
+ def input_fn(request_body, request_content_type):
+     """
+     Transform the input request to a dictionary
+     """
+     return json.loads(request_body)
model-sagemaker-5epochs/code/.ipynb_checkpoints/requirements-checkpoint.txt ADDED
@@ -0,0 +1,2 @@
+ nltk
+ rouge_score
model-sagemaker-5epochs/code/.ipynb_checkpoints/train-checkpoint.py ADDED
@@ -0,0 +1,215 @@
+ # This is the script that will be used in the training container
+ import argparse
+ import logging
+ import os
+ import sys
+
+ import numpy as np
+ import nltk
+
+ try:
+     nltk.data.find("tokenizers/punkt")
+ except LookupError as e:
+     print(e)
+     try:
+         nltk.download("punkt")
+     except FileExistsError as e:
+         print(e)
+         pass
+
+ from nltk import sent_tokenize
+
+ from datasets import load_metric, load_from_disk
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+ logger.addHandler(logging.StreamHandler(sys.stdout))
+
+
+ def tokenize(batch, text_column, target_column, max_source, max_target):
+     tokenized_input = tokenizer(
+         batch[text_column], padding="max_length", truncation=True, max_length=max_source
+     )
+     tokenized_target = tokenizer(
+         batch[target_column],
+         padding="max_length",
+         truncation=True,
+         max_length=max_target,
+     )
+
+     tokenized_input["labels"] = tokenized_target["input_ids"]
+
+     return tokenized_input
+
+
+ def load_and_tokenize_dataset(
+     data_dir, split, text_column, target_column, max_source, max_target
+ ):
+     dataset = load_from_disk(os.path.join(data_dir, split))
+     tokenized_dataset = dataset.map(
+         lambda x: tokenize(x, text_column, target_column, max_source, max_target),
+         batched=True,
+         batch_size=512,
+     )
+     tokenized_dataset.set_format(
+         "numpy", columns=["input_ids", "attention_mask", "labels"]
+     )
+
+     return tokenized_dataset
+
+
+ def compute_metrics(eval_pred):
+     # Currently unused: the compute_metrics argument is commented out
+     # in the Trainer below.
+     metric = load_metric('glue', 'mrpc')
+     predictions, references = eval_pred
+     return metric.compute(predictions=predictions, references=references)
+
+ # metric = load_metric("rouge")
+ # predictions, labels = eval_pred
+ # decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+
+ # # Replace -100 in the labels as we can't decode them.
+ # labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+ # decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+ # # Rouge expects a newline after each sentence
+ # decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
+ #                  for pred in decoded_preds]
+ # decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
+ #                   for label in decoded_labels]
+
+ # # Compute ROUGE scores
+ # logger.info("Decoded preds: %s" % decoded_preds)
+ # logger.info("Decoded labels: %s" % decoded_labels)
+ # result = metric.compute(predictions=decoded_preds, references=decoded_labels,
+ #                         use_stemmer=True)
+
+ # # Extract ROUGE f1 scores
+ # result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
+
+ # # Add mean generated length to metrics
+ # prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
+ #                    for pred in predictions]
+ # result["gen_len"] = np.mean(prediction_lens)
+
+ # return {k: round(v, 4) for k, v in result.items()}
+
+
+ def train(args):
+     from transformers import T5ForConditionalGeneration, T5Tokenizer
+
+     logger.info("Loading tokenizer...\n")
+     global tokenizer
+     global model_name
+     model_name = args.model_name
+
+     logger.info("Loading pretrained model\n")
+     if "google" in model_name:
+         model = T5ForConditionalGeneration.from_pretrained(model_name)
+         tokenizer = T5Tokenizer.from_pretrained(model_name)
+     else:
+         model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+         tokenizer = AutoTokenizer.from_pretrained(model_name)
+     logger.info("Pretrained model loaded\n")
+
+     logger.info("Fetching and tokenizing data for training")
+     train_dataset = load_and_tokenize_dataset(
+         args.train_data_dir,
+         "train",
+         args.text_column,
+         args.target_column,
+         args.max_source,
+         args.max_target,
+     )
+
+     logger.info("Tokenized training data loaded")
+
+     eval_dataset = load_and_tokenize_dataset(
+         args.train_data_dir,
+         "validation",
+         args.text_column,
+         args.target_column,
+         args.max_source,
+         args.max_target,
+     )
+     test_dataset = load_and_tokenize_dataset(
+         args.train_data_dir,
+         "test",
+         args.text_column,
+         args.target_column,
+         args.max_source,
+         args.max_target,
+     )
+
+     logger.info("Defining training arguments\n")
+     training_args = Seq2SeqTrainingArguments(
+         output_dir=args.model_dir,
+         num_train_epochs=args.epoch,
+         per_device_train_batch_size=args.train_batch_size,
+         per_device_eval_batch_size=args.eval_batch_size,
+         learning_rate=args.lr,
+         warmup_steps=args.warmup_steps,
+         weight_decay=args.weight_decay,
+         logging_dir=args.log_dir,
+         logging_strategy=args.logging_strategy,
+         load_best_model_at_end=True,
+         adafactor=True,
+         do_train=True,
+         do_eval=True,
+         do_predict=True,
+         save_total_limit=3,
+         evaluation_strategy="epoch",
+         save_strategy="epoch",
+         predict_with_generate=True,
+         metric_for_best_model="eval_loss",
+         seed=7,
+     )
+
+     logger.info("Defining seq2seq Trainer")
+     trainer = Seq2SeqTrainer(
+         model=model,
+         args=training_args,
+         train_dataset=train_dataset,
+         eval_dataset=eval_dataset,
+         tokenizer=tokenizer,
+         # compute_metrics=compute_metrics,
+     )
+
+     logger.info("Starting Training")
+     trainer.train()
+     logger.info("Model trained successfully")
+     trainer.save_model()
+     logger.info("Model saved successfully")
+
+     # Evaluation
+     logger.info("*** Evaluate on test set ***")
+
+     logger.info(trainer.predict(test_dataset))
+
+     logger.info("Removing unused checkpoints to save space in container")
+     os.system(f"rm -rf {args.model_dir}/checkpoint-*/")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model-name", type=str, default="google/pegasus-xsum")
+     parser.add_argument(
+         "--train-data-dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"]
+     )
+     # parser.add_argument("--val-data-dir", type=str,
+     #                     default=os.environ["SM_CHANNEL_VALIDATION"])
+     # parser.add_argument("--test-data-dir", type=str,
+     #                     default=os.environ["SM_CHANNEL_TEST"])
+     parser.add_argument("--text-column", type=str, default="dialogue")
+     parser.add_argument("--target-column", type=str, default="summary")
+     parser.add_argument("--max-source", type=int, default=512)
+     parser.add_argument("--max-target", type=int, default=80)
+     parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
+     parser.add_argument("--epoch", type=int, default=5)
+     parser.add_argument("--train-batch-size", type=int, default=2)
+     parser.add_argument("--eval-batch-size", type=int, default=2)
+     parser.add_argument("--warmup-steps", type=int, default=500)
+     parser.add_argument("--lr", type=float, default=2e-5)
+     parser.add_argument("--weight-decay", type=float, default=0.0)
+     parser.add_argument("--log-dir", type=str, default=os.environ["SM_OUTPUT_DIR"])
+     parser.add_argument("--logging-strategy", type=str, default="epoch")
+     train(parser.parse_args())
model-sagemaker-5epochs/code/inference.py ADDED
@@ -0,0 +1,50 @@
+ # This is the script that will be used in the inference container
+ import json
+ import torch
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ def model_fn(model_dir):
+     """
+     Load the model and tokenizer for inference
+     """
+     tokenizer = AutoTokenizer.from_pretrained(model_dir)
+     model = AutoModelForSeq2SeqLM.from_pretrained(model_dir).to(device).eval()
+
+     return {"model": model, "tokenizer": tokenizer}
+
+
+ def predict_fn(input_data, model_dict):
+     """
+     Make a prediction with the model
+     """
+     text = input_data.pop("inputs")
+     parameters_list = input_data.pop("parameters_list", None)
+
+     tokenizer = model_dict["tokenizer"]
+     model = model_dict["model"]
+
+     # Parameters may or may not be passed
+     input_ids = tokenizer(
+         text, truncation=True, padding="longest", return_tensors="pt"
+     ).input_ids.to(device)
+
+     if parameters_list:
+         predictions = []
+         for parameters in parameters_list:
+             output = model.generate(input_ids, **parameters)
+             predictions.append(tokenizer.batch_decode(output, skip_special_tokens=True))
+     else:
+         output = model.generate(input_ids)
+         predictions = tokenizer.batch_decode(output, skip_special_tokens=True)
+
+     return predictions
+
+
+ def input_fn(request_body, request_content_type):
+     """
+     Transform the input request to a dictionary
+     """
+     return json.loads(request_body)
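
Note: the three handlers above follow the SageMaker inference-toolkit contract (input_fn parses the request, predict_fn consumes the dict returned by model_fn). A rough local smoke-test sketch, assuming it is run from the repo root; the request body is an invented example:

# Rough local smoke test for the handlers above; assumes the repo root
# as working directory, with the script dir added to sys.path.
import json
import sys
sys.path.append("model-sagemaker-5epochs/code")
from inference import model_fn, input_fn, predict_fn

model_dict = model_fn("model-sagemaker-5epochs/")
request_body = json.dumps({
    "inputs": 'tia: {"investee": "NOSH", "stage": "Pre-series A"}',
    "parameters_list": [{"num_beams": 2, "num_return_sequences": 2}],
})
payload = input_fn(request_body, "application/json")
print(predict_fn(payload, model_dict))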
model-sagemaker-5epochs/code/requirements.txt ADDED
@@ -0,0 +1,2 @@
+ nltk
+ rouge_score
model-sagemaker-5epochs/code/train.py ADDED
@@ -0,0 +1,215 @@
+ # This is the script that will be used in the training container
+ import argparse
+ import logging
+ import os
+ import sys
+
+ import numpy as np
+ import nltk
+
+ try:
+     nltk.data.find("tokenizers/punkt")
+ except LookupError as e:
+     print(e)
+     try:
+         nltk.download("punkt")
+     except FileExistsError as e:
+         print(e)
+         pass
+
+ from nltk import sent_tokenize
+
+ from datasets import load_metric, load_from_disk
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+ logger.addHandler(logging.StreamHandler(sys.stdout))
+
+
+ def tokenize(batch, text_column, target_column, max_source, max_target):
+     tokenized_input = tokenizer(
+         batch[text_column], padding="max_length", truncation=True, max_length=max_source
+     )
+     tokenized_target = tokenizer(
+         batch[target_column],
+         padding="max_length",
+         truncation=True,
+         max_length=max_target,
+     )
+
+     tokenized_input["labels"] = tokenized_target["input_ids"]
+
+     return tokenized_input
+
+
+ def load_and_tokenize_dataset(
+     data_dir, split, text_column, target_column, max_source, max_target
+ ):
+     dataset = load_from_disk(os.path.join(data_dir, split))
+     tokenized_dataset = dataset.map(
+         lambda x: tokenize(x, text_column, target_column, max_source, max_target),
+         batched=True,
+         batch_size=512,
+     )
+     tokenized_dataset.set_format(
+         "numpy", columns=["input_ids", "attention_mask", "labels"]
+     )
+
+     return tokenized_dataset
+
+
+ def compute_metrics(eval_pred):
+     # Currently unused: the compute_metrics argument is commented out
+     # in the Trainer below.
+     metric = load_metric('glue', 'mrpc')
+     predictions, references = eval_pred
+     return metric.compute(predictions=predictions, references=references)
+
+ # metric = load_metric("rouge")
+ # predictions, labels = eval_pred
+ # decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+
+ # # Replace -100 in the labels as we can't decode them.
+ # labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+ # decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+ # # Rouge expects a newline after each sentence
+ # decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
+ #                  for pred in decoded_preds]
+ # decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
+ #                   for label in decoded_labels]
+
+ # # Compute ROUGE scores
+ # logger.info("Decoded preds: %s" % decoded_preds)
+ # logger.info("Decoded labels: %s" % decoded_labels)
+ # result = metric.compute(predictions=decoded_preds, references=decoded_labels,
+ #                         use_stemmer=True)
+
+ # # Extract ROUGE f1 scores
+ # result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
+
+ # # Add mean generated length to metrics
+ # prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
+ #                    for pred in predictions]
+ # result["gen_len"] = np.mean(prediction_lens)
+
+ # return {k: round(v, 4) for k, v in result.items()}
+
+
+ def train(args):
+     from transformers import T5ForConditionalGeneration, T5Tokenizer
+
+     logger.info("Loading tokenizer...\n")
+     global tokenizer
+     global model_name
+     model_name = args.model_name
+
+     logger.info("Loading pretrained model\n")
+     if "google" in model_name:
+         model = T5ForConditionalGeneration.from_pretrained(model_name)
+         tokenizer = T5Tokenizer.from_pretrained(model_name)
+     else:
+         model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+         tokenizer = AutoTokenizer.from_pretrained(model_name)
+     logger.info("Pretrained model loaded\n")
+
+     logger.info("Fetching and tokenizing data for training")
+     train_dataset = load_and_tokenize_dataset(
+         args.train_data_dir,
+         "train",
+         args.text_column,
+         args.target_column,
+         args.max_source,
+         args.max_target,
+     )
+
+     logger.info("Tokenized training data loaded")
+
+     eval_dataset = load_and_tokenize_dataset(
+         args.train_data_dir,
+         "validation",
+         args.text_column,
+         args.target_column,
+         args.max_source,
+         args.max_target,
+     )
+     test_dataset = load_and_tokenize_dataset(
+         args.train_data_dir,
+         "test",
+         args.text_column,
+         args.target_column,
+         args.max_source,
+         args.max_target,
+     )
+
+     logger.info("Defining training arguments\n")
+     training_args = Seq2SeqTrainingArguments(
+         output_dir=args.model_dir,
+         num_train_epochs=args.epoch,
+         per_device_train_batch_size=args.train_batch_size,
+         per_device_eval_batch_size=args.eval_batch_size,
+         learning_rate=args.lr,
+         warmup_steps=args.warmup_steps,
+         weight_decay=args.weight_decay,
+         logging_dir=args.log_dir,
+         logging_strategy=args.logging_strategy,
+         load_best_model_at_end=True,
+         adafactor=True,
+         do_train=True,
+         do_eval=True,
+         do_predict=True,
+         save_total_limit=3,
+         evaluation_strategy="epoch",
+         save_strategy="epoch",
+         predict_with_generate=True,
+         metric_for_best_model="eval_loss",
+         seed=7,
+     )
+
+     logger.info("Defining seq2seq Trainer")
+     trainer = Seq2SeqTrainer(
+         model=model,
+         args=training_args,
+         train_dataset=train_dataset,
+         eval_dataset=eval_dataset,
+         tokenizer=tokenizer,
+         # compute_metrics=compute_metrics,
+     )
+
+     logger.info("Starting Training")
+     trainer.train()
+     logger.info("Model trained successfully")
+     trainer.save_model()
+     logger.info("Model saved successfully")
+
+     # Evaluation
+     logger.info("*** Evaluate on test set ***")
+
+     logger.info(trainer.predict(test_dataset))
+
+     logger.info("Removing unused checkpoints to save space in container")
+     os.system(f"rm -rf {args.model_dir}/checkpoint-*/")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model-name", type=str, default="google/pegasus-xsum")
+     parser.add_argument(
+         "--train-data-dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"]
+     )
+     # parser.add_argument("--val-data-dir", type=str,
+     #                     default=os.environ["SM_CHANNEL_VALIDATION"])
+     # parser.add_argument("--test-data-dir", type=str,
+     #                     default=os.environ["SM_CHANNEL_TEST"])
+     parser.add_argument("--text-column", type=str, default="dialogue")
+     parser.add_argument("--target-column", type=str, default="summary")
+     parser.add_argument("--max-source", type=int, default=512)
+     parser.add_argument("--max-target", type=int, default=80)
+     parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
+     parser.add_argument("--epoch", type=int, default=5)
+     parser.add_argument("--train-batch-size", type=int, default=2)
+     parser.add_argument("--eval-batch-size", type=int, default=2)
+     parser.add_argument("--warmup-steps", type=int, default=500)
+     parser.add_argument("--lr", type=float, default=2e-5)
+     parser.add_argument("--weight-decay", type=float, default=0.0)
+     parser.add_argument("--log-dir", type=str, default=os.environ["SM_OUTPUT_DIR"])
+     parser.add_argument("--logging-strategy", type=str, default="epoch")
+     train(parser.parse_args())
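
Note: train.py reads its data channel and output paths from the standard SageMaker environment variables (SM_CHANNEL_TRAIN, SM_MODEL_DIR, SM_OUTPUT_DIR), so it is normally launched through an estimator rather than run directly. A hypothetical launch sketch; the role ARN, S3 URI, instance type, framework versions, and hyperparameter values here are all assumptions, not facts from this repo:

# Hypothetical SageMaker launch for code/train.py. Role, S3 path,
# instance type, versions, and hyperparameters are placeholders.
from sagemaker.huggingface import HuggingFace

estimator = HuggingFace(
    entry_point="train.py",
    source_dir="model-sagemaker-5epochs/code",
    role="arn:aws:iam::123456789012:role/SageMakerRole",  # placeholder
    instance_type="ml.p3.2xlarge",
    instance_count=1,
    transformers_version="4.6",
    pytorch_version="1.7",
    py_version="py36",
    hyperparameters={
        "model-name": "google/t5-v1_1-base",
        "epoch": 5,  # matches the model-sagemaker-5epochs naming
        "text-column": "input_text",
        "target-column": "target_text",
    },
)
estimator.fit({"train": "s3://my-bucket/tia-dataset"})  # placeholder S3 URI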
model-sagemaker-5epochs/config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "_name_or_path": "google/t5-v1_1-base",
+   "architectures": [
+     "T5ForConditionalGeneration"
+   ],
+   "d_ff": 2048,
+   "d_kv": 64,
+   "d_model": 768,
+   "decoder_start_token_id": 0,
+   "dropout_rate": 0.1,
+   "eos_token_id": 1,
+   "feed_forward_proj": "gated-gelu",
+   "gradient_checkpointing": false,
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "t5",
+   "num_decoder_layers": 12,
+   "num_heads": 12,
+   "num_layers": 12,
+   "output_past": true,
+   "pad_token_id": 0,
+   "relative_attention_num_buckets": 32,
+   "tie_word_embeddings": false,
+   "transformers_version": "4.6.1",
+   "use_cache": true,
+   "vocab_size": 32128
+ }
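
Note: this is the stock t5-v1_1-base architecture config; the fine-tune changes only the weights. If useful, a quick sketch to inspect the committed config without loading the ~1 GB weights:

# Sketch: inspect the committed config only, no weight loading.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("model-sagemaker-5epochs/")
print(config.model_type, config.num_layers, config.d_model)  # t5 12 768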
model-sagemaker-5epochs/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d13160a831c6a1f94d5205c958f8b79a9468ed83497ed3098aeb7645d0d109a1
+ size 990445401
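
Note: as with the other LFS entries in this commit (spiece.model, training_args.bin, train.csv), what is committed is only the three-line Git LFS pointer (spec version, SHA-256 oid, byte size); the ~990 MB weights live in LFS storage. A small parsing sketch, assuming the checkout has not run git lfs pull so the file on disk is still the text stub:

# Sketch: parse a Git LFS pointer stub like the one above.
# Assumes the file is still the small text pointer, not the binary.
def parse_lfs_pointer(path):
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

pointer = parse_lfs_pointer("model-sagemaker-5epochs/pytorch_model.bin")
print(pointer["oid"], pointer["size"])  # sha256:d13160a8... 990445401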
model-sagemaker-5epochs/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"]}
model-sagemaker-5epochs/spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
+ size 791656
model-sagemaker-5epochs/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 100, "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"], "model_max_length": 512, "name_or_path": "google/t5-v1_1-base", "special_tokens_map_file": "/root/.cache/huggingface/transformers/76bf19bfedb85afbe644966ca9ab7b0404d753a41bf601115bced39f825ffa9c.c94798918c92ded6aeef2d2f0e666d2cc4145eca1aa6e1336fde07f2e13e2f46", "tokenizer_file": null}
model-sagemaker-5epochs/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:92e99d1bdd152d96b8c6dd0796be148e20b4a351b909b0744df5c127c8709351
+ size 2479
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio==3.0.22
+ pandas==1.4.2
+ torch==1.10.2
+ transformers==4.19.2
+ sentencepiece==0.1.95
+ sentence-transformers==2.2.2
train.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:999febaa4d0e013cb0c89ba43c657bfdf13d9d7d8e52f4050b64341ff833489d
+ size 34332589