thak123 committed
Commit f08fa03
Parent(s): Duplicate from FFZG-cleopatra/Croatian-News-Sentiment-Classifier-V1
Browse files
- .gitattributes +34 -0
- README.md +13 -0
- app.py +78 -0
- config.py +19 -0
- data_predict.py +48 -0
- model.py +19 -0
- mtm.py +214 -0
- requirements.txt +8 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,13 @@
+---
+title: Croatian Sentiment News Classifier
+emoji: 🦀
+colorFrom: pink
+colorTo: indigo
+sdk: gradio
+sdk_version: 3.29.0
+app_file: app.py
+pinned: false
+duplicated_from: FFZG-cleopatra/Croatian-News-Sentiment-Classifier-V1
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,78 @@
+import datasets
+import numpy as np
+import torch
+import transformers
+from config import epochs, batch_size, learning_rate, id2label
+from model import tokenizer, multitask_model
+from mtm import MultitaskTrainer, NLPDataCollator, DataLoaderWithTaskname
+import pandas as pd
+from datasets import Dataset, DatasetDict
+from data_predict import convert_to_stsb_features, convert_to_features
+import gradio as gr
+from huggingface_hub import hf_hub_download, snapshot_download
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Version 1 - Croatian Document + Slovenian Document.
+model_link = hf_hub_download(repo_id="FFZG-cleopatra/Croatian-Document-News-Sentiment-Classifier", filename="pytorch_model.bin")
+
+multitask_model.load_state_dict(torch.load(model_link, map_location=device))
+multitask_model.to(device)
+
+def predict_sentiment(sentence="Volim ti"):
+    # gather everything if you want to have a single DatasetDict
+    document = DatasetDict({
+        # "train": Dataset.from_pandas(df_document_sl_hr_train),
+        # "valid": Dataset.from_pandas(df_document_sl_hr_valid),
+        "test": Dataset.from_dict({"content": [sentence]})
+    })
+
+    dataset_dict = {
+        "document": document,
+    }
+
+    for task_name, dataset in dataset_dict.items():
+        print(task_name)
+        print(dataset_dict[task_name]["test"][0])
+        print()
+
+    convert_func_dict = {
+        "document": convert_to_stsb_features,
+        # "paragraph": convert_to_stsb_features,
+        # "sentence": convert_to_stsb_features,
+    }
+
+    features_dict = convert_to_features(dataset_dict, convert_func_dict)
+
+    predictions = []
+
+    for _, batch in enumerate(features_dict["document"]['test']):
+        for key, value in batch.items():
+            batch[key] = batch[key].to(device)
+
+        task_model = multitask_model.get_model("document")
+        classifier_output = task_model.forward(
+            torch.unsqueeze(batch["input_ids"], 0),
+            torch.unsqueeze(batch["attention_mask"], 0),)
+
+        print(tokenizer.decode(batch["input_ids"], skip_special_tokens=True))
+        print("logits:", classifier_output.logits)
+        prediction = torch.max(classifier_output.logits, axis=1)
+        predictions.append(prediction.indices.item())
+
+    print("predictions:", predictions[0], id2label[predictions[0]])
+    return id2label[predictions[0]]
+
+
+interface = gr.Interface(
+    fn=predict_sentiment,
+    inputs='text',
+    outputs=['label'],
+    title='Croatian News Sentiment Analysis 1.0',
+    description='Get the positive/neutral/negative sentiment for the given input.'
+)
+
+
+interface.launch(inline=False)
config.py
ADDED
@@ -0,0 +1,19 @@
+import os
+
+print(os.listdir())
+model_name = "EMBEDDIA/crosloengual-bert"  # "FFZG-cleopatra/dummy-model" # "FFZG-cleopatra/Croatian-News-Classifier"
+
+print("model-name:", model_name)
+
+output_path = ""
+drop_out = 0.3
+max_length = 512
+epochs = 5
+label2id = {'neutral': 0, 'negative': 1, 'positive': 2}
+id2label = {0: 'neutral', 1: 'negative', 2: 'positive'}
+output_dir = ""
+batch_size = 16  # 32
+learning_rate = 2e-5
+
+from pip import _internal
+print(_internal.main(['list']))
data_predict.py
ADDED
@@ -0,0 +1,48 @@
+from datasets import Dataset, DatasetDict
+import pandas as pd
+from config import max_length, label2id
+from model import tokenizer
+import os
+import torch
+
+
+def convert_to_stsb_features(example_batch):
+    inputs = example_batch['content']
+    features = tokenizer.batch_encode_plus(
+        inputs, truncation=True, max_length=max_length, padding='max_length')
+
+    # features["labels"] = [label2id[i] for i in example_batch["sentiment"]]
+    features["labels"] = [0] * len(example_batch["content"])  # [i for i in range(len(example_batch["content"]))]
+    # features["nid"] = [int(i) for i in example_batch["nid"]]
+    return features
+
+
+def convert_to_features(dataset_dict, convert_func_dict):
+    columns_dict = {
+        "document": ['input_ids', 'attention_mask', 'labels'],
+        # "paragraph": ['input_ids', 'attention_mask', 'labels'],
+        # "sentence": ['input_ids', 'attention_mask', 'labels'],
+    }
+    features_dict = {}
+
+    for task_name, dataset in dataset_dict.items():
+        features_dict[task_name] = {}
+        print(task_name)
+        for phase, phase_dataset in dataset.items():
+            features_dict[task_name][phase] = phase_dataset.map(
+                convert_func_dict[task_name],
+                batched=True,
+                load_from_cache_file=False,
+            )
+            print(task_name, phase, len(phase_dataset),
+                  len(features_dict[task_name][phase]))
+            features_dict[task_name][phase].set_format(
+                type="torch",
+                columns=columns_dict[task_name],
+            )
+            print("=>", task_name, phase, len(phase_dataset),
+                  len(features_dict[task_name][phase]))
+    return features_dict
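
A small sanity-check sketch of what convert_to_stsb_features returns for a toy batch. Note that importing data_predict also pulls in the tokenizer and multitask model built in model.py, and the Croatian sentences and printed values are illustrative assumptions, not outputs taken from the repository:

from data_predict import convert_to_stsb_features

toy_batch = {"content": ["Ovo je dobra vijest.", "Ovo je loša vijest."]}
features = convert_to_stsb_features(toy_batch)

# A BERT-style tokenizer typically yields input_ids, token_type_ids and attention_mask;
# convert_to_stsb_features then adds dummy labels (the real sentiment labels are commented out above).
print(list(features.keys()))
print(len(features["input_ids"][0]))  # padded to max_length (512) from config.py
print(features["labels"])             # [0, 0]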
model.py
ADDED
@@ -0,0 +1,19 @@
+import transformers
+from mtm import MultitaskModel
+from config import model_name, drop_out
+
+multitask_model = MultitaskModel.create(
+    model_name=model_name,
+    model_type_dict={
+        "document": transformers.AutoModelForSequenceClassification,
+        "paragraph": transformers.AutoModelForSequenceClassification,
+        "sentence": transformers.AutoModelForSequenceClassification,
+    },
+    model_config_dict={
+        "document": transformers.AutoConfig.from_pretrained(model_name, num_labels=3, hidden_dropout_prob=drop_out, attention_probs_dropout_prob=drop_out),
+        "paragraph": transformers.AutoConfig.from_pretrained(model_name, num_labels=3, hidden_dropout_prob=drop_out, attention_probs_dropout_prob=drop_out),
+        "sentence": transformers.AutoConfig.from_pretrained(model_name, num_labels=3, hidden_dropout_prob=drop_out, attention_probs_dropout_prob=drop_out),
+    },
+)
+
+tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
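
The three task heads built above are meant to share a single encoder. A minimal sanity-check sketch, assuming the EMBEDDIA checkpoint resolves to a Bert* architecture so the encoder attribute is `bert` (as get_encoder_attr_name in mtm.py expects):

from model import multitask_model

# All task heads should reference the very same encoder module object,
# so fine-tuning one task also updates the encoder used by the others.
doc_encoder = multitask_model.get_model("document").bert
par_encoder = multitask_model.get_model("paragraph").bert
sent_encoder = multitask_model.get_model("sentence").bert
print(doc_encoder is par_encoder is sent_encoder)  # expected: True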
mtm.py
ADDED
@@ -0,0 +1,214 @@
+import transformers
+import torch
+import torch.nn as nn
+from torch.utils.data.sampler import RandomSampler
+from torch.utils.data.distributed import DistributedSampler
+from torch.utils.data.dataloader import DataLoader
+from transformers.data.data_collator import DataCollator
+from transformers.data.data_collator import DataCollatorWithPadding, InputDataClass
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
+from transformers import is_torch_tpu_available
+import numpy as np
+
+class MultitaskModel(transformers.PreTrainedModel):
+    def __init__(self, encoder, taskmodels_dict):
+        """
+        Setting MultitaskModel up as a PretrainedModel allows us
+        to take better advantage of Trainer features
+        """
+        super().__init__(transformers.PretrainedConfig())
+
+        self.encoder = encoder
+        self.taskmodels_dict = nn.ModuleDict(taskmodels_dict)
+
+    @classmethod
+    def create(cls, model_name, model_type_dict, model_config_dict):
+        """
+        This creates a MultitaskModel using the model class and config objects
+        from single-task models.
+
+        We do this by creating each single-task model, and having them share
+        the same encoder transformer.
+        """
+        shared_encoder = None
+        taskmodels_dict = {}
+        do = nn.Dropout(p=0.2)
+        for task_name, model_type in model_type_dict.items():
+            model = model_type.from_pretrained(
+                model_name,
+                config=model_config_dict[task_name],
+            )
+            if shared_encoder is None:
+                shared_encoder = getattr(
+                    model, cls.get_encoder_attr_name(model))
+            else:
+                setattr(model, cls.get_encoder_attr_name(
+                    model), shared_encoder)
+            taskmodels_dict[task_name] = model
+        return cls(encoder=shared_encoder, taskmodels_dict=taskmodels_dict)
+
+    @classmethod
+    def get_encoder_attr_name(cls, model):
+        """
+        The encoder transformer is named differently in each model "architecture".
+        This method lets us get the name of the encoder attribute
+        """
+        model_class_name = model.__class__.__name__
+        if model_class_name.startswith("Bert"):
+            return "bert"
+        elif model_class_name.startswith("Roberta"):
+            return "roberta"
+        elif model_class_name.startswith("Albert"):
+            return "albert"
+        else:
+            raise KeyError(f"Add support for new model {model_class_name}")
+
+    def forward(self, task_name, **kwargs):
+        return self.taskmodels_dict[task_name](**kwargs)
+
+    def get_model(self, task_name):
+        return self.taskmodels_dict[task_name]
+
+class NLPDataCollator(DataCollatorWithPadding):
+    """
+    Extending the existing DataCollator to work with NLP dataset batches
+    """
+
+    def collate_batch(self, features: List[Union[InputDataClass, Dict]]) -> Dict[str, torch.Tensor]:
+        first = features[0]
+        batch = None
+        if isinstance(first, dict):
+            # NLP datasets currently present features as lists of dictionaries
+            # (one per example), so we adapt the collate_batch logic for that
+            if "labels" in first and first["labels"] is not None:
+                if first["labels"].dtype == torch.int64:
+                    labels = torch.tensor([f["labels"]
+                                           for f in features], dtype=torch.long)
+                else:
+                    labels = torch.tensor([f["labels"]
+                                           for f in features], dtype=torch.float)
+                batch = {"labels": labels}
+            for k, v in first.items():
+                if k != "labels" and v is not None and not isinstance(v, str):
+                    batch[k] = torch.stack([f[k] for f in features])
+            return batch
+        else:
+            # otherwise, revert to using the default collate_batch
+            return DataCollatorWithPadding().collate_batch(features)
+
+
+class StrIgnoreDevice(str):
+    """
+    This is a hack. The Trainer is going to call .to(device) on every input
+    value, but we need to pass in an additional `task_name` string.
+    This prevents it from throwing an error
+    """
+
+    def to(self, device):
+        return self
+
+
+class DataLoaderWithTaskname:
+    """
+    Wrapper around a DataLoader to also yield a task name
+    """
+
+    def __init__(self, task_name, data_loader):
+        self.task_name = task_name
+        self.data_loader = data_loader
+
+        self.batch_size = data_loader.batch_size
+        self.dataset = data_loader.dataset
+
+    def __len__(self):
+        return len(self.data_loader)
+
+    def __iter__(self):
+        for batch in self.data_loader:
+            batch["task_name"] = StrIgnoreDevice(self.task_name)
+            yield batch
+
+
+class MultitaskDataloader:
+    """
+    Data loader that combines and samples from multiple single-task
+    data loaders.
+    """
+
+    def __init__(self, dataloader_dict):
+        self.dataloader_dict = dataloader_dict
+        self.num_batches_dict = {
+            task_name: len(dataloader)
+            for task_name, dataloader in self.dataloader_dict.items()
+        }
+        self.task_name_list = list(self.dataloader_dict)
+        self.dataset = [None] * sum(
+            len(dataloader.dataset)
+            for dataloader in self.dataloader_dict.values()
+        )
+
+    def __len__(self):
+        return sum(self.num_batches_dict.values())
+
+    def __iter__(self):
+        """
+        For each batch, sample a task, and yield a batch from the respective
+        task Dataloader.
+
+        We use size-proportional sampling, but you could easily modify this
+        to sample from some other distribution.
+        """
+        task_choice_list = []
+        for i, task_name in enumerate(self.task_name_list):
+            task_choice_list += [i] * self.num_batches_dict[task_name]
+        task_choice_list = np.array(task_choice_list)
+        np.random.shuffle(task_choice_list)
+        dataloader_iter_dict = {
+            task_name: iter(dataloader)
+            for task_name, dataloader in self.dataloader_dict.items()
+        }
+        for task_choice in task_choice_list:
+            task_name = self.task_name_list[task_choice]
+            yield next(dataloader_iter_dict[task_name])
+
+
+class MultitaskTrainer(transformers.Trainer):
+
+    def get_single_train_dataloader(self, task_name, train_dataset):
+        """
+        Create a single-task data loader that also yields task names
+        """
+        if self.train_dataset is None:
+            raise ValueError("Trainer: training requires a train_dataset.")
+        if False and is_torch_tpu_available():
+            train_sampler = get_tpu_sampler(train_dataset)
+        else:
+            train_sampler = (
+                RandomSampler(train_dataset)
+                if self.args.local_rank == -1
+                else DistributedSampler(train_dataset)
+            )
+
+        data_loader = DataLoaderWithTaskname(
+            task_name=task_name,
+            data_loader=DataLoader(
+                train_dataset,
+                batch_size=self.args.train_batch_size,
+                sampler=train_sampler,
+                collate_fn=self.data_collator.collate_batch,
+            ),
+        )
+        return data_loader
+
+    def get_train_dataloader(self):
+        """
+        Returns a MultitaskDataloader, which is not actually a Dataloader
+        but an iterable that returns a generator that samples from each
+        task Dataloader
+        """
+        return MultitaskDataloader({
+            task_name: self.get_single_train_dataloader(
+                task_name, task_dataset)
+            for task_name, task_dataset in self.train_dataset.items()
+        })
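
A minimal, self-contained sketch of how DataLoaderWithTaskname and MultitaskDataloader combine per-task loaders and tag every batch with its task name. The tiny pre-tokenised examples below are synthetic, used only to exercise the classes:

import torch
from torch.utils.data import DataLoader
from mtm import DataLoaderWithTaskname, MultitaskDataloader

# Four fake, already-tokenised "document" examples.
fake_examples = [
    {"input_ids": torch.zeros(8, dtype=torch.long),
     "attention_mask": torch.ones(8, dtype=torch.long),
     "labels": torch.tensor(0)}
    for _ in range(4)
]

doc_loader = DataLoaderWithTaskname(
    task_name="document",
    data_loader=DataLoader(fake_examples, batch_size=2),
)
multi_loader = MultitaskDataloader({"document": doc_loader})

for batch in multi_loader:
    # Each batch carries the task name that MultitaskModel.forward uses for routing.
    print(batch["task_name"], batch["input_ids"].shape)  # document torch.Size([2, 8])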
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+nltk
+datasets==1.6.2
+torch==1.8.1
+transformers==4.8.2
+pytorch-lightning==1.4.9
+tokenizers==0.10.3
+numpy==1.21.2
+scikit-learn==0.24.1