File size: 4,407 Bytes
5285b7f
 
 
 
 
 
 
 
 
 
 
86f28e8
 
 
5285b7f
 
 
 
86f28e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5285b7f
 
 
86f28e8
 
5285b7f
 
 
86f28e8
5285b7f
b24d99b
 
5285b7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b24d99b
5285b7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import datasets
import numpy as np
import torch
import transformers
from config import epochs, batch_size, learning_rate
from model import tokenizer, multitask_model
from mtm import MultitaskTrainer, NLPDataCollator, DataLoaderWithTaskname
import pandas as pd
# from data_5_LT23 import features_dict,extra_feature_dict
from data_predict import convert_to_stsb_features,convert_to_features

from huggingface_hub import hf_hub_download,snapshot_download


# Module-level containers. `features_dict` is read by the inference and
# evaluation code further down; the other two are not referenced by any
# active code in this file (only by the commented-out loader below).
features_dict = {}
extra_feature_dict = {}
sentinews_location = ""

# df_document_croatian_test = pd.read_csv(sentinews_location+"textlabel.tsv", sep="\t")
# df_document_croatian_test = df_document_croatian_test[["content"]]


def predict(texts=None):
    """Build model-ready features for the "document" task.

    Wraps the given texts in a ``DatasetDict`` with a single ``"test"``
    split whose only column is ``"content"``, then passes it through
    ``convert_to_features`` together with ``convert_to_stsb_features``.

    Args:
        texts: A string or an iterable of strings to featurize. When
            omitted, the single sample sentence ``"Volim ti"`` is used,
            which is what this function used before it took arguments.

    Returns:
        Whatever ``convert_to_features`` returns. The rest of this file
        indexes it as ``features_dict["document"]["test"]``.

    Raises:
        ValueError: If ``texts`` is an empty iterable.
    """
    # Imported here because the module only does `import datasets`, which
    # does not bring the bare `Dataset` / `DatasetDict` names into scope.
    from datasets import Dataset, DatasetDict

    if texts is None:
        texts = ["Volim ti"]
    elif isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)
    if not texts:
        raise ValueError("predict() needs at least one text to featurize")

    # Only a "test" split is needed for prediction.
    document = DatasetDict({
        "test": Dataset.from_dict({"content": texts}),
    })

    dataset_dict = {
        "document": document,
    }

    # Show the first example of every task as a quick sanity check.
    for task_name, dataset in dataset_dict.items():
        print(task_name)
        print(dataset["test"][0])
        print()

    # One conversion function per task; only "document" is active here.
    convert_func_dict = {
        "document": convert_to_stsb_features,
    }

    return convert_to_features(dataset_dict, convert_func_dict)


# predict() used to be defined but never invoked, and it dropped its result,
# so `features_dict` stayed empty and every `features_dict["document"]`
# lookup later in this file raised KeyError. Build the features once here.
features_dict = predict()



# Fetch the fine-tuned weights file from the Hugging Face Hub.
# model_link = snapshot_download(repo_id="FFZG-cleopatra/Croatian-News-Classifier")
model_link = hf_hub_download(
    repo_id="FFZG-cleopatra/Croatian-News-Classifier",
    filename="pytorch_model.bin",
)

# `device` is needed by the loop below; its definition had been commented out,
# which made the first `.to(device)` call fail with NameError.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# map_location keeps the load working on CPU-only hosts even when the
# checkpoint was saved from a GPU.
multitask_model.load_state_dict(torch.load(model_link, map_location=device))
# The model has to live on the same device as the inputs moved in the loop.
multitask_model.to(device)
# Inference only: switch off training-time layers such as dropout.
multitask_model.eval()

predictions = []
# The task head does not change between examples, so look it up once.
task_model = multitask_model.get_model("document")
with torch.no_grad():  # no gradients are needed for prediction
    for batch in features_dict["document"]["test"]:
        # Build moved copies instead of overwriting the dataset row in place.
        inputs = {key: value.to(device) for key, value in batch.items()}

        # Each item is a single example, so add a leading batch dimension.
        # Calling the module (rather than .forward) keeps module hooks active.
        classifier_output = task_model(
            torch.unsqueeze(inputs["input_ids"], 0),
            torch.unsqueeze(inputs["attention_mask"], 0),
        )

        # Decode from the original (un-moved) ids to echo the input text.
        print(tokenizer.decode(batch["input_ids"], skip_special_tokens=True))
        # Index of the highest logit along the class axis.
        prediction = torch.max(classifier_output.logits, dim=1)
        predictions.append(prediction.indices.item())

# NOTE(review): the file is named .tsv but to_csv writes comma-separated
# output by default; left unchanged because downstream readers are unknown.
pd.DataFrame({"original_predictions": predictions}).to_csv("eacl_slavic.tsv")


# Trainer arguments for an evaluation-only run: training is switched off and
# evaluation is switched on. Checkpoints and logs go under /tmp.
eval_args = transformers.TrainingArguments(
    learning_rate=learning_rate,
    output_dir="/tmp",
    do_train=False,
    do_eval=True,
    # Adjust batch size if this doesn't fit on the Colab GPU
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_steps=3000,
    load_best_model_at_end=True,
)

# Wrap the already-loaded multitask model so the evaluation code below can
# reuse the trainer's dataloader construction and prediction loop.
trainer = MultitaskTrainer(
    model=multitask_model,
    args=eval_args,
    data_collator=NLPDataCollator(tokenizer=tokenizer),
    callbacks=[],
)
# Show the featurized test split that is about to be evaluated.
print(features_dict["document"]["test"])

# Run the trainer's prediction loop once per task and keep the raw outputs.
tests_dict = {}
for task_name in ["document"]:  # "paragraph", "sentence"
    task_features = features_dict[task_name]["test"]
    # Tag the eval dataloader with its task name for the multitask trainer.
    test_dataloader = DataLoaderWithTaskname(
        task_name,
        trainer.get_eval_dataloader(task_features),
    )
    # Debug output: batch count, collate function in use, wrapped loader size.
    print(len(trainer.get_eval_dataloader(task_features)))
    print(test_dataloader.data_loader.collate_fn)
    print(len(test_dataloader.data_loader))
    tests_dict[task_name] = trainer.prediction_loop(
        test_dataloader,
        description=f"Testing: {task_name}",
    )
print(tests_dict)

# Report macro-averaged precision, recall and F1 for every evaluated task.
for task_name in ["document"]:  # "paragraph", "sentence"
    task_output = tests_dict[task_name]
    for metric in ["precision", "recall", "f1"]:
        scorer = datasets.load_metric(
            metric,
            name="dev {} {}".format(metric, task_name),
        )
        # The predicted class is the arg-max over axis 1 of the predictions.
        score = scorer.compute(
            predictions=np.argmax(task_output.predictions, axis=1),
            references=task_output.label_ids,
            average="macro",
        )
        print("test {} {}:".format(metric, task_name), score)
print()