import numpy as np
import pandas as pd

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from fastai.text.all import *
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset

# Allow fastai's `L` class (used inside fastai checkpoints) when unpickling
# with weights_only=True; add_safe_globals expects the objects themselves,
# not their names as strings.
torch.serialization.add_safe_globals([L])

class QuestionDataset(Dataset):
    """Tokenizes question text on the fly and yields (input_ids, target) pairs."""

    def __init__(self, X, y, tokenizer):
        self.text = X
        self.targets = y
        self.tok = tokenizer

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        text = self.text[idx]
        targ = self.targets[idx]
        # Pad/truncate every question to 30 tokens; keep only the token ids and
        # drop the batch dimension added by return_tensors="pt".
        return self.tok(text, padding='max_length',
                        truncation=True,
                        max_length=30,
                        return_tensors="pt")["input_ids"][0], tensor(targ)

    def new_empty(self):
        # fastai calls this when it needs an empty copy of the dataset
        # (e.g. when building test-time DataLoaders).
        return QuestionDataset([], [], self.tok)
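
# Quick sanity check (a hypothetical usage sketch; the tokenizer name matches
# the one loaded below in ModelLoader):
#
#   tok = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')
#   ds = QuestionDataset(["Is this question sincere?"], [0], tok)
#   ids, targ = ds[0]
#   ids.shape   # torch.Size([30]) -- padded/truncated to max_length=30
#   targ        # tensor(0)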

class ModelLoader:
    def __init__(self):
        self.path = "DeBERTaV3/input/"
        self.train_df = pd.read_csv(self.path + "train.csv")
        self.test_df = pd.read_csv(self.path + "test.csv")

        self.tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')
        self.df = self.train_df
        
        # Train/validation split
        self.X_train, self.X_valid, self.y_train, self.y_valid = train_test_split(
            self.df["question_text"].tolist(),
            self.df["target"].tolist(),
            stratify=self.df["target"],
            test_size=0.01
        )

        self.train_ds = QuestionDataset(self.X_train, self.y_train, self.tokenizer)
        self.valid_ds = QuestionDataset(self.X_valid, self.y_valid, self.tokenizer)

        # Shuffle training batches each epoch; keep validation deterministic.
        self.train_dl = DataLoader(self.train_ds, batch_size=256, shuffle=True)
        self.valid_dl = DataLoader(self.valid_ds, batch_size=512)
        self.dls = DataLoaders(self.train_dl, self.valid_dl)

        # Pretrained backbone, left in train mode for fine-tuning.
        self.bert = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-base').train()

        # Swap the stock single-layer classification head for a wider MLP
        # (768 is the hidden size of deberta-v3-base).
        self.classifier = nn.Sequential(
            nn.Linear(768, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1024, 2)
        )

        self.bert.classifier = self.classifier

        class BertClassifier(nn.Module):
            """Thin wrapper so the model returns raw logits, as fastai expects."""
            def __init__(self, bert):
                super().__init__()
                self.bert = bert

            def forward(self, x):
                return self.bert(x).logits

        self.model = BertClassifier(self.bert)

        # Class weights for the imbalanced targets: since n_1 << n_0, the
        # minority class gets n / (n + n_1) close to 1 while the majority class
        # gets n / (n + n_0) close to 0.5. Moved to the default device so the
        # loss weight lives where the model runs.
        n_0 = (self.train_df["target"] == 0).sum()
        n_1 = (self.train_df["target"] == 1).sum()
        n = n_0 + n_1

        self.class_weights = tensor([n / (n + n_0), n / (n + n_1)]).to(default_device())
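        # Worked example with hypothetical counts: if n_0 = 94_000 sincere and
        # n_1 = 6_000 insincere questions, then n = 100_000 and the weights are
        # [100/194, 100/106] ≈ [0.52, 0.94], up-weighting the rare positive class.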
        self.learn = Learner(self.dls, self.model,
                             loss_func=nn.CrossEntropyLoss(weight=self.class_weights),
                             metrics=[accuracy, F1Score()]).to_fp16()
        try:
            # First attempt: Try loading with weights_only=True
            self.learn.load('fastai_QIQC-deberta-v3', strict=False, weights_only=True)
        except Exception as e:
            print(f"Warning: Could not load with weights_only=True. Falling back to default loading. Error: {e}")
            # Second attempt: Fall back to regular loading if the first attempt fails
            self.learn.load('fastai_QIQC-deberta-v3', strict=False)

    def get_learner(self):
        return self.learn
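
# Minimal usage sketch, assuming DeBERTaV3/input/train.csv and test.csv exist
# and the 'fastai_QIQC-deberta-v3' checkpoint is in the default models/ folder:
if __name__ == "__main__":
    learn = ModelLoader().get_learner()
    # validate() returns the validation loss followed by the metrics
    # (accuracy and F1) on the held-out split.
    print(learn.validate())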