# Spaces: Sleeping  (Hugging Face Spaces status banner captured when this file
# was exported from the web UI — not part of the program)
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from fastai.text.all import *
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
# Allowlist fastai's `L` list type for torch.load(weights_only=True) so the
# saved Learner checkpoint can be deserialized under the restricted unpickler.
# NOTE(review): torch.serialization.add_safe_globals is documented to take
# classes/functions (e.g. the fastai `L` class itself), not the string 'L' —
# confirm the installed torch version accepts string names here.
torch.serialization.add_safe_globals(['L'])
class QuestionDataset(Dataset):
    """Torch dataset pairing question strings with integer targets.

    Each item is ``(input_ids, target)``: the tokenizer's input ids padded or
    truncated to exactly 30 tokens, plus the label wrapped as a tensor.
    """

    def __init__(self, X, y, tokenizer):
        self.text = X          # sequence of question strings
        self.targets = y       # parallel sequence of integer labels
        self.tok = tokenizer   # HF-style tokenizer callable

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        encoded = self.tok(
            self.text[idx],
            padding='max_length',
            truncation=True,
            max_length=30,
            return_tensors="pt",
        )
        # Drop the batch axis the tokenizer adds with return_tensors="pt".
        input_ids = encoded["input_ids"][0]
        return input_ids, tensor(self.targets[idx])

    def new_empty(self):
        # fastai expects datasets to be able to produce an empty shell of
        # themselves (same tokenizer, no items).
        return QuestionDataset([], [], self.tok)
class ModelLoader:
    """Assemble the DeBERTa-v3 question classifier and its fastai Learner.

    On construction this: loads train/test CSVs, makes a stratified
    train/validation split, wraps the splits in QuestionDataset/DataLoaders,
    replaces the pretrained model's classification head with a custom MLP,
    builds a class-weighted fp16 Learner, and loads a saved checkpoint.
    """
    def __init__(self):
        # Data directory — assumes train.csv / test.csv exist here with
        # "question_text" and "target" columns (target column only in train).
        self.path = "DeBERTaV3/input/"
        self.train_df = pd.read_csv(self.path + "train.csv")
        self.test_df = pd.read_csv(self.path + "test.csv")
        self.tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')
        self.df = self.train_df
        # Train/validation split
        # NOTE(review): test_size=0.01 keeps only 1% for validation — verify
        # this is intentional (it matches loading a pre-trained checkpoint
        # rather than training from scratch).
        self.X_train, self.X_valid, self.y_train, self.y_valid = train_test_split(
            self.df["question_text"].tolist(),
            self.df["target"].tolist(),
            stratify=self.df["target"],
            test_size=0.01
        )
        self.train_ds = QuestionDataset(self.X_train, self.y_train, self.tokenizer)
        self.valid_ds = QuestionDataset(self.X_valid, self.y_valid, self.tokenizer)
        # NOTE(review): the train DataLoader is not shuffled (shuffle defaults
        # to False) — fine for inference/evaluation, but confirm before using
        # this Learner for further training.
        self.train_dl = DataLoader(self.train_ds, batch_size=256)
        self.valid_dl = DataLoader(self.valid_ds, batch_size=512)
        self.dls = DataLoaders(self.train_dl, self.valid_dl)
        self.bert = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-base').train()
        # Replace the stock classification head (768-d pooled features from
        # deberta-v3-base) with a wider MLP producing 2 logits.
        self.classifier = nn.Sequential(
            nn.Linear(768, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1024, 2)
        )
        self.bert.classifier = self.classifier
        # Thin wrapper so fastai sees a module returning raw logits instead of
        # the HF SequenceClassifierOutput object.
        class BertClassifier(nn.Module):
            def __init__(self, bert):
                super(BertClassifier, self).__init__()
                self.bert = bert
            def forward(self, x):
                return self.bert(x).logits
        self.model = BertClassifier(self.bert)
        # Calculate class weights
        n_0 = (self.train_df["target"] == 0).sum()
        n_1 = (self.train_df["target"] == 1).sum()
        n = n_0 + n_1
        # NOTE(review): weight_i = n / (n + n_i) gives the minority class the
        # larger weight (both values lie in (0.5, 1)), but this is not the
        # usual n / (2 * n_i) inverse-frequency formula — confirm it matches
        # how the checkpoint was trained.
        self.class_weights = tensor([n / (n + n_0), n / (n + n_1)])
        self.learn = Learner(self.dls, self.model,
                             loss_func=nn.CrossEntropyLoss(weight=self.class_weights),
                             metrics=[accuracy, F1Score()]).to_fp16()
        # strict=False: the checkpoint's head may not match the replaced
        # classifier exactly, so missing/unexpected keys are tolerated.
        try:
            # First attempt: Try loading with weights_only=True
            # NOTE(review): weights_only is forwarded by fastai's
            # Learner.load through to torch.load — confirm the installed
            # fastai version passes extra kwargs along.
            self.learn.load('fastai_QIQC-deberta-v3', strict=False, weights_only=True)
        except Exception as e:
            print(f"Warning: Could not load with weights_only=True. Falling back to default loading. Error: {e}")
            # Second attempt: Fall back to regular loading if the first attempt fails
            self.learn.load('fastai_QIQC-deberta-v3', strict=False)
    def get_learner(self):
        return self.learn