File size: 4,967 Bytes
0cb9929
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# Import Required Libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertModel, AdamW
from sklearn.metrics import accuracy_score
import numpy as np

# Import functions from the preprocessing module
from transactify.data_preprocessing import preprocessing_data, split_data, read_data

# Define a BERT-based classification model
class BertClassifier(nn.Module):
    """Pretrained BERT encoder topped with dropout and a linear output layer.

    Args:
        num_labels: Width of the output layer, i.e. how many scores are
            produced per input sequence.
        dropout_rate: Dropout probability applied to the pooled encoder
            output before the linear layer.
    """

    def __init__(self, num_labels, dropout_rate=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(p=dropout_rate)
        # The head's input width follows the encoder's configured hidden size.
        self.classifier = nn.Linear(
            in_features=self.bert.config.hidden_size,
            out_features=num_labels,
        )

    def forward(self, input_ids, attention_mask):
        """Return the classifier logits for one batch of encoded inputs."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder result is the pooler output (CLS token).
        pooled = encoded[1]
        return self.classifier(self.dropout(pooled))

# Training the model
def _mean_or_nan(total, count):
    """Return ``total / count``, or NaN when nothing was accumulated."""
    return total / count if count else float("nan")


def train_model(model, train_dataloader, val_dataloader, device, epochs=3, lr=2e-5):
    """Train ``model`` and print loss/accuracy figures after every epoch.

    Each epoch runs one optimisation pass over ``train_dataloader`` followed by
    a gradient-free evaluation pass over ``val_dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` whose
            output is fed to ``nn.CrossEntropyLoss`` and ``torch.argmax(dim=1)``.
        train_dataloader: Iterable of ``(input_ids, attention_mask, labels)``
            batches used for the weight updates.
        val_dataloader: Iterable of batches with the same layout, used only
            for evaluation.
        device: Device every batch tensor is moved to before the forward pass.
        epochs: Number of passes over ``train_dataloader``.
        lr: Learning rate handed to the optimizer.
    """
    # NOTE(review): `AdamW` is the class imported from `transformers` at the top
    # of this file. Upstream deprecated it in favour of `torch.optim.AdamW`;
    # confirm it still exists in the installed transformers version.
    optimizer = AdamW(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        # ---- optimisation pass ----
        model.train()
        total_train_loss = 0.0
        train_batches = 0
        for b_input_ids, b_input_mask, b_labels in train_dataloader:
            b_input_ids = b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)
            b_labels = b_labels.to(device).long()  # Ensure labels are LongTensor

            model.zero_grad()
            outputs = model(b_input_ids, b_input_mask)

            loss = loss_fn(outputs, b_labels)
            total_train_loss += loss.item()
            loss.backward()
            optimizer.step()
            train_batches += 1

        # Averaging over the batches actually seen avoids a ZeroDivisionError
        # when the loader is empty.
        avg_train_loss = _mean_or_nan(total_train_loss, train_batches)
        print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}")

        # ---- evaluation pass ----
        model.eval()
        total_val_correct = 0
        total_val_loss = 0.0
        val_batches = 0
        val_samples = 0

        with torch.no_grad():
            for b_input_ids, b_input_mask, b_labels in val_dataloader:
                b_input_ids = b_input_ids.to(device)
                b_input_mask = b_input_mask.to(device)
                # Same cast as in the training loop, so both passes accept the
                # same label dtypes.
                b_labels = b_labels.to(device).long()

                outputs = model(b_input_ids, b_input_mask)
                loss = loss_fn(outputs, b_labels)
                total_val_loss += loss.item()

                preds = torch.argmax(outputs, dim=1)
                total_val_correct += (preds == b_labels).sum().item()
                val_batches += 1
                val_samples += b_labels.size(0)

        # Accuracy is normalised by the samples evaluated in this pass rather
        # than by the size of the underlying dataset.
        avg_val_accuracy = _mean_or_nan(total_val_correct, val_samples)
        avg_val_loss = _mean_or_nan(total_val_loss, val_batches)
        print(f"Validation Loss: {avg_val_loss}, Validation Accuracy: {avg_val_accuracy}")

# Testing the model
def test_model(model, test_dataloader, device):
    """Evaluate ``model`` on ``test_dataloader`` and report its accuracy.

    Accuracy is accumulated as a running count of correct predictions, the
    same way the validation pass in ``train_model`` computes it.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; the class
            with the highest score along ``dim=1`` is taken as the prediction.
        test_dataloader: Iterable of ``(input_ids, attention_mask, labels)``
            batches.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        The fraction of samples whose prediction equals its label, or NaN when
        the loader yields no samples. The value is also printed.
    """
    model.eval()
    correct = 0
    seen = 0
    with torch.no_grad():
        for b_input_ids, b_input_mask, b_labels in test_dataloader:
            b_input_ids = b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)
            # Flatten so labels of shape (batch, 1) compare element-wise with
            # the 1-D predictions instead of broadcasting.
            b_labels = b_labels.to(device).reshape(-1)

            outputs = model(b_input_ids, b_input_mask)
            preds = torch.argmax(outputs, dim=1)

            correct += (preds == b_labels).sum().item()
            seen += preds.size(0)

    # An empty loader yields NaN rather than raising.
    accuracy = correct / seen if seen else float("nan")
    print(f"Test Accuracy: {accuracy}")
    return accuracy


# The "test_" prefix makes pytest treat this helper as a test case whenever it
# is imported into a test module; opt out of collection explicitly.
test_model.__test__ = False

# Main function to train, validate, and test the model
def main(data_path, epochs=3, batch_size=16):
    """Run the full pipeline: load data, train the classifier, report accuracy.

    Args:
        data_path: Location of the dataset, passed straight to ``read_data``.
        epochs: Number of training epochs forwarded to ``train_model``.
        batch_size: Batch size used for every ``DataLoader``.

    Returns:
        The trained ``BertClassifier``, or ``None`` when ``read_data`` returned
        ``None`` and nothing could be trained.
    """
    # Read and preprocess data
    data = read_data(data_path)
    if data is None:
        return None

    input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)
    X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train, y_test = split_data(
        input_ids, attention_masks, labels
    )

    # One output unit per class known to the label encoder.
    num_labels = len(labelencoder.classes_)

    # Create the model and move it to the GPU when one is available.
    model = BertClassifier(num_labels)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Training batches are reshuffled every epoch so the optimizer does not see
    # the samples in the same fixed order on each pass.
    train_dataset = TensorDataset(X_train_ids, X_train_masks, y_train)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Evaluation order does not affect the metrics, so no shuffling here.
    val_dataset = TensorDataset(X_test_ids, X_test_masks, y_test)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    # Train the model
    train_model(model, train_dataloader, val_dataloader, device, epochs=epochs)

    # NOTE(review): split_data yields a single held-out split, so the final
    # "test" report reuses the validation data and is not an independent
    # estimate; it repeats the last epoch's validation accuracy.
    test_model(model, val_dataloader, device)

    return model

if __name__ == "__main__":
    import sys

    # Script entry point: an optional first command-line argument overrides
    # the dataset location; without it the original default path is used.
    default_data_path = r"E:\transactify\transactify\Dataset\transaction_data.csv"
    data_path = sys.argv[1] if len(sys.argv) > 1 else default_data_path
    main(data_path)