# Import Required Libraries:
import numpy as np
import pandas as pd

import torch
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import re

# Read the data
def read_data(path):
    try:
        df = pd.read_csv(path)
        if df.empty:
            print("The file is empty.")
            return None
        return df
    except FileNotFoundError:
        print(f"File not found at: {path}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Path to your data file
data_path = r"E:\transactify\transactify\Dataset\transaction_data.csv"

# Read the data and check if it was loaded successfully
data = read_data(data_path)
if data is not None:
    print("Data loaded successfully:")
    print(data.head(15))
else:
    print("Data loading failed. Exiting...")
    exit()

# Cleaning the text
def clean_text(text):
    text = text.lower()                        # Convert to lowercase
    text = re.sub(r"\d+", " ", text)           # Remove digits
    text = re.sub(r"[^\w\s]", " ", text)       # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()   # Collapse repeated whitespace and trim
    return text
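
# For illustration, a description such as "AMAZON Order #1234!" would come out of
# clean_text() as "amazon order" after lowercasing, digit/punctuation removal, and
# whitespace cleanup.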

# Preprocessing the data
def preprocessing_data(df, max_length=20):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    
    input_ids = []
    attention_masks = []
    kept_rows = []  # Indices of rows that survive cleaning, so labels stay aligned
    
    # Ensure the dataframe has the required columns
    if "Transaction Description" not in df.columns or "Category" not in df.columns:
        raise ValueError("The required columns 'Transaction Description' and 'Category' are missing from the dataset.")
    
    for row_idx, description in enumerate(df["Transaction Description"]):
        cleaned_text = clean_text(description)
        
        # Debugging print statements
        # print(f"Original Description: {description}")
        # print(f"Cleaned Text: {cleaned_text}")
        
        # Only tokenize if the cleaned text is not empty
        if cleaned_text:
            encoded_dict = tokenizer.encode_plus(
                cleaned_text,
                add_special_tokens=True,    # Add the [CLS] and [SEP] tokens BERT expects
                max_length=max_length,
                padding="max_length",       # Pad every sequence to max_length
                return_attention_mask=True,
                return_tensors="pt",
                truncation=True
            )

            input_ids.append(encoded_dict["input_ids"])              # Append input IDs
            attention_masks.append(encoded_dict["attention_mask"])   # Append attention masks
            kept_rows.append(row_idx)                                # Remember which row was kept
        else:
            print("Cleaned text is empty, skipping...")

    # Debugging output to check sizes
    print(f"Total input_ids collected: {len(input_ids)}")
    print(f"Total attention_masks collected: {len(attention_masks)}")
    
    if not input_ids:
        raise ValueError("No input_ids were collected. Check the cleaning process.")

    # Concatenating the list of tensors to form a single tensor
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    
    # Encoding the labels, using only the rows that were actually tokenized
    labelencoder = LabelEncoder()
    labels = labelencoder.fit_transform(df["Category"].iloc[kept_rows])
    labels = torch.tensor(labels, dtype=torch.long)  # Convert labels to LongTensor
    
    return input_ids, attention_masks, labels, labelencoder
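
# Note: the fitted LabelEncoder is returned so integer predictions can later be
# mapped back to category names, e.g. labelencoder.inverse_transform([0, 2]).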

# Split the data into train and test sets
def split_data(input_ids, attention_masks, labels, test_size=0.2, random_state=42):
    # Split ids, masks, and labels in a single call so the three tensors stay aligned
    X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train, y_test = train_test_split(
        input_ids, attention_masks, labels, test_size=test_size, random_state=random_state
    )

    return X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train, y_test

# Preprocess the data and split into train and test sets
input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)
X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train, y_test = split_data(input_ids, attention_masks, labels)

# Output the sizes of the splits for confirmation
print(f"Training set size: {X_train_ids.shape[0]}")
print(f"Test set size: {X_test_ids.shape[0]}")