ananthakrishnan
committed on
Commit
·
0cb9929
1
Parent(s):
f03f427
tech: model creation
Browse files- __pycache__/datapreprocessing.cpython-312.pyc +0 -0
- bert_model.py +134 -0
- datapreprocessing.py → data_preprocessing.py +63 -26
- requirenments.txt → requirements.txt +0 -0
- setup.md +1 -1
__pycache__/datapreprocessing.cpython-312.pyc
ADDED
Binary file (4.3 kB). View file
|
|
bert_model.py
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Import Required Libraries
|
2 |
+
import torch
|
3 |
+
import torch.nn as nn
|
4 |
+
from torch.utils.data import DataLoader, TensorDataset
|
5 |
+
from transformers import BertModel, AdamW
|
6 |
+
from sklearn.metrics import accuracy_score
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
# Import functions from the preprocessing module
|
10 |
+
from transactify.data_preprocessing import preprocessing_data, split_data, read_data
|
11 |
+
|
12 |
+
# Define a BERT-based classification model
|
13 |
+
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        num_labels: Number of target classes; the size of the last
            dimension of the returned logits.
        dropout_rate: Dropout probability applied to the pooled encoder
            output before classification.
        pretrained_model_name: Checkpoint identifier handed to
            ``BertModel.from_pretrained``. Defaults to the checkpoint that
            was previously hard-coded, so existing callers are unaffected.
    """

    def __init__(self, num_labels, dropout_rate=0.3,
                 pretrained_model_name="bert-base-uncased"):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(dropout_rate)
        # The head's input width follows the encoder config, so a different
        # checkpoint size does not require touching this class.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores (logits) for a batch.

        Args:
            input_ids: Token-id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.

        Returns:
            The output of the linear head applied to the (dropped-out)
            pooled encoder output.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder output is the pooler output (derived from
        # the [CLS] position); positional indexing works for both tuple and
        # ModelOutput return styles.
        pooled_output = outputs[1]
        return self.classifier(self.dropout(pooled_output))
|
26 |
+
|
27 |
+
# Training the model
def _mean_or_nan(total, count):
    """Return ``total / count``, or NaN when ``count`` is zero."""
    return total / count if count else float("nan")


def train_model(model, train_dataloader, val_dataloader, device, epochs=3, lr=2e-5):
    """Fine-tune ``model`` and evaluate it on ``val_dataloader`` after each epoch.

    Each batch from either dataloader must unpack into
    ``(input_ids, attention_mask, labels)``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns per-class logits.
        train_dataloader: Iterable of training batches; must support ``len``.
        val_dataloader: Iterable of validation batches; must support ``len``
            and expose a sized ``.dataset``.
        device: Device the batches are moved to before the forward pass.
        epochs: Number of passes over ``train_dataloader``.
        lr: Learning rate for the optimizer.

    Returns:
        A list with one dict per epoch containing ``train_loss``,
        ``val_loss`` and ``val_accuracy``. An average over an empty
        dataloader is reported as NaN instead of raising ZeroDivisionError.
    """
    # transformers' own AdamW is deprecated in favour of the PyTorch one;
    # eps and weight_decay are set explicitly to keep the hyperparameters
    # that the transformers implementation used by default.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0
    )
    loss_fn = nn.CrossEntropyLoss()
    history = []

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        total_train_loss = 0.0
        for b_input_ids, b_input_mask, b_labels in train_dataloader:
            b_input_ids = b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)
            # CrossEntropyLoss needs integer (long) class indices.
            b_labels = b_labels.to(device).long()

            optimizer.zero_grad()
            logits = model(b_input_ids, b_input_mask)
            loss = loss_fn(logits, b_labels)
            total_train_loss += loss.item()
            loss.backward()
            optimizer.step()

        avg_train_loss = _mean_or_nan(total_train_loss, len(train_dataloader))
        print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}")

        # ---- validation pass (no gradients, dropout disabled) ----
        model.eval()
        correct_predictions = 0
        total_val_loss = 0.0
        with torch.no_grad():
            for b_input_ids, b_input_mask, b_labels in val_dataloader:
                b_input_ids = b_input_ids.to(device)
                b_input_mask = b_input_mask.to(device)
                # Same cast as in training so both passes accept the same data.
                b_labels = b_labels.to(device).long()

                logits = model(b_input_ids, b_input_mask)
                total_val_loss += loss_fn(logits, b_labels).item()

                preds = torch.argmax(logits, dim=1)
                correct_predictions += (preds == b_labels).sum().item()

        # Accuracy is per sample; loss is averaged per batch.
        avg_val_accuracy = _mean_or_nan(
            correct_predictions, len(val_dataloader.dataset)
        )
        avg_val_loss = _mean_or_nan(total_val_loss, len(val_dataloader))
        print(f"Validation Loss: {avg_val_loss}, Validation Accuracy: {avg_val_accuracy}")

        history.append(
            {
                "train_loss": avg_train_loss,
                "val_loss": avg_val_loss,
                "val_accuracy": avg_val_accuracy,
            }
        )

    return history
|
75 |
+
|
76 |
+
# Testing the model
|
77 |
+
def test_model(model, test_dataloader, device):
    """Evaluate ``model`` on ``test_dataloader`` and report its accuracy.

    Each batch must unpack into ``(input_ids, attention_mask, labels)``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns per-class logits.
        test_dataloader: Iterable of evaluation batches.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        The accuracy computed by ``accuracy_score`` over all batches (it is
        also printed, as before).

    Raises:
        ValueError: If ``test_dataloader`` yields no batches, since there is
            nothing to score.
    """
    model.eval()
    batch_preds = []
    batch_labels = []
    with torch.no_grad():
        for b_input_ids, b_input_mask, b_labels in test_dataloader:
            logits = model(b_input_ids.to(device), b_input_mask.to(device))
            # Predicted class = index of the largest logit per row.
            batch_preds.append(torch.argmax(logits, dim=1).cpu().numpy())
            batch_labels.append(b_labels.cpu().numpy())

    if not batch_preds:
        # np.concatenate would fail on an empty list with a less helpful message.
        raise ValueError("test_dataloader yielded no batches; cannot compute accuracy.")

    all_preds = np.concatenate(batch_preds)
    all_labels = np.concatenate(batch_labels)
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Test Accuracy: {accuracy}")
    return accuracy
|
98 |
+
|
99 |
+
# Main function to train, validate, and test the model
|
100 |
+
def main(data_path, epochs=3, batch_size=16):
    """Load the dataset, fine-tune a ``BertClassifier`` and report its accuracy.

    Args:
        data_path: Path of the CSV file passed to ``read_data``.
        epochs: Number of training epochs.
        batch_size: Batch size used for every dataloader.

    Returns:
        None. Returns early, without training, when ``read_data`` yields
        ``None``.
    """
    # Read and preprocess data
    data = read_data(data_path)
    if data is None:
        return

    input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)
    (
        X_train_ids,
        X_test_ids,
        X_train_masks,
        X_test_masks,
        y_train,
        y_test,
    ) = split_data(input_ids, attention_masks, labels)

    # One output unit per distinct category seen by the label encoder.
    num_labels = len(labelencoder.classes_)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertClassifier(num_labels)
    model.to(device)

    # Reshuffle the training samples every epoch so batches differ between epochs.
    train_dataset = TensorDataset(X_train_ids, X_train_masks, y_train)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # NOTE: there is no separate validation split. The same held-out split is
    # used for per-epoch validation and for the final test, so the final
    # figure is not independent of what was monitored during training.
    heldout_dataset = TensorDataset(X_test_ids, X_test_masks, y_test)
    heldout_dataloader = DataLoader(heldout_dataset, batch_size=batch_size)

    # Train the model
    train_model(model, train_dataloader, heldout_dataloader, device, epochs=epochs)

    # Test the model (a DataLoader can be iterated again, so no second one is needed)
    test_model(model, heldout_dataloader, device)
|
131 |
+
|
132 |
+
if __name__ == "__main__":
    import sys

    # The dataset location can be given as the first command-line argument;
    # without one, fall back to the original hard-coded location.
    default_data_path = r"E:\transactify\transactify\Dataset\transaction_data.csv"
    data_path = sys.argv[1] if len(sys.argv) > 1 else default_data_path
    main(data_path)
|
datapreprocessing.py → data_preprocessing.py
RENAMED
@@ -1,57 +1,74 @@
|
|
1 |
-
# Import Required
|
2 |
import numpy as np
|
3 |
import pandas as pd
|
4 |
|
5 |
-
import tensorflow
|
6 |
-
import keras
|
7 |
import torch
|
8 |
-
|
9 |
-
import re
|
10 |
-
|
11 |
from transformers import BertTokenizer
|
12 |
from sklearn.preprocessing import LabelEncoder
|
|
|
|
|
13 |
|
14 |
-
# Read the data
|
15 |
def read_data(path):
|
16 |
try:
|
17 |
-
df=pd.read_csv(path)
|
|
|
|
|
|
|
18 |
return df
|
19 |
except FileNotFoundError:
|
20 |
-
print("File not
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
if data is not None:
|
|
|
24 |
print(data.head(15))
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
27 |
def clean_text(text):
|
28 |
-
text=text.lower()
|
29 |
-
text=re.sub(r"\d+"," ",text)
|
30 |
-
text=re.sub(r"[^\w\s]"," ",text)
|
31 |
-
text=text.strip()
|
32 |
return text
|
33 |
|
34 |
-
|
35 |
-
|
|
|
36 |
|
37 |
-
input_ids=[]
|
38 |
-
attention_masks=[]
|
|
|
|
|
|
|
|
|
39 |
|
40 |
for description in df["Transaction Description"]:
|
41 |
cleaned_text = clean_text(description)
|
42 |
|
43 |
# Debugging print statements
|
44 |
-
print(f"Original Description: {description}")
|
45 |
-
print(f"Cleaned Text: {cleaned_text}")
|
46 |
|
47 |
# Only tokenize if the cleaned text is not empty
|
48 |
if cleaned_text:
|
49 |
encoded_dict = tokenizer.encode_plus(
|
50 |
cleaned_text,
|
51 |
-
add_special_tokens=True, #
|
52 |
max_length=max_length,
|
53 |
pad_to_max_length=True,
|
54 |
-
return_attention_mask=True,
|
55 |
return_tensors="pt",
|
56 |
truncation=True
|
57 |
)
|
@@ -68,13 +85,33 @@ def preprocessing_data(df,max_length=20):
|
|
68 |
if not input_ids:
|
69 |
raise ValueError("No input_ids were collected. Check the cleaning process.")
|
70 |
|
|
|
71 |
input_ids = torch.cat(input_ids, dim=0)
|
72 |
attention_masks = torch.cat(attention_masks, dim=0)
|
73 |
|
|
|
74 |
labelencoder = LabelEncoder()
|
75 |
labels = labelencoder.fit_transform(df["Category"])
|
76 |
-
labels = torch.tensor(labels)
|
77 |
|
78 |
return input_ids, attention_masks, labels, labelencoder
|
79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Import Required Libraries:
|
2 |
import numpy as np
|
3 |
import pandas as pd
|
4 |
|
|
|
|
|
5 |
import torch
|
|
|
|
|
|
|
6 |
from transformers import BertTokenizer
|
7 |
from sklearn.preprocessing import LabelEncoder
|
8 |
+
from sklearn.model_selection import train_test_split
|
9 |
+
import re
|
10 |
|
11 |
+
# Read the data
|
12 |
def read_data(path):
    """Load a CSV file into a DataFrame.

    Args:
        path: Location of the CSV file, passed to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when the file is missing, cannot
        be read, or contains no rows. A message describing the failure is
        printed in each of those cases.
    """
    try:
        frame = pd.read_csv(path)
    except FileNotFoundError:
        print(f"File not found at: {path}")
        return None
    except Exception as e:
        # Any other read/parse failure is reported and signalled with None.
        print(f"An error occurred: {e}")
        return None

    if frame.empty:
        print("The file is empty.")
        return None
    return frame
|
25 |
+
|
26 |
+
# Path to your data file
data_path = r"E:\transactify\transactify\Dataset\transaction_data.csv"

# Read the data and stop if it could not be loaded.
data = read_data(data_path)
if data is None:
    print("Data loading failed. Exiting...")
    # `exit()` is an interactive helper injected by the `site` module and
    # exits with status 0; raise SystemExit directly with a non-zero status
    # so the failure is visible to whatever launched the script.
    raise SystemExit(1)

print("Data loaded successfully:")
print(data.head(15))
|
37 |
+
|
38 |
+
# Cleaning the text
|
39 |
def clean_text(text):
    """Normalise a transaction description for tokenisation.

    The text is lower-cased, every run of digits and every punctuation
    character is replaced by a single space, and leading/trailing
    whitespace is trimmed. Whitespace inside the text is not collapsed.

    Args:
        text: The raw description. ``None`` and NaN (what pandas yields for
            a missing cell) are treated as an empty description; any other
            non-string value is converted with ``str``.

    Returns:
        The cleaned string, possibly empty.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if isinstance(text, float) and text != text:
            return ""
        text = str(text)

    text = text.lower()  # Converting uppercase to lowercase
    text = re.sub(r"\d+", " ", text)  # Replace each run of digits with a space
    text = re.sub(r"[^\w\s]", " ", text)  # Replace punctuation with a space
    return text.strip()  # Trim leading/trailing whitespace only
|
45 |
|
46 |
+
# Preprocessing the data
|
47 |
+
def preprocessing_data(df, max_length=20):
|
48 |
+
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
|
49 |
|
50 |
+
input_ids = []
|
51 |
+
attention_masks = []
|
52 |
+
|
53 |
+
# Ensure the dataframe has the required columns
|
54 |
+
if "Transaction Description" not in df.columns or "Category" not in df.columns:
|
55 |
+
raise ValueError("The required columns 'Transaction Description' and 'Category' are missing from the dataset.")
|
56 |
|
57 |
for description in df["Transaction Description"]:
|
58 |
cleaned_text = clean_text(description)
|
59 |
|
60 |
# Debugging print statements
|
61 |
+
# print(f"Original Description: {description}")
|
62 |
+
# print(f"Cleaned Text: {cleaned_text}")
|
63 |
|
64 |
# Only tokenize if the cleaned text is not empty
|
65 |
if cleaned_text:
|
66 |
encoded_dict = tokenizer.encode_plus(
|
67 |
cleaned_text,
|
68 |
+
add_special_tokens=True, # Add special tokens for BERT
|
69 |
max_length=max_length,
|
70 |
pad_to_max_length=True,
|
71 |
+
return_attention_mask=True,
|
72 |
return_tensors="pt",
|
73 |
truncation=True
|
74 |
)
|
|
|
85 |
if not input_ids:
|
86 |
raise ValueError("No input_ids were collected. Check the cleaning process.")
|
87 |
|
88 |
+
# Concatenating the list of tensors to form a single tensor
|
89 |
input_ids = torch.cat(input_ids, dim=0)
|
90 |
attention_masks = torch.cat(attention_masks, dim=0)
|
91 |
|
92 |
+
# Encoding the labels
|
93 |
labelencoder = LabelEncoder()
|
94 |
labels = labelencoder.fit_transform(df["Category"])
|
95 |
+
labels = torch.tensor(labels, dtype=torch.long) # Convert labels to LongTensor
|
96 |
|
97 |
return input_ids, attention_masks, labels, labelencoder
|
98 |
|
99 |
+
# Split the data into train and test sets
|
100 |
+
def split_data(input_ids, attention_masks, labels, test_size=0.2, random_state=42):
    """Split ids, attention masks and labels into aligned train/test parts.

    All three collections are split in a single ``train_test_split`` call so
    that the same rows are selected from each of them. Splitting them in
    separate calls only stays aligned when ``random_state`` is a fixed
    integer; with ``random_state=None`` the masks would be shuffled
    differently from the ids and labels.

    Args:
        input_ids: Token ids, one row per sample.
        attention_masks: Attention masks, one row per sample.
        labels: Encoded labels, one entry per sample.
        test_size: Passed through to ``train_test_split``.
        random_state: Passed through to ``train_test_split``.

    Returns:
        ``(X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train,
        y_test)``.
    """
    (
        X_train_ids,
        X_test_ids,
        X_train_masks,
        X_test_masks,
        y_train,
        y_test,
    ) = train_test_split(
        input_ids,
        attention_masks,
        labels,
        test_size=test_size,
        random_state=random_state,
    )

    return X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train, y_test
|
110 |
+
|
111 |
+
# Tokenise the loaded data and encode its labels, then carve out the
# train and test partitions.
input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)
(
    X_train_ids,
    X_test_ids,
    X_train_masks,
    X_test_masks,
    y_train,
    y_test,
) = split_data(input_ids, attention_masks, labels)

# Report how many samples ended up in each partition.
print(f"Training set size: {X_train_ids.shape[0]}")
print(f"Test set size: {X_test_ids.shape[0]}")
|
requirenments.txt → requirements.txt
RENAMED
File without changes
|
setup.md
CHANGED
@@ -56,4 +56,4 @@ type >> cd transactify_venv
|
|
56 |
|
57 |
to install required libaries...
|
58 |
go to cmd..
|
59 |
-
type >>pip install -r
|
|
|
56 |
|
57 |
to install required libraries...
|
58 |
go to cmd..
|
59 |
+
type >>pip install -r requirements.txt
|