ananthakrishnan committed on
Commit
0cb9929
1 Parent(s): f03f427

tech: model creation

__pycache__/datapreprocessing.cpython-312.pyc ADDED
Binary file (4.3 kB)
 
bert_model.py ADDED
@@ -0,0 +1,134 @@
+ # Import Required Libraries
+ import torch
+ import torch.nn as nn
+ from torch.utils.data import DataLoader, TensorDataset
+ from torch.optim import AdamW  # torch's AdamW; transformers' AdamW is deprecated/removed
+ from transformers import BertModel
+ from sklearn.metrics import accuracy_score
+ import numpy as np
+
+ # Import functions from the preprocessing module
+ from transactify.data_preprocessing import preprocessing_data, split_data, read_data
+
+ # Define a BERT-based classification model
+ class BertClassifier(nn.Module):
+     def __init__(self, num_labels, dropout_rate=0.3):
+         super(BertClassifier, self).__init__()
+         self.bert = BertModel.from_pretrained("bert-base-uncased")
+         self.dropout = nn.Dropout(dropout_rate)
+         self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
+
+     def forward(self, input_ids, attention_mask):
+         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+         pooled_output = outputs[1]  # Pooler output (CLS token)
+         output = self.dropout(pooled_output)
+         logits = self.classifier(output)
+         return logits
+
+ # Training the model
+ def train_model(model, train_dataloader, val_dataloader, device, epochs=3, lr=2e-5):
+     optimizer = AdamW(model.parameters(), lr=lr)
+     loss_fn = nn.CrossEntropyLoss()
+
+     for epoch in range(epochs):
+         model.train()
+         total_train_loss = 0
+         for step, batch in enumerate(train_dataloader):
+             b_input_ids, b_input_mask, b_labels = batch
+
+             b_input_ids = b_input_ids.to(device)
+             b_input_mask = b_input_mask.to(device)
+             b_labels = b_labels.to(device).long()  # Ensure labels are LongTensor
+
+             model.zero_grad()
+             outputs = model(b_input_ids, b_input_mask)
+
+             loss = loss_fn(outputs, b_labels)
+             total_train_loss += loss.item()
+             loss.backward()
+             optimizer.step()
+
+         avg_train_loss = total_train_loss / len(train_dataloader)
+         print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}")
+
+         model.eval()
+         total_val_accuracy = 0
+         total_val_loss = 0
+
+         with torch.no_grad():
+             for batch in val_dataloader:
+                 b_input_ids, b_input_mask, b_labels = batch
+                 b_input_ids = b_input_ids.to(device)
+                 b_input_mask = b_input_mask.to(device)
+                 b_labels = b_labels.to(device)
+
+                 outputs = model(b_input_ids, b_input_mask)
+                 loss = loss_fn(outputs, b_labels)
+                 total_val_loss += loss.item()
+
+                 preds = torch.argmax(outputs, dim=1)
+                 total_val_accuracy += (preds == b_labels).sum().item()
+
+         avg_val_accuracy = total_val_accuracy / len(val_dataloader.dataset)
+         avg_val_loss = total_val_loss / len(val_dataloader)
+         print(f"Validation Loss: {avg_val_loss}, Validation Accuracy: {avg_val_accuracy}")
+
+ # Testing the model
+ def test_model(model, test_dataloader, device):
+     model.eval()
+     all_preds = []
+     all_labels = []
+     with torch.no_grad():
+         for batch in test_dataloader:
+             b_input_ids, b_input_mask, b_labels = batch
+             b_input_ids = b_input_ids.to(device)
+             b_input_mask = b_input_mask.to(device)
+             b_labels = b_labels.to(device)
+
+             outputs = model(b_input_ids, b_input_mask)
+             preds = torch.argmax(outputs, dim=1)
+
+             all_preds.append(preds.cpu().numpy())
+             all_labels.append(b_labels.cpu().numpy())
+
+     all_preds = np.concatenate(all_preds)
+     all_labels = np.concatenate(all_labels)
+     accuracy = accuracy_score(all_labels, all_preds)
+     print(f"Test Accuracy: {accuracy}")
+
+ # Main function to train, validate, and test the model
+ def main(data_path, epochs=3, batch_size=16):
+     # Read and preprocess data
+     data = read_data(data_path)
+     if data is None:
+         return
+
+     input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)
+     X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train, y_test = split_data(input_ids, attention_masks, labels)
+
+     # Determine the number of labels
+     num_labels = len(labelencoder.classes_)
+
+     # Create the model
+     model = BertClassifier(num_labels)
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+
+     # Create dataloaders
+     train_dataset = TensorDataset(X_train_ids, X_train_masks, y_train)
+     train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
+
+     val_dataset = TensorDataset(X_test_ids, X_test_masks, y_test)
+     val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
+
+     # Train the model
+     train_model(model, train_dataloader, val_dataloader, device, epochs=epochs)
+
+     # Test the model (note: reuses the validation split as the test set)
+     test_dataloader = DataLoader(val_dataset, batch_size=batch_size)
+     test_model(model, test_dataloader, device)
+
+ if __name__ == "__main__":
+     data_path = r"E:\transactify\transactify\Dataset\transaction_data.csv"
+     main(data_path)
datapreprocessing.py → data_preprocessing.py RENAMED
@@ -1,57 +1,74 @@
- # Import Required Libaries:
+ # Import Required Libraries:
  import numpy as np
  import pandas as pd
 
- import tensorflow
- import keras
  import torch
-
- import re
-
  from transformers import BertTokenizer
  from sklearn.preprocessing import LabelEncoder
+ from sklearn.model_selection import train_test_split
+ import re
 
- # Read the data.
+ # Read the data
  def read_data(path):
      try:
-         df=pd.read_csv(path)
+         df = pd.read_csv(path)
+         if df.empty:
+             print("The file is empty.")
+             return None
          return df
      except FileNotFoundError:
-         print("File not exsists")
-
- data=read_data(r"E:\transactify\Dataset\transaction_data.csv")
+         print(f"File not found at: {path}")
+         return None
+     except Exception as e:
+         print(f"An error occurred: {e}")
+         return None
+
+ # Path to your data file
+ data_path = r"E:\transactify\transactify\Dataset\transaction_data.csv"
+
+ # Read the data and check if it was loaded successfully
+ data = read_data(data_path)
  if data is not None:
+     print("Data loaded successfully:")
      print(data.head(15))
-
- # cleaning the text...
+ else:
+     print("Data loading failed. Exiting...")
+     exit()
+
+ # Cleaning the text
  def clean_text(text):
-     text=text.lower() # converting uppercase to lowercase
-     text=re.sub(r"\d+"," ",text) # Removing digits in the text
-     text=re.sub(r"[^\w\s]"," ",text) # Removing punctuations
-     text=text.strip() # Remove extra spaces
+     text = text.lower()  # Converting uppercase to lowercase
+     text = re.sub(r"\d+", " ", text)  # Removing digits in the text
+     text = re.sub(r"[^\w\s]", " ", text)  # Removing punctuations
+     text = text.strip()  # Remove extra spaces
      return text
 
- def preprocessing_data(df,max_length=20):
-     tokenizer=BertTokenizer.from_pretrained("bert-base-uncased")
+ # Preprocessing the data
+ def preprocessing_data(df, max_length=20):
+     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
 
-     input_ids=[]
-     attention_masks=[]
+     input_ids = []
+     attention_masks = []
+
+     # Ensure the dataframe has the required columns
+     if "Transaction Description" not in df.columns or "Category" not in df.columns:
+         raise ValueError("The required columns 'Transaction Description' and 'Category' are missing from the dataset.")
 
      for description in df["Transaction Description"]:
          cleaned_text = clean_text(description)
 
          # Debugging print statements
-         print(f"Original Description: {description}")
-         print(f"Cleaned Text: {cleaned_text}")
+         # print(f"Original Description: {description}")
+         # print(f"Cleaned Text: {cleaned_text}")
 
          # Only tokenize if the cleaned text is not empty
          if cleaned_text:
             encoded_dict = tokenizer.encode_plus(
                  cleaned_text,
-                 add_special_tokens=True, # Correct argument
+                 add_special_tokens=True,  # Add special tokens for BERT
                  max_length=max_length,
                  pad_to_max_length=True,
-                 return_attention_mask=True, # Correct argument
+                 return_attention_mask=True,
                  return_tensors="pt",
                  truncation=True
              )
@@ -68,13 +85,33 @@ def preprocessing_data(df,max_length=20):
      if not input_ids:
          raise ValueError("No input_ids were collected. Check the cleaning process.")
 
+     # Concatenating the list of tensors to form a single tensor
      input_ids = torch.cat(input_ids, dim=0)
      attention_masks = torch.cat(attention_masks, dim=0)
 
+     # Encoding the labels
      labelencoder = LabelEncoder()
      labels = labelencoder.fit_transform(df["Category"])
-     labels = torch.tensor(labels)
+     labels = torch.tensor(labels, dtype=torch.long)  # Convert labels to LongTensor
 
      return input_ids, attention_masks, labels, labelencoder
 
+ # Split the data into train and test sets
+ def split_data(input_ids, attention_masks, labels, test_size=0.2, random_state=42):
+     X_train_ids, X_test_ids, y_train, y_test = train_test_split(
+         input_ids, labels, test_size=test_size, random_state=random_state
+     )
+
+     X_train_masks, X_test_masks = train_test_split(
+         attention_masks, test_size=test_size, random_state=random_state
+     )
+
+     return X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train, y_test
+
+ # Preprocess the data and split into train and test sets
  input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)
+ X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train, y_test = split_data(input_ids, attention_masks, labels)
+
+ # Output the sizes of the splits for confirmation
+ print(f"Training set size: {X_train_ids.shape[0]}")
+ print(f"Test set size: {X_test_ids.shape[0]}")
requirenments.txt → requirements.txt RENAMED
File without changes
setup.md CHANGED
@@ -56,4 +56,4 @@ type >> cd transactify_venv
 
  to install required libaries...
  go to cmd..
- type >>pip install -r requirenments.txt
+ type >>pip install -r requirements.txt