Model creation
#8
by Ananthakrishnan12 - opened
- __pycache__/datapreprocessing.cpython-312.pyc +0 -0
- bert_model.py +134 -0
- datapreprocessing.py → data_preprocessing.py +63 -26
- requirenments.txt → requirements.txt +0 -0
- setup.md +1 -1
__pycache__/datapreprocessing.cpython-312.pyc
ADDED
Binary file (4.3 kB)
bert_model.py
ADDED
@@ -0,0 +1,134 @@
+# Import Required Libraries
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader, TensorDataset
+from transformers import BertModel, AdamW
+from sklearn.metrics import accuracy_score
+import numpy as np
+
+# Import functions from the preprocessing module
+from transactify.data_preprocessing import preprocessing_data, split_data, read_data
+
+# Define a BERT-based classification model
+class BertClassifier(nn.Module):
+    def __init__(self, num_labels, dropout_rate=0.3):
+        super(BertClassifier, self).__init__()
+        self.bert = BertModel.from_pretrained("bert-base-uncased")
+        self.dropout = nn.Dropout(dropout_rate)
+        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
+
+    def forward(self, input_ids, attention_mask):
+        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+        pooled_output = outputs[1]  # Pooler output (CLS token)
+        output = self.dropout(pooled_output)
+        logits = self.classifier(output)
+        return logits
+
+# Training the model
+def train_model(model, train_dataloader, val_dataloader, device, epochs=3, lr=2e-5):
+    optimizer = AdamW(model.parameters(), lr=lr)
+    loss_fn = nn.CrossEntropyLoss()
+
+    for epoch in range(epochs):
+        model.train()
+        total_train_loss = 0
+        for step, batch in enumerate(train_dataloader):
+            b_input_ids, b_input_mask, b_labels = batch
+
+            b_input_ids = b_input_ids.to(device)
+            b_input_mask = b_input_mask.to(device)
+            b_labels = b_labels.to(device).long()  # Ensure labels are LongTensor
+
+            model.zero_grad()
+            outputs = model(b_input_ids, b_input_mask)
+
+            loss = loss_fn(outputs, b_labels)
+            total_train_loss += loss.item()
+            loss.backward()
+            optimizer.step()
+
+        avg_train_loss = total_train_loss / len(train_dataloader)
+        print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}")
+
+        model.eval()
+        total_val_accuracy = 0
+        total_val_loss = 0
+
+        with torch.no_grad():
+            for batch in val_dataloader:
+                b_input_ids, b_input_mask, b_labels = batch
+                b_input_ids = b_input_ids.to(device)
+                b_input_mask = b_input_mask.to(device)
+                b_labels = b_labels.to(device)
+
+                outputs = model(b_input_ids, b_input_mask)
+                loss = loss_fn(outputs, b_labels)
+                total_val_loss += loss.item()
+
+                preds = torch.argmax(outputs, dim=1)
+                total_val_accuracy += (preds == b_labels).sum().item()
+
+        avg_val_accuracy = total_val_accuracy / len(val_dataloader.dataset)
+        avg_val_loss = total_val_loss / len(val_dataloader)
+        print(f"Validation Loss: {avg_val_loss}, Validation Accuracy: {avg_val_accuracy}")
+
+# Testing the model
+def test_model(model, test_dataloader, device):
+    model.eval()
+    all_preds = []
+    all_labels = []
+    with torch.no_grad():
+        for batch in test_dataloader:
+            b_input_ids, b_input_mask, b_labels = batch
+            b_input_ids = b_input_ids.to(device)
+            b_input_mask = b_input_mask.to(device)
+            b_labels = b_labels.to(device)
+
+            outputs = model(b_input_ids, b_input_mask)
+            preds = torch.argmax(outputs, dim=1)
+
+            all_preds.append(preds.cpu().numpy())
+            all_labels.append(b_labels.cpu().numpy())
+
+    all_preds = np.concatenate(all_preds)
+    all_labels = np.concatenate(all_labels)
+    accuracy = accuracy_score(all_labels, all_preds)
+    print(f"Test Accuracy: {accuracy}")
+
+# Main function to train, validate, and test the model
+def main(data_path, epochs=3, batch_size=16):
+    # Read and preprocess data
+    data = read_data(data_path)
+    if data is None:
+        return
+
+    input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)
+    X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train, y_test = split_data(input_ids, attention_masks, labels)
+
+    # Determine the number of labels
+    num_labels = len(labelencoder.classes_)
+
+    # Create the model
+    model = BertClassifier(num_labels)
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+
+    # Create dataloaders
+    train_dataset = TensorDataset(X_train_ids, X_train_masks, y_train)
+    train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
+
+    val_dataset = TensorDataset(X_test_ids, X_test_masks, y_test)
+    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
+
+    # Train the model
+    train_model(model, train_dataloader, val_dataloader, device, epochs=epochs)
+
+    # Test the model
+    test_dataloader = DataLoader(val_dataset, batch_size=batch_size)
+    test_model(model, test_dataloader, device)
+
+if __name__ == "__main__":
+    data_path = r"E:\transactify\transactify\Dataset\transaction_data.csv"
+    main(data_path)
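
Not part of the PR: a minimal inference sketch showing how the trained BertClassifier above could be used to categorise a single transaction description. The helper name predict_category is hypothetical; the tokenizer settings mirror preprocessing_data (bert-base-uncased, max_length=20), and model, labelencoder and device are assumed to be the objects produced in main().

import torch
from transformers import BertTokenizer

def predict_category(model, labelencoder, text, device, max_length=20):
    # In practice, apply the same clean_text() from data_preprocessing before tokenizing.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",  # newer spelling of pad_to_max_length=True
        return_attention_mask=True,
        return_tensors="pt",
        truncation=True,
    )
    model.eval()
    with torch.no_grad():
        logits = model(encoded["input_ids"].to(device), encoded["attention_mask"].to(device))
    pred_id = torch.argmax(logits, dim=1).item()
    # Map the predicted class index back to the original category label
    return labelencoder.inverse_transform([pred_id])[0]

# Example call: print(predict_category(model, labelencoder, "payment to grocery store", device))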
datapreprocessing.py → data_preprocessing.py
RENAMED
@@ -1,57 +1,74 @@
-# Import Required
+# Import Required Libraries:
 import numpy as np
 import pandas as pd
 
-import tensorflow
-import keras
 import torch
-
-import re
-
 from transformers import BertTokenizer
 from sklearn.preprocessing import LabelEncoder
+from sklearn.model_selection import train_test_split
+import re
 
-# Read the data
+# Read the data
 def read_data(path):
     try:
-        df=pd.read_csv(path)
+        df = pd.read_csv(path)
+        if df.empty:
+            print("The file is empty.")
+            return None
         return df
     except FileNotFoundError:
-        print("File not
-
-
+        print(f"File not found at: {path}")
+        return None
+    except Exception as e:
+        print(f"An error occurred: {e}")
+        return None
+
+# Path to your data file
+data_path = r"E:\transactify\transactify\Dataset\transaction_data.csv"
+
+# Read the data and check if it was loaded successfully
+data = read_data(data_path)
 if data is not None:
+    print("Data loaded successfully:")
     print(data.head(15))
-
-
+else:
+    print("Data loading failed. Exiting...")
+    exit()
+
+# Cleaning the text
 def clean_text(text):
-    text=text.lower()
-    text=re.sub(r"\d+"," ",text)
-    text=re.sub(r"[^\w\s]"," ",text)
-    text=text.strip()
+    text = text.lower()  # Converting uppercase to lowercase
+    text = re.sub(r"\d+", " ", text)  # Removing digits in the text
+    text = re.sub(r"[^\w\s]", " ", text)  # Removing punctuations
+    text = text.strip()  # Remove extra spaces
     return text
 
-
-
+# Preprocessing the data
+def preprocessing_data(df, max_length=20):
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
 
-    input_ids=[]
-    attention_masks=[]
+    input_ids = []
+    attention_masks = []
+
+    # Ensure the dataframe has the required columns
+    if "Transaction Description" not in df.columns or "Category" not in df.columns:
+        raise ValueError("The required columns 'Transaction Description' and 'Category' are missing from the dataset.")
 
     for description in df["Transaction Description"]:
         cleaned_text = clean_text(description)
 
         # Debugging print statements
-        print(f"Original Description: {description}")
-        print(f"Cleaned Text: {cleaned_text}")
+        # print(f"Original Description: {description}")
+        # print(f"Cleaned Text: {cleaned_text}")
 
         # Only tokenize if the cleaned text is not empty
         if cleaned_text:
             encoded_dict = tokenizer.encode_plus(
                 cleaned_text,
-                add_special_tokens=True, #
+                add_special_tokens=True,  # Add special tokens for BERT
                 max_length=max_length,
                 pad_to_max_length=True,
-                return_attention_mask=True,
+                return_attention_mask=True,
                 return_tensors="pt",
                 truncation=True
             )
@@ -68,13 +85,33 @@ def preprocessing_data(df,max_length=20):
     if not input_ids:
         raise ValueError("No input_ids were collected. Check the cleaning process.")
 
+    # Concatenating the list of tensors to form a single tensor
     input_ids = torch.cat(input_ids, dim=0)
     attention_masks = torch.cat(attention_masks, dim=0)
 
+    # Encoding the labels
     labelencoder = LabelEncoder()
     labels = labelencoder.fit_transform(df["Category"])
-    labels = torch.tensor(labels)
+    labels = torch.tensor(labels, dtype=torch.long)  # Convert labels to LongTensor
 
     return input_ids, attention_masks, labels, labelencoder
 
+# Split the data into train and test sets
+def split_data(input_ids, attention_masks, labels, test_size=0.2, random_state=42):
+    X_train_ids, X_test_ids, y_train, y_test = train_test_split(
+        input_ids, labels, test_size=test_size, random_state=random_state
+    )
+
+    X_train_masks, X_test_masks = train_test_split(
+        attention_masks, test_size=test_size, random_state=random_state
+    )
+
+    return X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train, y_test
+
+# Preprocess the data and split into train and test sets
 input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)
+X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train, y_test = split_data(input_ids, attention_masks, labels)
+
+# Output the sizes of the splits for confirmation
+print(f"Training set size: {X_train_ids.shape[0]}")
+print(f"Test set size: {X_test_ids.shape[0]}")
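
A note on split_data, illustrated by the sketch below (not part of the PR): the attention masks are split in a second train_test_split call, separate from the ids and labels, which stays row-aligned only because both calls use the same random_state and default shuffling on tensors of equal length. Passing all three tensors to one call expresses the same split with that coupling made explicit; input_ids, attention_masks and labels are assumed to be the tensors returned by preprocessing_data above.

from sklearn.model_selection import train_test_split

# Equivalent single call: ids, masks and labels share one shuffle and one split.
X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train, y_test = train_test_split(
    input_ids, attention_masks, labels, test_size=0.2, random_state=42
)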
requirenments.txt → requirements.txt
RENAMED
File without changes
setup.md
CHANGED
@@ -56,4 +56,4 @@ type >> cd transactify_venv
 
 to install required libaries...
 go to cmd..
-type >>pip install -r
+type >>pip install -r requirements.txt