# Import required libraries.
import re

import numpy as np
import pandas as pd
import torch

from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder

# Read the data.
def read_data(path):
    try:
        df = pd.read_csv(path)
        return df
    except FileNotFoundError:
        print(f"File not found: {path}")
        return None

data = read_data(r"E:\transactify\Dataset\transaction_data.csv")
if data is not None:
    print(data.head(15))
    
# Clean the text.
def clean_text(text):
    text = text.lower()                        # convert to lowercase
    text = re.sub(r"\d+", " ", text)           # remove digits
    text = re.sub(r"[^\w\s]", " ", text)       # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()   # collapse repeated whitespace and trim
    return text
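
# A quick sanity check for clean_text (the example string is illustrative, not from the dataset):
print(clean_text("Paid $42.50 at Joe's Cafe on 03/14!"))  # -> "paid at joe s cafe on"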

def preprocessing_data(df, max_length=20):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    input_ids = []
    attention_masks = []
    kept_categories = []  # track labels only for rows that are actually tokenized

    for description, category in zip(df["Transaction Description"], df["Category"]):
        cleaned_text = clean_text(description)

        # Debugging print statements
        print(f"Original Description: {description}")
        print(f"Cleaned Text: {cleaned_text}")

        # Only tokenize if the cleaned text is not empty.
        if cleaned_text:
            encoded_dict = tokenizer.encode_plus(
                cleaned_text,
                add_special_tokens=True,   # add [CLS] and [SEP]
                max_length=max_length,
                padding="max_length",      # pad_to_max_length is deprecated
                return_attention_mask=True,
                return_tensors="pt",
                truncation=True,
            )

            input_ids.append(encoded_dict["input_ids"])
            attention_masks.append(encoded_dict["attention_mask"])
            kept_categories.append(category)
        else:
            print("Cleaned text is empty, skipping...")

    # Debugging output to check sizes
    print(f"Total input_ids collected: {len(input_ids)}")
    print(f"Total attention_masks collected: {len(attention_masks)}")

    if not input_ids:
        raise ValueError("No input_ids were collected. Check the cleaning process.")

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # Encode labels for the kept rows only, so they stay aligned with input_ids.
    labelencoder = LabelEncoder()
    labels = labelencoder.fit_transform(kept_categories)
    labels = torch.tensor(labels)

    return input_ids, attention_masks, labels, labelencoder

input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)
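
# A minimal sketch of what could come next (not part of the original script): wrap the
# returned tensors in a TensorDataset/DataLoader for batching. batch_size=32 is an
# illustrative choice, not taken from this file.
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(input_ids, attention_masks, labels)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

for batch_input_ids, batch_masks, batch_labels in loader:
    print(batch_input_ids.shape, batch_masks.shape, batch_labels.shape)
    # Category names can be recovered from encoded labels via the fitted encoder.
    print(labelencoder.inverse_transform(batch_labels.numpy()[:5]))
    break  # inspect a single batch to confirm shapes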