# Import required libraries.
import re

import numpy as np
import pandas as pd
import tensorflow
import keras
import torch
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder


# Read the data.
def read_data(path):
    """Load the CSV file at *path* into a pandas DataFrame.

    Args:
        path: Location of the CSV file.

    Returns:
        The loaded DataFrame, or ``None`` (after printing a message) when
        the file does not exist.
    """
    try:
        df = pd.read_csv(path)
        return df
    except FileNotFoundError:
        print("File not exsists")
        return None


data = read_data(r"E:\transactify\Dataset\transaction_data.csv")
if data is not None:
    print(data.head(15))


# Cleaning the text.
def clean_text(text):
    """Normalize a transaction description before tokenization.

    The text is lower-cased, digits and punctuation are replaced by spaces,
    and whitespace runs are collapsed to a single space.

    Args:
        text: Raw description. Non-string values (for example the NaN that
            pandas produces for an empty cell) are treated as empty text.

    Returns:
        The cleaned string; ``""`` when nothing usable remains.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN; calling .lower() on them would
        # raise AttributeError, so report them as empty instead.
        return ""
    text = text.lower()                      # uppercase -> lowercase
    text = re.sub(r"\d+", " ", text)         # drop digits
    text = re.sub(r"[^\w\s]", " ", text)     # drop punctuation
    # The substitutions above leave runs of spaces behind; collapse them and
    # trim the ends so "extra spaces" are really removed.
    return re.sub(r"\s+", " ", text).strip()


def preprocessing_data(df, max_length=20, *, verbose=True):
    """Tokenize descriptions with BERT and label-encode their categories.

    Rows whose cleaned description is empty are skipped, and the labels are
    built from the *same* surviving rows so inputs and labels stay aligned.

    Args:
        df: DataFrame with "Transaction Description" and "Category" columns.
        max_length: Sequence length every description is padded or
            truncated to.
        verbose: When true (the default) print the per-row debug output and
            the collected totals.

    Returns:
        A tuple ``(input_ids, attention_masks, labels, labelencoder)`` where
        the first three are torch tensors with one entry per kept row and
        ``labelencoder`` is the LabelEncoder fitted on every value of
        ``df["Category"]``.

    Raises:
        TypeError: If ``df`` is ``None`` (e.g. the CSV could not be read).
        ValueError: If no row yields a non-empty cleaned description.
    """
    if df is None:
        raise TypeError(
            "preprocessing_data() received None instead of a DataFrame; "
            "check that the dataset file was loaded."
        )

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    kept_texts = []
    kept_categories = []
    rows = zip(df["Transaction Description"], df["Category"])
    for description, category in rows:
        cleaned_text = clean_text(description)

        if verbose:
            print(f"Original Description: {description}")
            print(f"Cleaned Text: {cleaned_text}")

        # Only tokenize rows that still have text after cleaning.
        if not cleaned_text:
            if verbose:
                print("Cleaned text is empty, skipping...")
            continue

        kept_texts.append(cleaned_text)
        # Remember the category of every kept row; encoding the whole column
        # instead would misalign labels with inputs once a row is skipped.
        kept_categories.append(category)

    if verbose:
        print(f"Total input_ids collected: {len(kept_texts)}")
        print(f"Total attention_masks collected: {len(kept_texts)}")

    if not kept_texts:
        raise ValueError(
            "No input_ids were collected. Check the cleaning process."
        )

    # One batched call replaces the per-row encode_plus + torch.cat;
    # padding="max_length" supersedes the deprecated pad_to_max_length=True.
    encoded = tokenizer(
        kept_texts,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"]
    attention_masks = encoded["attention_mask"]

    # Fit on the full column so the class <-> id mapping does not depend on
    # which rows were skipped, then encode only the kept rows.
    labelencoder = LabelEncoder()
    labelencoder.fit(df["Category"])
    labels = torch.tensor(labelencoder.transform(kept_categories))

    return input_ids, attention_masks, labels, labelencoder


input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)