"""Data preprocessing for transaction-description classification.

Reads a CSV of transaction descriptions, cleans the text, tokenizes and
pads it into fixed-length integer sequences, and label-encodes the
target categories for model training.
"""

import re

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


def read_data(path):
    """Read a CSV file into a DataFrame.

    Returns the DataFrame, or ``None`` when the file is missing, empty,
    or unreadable. This is a best-effort script helper: errors are
    reported on stdout instead of being raised to the caller.
    """
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        print(f"File not found at: {path}")
        return None
    except Exception as e:  # broad by design: any parse/IO failure means "no data"
        print(f"An error occurred: {e}")
        return None
    # Success path kept outside the try block so it can't mask new errors.
    if df.empty:
        print("The file is empty.")
        return None
    return df


def clean_text(text):
    """Normalize one transaction description for tokenization.

    Lowercases the text and replaces digits and punctuation with spaces,
    then collapses runs of whitespace. Non-string input (e.g. NaN cells
    produced by ``pd.read_csv``) is treated as an empty description.
    """
    if not isinstance(text, str):  # NaN/None cells would otherwise raise AttributeError
        return ""
    text = text.lower()                   # case-fold
    text = re.sub(r"\d+", " ", text)      # remove digits
    text = re.sub(r"[^\w\s]", " ", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text)      # collapse internal whitespace runs
    return text.strip()


def preprocess_data(file_path, max_len=10, vocab_size=250, oov_token=""):
    """Load, clean, tokenize, pad, and label-encode the dataset.

    Args:
        file_path: Path to a CSV with 'Transaction Description' and
            'Category' columns.
        max_len: Length every sequence is padded/truncated to (post).
        vocab_size: Maximum vocabulary size kept by the tokenizer.
        oov_token: Token substituted for out-of-vocabulary words.
            Default preserves the original behavior; "<OOV>" is the
            more conventional choice.

    Returns:
        Tuple ``(padded_sequences, labels, tokenizer, label_encoder)``,
        or ``(None, None, None, None)`` when loading fails.
    """
    df = read_data(file_path)
    if df is None:
        print("Data loading failed.")
        return None, None, None, None

    # Normalize the raw descriptions before fitting the tokenizer.
    df['Transaction Description'] = df['Transaction Description'].apply(clean_text)

    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
    tokenizer.fit_on_texts(df['Transaction Description'])

    # Convert texts to integer sequences of uniform length.
    sequences = tokenizer.texts_to_sequences(df['Transaction Description'])
    padded_sequences = pad_sequences(
        sequences, maxlen=max_len, padding='post', truncating='post'
    )

    # Encode the string categories as integer class ids.
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(df['Category'])

    return padded_sequences, labels, tokenizer, label_encoder


def split_data(sequences, labels, test_size=0.2, random_state=42):
    """Split features and labels into reproducible train/test sets.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    return train_test_split(
        sequences, labels, test_size=test_size, random_state=random_state
    )


def main():
    """Run the preprocessing pipeline end to end and report array shapes."""
    data_path = r"E:\transactify\transactify\Dataset\transaction_data.csv"

    sequences, labels, tokenizer, label_encoder = preprocess_data(data_path)

    if sequences is not None:
        print("Data preprocessing successful!")
        X_train, X_test, y_train, y_test = split_data(sequences, labels)
        print(f"Training data shape: {X_train.shape}, Training labels shape: {y_train.shape}")
        print(f"Testing data shape: {X_test.shape}, Testing labels shape: {y_test.shape}")
    else:
        print("Data preprocessing failed.")


if __name__ == "__main__":
    main()