"""Data preprocessing for transaction-description classification.

Reads a CSV of transaction descriptions, cleans the text, tokenizes and
pads it into fixed-length integer sequences, and label-encodes the
target categories for model training.
"""

import re

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


def read_data(path):
    """Read a CSV file into a DataFrame.

    Returns the DataFrame, or ``None`` when the file is missing, empty,
    or unreadable. This is a best-effort script helper: errors are
    reported on stdout instead of being raised to the caller.
    """
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        print(f"File not found at: {path}")
        return None
    except Exception as e:  # broad by design: any parse/IO failure means "no data"
        print(f"An error occurred: {e}")
        return None
    # Success path kept outside the try block so it can't mask new errors.
    if df.empty:
        print("The file is empty.")
        return None
    return df


def clean_text(text):
    """Normalize one transaction description for tokenization.

    Lowercases the text and replaces digits and punctuation with spaces,
    then collapses runs of whitespace. Non-string input (e.g. NaN cells
    produced by ``pd.read_csv``) is treated as an empty description.
    """
    if not isinstance(text, str):  # NaN/None cells would otherwise raise AttributeError
        return ""
    text = text.lower()                   # case-fold
    text = re.sub(r"\d+", " ", text)      # remove digits
    text = re.sub(r"[^\w\s]", " ", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text)      # collapse internal whitespace runs
    return text.strip()


def preprocess_data(file_path, max_len=10, vocab_size=250, oov_token=""):
    """Load, clean, tokenize, pad, and label-encode the dataset.

    Args:
        file_path: Path to a CSV with 'Transaction Description' and
            'Category' columns.
        max_len: Length every sequence is padded/truncated to (post).
        vocab_size: Maximum vocabulary size kept by the tokenizer.
        oov_token: Token substituted for out-of-vocabulary words.
            Default preserves the original behavior; "<OOV>" is the
            more conventional choice.

    Returns:
        Tuple ``(padded_sequences, labels, tokenizer, label_encoder)``,
        or ``(None, None, None, None)`` when loading fails.
    """
    df = read_data(file_path)
    if df is None:
        print("Data loading failed.")
        return None, None, None, None

    # Normalize the raw descriptions before fitting the tokenizer.
    df['Transaction Description'] = df['Transaction Description'].apply(clean_text)

    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
    tokenizer.fit_on_texts(df['Transaction Description'])

    # Convert texts to integer sequences of uniform length.
    sequences = tokenizer.texts_to_sequences(df['Transaction Description'])
    padded_sequences = pad_sequences(
        sequences, maxlen=max_len, padding='post', truncating='post'
    )

    # Encode the string categories as integer class ids.
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(df['Category'])

    return padded_sequences, labels, tokenizer, label_encoder


def split_data(sequences, labels, test_size=0.2, random_state=42):
    """Split features and labels into reproducible train/test sets.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    return train_test_split(
        sequences, labels, test_size=test_size, random_state=random_state
    )


def main():
    """Run the preprocessing pipeline end to end and report array shapes."""
    data_path = r"E:\transactify\transactify\Dataset\transaction_data.csv"

    sequences, labels, tokenizer, label_encoder = preprocess_data(data_path)

    if sequences is not None:
        print("Data preprocessing successful!")
        X_train, X_test, y_train, y_test = split_data(sequences, labels)
        print(f"Training data shape: {X_train.shape}, Training labels shape: {y_train.shape}")
        print(f"Testing data shape: {X_test.shape}, Testing labels shape: {y_test.shape}")
    else:
        print("Data preprocessing failed.")


if __name__ == "__main__":
    main()