transactify / data_preprocessing.py
ananthakrishnan
tech: build LSTM model
02b8f3f
raw
history blame
3.01 kB
# data_preprocessing.py
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Read the data
def read_data(path):
    """Load a CSV file into a DataFrame, reporting problems instead of raising.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when the file is missing, contains
        no data rows, or cannot be parsed. A short message describing the
        failure is printed in each of those cases.
    """
    # Keep the try body to the single call that can raise.
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        print(f"File not found at: {path}")
        return None
    except pd.errors.EmptyDataError:
        # A zero-byte file never reaches the df.empty check below: read_csv
        # raises instead. Report it with the same message as a header-only
        # file so both kinds of "empty" look identical to the user.
        print("The file is empty.")
        return None
    except Exception as e:
        # Deliberate best-effort boundary: any other read/parse failure is
        # reported and signalled to the caller through the None return.
        print(f"An error occurred: {e}")
        return None

    # Header present but no data rows.
    if df.empty:
        print("The file is empty.")
        return None
    return df
# Cleaning the text
def clean_text(text):
    """Normalize a transaction description before tokenization.

    The text is lowercased, every run of digits and every punctuation
    character (underscore included) is replaced by a space, and all
    whitespace runs are then collapsed to a single space.

    Args:
        text: The raw description. Missing values (``None``, NaN, ``pd.NA``)
            produce an empty string; any other non-string value is converted
            with ``str()`` first.

    Returns:
        The cleaned string, with no leading, trailing or repeated whitespace.
    """
    if not isinstance(text, str):
        # pandas reads blank CSV cells as NaN (a float), which has no .lower().
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ""
        text = str(text)

    text = text.lower()
    text = re.sub(r"\d+", " ", text)  # digits carry no category signal
    # \w matches "_" so it has to be listed explicitly to count as punctuation.
    text = re.sub(r"[^\w\s]|_", " ", text)
    # The substitutions above leave runs of spaces inside the string;
    # split/join trims the ends and collapses those runs in one pass.
    return " ".join(text.split())
# Main preprocessing function
def preprocess_data(file_path, max_len=10, vocab_size=250):
    """Load a transaction CSV and convert it into padded sequences and labels.

    The ``Transaction Description`` column is cleaned with ``clean_text``,
    tokenized, and padded/truncated to ``max_len``; the ``Category`` column
    is integer-encoded with a ``LabelEncoder``.

    Args:
        file_path: Path of the CSV file, forwarded to ``read_data``.
        max_len: Length every token sequence is padded or truncated to
            (both applied at the end of the sequence).
        vocab_size: Passed to the tokenizer as ``num_words``.

    Returns:
        A tuple ``(padded_sequences, labels, tokenizer, label_encoder)``.
        When the file cannot be loaded or a required column is absent, a
        message is printed and ``(None, None, None, None)`` is returned.
    """
    failure = (None, None, None, None)

    df = read_data(file_path)
    if df is None:
        print("Data loading failed.")
        return failure

    # Report a wrong schema the same way as an unreadable file, rather than
    # letting a bare KeyError escape from the column lookups below.
    required = ("Transaction Description", "Category")
    missing = [column for column in required if column not in df.columns]
    if missing:
        print(f"Missing required column(s): {', '.join(missing)}")
        return failure

    # Blank cells arrive as NaN and numeric-looking cells as numbers; turn
    # everything into a string first so clean_text always receives text.
    descriptions = (
        df["Transaction Description"].fillna("").astype(str).apply(clean_text)
    )

    # Words outside the kept vocabulary map to the "<OOV>" token.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    tokenizer.fit_on_texts(descriptions)

    sequences = tokenizer.texts_to_sequences(descriptions)
    padded_sequences = pad_sequences(
        sequences, maxlen=max_len, padding='post', truncating='post'
    )

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(df['Category'])

    return padded_sequences, labels, tokenizer, label_encoder
# Train-test split function
def split_data(sequences, labels, test_size=0.2, random_state=42):
    """Partition features and labels into training and testing subsets.

    Args:
        sequences: Feature rows to split.
        labels: Targets aligned with ``sequences``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(
        sequences,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    train_x, test_x, train_y, test_y = parts
    return train_x, test_x, train_y, test_y
# Main function to execute preprocessing
def main(data_path=r"E:\transactify\transactify\Dataset\transaction_data.csv"):
    """Run the preprocessing pipeline on one CSV file and print split shapes.

    Args:
        data_path: Path of the transaction CSV to process. The default is the
            path that used to be hard-coded in the body, so calling ``main()``
            with no argument behaves exactly as before.
    """
    # The tokenizer and label encoder are returned too but are not used here.
    sequences, labels, _, _ = preprocess_data(data_path)

    if sequences is None:
        print("Data preprocessing failed.")
        return

    print("Data preprocessing successful!")

    X_train, X_test, y_train, y_test = split_data(sequences, labels)
    print(f"Training data shape: {X_train.shape}, Training labels shape: {y_train.shape}")
    print(f"Testing data shape: {X_test.shape}, Testing labels shape: {y_test.shape}")
# Execute the main function
if __name__ == "__main__":
    # Run the pipeline only when this file is executed directly as a script;
    # importing the module just defines the functions above.
    main()