transactify / datapreprocessing.py
ananthakrishnan
tech: datapreprocessing
339e6e3
raw
history blame
2.59 kB
# Import required libraries:
import numpy as np
import pandas as pd
import tensorflow
import keras
import torch
import re
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder
# Read the data.
def read_data(path):
    """Load a CSV file into a pandas DataFrame.

    Args:
        path: Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns:
        The parsed DataFrame, or ``None`` when no file exists at ``path``.
        Other read or parse errors raised by ``pd.read_csv`` propagate.
    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        # Report which path was missing and signal the failure with an
        # explicit None so callers can branch on ``is not None``.
        print(f"File does not exist: {path}")
        return None
# Load the transaction dataset from its fixed location on disk.
# read_data returns None when the file is missing, so the preview below
# only runs after a successful load.
_dataset_path = r"E:\transactify\Dataset\transaction_data.csv"
data = read_data(_dataset_path)
if data is not None:
    # Show the first 15 rows as a quick sanity check of the loaded table.
    _preview = data.head(15)
    print(_preview)
# cleaning the text...
def clean_text(text):
    """Normalize a transaction description for tokenization.

    The text is lower-cased, digit runs and punctuation are replaced by
    spaces, and all whitespace is then collapsed so words are separated by
    exactly one space with no leading or trailing whitespace.

    Args:
        text: The raw description string.

    Returns:
        The cleaned string; empty when nothing but digits, punctuation or
        whitespace was present.
    """
    text = text.lower()
    # Replace (rather than delete) digits so neighbouring words stay apart.
    text = re.sub(r"\d+", " ", text)
    # Drop punctuation. Note that ``\w`` matches "_", so underscores are kept.
    text = re.sub(r"[^\w\s]", " ", text)
    # The substitutions above leave runs of spaces behind; split/join
    # collapses them and trims both ends in one pass.
    return " ".join(text.split())
def preprocessing_data(df, max_length=20, *, verbose=True):
    """Tokenize transaction descriptions and encode their categories.

    Each value in ``df["Transaction Description"]`` is cleaned with
    ``clean_text``. Rows whose cleaned text is empty (including missing
    values) are skipped, and the labels are taken from the same rows that
    were kept, so the returned tensors always line up row for row.

    Args:
        df: DataFrame with "Transaction Description" and "Category" columns.
        max_length: Every sequence is padded or truncated to this many
            token ids.
        verbose: When true (the default), print per-row debugging output
            and the collected totals.

    Returns:
        A tuple ``(input_ids, attention_masks, labels, labelencoder)``:
        two tensors of shape ``(kept_rows, max_length)``, a 1-D tensor of
        integer class ids for the kept rows, and the fitted LabelEncoder.

    Raises:
        ValueError: If every description is empty after cleaning.
    """
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    kept_texts = []
    kept_positions = []
    for position, description in enumerate(df["Transaction Description"]):
        # Missing cells (None/NaN) have no text to clean; treat them as
        # empty so they are skipped instead of crashing clean_text.
        raw_text = "" if pd.isna(description) else str(description)
        cleaned_text = clean_text(raw_text)

        if verbose:
            print(f"Original Description: {description}")
            print(f"Cleaned Text: {cleaned_text}")

        if cleaned_text:
            kept_texts.append(cleaned_text)
            kept_positions.append(position)
        elif verbose:
            print("Cleaned text is empty, skipping...")

    if verbose:
        print(f"Total input_ids collected: {len(kept_texts)}")
        print(f"Total attention_masks collected: {len(kept_texts)}")

    if not kept_texts:
        raise ValueError("No input_ids were collected. Check the cleaning process.")

    # One batched call replaces per-row encode_plus + torch.cat, and
    # padding="max_length" replaces the deprecated pad_to_max_length flag.
    encoded = tokenizer(
        kept_texts,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"]
    attention_masks = encoded["attention_mask"]

    # Fit on the full column so the class-to-id mapping does not depend on
    # which rows were skipped, then encode only the rows that were kept.
    labelencoder = LabelEncoder()
    labelencoder.fit(df["Category"])
    labels = labelencoder.transform(df["Category"].iloc[kept_positions])
    labels = torch.tensor(labels)

    return input_ids, attention_masks, labels, labelencoder
# Run the preprocessing pipeline on the loaded dataset.
# read_data returns None when the CSV is missing; fail here with a clear
# message rather than with an opaque TypeError inside preprocessing_data.
if data is None:
    raise RuntimeError(
        "Transaction dataset could not be loaded; cannot run preprocessing."
    )
input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)