webslate
/

transactify

Model card Files Files and versions Community

transactify / datapreprocessing.py

ananthakrishnan

tech: datapreprocessing

339e6e3 9 months ago

2.59 kB

	# Import Required Libraries:
	import numpy as np
	import pandas as pd

	import tensorflow
	import keras
	import torch

	import re

	from transformers import BertTokenizer
	from sklearn.preprocessing import LabelEncoder

	# Read the data.
	def read_data(path):
	    """Load a CSV file into a pandas DataFrame.

	    Args:
	        path: Location of the CSV file; passed unchanged to ``pd.read_csv``.

	    Returns:
	        The loaded DataFrame, or ``None`` when the file does not exist
	        (a message is printed in that case instead of raising).
	    """
	    try:
	        return pd.read_csv(path)
	    except FileNotFoundError:
	        # Best-effort loader: report the problem and let the caller test for None.
	        print(f"File does not exist: {path}")
	        return None

	# Load the transaction dataset from its fixed location on disk.
	data = read_data(r"E:\transactify\Dataset\transaction_data.csv")

	# Print the first 15 rows as a quick sanity check, but only when a
	# DataFrame was actually returned.
	if data is not None:
	    print(data.head(n=15))

	# cleaning the text...
	def clean_text(text):
	    """Normalise a transaction description before tokenisation.

	    The text is lower-cased, every run of digits and every character that
	    is neither a word character nor whitespace is replaced by a space, and
	    the resulting whitespace is collapsed so that words are separated by
	    single spaces with no leading or trailing blanks.

	    Args:
	        text: Raw description. Non-string values (for example ``None`` or
	            a float NaN) are treated as empty.

	    Returns:
	        The cleaned string, or ``""`` when nothing usable remains.
	    """
	    if not isinstance(text, str):
	        # None/NaN have no .lower(); report them as empty instead of crashing.
	        return ""

	    text = text.lower()
	    text = re.sub(r"\d+", " ", text)      # drop digit runs
	    # Drop punctuation. "_" counts as a word character for \w, so it is kept.
	    text = re.sub(r"[^\w\s]", " ", text)
	    # The substitutions above leave runs of spaces behind; split/join both
	    # trims the ends and collapses the interior whitespace.
	    return " ".join(text.split())

	def preprocessing_data(df, max_length=20):
	    """Tokenise transaction descriptions and encode their categories.

	    Each value of ``df["Transaction Description"]`` is cleaned with
	    ``clean_text`` and tokenised with the ``bert-base-uncased`` tokenizer,
	    padded or truncated to ``max_length`` tokens. Rows whose cleaned text
	    is empty are skipped, and their labels are left out as well so that
	    inputs and labels stay aligned row for row.

	    Args:
	        df: DataFrame providing the "Transaction Description" and
	            "Category" columns.
	        max_length: Token length every sequence is padded or truncated to.

	    Returns:
	        A tuple ``(input_ids, attention_masks, labels, labelencoder)``:
	        the per-row token-id and attention-mask tensors concatenated along
	        dim 0, a tensor of integer class labels for the rows that were
	        kept, and the ``LabelEncoder`` fitted on the full "Category" column.

	    Raises:
	        ValueError: If no description produced non-empty cleaned text.
	    """
	    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

	    input_ids = []
	    attention_masks = []
	    kept_rows = []  # positional index of every row that was tokenised

	    for row, description in enumerate(df["Transaction Description"]):
	        cleaned_text = clean_text(description)

	        # Debugging print statements
	        print(f"Original Description: {description}")
	        print(f"Cleaned Text: {cleaned_text}")

	        # Only tokenize if the cleaned text is not empty
	        if not cleaned_text:
	            print("Cleaned text is empty, skipping...")
	            continue

	        # Calling the tokenizer directly with padding="max_length" replaces
	        # the deprecated encode_plus(..., pad_to_max_length=True) form.
	        encoded_dict = tokenizer(
	            cleaned_text,
	            add_special_tokens=True,
	            max_length=max_length,
	            padding="max_length",
	            truncation=True,
	            return_attention_mask=True,
	            return_tensors="pt",
	        )

	        input_ids.append(encoded_dict["input_ids"])
	        attention_masks.append(encoded_dict["attention_mask"])
	        kept_rows.append(row)

	    # Debugging output to check sizes
	    print(f"Total input_ids collected: {len(input_ids)}")
	    print(f"Total attention_masks collected: {len(attention_masks)}")

	    if not input_ids:
	        raise ValueError("No input_ids were collected. Check the cleaning process.")

	    input_ids = torch.cat(input_ids, dim=0)
	    attention_masks = torch.cat(attention_masks, dim=0)

	    # Fit on the whole column so the class mapping does not depend on which
	    # rows were skipped, then keep only the labels of the tokenised rows so
	    # labels[i] belongs to input_ids[i].
	    labelencoder = LabelEncoder()
	    all_labels = labelencoder.fit_transform(df["Category"])
	    labels = torch.tensor(all_labels[kept_rows])

	    return input_ids, attention_masks, labels, labelencoder

	# read_data yields None when the CSV is missing; stop with a clear message
	# here rather than failing with an opaque TypeError inside preprocessing_data.
	if data is None:
	    raise RuntimeError("Dataset could not be loaded; nothing to preprocess.")

	# Tokenise the descriptions and encode the categories for the whole dataset.
	input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)