|
|
|
import numpy as np |
|
import pandas as pd |
|
|
|
import tensorflow |
|
import keras |
|
import torch |
|
|
|
import re |
|
|
|
from transformers import BertTokenizer |
|
from sklearn.preprocessing import LabelEncoder |
|
|
|
|
|
def read_data(path):
    """Load the CSV file at ``path`` into a pandas DataFrame.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when the file does not exist (in
        which case "File not found" is printed).
    """
    try:
        frame = pd.read_csv(path)
    except FileNotFoundError:
        # Best-effort loader: report the problem and let the caller check
        # for None instead of propagating the exception.
        print("File not found")
        return None
    return frame
|
|
|
# Load the transaction dataset; read_data() returns None if the file is missing.
data = read_data(r"E:\transactify\Dataset\transaction_data.csv")

# Print the first 15 rows as a quick sanity check, but only when loading worked.
if data is not None:
    print(data.head(15))
|
|
|
|
|
def clean_text(text):
    """Normalize a transaction description before tokenization.

    The text is lower-cased, every run of digits is replaced by a single
    space, every character that is neither a word character nor whitespace
    is replaced by a space, and leading/trailing whitespace is trimmed.
    Interior whitespace is not collapsed, and underscores are kept because
    ``\\w`` matches them.

    Args:
        text: The raw description. Values that are not strings (for example
            ``None`` or the float NaN that pandas uses for missing cells)
            are treated as having no text.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string or
        nothing is left after cleaning.
    """
    # Non-string values have no .lower(); report "no text" instead of raising
    # so callers can skip the row with their usual empty-text check.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"[^\w\s]", " ", text)
    return text.strip()
|
|
|
def preprocessing_data(df, max_length=20):
    """Tokenize transaction descriptions and encode their categories.

    Every value in ``df["Transaction Description"]`` is cleaned with
    ``clean_text``. Rows whose description is not a string, or whose cleaned
    text is empty, are skipped. The remaining texts are tokenized with the
    ``bert-base-uncased`` tokenizer and padded/truncated to ``max_length``.
    Labels are produced only for the rows that were kept, so the returned
    tensors always line up row for row.

    Args:
        df: DataFrame with "Transaction Description" and "Category" columns.
        max_length: Length every token sequence is padded or truncated to.

    Returns:
        A tuple ``(input_ids, attention_masks, labels, labelencoder)``:
        the token-id tensor and attention-mask tensor (one row per kept
        description), the tensor of encoded category labels for those same
        rows, and the fitted ``LabelEncoder``.

    Raises:
        ValueError: If no description yields any text after cleaning.
    """
    texts = []
    kept_categories = []

    for description, category in zip(df["Transaction Description"], df["Category"]):
        # pandas represents missing CSV cells as float NaN, which has no
        # .lower(); treat any non-string description as having no text.
        cleaned_text = clean_text(description) if isinstance(description, str) else ""

        print(f"Original Description: {description}")
        print(f"Cleaned Text: {cleaned_text}")

        if not cleaned_text:
            print("Cleaned text is empty, skipping...")
            continue

        texts.append(cleaned_text)
        # Remember the category of each kept row so labels stay aligned with
        # the tokenized inputs even when some rows are skipped.
        kept_categories.append(category)

    print(f"Total input_ids collected: {len(texts)}")
    print(f"Total attention_masks collected: {len(texts)}")

    if not texts:
        raise ValueError("No input_ids were collected. Check the cleaning process.")

    # Load the tokenizer only once there is something to tokenize.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # One batched call returns (num_rows, max_length) tensors directly.
    # padding="max_length" replaces the deprecated pad_to_max_length=True.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"]
    attention_masks = encoded["attention_mask"]

    # Fit on the full column so the class-to-id mapping does not depend on
    # which rows were skipped, then encode only the rows that were kept.
    labelencoder = LabelEncoder()
    labelencoder.fit(df["Category"])
    labels = torch.tensor(labelencoder.transform(kept_categories))

    return input_ids, attention_masks, labels, labelencoder
|
|
|
# read_data() returns None when the CSV is missing; stop with a clear message
# instead of failing inside preprocessing_data with an opaque TypeError.
if data is None:
    raise SystemExit("Transaction dataset could not be loaded; nothing to preprocess.")

input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)
|
|