# Import required libraries
import re

import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer
# Read the data.
def read_data(path):
    try:
        df = pd.read_csv(path)
        return df
    except FileNotFoundError:
        print(f"File does not exist: {path}")
        return None

data = read_data(r"E:\transactify\Dataset\transaction_data.csv")
if data is not None:
    print(data.head(15))
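
# Sanity check: the preprocessing below assumes the CSV has
# "Transaction Description" and "Category" columns, so fail early if either
# is missing rather than raising a KeyError mid-loop.
if data is not None:
    expected_columns = {"Transaction Description", "Category"}
    missing = expected_columns - set(data.columns)
    if missing:
        raise KeyError(f"Dataset is missing expected columns: {missing}")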
# Clean the text before tokenization.
def clean_text(text):
    text = text.lower()                        # convert to lowercase
    text = re.sub(r"\d+", " ", text)           # remove digits
    text = re.sub(r"[^\w\s]", " ", text)       # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()   # collapse extra whitespace
    return text
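
# Example of what clean_text produces on a made-up description (the sample
# string is illustrative, not taken from the dataset):
#   clean_text("POS PURCHASE #4721 - STARBUCKS!!")  ->  "pos purchase starbucks"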
# Tokenize the descriptions with BERT and encode the category labels.
def preprocessing_data(df, max_length=20):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    input_ids = []
    attention_masks = []
    kept_indices = []  # track rows actually tokenized so labels stay aligned

    for idx, description in enumerate(df["Transaction Description"]):
        cleaned_text = clean_text(description)

        # Debugging print statements
        print(f"Original Description: {description}")
        print(f"Cleaned Text: {cleaned_text}")

        # Only tokenize if the cleaned text is not empty
        if cleaned_text:
            encoded_dict = tokenizer.encode_plus(
                cleaned_text,
                add_special_tokens=True,   # add [CLS] and [SEP]
                max_length=max_length,
                padding="max_length",      # replaces the deprecated pad_to_max_length
                return_attention_mask=True,
                return_tensors="pt",
                truncation=True,
            )
            input_ids.append(encoded_dict["input_ids"])
            attention_masks.append(encoded_dict["attention_mask"])
            kept_indices.append(idx)
        else:
            print("Cleaned text is empty, skipping...")

    # Debugging output to check sizes
    print(f"Total input_ids collected: {len(input_ids)}")
    print(f"Total attention_masks collected: {len(attention_masks)}")

    if not input_ids:
        raise ValueError("No input_ids were collected. Check the cleaning process.")

    # Stack the per-row (1, max_length) tensors into (num_rows, max_length)
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # Encode only the labels of rows that were kept, so tensors line up
    labelencoder = LabelEncoder()
    labels = labelencoder.fit_transform(df["Category"].iloc[kept_indices])
    labels = torch.tensor(labels)

    return input_ids, attention_masks, labels, labelencoder
input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)
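
# A minimal sketch of how the returned tensors could feed a PyTorch training
# loop; the batch size here is an arbitrary illustrative choice.
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(input_ids, attention_masks, labels)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

for batch_input_ids, batch_masks, batch_labels in loader:
    # Each batch is shaped for a BERT-style classifier's forward pass.
    break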