ananthakrishnan committed
Commit
56c9213
1 Parent(s): d5e6b38

texh: datapreprocessing

.gitignore CHANGED
@@ -1 +1 @@
- venv
+ transactify_venv
Dataset/transaction_data.csv ADDED
The diff for this file is too large to render. See raw diff
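Since the diff is too large to render, a minimal sketch for inspecting the new dataset locally (the relative path assumes you run it from the repo root; "Transaction Description" and "Category" are the two columns datapreprocessing.py relies on):

```python
import pandas as pd

# Sanity-check the newly added dataset; datapreprocessing.py expects
# "Transaction Description" and "Category" columns to be present.
df = pd.read_csv("Dataset/transaction_data.csv")
print(df.columns.tolist())
print(df[["Transaction Description", "Category"]].head())
```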
 
datapreprocessing.py ADDED
@@ -0,0 +1,80 @@
+ # Import required libraries
+ import re
+
+ import numpy as np
+ import pandas as pd
+ import torch
+
+ from transformers import BertTokenizer
+ from sklearn.preprocessing import LabelEncoder
+
+ # Read the data.
+ def read_data(path):
+     try:
+         df = pd.read_csv(path)
+         return df
+     except FileNotFoundError:
+         print("File not found")
+
+ data = read_data(r"E:\transactify\Dataset\transaction_data.csv")
+ if data is not None:
+     print(data.head(15))
+
+ # Clean the text.
+ def clean_text(text):
+     text = text.lower()                    # lowercase everything
+     text = re.sub(r"\d+", " ", text)       # remove digits
+     text = re.sub(r"[^\w\s]", " ", text)   # remove punctuation
+     text = text.strip()                    # trim surrounding whitespace
+     return text
+
+ def preprocessing_data(df, max_length=20):
+     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+
+     input_ids = []
+     attention_masks = []
+
+     for description in df["Transaction Description"]:
+         cleaned_text = clean_text(description)
+
+         # Debugging print statements
+         print(f"Original Description: {description}")
+         print(f"Cleaned Text: {cleaned_text}")
+
+         # Only tokenize if the cleaned text is not empty
+         if cleaned_text:
+             encoded_dict = tokenizer.encode_plus(
+                 cleaned_text,
+                 add_special_tokens=True,   # add [CLS] and [SEP]
+                 max_length=max_length,
+                 padding="max_length",      # replaces the deprecated pad_to_max_length=True
+                 return_attention_mask=True,
+                 return_tensors="pt",
+                 truncation=True
+             )
+
+             input_ids.append(encoded_dict["input_ids"])
+             attention_masks.append(encoded_dict["attention_mask"])
+         else:
+             print("Cleaned text is empty, skipping...")
+
+     # Debugging output to check sizes
+     print(f"Total input_ids collected: {len(input_ids)}")
+     print(f"Total attention_masks collected: {len(attention_masks)}")
+
+     if not input_ids:
+         raise ValueError("No input_ids were collected. Check the cleaning process.")
+
+     input_ids = torch.cat(input_ids, dim=0)
+     attention_masks = torch.cat(attention_masks, dim=0)
+
+     labelencoder = LabelEncoder()
+     labels = labelencoder.fit_transform(df["Category"])
+     labels = torch.tensor(labels)
+
+     return input_ids, attention_masks, labels, labelencoder
+
+ if data is not None:
+     input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)
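The tensors returned by `preprocessing_data` can be batched for training. A minimal sketch using PyTorch's `TensorDataset` and `DataLoader` (not part of this commit; `batch_size=32` is an illustrative choice):

```python
from torch.utils.data import DataLoader, TensorDataset

# Bundle the encoded inputs and labels returned by preprocessing_data.
dataset = TensorDataset(input_ids, attention_masks, labels)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# One batch is enough to sanity-check shapes, e.g. (32, 20), (32, 20), (32,)
# given max_length=20 and at least 32 rows in the dataset.
batch_ids, batch_masks, batch_labels = next(iter(loader))
print(batch_ids.shape, batch_masks.shape, batch_labels.shape)
```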
main.py DELETED
@@ -1,6 +0,0 @@
- # hello_world.py
- def hello_world():
-     print("Hello World")
-
- if __name__ == "__main__":
-     hello_world()
requirements.txt DELETED
File without changes
requirenments.txt ADDED
@@ -0,0 +1,8 @@
+ numpy
+ pandas
+ tensorflow
+ transformers
+ scikit-learn
+ torch
+ torchvision
+ torchaudio
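A quick import check (a sketch, not part of the commit) to confirm the install worked; note that `sklearn` is the import name for the scikit-learn package:

```python
# Verify that every package from requirenments.txt imports cleanly.
import numpy, pandas, tensorflow, transformers, sklearn
import torch, torchvision, torchaudio

print("torch", torch.__version__, "| transformers", transformers.__version__)
```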
setup.md CHANGED
@@ -43,10 +43,17 @@ Choose Write Tab (3rd one) / go here https://huggingface.co/settings/tokens/new?
  ## Create Virtual Environment
 
  ```
- python3 -m venv venv
- source venv/bin/activate
- ```
-
- ## FYI
-
- > We initially started with `pip install transformers datasets`
+ Create a virtual environment for the Transactify project:
+ python -m venv transactify_venv
+
+ To activate the environment, open cmd and run:
+ >> cd transactify_venv
+ >> cd Scripts
+ >> activate
+ ```
+ ## Installing Required Libraries
+
+ To install the required libraries, run:
+ >> pip install -r requirenments.txt
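After activation, a quick way to confirm the interpreter you are running actually lives inside the new environment (a minimal check, relying only on the standard venv layout):

```python
import sys

# Inside an active venv, sys.prefix points at the environment directory
# (here it should end with "transactify_venv"); outside, it points at
# the base Python installation.
print(sys.prefix)
```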