danagyl committed on
Commit
cdb656c
·
verified ·
1 Parent(s): 4364781

Upload Negativ/

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Negativ/train.csv/train.csv filter=lfs diff=lfs merge=lfs -text
Negativ/PythonCode/TrainSentimentModel.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Import necessary libraries
from pathlib import Path

from datasets import load_dataset
from sklearn.utils.class_weight import compute_class_weight
import torch
from transformers import AutoTokenizer, DebertaForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

# Load the training data from the local CSV file. The path is resolved
# relative to this script (which lives in PythonCode/) instead of a
# machine-specific absolute path, so the script runs from any checkout.
# The CSV is tracked with Git LFS in this repository, so the real file must
# have been fetched before running.
TRAIN_CSV_PATH = Path(__file__).resolve().parent.parent / "train.csv" / "train.csv"
ds = load_dataset("csv", data_files={'train': str(TRAIN_CSV_PATH)})

# Access the train split
train_ds = ds["train"]

labels = train_ds["toxic"]

# Compute class weights with the 'balanced' strategy so the rarer class
# contributes proportionally more to the loss.
classes = np.unique(labels)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=np.asarray(labels),
)

print(f"Computed class weights: {class_weights}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Convert to tensor for PyTorch
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)

# Derive the label count from the data so it always matches the length of
# the class-weight vector handed to the loss function.
num_labels = len(classes)

# Training arguments: evaluate once per epoch.
# NOTE(review): newer transformers releases rename this argument to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

# Accuracy is the metric reported during evaluation
metric = evaluate.load("accuracy")

# Load the DeBERTa tokenizer as well as the pre-trained model we are fine-tuning
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")
model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=num_labels)

model.to(device)
44
+
45
# Function to tokenize and encode the dataset
def tokenize_function_deberta(ds):
    """Tokenize one batch of rows for sequence classification.

    Intended to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        ds: A batch of rows with a ``comment_text`` column (the texts) and a
            ``toxic`` column (the labels).

    Returns:
        A dict with ``input_ids``, ``attention_mask`` and ``labels`` for the
        batch.
    """
    # Plain Python lists are returned (no return_tensors): Dataset.map stores
    # the result in its own format, so building tensors here is wasted work.
    # The per-batch debug prints were removed because they dumped whole
    # batches to stdout on every map call.
    encoded_text = tokenizer(ds['comment_text'], padding='max_length', truncation=True)

    return {
        'input_ids': encoded_text['input_ids'],
        'attention_mask': encoded_text['attention_mask'],
        'labels': ds["toxic"],
    }
59
+
60
# Function to compute metrics
def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the module-level metric.

    Args:
        eval_pred: A pair ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the predicted classes
        compared against the reference labels.
    """
    raw_scores, gold_labels = eval_pred
    # The predicted class is the index of the largest logit in each row.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)
65
+
66
# Train the model
def train_deberta_model(sample_size=1000):
    """Fine-tune the DeBERTa model on a small sample and save the result.

    The training data is shuffled with a fixed seed, split into disjoint
    training and evaluation slices, tokenized, and used to train the model
    with a class-weighted cross-entropy loss. Afterwards the model's state
    dictionary is written to ``toxic_deberta_tuned.pth`` and the tokenizer to
    ``custom_tokenizer`` (both relative to the current working directory).

    Args:
        sample_size: Number of rows wanted for each of the training and
            evaluation slices. If the dataset holds fewer than
            ``2 * sample_size`` rows, the available rows are split in half.

    Raises:
        ValueError: If the dataset has fewer than two rows, so that no
            disjoint train/eval split exists.
    """
    # Shuffle once and take two DISJOINT slices. Selecting range(1000) twice
    # from the same shuffle would evaluate on the training rows and report an
    # inflated accuracy.
    shuffled = train_ds.shuffle(seed=42)
    subset_size = min(2 * sample_size, len(shuffled))
    if subset_size < 2:
        raise ValueError("Need at least two rows to build train and eval splits.")
    train_size = subset_size // 2

    # Tokenize only the rows that are actually used instead of the whole dataset.
    subset = shuffled.select(range(subset_size)).map(tokenize_function_deberta, batched=True)

    small_train_dataset = subset.select(range(train_size))
    small_eval_dataset = subset.select(range(train_size, subset_size))

    class CustomTrainer(Trainer):
        """Trainer whose loss is cross-entropy weighted by class frequency."""

        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            # **kwargs absorbs extra keyword arguments that newer Trainer
            # versions pass to compute_loss (e.g. num_items_in_batch).
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            logits = outputs.logits

            # Keep the weights on the same device as the logits.
            loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor.to(logits.device))
            loss = loss_fct(logits, labels)

            return (loss, outputs) if return_outputs else loss

    # Initialize trainer
    custom_trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Train the model
    custom_trainer.train()

    # Save the model's state dictionary as a .pth file
    model_save_path = "toxic_deberta_tuned.pth"
    torch.save(model.state_dict(), model_save_path)
    print(f"Model state dictionary saved to {model_save_path}")

    # Optional: Save the tokenizer
    tokenizer_save_path = "custom_tokenizer"
    tokenizer.save_pretrained(tokenizer_save_path)
    print(f"Tokenizer saved to {tokenizer_save_path}")
108
+
109
# Run training and report failures without hiding their cause.
import traceback

try:
    train_deberta_model()

except FileNotFoundError as e:
    # Include the exception so the missing path is visible to the user.
    print("Directory not found. Please double-check your path!", e)

except Exception as e:
    print("A random error occurred, sorry partner: ", e)
    # Print the full traceback as well; the one-line message alone makes
    # training failures very hard to diagnose.
    traceback.print_exc()
Negativ/PythonCode/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .positive_reframe import is_toxic, predict
Negativ/PythonCode/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (234 Bytes). View file
 
Negativ/PythonCode/__pycache__/positive_reframe.cpython-312.pyc ADDED
Binary file (2.74 kB). View file
 
Negativ/PythonCode/custom_tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
Negativ/PythonCode/custom_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "[CLS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "[SEP]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "[MASK]",
25
+ "lstrip": true,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "[SEP]",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
Negativ/PythonCode/custom_tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Negativ/PythonCode/custom_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "[PAD]",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "[CLS]",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "[SEP]",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "[UNK]",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "50264": {
38
+ "content": "[MASK]",
39
+ "lstrip": true,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ }
45
+ },
46
+ "bos_token": "[CLS]",
47
+ "clean_up_tokenization_spaces": true,
48
+ "cls_token": "[CLS]",
49
+ "do_lower_case": false,
50
+ "eos_token": "[SEP]",
51
+ "errors": "replace",
52
+ "mask_token": "[MASK]",
53
+ "model_max_length": 512,
54
+ "pad_token": "[PAD]",
55
+ "sep_token": "[SEP]",
56
+ "tokenizer_class": "DebertaTokenizer",
57
+ "unk_token": "[UNK]",
58
+ "vocab_type": "gpt2"
59
+ }
Negativ/PythonCode/custom_tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
Negativ/PythonCode/kaggle.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"username":"michaelkleindl","key":"cdce5c43b43e95114651caf7365c893f"}
Negativ/PythonCode/login_cred.env ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ username = michaelkleindl
2
+ key = cdce5c43b43e95114651caf7365c893f
Negativ/PythonCode/positive_reframe.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path

import torch
from transformers import AutoTokenizer, DebertaForSequenceClassification

sentence = "I really love that you suck so bad at this game, you are the worst teammate and i hope you die in a fire"

# Define the model architecture: two classes, where predict() treats
# class 1 as "Toxic" and anything else as "Non-toxic".
num_labels = 2

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")

# Initialize the model architecture
model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=num_labels)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the saved state dictionary. The weights file sits next to this module,
# so resolve it relative to __file__ rather than a machine-specific absolute
# path. map_location lets a checkpoint saved on a GPU load on a CPU-only
# machine; weights_only restricts unpickling to tensor data.
WEIGHTS_PATH = Path(__file__).resolve().parent / "toxic_deberta_tuned.pth"
state_dict = torch.load(WEIGHTS_PATH, map_location=device, weights_only=True)

# Load the state dictionary into the model
model.load_state_dict(state_dict)

# predict() moves its inputs to `device`, so the model must live there too.
model.to(device)

# Ensure the model is set up with the correct number of labels
assert model.config.num_labels == num_labels, "Model labels mismatch!"
25
+
26
def predict(sentence):
    """Classify a sentence as toxic or not.

    Args:
        sentence: The text to classify.

    Returns:
        ``"Toxic"`` if the model's highest-scoring class is 1, otherwise
        ``"Non-toxic"``.
    """
    # The inputs are moved to `device` below, so make sure the model is on
    # the same device; otherwise the forward pass fails on CUDA machines.
    # This is a no-op once the model is already there.
    model.to(device)

    # Move the model to evaluation mode
    model.eval()

    # Tokenize the input sentence. A single sentence needs no padding to the
    # maximum length; padded positions are masked out and only cost compute.
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Perform prediction
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)

    print(f"Logits: {logits}")
    print(f"Probabilities: {probabilities}")

    highest_logit, prediction = torch.max(logits, dim=-1)

    print(f"Highest Logit: {highest_logit.item()}, Predicted Class: {prediction.item()}")

    # Interpret the result (compare a plain int, not a tensor)
    return "Toxic" if prediction.item() == 1 else "Non-toxic"
55
+
56
def is_toxic(sentence):
    """Report whether ``predict`` labels the sentence as toxic.

    Prints the sentence together with the predicted label.

    Args:
        sentence: The text to classify.

    Returns:
        True if the prediction is ``"Toxic"``, False otherwise.
    """
    verdict = predict(sentence)
    print(f"Given Sentence: {sentence}", f"\tPrediction: {verdict}")
    return verdict == "Toxic"
Negativ/PythonCode/test_model.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path

import torch
from transformers import AutoTokenizer, DebertaForSequenceClassification

# Define the model architecture: two classes, where predict() treats
# class 1 as "Toxic" and anything else as "Non-toxic".
num_labels = 2

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")

# Initialize the model architecture
model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=num_labels)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the saved state dictionary. The weights file sits next to this script,
# so resolve it relative to __file__ instead of the current working
# directory. map_location lets a checkpoint saved on a GPU load on a CPU-only
# machine; weights_only restricts unpickling to tensor data.
WEIGHTS_PATH = Path(__file__).resolve().parent / "toxic_deberta_tuned.pth"
state_dict = torch.load(WEIGHTS_PATH, map_location=device, weights_only=True)

# Load the state dictionary into the model
model.load_state_dict(state_dict)

# predict() moves its inputs to `device`, so the model must live there too.
model.to(device)

# Ensure the model is set up with the correct number of labels
assert model.config.num_labels == num_labels, "Model labels mismatch!"
23
+
24
def predict(sentence):
    """Classify a sentence as toxic or not.

    Args:
        sentence: The text to classify.

    Returns:
        ``"Toxic"`` if the model's highest-scoring class is 1, otherwise
        ``"Non-toxic"``.
    """
    # The inputs are moved to `device` below, so make sure the model is on
    # the same device; otherwise the forward pass fails on CUDA machines.
    # This is a no-op once the model is already there.
    model.to(device)

    # Move the model to evaluation mode
    model.eval()

    # Tokenize the input sentence. A single sentence needs no padding to the
    # maximum length; padded positions are masked out and only cost compute.
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Perform prediction
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)

    print(f"Logits: {logits}")
    print(f"Probabilities: {probabilities}")

    highest_logit, prediction = torch.max(logits, dim=-1)

    print(f"Highest Logit: {highest_logit.item()}, Predicted Class: {prediction.item()}")

    # Interpret the result (compare a plain int, not a tensor)
    return "Toxic" if prediction.item() == 1 else "Non-toxic"
53
+
54
# Test prediction. Guarded so the smoke test only runs when this file is
# executed as a script, not when it is imported.
if __name__ == "__main__":
    sentence = "you are really bad at this game"
    result = predict(sentence)
    print(f"Given Sentence: {sentence}", f"\tPrediction: {result}")
Negativ/PythonCode/toxic_deberta_tuned.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:028d5e6f97db45391fb27ef12bac008bda23fb2fc4be72700e95aeaf8d8282ab
3
+ size 556858674
Negativ/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Negativ
2
+ A language model used to limit toxic behavior in game chat.
Negativ/app.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
# Import the package itself; importing "PythonCode.__init__" as a submodule
# executes the package initializer a second time under a different name.
import PythonCode as pc

msg_deleted = "--This Toxic Comment Has Been Deleted--"


def predict_toxicity(sentence):
    """Return the sentence unchanged, or a placeholder if it is toxic.

    Args:
        sentence: The chat message to check.

    Returns:
        ``msg_deleted`` when ``pc.is_toxic`` flags the sentence, otherwise
        the original sentence.
    """
    return msg_deleted if pc.is_toxic(sentence) else sentence


demo = gr.Interface(fn=predict_toxicity, inputs="text", outputs="text")
demo.launch()
Negativ/flagged/log.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ sentence,output,flag,username,timestamp
2
+ ,,,,2024-08-14 19:26:49.221851
Negativ/train.csv/train.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd4084611bd27c939ba98e5e63bc3e5a2c1a4e99477dcba46c829e4c986c429d
3
+ size 68802655