Spaces:

danagyl
/

Negativ

Runtime error

App Files Files Community

danagyl committed on Aug 15, 2024

Commit

cdb656c

verified ·

1 Parent(s): 4364781

Upload Negativ/

Browse files

Files changed (19) hide show

.gitattributes +1 -0
Negativ/PythonCode/TrainSentimentModel.py +117 -0
Negativ/PythonCode/__init__.py +1 -0
Negativ/PythonCode/__pycache__/__init__.cpython-312.pyc +0 -0
Negativ/PythonCode/__pycache__/positive_reframe.cpython-312.pyc +0 -0
Negativ/PythonCode/custom_tokenizer/merges.txt +0 -0
Negativ/PythonCode/custom_tokenizer/special_tokens_map.json +51 -0
Negativ/PythonCode/custom_tokenizer/tokenizer.json +0 -0
Negativ/PythonCode/custom_tokenizer/tokenizer_config.json +59 -0
Negativ/PythonCode/custom_tokenizer/vocab.json +0 -0
Negativ/PythonCode/kaggle.json +1 -0
Negativ/PythonCode/login_cred.env +2 -0
Negativ/PythonCode/positive_reframe.py +65 -0
Negativ/PythonCode/test_model.py +57 -0
Negativ/PythonCode/toxic_deberta_tuned.pth +3 -0
Negativ/README.md +2 -0
Negativ/app.py +17 -0
Negativ/flagged/log.csv +2 -0
Negativ/train.csv/train.csv +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Negativ/train.csv/train.csv filter=lfs diff=lfs merge=lfs -text

Negativ/PythonCode/TrainSentimentModel.py ADDED Viewed

	@@ -0,0 +1,117 @@

+# Import necessary libraries
+from datasets import load_dataset
+from sklearn.utils.class_weight import compute_class_weight
+import torch
+from transformers import AutoTokenizer, DebertaForSequenceClassification, TrainingArguments, Trainer
+import numpy as np
+import evaluate
+# Load the toxic chat dataset from HuggingFace
+ds = load_dataset("csv", data_files={'train': 'C:/Users/Michael/Documents/GitHub/Negativ/train.csv/train.csv'})
+# Access the train split
+train_ds = ds["train"]
+labels = train_ds["toxic"]
+# Compute class weights
+class_weights = compute_class_weight(
+    class_weight='balanced',  # Use 'balanced' strategy
+    classes=np.unique(labels),  # Unique class labels
+    y=labels
+)
+print(f"Computed class weights: {class_weights}")
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Convert to tensor for PyTorch
+class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)
+num_labels = 2  # Set number of labels correctly
+# Training arguments can help fine-tune the model by including a specific evaluation strategy
+training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
+# Accuracy metric was the recommended metric for the training process
+metric = evaluate.load("accuracy")
+# Load the DeBERTa AutoTokenizer as well as the pre-trained model we are fine-tuning
+tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")
+model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=num_labels)
+model.to(device)
+# Function to tokenize and encode the dataset
+def tokenize_function_deberta(ds):
+    batch_texts = ds['comment_text']
+    print(batch_texts[:5])
+    # Encode the text using the tokenizer
+    encoded_text = tokenizer(batch_texts, padding='max_length', truncation=True, return_tensors='pt')
+    print(encoded_text)
+    labels = ds["toxic"]
+    print(labels[:5])
+    return {'input_ids': encoded_text['input_ids'], 'attention_mask': encoded_text['attention_mask'], 'labels': labels}
+# Function to compute metrics
+def compute_metrics(eval_pred):
+    logits, labels = eval_pred
+    predictions = np.argmax(logits, axis=-1)
+    return metric.compute(predictions=predictions, references=labels)
+# Train the model
+def train_deberta_model():
+    # Tokenize the dataset
+    tokenized_datasets = train_ds.map(tokenize_function_deberta, batched=True)
+    # Create smaller datasets for quick testing
+    small_train_dataset = tokenized_datasets.shuffle(seed=42).select(range(1000))
+    small_eval_dataset = tokenized_datasets.shuffle(seed=42).select(range(1000))
+    class CustomTrainer(Trainer):
+        def compute_loss(self, model, inputs, return_outputs=False):
+            labels = inputs.pop("labels")
+            outputs = model(**inputs)
+            logits = outputs.logits
+            # Custom loss function with weights
+            loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)
+            loss = loss_fct(logits, labels)
+            return (loss, outputs) if return_outputs else loss
+    # Initialize trainer
+    custom_trainer = CustomTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=small_train_dataset,
+        eval_dataset=small_eval_dataset,
+        compute_metrics=compute_metrics,
+    )
+    # Train the model
+    custom_trainer.train()
+     # Save the model's state dictionary as a .pth file
+    model_save_path = "toxic_deberta_tuned.pth"
+    torch.save(model.state_dict(), model_save_path)
+    print(f"Model state dictionary saved to {model_save_path}")
+    # Optional: Save the tokenizer
+    tokenizer_save_path = "custom_tokenizer"
+    tokenizer.save_pretrained(tokenizer_save_path)
+    print(f"Tokenizer saved to {tokenizer_save_path}")
+# Run training when this file executes; failures are reported as a printed message rather than a traceback
+try:
+    train_deberta_model()
+except FileNotFoundError:
+    print("Directory not found. Please double-check your path!")
+except Exception as e:
+    print("A random error occurred, sorry partner: ", e)

Negativ/PythonCode/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .positive_reframe import is_toxic, predict

Negativ/PythonCode/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (234 Bytes). View file

Negativ/PythonCode/__pycache__/positive_reframe.cpython-312.pyc ADDED Viewed

Binary file (2.74 kB). View file

Negativ/PythonCode/custom_tokenizer/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

Negativ/PythonCode/custom_tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

Negativ/PythonCode/custom_tokenizer/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Negativ/PythonCode/custom_tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,59 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50264": {
+      "content": "[MASK]",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "[CLS]",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "eos_token": "[SEP]",
+  "errors": "replace",
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "tokenizer_class": "DebertaTokenizer",
+  "unk_token": "[UNK]",
+  "vocab_type": "gpt2"
+}

Negativ/PythonCode/custom_tokenizer/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Negativ/PythonCode/kaggle.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"username":"michaelkleindl","key":"cdce5c43b43e95114651caf7365c893f"}

Negativ/PythonCode/login_cred.env ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ username = michaelkleindl
2	+ key = cdce5c43b43e95114651caf7365c893f

Negativ/PythonCode/positive_reframe.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import torch
+from transformers import AutoTokenizer, DebertaForSequenceClassification
+sentence = "I really love that you suck so bad at this game, you are the worst teammate and i hope you die in a fire"
+# Define the model architecture
+num_labels = 2  # Assuming a three-class classification task
+# Load the tokenizer
+tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")
+# Initialize the model architecture
+model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=num_labels)
+# Load the saved state dictionary
+state_dict = torch.load('C:/Users/Michael/Documents/GitHub/Negativ/PythonCode/toxic_deberta_tuned.pth')
+# Load the state dictionary into the model
+model.load_state_dict(state_dict)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Ensure the model is set up with the correct number of labels
+assert model.config.num_labels == num_labels, "Model labels mismatch!"
+def predict(sentence):
+    # Tokenize the input sentence
+    inputs = tokenizer(sentence, return_tensors="pt", padding='max_length', truncation=True)
+    print(inputs)
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    # Move the model to evaluation mode
+    model.eval()
+    # Perform prediction
+    with torch.no_grad():
+        outputs = model(**inputs)
+        print(outputs)
+        logits = outputs.logits
+        probabilities = torch.softmax(logits, dim=1)
+        print(f"Logits: {logits}")
+        print(f"Probabilities: {probabilities}")
+        highest_logit, prediction = torch.max(logits, dim=-1)
+        print(f"Highest Logit: {highest_logit.item()}, Predicted Class: {prediction.item()}")
+    # Interpret the result
+    return "Toxic" if prediction == 1 else "Non-toxic"
+def is_toxic(sentence):
+    result = predict(sentence)
+    print(f"Given Sentence: {sentence}", f"\tPrediction: {result}")
+    if result == "Toxic":
+        return True
+    else:
+        return False

Negativ/PythonCode/test_model.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import torch
+from transformers import AutoTokenizer, DebertaForSequenceClassification
+# Define the model architecture
+num_labels = 2  # Assuming a three-class classification task
+# Load the tokenizer
+tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")
+# Initialize the model architecture
+model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=num_labels)
+# Load the saved state dictionary
+state_dict = torch.load('./toxic_deberta_tuned.pth')
+# Load the state dictionary into the model
+model.load_state_dict(state_dict)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Ensure the model is set up with the correct number of labels
+assert model.config.num_labels == num_labels, "Model labels mismatch!"
+def predict(sentence):
+    # Tokenize the input sentence
+    inputs = tokenizer(sentence, return_tensors="pt", padding='max_length', truncation=True)
+    print(inputs)
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    # Move the model to evaluation mode
+    model.eval()
+    # Perform prediction
+    with torch.no_grad():
+        outputs = model(**inputs)
+        print(outputs)
+        logits = outputs.logits
+        probabilities = torch.softmax(logits, dim=1)
+        print(f"Logits: {logits}")
+        print(f"Probabilities: {probabilities}")
+        highest_logit, prediction = torch.max(logits, dim=-1)
+        print(f"Highest Logit: {highest_logit.item()}, Predicted Class: {prediction.item()}")
+    # Interpret the result
+    return "Toxic" if prediction == 1 else "Non-toxic"
+# Smoke test: classify one sample sentence and print the sentence with its predicted label
+sentence = "you are really bad at this game"
+result = predict(sentence)
+print(f"Given Sentence: {sentence}", f"\tPrediction: {result}")

Negativ/PythonCode/toxic_deberta_tuned.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:028d5e6f97db45391fb27ef12bac008bda23fb2fc4be72700e95aeaf8d8282ab
+size 556858674

Negativ/README.md ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Negativ
2	+ An LLM used to limit toxic behavior in game chat.

Negativ/app.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import gradio as gr
+import PythonCode.__init__ as pc
+msg_deleted = "--This Toxic Comment Has Been Deleted--"
+print(dir(pc))
+def predict_toxicity(sentence):
+    if pc.is_toxic(sentence):
+        return msg_deleted
+    else:
+        return sentence
+demo = gr.Interface(fn=predict_toxicity, inputs="text", outputs="text")
+demo.launch()

Negativ/flagged/log.csv ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ sentence,output,flag,username,timestamp
2	+ ,,,,2024-08-14 19:26:49.221851

Negativ/train.csv/train.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bd4084611bd27c939ba98e5e63bc3e5a2c1a4e99477dcba46c829e4c986c429d
+size 68802655