Toxicity Classifier (Teeny-Tiny Castle)
This model is part of a tutorial tied to the Teeny-Tiny Castle, an open-source repository containing educational tools for AI Ethics and Safety research.
How to Use
import tensorflow as tf

from huggingface_hub import hf_hub_download
# Download the model (this will be the target of our attack)
hf_hub_download(repo_id="AiresPucrs/toxicity-classifier",
                filename="toxicity-classifier/toxicity-model.keras",
                local_dir="./",
                repo_type="model")
# Download the tokenizer file
hf_hub_download(repo_id="AiresPucrs/toxicity-classifier",
                filename="toxic-vocabulary.txt",
                local_dir="./",
                repo_type="model")
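# Note (optional, not part of the original tutorial): hf_hub_download()
# returns the local path of the downloaded file, so you could capture it
# directly instead of hard-coding the paths used below, e.g.:
# model_path = hf_hub_download(repo_id="AiresPucrs/toxicity-classifier",
#                              filename="toxicity-classifier/toxicity-model.keras",
#                              local_dir="./", repo_type="model")

# Load the downloaded Keras model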
toxicity_model = tf.keras.models.load_model('./toxicity-classifier/toxicity-model.keras')
# If you cloned the model repo, the path is ./toxicity-classifier/toxic-vocabulary.txt
with open('toxic-vocabulary.txt', encoding='utf-8') as fp:
    vocabulary = [line.strip() for line in fp]
vectorization_layer = tf.keras.layers.TextVectorization(max_tokens=20000,
                                                        output_mode="int",
                                                        output_sequence_length=100,
                                                        vocabulary=vocabulary)
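# Quick sanity check (illustrative, not part of the original tutorial): the
# vectorization layer maps raw strings to fixed-length integer sequences of
# length 100, which is the input format the model expects.
print(vectorization_layer(["I do not agree with you"]).shape)  # (1, 100)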
strings = [
    'I think you should shut up your big mouth',
    'I do not agree with you'
]
preds = toxicity_model.predict(vectorization_layer(strings), verbose=0)

for i, string in enumerate(strings):
    print(f'{string}\n')
    # Per the snippet's convention, preds[i][0] is the probability of the non-toxic class
    print(f'Toxic 🤬 {(1 - preds[i][0]) * 100:.2f}% | Not toxic 😊 {preds[i][0] * 100:.2f}%\n')
    print("_" * 50)