idanpers committed on
Commit
e154248
·
verified ·
1 Parent(s): 6b84f49

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +41 -22
README.md CHANGED
@@ -56,41 +56,60 @@ To run the inference pipeline for classifying prompts, follow these steps:
56
  # Load model directly
57
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
58
 
59
- tokenizer = AutoTokenizer.from_pretrained("idanpers/JailBreakModel")
60
  model = AutoModelForSequenceClassification.from_pretrained("idanpers/JailBreakModel")
61
 
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  use:
64
- # Function to classify a single prompt using the trained model in Trainer
65
  def classify_prompt(prompt):
66
- # Error handling for empty input
67
- if not isinstance(prompt, str) or prompt.strip() == "":
68
- return {"error": "Invalid input. Please provide a non-empty text prompt."}
69
 
70
- # Tokenize the input prompt and convert to dataset format expected by trainer.predict
71
- inputs = Tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
72
- dataset = Dataset.from_dict({"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"]})
73
 
74
- # Use trainer.predict to classify
75
- prediction_output = model.predict(dataset)
76
 
77
- # Get the softmax probabilities for confidence scores
78
- probs = torch.softmax(torch.tensor(prediction_output.predictions), dim=1).cpu().numpy()
79
- confidence = np.max(probs)
80
- pred_label = np.argmax(probs, axis=1)[0]
81
 
82
- # Map prediction to label
83
- label = "PROMPT_INJECTION" if pred_label == 1 else "BENIGN"
84
 
85
- return {"label": label, "confidence": confidence}
86
 
87
- #Accept input from the user and classify it
88
  prompt = input("Enter a prompt for classification: ")
89
  result = classify_prompt(prompt)
90
 
91
- #Check for errors before accessing the classification result
92
  if "error" in result:
93
- print(f"Error: {result['error']}")
94
  else:
95
- print(f"Classification Result: {result['label']}")
96
- print(f"Confidence Score: {result['confidence']:.2f}")
 
56
  # Load model directly
57
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
58
 
59
+ Tokenizer = AutoTokenizer.from_pretrained("idanpers/JailBreakModel")
60
  model = AutoModelForSequenceClassification.from_pretrained("idanpers/JailBreakModel")
61
 
62
 
63
+ training_args = TrainingArguments(
64
+ output_dir="./results",
65
+ per_device_train_batch_size=16,
66
+ per_device_eval_batch_size=16,
67
+ report_to="none", # Disable W&B
68
+ save_safetensors=False,
69
+ )
70
+
71
+
72
+
73
+
74
+ # Create Trainer instance
75
+ trainer = Trainer(
76
+ model=model,
77
+ args=training_args,
78
+ tokenizer=tokenizer,
79
+ )
80
+
81
+
82
+
83
  use:
 
84
  def classify_prompt(prompt):
85
+ # Error handling for empty input
86
+ if not isinstance(prompt, str) or prompt.strip() == "":
87
+ return {"error": "Invalid input. Please provide a non-empty text prompt."}
88
 
89
+ # Tokenize the input prompt and convert to dataset format expected by trainer.predict
90
+ inputs = Tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
91
+ dataset = Dataset.from_dict({"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"]})
92
 
93
+ # Use trainer.predict to classify
94
+ prediction_output = trainer.predict(dataset)
95
 
96
+ # Get the softmax probabilities for confidence scores
97
+ probs = torch.softmax(torch.tensor(prediction_output.predictions), dim=1).cpu().numpy()
98
+ confidence = np.max(probs)
99
+ pred_label = np.argmax(probs, axis=1)[0]
100
 
101
+ # Map prediction to label
102
+ label = "PROMPT_INJECTION" if pred_label == 1 else "BENIGN"
103
 
104
+ return {"label": label, "confidence": confidence}
105
 
106
+ #Accept input from the user and classify it
107
  prompt = input("Enter a prompt for classification: ")
108
  result = classify_prompt(prompt)
109
 
110
+ #Check for errors before accessing the classification result
111
  if "error" in result:
112
+ print(f"Error: {result['error']}")
113
  else:
114
+ print(f"Classification Result: {result['label']}")
115
+ print(f"Confidence Score: {result['confidence']:.2f}")