Ozgur Unlu committed commit 1694d38 (1 parent: 0e7da48)

improvements for bias check

Files changed (1): app.py (+69, -5)
app.py CHANGED

@@ -63,21 +63,85 @@ def check_text_length(text):
 
 def check_hate_speech_and_bias(text, model, tokenizer):
     try:
+        # List of potentially problematic words and phrases
+        bias_terms = {
+            'political_bias': [
+                'woke', 'snowflake', 'libtard', 'conservatard', 'trumptard',
+                'leftist agenda', 'right-wing agenda', 'radical left', 'radical right'
+            ],
+            'discriminatory': [
+                'crazy', 'insane', 'psycho', 'retarded', 'schizo',
+                'ghetto', 'thug', 'illegal', 'normal people', 'regular people',
+                'third-world', 'primitive', 'savage'
+            ],
+            'gender_bias': [
+                'mankind', 'chairman', 'policeman', 'fireman', 'stewardess',
+                'manpower', 'man-made', 'guys', 'hysterical', 'drama queen'
+            ],
+            'ageist': [
+                'boomer', 'millennial', 'ok boomer', 'zoomer', 'gen z',
+                'old-timer', 'geezer', 'young people these days', 'kids these days'
+            ],
+            'cultural_insensitivity': [
+                'exotic', 'oriental', 'ethnic', 'colored', 'urban',
+                'tribal', 'backwards', 'uncivilized'
+            ]
+        }
+
+        # Check for problematic terms
+        found_terms = {}
+        lower_text = text.lower()
+
+        for category, terms in bias_terms.items():
+            found = [term for term in terms if term.lower() in lower_text]
+            if found:
+                found_terms[category] = found
+
+        # Run the model for hate speech detection
         inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
         outputs = model(**inputs)
         predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
 
-        # Adjusted thresholds and messages for both hate speech and bias
-        if predictions[0][1].item() > 0.3:
+        model_score = predictions[0][1].item()
+
+        # Determine the result based on both checks
+        if model_score > 0.3 or len(found_terms) > 0:
+            message = "Content contains potential hate speech or bias:\n\n"
+
+            if found_terms:
+                message += "Problematic language found:\n"
+                for category, terms in found_terms.items():
+                    category_name = category.replace('_', ' ').title()
+                    message += f"- {category_name}: {', '.join(terms)}\n"
+                message += "\nSuggestions:\n"
+                message += "- Consider using more inclusive and neutral language\n"
+                message += "- Avoid stereotypes and discriminatory terms\n"
+                message += "- Focus on specific behaviors or facts rather than generalizations\n"
+
+            if model_score > 0.3:
+                message += "\nThe content has been flagged by our AI model as potentially containing hate speech or strong bias."
+
             return {
                 'status': 'fail',
-                'message': 'Content contains potential hate speech or strong bias'
+                'message': message
             }
-        elif predictions[0][1].item() > 0.1:
+        elif model_score > 0.1 or any(term in lower_text for terms in bias_terms.values() for term in terms):
+            message = "Content may contain subtle bias:\n\n"
+
+            if found_terms:
+                message += "Consider reviewing these terms:\n"
+                for category, terms in found_terms.items():
+                    category_name = category.replace('_', ' ').title()
+                    message += f"- {category_name}: {', '.join(terms)}\n"
+                message += "\nSuggestions:\n"
+                message += "- Review the flagged terms for potential unintended bias\n"
+                message += "- Consider using more inclusive alternatives\n"
+
             return {
                 'status': 'warning',
-                'message': 'Content may contain subtle bias or potentially offensive language'
+                'message': message
             }
+
         return {
             'status': 'pass',
             'message': 'No significant bias or hate speech detected'
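
For reference, a minimal usage sketch of the updated check. The checkpoint name, the assumption that label index 1 is the hateful/offensive class, and the import of check_hate_speech_and_bias from app.py are illustrative assumptions, not details confirmed by this commit:

    # Usage sketch (assumptions: app.py exposes check_hate_speech_and_bias at
    # module level, and the classifier is a binary sequence-classification model
    # whose label index 1 means hateful/offensive; the checkpoint is a placeholder).
    from transformers import AutoModelForSequenceClassification, AutoTokenizer
    from app import check_hate_speech_and_bias  # the function changed in this commit

    MODEL_NAME = "some-org/hate-speech-model"  # hypothetical checkpoint name

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

    result = check_hate_speech_and_bias(
        "Kids these days just don't have the manpower for real work.",
        model,
        tokenizer,
    )
    # Term matches alone now trigger the 'fail' branch ('kids these days' -> Ageist,
    # 'manpower' -> Gender Bias), regardless of the model score.
    print(result['status'])
    print(result['message'])

Note the behavioral change this implies: before the commit, only the model score decided the outcome; after it, any keyword hit forces at least a 'fail' or 'warning', with the matched categories echoed back in the message.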