Ozgur Unlu committed
Commit 1694d38 · 1 Parent(s): 0e7da48
improvements for bias check
app.py CHANGED
@@ -63,21 +63,85 @@ def check_text_length(text):
 
 def check_hate_speech_and_bias(text, model, tokenizer):
     try:
+        # List of potentially problematic words and phrases
+        bias_terms = {
+            'political_bias': [
+                'woke', 'snowflake', 'libtard', 'conservatard', 'trumptard',
+                'leftist agenda', 'right-wing agenda', 'radical left', 'radical right'
+            ],
+            'discriminatory': [
+                'crazy', 'insane', 'psycho', 'retarded', 'schizo',
+                'ghetto', 'thug', 'illegal', 'normal people', 'regular people',
+                'third-world', 'primitive', 'savage'
+            ],
+            'gender_bias': [
+                'mankind', 'chairman', 'policeman', 'fireman', 'stewardess',
+                'manpower', 'man-made', 'guys', 'hysterical', 'drama queen'
+            ],
+            'ageist': [
+                'boomer', 'millennial', 'ok boomer', 'zoomer', 'gen z',
+                'old-timer', 'geezer', 'young people these days', 'kids these days'
+            ],
+            'cultural_insensitivity': [
+                'exotic', 'oriental', 'ethnic', 'colored', 'urban',
+                'tribal', 'backwards', 'uncivilized'
+            ]
+        }
+
+        # Check for problematic terms
+        found_terms = {}
+        lower_text = text.lower()
+
+        for category, terms in bias_terms.items():
+            found = [term for term in terms if term.lower() in lower_text]
+            if found:
+                found_terms[category] = found
+
+        # Run the model for hate speech detection
         inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
         outputs = model(**inputs)
         predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
 
-
-
+        model_score = predictions[0][1].item()
+
+        # Determine the result based on both checks
+        if model_score > 0.3 or len(found_terms) > 0:
+            message = "Content contains potential hate speech or bias:\n\n"
+
+            if found_terms:
+                message += "Problematic language found:\n"
+                for category, terms in found_terms.items():
+                    category_name = category.replace('_', ' ').title()
+                    message += f"- {category_name}: {', '.join(terms)}\n"
+                message += "\nSuggestions:\n"
+                message += "- Consider using more inclusive and neutral language\n"
+                message += "- Avoid stereotypes and discriminatory terms\n"
+                message += "- Focus on specific behaviors or facts rather than generalizations\n"
+
+            if model_score > 0.3:
+                message += "\nThe content has been flagged by our AI model as potentially containing hate speech or strong bias."
+
             return {
                 'status': 'fail',
-                'message':
+                'message': message
             }
-        elif
+        elif model_score > 0.1 or any(term in lower_text for terms in bias_terms.values() for term in terms):
+            message = "Content may contain subtle bias:\n\n"
+
+            if found_terms:
+                message += "Consider reviewing these terms:\n"
+                for category, terms in found_terms.items():
+                    category_name = category.replace('_', ' ').title()
+                    message += f"- {category_name}: {', '.join(terms)}\n"
+                message += "\nSuggestions:\n"
+                message += "- Review the flagged terms for potential unintended bias\n"
+                message += "- Consider using more inclusive alternatives\n"
+
             return {
                 'status': 'warning',
-                'message':
+                'message': message
             }
+
         return {
             'status': 'pass',
             'message': 'No significant bias or hate speech detected'
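
For reference, a minimal sketch of how the updated checker might be called. The checkpoint name below is an assumption for illustration; the model and tokenizer this Space actually loads are defined elsewhere in app.py:

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Assumed example checkpoint: a binary hate-speech classifier whose label
# index 1 means "hate", matching the predictions[0][1] lookup above.
checkpoint = "facebook/roberta-hate-speech-dynabench-r4-target"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

result = check_hate_speech_and_bias("Kids these days are so entitled.", model, tokenizer)
print(result['status'])   # 'fail': the ageist phrase 'kids these days' is matched
print(result['message'])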
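
One caveat with the plain substring test (term in lower_text): short terms such as 'urban' or 'illegal' also match inside longer words like 'suburban' or 'illegally'. A possible follow-up, sketched here under the assumption that whole-word matching is wanted (not part of this commit), is to anchor each term on word boundaries:

import re

def find_bias_terms(text, bias_terms):
    # Match each term as a whole word or phrase to cut substring false positives.
    # re.escape keeps multi-word phrases like 'ok boomer' matching literally.
    lower_text = text.lower()
    found_terms = {}
    for category, terms in bias_terms.items():
        found = [term for term in terms
                 if re.search(r'\b' + re.escape(term.lower()) + r'\b', lower_text)]
        if found:
            found_terms[category] = found
    return found_terms

A related simplification: in the elif, the any(...) scan over bias_terms is true exactly when found_terms is non-empty, and that case is already caught by the fail branch above it, so the warning condition effectively reduces to model_score > 0.1.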