Ozgur Unlu committed commit 1694d38 (1 parent: 0e7da48)

improvements for bias check

Files changed (1): app.py (+69, -5)
app.py CHANGED

@@ -63,21 +63,85 @@ def check_text_length(text):
 
 def check_hate_speech_and_bias(text, model, tokenizer):
     try:
+        # List of potentially problematic words and phrases
+        bias_terms = {
+            'political_bias': [
+                'woke', 'snowflake', 'libtard', 'conservatard', 'trumptard',
+                'leftist agenda', 'right-wing agenda', 'radical left', 'radical right'
+            ],
+            'discriminatory': [
+                'crazy', 'insane', 'psycho', 'retarded', 'schizo',
+                'ghetto', 'thug', 'illegal', 'normal people', 'regular people',
+                'third-world', 'primitive', 'savage'
+            ],
+            'gender_bias': [
+                'mankind', 'chairman', 'policeman', 'fireman', 'stewardess',
+                'manpower', 'man-made', 'guys', 'hysterical', 'drama queen'
+            ],
+            'ageist': [
+                'boomer', 'millennial', 'ok boomer', 'zoomer', 'gen z',
+                'old-timer', 'geezer', 'young people these days', 'kids these days'
+            ],
+            'cultural_insensitivity': [
+                'exotic', 'oriental', 'ethnic', 'colored', 'urban',
+                'tribal', 'backwards', 'uncivilized'
+            ]
+        }
+
+        # Check for problematic terms
+        found_terms = {}
+        lower_text = text.lower()
+
+        for category, terms in bias_terms.items():
+            found = [term for term in terms if term.lower() in lower_text]
+            if found:
+                found_terms[category] = found
+
+        # Run the model for hate speech detection
         inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
         outputs = model(**inputs)
         predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
 
-        # Adjusted thresholds and messages for both hate speech and bias
-        if predictions[0][1].item() > 0.3:
+        model_score = predictions[0][1].item()
+
+        # Determine the result based on both checks
+        if model_score > 0.3 or len(found_terms) > 0:
+            message = "Content contains potential hate speech or bias:\n\n"
+
+            if found_terms:
+                message += "Problematic language found:\n"
+                for category, terms in found_terms.items():
+                    category_name = category.replace('_', ' ').title()
+                    message += f"- {category_name}: {', '.join(terms)}\n"
+                message += "\nSuggestions:\n"
+                message += "- Consider using more inclusive and neutral language\n"
+                message += "- Avoid stereotypes and discriminatory terms\n"
+                message += "- Focus on specific behaviors or facts rather than generalizations\n"
+
+            if model_score > 0.3:
+                message += "\nThe content has been flagged by our AI model as potentially containing hate speech or strong bias."
+
             return {
                 'status': 'fail',
-                'message': 'Content contains potential hate speech or strong bias'
+                'message': message
             }
-        elif predictions[0][1].item() > 0.1:
+        elif model_score > 0.1 or any(term in lower_text for terms in bias_terms.values() for term in terms):
+            message = "Content may contain subtle bias:\n\n"
+
+            if found_terms:
+                message += "Consider reviewing these terms:\n"
+                for category, terms in found_terms.items():
+                    category_name = category.replace('_', ' ').title()
+                    message += f"- {category_name}: {', '.join(terms)}\n"
+                message += "\nSuggestions:\n"
+                message += "- Review the flagged terms for potential unintended bias\n"
+                message += "- Consider using more inclusive alternatives\n"
+
             return {
                 'status': 'warning',
-                'message': 'Content may contain subtle bias or potentially offensive language'
+                'message': message
             }
+
         return {
             'status': 'pass',
             'message': 'No significant bias or hate speech detected'
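
For reference, a minimal usage sketch of the updated check. The checkpoint name, the assumption that label index 1 is the hateful/offensive class, and the import of check_hate_speech_and_bias from app.py are illustrative assumptions, not details confirmed by this commit:

    # Usage sketch (assumptions: app.py exposes check_hate_speech_and_bias at
    # module level, and the classifier is a binary sequence-classification model
    # whose label index 1 means hateful/offensive; the checkpoint is a placeholder).
    from transformers import AutoModelForSequenceClassification, AutoTokenizer
    from app import check_hate_speech_and_bias  # the function changed in this commit

    MODEL_NAME = "some-org/hate-speech-model"  # hypothetical checkpoint name

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

    result = check_hate_speech_and_bias(
        "Kids these days just don't have the manpower for real work.",
        model,
        tokenizer,
    )
    # Term matches alone now trigger the 'fail' branch ('kids these days' -> Ageist,
    # 'manpower' -> Gender Bias), regardless of the model score.
    print(result['status'])
    print(result['message'])

Note the behavioral change this implies: before the commit, only the model score decided the outcome; after it, any keyword hit forces at least a 'fail' or 'warning', with the matched categories echoed back in the message.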