Ozgur Unlu
committed on
Commit
·
d618fb6
1
Parent(s):
c1e6e68
more fixes
Browse files
app.py
CHANGED
@@ -14,6 +14,15 @@ import re
|
|
14 |
load_dotenv()
|
15 |
|
16 |
CONTRACTIONS = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
"ain't", "aren't", "can't", "couldn't", "didn't", "doesn't", "don't", "hadn't",
|
18 |
"hasn't", "haven't", "he'd", "he'll", "he's", "i'd", "i'll", "i'm", "i've",
|
19 |
"isn't", "let's", "mightn't", "mustn't", "shan't", "she'd", "she'll", "she's",
|
@@ -78,33 +87,51 @@ def check_hate_speech_and_bias(text, model, tokenizer):
|
|
78 |
'status': 'error',
|
79 |
'message': f'Error in hate speech/bias detection: {str(e)}'
|
80 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
def check_spelling(text, spell_checker):
|
83 |
try:
|
|
|
|
|
|
|
84 |
# Split text into words
|
85 |
words = text.split()
|
86 |
|
87 |
-
# Process words
|
88 |
-
|
89 |
for word in words:
|
|
|
|
|
|
|
90 |
# Remove surrounding punctuation but keep internal apostrophes
|
91 |
cleaned = re.sub(r'^[^\w\']+|[^\w\']+$', '', word)
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
if
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
|
109 |
if misspelled:
|
110 |
corrections = []
|
@@ -114,7 +141,9 @@ def check_spelling(text, spell_checker):
|
|
114 |
if candidates:
|
115 |
# Take up to 3 suggestions
|
116 |
suggestions = list(candidates)[:3]
|
117 |
-
|
|
|
|
|
118 |
|
119 |
if corrections:
|
120 |
return {
|
|
|
14 |
load_dotenv()
|
15 |
|
16 |
CONTRACTIONS = {
|
17 |
+
# With straight apostrophe
|
18 |
+
"ain't", "aren't", "can't", "couldn't", "didn't", "doesn't", "don't", "hadn't",
|
19 |
+
"hasn't", "haven't", "he'd", "he'll", "he's", "i'd", "i'll", "i'm", "i've",
|
20 |
+
"isn't", "let's", "mightn't", "mustn't", "shan't", "she'd", "she'll", "she's",
|
21 |
+
"shouldn't", "that's", "there's", "they'd", "they'll", "they're", "they've",
|
22 |
+
"we'd", "we're", "we've", "weren't", "what'll", "what're", "what's", "what've",
|
23 |
+
"where's", "who'd", "who'll", "who're", "who's", "who've", "won't", "wouldn't",
|
24 |
+
"you'd", "you'll", "you're", "you've",
|
25 |
+
# With curly apostrophe
|
26 |
"ain't", "aren't", "can't", "couldn't", "didn't", "doesn't", "don't", "hadn't",
|
27 |
"hasn't", "haven't", "he'd", "he'll", "he's", "i'd", "i'll", "i'm", "i've",
|
28 |
"isn't", "let's", "mightn't", "mustn't", "shan't", "she'd", "she'll", "she's",
|
|
|
87 |
'status': 'error',
|
88 |
'message': f'Error in hate speech/bias detection: {str(e)}'
|
89 |
}
|
90 |
+
|
91 |
+
# Apostrophe look-alikes mapped to the plain ASCII apostrophe. Written as
# escapes so the characters survive editors and copy/paste that flatten
# typographic quotes (which previously turned the literals into `'''`).
_APOSTROPHE_TABLE = str.maketrans({
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark (curly apostrophe)
    "`": "'",       # backtick / grave accent
    "\u00b4": "'",  # acute accent
})


def normalize_apostrophes(text):
    """Return *text* with apostrophe look-alikes replaced by a straight apostrophe.

    Curly single quotes, the backtick and the acute accent are all mapped
    to ``'`` so that contractions compare equal regardless of how the
    apostrophe was typed. All other characters are left untouched.

    Args:
        text: The string to normalize.

    Returns:
        A new string containing only straight apostrophes in place of the
        variants listed above.
    """
    # Single pass over the string instead of four chained .replace() calls.
    return text.translate(_APOSTROPHE_TABLE)
|
95 |
+
|
96 |
|
97 |
def check_spelling(text, spell_checker):
|
98 |
try:
|
99 |
+
# Normalize apostrophes in the entire text
|
100 |
+
text = normalize_apostrophes(text)
|
101 |
+
|
102 |
# Split text into words
|
103 |
words = text.split()
|
104 |
|
105 |
+
# Process words
|
106 |
+
misspelled = set()
|
107 |
for word in words:
|
108 |
+
# Normalize apostrophes in the word
|
109 |
+
word = normalize_apostrophes(word)
|
110 |
+
|
111 |
# Remove surrounding punctuation but keep internal apostrophes
|
112 |
cleaned = re.sub(r'^[^\w\']+|[^\w\']+$', '', word)
|
113 |
+
|
114 |
+
# Skip empty strings
|
115 |
+
if not cleaned:
|
116 |
+
continue
|
117 |
+
|
118 |
+
# Skip if the word is in our contractions list
|
119 |
+
if cleaned.lower() in CONTRACTIONS:
|
120 |
+
continue
|
121 |
+
|
122 |
+
# Skip special cases
|
123 |
+
if (cleaned.isdigit() or # Skip numbers
|
124 |
+
any(char.isdigit() for char in cleaned) or # Skip words with numbers
|
125 |
+
cleaned.startswith('@') or # Skip mentions
|
126 |
+
cleaned.startswith('#') or # Skip hashtags
|
127 |
+
cleaned.startswith('http') or # Skip URLs
|
128 |
+
cleaned.isupper() or # Skip acronyms
|
129 |
+
len(cleaned) <= 1): # Skip single letters
|
130 |
+
continue
|
131 |
+
|
132 |
+
# Check if word is misspelled
|
133 |
+
if cleaned.lower() not in spell_checker.word_frequency:
|
134 |
+
misspelled.add(cleaned)
|
135 |
|
136 |
if misspelled:
|
137 |
corrections = []
|
|
|
141 |
if candidates:
|
142 |
# Take up to 3 suggestions
|
143 |
suggestions = list(candidates)[:3]
|
144 |
+
# Only include if we have valid suggestions
|
145 |
+
if any(sugg.lower() != word.lower() for sugg in suggestions):
|
146 |
+
corrections.append(f"'{word}' -> suggestions: {', '.join(suggestions)}")
|
147 |
|
148 |
if corrections:
|
149 |
return {
|