Ozgur Unlu committed on
Commit
d618fb6
·
1 Parent(s): c1e6e68

more fixes

Browse files
Files changed (1) hide show
  1. app.py +48 -19
app.py CHANGED
@@ -14,6 +14,15 @@ import re
14
  load_dotenv()
15
 
16
  CONTRACTIONS = {
 
 
 
 
 
 
 
 
 
17
  "ain’t", "aren’t", "can’t", "couldn’t", "didn’t", "doesn’t", "don’t", "hadn’t",
18
  "hasn’t", "haven’t", "he’d", "he’ll", "he’s", "i’d", "i’ll", "i’m", "i’ve",
19
  "isn’t", "let’s", "mightn’t", "mustn’t", "shan’t", "she’d", "she’ll", "she’s",
@@ -78,33 +87,51 @@ def check_hate_speech_and_bias(text, model, tokenizer):
78
  'status': 'error',
79
  'message': f'Error in hate speech/bias detection: {str(e)}'
80
  }
 
 
 
 
 
 
81
 
82
  def check_spelling(text, spell_checker):
83
  try:
 
 
 
84
  # Split text into words
85
  words = text.split()
86
 
87
- # Process words while preserving contractions and special cases
88
- clean_words = []
89
  for word in words:
 
 
 
90
  # Remove surrounding punctuation but keep internal apostrophes
91
  cleaned = re.sub(r'^[^\w\']+|[^\w\']+$', '', word)
92
- if cleaned:
93
- clean_words.append(cleaned)
94
-
95
- # Find misspelled words, excluding contractions and special cases
96
- misspelled = set()
97
- for word in clean_words:
98
- if (word.lower() not in CONTRACTIONS and # Skip known contractions
99
- not word.isdigit() and # Skip numbers
100
- not any(char.isdigit() for char in word) and # Skip words with numbers
101
- not word.startswith('@') and # Skip mentions
102
- not word.startswith('#') and # Skip hashtags
103
- not word.startswith('http') and # Skip URLs
104
- not word.isupper() and # Skip acronyms
105
- len(word) > 1 and # Skip single letters
106
- word.lower() not in spell_checker.word_frequency): # Check if word is in dictionary
107
- misspelled.add(word)
 
 
 
 
 
 
108
 
109
  if misspelled:
110
  corrections = []
@@ -114,7 +141,9 @@ def check_spelling(text, spell_checker):
114
  if candidates:
115
  # Take up to 3 suggestions
116
  suggestions = list(candidates)[:3]
117
- corrections.append(f"'{word}' -> suggestions: {', '.join(suggestions)}")
 
 
118
 
119
  if corrections:
120
  return {
 
14
  load_dotenv()
15
 
16
  CONTRACTIONS = {
17
+ # With straight apostrophe
18
+ "ain't", "aren't", "can't", "couldn't", "didn't", "doesn't", "don't", "hadn't",
19
+ "hasn't", "haven't", "he'd", "he'll", "he's", "i'd", "i'll", "i'm", "i've",
20
+ "isn't", "let's", "mightn't", "mustn't", "shan't", "she'd", "she'll", "she's",
21
+ "shouldn't", "that's", "there's", "they'd", "they'll", "they're", "they've",
22
+ "we'd", "we're", "we've", "weren't", "what'll", "what're", "what's", "what've",
23
+ "where's", "who'd", "who'll", "who're", "who's", "who've", "won't", "wouldn't",
24
+ "you'd", "you'll", "you're", "you've",
25
+ # With curly apostrophe
26
  "ain’t", "aren’t", "can’t", "couldn’t", "didn’t", "doesn’t", "don’t", "hadn’t",
27
  "hasn’t", "haven’t", "he’d", "he’ll", "he’s", "i’d", "i’ll", "i’m", "i’ve",
28
  "isn’t", "let’s", "mightn’t", "mustn’t", "shan’t", "she’d", "she’ll", "she’s",
 
87
  'status': 'error',
88
  'message': f'Error in hate speech/bias detection: {str(e)}'
89
  }
90
+
91
+ def normalize_apostrophes(text):
92
+ """Normalize different types of apostrophes and quotes to standard straight apostrophe"""
93
+ # Replace various types of apostrophes and quotes with standard straight apostrophe
94
+ return text.replace('\u2018', "'").replace('\u2019', "'").replace('`', "'").replace('´', "'")
95
+
96
 
97
  def check_spelling(text, spell_checker):
98
  try:
99
+ # Normalize apostrophes in the entire text
100
+ text = normalize_apostrophes(text)
101
+
102
  # Split text into words
103
  words = text.split()
104
 
105
+ # Process words
106
+ misspelled = set()
107
  for word in words:
108
+ # Normalize apostrophes in the word
109
+ word = normalize_apostrophes(word)
110
+
111
  # Remove surrounding punctuation but keep internal apostrophes
112
  cleaned = re.sub(r'^[^\w\']+|[^\w\']+$', '', word)
113
+
114
+ # Skip empty strings
115
+ if not cleaned:
116
+ continue
117
+
118
+ # Skip if the word is in our contractions list
119
+ if cleaned.lower() in CONTRACTIONS:
120
+ continue
121
+
122
+ # Skip special cases
123
+ if (cleaned.isdigit() or # Skip numbers
124
+ any(char.isdigit() for char in cleaned) or # Skip words with numbers
125
+ cleaned.startswith('@') or # Skip mentions
126
+ cleaned.startswith('#') or # Skip hashtags
127
+ cleaned.startswith('http') or # Skip URLs
128
+ cleaned.isupper() or # Skip acronyms
129
+ len(cleaned) <= 1): # Skip single letters
130
+ continue
131
+
132
+ # Check if word is misspelled
133
+ if cleaned.lower() not in spell_checker.word_frequency:
134
+ misspelled.add(cleaned)
135
 
136
  if misspelled:
137
  corrections = []
 
141
  if candidates:
142
  # Take up to 3 suggestions
143
  suggestions = list(candidates)[:3]
144
+ # Only include if we have valid suggestions
145
+ if any(sugg.lower() != word.lower() for sugg in suggestions):
146
+ corrections.append(f"'{word}' -> suggestions: {', '.join(suggestions)}")
147
 
148
  if corrections:
149
  return {