Spaces:

ozgurunlu
/

m-check

Sleeping

App Files Files Community

Ozgur Unlu committed on Nov 6, 2024

Commit

d618fb6

1 Parent(s): c1e6e68

more fixes

Browse files

Files changed (1) hide show

app.py +48 -19

app.py CHANGED Viewed

@@ -14,6 +14,15 @@ import re
 load_dotenv()
 CONTRACTIONS = {
     "ain’t", "aren’t", "can’t", "couldn’t", "didn’t", "doesn’t", "don’t", "hadn’t",
     "hasn’t", "haven’t", "he’d", "he’ll", "he’s", "i’d", "i’ll", "i’m", "i’ve",
     "isn’t", "let’s", "mightn’t", "mustn’t", "shan’t", "she’d", "she’ll", "she’s",
@@ -78,33 +87,51 @@ def check_hate_speech_and_bias(text, model, tokenizer):
             'status': 'error',
             'message': f'Error in hate speech/bias detection: {str(e)}'
         }
 def check_spelling(text, spell_checker):
     try:
         # Split text into words
         words = text.split()
-        # Process words while preserving contractions and special cases
-        clean_words = []
         for word in words:
             # Remove surrounding punctuation but keep internal apostrophes
             cleaned = re.sub(r'^[^\w\']+|[^\w\']+$', '', word)
-            if cleaned:
-                clean_words.append(cleaned)
-        # Find misspelled words, excluding contractions and special cases
-        misspelled = set()
-        for word in clean_words:
-            if (word.lower() not in CONTRACTIONS and  # Skip known contractions
-                not word.isdigit() and               # Skip numbers
-                not any(char.isdigit() for char in word) and  # Skip words with numbers
-                not word.startswith('@') and         # Skip mentions
-                not word.startswith('#') and         # Skip hashtags
-                not word.startswith('http') and      # Skip URLs
-                not word.isupper() and              # Skip acronyms
-                len(word) > 1 and                   # Skip single letters
-                word.lower() not in spell_checker.word_frequency):  # Check if word is in dictionary
-                misspelled.add(word)
         if misspelled:
             corrections = []
@@ -114,7 +141,9 @@ def check_spelling(text, spell_checker):
                 if candidates:
                     # Take up to 3 suggestions
                     suggestions = list(candidates)[:3]
-                    corrections.append(f"'{word}' -> suggestions: {', '.join(suggestions)}")
             if corrections:
                 return {

 load_dotenv()
 CONTRACTIONS = {
+    # With straight apostrophe
+    "ain't", "aren't", "can't", "couldn't", "didn't", "doesn't", "don't", "hadn't",
+    "hasn't", "haven't", "he'd", "he'll", "he's", "i'd", "i'll", "i'm", "i've",
+    "isn't", "let's", "mightn't", "mustn't", "shan't", "she'd", "she'll", "she's",
+    "shouldn't", "that's", "there's", "they'd", "they'll", "they're", "they've",
+    "we'd", "we're", "we've", "weren't", "what'll", "what're", "what's", "what've",
+    "where's", "who'd", "who'll", "who're", "who's", "who've", "won't", "wouldn't",
+    "you'd", "you'll", "you're", "you've",
+    # With curly apostrophe
     "ain’t", "aren’t", "can’t", "couldn’t", "didn’t", "doesn’t", "don’t", "hadn’t",
     "hasn’t", "haven’t", "he’d", "he’ll", "he’s", "i’d", "i’ll", "i’m", "i’ve",
     "isn’t", "let’s", "mightn’t", "mustn’t", "shan’t", "she’d", "she’ll", "she’s",
             'status': 'error',
             'message': f'Error in hate speech/bias detection: {str(e)}'
         }
+def normalize_apostrophes(text):
+    """Normalize different types of apostrophes and quotes to standard straight apostrophe"""
+    # Replace various types of apostrophes and quotes with standard straight apostrophe
+    return text.replace('’', "'").replace('‘', "'").replace('`', "'").replace('´', "'")
 def check_spelling(text, spell_checker):
     try:
+        # Normalize apostrophes in the entire text
+        text = normalize_apostrophes(text)
         # Split text into words
         words = text.split()
+        # Process words
+        misspelled = set()
         for word in words:
+            # Normalize apostrophes in the word
+            word = normalize_apostrophes(word)
             # Remove surrounding punctuation but keep internal apostrophes
             cleaned = re.sub(r'^[^\w\']+|[^\w\']+$', '', word)
+            # Skip empty strings
+            if not cleaned:
+                continue
+            # Skip if the word is in our contractions list
+            if cleaned.lower() in CONTRACTIONS:
+                continue
+            # Skip special cases
+            if (cleaned.isdigit() or               # Skip numbers
+                any(char.isdigit() for char in cleaned) or  # Skip words with numbers
+                cleaned.startswith('@') or         # Skip mentions
+                cleaned.startswith('#') or         # Skip hashtags
+                cleaned.startswith('http') or      # Skip URLs
+                cleaned.isupper() or              # Skip acronyms
+                len(cleaned) <= 1):               # Skip single letters
+                continue
+            # Check if word is misspelled
+            if cleaned.lower() not in spell_checker.word_frequency:
+                misspelled.add(cleaned)
         if misspelled:
             corrections = []
                 if candidates:
                     # Take up to 3 suggestions
                     suggestions = list(candidates)[:3]
+                    # Only include if we have valid suggestions
+                    if any(sugg.lower() != word.lower() for sugg in suggestions):
+                        corrections.append(f"'{word}' -> suggestions: {', '.join(suggestions)}")
             if corrections:
                 return {