Ozgur Unlu committed
Commit 72d5e40 · 1 Parent(s): f73d159

made some changes to grammar check, bias check, news check

Files changed (3):
  1. app.py +42 -45
  2. news_checker.py +53 -128
  3. requirements.txt +1 -1
app.py CHANGED
@@ -2,29 +2,29 @@ import gradio as gr
 import torch
 from transformers import (
     AutoTokenizer,
-    AutoModelForSequenceClassification,
-    AutoModelForSeq2SeqLM
+    AutoModelForSequenceClassification
 )
 import os
 from pdf_generator import ReportGenerator
 from news_checker import NewsChecker
 from dotenv import load_dotenv
+import language_tool_python  # For spell checking

 load_dotenv()

 # Initialize models and tokenizers
 def load_models():
-    # Hate speech detection model
-    hate_tokenizer = AutoTokenizer.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target")
-    hate_model = AutoModelForSequenceClassification.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target")
-
-    # Grammar check model
-    grammar_tokenizer = AutoTokenizer.from_pretrained("vennify/t5-base-grammar-correction")
-    grammar_model = AutoModelForSeq2SeqLM.from_pretrained("vennify/t5-base-grammar-correction")
+    # Hate speech and bias detection model
+    model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
+    hate_tokenizer = AutoTokenizer.from_pretrained(model_name)
+    hate_model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+    # Initialize spell checker
+    spell_tool = language_tool_python.LanguageTool('en-US')

     return {
         'hate_speech': (hate_model, hate_tokenizer),
-        'grammar': (grammar_model, grammar_tokenizer)
+        'spell_check': spell_tool
     }

 # Initialize news checker
@@ -41,61 +41,58 @@ def check_text_length(text):
         'message': 'Text length is within limits'
     }

-def check_hate_speech(text, model, tokenizer):
+def check_hate_speech_and_bias(text, model, tokenizer):
     try:
         inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
         outputs = model(**inputs)
         predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

-        # Threshold for hate speech detection
-        if predictions[0][1].item() > 0.3:  # Adjusted threshold
+        # Adjusted thresholds and messages for both hate speech and bias
+        if predictions[0][1].item() > 0.3:
             return {
                 'status': 'fail',
-                'message': 'Potential hate speech detected'
+                'message': 'Content contains potential hate speech or strong bias'
             }
         elif predictions[0][1].item() > 0.1:
             return {
                 'status': 'warning',
-                'message': 'Some concerning language detected - please review'
+                'message': 'Content may contain subtle bias or potentially offensive language'
             }
         return {
             'status': 'pass',
-            'message': 'No hate speech detected'
+            'message': 'No significant bias or hate speech detected'
         }
     except Exception as e:
         return {
             'status': 'error',
-            'message': f'Error in hate speech detection: {str(e)}'
+            'message': f'Error in hate speech/bias detection: {str(e)}'
         }

-def check_grammar(text, model, tokenizer):
+def check_spelling(text, spell_tool):
     try:
-        input_text = f"grammar: {text}"
-        encoding = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
-
-        outputs = model.generate(
-            input_ids=encoding.input_ids,
-            attention_mask=encoding.attention_mask,
-            max_length=512,
-            num_beams=5,
-            num_return_sequences=1
-        )
-
-        corrected = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-        if corrected.lower().strip() != text.lower().strip():
+        matches = spell_tool.check(text)
+        spelling_errors = []
+
+        for match in matches:
+            if match.ruleId in ['MORFOLOGIK_RULE_EN_US', 'TYPOS']:  # Only check spelling errors
+                error_word = text[match.offset:match.offset + match.errorLength]
+                suggestions = match.replacements[:3]  # Limit to top 3 suggestions
+                if suggestions:
+                    spelling_errors.append(f"'{error_word}' -> suggestions: {', '.join(suggestions)}")
+
+        if spelling_errors:
             return {
                 'status': 'warning',
-                'message': f'Suggested corrections:\n{corrected}'
+                'message': 'Misspelled words found:\n' + '\n'.join(spelling_errors)
             }
         return {
             'status': 'pass',
-            'message': 'No grammar issues detected'
+            'message': 'No spelling errors detected'
         }
     except Exception as e:
         return {
             'status': 'error',
-            'message': f'Error in grammar check: {str(e)}'
+            'message': f'Error in spell check: {str(e)}'
         }

 def analyze_content(text):
@@ -120,15 +117,15 @@ def analyze_content(text):
             report_path = report_gen.save_report()
             return results, report_path

-        # 2. Hate Speech Check
-        hate_result = check_hate_speech(text, models['hate_speech'][0], models['hate_speech'][1])
-        results['Hate Speech Check'] = hate_result
-        report_gen.add_check_result("Hate Speech Check", hate_result['status'], hate_result['message'])
+        # 2. Hate Speech / Involuntary Bias Check
+        hate_result = check_hate_speech_and_bias(text, models['hate_speech'][0], models['hate_speech'][1])
+        results['Hate Speech / Involuntary Bias Check'] = hate_result
+        report_gen.add_check_result("Hate Speech / Involuntary Bias Check", hate_result['status'], hate_result['message'])

-        # 3. Grammar Check
-        grammar_result = check_grammar(text, models['grammar'][0], models['grammar'][1])
-        results['Grammar Check'] = grammar_result
-        report_gen.add_check_result("Grammar Check", grammar_result['status'], grammar_result['message'])
+        # 3. Spelling Check
+        spell_result = check_spelling(text, models['spell_check'])
+        results['Spelling Check'] = spell_result
+        report_gen.add_check_result("Spelling Check", spell_result['status'], spell_result['message'])

         # 4. News Context Check
         if os.getenv('NEWS_API_KEY'):
@@ -149,8 +146,8 @@ def analyze_content(text):
         print(f"Error in analyze_content: {str(e)}")
         return {
             'Length Check': {'status': 'error', 'message': 'Analysis failed'},
-            'Hate Speech Check': {'status': 'error', 'message': 'Analysis failed'},
-            'Grammar Check': {'status': 'error', 'message': 'Analysis failed'},
+            'Hate Speech / Involuntary Bias Check': {'status': 'error', 'message': 'Analysis failed'},
+            'Spelling Check': {'status': 'error', 'message': 'Analysis failed'},
             'Current Events Context': {'status': 'error', 'message': 'Analysis failed'}
         }, None

@@ -209,9 +206,9 @@ def create_interface():
     - Analysis may take up to 2 minutes
     - Results include checks for:
       - Text length
-      - Hate speech and bias
-      - Grammar
-      - Current events context
+      - Hate speech and involuntary bias
+      - Spelling
+      - Negative news context
     """)

     return interface
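
Reviewer note: a minimal standalone sketch of the new spell-check path, assuming language-tool-python 2.7.1 is installed (it downloads the LanguageTool server and needs a Java runtime on first use). The sample text and variable names are hypothetical, not part of the commit:

# Sketch only: exercises the same LanguageTool calls as check_spelling() above.
import language_tool_python

tool = language_tool_python.LanguageTool('en-US')
text = "Our new prodct delivers unparalelled performance."  # hypothetical sample

for match in tool.check(text):
    # Same rule filter as app.py: keep spelling hits, skip grammar/style rules
    if match.ruleId in ['MORFOLOGIK_RULE_EN_US', 'TYPOS']:
        word = text[match.offset:match.offset + match.errorLength]
        print(f"'{word}' -> suggestions: {', '.join(match.replacements[:3])}")

tool.close()  # stops the background LanguageTool server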
news_checker.py CHANGED
@@ -3,11 +3,7 @@ from newsapi import NewsApiClient
 from dotenv import load_dotenv
 import pandas as pd
 from datetime import datetime, timedelta
-import nltk
-from nltk.tokenize import word_tokenize
-from nltk.tag import pos_tag
-from nltk.chunk import ne_chunk
-from collections import Counter
+from transformers import pipeline

 load_dotenv()

@@ -21,66 +17,24 @@ class NewsChecker:

         try:
             self.newsapi = NewsApiClient(api_key=self.api_key)
-            # Download required NLTK data
-            nltk.download('punkt', quiet=True)
-            nltk.download('averaged_perceptron_tagger', quiet=True)
-            nltk.download('maxent_ne_chunker', quiet=True)
-            nltk.download('words', quiet=True)
+            # Initialize sentiment analyzer
+            self.sentiment_analyzer = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')
         except Exception as e:
-            print(f"Error initializing NewsAPI client: {str(e)}")
+            print(f"Error initializing clients: {str(e)}")

-    def extract_keywords(self, text, max_keywords=3):
-        """Extract meaningful keywords from text using NLP techniques"""
+    def is_negative_news(self, title, description):
+        """Check if the news article has negative sentiment"""
         try:
-            # Tokenize and tag parts of speech
-            tokens = word_tokenize(text)
-            tagged = pos_tag(tokens)
-
-            # Extract named entities
-            named_entities = []
-            chunks = ne_chunk(tagged)
-            for chunk in chunks:
-                if hasattr(chunk, 'label'):
-                    named_entities.append(' '.join(c[0] for c in chunk))
-
-            # Extract nouns and adjectives (excluding common words)
-            common_words = {'new', 'great', 'good', 'best', 'better', 'more', 'most',
-                            'today', 'now', 'get', 'our', 'your', 'their', 'this', 'that',
-                            'these', 'those', 'here', 'there', 'when', 'where', 'who',
-                            'what', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
-                            'more', 'most', 'other', 'some', 'such', 'only', 'own',
-                            'same', 'than', 'too', 'very', 'can', 'will', 'just', 'should',
-                            'features', 'feature', 'offers', 'offer', 'price', 'prices'}
-
-            important_words = []
-            for word, tag in tagged:
-                # NN* for nouns, JJ* for adjectives
-                if (tag.startswith('NN') or tag.startswith('JJ')) and \
-                   word.lower() not in common_words and \
-                   len(word) > 2:
-                    important_words.append(word.lower())
-
-            # Combine named entities and important words, count frequencies
-            all_keywords = named_entities + important_words
-            keyword_freq = Counter(all_keywords)
-
-            # Get most common keywords
-            main_keywords = [word for word, count in keyword_freq.most_common(max_keywords)]
-
-            # If no keywords found, return None to trigger general news search
-            if not main_keywords:
-                return None
-
-            # Create search query
-            search_query = ' OR '.join(f'"{kw}"' for kw in main_keywords)
-            print(f"Generated search query: {search_query}")
-            return search_query
-
-        except Exception as e:
-            print(f"Error in keyword extraction: {str(e)}")
-            return None
-
-    def get_recent_news(self, search_query=None):
+            # Combine title and description for better context
+            text = f"{title} {description}"
+            result = self.sentiment_analyzer(text)[0]
+
+            # Return True if sentiment is negative
+            return result['label'] == 'NEGATIVE' and result['score'] > 0.7
+        except:
+            return False
+
+    def get_recent_news(self, marketing_text):
         if not self.api_key:
             print("Cannot fetch news: No API key configured")
             return pd.DataFrame()
@@ -88,95 +42,66 @@ class NewsChecker:
         try:
             # Get news from the last 7 days
             week_ago = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
-            articles = []

-            # Get top headlines first (major news events)
-            top_headlines = self.newsapi.get_top_headlines(
+            # Create a simple search query from marketing text
+            # Take the first 5 significant words
+            words = [word for word in marketing_text.lower().split()
+                     if len(word) > 3 and not word.startswith(('http', 'www'))][:5]
+            search_query = ' OR '.join(words) if words else 'news'
+
+            response = self.newsapi.get_everything(
+                q=search_query,
+                from_param=week_ago,
                 language='en',
-                page_size=10  # Limit to top 10 headlines
+                sort_by='relevancy',
+                page_size=30
             )
-            if top_headlines['status'] == 'ok':
-                articles.extend(top_headlines['articles'])
-
-            # If we have specific keywords, search for related news
-            if search_query:
-                everything = self.newsapi.get_everything(
-                    q=search_query,
-                    from_param=week_ago,
-                    language='en',
-                    sort_by='relevancy',
-                    page_size=15  # More articles for specific searches
-                )
-                if everything['status'] == 'ok':
-                    articles.extend(everything['articles'])

-            # Extract and clean article data
-            news_data = []
-            seen_titles = set()  # To avoid duplicates
-
-            for article in articles:
-                title = article.get('title', '').strip()
-                desc = article.get('description', '').strip()
+            if response['status'] == 'ok':
+                # Filter for negative news only
+                negative_news = []
+                for article in response['articles']:
+                    if article['title'] and article['description']:
+                        if self.is_negative_news(article['title'], article['description']):
+                            negative_news.append({
+                                'title': article['title'],
+                                'description': article['description']
+                            })

-                # Skip articles without title or description
-                if not title or not desc:
-                    continue
-
-                # Skip duplicate titles
-                if title in seen_titles:
-                    continue
-
-                news_data.append({
-                    'title': title,
-                    'description': desc
-                })
-                seen_titles.add(title)
+                print(f"Found {len(negative_news)} negative news articles")
+                return pd.DataFrame(negative_news)

-            print(f"Successfully fetched {len(news_data)} unique articles")
-            return pd.DataFrame(news_data)
+            return pd.DataFrame()

         except Exception as e:
             print(f"Error fetching news: {str(e)}")
             return pd.DataFrame()

     def check_content_against_news(self, marketing_text):
-        # Extract meaningful keywords from marketing text
-        search_query = self.extract_keywords(marketing_text)
-        print(f"Using search query: {search_query}")
-
-        # Get news articles
-        news_df = self.get_recent_news(search_query)
+        news_df = self.get_recent_news(marketing_text)
         if news_df.empty:
             return {
-                'status': 'warning',
-                'message': 'Unable to check against current news context. Proceed with caution.'
+                'status': 'pass',
+                'message': 'No relevant negative news found.'
             }

-        # Prepare marketing text for comparison
-        marketing_words = set(word.lower() for word in word_tokenize(marketing_text))
-        potential_conflicts = []
+        # Simple word matching for relevance
+        marketing_words = set(marketing_text.lower().split())
+        relevant_negative_news = []

         for _, row in news_df.iterrows():
-            title_words = set(word.lower() for word in word_tokenize(row['title']))
-            desc_words = set(word.lower() for word in word_tokenize(str(row['description'])))
-
-            # Calculate overlap ratios
-            title_overlap = len(marketing_words.intersection(title_words)) / len(title_words)
-            desc_overlap = len(marketing_words.intersection(desc_words)) / len(desc_words)
-
-            # Flag if significant overlap found
-            if title_overlap > 0.3 or desc_overlap > 0.25:  # Adjusted thresholds
-                potential_conflicts.append(row['title'])
+            title_words = set(row['title'].lower().split())
+            if len(marketing_words.intersection(title_words)) >= 2:
+                relevant_negative_news.append(row['title'])

-        if potential_conflicts:
+        if relevant_negative_news:
             return {
                 'status': 'warning',
-                'message': 'Potential conflicts found with current news:\n- ' +
-                           '\n- '.join(potential_conflicts[:3]) +
-                           ('\n\nAnd more...' if len(potential_conflicts) > 3 else '')
+                'message': 'Found relevant negative news that might impact your marketing:\n- ' +
+                           '\n- '.join(relevant_negative_news[:3])
             }

         return {
             'status': 'pass',
-            'message': 'No significant conflicts with current news found.'
+            'message': 'No relevant negative news found.'
         }
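
Reviewer note: a minimal sketch of the sentiment gate that replaces the NLTK keyword pipeline, using the same DistilBERT SST-2 checkpoint and 0.7 confidence threshold as is_negative_news() above. The headlines are hypothetical samples, not from the commit:

# Sketch only: same model and threshold as is_negative_news() above.
from transformers import pipeline

analyzer = pipeline('sentiment-analysis',
                    model='distilbert-base-uncased-finetuned-sst-2-english')

headlines = [  # hypothetical samples
    "Local startup raises record funding round",
    "Regulators investigate widespread product recalls",
]

for h in headlines:
    result = analyzer(h)[0]  # e.g. {'label': 'NEGATIVE', 'score': 0.99}
    flagged = result['label'] == 'NEGATIVE' and result['score'] > 0.7
    print(f"{h!r}: {result['label']} ({result['score']:.2f}) -> flagged={flagged}")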
requirements.txt CHANGED
@@ -8,4 +8,4 @@ pandas==2.1.4
 numpy==1.24.3
 requests==2.31.0
 python-dotenv==1.0.0
-nltk==3.8.1
+language-tool-python==2.7.1
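
Dependency note (an environment assumption, not stated in the commit): unlike nltk, language-tool-python runs a bundled LanguageTool server, so the first LanguageTool('en-US') call downloads it and requires a Java runtime on PATH. A quick smoke test:

# Sketch only: verifies the swapped-in dependency works end to end.
import language_tool_python

tool = language_tool_python.LanguageTool('en-US')
assert tool.check("This sentense has a typo.")  # expect at least one match
tool.close()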