Ozgur Unlu committed
Commit 72d5e40 · 1 Parent(s): f73d159

made some changes to grammar check, bias check, news check

Files changed (3):
  1. app.py +42 -45
  2. news_checker.py +53 -128
  3. requirements.txt +1 -1
app.py CHANGED
@@ -2,29 +2,29 @@ import gradio as gr
 import torch
 from transformers import (
     AutoTokenizer,
-    AutoModelForSequenceClassification,
-    AutoModelForSeq2SeqLM
+    AutoModelForSequenceClassification
 )
 import os
 from pdf_generator import ReportGenerator
 from news_checker import NewsChecker
 from dotenv import load_dotenv
+import language_tool_python  # For spell checking

 load_dotenv()

 # Initialize models and tokenizers
 def load_models():
-    # Hate speech detection model
-    hate_tokenizer = AutoTokenizer.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target")
-    hate_model = AutoModelForSequenceClassification.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target")
-
-    # Grammar check model
-    grammar_tokenizer = AutoTokenizer.from_pretrained("vennify/t5-base-grammar-correction")
-    grammar_model = AutoModelForSeq2SeqLM.from_pretrained("vennify/t5-base-grammar-correction")
+    # Hate speech and bias detection model
+    model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
+    hate_tokenizer = AutoTokenizer.from_pretrained(model_name)
+    hate_model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+    # Initialize spell checker
+    spell_tool = language_tool_python.LanguageTool('en-US')

     return {
         'hate_speech': (hate_model, hate_tokenizer),
-        'grammar': (grammar_model, grammar_tokenizer)
+        'spell_check': spell_tool
     }

 # Initialize news checker
@@ -41,61 +41,58 @@ def check_text_length(text):
         'message': 'Text length is within limits'
     }

-def check_hate_speech(text, model, tokenizer):
+def check_hate_speech_and_bias(text, model, tokenizer):
     try:
         inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
         outputs = model(**inputs)
         predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

-        # Threshold for hate speech detection
-        if predictions[0][1].item() > 0.3:  # Adjusted threshold
+        # Adjusted thresholds and messages for both hate speech and bias
+        if predictions[0][1].item() > 0.3:
             return {
                 'status': 'fail',
-                'message': 'Potential hate speech detected'
+                'message': 'Content contains potential hate speech or strong bias'
             }
         elif predictions[0][1].item() > 0.1:
             return {
                 'status': 'warning',
-                'message': 'Some concerning language detected - please review'
+                'message': 'Content may contain subtle bias or potentially offensive language'
             }
         return {
             'status': 'pass',
-            'message': 'No hate speech detected'
+            'message': 'No significant bias or hate speech detected'
         }
     except Exception as e:
         return {
             'status': 'error',
-            'message': f'Error in hate speech detection: {str(e)}'
+            'message': f'Error in hate speech/bias detection: {str(e)}'
         }

-def check_grammar(text, model, tokenizer):
+def check_spelling(text, spell_tool):
     try:
-        input_text = f"grammar: {text}"
-        encoding = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
-
-        outputs = model.generate(
-            input_ids=encoding.input_ids,
-            attention_mask=encoding.attention_mask,
-            max_length=512,
-            num_beams=5,
-            num_return_sequences=1
-        )
-
-        corrected = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-        if corrected.lower().strip() != text.lower().strip():
+        matches = spell_tool.check(text)
+        spelling_errors = []
+
+        for match in matches:
+            if match.ruleId in ['MORFOLOGIK_RULE_EN_US', 'TYPOS']:  # Only check spelling errors
+                error_word = text[match.offset:match.offset + match.errorLength]
+                suggestions = match.replacements[:3]  # Limit to top 3 suggestions
+                if suggestions:
+                    spelling_errors.append(f"'{error_word}' -> suggestions: {', '.join(suggestions)}")
+
+        if spelling_errors:
             return {
                 'status': 'warning',
-                'message': f'Suggested corrections:\n{corrected}'
+                'message': 'Misspelled words found:\n' + '\n'.join(spelling_errors)
             }
         return {
             'status': 'pass',
-            'message': 'No grammar issues detected'
+            'message': 'No spelling errors detected'
         }
     except Exception as e:
         return {
             'status': 'error',
-            'message': f'Error in grammar check: {str(e)}'
+            'message': f'Error in spell check: {str(e)}'
         }

 def analyze_content(text):
@@ -120,15 +117,15 @@ def analyze_content(text):
             report_path = report_gen.save_report()
             return results, report_path

-        # 2. Hate Speech Check
-        hate_result = check_hate_speech(text, models['hate_speech'][0], models['hate_speech'][1])
-        results['Hate Speech Check'] = hate_result
-        report_gen.add_check_result("Hate Speech Check", hate_result['status'], hate_result['message'])
+        # 2. Hate Speech / Involuntary Bias Check
+        hate_result = check_hate_speech_and_bias(text, models['hate_speech'][0], models['hate_speech'][1])
+        results['Hate Speech / Involuntary Bias Check'] = hate_result
+        report_gen.add_check_result("Hate Speech / Involuntary Bias Check", hate_result['status'], hate_result['message'])

-        # 3. Grammar Check
-        grammar_result = check_grammar(text, models['grammar'][0], models['grammar'][1])
-        results['Grammar Check'] = grammar_result
-        report_gen.add_check_result("Grammar Check", grammar_result['status'], grammar_result['message'])
+        # 3. Spelling Check
+        spell_result = check_spelling(text, models['spell_check'])
+        results['Spelling Check'] = spell_result
+        report_gen.add_check_result("Spelling Check", spell_result['status'], spell_result['message'])

         # 4. News Context Check
         if os.getenv('NEWS_API_KEY'):
@@ -149,8 +146,8 @@ def analyze_content(text):
         print(f"Error in analyze_content: {str(e)}")
         return {
             'Length Check': {'status': 'error', 'message': 'Analysis failed'},
-            'Hate Speech Check': {'status': 'error', 'message': 'Analysis failed'},
-            'Grammar Check': {'status': 'error', 'message': 'Analysis failed'},
+            'Hate Speech / Involuntary Bias Check': {'status': 'error', 'message': 'Analysis failed'},
+            'Spelling Check': {'status': 'error', 'message': 'Analysis failed'},
             'Current Events Context': {'status': 'error', 'message': 'Analysis failed'}
         }, None

@@ -209,9 +206,9 @@ def create_interface():
     - Analysis may take up to 2 minutes
     - Results include checks for:
       - Text length
-      - Hate speech and bias
-      - Grammar
-      - Current events context
+      - Hate speech and involuntary bias
+      - Spelling
+      - Negative news context
     """)

     return interface
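
Reviewer note: a minimal standalone sketch of the new spell-check path, assuming language-tool-python 2.7.1 is installed (it downloads the LanguageTool server and needs a Java runtime on first use). The sample text and variable names are hypothetical, not part of the commit:

# Sketch only: exercises the same LanguageTool calls as check_spelling() above.
import language_tool_python

tool = language_tool_python.LanguageTool('en-US')
text = "Our new prodct delivers unparalelled performance."  # hypothetical sample

for match in tool.check(text):
    # Same rule filter as app.py: keep spelling hits, skip grammar/style rules
    if match.ruleId in ['MORFOLOGIK_RULE_EN_US', 'TYPOS']:
        word = text[match.offset:match.offset + match.errorLength]
        print(f"'{word}' -> suggestions: {', '.join(match.replacements[:3])}")

tool.close()  # stops the background LanguageTool server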
news_checker.py CHANGED
@@ -3,11 +3,7 @@ from newsapi import NewsApiClient
 from dotenv import load_dotenv
 import pandas as pd
 from datetime import datetime, timedelta
-import nltk
-from nltk.tokenize import word_tokenize
-from nltk.tag import pos_tag
-from nltk.chunk import ne_chunk
-from collections import Counter
+from transformers import pipeline

 load_dotenv()

@@ -21,66 +17,24 @@ class NewsChecker:

         try:
             self.newsapi = NewsApiClient(api_key=self.api_key)
-            # Download required NLTK data
-            nltk.download('punkt', quiet=True)
-            nltk.download('averaged_perceptron_tagger', quiet=True)
-            nltk.download('maxent_ne_chunker', quiet=True)
-            nltk.download('words', quiet=True)
+            # Initialize sentiment analyzer
+            self.sentiment_analyzer = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')
         except Exception as e:
-            print(f"Error initializing NewsAPI client: {str(e)}")
+            print(f"Error initializing clients: {str(e)}")

-    def extract_keywords(self, text, max_keywords=3):
-        """Extract meaningful keywords from text using NLP techniques"""
+    def is_negative_news(self, title, description):
+        """Check if the news article has negative sentiment"""
         try:
-            # Tokenize and tag parts of speech
-            tokens = word_tokenize(text)
-            tagged = pos_tag(tokens)
-
-            # Extract named entities
-            named_entities = []
-            chunks = ne_chunk(tagged)
-            for chunk in chunks:
-                if hasattr(chunk, 'label'):
-                    named_entities.append(' '.join(c[0] for c in chunk))
-
-            # Extract nouns and adjectives (excluding common words)
-            common_words = {'new', 'great', 'good', 'best', 'better', 'more', 'most',
-                            'today', 'now', 'get', 'our', 'your', 'their', 'this', 'that',
-                            'these', 'those', 'here', 'there', 'when', 'where', 'who',
-                            'what', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
-                            'more', 'most', 'other', 'some', 'such', 'only', 'own',
-                            'same', 'than', 'too', 'very', 'can', 'will', 'just', 'should',
-                            'features', 'feature', 'offers', 'offer', 'price', 'prices'}
-
-            important_words = []
-            for word, tag in tagged:
-                # NN* for nouns, JJ* for adjectives
-                if (tag.startswith('NN') or tag.startswith('JJ')) and \
-                   word.lower() not in common_words and \
-                   len(word) > 2:
-                    important_words.append(word.lower())
-
-            # Combine named entities and important words, count frequencies
-            all_keywords = named_entities + important_words
-            keyword_freq = Counter(all_keywords)
-
-            # Get most common keywords
-            main_keywords = [word for word, count in keyword_freq.most_common(max_keywords)]
-
-            # If no keywords found, return None to trigger general news search
-            if not main_keywords:
-                return None
-
-            # Create search query
-            search_query = ' OR '.join(f'"{kw}"' for kw in main_keywords)
-            print(f"Generated search query: {search_query}")
-            return search_query
-
-        except Exception as e:
-            print(f"Error in keyword extraction: {str(e)}")
-            return None
-
-    def get_recent_news(self, search_query=None):
+            # Combine title and description for better context
+            text = f"{title} {description}"
+            result = self.sentiment_analyzer(text)[0]
+
+            # Return True if sentiment is negative
+            return result['label'] == 'NEGATIVE' and result['score'] > 0.7
+        except:
+            return False
+
+    def get_recent_news(self, marketing_text):
         if not self.api_key:
             print("Cannot fetch news: No API key configured")
             return pd.DataFrame()
@@ -88,95 +42,66 @@ class NewsChecker:
         try:
             # Get news from the last 7 days
             week_ago = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
-            articles = []

-            # Get top headlines first (major news events)
-            top_headlines = self.newsapi.get_top_headlines(
+            # Create a simple search query from marketing text
+            # Take the first 5 significant words
+            words = [word for word in marketing_text.lower().split()
+                     if len(word) > 3 and not word.startswith(('http', 'www'))][:5]
+            search_query = ' OR '.join(words) if words else 'news'
+
+            response = self.newsapi.get_everything(
+                q=search_query,
+                from_param=week_ago,
                 language='en',
-                page_size=10  # Limit to top 10 headlines
+                sort_by='relevancy',
+                page_size=30
             )
-            if top_headlines['status'] == 'ok':
-                articles.extend(top_headlines['articles'])
-
-            # If we have specific keywords, search for related news
-            if search_query:
-                everything = self.newsapi.get_everything(
-                    q=search_query,
-                    from_param=week_ago,
-                    language='en',
-                    sort_by='relevancy',
-                    page_size=15  # More articles for specific searches
-                )
-                if everything['status'] == 'ok':
-                    articles.extend(everything['articles'])

-            # Extract and clean article data
-            news_data = []
-            seen_titles = set()  # To avoid duplicates
-
-            for article in articles:
-                title = article.get('title', '').strip()
-                desc = article.get('description', '').strip()
+            if response['status'] == 'ok':
+                # Filter for negative news only
+                negative_news = []
+                for article in response['articles']:
+                    if article['title'] and article['description']:
+                        if self.is_negative_news(article['title'], article['description']):
+                            negative_news.append({
+                                'title': article['title'],
+                                'description': article['description']
+                            })

-                # Skip articles without title or description
-                if not title or not desc:
-                    continue
-
-                # Skip duplicate titles
-                if title in seen_titles:
-                    continue
-
-                news_data.append({
-                    'title': title,
-                    'description': desc
-                })
-                seen_titles.add(title)
+                print(f"Found {len(negative_news)} negative news articles")
+                return pd.DataFrame(negative_news)

-            print(f"Successfully fetched {len(news_data)} unique articles")
-            return pd.DataFrame(news_data)
+            return pd.DataFrame()

         except Exception as e:
             print(f"Error fetching news: {str(e)}")
             return pd.DataFrame()

     def check_content_against_news(self, marketing_text):
-        # Extract meaningful keywords from marketing text
-        search_query = self.extract_keywords(marketing_text)
-        print(f"Using search query: {search_query}")
-
-        # Get news articles
-        news_df = self.get_recent_news(search_query)
+        news_df = self.get_recent_news(marketing_text)
         if news_df.empty:
             return {
-                'status': 'warning',
-                'message': 'Unable to check against current news context. Proceed with caution.'
+                'status': 'pass',
+                'message': 'No relevant negative news found.'
             }

-        # Prepare marketing text for comparison
-        marketing_words = set(word.lower() for word in word_tokenize(marketing_text))
-        potential_conflicts = []
+        # Simple word matching for relevance
+        marketing_words = set(marketing_text.lower().split())
+        relevant_negative_news = []

         for _, row in news_df.iterrows():
-            title_words = set(word.lower() for word in word_tokenize(row['title']))
-            desc_words = set(word.lower() for word in word_tokenize(str(row['description'])))
-
-            # Calculate overlap ratios
-            title_overlap = len(marketing_words.intersection(title_words)) / len(title_words)
-            desc_overlap = len(marketing_words.intersection(desc_words)) / len(desc_words)
-
-            # Flag if significant overlap found
-            if title_overlap > 0.3 or desc_overlap > 0.25:  # Adjusted thresholds
-                potential_conflicts.append(row['title'])
+            title_words = set(row['title'].lower().split())
+            if len(marketing_words.intersection(title_words)) >= 2:
+                relevant_negative_news.append(row['title'])

-        if potential_conflicts:
+        if relevant_negative_news:
             return {
                 'status': 'warning',
-                'message': 'Potential conflicts found with current news:\n- ' +
-                           '\n- '.join(potential_conflicts[:3]) +
-                           ('\n\nAnd more...' if len(potential_conflicts) > 3 else '')
+                'message': 'Found relevant negative news that might impact your marketing:\n- ' +
+                           '\n- '.join(relevant_negative_news[:3])
             }

         return {
             'status': 'pass',
-            'message': 'No significant conflicts with current news found.'
+            'message': 'No relevant negative news found.'
         }
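
Reviewer note: a minimal sketch of the sentiment gate that replaces the NLTK keyword pipeline, using the same DistilBERT SST-2 checkpoint and 0.7 confidence threshold as is_negative_news() above. The headlines are hypothetical samples, not from the commit:

# Sketch only: same model and threshold as is_negative_news() above.
from transformers import pipeline

analyzer = pipeline('sentiment-analysis',
                    model='distilbert-base-uncased-finetuned-sst-2-english')

headlines = [  # hypothetical samples
    "Local startup raises record funding round",
    "Regulators investigate widespread product recalls",
]

for h in headlines:
    result = analyzer(h)[0]  # e.g. {'label': 'NEGATIVE', 'score': 0.99}
    flagged = result['label'] == 'NEGATIVE' and result['score'] > 0.7
    print(f"{h!r}: {result['label']} ({result['score']:.2f}) -> flagged={flagged}")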
requirements.txt CHANGED
@@ -8,4 +8,4 @@ pandas==2.1.4
 numpy==1.24.3
 requests==2.31.0
 python-dotenv==1.0.0
-nltk==3.8.1
+language-tool-python==2.7.1
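
Dependency note (an environment assumption, not stated in the commit): unlike nltk, language-tool-python runs a bundled LanguageTool server, so the first LanguageTool('en-US') call downloads it and requires a Java runtime on PATH. A quick smoke test:

# Sketch only: verifies the swapped-in dependency works end to end.
import language_tool_python

tool = language_tool_python.LanguageTool('en-US')
assert tool.check("This sentense has a typo.")  # expect at least one match
tool.close()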