Ozgur Unlu committed
Commit · 72d5e40 · 1 Parent(s): f73d159
made some changes to grammar check, bias check, news check

- app.py +42 -45
- news_checker.py +53 -128
- requirements.txt +1 -1
app.py CHANGED
@@ -2,29 +2,29 @@ import gradio as gr
 import torch
 from transformers import (
     AutoTokenizer,
-    AutoModelForSequenceClassification,
-    AutoModelForSeq2SeqLM
+    AutoModelForSequenceClassification
 )
 import os
 from pdf_generator import ReportGenerator
 from news_checker import NewsChecker
 from dotenv import load_dotenv
+import language_tool_python  # For spell checking
 
 load_dotenv()
 
 # Initialize models and tokenizers
 def load_models():
-    # Hate speech detection model
-    …
-    …
+    # Hate speech and bias detection model
+    model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
+    hate_tokenizer = AutoTokenizer.from_pretrained(model_name)
+    hate_model = AutoModelForSequenceClassification.from_pretrained(model_name)
 
-    # …
-    …
-    grammar_model = AutoModelForSeq2SeqLM.from_pretrained("vennify/t5-base-grammar-correction")
+    # Initialize spell checker
+    spell_tool = language_tool_python.LanguageTool('en-US')
 
     return {
         'hate_speech': (hate_model, hate_tokenizer),
-        '…
+        'spell_check': spell_tool
     }
 
 # Initialize news checker
@@ -41,61 +41,58 @@ def check_text_length(text):
         'message': 'Text length is within limits'
     }
 
-def …
+def check_hate_speech_and_bias(text, model, tokenizer):
     try:
         inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
         outputs = model(**inputs)
         predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
 
-        # …
+        # Adjusted thresholds and messages for both hate speech and bias
         if predictions[0][1].item() > 0.3:
             return {
                 'status': 'fail',
-                'message': '…
+                'message': 'Content contains potential hate speech or strong bias'
             }
         elif predictions[0][1].item() > 0.1:
             return {
                 'status': 'warning',
-                'message': '…
+                'message': 'Content may contain subtle bias or potentially offensive language'
             }
         return {
             'status': 'pass',
-            'message': 'No hate speech detected'
+            'message': 'No significant bias or hate speech detected'
         }
     except Exception as e:
         return {
             'status': 'error',
-            'message': f'Error in hate speech detection: {str(e)}'
+            'message': f'Error in hate speech/bias detection: {str(e)}'
         }
 
-def …
+def check_spelling(text, spell_tool):
     try:
-        …
-        …
-        corrected = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        matches = spell_tool.check(text)
+        spelling_errors = []
 
+        for match in matches:
+            if match.ruleId in ['MORFOLOGIK_RULE_EN_US', 'TYPOS']:  # Only check spelling errors
+                error_word = text[match.offset:match.offset + match.errorLength]
+                suggestions = match.replacements[:3]  # Limit to top 3 suggestions
+                if suggestions:
+                    spelling_errors.append(f"'{error_word}' -> suggestions: {', '.join(suggestions)}")
 
-        if …
+        if spelling_errors:
             return {
                 'status': 'warning',
-                'message': …
+                'message': 'Misspelled words found:\n' + '\n'.join(spelling_errors)
             }
         return {
             'status': 'pass',
-            'message': 'No …
+            'message': 'No spelling errors detected'
         }
     except Exception as e:
         return {
             'status': 'error',
-            'message': f'Error in …
+            'message': f'Error in spell check: {str(e)}'
         }
 
 def analyze_content(text):
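
The new check_spelling keeps only LanguageTool matches whose rule IDs indicate spelling (MORFOLOGIK_RULE_EN_US, TYPOS) rather than grammar or style. A minimal standalone sketch of that flow, with a made-up sample sentence:

import language_tool_python

# Starts a local LanguageTool server (downloaded on first use; needs Java)
tool = language_tool_python.LanguageTool('en-US')
text = "This sentense has a typo."  # hypothetical sample input

for match in tool.check(text):
    # Keep only spelling rules, as the diff does
    if match.ruleId in ('MORFOLOGIK_RULE_EN_US', 'TYPOS'):
        word = text[match.offset:match.offset + match.errorLength]
        print(word, '->', match.replacements[:3])  # top 3 suggestions

tool.close()  # shut down the background server
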
@@ -120,15 +117,15 @@ def analyze_content(text):
         report_path = report_gen.save_report()
         return results, report_path
 
-        # 2. Hate Speech Check
-        hate_result = …
-        results['Hate Speech Check'] = hate_result
-        report_gen.add_check_result("Hate Speech Check", hate_result['status'], hate_result['message'])
+        # 2. Hate Speech / Involuntary Bias Check
+        hate_result = check_hate_speech_and_bias(text, models['hate_speech'][0], models['hate_speech'][1])
+        results['Hate Speech / Involuntary Bias Check'] = hate_result
+        report_gen.add_check_result("Hate Speech / Involuntary Bias Check", hate_result['status'], hate_result['message'])
 
-        # 3. …
-        …
-        results['…
-        report_gen.add_check_result("…
+        # 3. Spelling Check
+        spell_result = check_spelling(text, models['spell_check'])
+        results['Spelling Check'] = spell_result
+        report_gen.add_check_result("Spelling Check", spell_result['status'], spell_result['message'])
 
         # 4. News Context Check
         if os.getenv('NEWS_API_KEY'):
@@ -149,8 +146,8 @@ def analyze_content(text):
         print(f"Error in analyze_content: {str(e)}")
         return {
             'Length Check': {'status': 'error', 'message': 'Analysis failed'},
-            'Hate Speech Check': {'status': 'error', 'message': 'Analysis failed'},
-            '…
+            'Hate Speech / Involuntary Bias Check': {'status': 'error', 'message': 'Analysis failed'},
+            'Spelling Check': {'status': 'error', 'message': 'Analysis failed'},
             'Current Events Context': {'status': 'error', 'message': 'Analysis failed'}
         }, None
 
@@ -209,9 +206,9 @@ def create_interface():
     - Analysis may take up to 2 minutes
     - Results include checks for:
       - Text length
-      - Hate speech and bias
-      - …
-      - …
+      - Hate speech and involuntary bias
+      - Spelling
+      - Negative news context
     """)
 
     return interface
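For reference, the two softmax cutoffs in check_hate_speech_and_bias amount to a three-way gate. A sketch, assuming index 1 of the model output is the hate/bias class (which is what the diff's predictions[0][1] reads); the sample copy is invented:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

name = "facebook/roberta-hate-speech-dynabench-r4-target"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name)

inputs = tokenizer("Our new gadget ships this week!",  # hypothetical ad copy
                   return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
    score = torch.softmax(model(**inputs).logits, dim=-1)[0][1].item()

# Same cutoffs as the diff: fail above 0.3, warn above 0.1, otherwise pass
status = 'fail' if score > 0.3 else 'warning' if score > 0.1 else 'pass'
print(f"hate/bias score={score:.3f} -> {status}")
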
news_checker.py CHANGED
@@ -3,11 +3,7 @@ from newsapi import NewsApiClient
 from dotenv import load_dotenv
 import pandas as pd
 from datetime import datetime, timedelta
-import nltk
-from nltk.tokenize import word_tokenize
-from nltk.tag import pos_tag
-from nltk.chunk import ne_chunk
-from collections import Counter
+from transformers import pipeline
 
 load_dotenv()
 
@@ -21,66 +17,24 @@ class NewsChecker:
 
         try:
             self.newsapi = NewsApiClient(api_key=self.api_key)
-            # …
-            nltk.download('punkt', quiet=True)
-            nltk.download('averaged_perceptron_tagger', quiet=True)
-            nltk.download('maxent_ne_chunker', quiet=True)
-            nltk.download('words', quiet=True)
+            # Initialize sentiment analyzer
+            self.sentiment_analyzer = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')
         except Exception as e:
-            print(f"Error initializing …")
+            print(f"Error initializing clients: {str(e)}")
 
-    def extract_keywords(self, text, max_keywords=…):
-        """…"""
-        try:
-            # …
-            …
-
-            # …
-            …
-
-            # Extract nouns and adjectives (excluding common words)
-            common_words = {'new', 'great', 'good', 'best', 'better', 'more', 'most',
-                            'today', 'now', 'get', 'our', 'your', 'their', 'this', 'that',
-                            'these', 'those', 'here', 'there', 'when', 'where', 'who',
-                            'what', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
-                            'more', 'most', 'other', 'some', 'such', 'only', 'own',
-                            'same', 'than', 'too', 'very', 'can', 'will', 'just', 'should',
-                            'features', 'feature', 'offers', 'offer', 'price', 'prices'}
-
-            important_words = []
-            for word, tag in tagged:
-                # NN* for nouns, JJ* for adjectives
-                if (tag.startswith('NN') or tag.startswith('JJ')) and \
-                   word.lower() not in common_words and \
-                   len(word) > 2:
-                    important_words.append(word.lower())
-
-            # Combine named entities and important words, count frequencies
-            all_keywords = named_entities + important_words
-            keyword_freq = Counter(all_keywords)
-
-            # Get most common keywords
-            main_keywords = [word for word, count in keyword_freq.most_common(max_keywords)]
-
-            # If no keywords found, return None to trigger general news search
-            if not main_keywords:
-                return None
-
-            # Create search query
-            search_query = ' OR '.join(f'"{kw}"' for kw in main_keywords)
-            print(f"Generated search query: {search_query}")
-            return search_query
-
-        except Exception as e:
-            print(f"Error in keyword extraction: {str(e)}")
-            return None
-
-    def get_recent_news(self, search_query=None):
+    def is_negative_news(self, title, description):
+        """Check if the news article has negative sentiment"""
+        try:
+            # Combine title and description for better context
+            text = f"{title} {description}"
+            result = self.sentiment_analyzer(text)[0]
+
+            # Return True if sentiment is negative
+            return result['label'] == 'NEGATIVE' and result['score'] > 0.7
+        except:
+            return False
+
+    def get_recent_news(self, marketing_text):
         if not self.api_key:
             print("Cannot fetch news: No API key configured")
             return pd.DataFrame()
@@ -88,95 +42,66 @@ class NewsChecker:
         try:
             # Get news from the last 7 days
             week_ago = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')
-            articles = []
 
-            # …
-            top_headlines = self.newsapi.get_top_headlines(
+            # Create a simple search query from marketing text
+            # Take the first 5 significant words
+            words = [word for word in marketing_text.lower().split()
+                     if len(word) > 3 and not word.startswith(('http', 'www'))][:5]
+            search_query = ' OR '.join(words) if words else 'news'
+
+            response = self.newsapi.get_everything(
+                q=search_query,
+                from_param=week_ago,
                 language='en',
-                …
+                sort_by='relevancy',
+                page_size=30
             )
-            if top_headlines['status'] == 'ok':
-                articles.extend(top_headlines['articles'])
-
-            # If we have specific keywords, search for related news
-            if search_query:
-                everything = self.newsapi.get_everything(
-                    q=search_query,
-                    from_param=week_ago,
-                    language='en',
-                    sort_by='relevancy',
-                    page_size=15  # More articles for specific searches
-                )
-                if everything['status'] == 'ok':
-                    articles.extend(everything['articles'])
 
-            …
-            …
-                    continue
-
-                # Skip duplicate titles
-                if title in seen_titles:
-                    continue
-
-                news_data.append({
-                    'title': title,
-                    'description': desc
-                })
-                seen_titles.add(title)
+            if response['status'] == 'ok':
+                # Filter for negative news only
+                negative_news = []
+                for article in response['articles']:
+                    if article['title'] and article['description']:
+                        if self.is_negative_news(article['title'], article['description']):
+                            negative_news.append({
+                                'title': article['title'],
+                                'description': article['description']
+                            })
 
-            return pd.DataFrame(news_data)
+                print(f"Found {len(negative_news)} negative news articles")
+                return pd.DataFrame(negative_news)
+
+            return pd.DataFrame()
 
         except Exception as e:
             print(f"Error fetching news: {str(e)}")
             return pd.DataFrame()
 
     def check_content_against_news(self, marketing_text):
-        …
-        search_query = self.extract_keywords(marketing_text)
-        print(f"Using search query: {search_query}")
-
-        # Get news articles
-        news_df = self.get_recent_news(search_query)
+        news_df = self.get_recent_news(marketing_text)
         if news_df.empty:
             return {
-                'status': '…
-                'message': '…
+                'status': 'pass',
+                'message': 'No relevant negative news found.'
             }
 
-        # …
-        marketing_words = set(…
-        …
+        # Simple word matching for relevance
+        marketing_words = set(marketing_text.lower().split())
+        relevant_negative_news = []
 
         for _, row in news_df.iterrows():
-            title_words = set(…
-            desc_words = set(…
-
-            # Calculate overlap ratios
-            title_overlap = len(marketing_words.intersection(title_words)) / len(title_words)
-            desc_overlap = len(marketing_words.intersection(desc_words)) / len(desc_words)
-
-            # Flag if significant overlap found
-            if title_overlap > 0.3 or desc_overlap > 0.25:  # Adjusted thresholds
-                potential_conflicts.append(row['title'])
+            title_words = set(row['title'].lower().split())
+            if len(marketing_words.intersection(title_words)) >= 2:
+                relevant_negative_news.append(row['title'])
 
-        if potential_conflicts:
+        if relevant_negative_news:
             return {
                 'status': 'warning',
-                'message': '…' +
-                           '\n- '.join(…) +
-                           ('\n\nAnd more...' if len(potential_conflicts) > 3 else '')
+                'message': 'Found relevant negative news that might impact your marketing:\n- ' +
+                           '\n- '.join(relevant_negative_news[:3])
             }
 
         return {
             'status': 'pass',
-            'message': 'No …
+            'message': 'No relevant negative news found.'
         }
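
Taken together, the rewritten news check reduces to two gates: an SST-2 sentiment score above 0.7 on the NEGATIVE label, and at least two words shared between the marketing copy and a headline. A standalone sketch of that pipeline; both strings are invented:

from transformers import pipeline

analyzer = pipeline('sentiment-analysis',
                    model='distilbert-base-uncased-finetuned-sst-2-english')

marketing_text = "Fast electric scooters for city commuting"             # hypothetical copy
headline = "Electric scooters banned after wave of commuting accidents"  # hypothetical headline

# Gate 1: strongly negative sentiment, as in is_negative_news()
result = analyzer(headline)[0]
is_negative = result['label'] == 'NEGATIVE' and result['score'] > 0.7

# Gate 2: >= 2 shared words, as in check_content_against_news()
shared = set(marketing_text.lower().split()) & set(headline.lower().split())
is_relevant = len(shared) >= 2

if is_negative and is_relevant:
    print(f"warning: relevant negative headline: {headline}")
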
requirements.txt CHANGED
@@ -8,4 +8,4 @@ pandas==2.1.4
 numpy==1.24.3
 requests==2.31.0
 python-dotenv==1.0.0
-…
+language-tool-python==2.7.1
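
One operational note on the new pin: language-tool-python runs LanguageTool locally, downloading it on first use, and it requires a Java runtime on the host. If Java is unavailable in the deployment image (an assumption about the environment, not something this commit addresses), the package also ships a client for the public LanguageTool API, which may serve as a rate-limited fallback:

import language_tool_python

# Remote, rate-limited public API; no local Java/LanguageTool install needed
tool = language_tool_python.LanguageToolPublicAPI('en-US')
print(len(tool.check("This sentense has a typo.")), "issue(s) found")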