sashtech committed on
Commit fdbab88 · verified · 1 Parent(s): 7b071b0

Update app.py

Files changed (1)
  1. app.py +99 -103
app.py CHANGED
@@ -3,56 +3,30 @@ import gradio as gr
  from transformers import pipeline
  import spacy
  import subprocess
- import json
  import nltk
- from nltk.corpus import wordnet, stopwords
+ from nltk.corpus import wordnet
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
  from spellchecker import SpellChecker
  import re
- import random
  import string
+ import random

- # Ensure necessary NLTK data is downloaded
- def download_nltk_resources():
-     try:
-         nltk.download('punkt')
-         nltk.download('stopwords')
-         nltk.download('averaged_perceptron_tagger')
-         nltk.download('averaged_perceptron_tagger_eng')
-         nltk.download('wordnet')
-         nltk.download('omw-1.4')
-         nltk.download('punkt_tab')
-
-     except Exception as e:
-         print(f"Error downloading NLTK resources: {e}")
-
- # Call the download function
- download_nltk_resources()
-
- top_words = set(stopwords.words("english"))
-
- # Path to the thesaurus file
- thesaurus_file_path = 'en_thesaurus.jsonl'  # Ensure the file path is correct
-
- # Function to load the thesaurus into a dictionary
- def load_thesaurus(file_path):
-     thesaurus_dict = {}
-     try:
-         with open(file_path, 'r', encoding='utf-8') as file:
-             for line in file:
-                 entry = json.loads(line.strip())
-                 word = entry.get("word")
-                 synonyms = entry.get("synonyms", [])
-                 if word:
-                     thesaurus_dict[word] = synonyms
-     except Exception as e:
-         print(f"Error loading thesaurus: {e}")
-
-     return thesaurus_dict
-
- # Load the thesaurus
- synonym_dict = load_thesaurus(thesaurus_file_path)
+ # Download necessary NLTK data
+ nltk.download('punkt')
+ nltk.download('stopwords')
+ nltk.download('averaged_perceptron_tagger')
+ nltk.download('averaged_perceptron_tagger_eng')
+
+ nltk.download('wordnet')
+ nltk.download('omw-1.4')
+ nltk.download('punkt_tab')
+
+
+ # Initialize stopwords
+ stop_words = set(stopwords.words("english"))

- # Words and POS tags we don't want to replace
+ # Words we don't want to replace
  exclude_tags = {'PRP', 'PRP$', 'MD', 'VBZ', 'VBP', 'VBD', 'VBG', 'VBN', 'TO', 'IN', 'DT', 'CC'}
  exclude_words = {'is', 'am', 'are', 'was', 'were', 'have', 'has', 'do', 'does', 'did', 'will', 'shall', 'should', 'would', 'could', 'can', 'may', 'might'}

@@ -69,44 +43,59 @@ except OSError:
      subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
      nlp = spacy.load("en_core_web_sm")

- # Function to predict the label and score for English text (AI Detection)
- def predict_en(text):
-     try:
-         res = pipeline_en(text)[0]
-         return res['label'], res['score']
-     except Exception as e:
-         return f"Error during AI detection: {e}"
-
- # Function to remove plagiarism
- def plagiarism_remover(word):
-     if word.lower() in top_words or word.lower() in exclude_words or word in string.punctuation:
-         return word
-
-     # Check for synonyms in the custom thesaurus
-     synonyms = synonym_dict.get(word.lower(), set())
-
-     # If no synonyms found in the custom thesaurus, use WordNet
-     if not synonyms:
+ def plagiarism_removal(text):
+     def plagiarism_remover(word):
+         # Handle stopwords, punctuation, and excluded words
+         if word.lower() in stop_words or word.lower() in exclude_words or word in string.punctuation:
+             return word
+
+         # Find synonyms
+         synonyms = set()
          for syn in wordnet.synsets(word):
              for lemma in syn.lemmas():
+                 # Exclude overly technical synonyms or words with underscores
                  if "_" not in lemma.name() and lemma.name().isalpha() and lemma.name().lower() != word.lower():
                      synonyms.add(lemma.name())

-     pos_tag_word = nltk.pos_tag([word])[0]
-
-     if pos_tag_word[1] in exclude_tags:
-         return word
-
-     filtered_synonyms = [syn for syn in synonyms if nltk.pos_tag([syn])[0][1] == pos_tag_word[1]]
-
-     if not filtered_synonyms:
-         return word
-
-     synonym_choice = random.choice(filtered_synonyms)
-
-     if word.istitle():
-         return synonym_choice.title()
-     return synonym_choice
+         # Get part of speech for word and filter synonyms with the same POS
+         pos_tag_word = nltk.pos_tag([word])[0]
+
+         # Avoid replacing certain parts of speech
+         if pos_tag_word[1] in exclude_tags:
+             return word
+
+         filtered_synonyms = [syn for syn in synonyms if nltk.pos_tag([syn])[0][1] == pos_tag_word[1]]
+
+         # Return original word if no appropriate synonyms found
+         if not filtered_synonyms:
+             return word
+
+         # Select a random synonym from the filtered list
+         synonym_choice = random.choice(filtered_synonyms)
+
+         # Retain original capitalization
+         if word.istitle():
+             return synonym_choice.title()
+         return synonym_choice
+
+     # Tokenize, replace words, and join them back
+     para_split = word_tokenize(text)
+     final_text = [plagiarism_remover(word) for word in para_split]
+
+     # Handle spacing around punctuation correctly
+     corrected_text = []
+     for i in range(len(final_text)):
+         if final_text[i] in string.punctuation and i > 0:
+             corrected_text[-1] += final_text[i]  # Append punctuation to previous word
+         else:
+             corrected_text.append(final_text[i])
+
+     return " ".join(corrected_text)
+
+ # Function to predict the label and score for English text (AI Detection)
+ def predict_en(text):
+     res = pipeline_en(text)[0]
+     return res['label'], res['score']

  # Function to remove redundant and meaningless words
  def remove_redundant_words(text):
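A minimal sketch (not part of the commit) of how the new plagiarism_removal behaves, assuming app.py's definitions are in scope and the NLTK corpora downloaded above are present:

    import random
    random.seed(0)  # synonym choice is random; seed only to make one run repeatable

    sentence = "The scientists observed the results carefully."
    print(plagiarism_removal(sentence))
    # Stopwords, punctuation, and excluded POS tags pass through unchanged;
    # other words may be swapped for a same-POS WordNet synonym, so the
    # exact output varies from run to run.

Tokenizing with word_tokenize and re-attaching punctuation in the final loop is what lets this version operate on whole paragraphs rather than single words.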
@@ -117,6 +106,7 @@ def remove_redundant_words(text):

  # Function to fix spacing before punctuation
  def fix_punctuation_spacing(text):
+     # Split the text into words and punctuation
      words = text.split(' ')
      cleaned_words = []
      punctuation_marks = {',', '.', "'", '!', '?', ':'}
@@ -132,7 +122,8 @@ def fix_punctuation_spacing(text):

  # Function to fix possessives like "Earth's"
  def fix_possessives(text):
-     return re.sub(r'(\w)\s\'\s?s', r"\1's", text)
+     text = re.sub(r'(\w)\s\'\s?s', r"\1's", text)
+     return text

  # Function to capitalize the first letter of sentences and proper nouns
  def capitalize_sentences_and_nouns(text):
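An illustrative check (not in the commit) of the possessive pattern: r'(\w)\s\'\s?s' matches a word character, a space, an apostrophe, an optional space, and a trailing "s", then rejoins the pieces as "'s":

    print(fix_possessives("Earth ' s atmosphere"))  # -> Earth's atmosphere
    print(fix_possessives("Earth 's atmosphere"))   # -> Earth's atmosphere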
@@ -214,38 +205,43 @@ def correct_spelling(text):
      corrected_words = []
      for word in words:
          corrected_word = spell.correction(word)
-         corrected_words.append(corrected_word if corrected_word is not None else word)
+         if corrected_word is not None:
+             corrected_words.append(corrected_word)
+         else:
+             corrected_words.append(word)
      return ' '.join(corrected_words)

- # Main processing function for paraphrasing and grammar correction
+ # Main function for paraphrasing and grammar correction
  def paraphrase_and_correct(text):
+     # Add synonym replacement here
      cleaned_text = remove_redundant_words(text)
-     cleaned_text = fix_punctuation_spacing(cleaned_text)
-     cleaned_text = fix_possessives(cleaned_text)
-     cleaned_text = capitalize_sentences_and_nouns(cleaned_text)
-     cleaned_text = force_first_letter_capital(cleaned_text)
-     cleaned_text = correct_tense_errors(cleaned_text)
-     cleaned_text = correct_article_errors(cleaned_text)
-     cleaned_text = ensure_subject_verb_agreement(cleaned_text)
-     cleaned_text = correct_spelling(cleaned_text)
-     plag_removed = plagiarism_remover(cleaned_text)
-     return plag_removed
-
- # Create the Gradio interface
+     plag_removed = plagiarism_removal(cleaned_text)
+     paraphrased_text = capitalize_sentences_and_nouns(plag_removed)
+     paraphrased_text = force_first_letter_capital(paraphrased_text)
+     paraphrased_text = correct_article_errors(paraphrased_text)
+     paraphrased_text = correct_tense_errors(paraphrased_text)
+     paraphrased_text = ensure_subject_verb_agreement(paraphrased_text)
+     paraphrased_text = fix_possessives(paraphrased_text)
+     paraphrased_text = correct_spelling(paraphrased_text)
+     paraphrased_text = fix_punctuation_spacing(paraphrased_text)
+
+     return paraphrased_text
+
+ # Gradio app setup
  with gr.Blocks() as demo:
-     gr.Markdown("# AI Text Processor")
-
      with gr.Tab("AI Detection"):
-         t1 = gr.Textbox(lines=5, label='Input Text')
-         btn1 = gr.Button("Detect AI")
-         out1 = gr.Textbox(label='Prediction', interactive=False)
-         out2 = gr.Textbox(label='Confidence', interactive=False)
-         btn1.click(fn=predict_en, inputs=t1, outputs=[out1, out2])
-
-     with gr.Tab("Paraphrasing and Grammar Correction"):
-         t2 = gr.Textbox(lines=5, label='Input Text')
-         btn2 = gr.Button("Process Text")
-         out3 = gr.Textbox(label='Processed Text', interactive=False)
-         btn2.click(fn=paraphrase_and_correct, inputs=t2, outputs=out3)
-
- demo.launch()
+         t1 = gr.Textbox(lines=5, label='Text')
+         button1 = gr.Button("🤖 Predict!")
+         label1 = gr.Textbox(lines=1, label='Predicted Label 🎃')
+         score1 = gr.Textbox(lines=1, label='Prob')
+
+         button1.click(fn=predict_en, inputs=t1, outputs=[label1, score1])
+
+     with gr.Tab("Paraphrasing & Grammar Correction"):
+         t2 = gr.Textbox(lines=5, label='Enter text for paraphrasing and grammar correction')
+         button2 = gr.Button("🔄 Paraphrase and Correct")
+         result2 = gr.Textbox(lines=5, label='Corrected Text')
+
+         button2.click(fn=paraphrase_and_correct, inputs=t2, outputs=result2)
+
+ demo.launch(share=True)
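A rough end-to-end sketch (not from the commit) of the reordered pipeline, assuming it runs where app.py's definitions are in scope and the required models and corpora are installed:

    sample = "the earth ' s rotation dont stop ."
    print(paraphrase_and_correct(sample))
    # Passes run in the new order: redundant-word removal, synonym replacement
    # (plagiarism_removal), capitalization, article/tense/agreement fixes,
    # possessive repair, spell-check, and punctuation spacing last.

The reordering also fixes a bug: the old version passed the entire cleaned string to the word-level plagiarism_remover as its final step, whereas plagiarism_removal now tokenizes first and runs before the grammar and spelling passes.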