Spaces:

sashtech
/

aihumanifierandgrmoform

Sleeping

App Files Files Community

“[shujaatalishariati]” committed on Sep 4, 2024

Commit

116d721

1 Parent(s): 6eddba2

gramoformer

Browse files

Files changed (9) hide show

LICENSE +21 -0
__init__.py +0 -1
app.py +23 -123
gramformer.py +0 -126
gramformer/__init__.py +1 -0
gramformer/demo.py +30 -0
gramformer/gramformer.py +128 -0
requirements.txt +4 -9
setup.py +20 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2021 Prithivida
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- from .gramformer import Gramformer

app.py CHANGED Viewed

@@ -1,126 +1,26 @@
-import os
 import gradio as gr
-from transformers import pipeline
-import spacy
-import subprocess
-import nltk
-from nltk.corpus import wordnet
-import torch
 from gramformer import Gramformer
-# Initialize the English text classification pipeline for AI detection
-pipeline_en = pipeline(task="text-classification", model="Hello-SimpleAI/chatgpt-detector-roberta")
-# Initialize Gramformer
-gf = Gramformer(models=1, use_gpu=False)  # 1 = corrector
-# Function to predict the label and score for English text (AI Detection)
-def predict_en(text):
-    res = pipeline_en(text)[0]
-    return res['label'], res['score']
-# Ensure necessary NLTK data is downloaded for Humanifier
-nltk.download('wordnet')
-nltk.download('omw-1.4')
-# Ensure the SpaCy model is installed for Humanifier
-try:
-    nlp = spacy.load("en_core_web_sm")
-except OSError:
-    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
-    nlp = spacy.load("en_core_web_sm")
-# Function to get synonyms using NLTK WordNet (Humanifier)
-def get_synonyms_nltk(word, pos):
-    synsets = wordnet.synsets(word, pos=pos)
-    if synsets:
-        lemmas = synsets[0].lemmas()
-        return [lemma.name() for lemma in lemmas]
-    return []
-# Function to capitalize the first letter of sentences and proper nouns (Humanifier)
-def capitalize_sentences_and_nouns(text):
-    doc = nlp(text)
-    corrected_text = []
-    for sent in doc.sents:
-        sentence = []
-        for token in sent:
-            if token.i == sent.start:  # First word of the sentence
-                sentence.append(token.text.capitalize())
-            elif token.pos_ == "PROPN":  # Proper noun
-                sentence.append(token.text.capitalize())
-            else:
-                sentence.append(token.text)
-        corrected_text.append(' '.join(sentence))
-    return ' '.join(corrected_text)
-# Paraphrasing function using SpaCy and NLTK (Humanifier)
-def paraphrase_with_spacy_nltk(text):
-    doc = nlp(text)
-    paraphrased_words = []
-    for token in doc:
-        # Map SpaCy POS tags to WordNet POS tags
-        pos = None
-        if token.pos_ in {"NOUN"}:
-            pos = wordnet.NOUN
-        elif token.pos_ in {"VERB"}:
-            pos = wordnet.VERB
-        elif token.pos_ in {"ADJ"}:
-            pos = wordnet.ADJ
-        elif token.pos_ in {"ADV"}:
-            pos = wordnet.ADV
-        synonyms = get_synonyms_nltk(token.text.lower(), pos) if pos else []
-        # Replace with a synonym only if it makes sense
-        if synonyms and token.pos_ in {"NOUN", "VERB", "ADJ", "ADV"} and synonyms[0] != token.text.lower():
-            paraphrased_words.append(synonyms[0])
-        else:
-            paraphrased_words.append(token.text)
-    # Join the words back into a sentence
-    paraphrased_sentence = ' '.join(paraphrased_words)
-    # Capitalize sentences and proper nouns
-    corrected_text = capitalize_sentences_and_nouns(paraphrased_sentence)
-    return corrected_text
-# Combined function: Paraphrase -> Capitalization -> Grammar Correction (Humanifier)
-def paraphrase_correct_and_grammar(text):
-    # Step 1: Paraphrase the text
-    paraphrased_text = paraphrase_with_spacy_nltk(text)
-    # Step 2: Capitalize sentences and proper nouns
-    capitalized_text = capitalize_sentences_and_nouns(paraphrased_text)
-    # Step 3: Grammar correction using Gramformer
-    corrected_sentences = gf.correct(capitalized_text, max_candidates=1)
-    final_text = next(iter(corrected_sentences)) if corrected_sentences else capitalized_text
-    return final_text
-# Gradio app setup with two tabs
-with gr.Blocks() as demo:
-    with gr.Tab("AI Detection"):
-        t1 = gr.Textbox(lines=5, label='Text')
-        button1 = gr.Button("🤖 Predict!")
-        label1 = gr.Textbox(lines=1, label='Predicted Label 🎃')
-        score1 = gr.Textbox(lines=1, label='Prob')
-        # Connect the prediction function to the button
-        button1.click(predict_en, inputs=[t1], outputs=[label1, score1], api_name='predict_en')
-    with gr.Tab("Humanifier"):
-        text_input = gr.Textbox(lines=5, label="Input Text")
-        paraphrase_button = gr.Button("Paraphrase, Correct & Grammar Check")
-        output_text = gr.Textbox(label="Processed Text")
-        # Connect the paraphrasing and grammar correction function to the button
-        paraphrase_button.click(paraphrase_correct_and_grammar, inputs=text_input, outputs=output_text)
-# Launch the app with both functionalities
-demo.launch()

 import gradio as gr
 from gramformer import Gramformer
+# Load Gramformer once at import time: models=1 selects the correction model, use_gpu=False keeps it on the CPU
+gf = Gramformer(models=1, use_gpu=False)
+def correct_grammar(text):
+    # Correct the input text using Gramformer
+    corrected_sentences = gf.correct(text)
+    return " ".join(corrected_sentences)
+# Gradio Interface
+def main():
+    interface = gr.Interface(
+        fn=correct_grammar,
+        inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
+        outputs="text",
+        title="Grammar Correction App",
+        description="This app corrects grammar using the Gramformer model. Enter a sentence to correct its grammar.",
+    )
+    # Launch the Gradio interface
+    interface.launch()
+if __name__ == "__main__":
+    main()

gramformer.py DELETED Viewed

@@ -1,126 +0,0 @@
-import spacy.cli
-import errant
-class Gramformer:
-    def __init__(self, models=1, use_gpu=False):
-        from transformers import AutoTokenizer
-        from transformers import AutoModelForSeq2SeqLM
-        # Ensure the SpaCy model 'en_core_web_sm' is downloaded
-        spacy.cli.download("en_core_web_sm")
-        # Load the correct SpaCy model for errant
-        self.annotator = errant.load('en_core_web_sm')
-        if use_gpu:
-            device = "cuda:0"
-        else:
-            device = "cpu"
-        batch_size = 1
-        self.device = device
-        correction_model_tag = "prithivida/grammar_error_correcter_v1"
-        self.model_loaded = False
-        if models == 1:
-            self.correction_tokenizer = AutoTokenizer.from_pretrained(correction_model_tag, use_auth_token=False)
-            self.correction_model = AutoModelForSeq2SeqLM.from_pretrained(correction_model_tag, use_auth_token=False)
-            self.correction_model = self.correction_model.to(device)
-            self.model_loaded = True
-            print("[Gramformer] Grammar error correct/highlight model loaded..")
-        elif models == 2:
-            # TODO: Implement this part
-            print("TO BE IMPLEMENTED!!!")
-    def correct(self, input_sentence, max_candidates=1):
-        if self.model_loaded:
-            correction_prefix = "gec: "
-            input_sentence = correction_prefix + input_sentence
-            input_ids = self.correction_tokenizer.encode(input_sentence, return_tensors='pt')
-            input_ids = input_ids.to(self.device)
-            preds = self.correction_model.generate(
-                input_ids,
-                do_sample=True,
-                max_length=128,
-                num_beams=7,
-                early_stopping=True,
-                num_return_sequences=max_candidates
-            )
-            corrected = set()
-            for pred in preds:
-                corrected.add(self.correction_tokenizer.decode(pred, skip_special_tokens=True).strip())
-            return corrected
-        else:
-            print("Model is not loaded")
-            return None
-    def highlight(self, orig, cor):
-        edits = self._get_edits(orig, cor)
-        orig_tokens = orig.split()
-        ignore_indexes = []
-        for edit in edits:
-            edit_type = edit[0]
-            edit_str_start = edit[1]
-            edit_spos = edit[2]
-            edit_epos = edit[3]
-            edit_str_end = edit[4]
-            # if no_of_tokens(edit_str_start) > 1 ==> excluding the first token, mark all other tokens for deletion
-            for i in range(edit_spos + 1, edit_epos):
-                ignore_indexes.append(i)
-            if edit_str_start == "":
-                if edit_spos - 1 >= 0:
-                    new_edit_str = orig_tokens[edit_spos - 1]
-                    edit_spos -= 1
-                else:
-                    new_edit_str = orig_tokens[edit_spos + 1]
-                    edit_spos += 1
-                if edit_type == "PUNCT":
-                    st = f"<a type='{edit_type}' edit='{edit_str_end}'>{new_edit_str}</a>"
-                else:
-                    st = f"<a type='{edit_type}' edit='{new_edit_str} {edit_str_end}'>{new_edit_str}</a>"
-                orig_tokens[edit_spos] = st
-            elif edit_str_end == "":
-                st = f"<d type='{edit_type}' edit=''>{edit_str_start}</d>"
-                orig_tokens[edit_spos] = st
-            else:
-                st = f"<c type='{edit_type}' edit='{edit_str_end}'>{edit_str_start}</c>"
-                orig_tokens[edit_spos] = st
-        for i in sorted(ignore_indexes, reverse=True):
-            del orig_tokens[i]
-        return " ".join(orig_tokens)
-    def detect(self, input_sentence):
-        # TO BE IMPLEMENTED
-        pass
-    def _get_edits(self, orig, cor):
-        orig = self.annotator.parse(orig)
-        cor = self.annotator.parse(cor)
-        alignment = self.annotator.align(orig, cor)
-        edits = self.annotator.merge(alignment)
-        if len(edits) == 0:
-            return []
-        edit_annotations = []
-        for e in edits:
-            e = self.annotator.classify(e)
-            edit_annotations.append((e.type[2:], e.o_str, e.o_start, e.o_end,  e.c_str, e.c_start, e.c_end))
-        if len(edit_annotations) > 0:
-            return edit_annotations
-        else:
-            return []
-    def get_edits(self, orig, cor):
-        return self._get_edits(orig, cor)

gramformer/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from gramformer.gramformer import Gramformer

gramformer/demo.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from gramformer import Gramformer
+import torch
+def set_seed(seed):
+  torch.manual_seed(seed)
+  if torch.cuda.is_available():
+    torch.cuda.manual_seed_all(seed)
+set_seed(1212)
+gf = Gramformer(models = 1, use_gpu=False) # 1=corrector, 2=detector
+influent_sentences = [
+    "Matt like fish",
+    "the collection of letters was original used by the ancient Romans",
+    "We enjoys horror movies",
+    "Anna and Mike is going skiing",
+    "I walk to the store and I bought milk",
+    "We all eat the fish and then made dessert",
+    "I will eat fish for dinner and drank milk",
+    "what be the reason for everyone leave the company",
+]
+for influent_sentence in influent_sentences:
+    corrected_sentences = gf.correct(influent_sentence, max_candidates=1)
+    print("[Input] ", influent_sentence)
+    for corrected_sentence in corrected_sentences:
+      print("[Correction] ",corrected_sentence)
+    print("-" *100)

gramformer/gramformer.py ADDED Viewed

	@@ -0,0 +1,128 @@

+class Gramformer:
+  def __init__(self, models=1, use_gpu=False):
+    from transformers import AutoTokenizer
+    from transformers import AutoModelForSeq2SeqLM
+    #from lm_scorer.models.auto import AutoLMScorer as LMScorer
+    import errant
+    self.annotator = errant.load('en')
+    if use_gpu:
+        device= "cuda:0"
+    else:
+        device = "cpu"
+    batch_size = 1
+    #self.scorer = LMScorer.from_pretrained("gpt2", device=device, batch_size=batch_size)
+    self.device    = device
+    correction_model_tag = "prithivida/grammar_error_correcter_v1"
+    self.model_loaded = False
+    if models == 1:
+        self.correction_tokenizer = AutoTokenizer.from_pretrained(correction_model_tag, use_auth_token=False)
+        self.correction_model     = AutoModelForSeq2SeqLM.from_pretrained(correction_model_tag, use_auth_token=False)
+        self.correction_model     = self.correction_model.to(device)
+        self.model_loaded = True
+        print("[Gramformer] Grammar error correct/highlight model loaded..")
+    elif models == 2:
+        # TODO
+        print("TO BE IMPLEMENTED!!!")
+  def correct(self, input_sentence, max_candidates=1):
+      if self.model_loaded:
+        correction_prefix = "gec: "
+        input_sentence = correction_prefix + input_sentence
+        input_ids = self.correction_tokenizer.encode(input_sentence, return_tensors='pt')
+        input_ids = input_ids.to(self.device)
+        preds = self.correction_model.generate(
+            input_ids,
+            do_sample=True,
+            max_length=128,
+#             top_k=50,
+#             top_p=0.95,
+            num_beams=7,
+            early_stopping=True,
+            num_return_sequences=max_candidates)
+        corrected = set()
+        for pred in preds:
+          corrected.add(self.correction_tokenizer.decode(pred, skip_special_tokens=True).strip())
+        #corrected = list(corrected)
+        #scores = self.scorer.sentence_score(corrected, log=True)
+        #ranked_corrected = [(c,s) for c, s in zip(corrected, scores)]
+        #ranked_corrected.sort(key = lambda x:x[1], reverse=True)
+        return corrected
+      else:
+        print("Model is not loaded")
+        return None
+  def highlight(self, orig, cor):
+      edits = self._get_edits(orig, cor)
+      orig_tokens = orig.split()
+      ignore_indexes = []
+      for edit in edits:
+          edit_type = edit[0]
+          edit_str_start = edit[1]
+          edit_spos = edit[2]
+          edit_epos = edit[3]
+          edit_str_end = edit[4]
+          # if no_of_tokens(edit_str_start) > 1 ==> excluding the first token, mark all other tokens for deletion
+          for i in range(edit_spos+1, edit_epos):
+            ignore_indexes.append(i)
+          if edit_str_start == "":
+              if edit_spos - 1 >= 0:
+                  new_edit_str = orig_tokens[edit_spos - 1]
+                  edit_spos -= 1
+              else:
+                  new_edit_str = orig_tokens[edit_spos + 1]
+                  edit_spos += 1
+              if edit_type == "PUNCT":
+                st = "<a type='" + edit_type + "' edit='" + \
+                    edit_str_end + "'>" + new_edit_str + "</a>"
+              else:
+                st = "<a type='" + edit_type + "' edit='" + new_edit_str + \
+                    " " + edit_str_end + "'>" + new_edit_str + "</a>"
+              orig_tokens[edit_spos] = st
+          elif edit_str_end == "":
+            st = "<d type='" + edit_type + "' edit=''>" + edit_str_start + "</d>"
+            orig_tokens[edit_spos] = st
+          else:
+            st = "<c type='" + edit_type + "' edit='" + \
+                edit_str_end + "'>" + edit_str_start + "</c>"
+            orig_tokens[edit_spos] = st
+      for i in sorted(ignore_indexes, reverse=True):
+        del(orig_tokens[i])
+      return(" ".join(orig_tokens))
+  def detect(self, input_sentence):
+        # TO BE IMPLEMENTED
+        pass
+  def _get_edits(self, orig, cor):
+        orig = self.annotator.parse(orig)
+        cor = self.annotator.parse(cor)
+        alignment = self.annotator.align(orig, cor)
+        edits = self.annotator.merge(alignment)
+        if len(edits) == 0:
+            return []
+        edit_annotations = []
+        for e in edits:
+            e = self.annotator.classify(e)
+            edit_annotations.append((e.type[2:], e.o_str, e.o_start, e.o_end,  e.c_str, e.c_start, e.c_end))
+        if len(edit_annotations) > 0:
+            return edit_annotations
+        else:
+            return []
+  def get_edits(self, orig, cor):
+      return self._get_edits(orig, cor)

requirements.txt CHANGED Viewed

@@ -1,9 +1,4 @@
-gradio==3.50.2
-transformers==4.36.2
-spacy==3.5.3
-https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0.tar.gz
-nltk==3.8.1
-torch==2.1.2
-git+https://github.com/PrithivirajDamodaran/Gramformer.git
-typer==0.9.0
-click==8.0.4

+transformers
+torch
+gradio
+errant

setup.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import setuptools
+setuptools.setup(
+    name="gramformer",
+    version="1.0",
+    author="prithiviraj damodaran",
+    author_email="",
+    description="Gramformer",
+    long_description="A framework for detecting, highlighting and correcting grammatical errors on natural language text",
+    url="https://github.com/PrithivirajDamodaran/Gramformer.git",
+    packages=setuptools.find_packages(),
+    #install_requires=['transformers', 'sentencepiece==0.1.95', 'python-Levenshtein==0.12.2', 'fuzzywuzzy==0.18.0',  'tokenizers==0.10.2', 'fsspec==2021.5.0', 'lm-scorer==0.4.2', 'errant'],
+    install_requires=['transformers', 'sentencepiece', 'python-Levenshtein', 'fuzzywuzzy',  'tokenizers', 'fsspec', 'errant'],
+    classifiers=[
+        "Programming Language :: Python :: 3.7",
+        "License :: Apache 2.0",
+        "Operating System :: OS Independent",
+    ],
+)