fahmiaziz commited on
Commit
aeae383
·
verified ·
1 Parent(s): bc2c5d1
Files changed (7) hide show
  1. app.py +66 -0
  2. cleaned_text.py +58 -0
  3. distractor.py +78 -0
  4. models.py +7 -0
  5. pipeline.py +82 -0
  6. qagenerator.py +71 -0
  7. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import random

import streamlit as st

from pipeline import Pipeline

st.header("Generate Multiple Choice QA Generation")
st.markdown(
    "I built this project based on this [paper](https://www.sciencedirect.com/science/article/pii/S0957417422014014#s0015), "
    "where they created End-to-End generation of Multiple-Choice questions using Text-to-Text transfer Transformer models (T5).\n\n"
    "This research focuses on using Transformer-based language models to automate the generation of multiple-choice questions (MCQs), "
    "with the aim of aiding or assisting educators in the process of creating reading comprehension (RC) assessments. "
    "This is relevant and timely as teachers can invest less time doing routine work and share more time with their students, "
    "thus building an engaging experience for face-to-face classroom interaction. "
    "This study addresses the issue of creating multiple-choice questionnaires from 3 viewpoints: QG, QA, and distractor generation (DG). "
    "An end-to-end pipeline for generating multiple-choice questions is proposed, based on a pre-trained T5 language model."
)


st.sidebar.info(
    "Note: The number of questions generated depends on the length of the context. "
    "You may find that the number of QA pairs does not match the number you want."
)

with st.sidebar:
    # BUGFIX: the original registered an on_change callback that assigned the
    # module-level name `num_qa` into session state. Streamlit runs callbacks
    # BEFORE the script body on a rerun, so `num_qa` did not exist yet and the
    # callback raised NameError. Binding the widget to session state with
    # `key=` makes the callback (and the manual session-state seeding of 5,
    # which contradicted the slider default of 1) unnecessary.
    num_qa = st.slider("Select Number of QA questions", min_value=1, max_value=10, value=1, step=1, key="num_qa")

if 'context' not in st.session_state:
    st.session_state.context = ""

st_text_area = st.text_area('Context to generate the QA', value=st.session_state.context, height=500)


@st.cache_resource
def load_pipeline() -> Pipeline:
    """Load the pipeline (two fine-tuned T5 models) once and reuse it across reruns.

    The original constructed a fresh Pipeline on every button click, re-downloading
    and re-loading both models each time.
    """
    return Pipeline()


def generate_qa():
    """Run the MCQ pipeline on the current context and stash results in session state."""
    st.session_state.context = st_text_area
    st.session_state.generator = load_pipeline().generate_mcqs(st_text_area, num_qa)


# generate qa button
st.button('Generate', on_click=generate_qa)

# Display generated MCQs in Streamlit
if len(st.session_state.get('generator', [])) > 0:
    st.subheader("Generated MCQs")
    for i, question in enumerate(st.session_state.generator, start=1):
        correct_answer = question.answerText
        options = [correct_answer] + question.distractors[:3]  # correct answer + up to 3 distractors

        # Shuffle options so the correct answer isn't always option A.
        random.shuffle(options)

        st.write(f'Number {i}: {question.questionText}')
        for j, option in enumerate(options):
            label = chr(ord('A') + j)
            if option == correct_answer:
                # Highlight the correct option in green.
                st.write(f'<span style="color:green;">{label}. {option}</span>', unsafe_allow_html=True)
            else:
                st.write(f'{label}. {option}')
        st.write('-------------------')
cleaned_text.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+ from typing import List
4
+
5
+
def normalize_item(item) -> str:
    """Normalize a string for duplicate comparison.

    Lower-cases the text, strips all punctuation, removes the English
    articles 'a'/'an'/'the', and collapses runs of whitespace.
    """
    text = item.lower()
    # Strip every punctuation character in one C-level pass.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Articles are replaced by a space; the final join collapses the gaps.
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())
22
+
23
+
def remove_duplicates(items: List[str]) -> List[str]:
    """Return *items* with duplicates removed, comparing normalized forms.

    The first occurrence of each normalized form is kept, preserving the
    original order of *items*.
    """
    # Set membership is O(1); the original kept the normalized forms in a
    # list, giving the loop accidental O(n^2) behavior.
    seen = set()
    unique_items = []

    for item in items:
        normalized_item = normalize_item(item)
        if normalized_item not in seen:
            seen.add(normalized_item)
            unique_items.append(item)

    return unique_items
36
+
def remove_distractors_duplicate_with_correct_answer(correct: str, distractors: List[str]) -> List[str]:
    """Drop every distractor whose normalized form equals the normalized correct answer."""
    target = normalize_item(correct)
    return [d for d in distractors if normalize_item(d) != target]
47
+
def clean_text(text: str) -> str:
    """Clean a context string before question generation.

    Removes parenthesised and square-bracketed asides, collapses repeated
    spaces, and replaces en-dashes with plain hyphens.
    """
    # The original passed `lambda L: ""` as the replacement; a plain empty
    # string does the same without the needless callable indirection.
    # remove brackets
    cleaned_text = re.sub(r"\((.*?)\)", "", text)
    # remove square bracket
    cleaned_text = re.sub(r"\[(.*?)\]", "", cleaned_text)
    # remove multiple space
    cleaned_text = re.sub(" +", " ", cleaned_text)
    # replace weird hyphen (en-dash) with a regular one
    cleaned_text = cleaned_text.replace('–', '-')

    return cleaned_text
distractor.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import T5TokenizerFast, T5ForConditionalGeneration
2
+ import string
3
+ from typing import List
4
+
5
+
SOURCE_MAX_TOKEN_LEN = 512
TARGET_MAX_TOKEN_LEN = 50
SEP_TOKEN = "[SEP]"
MODEL_NAME = "t5-small"

# Distractor generator: wraps a fine-tuned T5 model that proposes wrong-answer
# options for a (correct answer, question, context) triple.
class DistractorGenerator:
    def __init__(self):
        """Load the t5-small tokenizer (extended with [SEP]) and the fine-tuned model."""
        self.tokenizer = T5TokenizerFast.from_pretrained(MODEL_NAME)
        self.tokenizer.add_tokens(SEP_TOKEN)
        self.tokenizer_len = len(self.tokenizer)
        self.model = T5ForConditionalGeneration.from_pretrained("fahmiaziz/QDModel")

    def generate(self, generate_count: int, correct: str, question: str, context: str) -> List[str]:
        """Return up to *generate_count* distractor strings for the given QA pair."""
        model_output = self._model_predict(generate_count, correct, question, context)

        # The raw decode keeps special tokens: turn end-of-sequence markers
        # into commas and sentinel tokens into [SEP] before splitting.
        cleaned_result = model_output.replace('<pad>', '').replace('</s>', ',')
        cleaned_result = self._replace_all_extra_id(cleaned_result)
        # Each sequence ends in '</s>' -> ',', so the final split chunk is empty; drop it.
        distractors = cleaned_result.split(",")[:-1]
        distractors = [x.translate(str.maketrans('', '', string.punctuation)) for x in distractors]
        distractors = [x.strip() for x in distractors]

        return distractors

    def _model_predict(self, generate_count: int, correct: str, question: str, context: str) -> str:
        """Run beam search and return the decoded sequences concatenated into one string."""
        source_encoding = self.tokenizer(
            '{} {} {} {} {}'.format(correct, SEP_TOKEN, question, SEP_TOKEN, context),
            max_length=SOURCE_MAX_TOKEN_LEN,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        generated_ids = self.model.generate(
            input_ids=source_encoding['input_ids'],
            attention_mask=source_encoding['attention_mask'],
            num_beams=generate_count,
            num_return_sequences=generate_count,
            max_length=TARGET_MAX_TOKEN_LEN,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True,
            use_cache=True
        )

        # BUGFIX: this was a set comprehension, which deduplicated the returned
        # beams and joined them in hash order (non-deterministic across runs).
        # A list preserves every sequence in the order generate() returned them.
        preds = [
            self.tokenizer.decode(generated_id, skip_special_tokens=False, clean_up_tokenization_spaces=True)
            for generated_id in generated_ids
        ]

        return ''.join(preds)

    def _correct_index_of(self, text: str, substring: str, start_index: int = 0) -> int:
        """Like str.index, but return -1 instead of raising when *substring* is absent."""
        try:
            return text.index(substring, start_index)
        except ValueError:
            return -1

    def _replace_all_extra_id(self, text: str) -> str:
        """Replace every T5 '<extra_id_N>' sentinel token in *text* with '[SEP]'."""
        new_text = text
        start_index_of_extra_id = 0

        while self._correct_index_of(new_text, '<extra_id_') >= 0:
            start_index_of_extra_id = self._correct_index_of(new_text, '<extra_id_', start_index_of_extra_id)
            end_index_of_extra_id = self._correct_index_of(new_text, '>', start_index_of_extra_id)
            new_text = new_text[:start_index_of_extra_id] + '[SEP]' + new_text[end_index_of_extra_id + 1:]

        return new_text
models.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
class Question:
    """Container for one generated MCQ: answer text, question text, and distractors."""

    def __init__(self, answerText: str, questionText: str = '', distractors: List[str] = None):
        self.answerText = answerText
        self.questionText = questionText
        # BUGFIX: the default was a mutable `[]`, shared by every Question
        # created without explicit distractors — appending to one instance's
        # list mutated them all. Each instance now gets its own fresh list.
        self.distractors = distractors if distractors is not None else []
pipeline.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from nltk.tokenize import sent_tokenize
3
+ import toolz
4
+
5
+ from models import Question
6
+ from cleaned_text import clean_text, remove_duplicates, remove_distractors_duplicate_with_correct_answer
7
+ from distractor import DistractorGenerator
8
+ from qagenerator import QuestionAnswerGenerator
9
+
10
+
class Pipeline:
    """End-to-end MCQ pipeline: QA-pair generation followed by distractor generation."""

    def __init__(self):
        # Each generator loads its own fine-tuned T5 checkpoint.
        self.question_generator = QuestionAnswerGenerator()
        self.distractor_generator = DistractorGenerator()

    # <======================= Main Function =============================>
    def generate_mcqs(self, context: str, desired_count: int) -> List[Question]:
        """Produce up to *desired_count* Question objects (with distractors) from *context*."""
        cleaned = clean_text(context)
        qa_pairs = self._generate_question_answer_pairs(cleaned, desired_count)
        return self._generate_distractors(cleaned, qa_pairs)
    # <====================================================>

    # number: 1
    def _generate_question_answer_pairs(self, context: str, desired_count: int) -> List[Question]:
        """Split the context and generate one answer/question pair per split."""
        splits = self._split_context_according_to_desired_count(context, desired_count)

        questions = [
            Question(answer.capitalize(), question_text)
            for answer, question_text in (self.question_generator.generate_qna(s) for s in splits)
        ]

        # Keep only the first question produced for each distinct answer text.
        return list(toolz.unique(questions, key=lambda q: q.answerText))

    # number: 2
    def _generate_distractors(self, context: str, questions: List[Question]) -> List[Question]:
        """Attach cleaned distractors to each question (mutates and returns *questions*)."""
        for question in questions:
            raw = self.distractor_generator.generate(5, question.answerText, question.questionText, context)

            candidates = remove_duplicates(raw)
            candidates = remove_distractors_duplicate_with_correct_answer(question.answerText, candidates)

            # TODO - filter distractors having a similar bleu score with another distractor
            # <=================Need Improve Model=================>

            question.distractors = candidates
        return questions

    # Helper functions
    def _split_context_according_to_desired_count(self, context: str, desired_count: int) -> List[str]:
        """Partition the context's sentences into *desired_count* roughly equal chunks."""
        sents = sent_tokenize(context)
        total_sents = len(sents)

        # Fewer (or equal) sentences than requested chunks: one sentence per chunk.
        if total_sents <= desired_count:
            return sents

        base, remainder = divmod(total_sents, desired_count)

        context_splits = []
        cursor = 0
        for i in range(desired_count):
            # The first `remainder` chunks absorb one extra sentence each.
            step = base + (1 if i < remainder else 0)
            context_splits.append(' '.join(sents[cursor:cursor + step]))
            cursor += step

        return context_splits
qagenerator.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Tuple
2
+ from transformers import T5TokenizerFast, T5ForConditionalGeneration
3
+ import string
4
+ from typing import List
5
+
# Constants
MODEL_NAME = 't5-small'
SOURCE_MAX_TOKEN_LEN = 300
TARGET_MAX_TOKEN_LEN = 80
SEP_TOKEN = '<sep>'
TOKENIZER_LEN = 32101


# Wraps a fine-tuned T5 model that emits '<answer> <sep> <question>' strings.
class QuestionAnswerGenerator():

    def __init__(self):
        """Load the t5-small tokenizer (extended with <sep>) and the fine-tuned QA model."""
        self.tokenizer = T5TokenizerFast.from_pretrained(MODEL_NAME)
        self.tokenizer.add_tokens(SEP_TOKEN)
        self.tokenizer_len = len(self.tokenizer)
        self.model = T5ForConditionalGeneration.from_pretrained("fahmiaziz/QAModel")

    def generate(self, answer: str, context: str) -> str:
        """Generate a question whose answer is *answer*, given *context*."""
        model_output = self._model_predict(answer, context)

        # BUGFIX: the original strict two-way unpack
        # (`a, q = model_output.split(SEP_TOKEN)`) raised ValueError whenever
        # the model emitted no separator or more than one. Split at most once
        # and fall back to the raw output when no separator is present.
        parts = model_output.split(SEP_TOKEN, 1)
        return parts[1] if len(parts) > 1 else parts[0]

    def generate_qna(self, context: str) -> Tuple[str, str]:
        """Generate an (answer, question) pair from *context* using a [MASK] answer slot."""
        answer_mask = '[MASK]'
        model_output = self._model_predict(answer_mask, context)

        # Split only on the first separator so a question that itself contains
        # the separator text is not truncated (the original kept qna_pair[1] only).
        qna_pair = model_output.split(SEP_TOKEN, 1)

        if len(qna_pair) < 2:
            # Model produced no separator: treat the whole output as the question.
            generated_answer = ''
            generated_question = qna_pair[0]
        else:
            generated_answer = qna_pair[0]
            generated_question = qna_pair[1]

        return generated_answer, generated_question

    def _model_predict(self, answer: str, context: str) -> str:
        """Run beam search on '<answer> <sep> <context>' and return the decoded text."""
        source_encoding = self.tokenizer(
            '{} {} {}'.format(answer, SEP_TOKEN, context),
            max_length=SOURCE_MAX_TOKEN_LEN,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        generated_ids = self.model.generate(
            input_ids=source_encoding['input_ids'],
            attention_mask=source_encoding['attention_mask'],
            num_beams=16,
            max_length=TARGET_MAX_TOKEN_LEN,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True,
            use_cache=True
        )

        # BUGFIX: this was a set comprehension — it deduplicated the decoded
        # beams and joined them in hash order. A list preserves generation
        # order deterministically.
        preds = [
            self.tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for generated_id in generated_ids
        ]

        return ''.join(preds)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
streamlit
nltk
transformers
torch
sentencepiece
toolz