DevBM committed on
Commit
366078f
·
verified ·
1 Parent(s): a563a42

adding exception handling, updating feedback data to take context, options

Browse files

Also added an operations progress bar.

Along with the previous update, improved the batch processing.

Files changed (1) hide show
  1. app.py +133 -89
app.py CHANGED
@@ -37,14 +37,19 @@ st.set_page_config(
37
  page_title="Question Generator",
38
  initial_sidebar_state="auto",
39
  menu_items={
40
- "About" : "#Hi this our project."
41
  }
42
  )
43
 
44
  st.set_option('deprecation.showPyplotGlobalUse',False)
45
 
 
 
 
 
 
46
  # Initialize Wikipedia API with a user agent
47
- user_agent = 'QGen/1.0 ([email protected])'
48
  wiki_wiki = wikipediaapi.Wikipedia(user_agent= user_agent,language='en')
49
 
50
  def get_session_id():
@@ -134,12 +139,6 @@ def display_info():
134
 
135
  """)
136
 
137
- # Text Preprocessing Function
138
- def preprocess_text(text):
139
- # Remove newlines and extra spaces
140
- text = re.sub(r'[\n]', ' ', text)
141
- return text
142
-
143
  def get_pdf_text(pdf_file):
144
  doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
145
  text = ""
@@ -147,7 +146,7 @@ def get_pdf_text(pdf_file):
147
  page = doc.load_page(page_num)
148
  text += page.get_text()
149
  return text
150
- def save_feedback(question, answer,rating):
151
  feedback_file = 'question_feedback.json'
152
  if os.path.exists(feedback_file):
153
  with open(feedback_file, 'r') as f:
@@ -157,6 +156,8 @@ def save_feedback(question, answer,rating):
157
  tpl = {
158
  'question' : question,
159
  'answer' : answer,
 
 
160
  'rating' : rating,
161
  }
162
  # feedback_data[question] = rating
@@ -195,33 +196,36 @@ def segment_text(text, max_segment_length=700, batch_size=7):
195
 
196
  # Function to extract keywords using combined techniques
197
  def extract_keywords(text, extract_all):
198
- doc = nlp(text)
199
- spacy_keywords = set([ent.text for ent in doc.ents])
200
- spacy_entities = spacy_keywords
201
- print(f"\n\nSpacy Entities: {spacy_entities} \n\n")
202
-
203
- # Use Only Spacy Entities
204
- if extract_all is False:
205
- return list(spacy_entities)
206
-
207
- # Use RAKE
208
- rake = Rake()
209
- rake.extract_keywords_from_text(text)
210
- rake_keywords = set(rake.get_ranked_phrases())
211
- print(f"\n\nRake Keywords: {rake_keywords} \n\n")
212
- # Use spaCy for NER and POS tagging
213
- spacy_keywords.update([token.text for token in doc if token.pos_ in ["NOUN", "PROPN", "VERB", "ADJ"]])
214
- print(f"\n\nSpacy Keywords: {spacy_keywords} \n\n")
215
- # Use TF-IDF
216
- vectorizer = TfidfVectorizer(stop_words='english')
217
- X = vectorizer.fit_transform([text])
218
- tfidf_keywords = set(vectorizer.get_feature_names_out())
219
- print(f"\n\nTFIDF Entities: {tfidf_keywords} \n\n")
220
-
221
- # Combine all keywords
222
- combined_keywords = rake_keywords.union(spacy_keywords).union(tfidf_keywords)
223
-
224
- return list(combined_keywords)
 
 
 
225
 
226
  def get_similar_words_sense2vec(word, n=3):
227
  # Try to find the word with its most likely part-of-speech
@@ -316,59 +320,80 @@ def entity_linking(keyword):
316
  return None
317
 
318
  async def generate_question_async(context, answer, num_beams):
319
- input_text = f"<context> {context} <answer> {answer}"
320
- print(f"\n{input_text}\n")
321
- input_ids = tokenizer.encode(input_text, return_tensors='pt')
322
- outputs = await asyncio.to_thread(model.generate, input_ids, num_beams=num_beams, early_stopping=True, max_length=250)
323
- question = tokenizer.decode(outputs[0], skip_special_tokens=True)
324
- print(f"\n{question}\n")
325
- return question
 
 
 
326
 
327
  async def generate_options_async(answer, context, n=3):
328
- options = [answer]
329
-
330
- # Add contextually relevant words using a pre-trained model
331
- context_embedding = await asyncio.to_thread(context_model.encode, context)
332
- answer_embedding = await asyncio.to_thread(context_model.encode, answer)
333
- context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
334
-
335
- # Compute similarity scores and sort context words
336
- similarity_scores = [util.pytorch_cos_sim(await asyncio.to_thread(context_model.encode, word), answer_embedding).item() for word in context_words]
337
- sorted_context_words = [word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)]
338
- options.extend(sorted_context_words[:n])
339
 
340
- # Try to get similar words based on sense2vec
341
- similar_words = await asyncio.to_thread(get_similar_words_sense2vec, answer, n)
342
- options.extend(similar_words)
343
-
344
- # If we don't have enough options, try synonyms
345
- if len(options) < n + 1:
346
- synonyms = await asyncio.to_thread(get_synonyms, answer, n - len(options) + 1)
347
- options.extend(synonyms)
348
-
349
- # Ensure we have the correct number of unique options
350
- options = list(dict.fromkeys(options))[:n+1]
351
-
352
- # Shuffle the options
353
- random.shuffle(options)
354
-
355
- return options
 
 
 
 
 
 
 
356
 
357
 
358
  # Function to generate questions using beam search
359
  async def generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords):
360
- batches = segment_text(text)
361
- keywords = extract_keywords(text, extract_all_keywords)
362
- all_questions = []
363
-
364
- for batch in batches:
365
- batch_questions = await process_batch(batch, keywords, context_window_size, num_beams)
366
- all_questions.extend(batch_questions)
367
- if len(all_questions) >= num_questions:
368
- break
369
-
370
- return all_questions[:num_questions]
371
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
 
373
  async def process_batch(batch, keywords, context_window_size, num_beams):
374
  questions = []
@@ -481,20 +506,35 @@ def main():
481
 
482
  text = None
483
  if input_type == "Text Input":
484
- text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.")
485
  elif input_type == "Upload PDF":
486
  file = st.file_uploader("Upload PDF Files")
487
  if file is not None:
488
- text = get_pdf_text(file)
 
 
 
 
489
  if text:
490
  text = clean_text(text)
491
  generate_questions_button = st.button("Generate Questions")
492
- q_count = 0
 
493
  # if generate_questions_button:
494
  if generate_questions_button and text:
495
  start_time = time.time()
496
  with st.spinner("Generating questions..."):
497
- state['generated_questions'] = asyncio.run(generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords))
 
 
 
 
 
 
 
 
 
 
498
  print("\n\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n")
499
  data = get_state(session_id)
500
  print(data)
@@ -532,9 +572,9 @@ def main():
532
  # q['context'] = st.text_area(f"Edit Context {i+1}:", value=q['context'], key=f"context_{i}")
533
  if enable_feedback_mode:
534
  q['question'] = st.text_input(f"Edit Question {i+1}:", value=q['question'], key=f"question_{i}")
535
- q['rating'] = st.selectbox(f"Rate this question (1-5)", options=[1, 2, 3, 4, 5], key=f"rating_{i}")
536
  if st.button(f"Submit Feedback for Question {i+1}", key=f"submit_{i}"):
537
- save_feedback(q['question'], q['answer'], q['rating'])
538
  st.success(f"Feedback submitted for Question {i+1}")
539
  st.write("---")
540
 
@@ -590,4 +630,8 @@ def main():
590
  print("********************************************************************************")
591
 
592
  if __name__ == '__main__':
593
- main()
 
 
 
 
 
37
  page_title="Question Generator",
38
  initial_sidebar_state="auto",
39
  menu_items={
40
+ "About" : "Hi this our project."
41
  }
42
  )
43
 
44
  st.set_option('deprecation.showPyplotGlobalUse',False)
45
 
46
class QuestionGenerationError(Exception):
    """Custom exception raised when any stage of question generation fails."""
49
+
50
+
51
  # Initialize Wikipedia API with a user agent
52
+ user_agent = 'QGen/1.2'
53
  wiki_wiki = wikipediaapi.Wikipedia(user_agent= user_agent,language='en')
54
 
55
  def get_session_id():
 
139
 
140
  """)
141
 
 
 
 
 
 
 
142
  def get_pdf_text(pdf_file):
143
  doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
144
  text = ""
 
146
  page = doc.load_page(page_num)
147
  text += page.get_text()
148
  return text
149
+ def save_feedback(question, answer, rating, options, context):
150
  feedback_file = 'question_feedback.json'
151
  if os.path.exists(feedback_file):
152
  with open(feedback_file, 'r') as f:
 
156
  tpl = {
157
  'question' : question,
158
  'answer' : answer,
159
+ 'context' : context,
160
+ 'options' : options,
161
  'rating' : rating,
162
  }
163
  # feedback_data[question] = rating
 
196
 
197
  # Function to extract keywords using combined techniques
198
def extract_keywords(text, extract_all):
    """Extract keywords from *text* using spaCy NER, optionally combined
    with RAKE phrases, POS-based content words, and the TF-IDF vocabulary.

    Args:
        text: Input text to mine for keywords.
        extract_all: When False, return only spaCy named entities; when
            True, combine spaCy, RAKE, and TF-IDF keywords.

    Returns:
        A list of unique keyword strings (order unspecified).

    Raises:
        QuestionGenerationError: If any underlying NLP library call fails.
    """
    try:
        doc = nlp(text)
        spacy_entities = {ent.text for ent in doc.ents}

        # Use only spaCy named entities when a full extraction is not requested.
        if not extract_all:
            return list(spacy_entities)

        # RAKE phrase extraction.
        rake = Rake()
        rake.extract_keywords_from_text(text)
        rake_keywords = set(rake.get_ranked_phrases())

        # spaCy content words (POS-filtered) merged with the named entities.
        # Copy the entity set so it is not mutated by the update below.
        spacy_keywords = set(spacy_entities)
        spacy_keywords.update(
            token.text for token in doc
            if token.pos_ in ("NOUN", "PROPN", "VERB", "ADJ")
        )

        # TF-IDF vocabulary of the document (fit only for its feature names).
        vectorizer = TfidfVectorizer(stop_words='english')
        vectorizer.fit_transform([text])
        tfidf_keywords = set(vectorizer.get_feature_names_out())

        # Combine all keyword sources.
        return list(rake_keywords | spacy_keywords | tfidf_keywords)
    except Exception as e:
        # Wrap third-party failures in the app's domain exception, chaining
        # the original cause for debuggability.
        raise QuestionGenerationError(f"Error in keyword extraction: {str(e)}") from e
229
 
230
  def get_similar_words_sense2vec(word, n=3):
231
  # Try to find the word with its most likely part-of-speech
 
320
  return None
321
 
322
async def generate_question_async(context, answer, num_beams):
    """Generate one question for *answer*, grounded in *context*.

    The blocking model call is off-loaded to a worker thread so the asyncio
    event loop stays responsive.

    Args:
        context: Passage the question should be answerable from.
        answer: Target answer the question must elicit.
        num_beams: Beam width for beam-search decoding.

    Returns:
        The decoded question string.

    Raises:
        QuestionGenerationError: If tokenization or generation fails.
    """
    try:
        # Prompt format expected by the fine-tuned question-generation model.
        input_text = f"<context> {context} <answer> {answer}"
        input_ids = tokenizer.encode(input_text, return_tensors='pt')
        # model.generate is synchronous and compute-bound; run it in a thread.
        outputs = await asyncio.to_thread(
            model.generate,
            input_ids,
            num_beams=num_beams,
            early_stopping=True,
            max_length=250,
        )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        raise QuestionGenerationError(f"Error in question generation: {str(e)}") from e
333
 
334
async def generate_options_async(answer, context, n=3):
    """Build a shuffled list of up to n + 1 answer options: the correct
    answer plus distractors.

    Distractors come, in priority order, from context words most similar to
    the answer (by sentence-embedding similarity), sense2vec neighbours, and
    synonyms as a fallback.

    Args:
        answer: The correct answer string.
        context: The passage the question was generated from.
        n: Number of distractors to aim for (default 3).

    Returns:
        A shuffled list of at most n + 1 unique option strings.

    Raises:
        QuestionGenerationError: If embedding or lookup calls fail.
    """
    try:
        options = [answer]

        # Candidate distractors: alphabetic context tokens other than the answer.
        # (The original also encoded the whole context, but that embedding was
        # never used — dropped.)
        answer_embedding = await asyncio.to_thread(context_model.encode, answer)
        context_words = [
            token.text for token in nlp(context)
            if token.is_alpha and token.text.lower() != answer.lower()
        ]

        # Rank context words by cosine similarity to the answer embedding.
        similarity_scores = [
            util.pytorch_cos_sim(
                await asyncio.to_thread(context_model.encode, word),
                answer_embedding,
            ).item()
            for word in context_words
        ]
        sorted_context_words = [
            word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)
        ]
        options.extend(sorted_context_words[:n])

        # Add sense2vec neighbours of the answer.
        similar_words = await asyncio.to_thread(get_similar_words_sense2vec, answer, n)
        options.extend(similar_words)

        # Top up with synonyms if we still have fewer than n + 1 options.
        if len(options) < n + 1:
            synonyms = await asyncio.to_thread(get_synonyms, answer, n - len(options) + 1)
            options.extend(synonyms)

        # De-duplicate (preserving first occurrence) and cap at n + 1 options.
        options = list(dict.fromkeys(options))[:n + 1]

        # Shuffle so the correct answer's position is unpredictable.
        random.shuffle(options)

        return options
    except Exception as e:
        raise QuestionGenerationError(f"Error in generating options: {str(e)}") from e
366
 
367
 
368
  # Function to generate questions using beam search
369
async def generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords):
    """Generate up to *num_questions* questions from *text*, batch by batch,
    reporting progress through a Streamlit progress bar and status line.

    Args:
        text: Source text to generate questions from.
        num_questions: Maximum number of questions to return.
        context_window_size: Context window passed to per-batch processing.
        num_beams: Beam width for question decoding.
        extract_all_keywords: Whether to combine all keyword extractors or
            use spaCy entities only (forwarded to extract_keywords).

    Returns:
        A list of at most num_questions question items; an empty list when
        generation fails or there is nothing to process.
    """
    try:
        batches = segment_text(text)
        keywords = extract_keywords(text, extract_all_keywords)
        all_questions = []

        # Nothing to process — avoid creating UI widgets for an empty run.
        if not batches:
            return []

        progress_bar = st.progress(0)
        status_text = st.empty()

        total = len(batches)  # hoisted: loop-invariant
        for i, batch in enumerate(batches):
            status_text.text(f"Processing batch {i+1} of {total}...")
            batch_questions = await process_batch(batch, keywords, context_window_size, num_beams)
            all_questions.extend(batch_questions)
            progress_bar.progress((i + 1) / total)

            # Stop early once we have enough questions.
            if len(all_questions) >= num_questions:
                break

        progress_bar.empty()
        status_text.empty()

        return all_questions[:num_questions]
    except QuestionGenerationError as e:
        # Domain errors are surfaced to the user instead of propagating.
        st.error(f"An error occurred during question generation: {str(e)}")
        return []
    except Exception as e:
        st.error(f"An unexpected error occurred: {str(e)}")
        return []
397
 
398
  async def process_batch(batch, keywords, context_window_size, num_beams):
399
  questions = []
 
506
 
507
  text = None
508
  if input_type == "Text Input":
509
+ text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.", help="Enter or paste your text here")
510
  elif input_type == "Upload PDF":
511
  file = st.file_uploader("Upload PDF Files")
512
  if file is not None:
513
+ try:
514
+ text = get_pdf_text(file)
515
+ except Exception as e:
516
+ st.error(f"Error reading PDF file: {str(e)}")
517
+ text = None
518
  if text:
519
  text = clean_text(text)
520
  generate_questions_button = st.button("Generate Questions")
521
+ st.markdown('<span aria-label="Generate questions button">Above is the generate questions button</span>', unsafe_allow_html=True)
522
+
523
  # if generate_questions_button:
524
  if generate_questions_button and text:
525
  start_time = time.time()
526
  with st.spinner("Generating questions..."):
527
+ try:
528
+ state['generated_questions'] = asyncio.run(generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords))
529
+ if not state['generated_questions']:
530
+ st.warning("No questions were generated. The text might be too short or lack suitable content.")
531
+ else:
532
+ st.success(f"Successfully generated {len(state['generated_questions'])} questions!")
533
+ except QuestionGenerationError as e:
534
+ st.error(f"An error occurred during question generation: {str(e)}")
535
+ except Exception as e:
536
+ st.error(f"An unexpected error occurred: {str(e)}")
537
+
538
  print("\n\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n")
539
  data = get_state(session_id)
540
  print(data)
 
572
  # q['context'] = st.text_area(f"Edit Context {i+1}:", value=q['context'], key=f"context_{i}")
573
  if enable_feedback_mode:
574
  q['question'] = st.text_input(f"Edit Question {i+1}:", value=q['question'], key=f"question_{i}")
575
+ q['rating'] = st.select_slider(f"Rate this question (1-5)", options=[1, 2, 3, 4, 5], key=f"rating_{i}")
576
  if st.button(f"Submit Feedback for Question {i+1}", key=f"submit_{i}"):
577
+ save_feedback(q['question'], q['answer'], q['rating'], q['options'], q['context'])
578
  st.success(f"Feedback submitted for Question {i+1}")
579
  st.write("---")
580
 
 
630
  print("********************************************************************************")
631
 
632
  if __name__ == '__main__':
633
+ try:
634
+ main()
635
+ except Exception as e:
636
+ st.error(f"An unexpected error occurred: {str(e)}")
637
+ st.error("Please try refreshing the page. If the problem persists, contact support.")