Spaces:
Sleeping
Sleeping
Added exception handling; updated feedback data to include context and options; also added an operations progress bar. With the previous update, improved the batch processing.
app.py
CHANGED
@@ -37,14 +37,19 @@ st.set_page_config(
|
|
37 |
page_title="Question Generator",
|
38 |
initial_sidebar_state="auto",
|
39 |
menu_items={
|
40 |
-
"About" : "
|
41 |
}
|
42 |
)
|
43 |
|
44 |
st.set_option('deprecation.showPyplotGlobalUse',False)
|
45 |
|
|
|
|
|
|
|
|
|
|
|
46 |
# Initialize Wikipedia API with a user agent
|
47 |
-
user_agent = 'QGen/1.
|
48 |
wiki_wiki = wikipediaapi.Wikipedia(user_agent= user_agent,language='en')
|
49 |
|
50 |
def get_session_id():
|
@@ -134,12 +139,6 @@ def display_info():
|
|
134 |
|
135 |
""")
|
136 |
|
137 |
-
# Text Preprocessing Function
|
138 |
-
def preprocess_text(text):
|
139 |
-
# Remove newlines and extra spaces
|
140 |
-
text = re.sub(r'[\n]', ' ', text)
|
141 |
-
return text
|
142 |
-
|
143 |
def get_pdf_text(pdf_file):
|
144 |
doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
|
145 |
text = ""
|
@@ -147,7 +146,7 @@ def get_pdf_text(pdf_file):
|
|
147 |
page = doc.load_page(page_num)
|
148 |
text += page.get_text()
|
149 |
return text
|
150 |
-
def save_feedback(question, answer,rating):
|
151 |
feedback_file = 'question_feedback.json'
|
152 |
if os.path.exists(feedback_file):
|
153 |
with open(feedback_file, 'r') as f:
|
@@ -157,6 +156,8 @@ def save_feedback(question, answer,rating):
|
|
157 |
tpl = {
|
158 |
'question' : question,
|
159 |
'answer' : answer,
|
|
|
|
|
160 |
'rating' : rating,
|
161 |
}
|
162 |
# feedback_data[question] = rating
|
@@ -195,33 +196,36 @@ def segment_text(text, max_segment_length=700, batch_size=7):
|
|
195 |
|
196 |
# Function to extract keywords using combined techniques
|
197 |
def extract_keywords(text, extract_all):
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
|
|
|
|
|
|
225 |
|
226 |
def get_similar_words_sense2vec(word, n=3):
|
227 |
# Try to find the word with its most likely part-of-speech
|
@@ -316,59 +320,80 @@ def entity_linking(keyword):
|
|
316 |
return None
|
317 |
|
318 |
async def generate_question_async(context, answer, num_beams):
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
|
|
|
|
|
|
326 |
|
327 |
async def generate_options_async(answer, context, n=3):
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
# Compute similarity scores and sort context words
|
336 |
-
similarity_scores = [util.pytorch_cos_sim(await asyncio.to_thread(context_model.encode, word), answer_embedding).item() for word in context_words]
|
337 |
-
sorted_context_words = [word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)]
|
338 |
-
options.extend(sorted_context_words[:n])
|
339 |
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
options.extend(
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
356 |
|
357 |
|
358 |
# Function to generate questions using beam search
|
359 |
async def generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords):
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
372 |
|
373 |
async def process_batch(batch, keywords, context_window_size, num_beams):
|
374 |
questions = []
|
@@ -481,20 +506,35 @@ def main():
|
|
481 |
|
482 |
text = None
|
483 |
if input_type == "Text Input":
|
484 |
-
text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.")
|
485 |
elif input_type == "Upload PDF":
|
486 |
file = st.file_uploader("Upload PDF Files")
|
487 |
if file is not None:
|
488 |
-
|
|
|
|
|
|
|
|
|
489 |
if text:
|
490 |
text = clean_text(text)
|
491 |
generate_questions_button = st.button("Generate Questions")
|
492 |
-
|
|
|
493 |
# if generate_questions_button:
|
494 |
if generate_questions_button and text:
|
495 |
start_time = time.time()
|
496 |
with st.spinner("Generating questions..."):
|
497 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
498 |
print("\n\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n")
|
499 |
data = get_state(session_id)
|
500 |
print(data)
|
@@ -532,9 +572,9 @@ def main():
|
|
532 |
# q['context'] = st.text_area(f"Edit Context {i+1}:", value=q['context'], key=f"context_{i}")
|
533 |
if enable_feedback_mode:
|
534 |
q['question'] = st.text_input(f"Edit Question {i+1}:", value=q['question'], key=f"question_{i}")
|
535 |
-
q['rating'] = st.
|
536 |
if st.button(f"Submit Feedback for Question {i+1}", key=f"submit_{i}"):
|
537 |
-
save_feedback(q['question'], q['answer'], q['rating'])
|
538 |
st.success(f"Feedback submitted for Question {i+1}")
|
539 |
st.write("---")
|
540 |
|
@@ -590,4 +630,8 @@ def main():
|
|
590 |
print("********************************************************************************")
|
591 |
|
592 |
if __name__ == '__main__':
|
593 |
-
|
|
|
|
|
|
|
|
|
|
37 |
page_title="Question Generator",
|
38 |
initial_sidebar_state="auto",
|
39 |
menu_items={
|
40 |
+
"About" : "Hi this our project."
|
41 |
}
|
42 |
)
|
43 |
|
44 |
st.set_option('deprecation.showPyplotGlobalUse',False)
|
45 |
|
46 |
+
class QuestionGenerationError(Exception):
    """Raised when any stage of the question-generation pipeline fails."""
|
49 |
+
|
50 |
+
|
51 |
# Initialize Wikipedia API with a user agent
|
52 |
+
user_agent = 'QGen/1.2'
|
53 |
wiki_wiki = wikipediaapi.Wikipedia(user_agent= user_agent,language='en')
|
54 |
|
55 |
def get_session_id():
|
|
|
139 |
|
140 |
""")
|
141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
def get_pdf_text(pdf_file):
|
143 |
doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
|
144 |
text = ""
|
|
|
146 |
page = doc.load_page(page_num)
|
147 |
text += page.get_text()
|
148 |
return text
|
149 |
+
def save_feedback(question, answer, rating, options, context):
|
150 |
feedback_file = 'question_feedback.json'
|
151 |
if os.path.exists(feedback_file):
|
152 |
with open(feedback_file, 'r') as f:
|
|
|
156 |
tpl = {
|
157 |
'question' : question,
|
158 |
'answer' : answer,
|
159 |
+
'context' : context,
|
160 |
+
'options' : options,
|
161 |
'rating' : rating,
|
162 |
}
|
163 |
# feedback_data[question] = rating
|
|
|
196 |
|
197 |
# Function to extract keywords using combined techniques
|
198 |
def extract_keywords(text, extract_all):
    """Extract keywords from *text* by combining several techniques.

    Parameters
    ----------
    text : str
        Input text to mine for keywords.
    extract_all : bool
        When False, return only spaCy named entities. When True, merge in
        RAKE ranked phrases, POS-filtered spaCy tokens, and TF-IDF terms.

    Returns
    -------
    list[str]
        De-duplicated keywords (set-backed, so order is unspecified).

    Raises
    ------
    QuestionGenerationError
        If any underlying extraction step fails.
    """
    try:
        doc = nlp(text)
        spacy_keywords = set([ent.text for ent in doc.ents])
        spacy_entities = spacy_keywords
        print(f"\n\nSpacy Entities: {spacy_entities} \n\n")

        # Use Only Spacy Entities
        if extract_all is False:
            return list(spacy_entities)

        # Use RAKE
        rake = Rake()
        rake.extract_keywords_from_text(text)
        rake_keywords = set(rake.get_ranked_phrases())
        print(f"\n\nRake Keywords: {rake_keywords} \n\n")
        # Use spaCy for NER and POS tagging
        spacy_keywords.update([token.text for token in doc if token.pos_ in ["NOUN", "PROPN", "VERB", "ADJ"]])
        print(f"\n\nSpacy Keywords: {spacy_keywords} \n\n")
        # Use TF-IDF
        # NOTE(review): TF-IDF is fitted on a single document, so idf is
        # degenerate — this effectively yields the document's vocabulary.
        vectorizer = TfidfVectorizer(stop_words='english')
        X = vectorizer.fit_transform([text])
        tfidf_keywords = set(vectorizer.get_feature_names_out())
        print(f"\n\nTFIDF Entities: {tfidf_keywords} \n\n")

        # Combine all keywords
        combined_keywords = rake_keywords.union(spacy_keywords).union(tfidf_keywords)

        return list(combined_keywords)
    except Exception as e:
        # Chain the original exception so the root cause stays in the traceback.
        raise QuestionGenerationError(f"Error in keyword extraction: {str(e)}") from e
|
229 |
|
230 |
def get_similar_words_sense2vec(word, n=3):
|
231 |
# Try to find the word with its most likely part-of-speech
|
|
|
320 |
return None
|
321 |
|
322 |
async def generate_question_async(context, answer, num_beams):
    """Generate one question for (*context*, *answer*) with beam search.

    Builds the ``<context> ... <answer> ...`` prompt, runs the (blocking)
    model generation in a worker thread so the event loop stays responsive,
    and decodes the first beam.

    Parameters
    ----------
    context : str
        Passage the question should be answerable from.
    answer : str
        Target answer the question is generated for.
    num_beams : int
        Beam width passed to ``model.generate``.

    Returns
    -------
    str
        The decoded question text.

    Raises
    ------
    QuestionGenerationError
        If tokenization, generation, or decoding fails.
    """
    try:
        input_text = f"<context> {context} <answer> {answer}"
        print(f"\n{input_text}\n")
        input_ids = tokenizer.encode(input_text, return_tensors='pt')
        # Run the blocking HF generate call off the event loop.
        outputs = await asyncio.to_thread(model.generate, input_ids, num_beams=num_beams, early_stopping=True, max_length=250)
        question = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"\n{question}\n")
        return question
    except Exception as e:
        # Chain the original exception so the root cause stays in the traceback.
        raise QuestionGenerationError(f"Error in question generation: {str(e)}") from e
|
333 |
|
334 |
async def generate_options_async(answer, context, n=3):
    """Build ``n + 1`` multiple-choice options: the answer plus *n* distractors.

    Distractor sources, in priority order:
    1. context words ranked by embedding similarity to the answer,
    2. sense2vec neighbours of the answer,
    3. synonyms as a fallback if still short.
    The final list is de-duplicated (answer kept, since it is first) and
    shuffled.

    Parameters
    ----------
    answer : str
        Correct answer; always included in the returned options.
    context : str
        Passage distractor words are drawn from.
    n : int, optional
        Number of distractors to aim for (default 3).

    Returns
    -------
    list[str]
        Up to ``n + 1`` unique, shuffled options.

    Raises
    ------
    QuestionGenerationError
        If any embedding/lookup step fails.
    """
    try:
        options = [answer]

        # Add contextually relevant words using a pre-trained model.
        answer_embedding = await asyncio.to_thread(context_model.encode, answer)
        context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]

        # Compute similarity scores and sort context words.
        # NOTE(review): each word is encoded individually — batching the
        # encode call would be faster; left as-is to preserve behavior.
        similarity_scores = [util.pytorch_cos_sim(await asyncio.to_thread(context_model.encode, word), answer_embedding).item() for word in context_words]
        sorted_context_words = [word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)]
        options.extend(sorted_context_words[:n])

        # Try to get similar words based on sense2vec.
        similar_words = await asyncio.to_thread(get_similar_words_sense2vec, answer, n)
        options.extend(similar_words)

        # If we don't have enough options, try synonyms.
        if len(options) < n + 1:
            synonyms = await asyncio.to_thread(get_synonyms, answer, n - len(options) + 1)
            options.extend(synonyms)

        # Ensure we have the correct number of unique options; dict.fromkeys
        # keeps first occurrences, so the answer (index 0) survives dedup.
        options = list(dict.fromkeys(options))[:n+1]

        # Shuffle the options.
        random.shuffle(options)

        return options
    except Exception as e:
        # Chain the original exception so the root cause stays in the traceback.
        raise QuestionGenerationError(f"Error in generating options: {str(e)}") from e
|
366 |
|
367 |
|
368 |
# Function to generate questions using beam search
|
369 |
async def generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords):
    """Generate up to *num_questions* questions from *text*, batch by batch.

    Segments the text, extracts keywords once, then processes each batch,
    updating a Streamlit progress bar and status line as it goes. Stops
    early once enough questions have been collected.

    Parameters
    ----------
    text : str
        Source text.
    num_questions : int
        Maximum number of questions to return.
    context_window_size : int
        Passed through to ``process_batch``.
    num_beams : int
        Beam width passed through to ``process_batch``.
    extract_all_keywords : bool
        Forwarded to ``extract_keywords``.

    Returns
    -------
    list
        At most ``num_questions`` questions; ``[]`` on error (the error is
        reported via ``st.error``).
    """
    progress_bar = None
    status_text = None
    try:
        batches = segment_text(text)
        keywords = extract_keywords(text, extract_all_keywords)
        all_questions = []

        progress_bar = st.progress(0)
        status_text = st.empty()

        for i, batch in enumerate(batches):
            status_text.text(f"Processing batch {i+1} of {len(batches)}...")
            batch_questions = await process_batch(batch, keywords, context_window_size, num_beams)
            all_questions.extend(batch_questions)
            progress_bar.progress((i + 1) / len(batches))

            if len(all_questions) >= num_questions:
                break

        return all_questions[:num_questions]
    except QuestionGenerationError as e:
        st.error(f"An error occurred during question generation: {str(e)}")
        return []
    except Exception as e:
        st.error(f"An unexpected error occurred: {str(e)}")
        return []
    finally:
        # Always clear the transient widgets — previously they were only
        # emptied on success, leaving a stale progress bar after an error.
        if progress_bar is not None:
            progress_bar.empty()
        if status_text is not None:
            status_text.empty()
|
397 |
|
398 |
async def process_batch(batch, keywords, context_window_size, num_beams):
|
399 |
questions = []
|
|
|
506 |
|
507 |
text = None
|
508 |
if input_type == "Text Input":
|
509 |
+
text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.", help="Enter or paste your text here")
|
510 |
elif input_type == "Upload PDF":
|
511 |
file = st.file_uploader("Upload PDF Files")
|
512 |
if file is not None:
|
513 |
+
try:
|
514 |
+
text = get_pdf_text(file)
|
515 |
+
except Exception as e:
|
516 |
+
st.error(f"Error reading PDF file: {str(e)}")
|
517 |
+
text = None
|
518 |
if text:
|
519 |
text = clean_text(text)
|
520 |
generate_questions_button = st.button("Generate Questions")
|
521 |
+
st.markdown('<span aria-label="Generate questions button">Above is the generate questions button</span>', unsafe_allow_html=True)
|
522 |
+
|
523 |
# if generate_questions_button:
|
524 |
if generate_questions_button and text:
|
525 |
start_time = time.time()
|
526 |
with st.spinner("Generating questions..."):
|
527 |
+
try:
|
528 |
+
state['generated_questions'] = asyncio.run(generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords))
|
529 |
+
if not state['generated_questions']:
|
530 |
+
st.warning("No questions were generated. The text might be too short or lack suitable content.")
|
531 |
+
else:
|
532 |
+
st.success(f"Successfully generated {len(state['generated_questions'])} questions!")
|
533 |
+
except QuestionGenerationError as e:
|
534 |
+
st.error(f"An error occurred during question generation: {str(e)}")
|
535 |
+
except Exception as e:
|
536 |
+
st.error(f"An unexpected error occurred: {str(e)}")
|
537 |
+
|
538 |
print("\n\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n")
|
539 |
data = get_state(session_id)
|
540 |
print(data)
|
|
|
572 |
# q['context'] = st.text_area(f"Edit Context {i+1}:", value=q['context'], key=f"context_{i}")
|
573 |
if enable_feedback_mode:
|
574 |
q['question'] = st.text_input(f"Edit Question {i+1}:", value=q['question'], key=f"question_{i}")
|
575 |
+
q['rating'] = st.select_slider(f"Rate this question (1-5)", options=[1, 2, 3, 4, 5], key=f"rating_{i}")
|
576 |
if st.button(f"Submit Feedback for Question {i+1}", key=f"submit_{i}"):
|
577 |
+
save_feedback(q['question'], q['answer'], q['rating'], q['options'], q['context'])
|
578 |
st.success(f"Feedback submitted for Question {i+1}")
|
579 |
st.write("---")
|
580 |
|
|
|
630 |
print("********************************************************************************")
|
631 |
|
632 |
if __name__ == '__main__':
    # Top-level guard: surface any uncaught failure in the Streamlit UI
    # instead of letting the script runner crash with a raw traceback.
    try:
        main()
    except Exception as e:
        st.error(f"An unexpected error occurred: {str(e)}")
        st.error("Please try refreshing the page. If the problem persists, contact support.")
|