DevBM committed on
Commit
366078f
·
verified ·
1 Parent(s): a563a42

adding exception handling, updating feedback data to take context, options

Browse files

Also added an operations progress bar.

Along with the previous update, improved the batch processing.

Files changed (1) hide show
  1. app.py +133 -89
app.py CHANGED
@@ -37,14 +37,19 @@ st.set_page_config(
37
  page_title="Question Generator",
38
  initial_sidebar_state="auto",
39
  menu_items={
40
- "About" : "#Hi this our project."
41
  }
42
  )
43
 
44
  st.set_option('deprecation.showPyplotGlobalUse',False)
45
 
 
 
 
 
 
46
  # Initialize Wikipedia API with a user agent
47
- user_agent = 'QGen/1.0 ([email protected])'
48
  wiki_wiki = wikipediaapi.Wikipedia(user_agent= user_agent,language='en')
49
 
50
  def get_session_id():
@@ -134,12 +139,6 @@ def display_info():
134
 
135
  """)
136
 
137
- # Text Preprocessing Function
138
- def preprocess_text(text):
139
- # Remove newlines and extra spaces
140
- text = re.sub(r'[\n]', ' ', text)
141
- return text
142
-
143
  def get_pdf_text(pdf_file):
144
  doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
145
  text = ""
@@ -147,7 +146,7 @@ def get_pdf_text(pdf_file):
147
  page = doc.load_page(page_num)
148
  text += page.get_text()
149
  return text
150
- def save_feedback(question, answer,rating):
151
  feedback_file = 'question_feedback.json'
152
  if os.path.exists(feedback_file):
153
  with open(feedback_file, 'r') as f:
@@ -157,6 +156,8 @@ def save_feedback(question, answer,rating):
157
  tpl = {
158
  'question' : question,
159
  'answer' : answer,
 
 
160
  'rating' : rating,
161
  }
162
  # feedback_data[question] = rating
@@ -195,33 +196,36 @@ def segment_text(text, max_segment_length=700, batch_size=7):
195
 
196
  # Function to extract keywords using combined techniques
197
  def extract_keywords(text, extract_all):
198
- doc = nlp(text)
199
- spacy_keywords = set([ent.text for ent in doc.ents])
200
- spacy_entities = spacy_keywords
201
- print(f"\n\nSpacy Entities: {spacy_entities} \n\n")
202
-
203
- # Use Only Spacy Entities
204
- if extract_all is False:
205
- return list(spacy_entities)
206
-
207
- # Use RAKE
208
- rake = Rake()
209
- rake.extract_keywords_from_text(text)
210
- rake_keywords = set(rake.get_ranked_phrases())
211
- print(f"\n\nRake Keywords: {rake_keywords} \n\n")
212
- # Use spaCy for NER and POS tagging
213
- spacy_keywords.update([token.text for token in doc if token.pos_ in ["NOUN", "PROPN", "VERB", "ADJ"]])
214
- print(f"\n\nSpacy Keywords: {spacy_keywords} \n\n")
215
- # Use TF-IDF
216
- vectorizer = TfidfVectorizer(stop_words='english')
217
- X = vectorizer.fit_transform([text])
218
- tfidf_keywords = set(vectorizer.get_feature_names_out())
219
- print(f"\n\nTFIDF Entities: {tfidf_keywords} \n\n")
220
-
221
- # Combine all keywords
222
- combined_keywords = rake_keywords.union(spacy_keywords).union(tfidf_keywords)
223
-
224
- return list(combined_keywords)
 
 
 
225
 
226
  def get_similar_words_sense2vec(word, n=3):
227
  # Try to find the word with its most likely part-of-speech
@@ -316,59 +320,80 @@ def entity_linking(keyword):
316
  return None
317
 
318
  async def generate_question_async(context, answer, num_beams):
319
- input_text = f"<context> {context} <answer> {answer}"
320
- print(f"\n{input_text}\n")
321
- input_ids = tokenizer.encode(input_text, return_tensors='pt')
322
- outputs = await asyncio.to_thread(model.generate, input_ids, num_beams=num_beams, early_stopping=True, max_length=250)
323
- question = tokenizer.decode(outputs[0], skip_special_tokens=True)
324
- print(f"\n{question}\n")
325
- return question
 
 
 
326
 
327
  async def generate_options_async(answer, context, n=3):
328
- options = [answer]
329
-
330
- # Add contextually relevant words using a pre-trained model
331
- context_embedding = await asyncio.to_thread(context_model.encode, context)
332
- answer_embedding = await asyncio.to_thread(context_model.encode, answer)
333
- context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
334
-
335
- # Compute similarity scores and sort context words
336
- similarity_scores = [util.pytorch_cos_sim(await asyncio.to_thread(context_model.encode, word), answer_embedding).item() for word in context_words]
337
- sorted_context_words = [word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)]
338
- options.extend(sorted_context_words[:n])
339
 
340
- # Try to get similar words based on sense2vec
341
- similar_words = await asyncio.to_thread(get_similar_words_sense2vec, answer, n)
342
- options.extend(similar_words)
343
-
344
- # If we don't have enough options, try synonyms
345
- if len(options) < n + 1:
346
- synonyms = await asyncio.to_thread(get_synonyms, answer, n - len(options) + 1)
347
- options.extend(synonyms)
348
-
349
- # Ensure we have the correct number of unique options
350
- options = list(dict.fromkeys(options))[:n+1]
351
-
352
- # Shuffle the options
353
- random.shuffle(options)
354
-
355
- return options
 
 
 
 
 
 
 
356
 
357
 
358
  # Function to generate questions using beam search
359
  async def generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords):
360
- batches = segment_text(text)
361
- keywords = extract_keywords(text, extract_all_keywords)
362
- all_questions = []
363
-
364
- for batch in batches:
365
- batch_questions = await process_batch(batch, keywords, context_window_size, num_beams)
366
- all_questions.extend(batch_questions)
367
- if len(all_questions) >= num_questions:
368
- break
369
-
370
- return all_questions[:num_questions]
371
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
 
373
  async def process_batch(batch, keywords, context_window_size, num_beams):
374
  questions = []
@@ -481,20 +506,35 @@ def main():
481
 
482
  text = None
483
  if input_type == "Text Input":
484
- text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.")
485
  elif input_type == "Upload PDF":
486
  file = st.file_uploader("Upload PDF Files")
487
  if file is not None:
488
- text = get_pdf_text(file)
 
 
 
 
489
  if text:
490
  text = clean_text(text)
491
  generate_questions_button = st.button("Generate Questions")
492
- q_count = 0
 
493
  # if generate_questions_button:
494
  if generate_questions_button and text:
495
  start_time = time.time()
496
  with st.spinner("Generating questions..."):
497
- state['generated_questions'] = asyncio.run(generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords))
 
 
 
 
 
 
 
 
 
 
498
  print("\n\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n")
499
  data = get_state(session_id)
500
  print(data)
@@ -532,9 +572,9 @@ def main():
532
  # q['context'] = st.text_area(f"Edit Context {i+1}:", value=q['context'], key=f"context_{i}")
533
  if enable_feedback_mode:
534
  q['question'] = st.text_input(f"Edit Question {i+1}:", value=q['question'], key=f"question_{i}")
535
- q['rating'] = st.selectbox(f"Rate this question (1-5)", options=[1, 2, 3, 4, 5], key=f"rating_{i}")
536
  if st.button(f"Submit Feedback for Question {i+1}", key=f"submit_{i}"):
537
- save_feedback(q['question'], q['answer'], q['rating'])
538
  st.success(f"Feedback submitted for Question {i+1}")
539
  st.write("---")
540
 
@@ -590,4 +630,8 @@ def main():
590
  print("********************************************************************************")
591
 
592
  if __name__ == '__main__':
593
- main()
 
 
 
 
 
37
  page_title="Question Generator",
38
  initial_sidebar_state="auto",
39
  menu_items={
40
+ "About" : "Hi this our project."
41
  }
42
  )
43
 
44
  st.set_option('deprecation.showPyplotGlobalUse',False)
45
 
46
class QuestionGenerationError(Exception):
    """Custom exception raised when any stage of question generation fails."""
49
+
50
+
51
  # Initialize Wikipedia API with a user agent
52
+ user_agent = 'QGen/1.2'
53
  wiki_wiki = wikipediaapi.Wikipedia(user_agent= user_agent,language='en')
54
 
55
  def get_session_id():
 
139
 
140
  """)
141
 
 
 
 
 
 
 
142
  def get_pdf_text(pdf_file):
143
  doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
144
  text = ""
 
146
  page = doc.load_page(page_num)
147
  text += page.get_text()
148
  return text
149
+ def save_feedback(question, answer, rating, options, context):
150
  feedback_file = 'question_feedback.json'
151
  if os.path.exists(feedback_file):
152
  with open(feedback_file, 'r') as f:
 
156
  tpl = {
157
  'question' : question,
158
  'answer' : answer,
159
+ 'context' : context,
160
+ 'options' : options,
161
  'rating' : rating,
162
  }
163
  # feedback_data[question] = rating
 
196
 
197
  # Function to extract keywords using combined techniques
198
def extract_keywords(text, extract_all):
    """Extract keywords from *text* using spaCy NER, optionally combined
    with RAKE phrases, POS-based content words, and the TF-IDF vocabulary.

    Args:
        text: Input text to mine for keywords.
        extract_all: When False, return only spaCy named entities; when
            True, combine spaCy, RAKE, and TF-IDF keywords.

    Returns:
        A list of unique keyword strings (order unspecified).

    Raises:
        QuestionGenerationError: If any underlying NLP library call fails.
    """
    try:
        doc = nlp(text)
        spacy_entities = {ent.text for ent in doc.ents}

        # Use only spaCy named entities when a full extraction is not requested.
        if not extract_all:
            return list(spacy_entities)

        # RAKE phrase extraction.
        rake = Rake()
        rake.extract_keywords_from_text(text)
        rake_keywords = set(rake.get_ranked_phrases())

        # spaCy content words (POS-filtered) merged with the named entities.
        # Copy the entity set so it is not mutated by the update below.
        spacy_keywords = set(spacy_entities)
        spacy_keywords.update(
            token.text for token in doc
            if token.pos_ in ("NOUN", "PROPN", "VERB", "ADJ")
        )

        # TF-IDF vocabulary of the document (fit only for its feature names).
        vectorizer = TfidfVectorizer(stop_words='english')
        vectorizer.fit_transform([text])
        tfidf_keywords = set(vectorizer.get_feature_names_out())

        # Combine all keyword sources.
        return list(rake_keywords | spacy_keywords | tfidf_keywords)
    except Exception as e:
        # Wrap third-party failures in the app's domain exception, chaining
        # the original cause for debuggability.
        raise QuestionGenerationError(f"Error in keyword extraction: {str(e)}") from e
229
 
230
  def get_similar_words_sense2vec(word, n=3):
231
  # Try to find the word with its most likely part-of-speech
 
320
  return None
321
 
322
async def generate_question_async(context, answer, num_beams):
    """Generate one question for *answer*, grounded in *context*.

    The blocking model call is off-loaded to a worker thread so the asyncio
    event loop stays responsive.

    Args:
        context: Passage the question should be answerable from.
        answer: Target answer the question must elicit.
        num_beams: Beam width for beam-search decoding.

    Returns:
        The decoded question string.

    Raises:
        QuestionGenerationError: If tokenization or generation fails.
    """
    try:
        # Prompt format expected by the fine-tuned question-generation model.
        input_text = f"<context> {context} <answer> {answer}"
        input_ids = tokenizer.encode(input_text, return_tensors='pt')
        # model.generate is synchronous and compute-bound; run it in a thread.
        outputs = await asyncio.to_thread(
            model.generate,
            input_ids,
            num_beams=num_beams,
            early_stopping=True,
            max_length=250,
        )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        raise QuestionGenerationError(f"Error in question generation: {str(e)}") from e
333
 
334
async def generate_options_async(answer, context, n=3):
    """Build a shuffled list of up to n + 1 answer options: the correct
    answer plus distractors.

    Distractors come, in priority order, from context words most similar to
    the answer (by sentence-embedding similarity), sense2vec neighbours, and
    synonyms as a fallback.

    Args:
        answer: The correct answer string.
        context: The passage the question was generated from.
        n: Number of distractors to aim for (default 3).

    Returns:
        A shuffled list of at most n + 1 unique option strings.

    Raises:
        QuestionGenerationError: If embedding or lookup calls fail.
    """
    try:
        options = [answer]

        # Candidate distractors: alphabetic context tokens other than the answer.
        # (The original also encoded the whole context, but that embedding was
        # never used — dropped.)
        answer_embedding = await asyncio.to_thread(context_model.encode, answer)
        context_words = [
            token.text for token in nlp(context)
            if token.is_alpha and token.text.lower() != answer.lower()
        ]

        # Rank context words by cosine similarity to the answer embedding.
        similarity_scores = [
            util.pytorch_cos_sim(
                await asyncio.to_thread(context_model.encode, word),
                answer_embedding,
            ).item()
            for word in context_words
        ]
        sorted_context_words = [
            word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)
        ]
        options.extend(sorted_context_words[:n])

        # Add sense2vec neighbours of the answer.
        similar_words = await asyncio.to_thread(get_similar_words_sense2vec, answer, n)
        options.extend(similar_words)

        # Top up with synonyms if we still have fewer than n + 1 options.
        if len(options) < n + 1:
            synonyms = await asyncio.to_thread(get_synonyms, answer, n - len(options) + 1)
            options.extend(synonyms)

        # De-duplicate (preserving first occurrence) and cap at n + 1 options.
        options = list(dict.fromkeys(options))[:n + 1]

        # Shuffle so the correct answer's position is unpredictable.
        random.shuffle(options)

        return options
    except Exception as e:
        raise QuestionGenerationError(f"Error in generating options: {str(e)}") from e
366
 
367
 
368
  # Function to generate questions using beam search
369
async def generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords):
    """Generate up to *num_questions* questions from *text*, batch by batch,
    reporting progress through a Streamlit progress bar and status line.

    Args:
        text: Source text to generate questions from.
        num_questions: Maximum number of questions to return.
        context_window_size: Context window passed to per-batch processing.
        num_beams: Beam width for question decoding.
        extract_all_keywords: Whether to combine all keyword extractors or
            use spaCy entities only (forwarded to extract_keywords).

    Returns:
        A list of at most num_questions question items; an empty list when
        generation fails or there is nothing to process.
    """
    try:
        batches = segment_text(text)
        keywords = extract_keywords(text, extract_all_keywords)
        all_questions = []

        # Nothing to process — avoid creating UI widgets for an empty run.
        if not batches:
            return []

        progress_bar = st.progress(0)
        status_text = st.empty()

        total = len(batches)  # hoisted: loop-invariant
        for i, batch in enumerate(batches):
            status_text.text(f"Processing batch {i+1} of {total}...")
            batch_questions = await process_batch(batch, keywords, context_window_size, num_beams)
            all_questions.extend(batch_questions)
            progress_bar.progress((i + 1) / total)

            # Stop early once we have enough questions.
            if len(all_questions) >= num_questions:
                break

        progress_bar.empty()
        status_text.empty()

        return all_questions[:num_questions]
    except QuestionGenerationError as e:
        # Domain errors are surfaced to the user instead of propagating.
        st.error(f"An error occurred during question generation: {str(e)}")
        return []
    except Exception as e:
        st.error(f"An unexpected error occurred: {str(e)}")
        return []
397
 
398
  async def process_batch(batch, keywords, context_window_size, num_beams):
399
  questions = []
 
506
 
507
  text = None
508
  if input_type == "Text Input":
509
+ text = st.text_area("Enter text here:", value="Joe Biden, the current US president is on a weak wicket going in for his reelection later this November against former President Donald Trump.", help="Enter or paste your text here")
510
  elif input_type == "Upload PDF":
511
  file = st.file_uploader("Upload PDF Files")
512
  if file is not None:
513
+ try:
514
+ text = get_pdf_text(file)
515
+ except Exception as e:
516
+ st.error(f"Error reading PDF file: {str(e)}")
517
+ text = None
518
  if text:
519
  text = clean_text(text)
520
  generate_questions_button = st.button("Generate Questions")
521
+ st.markdown('<span aria-label="Generate questions button">Above is the generate questions button</span>', unsafe_allow_html=True)
522
+
523
  # if generate_questions_button:
524
  if generate_questions_button and text:
525
  start_time = time.time()
526
  with st.spinner("Generating questions..."):
527
+ try:
528
+ state['generated_questions'] = asyncio.run(generate_questions_async(text, num_questions, context_window_size, num_beams, extract_all_keywords))
529
+ if not state['generated_questions']:
530
+ st.warning("No questions were generated. The text might be too short or lack suitable content.")
531
+ else:
532
+ st.success(f"Successfully generated {len(state['generated_questions'])} questions!")
533
+ except QuestionGenerationError as e:
534
+ st.error(f"An error occurred during question generation: {str(e)}")
535
+ except Exception as e:
536
+ st.error(f"An unexpected error occurred: {str(e)}")
537
+
538
  print("\n\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n")
539
  data = get_state(session_id)
540
  print(data)
 
572
  # q['context'] = st.text_area(f"Edit Context {i+1}:", value=q['context'], key=f"context_{i}")
573
  if enable_feedback_mode:
574
  q['question'] = st.text_input(f"Edit Question {i+1}:", value=q['question'], key=f"question_{i}")
575
+ q['rating'] = st.select_slider(f"Rate this question (1-5)", options=[1, 2, 3, 4, 5], key=f"rating_{i}")
576
  if st.button(f"Submit Feedback for Question {i+1}", key=f"submit_{i}"):
577
+ save_feedback(q['question'], q['answer'], q['rating'], q['options'], q['context'])
578
  st.success(f"Feedback submitted for Question {i+1}")
579
  st.write("---")
580
 
 
630
  print("********************************************************************************")
631
 
632
  if __name__ == '__main__':
633
+ try:
634
+ main()
635
+ except Exception as e:
636
+ st.error(f"An unexpected error occurred: {str(e)}")
637
+ st.error("Please try refreshing the page. If the problem persists, contact support.")