DevBM committed
Commit 2070fbb · verified · 1 parent: 6070578

Update app.py

Files changed (1): app.py +24 -21
app.py CHANGED
@@ -15,15 +15,18 @@ from nltk.tokenize import sent_tokenize
 nltk.download('wordnet')
 from nltk.corpus import wordnet
 import random
-
+from sense2vec import Sense2Vec
+import sense2vec
 # Load spaCy model
 nlp = spacy.load("en_core_web_sm")
+# s2v = Sense2Vec.from_disk(self=Sense2Vec,path='s2v_old')
 
+s2v = sense2vec.Sense2Vec().from_disk('s2v_old')
 # Initialize Wikipedia API with a user agent
 user_agent = 'QGen/1.0 ([email protected])'
 wiki_wiki = wikipediaapi.Wikipedia(user_agent= user_agent,language='en')
 
-@st.cache_resource(allow_output_mutation=True)
+@st.cache_resource
 def load_model():
     model_name = "DevBM/t5-large-squad"
     model = T5ForConditionalGeneration.from_pretrained(model_name)
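Note: the commit loads the sense2vec vectors at module level, so every Streamlit rerun re-reads them from disk. A minimal sketch of one way to avoid that, reusing the @st.cache_resource pattern the commit already applies to load_model() (the 's2v_old' path comes from the commit; the load_sense2vec wrapper is an assumption, not part of this change):

import streamlit as st
from sense2vec import Sense2Vec

@st.cache_resource
def load_sense2vec(path="s2v_old"):
    # Hypothetical helper: cached so the vectors are read from disk once per
    # server process instead of on every Streamlit rerun.
    return Sense2Vec().from_disk(path)

s2v = load_sense2vec()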
@@ -55,20 +58,19 @@ def extract_keywords(text):
 # Load spaCy model (medium-sized model with word vectors)
 nlp = spacy.load("en_core_web_md")
 
-def get_similar_words(word, n=3):
-    # Get the vector for the word
-    word_vector = nlp(word).vector
+def get_similar_words_sense2vec(word, n=3):
+    # Try to find the word with its most likely part-of-speech
+    word_with_pos = word + "|NOUN"
+    if word_with_pos in s2v:
+        similar_words = s2v.most_similar(word_with_pos, n=n)
+        return [word.split("|")[0] for word, _ in similar_words]
 
-    # Find similar words
-    similar_words = []
-    for w in nlp.vocab:
-        if w.has_vector and w.is_lower and w.is_alpha and w.text != word:
-            similarity = nlp(w.text).similarity(nlp(word))
-            similar_words.append((w.text, similarity))
+    # If not found, try without POS
+    if word in s2v:
+        similar_words = s2v.most_similar(word, n=n)
+        return [word.split("|")[0] for word, _ in similar_words]
 
-    # Sort by similarity and return top n
-    similar_words.sort(key=lambda x: x[1], reverse=True)
-    return [word for word, _ in similar_words[:n]]
+    return []
 
 def get_synonyms(word, n=3):
     synonyms = []
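For reference, a rough usage sketch of the new helper once the s2v_old vectors are loaded: sense2vec keys carry a sense tag such as photosynthesis|NOUN, most_similar() returns (key, score) pairs, and the helper strips the |POS suffix. The printed distractors are illustrative, not actual model output:

# Illustrative only: results depend on which sense2vec vectors are on disk.
distractors = get_similar_words_sense2vec("photosynthesis", n=3)
print(distractors)  # e.g. ['respiration', 'chlorophyll', 'glucose']

Unlike the word-vector version it replaces, this helper does not filter out variants of the answer itself, and multi-word keys keep sense2vec's underscore form (e.g. cellular_respiration).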
@@ -83,8 +85,8 @@ def get_synonyms(word, n=3):
 def generate_options(answer, context, n=3):
     options = [answer]
 
-    # Try to get similar words based on word vectors
-    similar_words = get_similar_words(answer, n)
+    # Try to get similar words based on sense2vec
+    similar_words = get_similar_words_sense2vec(answer, n)
     options.extend(similar_words)
 
     # If we don't have enough options, try synonyms
@@ -138,6 +140,7 @@ def entity_linking(keyword):
     return None
 
 # Function to generate questions using beam search
+@st.cache_data
 def generate_question(context, answer, num_beams):
     input_text = f"<context> {context} <answer> {answer}"
     input_ids = tokenizer.encode(input_text, return_tensors='pt')
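st.cache_data keys the cache on the decorated function's arguments, so repeating the same (context, answer, num_beams) triple on a later rerun returns the stored question instead of running beam search again; the global model and tokenizer are not part of the cache key. A small sketch of that behaviour (the context string and num_beams=4 are example values, not from the app):

import time

context = "Photosynthesis converts light energy into chemical energy in plants."

t0 = time.perf_counter()
q1 = generate_question(context, "photosynthesis", num_beams=4)  # runs the model
t1 = time.perf_counter()
q2 = generate_question(context, "photosynthesis", num_beams=4)  # served from cache
t2 = time.perf_counter()

assert q1 == q2
print(f"first call: {t1 - t0:.2f}s, cached call: {t2 - t1:.4f}s")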
@@ -147,7 +150,7 @@ def generate_question(context, answer, num_beams):
 
 # Function to export questions to CSV
 def export_to_csv(data):
-    df = pd.DataFrame(data, columns=["Context", "Answer", "Question"])
+    df = pd.DataFrame(data, columns=["Context", "Answer", "Question", "Options"])
     csv = df.to_csv(index=False,encoding='utf-8')
     return csv
 
@@ -157,7 +160,7 @@ def export_to_pdf(data):
     pdf.add_page()
     pdf.set_font("Arial", size=12)
 
-    for context, answer, question in data:
+    for context, answer, question, options in data:
         pdf.multi_cell(0, 10, f"Context: {context}")
         pdf.multi_cell(0, 10, f"Answer: {answer}")
         pdf.multi_cell(0, 10, f"Question: {question}")
@@ -199,13 +202,13 @@ if st.button("Generate Questions"):
             st.write(f"**Answer:** {keyword}")
             st.write(f"**Question:** {question}")
             st.write(f"**Options:**")
-            for j, option in options:
+            for j, option in enumerate(options):
                 st.write(f"{chr(65+j)}. {option}")
 
             if linked_entity:
                 st.write(f"**Entity Link:** {linked_entity}")
             st.write("---")
-            data.append((context, keyword, question))
+            data.append((context, keyword, question, options))
 
         # Add the data to session state
         st.session_state.data = data
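The switch to enumerate(options) also fixes a latent bug: "for j, option in options:" tries to unpack each option string into two values and fails with a ValueError unless the string happens to be exactly two characters long. With enumerate, options are labelled from chr(65) = 'A' onward; a standalone illustration with an example option list:

options = ["chloroplast", "mitochondria", "ribosome", "nucleus"]
for j, option in enumerate(options):
    print(f"{chr(65 + j)}. {option}")
# A. chloroplast
# B. mitochondria
# C. ribosome
# D. nucleus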
@@ -224,4 +227,4 @@ if st.button("Generate Questions"):
 
 
 else:
-    st.write("Please enter some text to generate questions.")
+    st.write("Please enter some text to generate questions.")
 