import os
import time
from pathlib import Path

import gradio as gr
import pke
from sense2vec import Sense2Vec
from transformers import AutoTokenizer

from fastT5 import get_onnx_runtime_sessions, OnnxT5
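# One-time setup (left commented out): download and extract the sense2vec
# Reddit vectors that back distractor generation below.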
# commands = [
#     "curl -LO https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz",
#     "tar -xvf s2v_reddit_2015_md.tar.gz",
# ]
# for command in commands:
#     return_code = os.system(command)
#     if return_code == 0:
#         print(f"Command '{command}' executed successfully")
#     else:
#         print(f"Command '{command}' failed with return code {return_code}")
s2v = Sense2Vec().from_disk("s2v_old")
# Paths to the quantized ONNX exports of the fine-tuned T5 (SQuAD v1) model.
trained_model_path = './t5_squad_v1/'
pretrained_model_name = Path(trained_model_path).stem
encoder_path = os.path.join(
    trained_model_path, f"{pretrained_model_name}-encoder_quantized.onnx")
decoder_path = os.path.join(
    trained_model_path, f"{pretrained_model_name}-decoder_quantized.onnx")
init_decoder_path = os.path.join(
    trained_model_path, f"{pretrained_model_name}-init-decoder_quantized.onnx")
model_paths = encoder_path, decoder_path, init_decoder_path

# Build ONNX Runtime sessions, wrap them in a fastT5 OnnxT5 model,
# and load the matching tokenizer from the same directory.
model_sessions = get_onnx_runtime_sessions(model_paths)
model = OnnxT5(trained_model_path, model_sessions)
tokenizer = AutoTokenizer.from_pretrained(trained_model_path)
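# Generate one question for a (context, answer) pair via beam search over the
# ONNX T5 model.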
def get_question(sentence, answer, mdl, tknizer):
    # Build the T5 prompt: the model was fine-tuned on "context: ... answer: ..." inputs.
    text = f"context: {sentence} answer: {answer}"
    print(text)
    max_len = 256
    encoding = tknizer.encode_plus(
        text, max_length=max_len, padding=False, truncation=True, return_tensors="pt")
    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
    # Beam search keeps the best of 5 candidate questions.
    outs = mdl.generate(input_ids=input_ids,
                        attention_mask=attention_mask,
                        early_stopping=True,
                        num_beams=5,
                        num_return_sequences=1,
                        no_repeat_ngram_size=2,
                        max_length=300)
    dec = [tknizer.decode(ids, skip_special_tokens=True) for ids in outs]
    # Strip the "question:" prefix the model emits.
    question = dec[0].replace("question:", "").strip()
    return question
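# Thin wrapper around get_question that logs wall-clock latency per call.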
def generate_question(context, answer):
    start_time = time.time()  # Record the start time
    result = get_question(context, answer, model, tokenizer)
    end_time = time.time()  # Record the end time
    latency = end_time - start_time  # Calculate latency
    print(f"Latency: {latency} seconds")
    return result
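# End-to-end MCQ pipeline: TopicRank keyphrases -> sense2vec distractors ->
# T5-generated question for each keyphrase that has a sense2vec entry.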
def generate_mcq(context):
    # Rank candidate keyphrases (nouns, proper nouns, adjectives) with TopicRank.
    extractor = pke.unsupervised.TopicRank()
    extractor.load_document(input=context, language='en')
    extractor.candidate_selection(pos={"NOUN", "PROPN", "ADJ"})
    extractor.candidate_weighting()
    keyphrases = extractor.get_n_best(n=10)
    results = []
    for keyword, _ in keyphrases:
        original_keyword = keyword
        # sense2vec keys are lowercase with underscores, e.g. "machine_learning".
        keyword = original_keyword.lower().replace(" ", "_")
        sense = s2v.get_best_sense(keyword)
        if sense is not None:
            # The two nearest sense2vec neighbours become the distractors;
            # entries look like "word|POS", so keep only the word part.
            most_similar = s2v.most_similar(sense, n=2)
            distractors = [word.split("|")[0].lower().replace(
                "_", " ") for word, _ in most_similar]
            question = generate_question(context, original_keyword)
            result = {
                "Question": question,
                "Keyword": original_keyword,
                "Distractor1": distractors[0],
                "Distractor2": distractors[1]
            }
            results.append(result)
    return results
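# Minimal Gradio UI: one context textbox in, a JSON list of MCQs out.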
iface = gr.Interface(
    fn=generate_mcq,
    inputs=gr.Textbox(label="Context", type='text'),
    outputs=gr.JSON(),
    title="Questgen AI",
    description="Enter a context to generate MCQs for keywords."
)
iface.launch()
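# Quick local check (illustrative; assumes ./t5_squad_v1/ and s2v_old are in
# place). Each dict in the returned list has the shape
# {"Question": ..., "Keyword": ..., "Distractor1": ..., "Distractor2": ...}:
#     print(generate_mcq("The Eiffel Tower is a wrought-iron lattice tower in Paris."))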