# llm4research-query-visualization / generate_answers.py
# Author: jojortz — "add initial visualize app" (commit 3caa485)
import gradio as gr
import pandas as pd
from uniflow.flow.client import TransformClient
from uniflow.flow.config import TransformOpenAIConfig
from uniflow.op.prompt import Context
# Toggle printing of intermediate merged LLM responses for debugging.
DEBUG = False
def _build_transform_config(instruction, few_shot_examples, batch_size=16):
    """Build an OpenAI transform config: JSON responses, deterministic, batched."""
    config = TransformOpenAIConfig()
    config.prompt_template.instruction = instruction
    config.prompt_template.few_shot_prompt = few_shot_examples
    config.model_config.model_name = "gpt-4-1106-preview"
    config.model_config.response_format = {"type": "json_object"}
    config.model_config.num_call = 1
    config.model_config.temperature = 0.0  # deterministic ranking
    config.model_config.num_thread = batch_size
    config.model_config.batch_size = batch_size
    return config


def _merge_chunk_responses(chunk_outputs):
    """Fold the per-chunk LLM outputs for one paper into a single record.

    The first output becomes the accumulator: its scalar "explanation" is
    wrapped in a list, then every later output's "answer" and "explanation"
    are appended to it. Assumes chunk_outputs is non-empty.
    """
    combined = chunk_outputs[0]
    first_response = combined["output"][0]["response"][0]
    first_response["explanation"] = [first_response["explanation"]]
    if DEBUG:
        print(combined)
    for item in chunk_outputs[1:]:
        response = item["output"][0]["response"][0]
        first_response["answer"].extend(response["answer"])
        first_response["explanation"].append(response["explanation"])
    return combined


def generate_relevant_chunks(query, input_data, progress=gr.Progress()):
    """Identify, per paper, the sentences most directly relevant to *query*.

    Every chunk of every paper is sent to the LLM as a (query, paragraph)
    Context pair; the per-chunk responses are then merged into one answer
    list per paper.

    Args:
        query: Short description of the information being searched for.
        input_data: List of dicts, each with a "paper" title and a "chunks"
            list of paragraph strings.
        progress: Gradio progress tracker (injected by the Gradio UI;
            currently unused — the tqdm hook is disabled below).

    Returns:
        ``[output_answers, df]`` where output_answers is a list of
        ``{"paper": ..., "answer": [...]}`` dicts ("answer" is ``["None"]``
        when no relevant sentence was found) and df is the same data as a
        pandas DataFrame.
    """
    # Wrap every chunk of every paper as a (query, paragraph) Context pair.
    data_list = [
        {
            "paper": paper["paper"],
            "data": [Context(context=query, paragraph=p) for p in paper["chunks"]],
        }
        for paper in input_data  # progress.tqdm(input_data, desc="Papers") disabled
    ]

    instruction = """
# Task: I am a researcher trying to understand information across several research papers. You are to determine which of the chunks most directly contains information related to the query.
## Input:
1. context: A brief query or description of the information I am looking for.
2. paragraph: A paragraph from a research paper.
## Evaluation Criteria: You should pick which sentence(s) contains directly relevant information to the context. The best answer is the sentences that most directly answer or contain the information specific to the context. If there are no such sentences, you should answer with ["None"].
## Response Format: Your response should only include two fields below:
1. explanation: Reasoning behind your judgment, explaining why the answer is appropriate or not.
2. answer: The best sentence(s) that meet the Evaluation Criteria as a list of strings. This should be ["None"] if no sentence answers the query. At most, include 3 sentences.
"""
    client = TransformClient(_build_transform_config(instruction, []))

    output = []
    for paper in data_list:
        if not paper["data"]:
            # Robustness: a paper with zero chunks would otherwise crash on
            # chunk_outputs[0]; synthesize an empty "no answer" record instead.
            output.append(
                {"output": [{"response": [{"explanation": [], "answer": ["None"]}]}]}
            )
            continue
        output.append(_merge_chunk_responses(client.run(paper["data"])))

    # Collapse each merged record to {"paper", "answer"}, dropping "None"
    # placeholders unless nothing relevant was found at all.
    output_answers = []
    for idx, o in enumerate(output):
        answers = [a for a in o["output"][0]["response"][0]["answer"] if a != "None"]
        output_answers.append(
            {"paper": input_data[idx]["paper"], "answer": answers or ["None"]}
        )
    return [output_answers, pd.DataFrame(output_answers)]