# llm4research-query-visualization / generate_answers.py
# Author: jojortz — "add initial visualize app" (commit 3caa485)
import gradio as gr
import pandas as pd
from uniflow.flow.client import TransformClient
from uniflow.flow.config import TransformOpenAIConfig
from uniflow.op.prompt import Context
# Toggle printing of intermediate merged LLM responses for debugging.
DEBUG = False
def _build_transform_config(instruction, few_shot_examples, batch_size=16):
    """Build an OpenAI transform config: JSON responses, deterministic, batched."""
    config = TransformOpenAIConfig()
    config.prompt_template.instruction = instruction
    config.prompt_template.few_shot_prompt = few_shot_examples
    config.model_config.model_name = "gpt-4-1106-preview"
    config.model_config.response_format = {"type": "json_object"}
    config.model_config.num_call = 1
    config.model_config.temperature = 0.0  # deterministic ranking
    config.model_config.num_thread = batch_size
    config.model_config.batch_size = batch_size
    return config


def _merge_chunk_responses(chunk_outputs):
    """Fold the per-chunk LLM outputs for one paper into a single record.

    The first output becomes the accumulator: its scalar "explanation" is
    wrapped in a list, then every later output's "answer" and "explanation"
    are appended to it. Assumes chunk_outputs is non-empty.
    """
    combined = chunk_outputs[0]
    first_response = combined["output"][0]["response"][0]
    first_response["explanation"] = [first_response["explanation"]]
    if DEBUG:
        print(combined)
    for item in chunk_outputs[1:]:
        response = item["output"][0]["response"][0]
        first_response["answer"].extend(response["answer"])
        first_response["explanation"].append(response["explanation"])
    return combined


def generate_relevant_chunks(query, input_data, progress=gr.Progress()):
    """Identify, per paper, the sentences most directly relevant to *query*.

    Every chunk of every paper is sent to the LLM as a (query, paragraph)
    Context pair; the per-chunk responses are then merged into one answer
    list per paper.

    Args:
        query: Short description of the information being searched for.
        input_data: List of dicts, each with a "paper" title and a "chunks"
            list of paragraph strings.
        progress: Gradio progress tracker (injected by the Gradio UI;
            currently unused — the tqdm hook is disabled below).

    Returns:
        ``[output_answers, df]`` where output_answers is a list of
        ``{"paper": ..., "answer": [...]}`` dicts ("answer" is ``["None"]``
        when no relevant sentence was found) and df is the same data as a
        pandas DataFrame.
    """
    # Wrap every chunk of every paper as a (query, paragraph) Context pair.
    data_list = [
        {
            "paper": paper["paper"],
            "data": [Context(context=query, paragraph=p) for p in paper["chunks"]],
        }
        for paper in input_data  # progress.tqdm(input_data, desc="Papers") disabled
    ]

    instruction = """
# Task: I am a researcher trying to understand information across several research papers. You are to determine which of the chunks most directly contains information related to the query.
## Input:
1. context: A brief query or description of the information I am looking for.
2. paragraph: A paragraph from a research paper.
## Evaluation Criteria: You should pick which sentence(s) contains directly relevant information to the context. The best answer is the sentences that most directly answer or contain the information specific to the context. If there are no such sentences, you should answer with ["None"].
## Response Format: Your response should only include two fields below:
1. explanation: Reasoning behind your judgment, explaining why the answer is appropriate or not.
2. answer: The best sentence(s) that meet the Evaluation Criteria as a list of strings. This should be ["None"] if no sentence answers the query. At most, include 3 sentences.
"""
    client = TransformClient(_build_transform_config(instruction, []))

    output = []
    for paper in data_list:
        if not paper["data"]:
            # Robustness: a paper with zero chunks would otherwise crash on
            # chunk_outputs[0]; synthesize an empty "no answer" record instead.
            output.append(
                {"output": [{"response": [{"explanation": [], "answer": ["None"]}]}]}
            )
            continue
        output.append(_merge_chunk_responses(client.run(paper["data"])))

    # Collapse each merged record to {"paper", "answer"}, dropping "None"
    # placeholders unless nothing relevant was found at all.
    output_answers = []
    for idx, o in enumerate(output):
        answers = [a for a in o["output"][0]["response"][0]["answer"] if a != "None"]
        output_answers.append(
            {"paper": input_data[idx]["paper"], "answer": answers or ["None"]}
        )
    return [output_answers, pd.DataFrame(output_answers)]