|
import gradio as gr |
|
import pandas as pd |
|
from uniflow.flow.client import TransformClient |
|
from uniflow.flow.config import TransformOpenAIConfig |
|
from uniflow.op.prompt import Context |
|
|
|
# When True, print each paper's combined LLM output to stdout for inspection.
DEBUG = False
|
|
|
|
|
def generate_relevant_chunks(query, input_data, progress=gr.Progress()):
    """Select the sentences in each paper's chunks most relevant to *query*.

    For every paper, each chunk is wrapped in a ``Context`` and sent through
    an OpenAI-backed uniflow transform; the per-chunk JSON responses are then
    merged into a single answer list per paper.

    Args:
        query: Short description of the information being searched for.
        input_data: List of dicts, each with a ``"paper"`` key (identifier)
            and a ``"chunks"`` key (list of paragraph strings).
            NOTE(review): schema inferred from the indexing below — confirm
            against the caller.
        progress: Gradio progress tracker (injected automatically when this
            function backs a gradio event). Default evaluates at import time,
            which is the standard gradio pattern.

    Returns:
        A two-element list ``[output_answers, df]`` where ``output_answers``
        is ``[{"paper": ..., "answer": [...]}, ...]`` (``["None"]`` when no
        chunk matched) and ``df`` is the same records as a DataFrame.
    """
    # One Context per chunk, grouped by paper so responses can be merged back.
    data_list = [
        {
            "paper": paper["paper"],
            "data": [Context(context=query, paragraph=p) for p in paper["chunks"]],
        }
        for paper in input_data
    ]

    instruction = """
    # Task: I am a researcher trying to understand information across several research papers. You are to determine which of the chunks most directly contains information related to the query.
    ## Input:
    1. context: A brief query or description of the information I am looking for.
    2. paragraph: A paragraph from a research paper.
    ## Evaluation Criteria: You should pick which sentence(s) contains directly relevant information to the context. The best answer is the sentences that most directly answer or contain the information specific to the context. If there are no such sentences, you should answer with ["None"].
    ## Response Format: Your response should only include two fields below:
    1. explanation: Reasoning behind your judgment, explaining why the answer is appropriate or not.
    2. answer: The best sentence(s) that meet the Evaluation Criteria as a list of strings. This should be ["None"] if no sentence answers the query. At most, include 3 sentences.
    """

    few_shot_examples = []

    # Thread count and batch size are kept in lockstep so each batch is
    # dispatched in one fully parallel wave.
    num_thread_batch_size = 16

    # Deterministic (temperature 0), JSON-mode GPT-4 Turbo configuration.
    config = TransformOpenAIConfig()
    config.prompt_template.instruction = instruction
    config.prompt_template.few_shot_prompt = few_shot_examples
    config.model_config.model_name = "gpt-4-1106-preview"
    config.model_config.response_format = {"type": "json_object"}
    config.model_config.num_call = 1
    config.model_config.temperature = 0.0
    config.model_config.num_thread = num_thread_batch_size
    config.model_config.batch_size = num_thread_batch_size

    client = TransformClient(config)

    output = []

    # progress.tqdm surfaces per-paper progress in the gradio UI (the
    # original accepted `progress` but never used it).
    for paper in progress.tqdm(data_list, desc="Searching papers"):
        init_output = client.run(paper["data"])
        # Fold every chunk's response into the first one. The first
        # explanation is wrapped in a list so later ones can be appended.
        combined = init_output[0]
        first_resp = combined["output"][0]["response"][0]
        first_resp["explanation"] = [first_resp["explanation"]]
        if DEBUG:
            print(combined)
        for item in init_output[1:]:
            resp = item["output"][0]["response"][0]
            first_resp["answer"].extend(resp["answer"])
            first_resp["explanation"].append(resp["explanation"])
        output.append(combined)

    # data_list and output are index-aligned by construction; zip them rather
    # than re-indexing input_data.
    output_answers = []
    for paper, merged in zip(data_list, output):
        answers = [
            a for a in merged["output"][0]["response"][0]["answer"] if a != "None"
        ]
        # Restore the ["None"] sentinel when every chunk came back empty.
        output_answers.append({"paper": paper["paper"], "answer": answers or ["None"]})

    df = pd.DataFrame(output_answers)

    return [output_answers, df]
|
|