File size: 9,087 Bytes
3caa485
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import pprint

import pandas as pd
from uniflow.flow.client import TransformClient
from uniflow.flow.config import TransformOpenAIConfig
from uniflow.op.prompt import Context

from helpers import compare_strings_ignore_non_string
from visualize_upload import visualize

# When True, cluster() pretty-prints the raw output returned by the transform client.
DEBUG = False


def cluster(query, answers_data):
    """Cluster paper excerpts into query-driven categories via an LLM call.

    All excerpts from ``answers_data`` are flattened into one list and sent,
    together with ``query``, to a uniflow ``TransformClient`` configured for
    JSON output. The ``clusters`` mapping from the response (category ->
    list of excerpt strings) is then matched back onto each paper's excerpts
    with ``compare_strings_ignore_non_string``.

    Args:
        query: The research question; passed as the ``context`` of the prompt.
        answers_data: A list of per-paper dicts. Each entry is read through
            its ``"paper"`` key and its ``"answer"`` key (an iterable of
            excerpts). NOTE: the entries are annotated IN PLACE with a
            ``"categories"`` list; the returned ``output_clusters`` is the
            same list object as ``answers_data``.

    Returns:
        A four-element list:
        ``[output_clusters, df, visualize_output, output_answer_category_df]``
        where ``df`` is the per-paper summary from ``create_category_df``,
        ``visualize_output`` is whatever ``visualize`` returns, and
        ``output_answer_category_df`` has one row per (paper, answer,
        category) match.
    """
    answers = []
    for answer in answers_data:
        answers.extend(answer["answer"])

    data = [Context(context=query, excerpts=answers)]

    instruction = """
# Task: I am a researcher with a query about research papers. I have a list of excerpts from those papers. I need you to cluster each of these excerpts into a category based on the query.
## Input:
1. context: A brief query/context
2. excerpts: An list of excerpts from research papers.
## Evaluation Steps:
### Step 1
Go through each excerpt. For each excerpt, if there is an answer to the context/query that's not already captured by a category, create a category and add it to your category list. If the context has the word 'specific', make the category as specific as the excerpt. Repeat this process for each excerpt. The categories should be mutually exclusive.
### Step 2
Once you've gone through all the excerpts and you have a list of categories, go through the excerpts a second time, and this time assign each excerpt to a category. A single excerpt can be assigned to multiple categories. If there is no information relevant to any of the categories, please categorize the excerpt as "None".
## Response Format: Your response should only include two fields below:
1. categories: A list of all the generated categories. This is the output of Step 1 above.
2. clusters: An object, with each category as a key, and a list of all the excerpts as strings that fall into that category as the value. This is the output of Step 2 above.
"""

    # Few-shot examples are currently disabled (the list is empty); the
    # commented entries are kept for reference.
    few_shot_examples = [
        # Context(
        #     context="Which types of batteries are discussed?",
        #     excerpts=[
        #         "This investigation will shed lights on the tuneable chemical environments of transition-metal oxides for advanced cathode materials and promote the development of sodium-ion batteries.",
        #         "Bi2Se3 was studied as a novel sodium-ion battery anode material because of its high theoretical capacity and high intrinsic conductivity.",
        #         "Magnesium-ion batteries (MIBs) are considered strong candidates for next-generation energy-storage systems owing to their high theoretical capacity, divalent nature and the natural abundancy of magnesium (Mg) resources on Earth.",
        #         "Magnesium-ion batteries (MIBs) have great potential in large-scale energy storage field with high capacity, excellent safety, and low cost.",
        #     ],
        #     categories=["Sodium-ion battery", "Magnesium-ion batteries"],
        #     clusters={
        #         "Sodium-ion battery": [
        #             "This investigation will shed lights on the tuneable chemical environments of transition-metal oxides for advanced cathode materials and promote the development of sodium-ion batteries.",
        #             "Bi2Se3 was studied as a novel sodium-ion battery anode material because of its high theoretical capacity and high intrinsic conductivity.",
        #         ],
        #         "Magnesium-ion batteries": [
        #             "Magnesium-ion batteries (MIBs) are considered strong candidates for next-generation energy-storage systems owing to their high theoretical capacity, divalent nature and the natural abundancy of magnesium (Mg) resources on Earth.",
        #             "Magnesium-ion batteries (MIBs) have great potential in large-scale energy storage field with high capacity, excellent safety, and low cost.",
        #         ],
        #     },
        # ),
        # Context(
        #     context="Which 3D printing materials are discussed?",
        #     excerpts=[
        #         "The current state of materials development, including metal alloys, polymer composites, ceramics and concrete, was presented",
        #         "To this end, this work designs a novel 3D printing phase change aggregate to prepare concrete with prominent thermal capacity and ductility.",
        #         "In this study, 15 commercial pure titanium samples are processed under different conditions, and the 3D pore structures are characterized by X-ray tomography",
        #         "In this study, a support-less ceramic printing (SLCP) process using a hydrogel bath was developed to facilitate the manufacture of complex bone substitutes.",
        #     ],
        #     categories=[
        #         "metals",
        #         "polymer composites",
        #         "ceramics",
        #         "concrete",
        #         "phase change aggregate",
        #     ],
        #     clusters={
        #         "metals": [
        #             "The current state of materials development, including metal alloys, polymer composites, ceramics and concrete, was presented",
        #             "In this study, 15 commercial pure titanium samples are processed under different conditions, and the 3D pore structures are characterized by X-ray tomography",
        #         ],
        #         "polymer composites": [
        #             "The current state of materials development, including metal alloys, polymer composites, ceramics and concrete, was presented"
        #         ],
        #         "ceramics": [
        #             "The current state of materials development, including metal alloys, polymer composites, ceramics and concrete, was presented",
        #             "In this study, a support-less ceramic printing (SLCP) process using a hydrogel bath was developed to facilitate the manufacture of complex bone substitutes.",
        #         ],
        #         "concrete": [
        #             "The current state of materials development, including metal alloys, polymer composites, ceramics and concrete, was presented",
        #             "To this end, this work designs a novel 3D printing phase change aggregate to prepare concrete with prominent thermal capacity and ductility.",
        #         ],
        #         "phase change aggregate": [
        #             "To this end, this work designs a novel 3D printing phase change aggregate to prepare concrete with prominent thermal capacity and ductility."
        #         ],
        #     },
        # ),
    ]

    # The same value is used for both the thread count and the batch size.
    num_thread_batch_size = 1

    config = TransformOpenAIConfig()
    config.prompt_template.instruction = instruction
    config.prompt_template.few_shot_prompt = few_shot_examples
    config.model_config.model_name = "gpt-4-1106-preview"
    config.model_config.response_format = {"type": "json_object"}
    config.model_config.num_call = 1
    config.model_config.temperature = 0.0
    config.model_config.num_thread = num_thread_batch_size
    config.model_config.batch_size = num_thread_batch_size

    cluster_client = TransformClient(config)

    output = cluster_client.run(data)
    if DEBUG:
        pprint.pprint(output)

    # Alias, not a copy: the caller's dicts receive the "categories" key.
    output_clusters = answers_data
    # NOTE(review): assumes the first response was parsed into a dict with a
    # "clusters" key; a malformed model reply will raise here -- confirm
    # whether callers need a friendlier failure.
    clusters = output[0]["output"][0]["response"][0]["clusters"]
    output_answer_category = []

    for paper in output_clusters:
        # Ordered, de-duplicated category labels for this paper.
        paper_categories = []

        for ans in paper["answer"]:
            # Tracked separately from paper_categories so that an answer whose
            # categories were all recorded by an earlier answer is still
            # recognised as matched and is not mislabelled "None".
            matched = False
            for category, texts in clusters.items():
                if any(compare_strings_ignore_non_string(ans, text) for text in texts):
                    matched = True
                    if category not in paper_categories:
                        paper_categories.append(category)
                    # Every (answer, category) hit is recorded, duplicates or not.
                    output_answer_category.append(
                        {"paper": paper["paper"], "answer": ans, "category": category}
                    )
            if not matched and "None" not in paper_categories:
                paper_categories.append("None")

        # A paper with no answers at all is still labelled "None".
        if not paper_categories:
            paper_categories.append("None")
        paper["categories"] = paper_categories

    df = create_category_df(output_clusters, answers_data)
    output_answer_category_df = pd.DataFrame(output_answer_category)
    visualize_output = visualize(output_clusters)

    return [output_clusters, df, visualize_output, output_answer_category_df]


def create_category_df(cluster_output, answers_data):
    """Build a per-paper summary table of excerpts and assigned categories.

    Args:
        cluster_output: Sequence of dicts, each read through its ``"paper"``
            and ``"categories"`` keys.
        answers_data: Sequence indexed in parallel with ``cluster_output``;
            each entry is read through its ``"answer"`` key.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Paper``, ``Excerpts`` and
        ``Categories``; the latter two are ``", "``-joined strings.
    """
    column_names = ("Paper", "Excerpts", "Categories")
    separator = ", "

    # One tuple per paper, in the same field order as column_names.
    rows = [
        (
            entry["paper"],
            separator.join(answers_data[position]["answer"]),
            separator.join(entry["categories"]),
        )
        for position, entry in enumerate(cluster_output)
    ]

    # Transpose the rows into column lists before handing them to pandas.
    table = {
        name: [row[col] for row in rows]
        for col, name in enumerate(column_names)
    }
    return pd.DataFrame(table)