jojortz committed
Commit 3caa485 · Parent: 8619559

add initial visualize app

Files changed (8)
  1. .gitignore +2 -0
  2. app.py +228 -0
  3. cluster.py +159 -0
  4. extract.py +57 -0
  5. generate_answers.py +76 -0
  6. helpers.py +20 -0
  7. requirements.txt +8 -0
  8. visualize_upload.py +51 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .env
+ __pycache__
app.py ADDED
@@ -0,0 +1,228 @@
+ import sys
+
+ import gradio as gr
+
+ sys.path.append(".")
+ sys.path.append("..")
+ sys.path.append("../..")
+
+ from cluster import cluster
+ from extract import extract_endpoint
+ from generate_answers import generate_relevant_chunks
+
+ queries = [
+     "What is the size, shape, and energy (watt hour) or capacity (Amp hour) of battery discussed in the paper?",
+     "What specific mechanical testing methods were used to quantify strength?",
+     "What parameters they used to quantify the benefit of their individual design (mass saving, increased run time, etc.)?",
+     "What material chemistry combination (on the anode, cathode, separator, and electrolyte) was used in these papers?",
+     "What kind of end use application they targeted?",
+ ]
+ MAX_CATEGORIES = 10
+
+
+ def change_button(text):
+     if len(text) > 0:
+         return gr.Button(interactive=True)
+     else:
+         return gr.Button(interactive=False)
+
+
+ def generate_category_btn(cluster_output):
+     unique_categories = set()
+     for item in cluster_output:
+         unique_categories.update(item["categories"])
+
+     update_show = [gr.Button(visible=True, value=w) for w in unique_categories]
+     update_hide = [
+         gr.Button(visible=False, value="")
+         for _ in range(MAX_CATEGORIES - len(unique_categories))
+     ]
+     return update_show + update_hide
+
+
+ def get_query(btn):
+     return btn
+
+
+ btn_list = []
+
+
+ with gr.Blocks() as app:
+     gr.Markdown(
+         """
+         # Paper Query Clustering + Visualization
+         This app extracts text from papers and then searches for relevant excerpts based on a query. It then clusters and visualizes the relevant excerpts to find common themes across the papers.
+
+         ### Input
+         1. A group of research papers that you want to run the query on.
+         1. A query that you would like answered about these papers.
+
+         ### Output
+         Clustering and visualization of the relevant excerpts which answer the query across the papers.
+
+         # 1. Upload + Extract
+         First, upload the papers you want to analyze. Currently, we only support PDFs. Once they're uploaded, you can extract the text data from the papers.
+         """
+     )
+     file_upload = gr.Files()
+     extract_btn = gr.Button("Extract", interactive=False)
+     with gr.Tab(label="Table"):
+         extract_df = gr.Dataframe(
+             datatype="markdown", column_widths=[100, 400], wrap=True
+         )
+     with gr.Tab(label="JSON"):
+         extract_output = gr.JSON(label="Extract Output")
+
+     gr.Markdown(
+         """
+         ----------------
+         # 2. Extract Relevant Excerpts
+         Enter a query about these papers. This will search the papers to find the most relevant excerpts.
+         """
+     )
+
+     gr.Markdown(
+         """
+         ### Input
+         """
+     )
+     query = gr.Textbox(
+         label="Query", value=queries[1], lines=3, placeholder="Enter a query"
+     )
+     gr.Markdown(
+         """
+         You can also select some example queries below.
+         """
+     )
+     with gr.Row():
+         q0_btn = gr.Button(queries[0])
+         q1_btn = gr.Button(queries[1])
+         q2_btn = gr.Button(queries[2])
+         q3_btn = gr.Button(queries[3])
+         q4_btn = gr.Button(queries[4])
+     gr.Markdown(
+         """
+         ----
+         """
+     )
+     relevant_btn = gr.Button("Extract Excerpts", interactive=False)
+     gr.Markdown(
+         """
+         ### Output
+         """
+     )
+     with gr.Tab(label="Output Table"):
+         relevant_df = gr.Dataframe(
+             datatype="markdown", column_widths=[100, 100, 300], wrap=True
+         )
+     with gr.Tab(label="JSON"):
+         relevant_output = gr.JSON(label="Relevant Chunks Output")
+
+     gr.Markdown(
+         """
+         ----------------
+         # 3. Cluster & Visualize
+         Cluster the relevant excerpts to find common themes and visualize the results.
+         """
+     )
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown(
+                 """
+                 ### Input
+                 """
+             )
+             cluster_btn = gr.Button("Cluster", interactive=False)
+             cluster_output = gr.JSON(label="Cluster Output", visible=False)
+
+     gr.Markdown(
+         """
+         ### Visualization
+         """
+     )
+     visualize_output = gr.Plot()
+     with gr.Row():
+         for i in range(MAX_CATEGORIES):
+             btn = gr.Button(visible=False)
+             btn_list.append(btn)
+     with gr.Tab(label="By Paper"):
+         cluster_df = gr.Dataframe(
+             datatype="markdown", column_widths=[100, 100, 300], wrap=True
+         )
+
+     with gr.Tab(label="By Excerpt"):
+         cluster_granular_df = gr.Dataframe(
+             datatype="markdown", column_widths=[100, 100, 300], wrap=True
+         )
+
+     # Event handlers
+     file_upload.change(fn=change_button, inputs=[file_upload], outputs=[extract_btn])
+
+     extract_btn.click(
+         fn=extract_endpoint,
+         inputs=[file_upload],
+         outputs=[extract_output, extract_df],
+     )
+
+     extract_output.change(
+         fn=change_button,
+         inputs=[extract_output],
+         outputs=[relevant_btn],
+     )
+
+     q0_btn.click(
+         fn=get_query,
+         inputs=[q0_btn],
+         outputs=[query],
+     )
+
+     q1_btn.click(
+         fn=get_query,
+         inputs=[q1_btn],
+         outputs=[query],
+     )
+
+     q2_btn.click(
+         fn=get_query,
+         inputs=[q2_btn],
+         outputs=[query],
+     )
+
+     q3_btn.click(
+         fn=get_query,
+         inputs=[q3_btn],
+         outputs=[query],
+     )
+
+     q4_btn.click(
+         fn=get_query,
+         inputs=[q4_btn],
+         outputs=[query],
+     )
+
+     relevant_btn.click(
+         fn=generate_relevant_chunks,
+         inputs=[query, extract_output],
+         outputs=[relevant_output, relevant_df],
+         api_name="relevant_chunks",
+     )
+
+     relevant_output.change(
+         fn=change_button, inputs=[relevant_output], outputs=[cluster_btn]
+     )
+
+     cluster_btn.click(
+         fn=cluster,
+         inputs=[query, relevant_output],
+         outputs=[cluster_output, cluster_df, visualize_output, cluster_granular_df],
+         api_name="cluster",
+     )
+
+     cluster_output.change(
+         fn=generate_category_btn,
+         inputs=[cluster_output],
+         outputs=btn_list,
+     )
+
+ if __name__ == "__main__":
+     app.launch()
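A pattern worth noting in `app.py`: every stage's action button starts with `interactive=False` and is re-enabled by `change_button` when the upstream component's value changes, so the pipeline can only run in order. A minimal standalone sketch of that gating pattern (a hypothetical demo, not part of this commit):

```python
# Minimal sketch of the enable-on-input gating used in app.py
# (hypothetical standalone demo, not part of this commit).
import gradio as gr


def change_button(text):
    # Re-enable the downstream button only once the upstream value is non-empty.
    return gr.Button(interactive=len(text) > 0)


with gr.Blocks() as demo:
    box = gr.Textbox(label="Input")
    run_btn = gr.Button("Run", interactive=False)
    box.change(fn=change_button, inputs=[box], outputs=[run_btn])

if __name__ == "__main__":
    demo.launch()
```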
cluster.py ADDED
@@ -0,0 +1,159 @@
+ import pprint
+
+ import pandas as pd
+ from uniflow.flow.client import TransformClient
+ from uniflow.flow.config import TransformOpenAIConfig
+ from uniflow.op.prompt import Context
+
+ from helpers import compare_strings_ignore_non_string
+ from visualize_upload import visualize
+
+ DEBUG = False
+
+
+ def cluster(query, answers_data):
+     answers = []
+     for answer in answers_data:
+         answers.extend(answer["answer"])
+
+     data = [Context(context=query, excerpts=answers)]
+
+     instruction = """
+     # Task: I am a researcher with a query about research papers. I have a list of excerpts from those papers. I need you to cluster each of these excerpts into a category based on the query.
+     ## Input:
+     1. context: A brief query/context
+     2. excerpts: A list of excerpts from research papers.
+     ## Evaluation Steps:
+     ### Step 1
+     Go through each excerpt. For each excerpt, if there is an answer to the context/query that's not already captured by a category, create a category and add it to your category list. If the context has the word 'specific', make the category as specific as the excerpt. Repeat this process for each excerpt. The categories should be mutually exclusive.
+     ### Step 2
+     Once you've gone through all the excerpts and you have a list of categories, go through the excerpts a second time, and this time assign each excerpt to a category. A single excerpt can be assigned to multiple categories. If there is no information relevant to any of the categories, please categorize the excerpt as "None".
+     ## Response Format: Your response should only include the two fields below:
+     1. categories: A list of all the generated categories. This is the output of Step 1 above.
+     2. clusters: An object, with each category as a key, and a list of all the excerpts as strings that fall into that category as the value. This is the output of Step 2 above.
+     """
+
+     few_shot_examples = [
+         # Context(
+         #     context="Which types of batteries are discussed?",
+         #     excerpts=[
+         #         "This investigation will shed lights on the tuneable chemical environments of transition-metal oxides for advanced cathode materials and promote the development of sodium-ion batteries.",
+         #         "Bi2Se3 was studied as a novel sodium-ion battery anode material because of its high theoretical capacity and high intrinsic conductivity.",
+         #         "Magnesium-ion batteries (MIBs) are considered strong candidates for next-generation energy-storage systems owing to their high theoretical capacity, divalent nature and the natural abundancy of magnesium (Mg) resources on Earth.",
+         #         "Magnesium-ion batteries (MIBs) have great potential in large-scale energy storage field with high capacity, excellent safety, and low cost.",
+         #     ],
+         #     categories=["Sodium-ion battery", "Magnesium-ion batteries"],
+         #     clusters={
+         #         "Sodium-ion battery": [
+         #             "This investigation will shed lights on the tuneable chemical environments of transition-metal oxides for advanced cathode materials and promote the development of sodium-ion batteries.",
+         #             "Bi2Se3 was studied as a novel sodium-ion battery anode material because of its high theoretical capacity and high intrinsic conductivity.",
+         #         ],
+         #         "Magnesium-ion batteries": [
+         #             "Magnesium-ion batteries (MIBs) are considered strong candidates for next-generation energy-storage systems owing to their high theoretical capacity, divalent nature and the natural abundancy of magnesium (Mg) resources on Earth.",
+         #             "Magnesium-ion batteries (MIBs) have great potential in large-scale energy storage field with high capacity, excellent safety, and low cost.",
+         #         ],
+         #     },
+         # ),
+         # Context(
+         #     context="Which 3D printing materials are discussed?",
+         #     excerpts=[
+         #         "The current state of materials development, including metal alloys, polymer composites, ceramics and concrete, was presented",
+         #         "To this end, this work designs a novel 3D printing phase change aggregate to prepare concrete with prominent thermal capacity and ductility.",
+         #         "In this study, 15 commercial pure titanium samples are processed under different conditions, and the 3D pore structures are characterized by X-ray tomography",
+         #         "In this study, a support-less ceramic printing (SLCP) process using a hydrogel bath was developed to facilitate the manufacture of complex bone substitutes.",
+         #     ],
+         #     categories=[
+         #         "metals",
+         #         "polymer composites",
+         #         "ceramics",
+         #         "concrete",
+         #         "phase change aggregate",
+         #     ],
+         #     clusters={
+         #         "metals": [
+         #             "The current state of materials development, including metal alloys, polymer composites, ceramics and concrete, was presented",
+         #             "In this study, 15 commercial pure titanium samples are processed under different conditions, and the 3D pore structures are characterized by X-ray tomography",
+         #         ],
+         #         "polymer composites": [
+         #             "The current state of materials development, including metal alloys, polymer composites, ceramics and concrete, was presented"
+         #         ],
+         #         "ceramics": [
+         #             "The current state of materials development, including metal alloys, polymer composites, ceramics and concrete, was presented",
+         #             "In this study, a support-less ceramic printing (SLCP) process using a hydrogel bath was developed to facilitate the manufacture of complex bone substitutes.",
+         #         ],
+         #         "concrete": [
+         #             "The current state of materials development, including metal alloys, polymer composites, ceramics and concrete, was presented",
+         #             "To this end, this work designs a novel 3D printing phase change aggregate to prepare concrete with prominent thermal capacity and ductility.",
+         #         ],
+         #         "phase change aggregate": [
+         #             "To this end, this work designs a novel 3D printing phase change aggregate to prepare concrete with prominent thermal capacity and ductility."
+         #         ],
+         #     },
+         # ),
+     ]
+
+     num_thread_batch_size = 1
+
+     config = TransformOpenAIConfig()
+     config.prompt_template.instruction = instruction
+     config.prompt_template.few_shot_prompt = few_shot_examples
+     config.model_config.model_name = "gpt-4-1106-preview"
+     config.model_config.response_format = {"type": "json_object"}
+     config.model_config.num_call = 1
+     config.model_config.temperature = 0.0
+     config.model_config.num_thread = num_thread_batch_size
+     config.model_config.batch_size = num_thread_batch_size
+
+     cluster_client = TransformClient(config)
+
+     output = cluster_client.run(data)
+     if DEBUG:
+         pprint.pprint(output)
+     output_clusters = answers_data
+     clusters = output[0]["output"][0]["response"][0]["clusters"]
+     output_answer_category = []
+
+     for idx, paper in enumerate(answers_data):
+         # Initialize an empty list to store the categories for each answer
+         categories_per_answer = []
+
+         # Iterate over each answer
+         for ans in paper["answer"]:
+             categories = []
+             # Iterate over each category in clusters
+             for category, texts in clusters.items():
+                 # Check if the answer is in any of the texts related to the category
+                 if any(compare_strings_ignore_non_string(ans, text) for text in texts):
+                     if category not in categories_per_answer:
+                         categories.append(category)
+                     output_answer_category.append(
+                         {"paper": paper["paper"], "answer": ans, "category": category}
+                     )
+             if len(categories) == 0:
+                 categories.append("None")
+             categories_per_answer.extend(categories)
+
+         output_clusters[idx]["categories"] = categories_per_answer
+     for output_cluster in output_clusters:
+         if len(output_cluster["categories"]) == 0:
+             output_cluster["categories"].append("None")
+     df = create_category_df(output_clusters, answers_data)
+     output_answer_category_df = pd.DataFrame(output_answer_category)
+     visualize_output = visualize(output_clusters)
+
+     return [output_clusters, df, visualize_output, output_answer_category_df]
+
+
+ def create_category_df(cluster_output, answers_data):
+     pd_data = {
+         "Paper": [],
+         "Excerpts": [],
+         "Categories": [],
+     }
+     for i, paper in enumerate(cluster_output):
+         pd_data["Paper"].append(paper["paper"])
+         pd_data["Excerpts"].append(", ".join(answers_data[i]["answer"]))
+         pd_data["Categories"].append(", ".join(paper["categories"]))
+
+     df = pd.DataFrame(pd_data)
+     return df
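The assignment loop in `cluster` only depends on the shape of the model's JSON response (`clusters`: category key mapping to a list of excerpt strings). A simplified sketch of that matching step on mock data, using plain string equality in place of `compare_strings_ignore_non_string`:

```python
# Simplified version of cluster.py's category-assignment loop on mock data;
# the `clusters` dict mirrors the JSON shape the instruction prompt requests.
clusters = {
    "Sodium-ion battery": ["Excerpt A", "Excerpt B"],
    "Magnesium-ion batteries": ["Excerpt C"],
}
answers_data = [{"paper": "paper1.pdf", "answer": ["Excerpt A", "Excerpt C", "Excerpt X"]}]

for paper in answers_data:
    categories_per_answer = []
    for ans in paper["answer"]:
        # An excerpt can match several categories; unmatched excerpts get "None".
        matched = [c for c, texts in clusters.items() if ans in texts]
        categories_per_answer.extend(matched or ["None"])
    paper["categories"] = categories_per_answer

print(answers_data[0]["categories"])
# ['Sodium-ion battery', 'Magnesium-ion batteries', 'None']
```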
extract.py ADDED
@@ -0,0 +1,57 @@
+ import re
+
+ import pandas as pd
+ from dotenv import load_dotenv
+ from llama_index.core import SimpleDirectoryReader
+ from llama_parse import LlamaParse
+
+ load_dotenv()
+ MIN_PARAGRAPH_LENGTH = 50
+
+
+ def extract_paragraphs(markdown_text):
+     """
+     Extract paragraphs from a markdown text.
+     """
+     # Split the text into paragraphs using regex
+     paragraphs = re.split(r"\n\n+", markdown_text)
+     # Remove leading and trailing whitespaces from each paragraph
+     paragraphs = [p.strip() for p in paragraphs if p.strip()]
+     paragraphs = [
+         p
+         for p in paragraphs
+         if len(p) >= MIN_PARAGRAPH_LENGTH and not p.startswith("#")
+     ]
+     print(f"created {len(paragraphs)} paragraphs\n", paragraphs)
+
+     return paragraphs
+
+
+ def extract_endpoint(file_paths):
+     """
+     Extract PDFs using LlamaParse.
+     """
+
+     # set up parser
+     parser = LlamaParse(result_type="markdown")  # "markdown" and "text" are available
+
+     # use SimpleDirectoryReader to parse our file
+     file_extractor = {".pdf": parser}
+     documents = SimpleDirectoryReader(
+         input_files=file_paths, file_extractor=file_extractor
+     ).load_data()
+
+     extracted_data = []
+
+     for doc in documents:
+         print(doc.text[:500])
+         paragraphs = extract_paragraphs(doc.text)
+         data = {
+             "paper": doc.metadata["file_name"],
+             "chunks": paragraphs,
+         }
+         extracted_data.append(data)
+
+     df = pd.DataFrame(extracted_data)
+
+     return [extracted_data, df]
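`extract_paragraphs` is pure string processing, so it can be checked without LlamaParse credentials (assuming the packages in requirements.txt are installed). For example, on made-up markdown input:

```python
# Quick check of extract_paragraphs on made-up markdown input.
from extract import extract_paragraphs

md = (
    "# Title\n\n"
    "Short line.\n\n"
    "This paragraph is comfortably longer than fifty characters, so it survives the filter.\n"
)
# Headings and paragraphs shorter than MIN_PARAGRAPH_LENGTH (50) are dropped,
# leaving only the long paragraph.
print(extract_paragraphs(md))
```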
generate_answers.py ADDED
@@ -0,0 +1,76 @@
+ import gradio as gr
+ import pandas as pd
+ from uniflow.flow.client import TransformClient
+ from uniflow.flow.config import TransformOpenAIConfig
+ from uniflow.op.prompt import Context
+
+ DEBUG = False
+
+
+ def generate_relevant_chunks(query, input_data, progress=gr.Progress()):
+     data_list = []
+     for paper in input_data:  # progress.tqdm(input_data, desc="Papers"):
+         data = [Context(context=query, paragraph=p) for p in paper["chunks"]]
+         data_list.append({"paper": paper["paper"], "data": data})
+
+     instruction = """
+     # Task: I am a researcher trying to understand information across several research papers. You are to determine which of the chunks most directly contains information related to the query.
+     ## Input:
+     1. context: A brief query or description of the information I am looking for.
+     2. paragraph: A paragraph from a research paper.
+     ## Evaluation Criteria: You should pick the sentence(s) that contain information directly relevant to the context. The best answer is the sentence(s) that most directly answer or contain the information specific to the context. If there are no such sentences, you should answer with ["None"].
+     ## Response Format: Your response should only include the two fields below:
+     1. explanation: Reasoning behind your judgment, explaining why the answer is appropriate or not.
+     2. answer: The best sentence(s) that meet the Evaluation Criteria as a list of strings. This should be ["None"] if no sentence answers the query. At most, include 3 sentences.
+     """
+
+     few_shot_examples = []
+
+     num_thread_batch_size = 16
+
+     config = TransformOpenAIConfig()
+     config.prompt_template.instruction = instruction
+     config.prompt_template.few_shot_prompt = few_shot_examples
+     config.model_config.model_name = "gpt-4-1106-preview"
+     config.model_config.response_format = {"type": "json_object"}
+     config.model_config.num_call = 1
+     config.model_config.temperature = 0.0
+     config.model_config.num_thread = num_thread_batch_size
+     config.model_config.batch_size = num_thread_batch_size
+
+     client = TransformClient(config)
+
+     output = []
+
+     for paper in data_list:
+         init_output = client.run(paper["data"])
+         combined_output = init_output[0]
+         combined_output["output"][0]["response"][0]["explanation"] = [
+             combined_output["output"][0]["response"][0]["explanation"]
+         ]
+         if DEBUG:
+             print(combined_output)
+         for item in init_output[1:]:
+             combined_output["output"][0]["response"][0]["answer"].extend(
+                 item["output"][0]["response"][0]["answer"]
+             )
+             combined_output["output"][0]["response"][0]["explanation"].append(
+                 item["output"][0]["response"][0]["explanation"]
+             )
+         output.append(combined_output)
+
+     output_answers = []
+
+     for idx, o in enumerate(output):
+         filtered_answers = [
+             item for item in o["output"][0]["response"][0]["answer"] if item != "None"
+         ]
+         if len(filtered_answers) == 0:
+             filtered_answers = ["None"]
+         output_answers.append(
+             {"paper": input_data[idx]["paper"], "answer": filtered_answers}
+         )
+
+     df = pd.DataFrame(output_answers)
+
+     return [output_answers, df]
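End to end, `generate_relevant_chunks` consumes the `extracted_data` list that `extract_endpoint` returns and yields one `{"paper", "answer"}` record per paper. A hedged usage sketch (requires a valid OpenAI key in `.env`; `paper1.pdf` is a placeholder file name, not part of this commit):

```python
# Illustrative pipeline call; needs a valid OpenAI key and a real PDF.
# "paper1.pdf" is a placeholder path.
from extract import extract_endpoint
from generate_answers import generate_relevant_chunks

extracted_data, extract_df = extract_endpoint(["paper1.pdf"])
answers, answers_df = generate_relevant_chunks(
    "What specific mechanical testing methods were used to quantify strength?",
    extracted_data,
)
print(answers)  # e.g. [{"paper": "paper1.pdf", "answer": ["...sentence...", ...]}]
```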
helpers.py ADDED
@@ -0,0 +1,20 @@
+ import unicodedata
+
+
+ def _remove_non_string_characters(string):
+     symbols_to_remove = ["Δ"]
+     return "".join(
+         char
+         for char in string
+         if unicodedata.category(char)[0] in {"L", "N", "P", "Z"}
+         and char not in symbols_to_remove
+     )
+
+
+ def compare_strings_ignore_non_string(string1, string2):
+     string1 = _remove_non_string_characters(string1)
+     string2 = _remove_non_string_characters(string2)
+     if string1 != string2 and string1[0:20] == string2[0:20]:
+         print(f"String1: {string1}")
+         print(f"String2: {string2}")
+     return string1 == string2
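In effect, two excerpts compare equal once characters outside the Letter, Number, Punctuation, and Separator Unicode categories (plus "Δ") are stripped, which tolerates symbols that PDF extraction and the model may render differently. For example:

```python
# compare_strings_ignore_non_string treats strings as equal after stripping
# symbol characters: "Δ" is removed explicitly, and "=" (Unicode category Sm)
# falls outside the kept L/N/P/Z categories.
from helpers import compare_strings_ignore_non_string

print(compare_strings_ignore_non_string("ΔT = 5 K", "T = 5 K"))  # True
print(compare_strings_ignore_non_string("T = 5 K", "T = 6 K"))   # False
```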
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ uniflow==0.0.25
+ python-dotenv==1.0.1
+ gradio==4.19.2
+ llama-index==0.10.19
+ llama-parse==0.3.9
+ rapidfuzz==3.6.2
+ dataclasses-json==0.6.4
+ plotly==5.20.0
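Setup note: these pins install with `pip install -r requirements.txt`. Since the code calls `load_dotenv()`, credentials belong in the git-ignored `.env` file; given the libraries used, that presumably means `OPENAI_API_KEY` for uniflow's OpenAI calls and `LLAMA_CLOUD_API_KEY` for LlamaParse, though the exact variable names are an assumption here, not something this commit specifies.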
visualize_upload.py ADDED
@@ -0,0 +1,51 @@
+ import pandas as pd
+ import plotly.graph_objs as go
+
+
+ def visualize(cluster_data):
+
+     # Create a DataFrame with counts of each category
+     category_counts = {}
+     for paper in cluster_data:
+         categories = paper["categories"]
+         for category in categories:
+             if category in category_counts:
+                 category_counts[category] += 1
+             else:
+                 category_counts[category] = 1
+
+     category_df = pd.DataFrame(
+         {
+             "Category": list(category_counts.keys()),
+             "Count": list(category_counts.values()),
+         }
+     )
+
+     # Sort the DataFrame by count in descending order
+     category_df = category_df.sort_values(by="Count", ascending=False)
+
+     # Create hover text containing the count and titles of all papers for each category
+     hover_text = []
+     for category in category_df["Category"]:
+         titles = []
+         for paper in cluster_data:
+             if category in paper["categories"]:
+                 titles.append(f'<a href="https://plot.ly/">{paper["paper"]}</a>')
+         hover_text.append(f'<br>Papers:<br>{"<br>".join(titles)}')
+
+     # Create Plotly Bar chart
+     fig = go.Figure(
+         data=[
+             go.Bar(
+                 x=category_df["Category"],
+                 y=category_df["Count"],
+                 hovertext=hover_text,
+                 marker=dict(color="brown"),
+             )
+         ]
+     )
+
+     # Update layout
+     fig.update_layout(title="Categories", xaxis_title="Category", yaxis_title="Count")
+
+     return fig
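`visualize` only reads the `paper` and `categories` fields of each record, so the chart can be smoke-tested with mock cluster output (made-up papers and categories):

```python
# Smoke test for visualize() with mock cluster output.
from visualize_upload import visualize

mock = [
    {"paper": "a.pdf", "categories": ["tension test", "fatigue test"]},
    {"paper": "b.pdf", "categories": ["tension test"]},
]
fig = visualize(mock)
fig.show()  # renders the category bar chart with paper titles in the hover text
```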