jackkuo committed
Commit
7ce8079
verified ·
1 Parent(s): d75a33c

Update app.py

Files changed (1)
  1. app.py +84 -94
app.py CHANGED
@@ -8,10 +8,6 @@ import re
8
  import tiktoken
9
  import pandas as pd
10
 
11
- # api_key = os.getenv('API_KEY')
12
- # base_url = os.getenv("BASE_URL")
13
-
14
- # client = OpenAI(api_key=api_key, base_url=base_url)
15
  api_key = os.getenv('API_KEY')
16
  base_url = os.getenv("BASE_URL")
17
 
@@ -33,25 +29,45 @@ def cal_tokens(message_data):
33
 
34
 
35
  def del_references(lines):
36
- # Define the regular expression patterns
37
- patterns = [
38
- (r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\\section\*\{Tables', r'\section*{Tables\n'),
39
- (r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)', ''),
40
- (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)(Table|Tables)', r'Tables'),
41
- (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)# SUPPLEMENTARY', r'# SUPPLEMENTARY'),
42
- (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\[\^0\]', r'[^0]'),
43
- (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)', '')
44
- ]
45
-
46
- for pattern, replacement in patterns:
47
  matches = re.search(pattern, lines, re.DOTALL)
48
  if matches:
49
- lines = lines.replace(matches[0], replacement)
50
- print(f"匹配到了 {pattern}, 删除了 References, 保留了后面的 {replacement}")
51
- break
52
- else:
53
- print("没有匹配到 References")
54
-
55
  return lines
56
 
57
 
@@ -94,8 +110,8 @@ def openai_api(messages):
94
 
95
  def openai_chat_2_step(prompt, file_content):
96
  all_response = ""
97
- for i in range(len(file_content)//123000 + 1):
98
- text = file_content[i*123000:(i+1)*123000]
99
  # step 1: split into two parts; this is the first half
100
  messages = [
101
  {
@@ -139,9 +155,11 @@ Please pay attention to the pipe format as shown in the example below. This form
139
  return response
140
 
141
 
142
- def predict(prompt, file_content):
143
- file_content = del_references(file_content)
 
144
 
 
145
  messages = [
146
  {
147
  "role": "system",
@@ -158,6 +176,7 @@ def predict(prompt, file_content):
158
  print("prompt tokens:", tokens)
159
  # time.sleep(20) # needed when using Claude
160
  if tokens > 128000:
 
161
  extract_result = openai_chat_2_step(prompt, file_content)
162
  else:
163
  extract_result = openai_api(messages)
@@ -223,29 +242,32 @@ def update_input():
223
  return en_1
224
 
225
 
226
- CSV_FILE_PATH_Golden_Benchmark_Enzyme = "static/Golden Benchmark for Enzyme Kinetics.csv"
227
- CSV_FILE_PATH_Golden_Benchmark_Ribozyme = "static/Golden Benchmark for Ribozyme Kinetics.csv"
228
- CSV_FILE_PATH_LLENKA_Dataset = "static/3450_merged_data_2000_lines.csv"
229
 
230
 
231
- def load_csv(CSV_FILE_PATH):
232
  try:
233
- df = pd.read_csv(CSV_FILE_PATH)
 
 
234
  return df
235
  except Exception as e:
236
- return f"Error loading CSV file: {e}"
237
 
238
 
239
- def get_column_names(CSV_FILE_PATH):
240
- df = load_csv(CSV_FILE_PATH)
241
  if isinstance(df, str):
242
  return [] # return an empty list if loading failed
243
  return df.columns.tolist() # return the list of column names
244
 
245
 
246
- def search_data(df, keyword, selected_column):
 
247
  if isinstance(df, str): # check whether the CSV loaded successfully
248
  return df
 
249
  # filter rows that contain the keyword
250
  if selected_column not in df.columns:
251
  return "Invalid column selected."
@@ -254,21 +276,25 @@ def search_data(df, keyword, selected_column):
254
 
255
  if filtered_df.empty:
256
  return "No results found."
 
257
  return filtered_df.to_html(classes='data', index=False, header=True)
258
 
259
 
260
- def search_data_golden_Enzyme(keyword, selected_column):
261
- df = load_csv(CSV_FILE_PATH_Golden_Benchmark_Enzyme)
262
- return search_data(df, keyword, selected_column)
 
263
 
264
- def search_data_golden_Ribozyme(keyword, selected_column):
265
- df = load_csv(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
266
- return search_data(df, keyword, selected_column)
 
 
267
 
 
 
268
 
269
- def search_data_LLENKA(keyword, selected_column):
270
- df = load_csv(CSV_FILE_PATH_LLENKA_Dataset)
271
- return search_data(df, keyword, selected_column)
272
 
273
 
274
  with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
@@ -292,7 +318,6 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
292
 
293
  with gr.Row():
294
  with gr.Column(scale=1):
295
-
296
  file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
297
 
298
  with gr.Column(scale=1):
@@ -308,7 +333,8 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
308
  )
309
 
310
  with gr.Column():
311
- model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here', label='Input Prompt')
 
312
  exp = gr.Button("Example Prompt")
313
  with gr.Row():
314
  gen = gr.Button("Generate", variant="primary")
@@ -318,9 +344,9 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
318
  | Enzyme1 | Bacillus subtilis | Substrate_A | 7.3 | mM | 6.4 | s^-1 | 1.4 × 10^4 | M^-1s^-1 | 37°C | 5.0 | WT | NADP^+ |
319
  | Enzyme2 | Escherichia coli | Substrate_B | 5.9 | mM | 9.8 | s^-1 | 29000 | mM^-1min^-1 | 60°C | 10.0 | Q176E | NADPH |
320
  | Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ |
321
-
322
  """)
323
- with gr.Tab("Golden Benchmark for Enzyme Kinetics"):
324
  gr.Markdown(
325
  '''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1>
326
  </p>'''
@@ -331,41 +357,7 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
331
 
332
  with gr.Row():
333
  # choose the column to search
334
- column_names = get_column_names(CSV_FILE_PATH_Golden_Benchmark_Enzyme)
335
- column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
336
-
337
- # add the search box
338
- search_box = gr.Textbox(label="Search", placeholder="Enter keyword to search...")
339
- # run the search when the button is clicked
340
- search_button = gr.Button("Search", variant="primary")
341
-
342
- search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
343
-
344
- # wire up the search function
345
- search_button.click(fn=search_data_golden_Enzyme, inputs=[search_box, column_dropdown], outputs=search_output)
346
-
347
- # bind the Enter key to the search button
348
- search_box.submit(fn=search_data_golden_Enzyme, inputs=[search_box, column_dropdown], outputs=search_output)
349
-
350
- # initially load the entire CSV table
351
- initial_output = load_csv(CSV_FILE_PATH_Golden_Benchmark_Enzyme)
352
- if isinstance(initial_output, str):
353
- search_output.value = initial_output # assign the error message directly
354
- else:
355
- search_output.value = initial_output.to_html(classes='data', index=False, header=True)
356
-
357
- with gr.Tab("Golden Benchmark for Ribozyme Kinetics"):
358
- gr.Markdown(
359
- '''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1>
360
- </p>'''
361
- )
362
- gr.Markdown("""
363
- the dataset can be downloaded from [LLM-Ribozyme-Kinetics-Golden-Benchmark](https://huggingface.co/datasets/jackkuo/LLM-Ribozyme-Kinetics-Golden-Benchmark)
364
- """)
365
-
366
- with gr.Row():
367
- # choose the column to search
368
- column_names = get_column_names(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
369
  column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
370
 
371
  # add the search box
@@ -376,13 +368,13 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
376
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
377
 
378
  # wire up the search function
379
- search_button.click(fn=search_data_golden_Ribozyme, inputs=[search_box, column_dropdown], outputs=search_output)
380
 
381
  # 将回车事件绑定到搜索按钮
382
- search_box.submit(fn=search_data_golden_Ribozyme, inputs=[search_box, column_dropdown], outputs=search_output)
383
 
384
- # initially load the entire CSV table
385
- initial_output = load_csv(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
386
  if isinstance(initial_output, str):
387
  search_output.value = initial_output # 直接将错误消息赋值
388
  else:
@@ -399,7 +391,7 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
399
  """)
400
  with gr.Row():
401
  # choose the column to search
402
- column_names = get_column_names(CSV_FILE_PATH_LLENKA_Dataset)
403
  column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
404
 
405
  # add the search box
@@ -410,25 +402,23 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
410
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
411
 
412
  # wire up the search function
413
- search_button.click(fn=search_data_LLENKA, inputs=[search_box, column_dropdown], outputs=search_output)
414
 
415
  # 将回车事件绑定到搜索按钮
416
- search_box.submit(fn=search_data_LLENKA, inputs=[search_box, column_dropdown], outputs=search_output)
417
 
418
- # initially load the entire CSV table
419
- initial_output = load_csv(CSV_FILE_PATH_LLENKA_Dataset)
420
  if isinstance(initial_output, str):
421
  search_output.value = initial_output # assign the error message directly
422
  else:
423
  search_output.value = initial_output.to_html(classes='data', index=False, header=True)
424
 
425
-
426
  extract_button.click(extract_pdf_pypdf, inputs=file_input, outputs=text_output)
427
  exp.click(update_input, outputs=model_input)
428
- gen.click(fn=predict, inputs=[model_input, text_output], outputs=outputs)
429
  clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
430
  viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)
431
 
432
-
433
  demo.launch()
434
 
 
8
  import tiktoken
9
  import pandas as pd
10
 
 
 
 
 
11
  api_key = os.getenv('API_KEY')
12
  base_url = os.getenv("BASE_URL")
13
 
 
29
 
30
 
31
  def del_references(lines):
32
+ # 1. mathpix Markdown format: match \section*{REFERENCES}xxxx\section*{Table
33
+ pattern = r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\\section\*\{Tables'
34
+ matches = re.search(pattern, lines, re.DOTALL)
35
+ if matches:
36
+ lines = lines.replace(matches[0], "\section*{Tables\n")
37
+ print("1.1.匹配到了References和Tables,删除了References,保留了后面的Tables")
38
+ else:
39
+ pattern = r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)'
 
 
 
40
  matches = re.search(pattern, lines, re.DOTALL)
41
  if matches:
42
+ print("1.2.匹配到了References,删除了References")
43
+ lines = lines.replace(matches[0], "")
44
+ else:
45
+ # 2. Markdown format: match ## REFERENCES
46
+ pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)(Table|Tables)'
47
+ matches = re.search(pattern, lines, re.DOTALL)
48
+ if matches:
49
+ lines = lines.replace(matches[0], "Tables")
50
+ print("2.1.匹配到了## References和Tables,删除了References,保留了后面的Tables")
51
+ else:
52
+ pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)# SUPPLEMENTARY'
53
+ matches = re.search(pattern, lines, re.DOTALL)
54
+ if matches:
55
+ lines = lines.replace(matches[0], "# SUPPLEMENTARY")
56
+ print("2.2.匹配到了## References和# SUPPLEMENTARY,删除了References,保留了后面的# SUPPLEMENTARY")
57
+ else:
58
+ pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)\[\^0\]'
59
+ matches = re.search(pattern, lines, re.DOTALL)
60
+ if matches:
61
+ print("2.3.匹配到了## References和\[\^0\],删除了References和\[\^0\]之间的内容")
62
+ lines = lines.replace(matches[0], "[^0]")
63
+ else:
64
+ pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)'
65
+ matches = re.search(pattern, lines, re.DOTALL)
66
+ if matches:
67
+ print("2.4.匹配到了## References,删除了References")
68
+ lines = lines.replace(matches[0], "")
69
+ else:
70
+ print("没有匹配到References")
71
  return lines
72
 
73
 
 
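A minimal usage sketch of the new `del_references` cascade; the sample text is hypothetical and only illustrates which branch fires on a plain Markdown input:

```python
# Toy input (hypothetical, not from the repository): a Markdown body followed by a reference list.
sample = "# Results\nkcat = 6.4 s^-1\n\n# References\n[1] Some paper. [2] Another paper.\n"

cleaned = del_references(sample)
# Branches 1.x require a LaTeX-style "\section*{" marker, and 2.1-2.3 require a trailing
# Table/SUPPLEMENTARY/[^0] anchor, so branch 2.4 fires and drops everything from "# References" on.
print(cleaned)  # -> "# Results\nkcat = 6.4 s^-1\n\n"
```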
110
 
111
  def openai_chat_2_step(prompt, file_content):
112
  all_response = ""
113
+ for i in range(len(file_content) // 123000 + 1):
114
+ text = file_content[i * 123000:(i + 1) * 123000]
115
  # step 1: split into two parts; this is the first half
116
  messages = [
117
  {
 
155
  return response
156
 
157
 
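The 2-step path slices the extracted text into fixed 123,000-character windows and sends one request per window; a small sketch of that arithmetic (values hypothetical):

```python
# Character-based chunking as in openai_chat_2_step (it is not token-based).
file_content = "x" * 300_000  # hypothetical extracted text
chunk_size = 123_000

chunks = [file_content[i * chunk_size:(i + 1) * chunk_size]
          for i in range(len(file_content) // chunk_size + 1)]
print([len(c) for c in chunks])  # -> [123000, 123000, 54000]; each slice becomes its own API call
```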
158
+ def predict(prompt, pdf_file):
159
+ if pdf_file is None:
160
+ return "Please upload a PDF file to proceed."
161
 
162
+ file_content = extract_pdf_pypdf(pdf_file.name)
163
  messages = [
164
  {
165
  "role": "system",
 
176
  print("prompt tokens:", tokens)
177
  # time.sleep(20) # needed when using Claude
178
  if tokens > 128000:
179
+ file_content = del_references(file_content)
180
  extract_result = openai_chat_2_step(prompt, file_content)
181
  else:
182
  extract_result = openai_api(messages)
 
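`predict` now extracts the PDF text itself and only falls back to reference stripping plus the chunked 2-step call when the prompt exceeds 128,000 tokens. `cal_tokens` is outside this hunk; a hedged sketch of an equivalent count using tiktoken's `cl100k_base` encoding:

```python
import tiktoken

# Assumption: cal_tokens counts tokens over the serialized chat messages roughly like this.
def count_tokens(messages):
    encoding = tiktoken.get_encoding("cl100k_base")
    return sum(len(encoding.encode(m["content"])) for m in messages)

# messages = [{"role": "system", "content": prompt}, {"role": "user", "content": file_content}]
# If the count exceeds 128000, references are stripped and openai_chat_2_step handles the request.
```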
242
  return en_1
243
 
244
 
245
+ EXCEL_FILE_PATH_Golden_Benchmark = "static/golden benchmark.csv"
246
+ EXCEL_FILE_PATH_Expert_Annotated_Dataset = "static/3450_merged_data_2000_lines.csv"
 
247
 
248
 
249
+ def load_excel(EXCEL_FILE_PATH):
250
  try:
251
+ # read the Excel file
252
+ # df = pd.read_excel(EXCEL_FILE_PATH)
253
+ df = pd.read_csv(EXCEL_FILE_PATH)
254
  return df
255
  except Exception as e:
256
+ return f"Error loading Excel file: {e}"
257
 
258
 
259
+ def get_column_names(EXCEL_FILE_PATH):
260
+ df = load_excel(EXCEL_FILE_PATH)
261
  if isinstance(df, str):
262
  return [] # return an empty list if loading failed
263
  return df.columns.tolist() # return the list of column names
264
 
265
 
266
+ def search_data_golden(keyword, selected_column):
267
+ df = load_excel(EXCEL_FILE_PATH_Golden_Benchmark)
268
  if isinstance(df, str): # check whether the file loaded successfully
269
  return df
270
+
271
  # filter rows that contain the keyword
272
  if selected_column not in df.columns:
273
  return "Invalid column selected."
 
276
 
277
  if filtered_df.empty:
278
  return "No results found."
279
+
280
  return filtered_df.to_html(classes='data', index=False, header=True)
281
 
282
 
283
+ def search_data_entire(keyword, selected_column):
284
+ df = load_excel(EXCEL_FILE_PATH_Expert_Annotated_Dataset)
285
+ if isinstance(df, str): # check whether the file loaded successfully
286
+ return df
287
 
288
+ # filter rows that contain the keyword
289
+ if selected_column not in df.columns:
290
+ return "Invalid column selected."
291
+
292
+ filtered_df = df[df[selected_column].astype(str).str.contains(keyword, case=False)]
293
 
294
+ if filtered_df.empty:
295
+ return "No results found."
296
 
297
+ return filtered_df.to_html(classes='data', index=False, header=True)
 
 
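Both search helpers apply the same filter: a case-insensitive substring match on the selected column via pandas, rendered as HTML. A self-contained sketch with toy data (not the benchmark files):

```python
import pandas as pd

# Toy frame standing in for the benchmark CSVs.
df = pd.DataFrame({"Enzyme": ["Enzyme1", "Enzyme2"],
                   "Organism": ["Bacillus subtilis", "Escherichia coli"]})

keyword, selected_column = "coli", "Organism"
filtered_df = df[df[selected_column].astype(str).str.contains(keyword, case=False)]
print(filtered_df.to_html(classes='data', index=False, header=True))  # only the E. coli row remains
```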
298
 
299
 
300
  with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
 
318
 
319
  with gr.Row():
320
  with gr.Column(scale=1):
 
321
  file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
322
 
323
  with gr.Column(scale=1):
 
333
  )
334
 
335
  with gr.Column():
336
+ model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here',
337
+ label='Input Prompt')
338
  exp = gr.Button("Example Prompt")
339
  with gr.Row():
340
  gen = gr.Button("Generate", variant="primary")
 
344
  | Enzyme1 | Bacillus subtilis | Substrate_A | 7.3 | mM | 6.4 | s^-1 | 1.4 × 10^4 | M^-1s^-1 | 37°C | 5.0 | WT | NADP^+ |
345
  | Enzyme2 | Escherichia coli | Substrate_B | 5.9 | mM | 9.8 | s^-1 | 29000 | mM^-1min^-1 | 60°C | 10.0 | Q176E | NADPH |
346
  | Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ |
347
+
348
  """)
349
+ with gr.Tab("Golden Benchmark"):
350
  gr.Markdown(
351
  '''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1>
352
  </p>'''
 
357
 
358
  with gr.Row():
359
  # 选择搜索字段
360
+ column_names = get_column_names(EXCEL_FILE_PATH_Golden_Benchmark)
361
  column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
362
 
363
  # add the search box
 
368
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
369
 
370
  # wire up the search function
371
+ search_button.click(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output)
372
 
373
  # bind the Enter key to the search button
374
+ search_box.submit(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output)
375
 
376
+ # initially load the entire Excel table
377
+ initial_output = load_excel(EXCEL_FILE_PATH_Golden_Benchmark)
378
  if isinstance(initial_output, str):
379
  search_output.value = initial_output # assign the error message directly
380
  else:
 
391
  """)
392
  with gr.Row():
393
  # 选择搜索字段
394
+ column_names = get_column_names(EXCEL_FILE_PATH_Expert_Annotated_Dataset)
395
  column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
396
 
397
  # add the search box
 
402
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
403
 
404
  # wire up the search function
405
+ search_button.click(fn=search_data_entire, inputs=[search_box, column_dropdown], outputs=search_output)
406
 
407
  # bind the Enter key to the search button
408
+ search_box.submit(fn=search_data_entire, inputs=[search_box, column_dropdown], outputs=search_output)
409
 
410
+ # initially load the entire Excel table
411
+ initial_output = load_excel(EXCEL_FILE_PATH_Expert_Annotated_Dataset)
412
  if isinstance(initial_output, str):
413
  search_output.value = initial_output # 直接将错误消息赋值
414
  else:
415
  search_output.value = initial_output.to_html(classes='data', index=False, header=True)
416
 
 
417
  extract_button.click(extract_pdf_pypdf, inputs=file_input, outputs=text_output)
418
  exp.click(update_input, outputs=model_input)
419
+ gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
420
  clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
421
  viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)
422
 
 
423
  demo.launch()
424