jackkuo committed
Commit
7ce8079
verified ·
1 Parent(s): d75a33c

Update app.py

Files changed (1)
  1. app.py +84 -94
app.py CHANGED
@@ -8,10 +8,6 @@ import re
8
  import tiktoken
9
  import pandas as pd
10
 
11
- # api_key = os.getenv('API_KEY')
12
- # base_url = os.getenv("BASE_URL")
13
-
14
- # client = OpenAI(api_key=api_key, base_url=base_url)
15
  api_key = os.getenv('API_KEY')
16
  base_url = os.getenv("BASE_URL")
17
 
@@ -33,25 +29,45 @@ def cal_tokens(message_data):
33
 
34
 
35
  def del_references(lines):
36
- # Define the regular expression patterns
37
- patterns = [
38
- (r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\\section\*\{Tables', r'\section*{Tables\n'),
39
- (r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)', ''),
40
- (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)(Table|Tables)', r'Tables'),
41
- (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)# SUPPLEMENTARY', r'# SUPPLEMENTARY'),
42
- (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\[\^0\]', r'[^0]'),
43
- (r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)', '')
44
- ]
45
-
46
- for pattern, replacement in patterns:
47
  matches = re.search(pattern, lines, re.DOTALL)
48
  if matches:
49
- lines = lines.replace(matches[0], replacement)
50
- print(f"匹配到了 {pattern}, 删除了 References, 保留了后面的 {replacement}")
51
- break
52
- else:
53
- print("没有匹配到 References")
54
-
55
  return lines
56
 
57
 
@@ -94,8 +110,8 @@ def openai_api(messages):
94
 
95
  def openai_chat_2_step(prompt, file_content):
96
  all_response = ""
97
- for i in range(len(file_content)//123000 + 1):
98
- text = file_content[i*123000:(i+1)*123000]
99
  # step 1: split into two parts; this is the first half
100
  messages = [
101
  {
@@ -139,9 +155,11 @@ Please pay attention to the pipe format as shown in the example below. This form
139
  return response
140
 
141
 
142
- def predict(prompt, file_content):
143
- file_content = del_references(file_content)
 
144
 
 
145
  messages = [
146
  {
147
  "role": "system",
@@ -158,6 +176,7 @@ def predict(prompt, file_content):
158
  print("prompt tokens:", tokens)
159
  # time.sleep(20) # needed when using Claude
160
  if tokens > 128000:
 
161
  extract_result = openai_chat_2_step(prompt, file_content)
162
  else:
163
  extract_result = openai_api(messages)
@@ -223,29 +242,32 @@ def update_input():
223
  return en_1
224
 
225
 
226
- CSV_FILE_PATH_Golden_Benchmark_Enzyme = "static/Golden Benchmark for Enzyme Kinetics.csv"
227
- CSV_FILE_PATH_Golden_Benchmark_Ribozyme = "static/Golden Benchmark for Ribozyme Kinetics.csv"
228
- CSV_FILE_PATH_LLENKA_Dataset = "static/3450_merged_data_2000_lines.csv"
229
 
230
 
231
- def load_csv(CSV_FILE_PATH):
232
  try:
233
- df = pd.read_csv(CSV_FILE_PATH)
 
 
234
  return df
235
  except Exception as e:
236
- return f"Error loading CSV file: {e}"
237
 
238
 
239
- def get_column_names(CSV_FILE_PATH):
240
- df = load_csv(CSV_FILE_PATH)
241
  if isinstance(df, str):
242
  return [] # return an empty list if loading failed
243
  return df.columns.tolist() # return the list of column names
244
 
245
 
246
- def search_data(df, keyword, selected_column):
 
247
  if isinstance(df, str): # check whether the CSV loaded successfully
248
  return df
 
249
  # filter rows that contain the keyword
250
  if selected_column not in df.columns:
251
  return "Invalid column selected."
@@ -254,21 +276,25 @@ def search_data(df, keyword, selected_column):
254
 
255
  if filtered_df.empty:
256
  return "No results found."
 
257
  return filtered_df.to_html(classes='data', index=False, header=True)
258
 
259
 
260
- def search_data_golden_Enzyme(keyword, selected_column):
261
- df = load_csv(CSV_FILE_PATH_Golden_Benchmark_Enzyme)
262
- return search_data(df, keyword, selected_column)
 
263
 
264
- def search_data_golden_Ribozyme(keyword, selected_column):
265
- df = load_csv(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
266
- return search_data(df, keyword, selected_column)
 
 
267
 
 
 
268
 
269
- def search_data_LLENKA(keyword, selected_column):
270
- df = load_csv(CSV_FILE_PATH_LLENKA_Dataset)
271
- return search_data(df, keyword, selected_column)
272
 
273
 
274
  with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
@@ -292,7 +318,6 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
292
 
293
  with gr.Row():
294
  with gr.Column(scale=1):
295
-
296
  file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
297
 
298
  with gr.Column(scale=1):
@@ -308,7 +333,8 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
308
  )
309
 
310
  with gr.Column():
311
- model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here', label='Input Prompt')
 
312
  exp = gr.Button("Example Prompt")
313
  with gr.Row():
314
  gen = gr.Button("Generate", variant="primary")
@@ -318,9 +344,9 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
318
  | Enzyme1 | Bacillus subtilis | Substrate_A | 7.3 | mM | 6.4 | s^-1 | 1.4 × 10^4 | M^-1s^-1 | 37°C | 5.0 | WT | NADP^+ |
319
  | Enzyme2 | Escherichia coli | Substrate_B | 5.9 | mM | 9.8 | s^-1 | 29000 | mM^-1min^-1 | 60°C | 10.0 | Q176E | NADPH |
320
  | Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ |
321
-
322
  """)
323
- with gr.Tab("Golden Benchmark for Enzyme Kinetics"):
324
  gr.Markdown(
325
  '''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1>
326
  </p>'''
@@ -331,41 +357,7 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
331
 
332
  with gr.Row():
333
  # choose the column to search
334
- column_names = get_column_names(CSV_FILE_PATH_Golden_Benchmark_Enzyme)
335
- column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
336
-
337
- # add the search box
338
- search_box = gr.Textbox(label="Search", placeholder="Enter keyword to search...")
339
- # run the search when the button is clicked
340
- search_button = gr.Button("Search", variant="primary")
341
-
342
- search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
343
-
344
- # wire up the search function
345
- search_button.click(fn=search_data_golden_Enzyme, inputs=[search_box, column_dropdown], outputs=search_output)
346
-
347
- # bind the Enter key to the search button
348
- search_box.submit(fn=search_data_golden_Enzyme, inputs=[search_box, column_dropdown], outputs=search_output)
349
-
350
- # initially load the entire CSV table
351
- initial_output = load_csv(CSV_FILE_PATH_Golden_Benchmark_Enzyme)
352
- if isinstance(initial_output, str):
353
- search_output.value = initial_output # assign the error message directly
354
- else:
355
- search_output.value = initial_output.to_html(classes='data', index=False, header=True)
356
-
357
- with gr.Tab("Golden Benchmark for Ribozyme Kinetics"):
358
- gr.Markdown(
359
- '''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1>
360
- </p>'''
361
- )
362
- gr.Markdown("""
363
- the dataset can be downloaded from [LLM-Ribozyme-Kinetics-Golden-Benchmark](https://huggingface.co/datasets/jackkuo/LLM-Ribozyme-Kinetics-Golden-Benchmark)
364
- """)
365
-
366
- with gr.Row():
367
- # choose the column to search
368
- column_names = get_column_names(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
369
  column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
370
 
371
  # add the search box
@@ -376,13 +368,13 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
376
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
377
 
378
  # wire up the search function
379
- search_button.click(fn=search_data_golden_Ribozyme, inputs=[search_box, column_dropdown], outputs=search_output)
380
 
381
  # 将回车事件绑定到搜索按钮
382
- search_box.submit(fn=search_data_golden_Ribozyme, inputs=[search_box, column_dropdown], outputs=search_output)
383
 
384
- # initially load the entire CSV table
385
- initial_output = load_csv(CSV_FILE_PATH_Golden_Benchmark_Ribozyme)
386
  if isinstance(initial_output, str):
387
  search_output.value = initial_output # 直接将错误消息赋值
388
  else:
@@ -399,7 +391,7 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
399
  """)
400
  with gr.Row():
401
  # choose the column to search
402
- column_names = get_column_names(CSV_FILE_PATH_LLENKA_Dataset)
403
  column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
404
 
405
  # add the search box
@@ -410,25 +402,23 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
410
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
411
 
412
  # wire up the search function
413
- search_button.click(fn=search_data_LLENKA, inputs=[search_box, column_dropdown], outputs=search_output)
414
 
415
  # 将回车事件绑定到搜索按钮
416
- search_box.submit(fn=search_data_LLENKA, inputs=[search_box, column_dropdown], outputs=search_output)
417
 
418
- # initially load the entire CSV table
419
- initial_output = load_csv(CSV_FILE_PATH_LLENKA_Dataset)
420
  if isinstance(initial_output, str):
421
  search_output.value = initial_output # assign the error message directly
422
  else:
423
  search_output.value = initial_output.to_html(classes='data', index=False, header=True)
424
 
425
-
426
  extract_button.click(extract_pdf_pypdf, inputs=file_input, outputs=text_output)
427
  exp.click(update_input, outputs=model_input)
428
- gen.click(fn=predict, inputs=[model_input, text_output], outputs=outputs)
429
  clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
430
  viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)
431
 
432
-
433
  demo.launch()
434
 
 
8
  import tiktoken
9
  import pandas as pd
10
 
 
 
 
 
11
  api_key = os.getenv('API_KEY')
12
  base_url = os.getenv("BASE_URL")
13
 
 
29
 
30
 
31
  def del_references(lines):
32
+ # 1. mathpix Markdown format: match \section*{REFERENCES}xxxx\section*{Table
33
+ pattern = r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)\\section\*\{Tables'
34
+ matches = re.search(pattern, lines, re.DOTALL)
35
+ if matches:
36
+ lines = lines.replace(matches[0], "\section*{Tables\n")
37
+ print("1.1.匹配到了References和Tables,删除了References,保留了后面的Tables")
38
+ else:
39
+ pattern = r'\*\{.{0,5}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)'
 
 
 
40
  matches = re.search(pattern, lines, re.DOTALL)
41
  if matches:
42
+ print("1.2.匹配到了References,删除了References")
43
+ lines = lines.replace(matches[0], "")
44
+ else:
45
+ # 2. Markdown format: match ## REFERENCES
46
+ pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)(Table|Tables)'
47
+ matches = re.search(pattern, lines, re.DOTALL)
48
+ if matches:
49
+ lines = lines.replace(matches[0], "Tables")
50
+ print("2.1.匹配到了## References和Tables,删除了References,保留了后面的Tables")
51
+ else:
52
+ pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*?)# SUPPLEMENTARY'
53
+ matches = re.search(pattern, lines, re.DOTALL)
54
+ if matches:
55
+ lines = lines.replace(matches[0], "# SUPPLEMENTARY")
56
+ print("2.2.匹配到了## References和# SUPPLEMENTARY,删除了References,保留了后面的# SUPPLEMENTARY")
57
+ else:
58
+ pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)\[\^0\]'
59
+ matches = re.search(pattern, lines, re.DOTALL)
60
+ if matches:
61
+ print("2.3.匹配到了## References和\[\^0\],删除了References和\[\^0\]之间的内容")
62
+ lines = lines.replace(matches[0], "[^0]")
63
+ else:
64
+ pattern = r'#.{0,15}(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)(.*)'
65
+ matches = re.search(pattern, lines, re.DOTALL)
66
+ if matches:
67
+ print("2.4.匹配到了## References,删除了References")
68
+ lines = lines.replace(matches[0], "")
69
+ else:
70
+ print("没有匹配到References")
71
  return lines
72
 
73
 
 
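A minimal usage sketch of the new `del_references` cascade; the sample text is hypothetical and only illustrates which branch fires on a plain Markdown input:

```python
# Toy input (hypothetical, not from the repository): a Markdown body followed by a reference list.
sample = "# Results\nkcat = 6.4 s^-1\n\n# References\n[1] Some paper. [2] Another paper.\n"

cleaned = del_references(sample)
# Branches 1.x require a LaTeX-style "\section*{" marker, and 2.1-2.3 require a trailing
# Table/SUPPLEMENTARY/[^0] anchor, so branch 2.4 fires and drops everything from "# References" on.
print(cleaned)  # -> "# Results\nkcat = 6.4 s^-1\n\n"
```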
110
 
111
  def openai_chat_2_step(prompt, file_content):
112
  all_response = ""
113
+ for i in range(len(file_content) // 123000 + 1):
114
+ text = file_content[i * 123000:(i + 1) * 123000]
115
  # step 1: split into two parts; this is the first half
116
  messages = [
117
  {
 
155
  return response
156
 
157
 
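The 2-step path slices the extracted text into fixed 123,000-character windows and sends one request per window; a small sketch of that arithmetic (values hypothetical):

```python
# Character-based chunking as in openai_chat_2_step (it is not token-based).
file_content = "x" * 300_000  # hypothetical extracted text
chunk_size = 123_000

chunks = [file_content[i * chunk_size:(i + 1) * chunk_size]
          for i in range(len(file_content) // chunk_size + 1)]
print([len(c) for c in chunks])  # -> [123000, 123000, 54000]; each slice becomes its own API call
```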
158
+ def predict(prompt, pdf_file):
159
+ if pdf_file is None:
160
+ return "Please upload a PDF file to proceed."
161
 
162
+ file_content = extract_pdf_pypdf(pdf_file.name)
163
  messages = [
164
  {
165
  "role": "system",
 
176
  print("prompt tokens:", tokens)
177
  # time.sleep(20) # needed when using Claude
178
  if tokens > 128000:
179
+ file_content = del_references(file_content)
180
  extract_result = openai_chat_2_step(prompt, file_content)
181
  else:
182
  extract_result = openai_api(messages)
 
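`predict` now extracts the PDF text itself and only falls back to reference stripping plus the chunked 2-step call when the prompt exceeds 128,000 tokens. `cal_tokens` is outside this hunk; a hedged sketch of an equivalent count using tiktoken's `cl100k_base` encoding:

```python
import tiktoken

# Assumption: cal_tokens counts tokens over the serialized chat messages roughly like this.
def count_tokens(messages):
    encoding = tiktoken.get_encoding("cl100k_base")
    return sum(len(encoding.encode(m["content"])) for m in messages)

# messages = [{"role": "system", "content": prompt}, {"role": "user", "content": file_content}]
# If the count exceeds 128000, references are stripped and openai_chat_2_step handles the request.
```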
242
  return en_1
243
 
244
 
245
+ EXCEL_FILE_PATH_Golden_Benchmark = "static/golden benchmark.csv"
246
+ EXCEL_FILE_PATH_Expert_Annotated_Dataset = "static/3450_merged_data_2000_lines.csv"
 
247
 
248
 
249
+ def load_excel(EXCEL_FILE_PATH):
250
  try:
251
+ # read the Excel file
252
+ # df = pd.read_excel(EXCEL_FILE_PATH)
253
+ df = pd.read_csv(EXCEL_FILE_PATH)
254
  return df
255
  except Exception as e:
256
+ return f"Error loading Excel file: {e}"
257
 
258
 
259
+ def get_column_names(EXCEL_FILE_PATH):
260
+ df = load_excel(EXCEL_FILE_PATH)
261
  if isinstance(df, str):
262
  return [] # return an empty list if loading failed
263
  return df.columns.tolist() # return the list of column names
264
 
265
 
266
+ def search_data_golden(keyword, selected_column):
267
+ df = load_excel(EXCEL_FILE_PATH_Golden_Benchmark)
268
  if isinstance(df, str): # check whether the file loaded successfully
269
  return df
270
+
271
  # filter rows that contain the keyword
272
  if selected_column not in df.columns:
273
  return "Invalid column selected."
 
276
 
277
  if filtered_df.empty:
278
  return "No results found."
279
+
280
  return filtered_df.to_html(classes='data', index=False, header=True)
281
 
282
 
283
+ def search_data_entire(keyword, selected_column):
284
+ df = load_excel(EXCEL_FILE_PATH_Expert_Annotated_Dataset)
285
+ if isinstance(df, str): # check whether the file loaded successfully
286
+ return df
287
 
288
+ # filter rows that contain the keyword
289
+ if selected_column not in df.columns:
290
+ return "Invalid column selected."
291
+
292
+ filtered_df = df[df[selected_column].astype(str).str.contains(keyword, case=False)]
293
 
294
+ if filtered_df.empty:
295
+ return "No results found."
296
 
297
+ return filtered_df.to_html(classes='data', index=False, header=True)
 
 
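Both search helpers apply the same filter: a case-insensitive substring match on the selected column via pandas, rendered as HTML. A self-contained sketch with toy data (not the benchmark files):

```python
import pandas as pd

# Toy frame standing in for the benchmark CSVs.
df = pd.DataFrame({"Enzyme": ["Enzyme1", "Enzyme2"],
                   "Organism": ["Bacillus subtilis", "Escherichia coli"]})

keyword, selected_column = "coli", "Organism"
filtered_df = df[df[selected_column].astype(str).str.contains(keyword, case=False)]
print(filtered_df.to_html(classes='data', index=False, header=True))  # only the E. coli row remains
```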
298
 
299
 
300
  with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
 
318
 
319
  with gr.Row():
320
  with gr.Column(scale=1):
 
321
  file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
322
 
323
  with gr.Column(scale=1):
 
333
  )
334
 
335
  with gr.Column():
336
+ model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here',
337
+ label='Input Prompt')
338
  exp = gr.Button("Example Prompt")
339
  with gr.Row():
340
  gen = gr.Button("Generate", variant="primary")
 
344
  | Enzyme1 | Bacillus subtilis | Substrate_A | 7.3 | mM | 6.4 | s^-1 | 1.4 × 10^4 | M^-1s^-1 | 37°C | 5.0 | WT | NADP^+ |
345
  | Enzyme2 | Escherichia coli | Substrate_B | 5.9 | mM | 9.8 | s^-1 | 29000 | mM^-1min^-1 | 60°C | 10.0 | Q176E | NADPH |
346
  | Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ |
347
+
348
  """)
349
+ with gr.Tab("Golden Benchmark"):
350
  gr.Markdown(
351
  '''<h1 align="center"> Golden Benchmark Viewer with Advanced Search </h1>
352
  </p>'''
 
357
 
358
  with gr.Row():
359
  # 选择搜索字段
360
+ column_names = get_column_names(EXCEL_FILE_PATH_Golden_Benchmark)
361
  column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
362
 
363
  # add the search box
 
368
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
369
 
370
  # wire up the search function
371
+ search_button.click(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output)
372
 
373
  # bind the Enter key to the search button
374
+ search_box.submit(fn=search_data_golden, inputs=[search_box, column_dropdown], outputs=search_output)
375
 
376
+ # initially load the entire Excel table
377
+ initial_output = load_excel(EXCEL_FILE_PATH_Golden_Benchmark)
378
  if isinstance(initial_output, str):
379
  search_output.value = initial_output # assign the error message directly
380
  else:
 
391
  """)
392
  with gr.Row():
393
  # 选择搜索字段
394
+ column_names = get_column_names(EXCEL_FILE_PATH_Expert_Annotated_Dataset)
395
  column_dropdown = gr.Dropdown(label="Select Column to Search", choices=column_names)
396
 
397
  # add the search box
 
402
  search_output = gr.HTML(label="Search Results", min_height=1000, max_height=1000)
403
 
404
  # wire up the search function
405
+ search_button.click(fn=search_data_entire, inputs=[search_box, column_dropdown], outputs=search_output)
406
 
407
  # bind the Enter key to the search button
408
+ search_box.submit(fn=search_data_entire, inputs=[search_box, column_dropdown], outputs=search_output)
409
 
410
+ # initially load the entire Excel table
411
+ initial_output = load_excel(EXCEL_FILE_PATH_Expert_Annotated_Dataset)
412
  if isinstance(initial_output, str):
413
  search_output.value = initial_output # 直接将错误消息赋值
414
  else:
415
  search_output.value = initial_output.to_html(classes='data', index=False, header=True)
416
 
 
417
  extract_button.click(extract_pdf_pypdf, inputs=file_input, outputs=text_output)
418
  exp.click(update_input, outputs=model_input)
419
+ gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
420
  clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
421
  viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)
422
 
 
423
  demo.launch()
424