Spaces:
Sleeping
Sleeping
Commit
·
772f8cb
1
Parent(s):
97f7347
updates on the submit button
Browse files- .gitattributes +35 -35
- README.md +13 -13
- app.py +156 -156
- backend.py +301 -301
- openai.py +44 -44
- prompt.py +91 -91
- requirements.txt +3 -3
- template.py +176 -176
.gitattributes
CHANGED
@@ -1,35 +1,35 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -1,13 +1,13 @@
|
|
1 |
-
---
|
2 |
-
title: DAN AI
|
3 |
-
emoji: 🏆
|
4 |
-
colorFrom: yellow
|
5 |
-
colorTo: blue
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 3.46.1
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: mit
|
11 |
-
---
|
12 |
-
|
13 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
---
|
2 |
+
title: DAN AI
|
3 |
+
emoji: 🏆
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: blue
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 3.46.1
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mit
|
11 |
+
---
|
12 |
+
|
13 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
@@ -1,157 +1,157 @@
|
|
1 |
-
import warnings
|
2 |
-
warnings.filterwarnings("ignore")
|
3 |
-
|
4 |
-
import os, json
|
5 |
-
import gradio as gr
|
6 |
-
import pandas as pd
|
7 |
-
|
8 |
-
from backend import Backend
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
QUESTIONS = [
|
13 |
-
"Animal Type",
|
14 |
-
"Exposure Age",
|
15 |
-
"Behavior Test",
|
16 |
-
"Intervention 1",
|
17 |
-
"Intervention 2",
|
18 |
-
"Genetic Chain",
|
19 |
-
]
|
20 |
-
|
21 |
-
with gr.Blocks(theme="dark") as demo:
|
22 |
-
backend = Backend()
|
23 |
-
with gr.Row():
|
24 |
-
with gr.Row():
|
25 |
-
# Update
|
26 |
-
with gr.Group():
|
27 |
-
gr.Markdown(f'<center><h1>Input</h1></center>')
|
28 |
-
gr.Markdown(f'<center><p>Please First Upload the File</p></center>')
|
29 |
-
|
30 |
-
openai_key = gr.Textbox(
|
31 |
-
label='Enter your OpenAI API key here',
|
32 |
-
type='password')
|
33 |
-
|
34 |
-
file = gr.File(label='Upload your .txt or .pdf file here', file_types=['.txt', '.pdf'], file_count = 'multiple')
|
35 |
-
|
36 |
-
questions = gr.CheckboxGroup(choices = QUESTIONS, value = QUESTIONS, label="Questions", info="Please select the question you want to ask")
|
37 |
-
|
38 |
-
btn_submit_txt = gr.Button(value='Submit
|
39 |
-
btn_submit_txt.style(full_width=True)
|
40 |
-
|
41 |
-
# Output
|
42 |
-
with gr.Group():
|
43 |
-
gr.Markdown(f'<center><h1>Output</h1></center>')
|
44 |
-
gr.Markdown(f'<center><p>The answer to your question is :</p></center>')
|
45 |
-
filename_box = gr.Textbox(label = "File")
|
46 |
-
question_box = gr.Textbox(label='Question')
|
47 |
-
answer_box = gr.Textbox(label='Answer')
|
48 |
-
# reference_box = gr.Textbox(label='Reference')
|
49 |
-
|
50 |
-
highlighted_text = gr.outputs.HTML(label="Highlighted Text")
|
51 |
-
with gr.Group():
|
52 |
-
gr.Markdown("<center><h4>Please select different questions</h4></center>")
|
53 |
-
with gr.Row():
|
54 |
-
btn_last_question = gr.Button(value='Last Question')
|
55 |
-
btn_next_question = gr.Button(value='Next Question')
|
56 |
-
|
57 |
-
with gr.Group():
|
58 |
-
gr.Markdown("<center><h4>Please select different passages</h4></center>")
|
59 |
-
with gr.Row():
|
60 |
-
btn_last_passage = gr.Button(value='Last Passage')
|
61 |
-
btn_next_passage = gr.Button(value='Next Passage')
|
62 |
-
|
63 |
-
# Correctness
|
64 |
-
with gr.Group():
|
65 |
-
gr.Markdown(f'<center><h1>Correct the Result</h1></center>')
|
66 |
-
gr.Markdown(f'<center><p>Please Correct the Results</p></center>')
|
67 |
-
|
68 |
-
with gr.Row():
|
69 |
-
save_results = gr.Textbox(placeholder = "Still need to click the button above to save the results", label = 'Save Results')
|
70 |
-
with gr.Group():
|
71 |
-
gr.Markdown(f'<center><p>Please Choose: </p></center>')
|
72 |
-
answer_correct = gr.Radio(choices = ["Correct", "Incorrect"], label='Is the Generated Answer Correct?', info="Pease select whether the generated text is correct")
|
73 |
-
correct_answer = gr.Textbox(placeholder = "Please judge on the generated answer", label = 'Correct Answer', interactive = True)
|
74 |
-
|
75 |
-
reference_correct = gr.Radio(choices = ["Correct", "Incorrect"], label="Is the Reference Correct?", info="Pease select whether the reference is correct")
|
76 |
-
correct_reference = gr.Textbox(placeholder = "Please judge on the generated answer", label = 'Correct Reference', interactive = True)
|
77 |
-
|
78 |
-
btn_submit_correctness = gr.Button(value='Submit Correctness')
|
79 |
-
btn_submit_correctness.style(full_width=True)
|
80 |
-
|
81 |
-
# Download
|
82 |
-
with gr.Group():
|
83 |
-
gr.Markdown(f'<center><h1>Download</h1></center>')
|
84 |
-
gr.Markdown(f'<center><p>Download the original LLM answers and corrected LLM answers</p></center>')
|
85 |
-
answer_file = gr.File(label='Download original LLM answers', file_types=['.xlsx'])
|
86 |
-
btn_download_answer = gr.Button(value='Download original LLM answers')
|
87 |
-
btn_download_answer.style(full_width=True)
|
88 |
-
corrected_file = gr.File(label='Download corrected data', file_types=['.xlsx'])
|
89 |
-
btn_download_corrected = gr.Button(value='Download corrected LLM answers')
|
90 |
-
btn_download_corrected.style(full_width=True)
|
91 |
-
|
92 |
-
|
93 |
-
with gr.Row():
|
94 |
-
reset = gr.Button(value='Reset')
|
95 |
-
reset.style(full_width=True)
|
96 |
-
|
97 |
-
# Answer change
|
98 |
-
answer_correct.input(
|
99 |
-
backend.change_correct_answer,
|
100 |
-
inputs = [answer_correct],
|
101 |
-
outputs = [correct_answer],
|
102 |
-
)
|
103 |
-
|
104 |
-
reference_correct.input(
|
105 |
-
backend.change_correct_reference,
|
106 |
-
inputs = [reference_correct],
|
107 |
-
outputs = [correct_reference],
|
108 |
-
)
|
109 |
-
|
110 |
-
|
111 |
-
# Submit button
|
112 |
-
btn_submit_txt.click(
|
113 |
-
backend.process_file,
|
114 |
-
inputs=[file, questions, openai_key],
|
115 |
-
outputs=[filename_box, question_box, answer_box, highlighted_text, correct_answer, correct_reference],
|
116 |
-
)
|
117 |
-
|
118 |
-
btn_submit_correctness.click( # TODO
|
119 |
-
backend.process_results,
|
120 |
-
inputs=[answer_correct, correct_answer, reference_correct, correct_reference],
|
121 |
-
outputs=[save_results],
|
122 |
-
)
|
123 |
-
|
124 |
-
# Switch question button
|
125 |
-
btn_last_question.click(
|
126 |
-
backend.process_last,
|
127 |
-
outputs=[filename_box, question_box, answer_box, highlighted_text, correct_answer, correct_reference, save_results, answer_correct, reference_correct],
|
128 |
-
)
|
129 |
-
|
130 |
-
btn_next_question.click(
|
131 |
-
backend.process_next,
|
132 |
-
outputs=[filename_box, question_box, answer_box, highlighted_text, correct_answer, correct_reference, save_results, answer_correct, reference_correct],
|
133 |
-
)
|
134 |
-
|
135 |
-
# Switch passwage button
|
136 |
-
btn_last_passage.click(
|
137 |
-
backend.switch_last_passage,
|
138 |
-
outputs=[filename_box, question_box, answer_box, highlighted_text, correct_answer, correct_reference, save_results, answer_correct, reference_correct],
|
139 |
-
)
|
140 |
-
btn_next_passage.click(
|
141 |
-
backend.switch_next_passage,
|
142 |
-
outputs=[filename_box, question_box, answer_box, highlighted_text, correct_answer, correct_reference, save_results, answer_correct, reference_correct],
|
143 |
-
)
|
144 |
-
|
145 |
-
# Download button
|
146 |
-
btn_download_answer.click(
|
147 |
-
backend.download_answer,
|
148 |
-
outputs=[answer_file],
|
149 |
-
)
|
150 |
-
|
151 |
-
btn_download_corrected.click(
|
152 |
-
backend.download_corrected,
|
153 |
-
outputs=[corrected_file],
|
154 |
-
)
|
155 |
-
|
156 |
-
demo.queue()
|
157 |
demo.launch(show_error=True, show_tips=True)
|
|
|
1 |
+
import warnings
|
2 |
+
warnings.filterwarnings("ignore")
|
3 |
+
|
4 |
+
import os, json
|
5 |
+
import gradio as gr
|
6 |
+
import pandas as pd
|
7 |
+
|
8 |
+
from backend import Backend
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
QUESTIONS = [
|
13 |
+
"Animal Type",
|
14 |
+
"Exposure Age",
|
15 |
+
"Behavior Test",
|
16 |
+
"Intervention 1",
|
17 |
+
"Intervention 2",
|
18 |
+
"Genetic Chain",
|
19 |
+
]
|
20 |
+
|
21 |
+
with gr.Blocks(theme="dark") as demo:
|
22 |
+
backend = Backend()
|
23 |
+
with gr.Row():
|
24 |
+
with gr.Row():
|
25 |
+
# Update
|
26 |
+
with gr.Group():
|
27 |
+
gr.Markdown(f'<center><h1>Input</h1></center>')
|
28 |
+
gr.Markdown(f'<center><p>Please First Upload the File</p></center>')
|
29 |
+
|
30 |
+
openai_key = gr.Textbox(
|
31 |
+
label='Enter your OpenAI API key here',
|
32 |
+
type='password')
|
33 |
+
|
34 |
+
file = gr.File(label='Upload your .txt or .pdf file here', file_types=['.txt', '.pdf'], file_count = 'multiple')
|
35 |
+
|
36 |
+
questions = gr.CheckboxGroup(choices = QUESTIONS, value = QUESTIONS, label="Questions", info="Please select the question you want to ask")
|
37 |
+
|
38 |
+
btn_submit_txt = gr.Button(value='Submit')
|
39 |
+
btn_submit_txt.style(full_width=True)
|
40 |
+
|
41 |
+
# Output
|
42 |
+
with gr.Group():
|
43 |
+
gr.Markdown(f'<center><h1>Output</h1></center>')
|
44 |
+
gr.Markdown(f'<center><p>The answer to your question is :</p></center>')
|
45 |
+
filename_box = gr.Textbox(label = "File")
|
46 |
+
question_box = gr.Textbox(label='Question')
|
47 |
+
answer_box = gr.Textbox(label='Answer')
|
48 |
+
# reference_box = gr.Textbox(label='Reference')
|
49 |
+
|
50 |
+
highlighted_text = gr.outputs.HTML(label="Highlighted Text")
|
51 |
+
with gr.Group():
|
52 |
+
gr.Markdown("<center><h4>Please select different questions</h4></center>")
|
53 |
+
with gr.Row():
|
54 |
+
btn_last_question = gr.Button(value='Last Question')
|
55 |
+
btn_next_question = gr.Button(value='Next Question')
|
56 |
+
|
57 |
+
with gr.Group():
|
58 |
+
gr.Markdown("<center><h4>Please select different passages</h4></center>")
|
59 |
+
with gr.Row():
|
60 |
+
btn_last_passage = gr.Button(value='Last Passage')
|
61 |
+
btn_next_passage = gr.Button(value='Next Passage')
|
62 |
+
|
63 |
+
# Correctness
|
64 |
+
with gr.Group():
|
65 |
+
gr.Markdown(f'<center><h1>Correct the Result</h1></center>')
|
66 |
+
gr.Markdown(f'<center><p>Please Correct the Results</p></center>')
|
67 |
+
|
68 |
+
with gr.Row():
|
69 |
+
save_results = gr.Textbox(placeholder = "Still need to click the button above to save the results", label = 'Save Results')
|
70 |
+
with gr.Group():
|
71 |
+
gr.Markdown(f'<center><p>Please Choose: </p></center>')
|
72 |
+
answer_correct = gr.Radio(choices = ["Correct", "Incorrect"], label='Is the Generated Answer Correct?', info="Pease select whether the generated text is correct")
|
73 |
+
correct_answer = gr.Textbox(placeholder = "Please judge on the generated answer", label = 'Correct Answer', interactive = True)
|
74 |
+
|
75 |
+
reference_correct = gr.Radio(choices = ["Correct", "Incorrect"], label="Is the Reference Correct?", info="Pease select whether the reference is correct")
|
76 |
+
correct_reference = gr.Textbox(placeholder = "Please judge on the generated answer", label = 'Correct Reference', interactive = True)
|
77 |
+
|
78 |
+
btn_submit_correctness = gr.Button(value='Submit Correctness')
|
79 |
+
btn_submit_correctness.style(full_width=True)
|
80 |
+
|
81 |
+
# Download
|
82 |
+
with gr.Group():
|
83 |
+
gr.Markdown(f'<center><h1>Download</h1></center>')
|
84 |
+
gr.Markdown(f'<center><p>Download the original LLM answers and corrected LLM answers</p></center>')
|
85 |
+
answer_file = gr.File(label='Download original LLM answers', file_types=['.xlsx'])
|
86 |
+
btn_download_answer = gr.Button(value='Download original LLM answers')
|
87 |
+
btn_download_answer.style(full_width=True)
|
88 |
+
corrected_file = gr.File(label='Download corrected data', file_types=['.xlsx'])
|
89 |
+
btn_download_corrected = gr.Button(value='Download corrected LLM answers')
|
90 |
+
btn_download_corrected.style(full_width=True)
|
91 |
+
|
92 |
+
|
93 |
+
with gr.Row():
|
94 |
+
reset = gr.Button(value='Reset')
|
95 |
+
reset.style(full_width=True)
|
96 |
+
|
97 |
+
# Answer change
|
98 |
+
answer_correct.input(
|
99 |
+
backend.change_correct_answer,
|
100 |
+
inputs = [answer_correct],
|
101 |
+
outputs = [correct_answer],
|
102 |
+
)
|
103 |
+
|
104 |
+
reference_correct.input(
|
105 |
+
backend.change_correct_reference,
|
106 |
+
inputs = [reference_correct],
|
107 |
+
outputs = [correct_reference],
|
108 |
+
)
|
109 |
+
|
110 |
+
|
111 |
+
# Submit button
|
112 |
+
btn_submit_txt.click(
|
113 |
+
backend.process_file,
|
114 |
+
inputs=[file, questions, openai_key],
|
115 |
+
outputs=[filename_box, question_box, answer_box, highlighted_text, correct_answer, correct_reference],
|
116 |
+
)
|
117 |
+
|
118 |
+
btn_submit_correctness.click( # TODO
|
119 |
+
backend.process_results,
|
120 |
+
inputs=[answer_correct, correct_answer, reference_correct, correct_reference],
|
121 |
+
outputs=[save_results],
|
122 |
+
)
|
123 |
+
|
124 |
+
# Switch question button
|
125 |
+
btn_last_question.click(
|
126 |
+
backend.process_last,
|
127 |
+
outputs=[filename_box, question_box, answer_box, highlighted_text, correct_answer, correct_reference, save_results, answer_correct, reference_correct],
|
128 |
+
)
|
129 |
+
|
130 |
+
btn_next_question.click(
|
131 |
+
backend.process_next,
|
132 |
+
outputs=[filename_box, question_box, answer_box, highlighted_text, correct_answer, correct_reference, save_results, answer_correct, reference_correct],
|
133 |
+
)
|
134 |
+
|
135 |
+
# Switch passwage button
|
136 |
+
btn_last_passage.click(
|
137 |
+
backend.switch_last_passage,
|
138 |
+
outputs=[filename_box, question_box, answer_box, highlighted_text, correct_answer, correct_reference, save_results, answer_correct, reference_correct],
|
139 |
+
)
|
140 |
+
btn_next_passage.click(
|
141 |
+
backend.switch_next_passage,
|
142 |
+
outputs=[filename_box, question_box, answer_box, highlighted_text, correct_answer, correct_reference, save_results, answer_correct, reference_correct],
|
143 |
+
)
|
144 |
+
|
145 |
+
# Download button
|
146 |
+
btn_download_answer.click(
|
147 |
+
backend.download_answer,
|
148 |
+
outputs=[answer_file],
|
149 |
+
)
|
150 |
+
|
151 |
+
btn_download_corrected.click(
|
152 |
+
backend.download_corrected,
|
153 |
+
outputs=[corrected_file],
|
154 |
+
)
|
155 |
+
|
156 |
+
demo.queue()
|
157 |
demo.launch(show_error=True, show_tips=True)
|
backend.py
CHANGED
@@ -1,302 +1,302 @@
|
|
1 |
-
from prompt import Prompt
|
2 |
-
from openai import OpenAI
|
3 |
-
from fuzzywuzzy import fuzz
|
4 |
-
from fuzzywuzzy import process
|
5 |
-
|
6 |
-
import gradio as gr
|
7 |
-
import pandas as pd
|
8 |
-
import os
|
9 |
-
|
10 |
-
class Backend:
|
11 |
-
def __init__(self):
|
12 |
-
self.agent = OpenAI()
|
13 |
-
self.prompt = Prompt()
|
14 |
-
|
15 |
-
def read_file_single(self, file):
|
16 |
-
# read the file
|
17 |
-
if file is not None:
|
18 |
-
with open(file.name, 'r') as f:
|
19 |
-
text = f.read()
|
20 |
-
else:
|
21 |
-
raise gr.Error("You need to upload a file first")
|
22 |
-
return text
|
23 |
-
|
24 |
-
def phrase_pdf(self, file_path):
|
25 |
-
from langchain.document_loaders import UnstructuredPDFLoader
|
26 |
-
loader = UnstructuredPDFLoader(file_path, model = 'elements')
|
27 |
-
file = loader.load()
|
28 |
-
return file[0].page_content
|
29 |
-
|
30 |
-
def read_file(self, files):
|
31 |
-
# read the file
|
32 |
-
text_list = []
|
33 |
-
self.filename_list = []
|
34 |
-
if files is not None:
|
35 |
-
for file in files:
|
36 |
-
if file.name.split('.')[-1] == 'pdf':
|
37 |
-
# convert pdf to txt
|
38 |
-
text = self.phrase_pdf(file.name)
|
39 |
-
|
40 |
-
else:
|
41 |
-
with open(file.name, 'r', encoding='utf-8') as f:
|
42 |
-
text = f.read()
|
43 |
-
|
44 |
-
text_list.append(text)
|
45 |
-
self.filename_list.append(file.name.split('\\')[-1])
|
46 |
-
else:
|
47 |
-
raise gr.Error("You need to upload a file first")
|
48 |
-
return text_list
|
49 |
-
|
50 |
-
def highlight_text(self, text, highlight_list):
|
51 |
-
# Find the original sentences
|
52 |
-
# Split the passage into sentences
|
53 |
-
sentences_in_passage = text.split('.')
|
54 |
-
sentences_in_passage = [i.split('\n') for i in sentences_in_passage]
|
55 |
-
new_sentences_in_passage = []
|
56 |
-
for i in sentences_in_passage:
|
57 |
-
new_sentences_in_passage =new_sentences_in_passage + i
|
58 |
-
|
59 |
-
# hightlight the reference
|
60 |
-
for hl in highlight_list:
|
61 |
-
# Find the best match using fuzzy matching
|
62 |
-
best_match = process.extractOne(hl, new_sentences_in_passage, scorer=fuzz.partial_ratio)
|
63 |
-
text = text.replace(best_match[0], f'<mark style="background: #A5D2F1">{best_match[0]}</mark><mark style="background: #FFC0CB"><font color="red"> (match score:{best_match[1]})</font></mark>')
|
64 |
-
|
65 |
-
# add line break
|
66 |
-
text = text.replace('\n', f" <br /> ")
|
67 |
-
|
68 |
-
# add scroll bar
|
69 |
-
text = f'<div style="height: 300px; overflow: auto;">{text}</div>'
|
70 |
-
|
71 |
-
return text
|
72 |
-
|
73 |
-
def process_file(self, file, questions, openai_key, progress = gr.Progress()):
|
74 |
-
# record the questions
|
75 |
-
self.questions = questions
|
76 |
-
|
77 |
-
# get the text_list
|
78 |
-
self.text_list = self.read_file(file)
|
79 |
-
|
80 |
-
# make the prompt
|
81 |
-
prompt_list = [self.prompt.get(text, questions, 'v3') for text in self.text_list]
|
82 |
-
|
83 |
-
# interact with openai
|
84 |
-
self.res_list = []
|
85 |
-
for prompt in progress.tqdm(prompt_list, desc = 'Generating answers...'):
|
86 |
-
res = self.agent(prompt, with_history = False, temperature = 0.1, model = 'gpt-3.5-turbo-16k', api_key = openai_key)
|
87 |
-
res = self.prompt.process_result(res, 'v3')
|
88 |
-
self.res_list.append(res)
|
89 |
-
|
90 |
-
# Use the first file as default
|
91 |
-
# Use the first question for multiple questions
|
92 |
-
gpt_res = self.res_list[0]
|
93 |
-
self.gpt_result = gpt_res
|
94 |
-
|
95 |
-
self.current_question = 0
|
96 |
-
self.totel_question = len(res.keys())
|
97 |
-
self.current_passage = 0
|
98 |
-
self.total_passages = len(self.res_list)
|
99 |
-
|
100 |
-
# make a dataframe to record everything
|
101 |
-
self.ori_answer_df = pd.DataFrame()
|
102 |
-
self.answer_df = pd.DataFrame()
|
103 |
-
for i, res in enumerate(self.res_list):
|
104 |
-
tmp = pd.DataFrame(res).T
|
105 |
-
tmp = tmp.reset_index()
|
106 |
-
tmp = tmp.rename(columns={"index":"question_id"})
|
107 |
-
tmp['filename'] = self.filename_list[i]
|
108 |
-
tmp['question'] = self.questions
|
109 |
-
self.ori_answer_df = pd.concat([tmp, self.ori_answer_df])
|
110 |
-
self.answer_df = pd.concat([tmp, self.answer_df])
|
111 |
-
|
112 |
-
# default fist question
|
113 |
-
res = res['Question 1']
|
114 |
-
question = self.questions[self.current_question]
|
115 |
-
self.answer = res['answer']
|
116 |
-
self.text = self.text_list[0]
|
117 |
-
self.highlighted_out = res['original sentences']
|
118 |
-
highlighted_out_html = self.highlight_text(self.text, self.highlighted_out)
|
119 |
-
self.highlighted_out = '\n'.join(self.highlighted_out)
|
120 |
-
|
121 |
-
file_name = self.filename_list[self.current_passage]
|
122 |
-
|
123 |
-
return file_name, question, self.answer, highlighted_out_html, self.answer, self.highlighted_out
|
124 |
-
|
125 |
-
def process_results(self, answer_correct, correct_answer, reference_correct, correct_reference):
|
126 |
-
if not hasattr(self, 'clicked_correct_answer'):
|
127 |
-
raise gr.Error("You need to judge whether the generated answer is correct first")
|
128 |
-
|
129 |
-
if not hasattr(self, 'clicked_correct_reference'):
|
130 |
-
raise gr.Error("You need to judge whether the highlighted reference is correct first")
|
131 |
-
|
132 |
-
if not hasattr(self, 'answer_df'):
|
133 |
-
raise gr.Error("You need to submit the document first")
|
134 |
-
|
135 |
-
if self.current_question >= self.totel_question or self.current_question < 0:
|
136 |
-
raise gr.Error("No more questions, please return back")
|
137 |
-
|
138 |
-
# record the answer
|
139 |
-
condition = (self.answer_df['question_id'] == f'Question {self.current_question + 1}' ) & \
|
140 |
-
(self.answer_df['filename'] == self.filename_list[self.current_passage])
|
141 |
-
self.answer_df.loc[condition, 'answer_correct'] = answer_correct
|
142 |
-
self.answer_df.loc[condition, 'reference_correct'] = reference_correct
|
143 |
-
|
144 |
-
# self.answer_df.loc[f'Question {self.current_question + 1}', 'answer_correct'] = answer_correct
|
145 |
-
# self.answer_df.loc[f'Question {self.current_question + 1}', 'reference_correct'] = reference_correct
|
146 |
-
|
147 |
-
if self.clicked_correct_answer == True:
|
148 |
-
if hasattr(self, 'answer'):
|
149 |
-
self.answer_df.loc[condition, 'correct_answer'] = self.answer
|
150 |
-
else:
|
151 |
-
raise gr.Error("You need to submit the document first")
|
152 |
-
else:
|
153 |
-
# self.answer_df.loc[f'Question {self.current_question + 1}', 'correct_answer'] = correct_answer
|
154 |
-
self.answer_df.loc[condition, 'correct_answer'] = correct_answer
|
155 |
-
|
156 |
-
if self.clicked_correct_reference == True:
|
157 |
-
if hasattr(self, 'highlighted_out'):
|
158 |
-
self.answer_df.loc[condition, 'correct_reference'] = self.highlighted_out
|
159 |
-
else:
|
160 |
-
raise gr.Error("You need to submit the document first")
|
161 |
-
else:
|
162 |
-
self.answer_df.loc[condition, 'correct_reference'] = correct_reference
|
163 |
-
|
164 |
-
gr.Info('Results saved!')
|
165 |
-
return "Results saved!"
|
166 |
-
|
167 |
-
def process_next(self):
|
168 |
-
self.current_question += 1
|
169 |
-
if hasattr(self, 'clicked_correct_answer'):
|
170 |
-
del self.clicked_correct_answer
|
171 |
-
if hasattr(self, 'clicked_correct_reference'):
|
172 |
-
del self.clicked_correct_reference
|
173 |
-
|
174 |
-
if self.current_question >= self.totel_question:
|
175 |
-
# self.current_question -= 1
|
176 |
-
return "No more questions!", "No more questions!", "No more questions!", "No more questions!", 'No more questions!', 'No more questions!', 'Still need to click the button above to save the results', None, None
|
177 |
-
else:
|
178 |
-
res = self.gpt_result[f'Question {self.current_question + 1}']
|
179 |
-
question = self.questions[self.current_question]
|
180 |
-
self.answer = res['answer']
|
181 |
-
self.highlighted_out = res['original sentences']
|
182 |
-
highlighted_out_html = self.highlight_text(self.text, self.highlighted_out)
|
183 |
-
self.highlighted_out = '\n'.join(self.highlighted_out)
|
184 |
-
file_name = self.filename_list[self.current_passage]
|
185 |
-
|
186 |
-
return file_name, question, self.answer, highlighted_out_html, 'Please judge on the generated answer', 'Please judge on the generated answer', 'Still need to click the button above to save the results', None, None
|
187 |
-
|
188 |
-
def process_last(self):
|
189 |
-
self.current_question -= 1
|
190 |
-
|
191 |
-
# To make sure to correct the answer first
|
192 |
-
if hasattr(self, 'clicked_correct_answer'):
|
193 |
-
del self.clicked_correct_answer
|
194 |
-
if hasattr(self, 'clicked_correct_reference'):
|
195 |
-
del self.clicked_correct_reference
|
196 |
-
|
197 |
-
# check question boundary
|
198 |
-
if self.current_question < 0:
|
199 |
-
# self.current_question += 1
|
200 |
-
return "No more questions!", "No more questions!", "No more questions!", "No more questions!", 'No more questions!', 'No more questions!', 'Still need to click the button above to save the results', None, None
|
201 |
-
else:
|
202 |
-
res = self.gpt_result[f'Question {self.current_question + 1}']
|
203 |
-
question = self.questions[self.current_question]
|
204 |
-
self.answer = res['answer']
|
205 |
-
self.highlighted_out = res['original sentences']
|
206 |
-
highlighted_out_html = self.highlight_text(self.text, self.highlighted_out)
|
207 |
-
self.highlighted_out = '\n'.join(self.highlighted_out)
|
208 |
-
file_name = self.filename_list[self.current_passage]
|
209 |
-
return file_name, question, self.answer, highlighted_out_html, 'Please judge on the generated answer', 'Please judge on the generated answer', 'Still need to click the button above to save the results', None, None
|
210 |
-
|
211 |
-
def switch_next_passage(self):
|
212 |
-
self.current_question = 0
|
213 |
-
|
214 |
-
# To make sure to correct the answer first
|
215 |
-
if hasattr(self, 'clicked_correct_answer'):
|
216 |
-
del self.clicked_correct_answer
|
217 |
-
if hasattr(self, 'clicked_correct_reference'):
|
218 |
-
del self.clicked_correct_reference
|
219 |
-
|
220 |
-
self.current_passage += 1
|
221 |
-
|
222 |
-
|
223 |
-
if self.current_passage >= self.total_passages:
|
224 |
-
# self.current_passage -= 1
|
225 |
-
return "No more passages!", "No more passages!", "No more passages!", "No more passages!", 'No more passages!', 'No more passages!', 'Still need to click the button above to save the results', None, None
|
226 |
-
else:
|
227 |
-
self.text = self.text_list[self.current_passage]
|
228 |
-
gpt_res = self.res_list[self.current_passage]
|
229 |
-
self.gpt_result = gpt_res
|
230 |
-
res = self.gpt_result[f'Question {self.current_question + 1}']
|
231 |
-
question = self.questions[self.current_question]
|
232 |
-
self.answer = res['answer']
|
233 |
-
self.highlighted_out = res['original sentences']
|
234 |
-
highlighted_out_html = self.highlight_text(self.text, self.highlighted_out)
|
235 |
-
self.highlighted_out = '\n'.join(self.highlighted_out)
|
236 |
-
file_name = self.filename_list[self.current_passage]
|
237 |
-
return file_name, question, self.answer, highlighted_out_html, 'Please judge on the generated answer', 'Please judge on the generated answer', 'Still need to click the button above to save the results', None, None
|
238 |
-
|
239 |
-
def switch_last_passage(self):
|
240 |
-
self.current_question = 0
|
241 |
-
|
242 |
-
# To make sure to correct the answer first
|
243 |
-
if hasattr(self, 'clicked_correct_answer'):
|
244 |
-
del self.clicked_correct_answer
|
245 |
-
if hasattr(self, 'clicked_correct_reference'):
|
246 |
-
del self.clicked_correct_reference
|
247 |
-
|
248 |
-
self.current_passage -= 1
|
249 |
-
|
250 |
-
if self.current_passage < 0:
|
251 |
-
# self.current_passage += 1
|
252 |
-
return "No more passages!", "No more passages!", "No more passages!", "No more passages!", 'No more passages!', 'No more passages!', 'Still need to click the button above to save the results', None, None
|
253 |
-
else:
|
254 |
-
self.text = self.text_list[self.current_passage]
|
255 |
-
gpt_res = self.res_list[self.current_passage]
|
256 |
-
self.gpt_result = gpt_res
|
257 |
-
res = self.gpt_result[f'Question {self.current_question + 1}']
|
258 |
-
question = self.questions[self.current_question]
|
259 |
-
self.answer = res['answer']
|
260 |
-
self.highlighted_out = res['original sentences']
|
261 |
-
highlighted_out_html = self.highlight_text(self.text, self.highlighted_out)
|
262 |
-
self.highlighted_out = '\n'.join(self.highlighted_out)
|
263 |
-
file_name = self.filename_list[self.current_passage]
|
264 |
-
return file_name, question, self.answer, highlighted_out_html, 'Please judge on the generated answer', 'Please judge on the generated answer', 'Still need to click the button above to save the results', None, None
|
265 |
-
|
266 |
-
def download_answer(self, path = './tmp', name = 'answer.xlsx'):
|
267 |
-
os.makedirs(path, exist_ok = True)
|
268 |
-
path = os.path.join(path, name)
|
269 |
-
# self.ori_answer_df['questions'] = self.questions
|
270 |
-
self.ori_answer_df.to_excel(path, index = False)
|
271 |
-
|
272 |
-
return path
|
273 |
-
|
274 |
-
def download_corrected(self, path = './tmp', name = 'corrected_answer.xlsx'):
|
275 |
-
os.makedirs(path, exist_ok = True)
|
276 |
-
path = os.path.join(path, name)
|
277 |
-
# self.answer_df['questions'] = self.questions
|
278 |
-
self.answer_df.to_excel(path, index = False)
|
279 |
-
|
280 |
-
return path
|
281 |
-
|
282 |
-
def change_correct_answer(self, correctness):
|
283 |
-
if correctness == "Correct":
|
284 |
-
self.clicked_correct_answer = True
|
285 |
-
return "No need to change"
|
286 |
-
else:
|
287 |
-
if hasattr(self, 'answer'):
|
288 |
-
self.clicked_correct_answer = False
|
289 |
-
return self.answer
|
290 |
-
else:
|
291 |
-
return "No answer yet, you need to submit the document first"
|
292 |
-
|
293 |
-
def change_correct_reference(self, correctness):
    """Record the reviewer's verdict on the highlighted reference sentences.

    Returns the text to pre-fill the correction box with: a fixed notice
    when the reference was judged correct, the current reference text when
    it needs editing, or a reminder when no document has been processed.
    """
    if correctness == "Correct":
        self.clicked_correct_reference = True
        return "No need to change"
    # Judged incorrect: hand back the extracted reference for manual editing.
    if not hasattr(self, 'highlighted_out'):
        return "No answer yet, you need to submit the document first"
    self.clicked_correct_reference = False
    return self.highlighted_out
|
|
|
1 |
+
from prompt import Prompt
|
2 |
+
from openai import OpenAI
|
3 |
+
from fuzzywuzzy import fuzz
|
4 |
+
from fuzzywuzzy import process
|
5 |
+
|
6 |
+
import gradio as gr
|
7 |
+
import pandas as pd
|
8 |
+
import os
|
9 |
+
|
10 |
+
class Backend:
    """State machine behind the Gradio answer-review UI.

    Holds the uploaded documents, the GPT-generated answers per document and
    per question, and the reviewer's corrections.  Navigation methods
    (``process_next``/``process_last``/``switch_*_passage``) all return the
    same 9-tuple of widget values:
    (file_name, question, answer, highlighted_html, answer_box,
     reference_box, save_status, None, None).

    NOTE(review): almost all state lives in instance attributes created
    lazily by ``process_file`` — other methods guard with ``hasattr``.
    """

    def __init__(self):
        # LLM client and prompt builder (project-local classes).
        self.agent = OpenAI()
        self.prompt = Prompt()

    def read_file_single(self, file):
        """Read one uploaded file object and return its text content.

        Raises gr.Error when no file was uploaded.
        """
        # read the file
        if file is not None:
            with open(file.name, 'r') as f:
                text = f.read()
        else:
            raise gr.Error("You need to upload a file first")
        return text

    def phrase_pdf(self, file_path):
        """Extract the text of a PDF via langchain's UnstructuredPDFLoader.

        NOTE(review): name is presumably a typo for ``parse_pdf``; the import
        is deferred so non-PDF workflows don't pay for unstructured/langchain.
        """
        from langchain.document_loaders import UnstructuredPDFLoader
        loader = UnstructuredPDFLoader(file_path, model = 'elements')
        file = loader.load()
        # Only the first loaded element's content is used.
        return file[0].page_content

    def read_file(self, files):
        """Read a list of uploaded files (PDF or plain text).

        Side effect: populates ``self.filename_list`` with base names
        (split on backslash, so Windows-style upload paths are assumed —
        TODO confirm on other platforms).  Returns the list of file texts.
        Raises gr.Error when no files were uploaded.
        """
        # read the file
        text_list = []
        self.filename_list = []
        if files is not None:
            for file in files:
                if file.name.split('.')[-1] == 'pdf':
                    # convert pdf to txt
                    text = self.phrase_pdf(file.name)
                else:
                    with open(file.name, 'r', encoding='utf-8') as f:
                        text = f.read()
                text_list.append(text)
                self.filename_list.append(file.name.split('\\')[-1])
        else:
            raise gr.Error("You need to upload a file first")
        return text_list

    def highlight_text(self, text, highlight_list):
        """Return ``text`` as HTML with each reference sentence highlighted.

        For every string in ``highlight_list`` the closest sentence in the
        passage is found by fuzzy partial-ratio matching and wrapped in a
        colored <mark> tag together with its match score.
        """
        # Find the original sentences
        # Split the passage into sentences
        sentences_in_passage = text.split('.')
        sentences_in_passage = [i.split('\n') for i in sentences_in_passage]
        new_sentences_in_passage = []
        # Flatten the nested list of sentence fragments.
        for i in sentences_in_passage:
            new_sentences_in_passage =new_sentences_in_passage + i

        # hightlight the reference
        for hl in highlight_list:
            # Find the best match using fuzzy matching
            best_match = process.extractOne(hl, new_sentences_in_passage, scorer=fuzz.partial_ratio)
            text = text.replace(best_match[0], f'<mark style="background: #A5D2F1">{best_match[0]}</mark><mark style="background: #FFC0CB"><font color="red"> (match score:{best_match[1]})</font></mark>')

        # add line break
        text = text.replace('\n', f" <br /> ")

        # add scroll bar
        text = f'<div style="height: 300px; overflow: auto;">{text}</div>'

        return text

    def process_file(self, file, questions, openai_key, progress = gr.Progress()):
        """Run the full pipeline: read files, query GPT, build result tables.

        ``file`` is the Gradio list of uploaded files (despite the singular
        name).  Initializes all navigation state and returns the widget
        values for the first question of the first passage.
        """
        # record the questions
        self.questions = questions

        # get the text_list
        self.text_list = self.read_file(file)

        # make the prompt
        prompt_list = [self.prompt.get(text, questions, 'v3') for text in self.text_list]

        # interact with openai
        self.res_list = []
        for prompt in progress.tqdm(prompt_list, desc = 'Generating answers...'):
            res = self.agent(prompt, with_history = False, temperature = 0.1, model = 'gpt-3.5-turbo-16k', api_key = openai_key)
            res = self.prompt.process_result(res, 'v3')
            self.res_list.append(res)

        # Use the first file as default
        # Use the first question for multiple questions
        gpt_res = self.res_list[0]
        self.gpt_result = gpt_res

        self.current_question = 0
        # NOTE(review): 'totel' is a typo for 'total' but is used consistently.
        self.totel_question = len(res.keys())
        self.current_passage = 0
        self.total_passages = len(self.res_list)

        # make a dataframe to record everything
        self.ori_answer_df = pd.DataFrame()
        self.answer_df = pd.DataFrame()
        for i, res in enumerate(self.res_list):
            tmp = pd.DataFrame(res).T
            tmp = tmp.reset_index()
            tmp = tmp.rename(columns={"index":"question_id"})
            tmp['filename'] = self.filename_list[i]
            tmp['question'] = self.questions
            self.ori_answer_df = pd.concat([tmp, self.ori_answer_df])
            self.answer_df = pd.concat([tmp, self.answer_df])

        # default fist question
        # NOTE(review): 'res' here is the loop variable from the dataframe
        # loop above, i.e. the LAST passage's results — but self.answer is
        # then taken from it while self.text is the FIRST passage; verify.
        res = res['Question 1']
        question = self.questions[self.current_question]
        self.answer = res['answer']
        self.text = self.text_list[0]
        self.highlighted_out = res['original sentences']
        highlighted_out_html = self.highlight_text(self.text, self.highlighted_out)
        # Collapse the list of reference sentences into one display string.
        self.highlighted_out = '\n'.join(self.highlighted_out)

        file_name = self.filename_list[self.current_passage]

        return file_name, question, self.answer, highlighted_out_html, self.answer, self.highlighted_out

    def process_results(self, answer_correct, correct_answer, reference_correct, correct_reference):
        """Persist the reviewer's verdicts for the current question/passage.

        Writes verdicts and (possibly corrected) answer/reference into the
        matching row of ``self.answer_df``.  Raises gr.Error when the
        reviewer has not yet judged both fields or no document was submitted.
        """
        if not hasattr(self, 'clicked_correct_answer'):
            raise gr.Error("You need to judge whether the generated answer is correct first")

        if not hasattr(self, 'clicked_correct_reference'):
            raise gr.Error("You need to judge whether the highlighted reference is correct first")

        if not hasattr(self, 'answer_df'):
            raise gr.Error("You need to submit the document first")

        if self.current_question >= self.totel_question or self.current_question < 0:
            raise gr.Error("No more questions, please return back")

        # record the answer
        # Row selector: current question id AND current passage's filename.
        condition = (self.answer_df['question_id'] == f'Question {self.current_question + 1}' ) & \
                    (self.answer_df['filename'] == self.filename_list[self.current_passage])
        self.answer_df.loc[condition, 'answer_correct'] = answer_correct
        self.answer_df.loc[condition, 'reference_correct'] = reference_correct

        # self.answer_df.loc[f'Question {self.current_question + 1}', 'answer_correct'] = answer_correct
        # self.answer_df.loc[f'Question {self.current_question + 1}', 'reference_correct'] = reference_correct

        if self.clicked_correct_answer == True:
            # Reviewer accepted the generated answer: store it unchanged.
            if hasattr(self, 'answer'):
                self.answer_df.loc[condition, 'correct_answer'] = self.answer
            else:
                raise gr.Error("You need to submit the document first")
        else:
            # self.answer_df.loc[f'Question {self.current_question + 1}', 'correct_answer'] = correct_answer
            self.answer_df.loc[condition, 'correct_answer'] = correct_answer

        if self.clicked_correct_reference == True:
            if hasattr(self, 'highlighted_out'):
                self.answer_df.loc[condition, 'correct_reference'] = self.highlighted_out
            else:
                raise gr.Error("You need to submit the document first")
        else:
            self.answer_df.loc[condition, 'correct_reference'] = correct_reference

        gr.Info('Results saved!')
        return "Results saved!"

    def process_next(self):
        """Advance to the next question and return refreshed widget values.

        Clears the per-question judgement flags so the reviewer must judge
        again before saving.  Past the last question, returns placeholder
        strings instead.
        """
        self.current_question += 1
        if hasattr(self, 'clicked_correct_answer'):
            del self.clicked_correct_answer
        if hasattr(self, 'clicked_correct_reference'):
            del self.clicked_correct_reference

        if self.current_question >= self.totel_question:
            # self.current_question -= 1
            return "No more questions!", "No more questions!", "No more questions!", "No more questions!", 'No more questions!', 'No more questions!', 'Still need to click the button above to save the results', None, None
        else:
            res = self.gpt_result[f'Question {self.current_question + 1}']
            question = self.questions[self.current_question]
            self.answer = res['answer']
            self.highlighted_out = res['original sentences']
            highlighted_out_html = self.highlight_text(self.text, self.highlighted_out)
            self.highlighted_out = '\n'.join(self.highlighted_out)
            file_name = self.filename_list[self.current_passage]

            return file_name, question, self.answer, highlighted_out_html, 'Please judge on the generated answer', 'Please judge on the generated answer', 'Still need to click the button above to save the results', None, None

    def process_last(self):
        """Go back to the previous question and return refreshed widget values.

        Mirror image of ``process_next``; below question 0 returns
        placeholder strings.
        """
        self.current_question -= 1

        # To make sure to correct the answer first
        if hasattr(self, 'clicked_correct_answer'):
            del self.clicked_correct_answer
        if hasattr(self, 'clicked_correct_reference'):
            del self.clicked_correct_reference

        # check question boundary
        if self.current_question < 0:
            # self.current_question += 1
            return "No more questions!", "No more questions!", "No more questions!", "No more questions!", 'No more questions!', 'No more questions!', 'Still need to click the button above to save the results', None, None
        else:
            res = self.gpt_result[f'Question {self.current_question + 1}']
            question = self.questions[self.current_question]
            self.answer = res['answer']
            self.highlighted_out = res['original sentences']
            highlighted_out_html = self.highlight_text(self.text, self.highlighted_out)
            self.highlighted_out = '\n'.join(self.highlighted_out)
            file_name = self.filename_list[self.current_passage]
            return file_name, question, self.answer, highlighted_out_html, 'Please judge on the generated answer', 'Please judge on the generated answer', 'Still need to click the button above to save the results', None, None

    def switch_next_passage(self):
        """Jump to question 1 of the next uploaded passage.

        Resets the judgement flags and the question index; past the last
        passage, returns placeholder strings.
        """
        self.current_question = 0

        # To make sure to correct the answer first
        if hasattr(self, 'clicked_correct_answer'):
            del self.clicked_correct_answer
        if hasattr(self, 'clicked_correct_reference'):
            del self.clicked_correct_reference

        self.current_passage += 1

        if self.current_passage >= self.total_passages:
            # self.current_passage -= 1
            return "No more passages!", "No more passages!", "No more passages!", "No more passages!", 'No more passages!', 'No more passages!', 'Still need to click the button above to save the results', None, None
        else:
            self.text = self.text_list[self.current_passage]
            gpt_res = self.res_list[self.current_passage]
            self.gpt_result = gpt_res
            res = self.gpt_result[f'Question {self.current_question + 1}']
            question = self.questions[self.current_question]
            self.answer = res['answer']
            self.highlighted_out = res['original sentences']
            highlighted_out_html = self.highlight_text(self.text, self.highlighted_out)
            self.highlighted_out = '\n'.join(self.highlighted_out)
            file_name = self.filename_list[self.current_passage]
            return file_name, question, self.answer, highlighted_out_html, 'Please judge on the generated answer', 'Please judge on the generated answer', 'Still need to click the button above to save the results', None, None

    def switch_last_passage(self):
        """Jump to question 1 of the previous uploaded passage.

        Mirror image of ``switch_next_passage``; below passage 0 returns
        placeholder strings.
        """
        self.current_question = 0

        # To make sure to correct the answer first
        if hasattr(self, 'clicked_correct_answer'):
            del self.clicked_correct_answer
        if hasattr(self, 'clicked_correct_reference'):
            del self.clicked_correct_reference

        self.current_passage -= 1

        if self.current_passage < 0:
            # self.current_passage += 1
            return "No more passages!", "No more passages!", "No more passages!", "No more passages!", 'No more passages!', 'No more passages!', 'Still need to click the button above to save the results', None, None
        else:
            self.text = self.text_list[self.current_passage]
            gpt_res = self.res_list[self.current_passage]
            self.gpt_result = gpt_res
            res = self.gpt_result[f'Question {self.current_question + 1}']
            question = self.questions[self.current_question]
            self.answer = res['answer']
            self.highlighted_out = res['original sentences']
            highlighted_out_html = self.highlight_text(self.text, self.highlighted_out)
            self.highlighted_out = '\n'.join(self.highlighted_out)
            file_name = self.filename_list[self.current_passage]
            return file_name, question, self.answer, highlighted_out_html, 'Please judge on the generated answer', 'Please judge on the generated answer', 'Still need to click the button above to save the results', None, None

    def download_answer(self, path = './tmp', name = 'answer.xlsx'):
        """Write the original (uncorrected) answers to path/name and return the path."""
        os.makedirs(path, exist_ok = True)
        path = os.path.join(path, name)
        # self.ori_answer_df['questions'] = self.questions
        self.ori_answer_df.to_excel(path, index = False)

        return path

    def download_corrected(self, path = './tmp', name = 'corrected_answer.xlsx'):
        """Write the reviewer-corrected answers to path/name and return the path."""
        os.makedirs(path, exist_ok = True)
        path = os.path.join(path, name)
        # self.answer_df['questions'] = self.questions
        self.answer_df.to_excel(path, index = False)

        return path

    def change_correct_answer(self, correctness):
        """Record the reviewer's verdict on the answer; return correction-box text."""
        if correctness == "Correct":
            self.clicked_correct_answer = True
            return "No need to change"
        else:
            if hasattr(self, 'answer'):
                self.clicked_correct_answer = False
                return self.answer
            else:
                return "No answer yet, you need to submit the document first"

    def change_correct_reference(self, correctness):
        """Record the reviewer's verdict on the reference; return correction-box text."""
        if correctness == "Correct":
            self.clicked_correct_reference = True
            return "No need to change"
        else:
            if hasattr(self, 'highlighted_out'):
                self.clicked_correct_reference = False
                return self.highlighted_out
            else:
                return "No answer yet, you need to submit the document first"
|
openai.py
CHANGED
@@ -1,44 +1,44 @@
|
|
1 |
-
import requests
|
2 |
-
|
3 |
-
class OpenAI:
    """Minimal client for the OpenAI chat-completions HTTP endpoint.

    Stores an optional running conversation in ``self.history`` using the
    chat API message format ``{'role': ..., 'content': ...}``.
    """

    def __init__(self, init_prompt = None):
        # Conversation history; optionally seeded with a system prompt.
        self.history = []
        if init_prompt is not None:
            self.history.append({'role': 'system', 'content': init_prompt})

    def clear_history(self):
        """Forget all stored messages."""
        self.history = []

    def show_history(self):
        """Print each stored message as '<role>: <content>'."""
        for message in self.history:
            print(f"{message['role']}: {message['content']}")

    def get_raw_history(self):
        """Return the underlying history list (not a copy)."""
        return self.history

    def __call__(self, prompt, with_history = False, model = 'gpt-3.5-turbo', temperature = 0, api_key = None):
        """Send ``prompt`` to the chat API and return the reply text.

        When ``with_history`` is True the full stored conversation is sent;
        otherwise only the new user message is sent.
        """
        URL = 'https://api.openai.com/v1/chat/completions'
        new_message = {'role': 'user', 'content': prompt}
        if with_history:
            self.history.append(new_message)
            messages = self.history
        else:
            messages = [new_message]

        resp = requests.post(URL, json={
            'model': model,
            'messages': messages,
            'temperature': temperature,
        }, headers={
            'Authorization': f"Bearer {api_key}"
        })
        # NOTE(review): the assistant reply is appended even when
        # with_history is False (polluting later stateful calls), and
        # resp.json() is parsed twice; an API error payload without
        # 'choices' raises a bare KeyError here.
        self.history.append(resp.json()['choices'][0]['message'])
        res = resp.json()['choices'][0]['message']['content']

        # with open("tmp_res.txt", 'w') as f:
        #     f.write(res)

        # with open("tmp_res.txt", 'r') as f:
        #     res = f.read()

        return res
|
|
|
1 |
+
import requests
|
2 |
+
|
3 |
+
class OpenAI:
    """Minimal client for the OpenAI chat-completions HTTP endpoint.

    Stores an optional running conversation in ``self.history`` using the
    chat API message format ``{'role': ..., 'content': ...}``.
    """

    def __init__(self, init_prompt = None):
        # Conversation history; optionally seeded with a system prompt.
        self.history = []
        if init_prompt is not None:
            self.history.append({'role': 'system', 'content': init_prompt})

    def clear_history(self):
        """Forget all stored messages."""
        self.history = []

    def show_history(self):
        """Print each stored message as '<role>: <content>'."""
        for message in self.history:
            print(f"{message['role']}: {message['content']}")

    def get_raw_history(self):
        """Return the underlying history list (not a copy)."""
        return self.history

    def __call__(self, prompt, with_history = False, model = 'gpt-3.5-turbo', temperature = 0, api_key = None):
        """Send ``prompt`` to the chat API and return the reply text.

        When ``with_history`` is True the full stored conversation is sent
        and both the user message and the assistant reply are recorded;
        otherwise the call is stateless and history is left untouched.

        Raises RuntimeError when the API returns an error payload instead
        of a completion.
        """
        URL = 'https://api.openai.com/v1/chat/completions'
        new_message = {'role': 'user', 'content': prompt}
        if with_history:
            self.history.append(new_message)
            messages = self.history
        else:
            messages = [new_message]

        resp = requests.post(URL, json={
            'model': model,
            'messages': messages,
            'temperature': temperature,
        }, headers={
            'Authorization': f"Bearer {api_key}"
        })
        # BUG FIX: parse the response once (was parsed twice) and surface a
        # clear error instead of a bare KeyError on API error payloads.
        data = resp.json()
        if 'choices' not in data:
            raise RuntimeError(f"OpenAI API error: {data.get('error', data)}")
        message = data['choices'][0]['message']
        # BUG FIX: only record the assistant reply for stateful calls; the
        # original appended unconditionally, so stateless (with_history=False)
        # calls polluted the history with orphaned assistant messages.
        if with_history:
            self.history.append(message)
        res = message['content']

        # with open("tmp_res.txt", 'w') as f:
        #     f.write(res)

        # with open("tmp_res.txt", 'r') as f:
        #     res = f.read()

        return res
|
prompt.py
CHANGED
@@ -1,91 +1,91 @@
|
|
1 |
-
from template import TEMPLATE_v1, TEMPLATE_v2, TEMPLATE_v3, QUESTIONS
|
2 |
-
import json
|
3 |
-
|
4 |
-
class Prompt:
    """Builds versioned prompts from a document + questions and parses replies.

    Three template versions (v1-v3) are supported; v3 ignores the questions
    because they are baked into TEMPLATE_v3 itself.
    """

    def __init__(self) -> None:
        # self.questions = QUESTIONS
        self.template_v1 = TEMPLATE_v1
        self.template_v2 = TEMPLATE_v2
        self.template_v3 = TEMPLATE_v3
        # Default template/parser version.
        self.version = "v3"

    def combine_questions(self, questions):
        """Join questions into 'Question N: ...' lines, skipping placeholders."""
        questions = [ f'Question {id_ +1 }: {q}' for id_, q in enumerate(questions) if 'Input question' not in q]
        questions = '\n'.join(questions)
        return questions

    def _get_v1(self, input, questions):
        # NOTE(review): self.questions is never set (commented out in
        # __init__), so this raises AttributeError; the combined local
        # `questions` is computed and then discarded — likely a bug.
        questions = self.combine_questions(questions)
        return self.template_v1.format(input, self.questions)

    def _get_v2(self, input, questions):
        # NOTE(review): same self.questions bug as _get_v1.
        questions = self.combine_questions(questions)
        return self.template_v2.format(input, self.questions)

    def _get_v3(self, input, questions):
        # v3 templates embed the questions, so only the document is inserted.
        return self.template_v3.format(input)

    def get(self, input, questions, version = None):
        """Build the prompt for ``input`` using ``version`` (or the stored one)."""
        self.version = version if version else self.version
        if self.version == 'v1':
            return self._get_v1(input, questions)
        elif self.version == 'v2':
            return self._get_v2(input, questions)
        elif self.version == 'v3':
            return self._get_v3(input, questions)
        else:
            raise ValueError('Version should be one of {v1, v2, v3}')

    def _process_v1(self, res):
        # v1 replies are plain JSON.
        res = json.loads(res)
        return res

    def _process_v2(self, res):
        # v2 replies are plain JSON.
        res = json.loads(res)
        return res

    def _process_v3(self, x):
        """Flatten a v3 JSON reply into {'Question N': {'answer', 'original sentences'}}.

        Entries may be a single answer dict or a dict of exactly two nested
        answer dicts (for compound questions) — the latter is split into two
        sequential question ids.
        """
        x = json.loads(x)
        res = {}
        question_id = 0
        for k, v in x.items():

            if 'answer' in v:
                question_id += 1
                question_name = f'Question {question_id}'
                res_tmp = {"answer": v['answer'], "original sentences": v['original sentences']}
                res[question_name] = res_tmp
            else:
                # Compound entry: exactly two nested answers expected.
                k_1, k_2 = v.keys()
                in_1 = v[k_1]
                in_2 = v[k_2]

                question_id += 1
                question_name = f'Question {question_id}'
                res_tmp_1 = {"answer": in_1['answer'], "original sentences": in_1['original sentences']}
                res[question_name] = res_tmp_1

                question_id += 1
                question_name = f'Question {question_id}'
                res_tmp_2 = {"answer": in_2['answer'], "original sentences": in_2['original sentences']}
                res[question_name] = res_tmp_2

        return res

    def process_result(self, result, version = None):
        """Parse a raw model reply with the parser for ``version``.

        NOTE(review): the dispatch below tests the ``version`` argument, not
        ``self.version``, so calling with version=None always raises
        ValueError — likely a bug.
        """
        if not version is None and self.version != version:
            self.version = version
            print(f'Version changed to {version}')

        if version == 'v1':
            result = self._process_v1(result)
            return result
        elif version == 'v2':
            result = self._process_v2(result)
            return result
        elif version == 'v3':
            result = self._process_v3(result)
            return result
        else:
            raise ValueError('Version should be one of {v1, v2, v3}')
|
|
|
1 |
+
from template import TEMPLATE_v1, TEMPLATE_v2, TEMPLATE_v3, QUESTIONS
|
2 |
+
import json
|
3 |
+
|
4 |
+
class Prompt:
    """Builds versioned prompts from a document + questions and parses replies.

    Three template versions (v1-v3) are supported; v3 ignores the questions
    because they are baked into TEMPLATE_v3 itself.
    """

    def __init__(self) -> None:
        # self.questions = QUESTIONS
        self.template_v1 = TEMPLATE_v1
        self.template_v2 = TEMPLATE_v2
        self.template_v3 = TEMPLATE_v3
        # Default template/parser version.
        self.version = "v3"

    def combine_questions(self, questions):
        """Join questions into 'Question N: ...' lines, skipping placeholders."""
        questions = [ f'Question {id_ +1 }: {q}' for id_, q in enumerate(questions) if 'Input question' not in q]
        questions = '\n'.join(questions)
        return questions

    def _get_v1(self, input, questions):
        """Fill the v1 template with the document and the combined questions."""
        questions = self.combine_questions(questions)
        # BUG FIX: format with the combined local `questions`; self.questions
        # is never set (commented out in __init__) and raised AttributeError.
        return self.template_v1.format(input, questions)

    def _get_v2(self, input, questions):
        """Fill the v2 template with the document and the combined questions."""
        questions = self.combine_questions(questions)
        # BUG FIX: same self.questions -> local questions fix as _get_v1.
        return self.template_v2.format(input, questions)

    def _get_v3(self, input, questions):
        # v3 templates embed the questions, so only the document is inserted.
        return self.template_v3.format(input)

    def get(self, input, questions, version = None):
        """Build the prompt for ``input`` using ``version`` (or the stored one)."""
        self.version = version if version else self.version
        if self.version == 'v1':
            return self._get_v1(input, questions)
        elif self.version == 'v2':
            return self._get_v2(input, questions)
        elif self.version == 'v3':
            return self._get_v3(input, questions)
        else:
            raise ValueError('Version should be one of {v1, v2, v3}')

    def _process_v1(self, res):
        # v1 replies are plain JSON.
        res = json.loads(res)
        return res

    def _process_v2(self, res):
        # v2 replies are plain JSON.
        res = json.loads(res)
        return res

    def _process_v3(self, x):
        """Flatten a v3 JSON reply into {'Question N': {'answer', 'original sentences'}}.

        Entries may be a single answer dict or a dict of exactly two nested
        answer dicts (for compound questions) — the latter is split into two
        sequential question ids.
        """
        x = json.loads(x)
        res = {}
        question_id = 0
        for k, v in x.items():

            if 'answer' in v:
                question_id += 1
                question_name = f'Question {question_id}'
                res_tmp = {"answer": v['answer'], "original sentences": v['original sentences']}
                res[question_name] = res_tmp
            else:
                # Compound entry: exactly two nested answers expected.
                k_1, k_2 = v.keys()
                in_1 = v[k_1]
                in_2 = v[k_2]

                question_id += 1
                question_name = f'Question {question_id}'
                res_tmp_1 = {"answer": in_1['answer'], "original sentences": in_1['original sentences']}
                res[question_name] = res_tmp_1

                question_id += 1
                question_name = f'Question {question_id}'
                res_tmp_2 = {"answer": in_2['answer'], "original sentences": in_2['original sentences']}
                res[question_name] = res_tmp_2

        return res

    def process_result(self, result, version = None):
        """Parse a raw model reply with the parser for the effective version.

        Passing ``version`` switches the stored version; passing None reuses
        the stored one.
        """
        if not version is None and self.version != version:
            self.version = version
            print(f'Version changed to {version}')

        # BUG FIX: dispatch on self.version (the effective version) instead of
        # the raw argument, so version=None no longer raises ValueError.
        if self.version == 'v1':
            result = self._process_v1(result)
            return result
        elif self.version == 'v2':
            result = self._process_v2(result)
            return result
        elif self.version == 'v3':
            result = self._process_v3(result)
            return result
        else:
            raise ValueError('Version should be one of {v1, v2, v3}')
|
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
fuzzywuzzy
|
2 |
-
openpyxl
|
3 |
-
unstructured[all-docs]
|
4 |
langchain
|
|
|
1 |
+
fuzzywuzzy
|
2 |
+
openpyxl
|
3 |
+
unstructured[all-docs]
|
4 |
langchain
|
template.py
CHANGED
@@ -1,177 +1,177 @@
|
|
1 |
-
TEMPLATE_v3 = '''We now have a following <document> in the medical field:
|
2 |
-
|
3 |
-
"""
|
4 |
-
{}
|
5 |
-
"""
|
6 |
-
You are an expert in biomedical research.
|
7 |
-
You are asked to answer the following <question>s based on the <document>, the <question>s and their <instruction>s and <rule>s are as follows:
|
8 |
-
|
9 |
-
- "Question 1":
|
10 |
-
- "question": "What is the <animal type> of this study?"
|
11 |
-
- "instruction": "This task is to find the <animal type> according to the <document>."
|
12 |
-
- "definition":
|
13 |
-
- "animal type": "The rodent type used in the article"
|
14 |
-
- "rule": "<answer> of <animal type> should be one of the two choices {{mice/rats}} or both"
|
15 |
-
|
16 |
-
- "Question 2":
|
17 |
-
- "question": "What is the <exposure age> of this study?"
|
18 |
-
- "instruction": "This task is to find the <exposure age> according to the <document>."
|
19 |
-
- "definition":
|
20 |
-
- "exposure age": "The age when the animals were exposed to anesthetics. There are two kinds of <exposure age>: <postnatal day> and <gestational day>"
|
21 |
-
- "postnatal day": "<postnatal day> means the days after the animals were born. For example, 'postnatal day <int>' means the animals were born for <int> day. 'postnatal day <int>' is sometimes shortened to 'PND <int>' or 'pnd <int>', which still means 'postnatal day <int>', after birth. 'postnatal day <int>' is sometimes shortened to 'p<int>', which still means 'postnatal day <int>', after birth"
|
22 |
-
- "gestational day": "<gestational day> means the days after the animals were pregnant. For example, 'gestational day <int>' means the animals were pregnant for <int> day. 'gestational day <int>' is sometimes abbreviated as 'E <int>', 'E' meaning before birth"
|
23 |
-
- "rule": "<answer> of <exposure age> should be expressed as one of {{'postnatal day <int>'/'gestational day <int>'}}. For Example: "postnatal day 7", "Gestational day 21"
|
24 |
-
|
25 |
-
- "Question 3":
|
26 |
-
- "question": "Is there any <behavior test> done in this study?"
|
27 |
-
- "instruction": "This task is to find whether there are any <behavior test> in the study according to the <document>."
|
28 |
-
- "definition":
|
29 |
-
- "behavior test": "If there are any of the <behavior tests> described and done in the article, which mentioned as: 'Open field test', 'Morris water task', 'fear conditioning test', 'Dark/light avoidance'; 'passive/active avoidance test'; 'elevated maze', 'Forced swim test', 'Object recognition test', 'Social interaction/preference'."
|
30 |
-
- "rule": "<answer> to <behavior test> should be one of the two choices {{Yes/No}}."
|
31 |
-
|
32 |
-
- "Question 4":
|
33 |
-
- "question": "What's the <intervention>s of this study?
|
34 |
-
- "instruction": "This task is to find the <intervention>s according to the <document>."
|
35 |
-
- "definition":
|
36 |
-
- "intervention": "The <intervention>s are anesthetic drugs, which in one of {{"isoflurane"/"sevoflurane"/"desflurane"/"ketamine"/"propofol"/"Midazolam"/"Nitrous oxide“}}."
|
37 |
-
- "rule": "There are one or two <intervention>s in the <document>. Please find them all and answer the <question>. If there is only one <intervention>, the second one is 'None'."
|
38 |
-
|
39 |
-
- "Question 5":
|
40 |
-
- "question": "What's the <genetic chain> of this study?"
|
41 |
-
- "instruction": "This task is to find the <genetic chain> according to the <document>."
|
42 |
-
- "definition":
|
43 |
-
- "genetic chain": "The <genetic chain> is the genetic type of the animals being used in the article."
|
44 |
-
- "rule": "Please do as the following examples: 'C57BL/6', 'C57BL/6J' should be extracted as 'C57BL/6'; 'Sprague Dawley', 'Sprague-Dawley', 'SD' should be extracted as 'Sprague Dawley'; 'CD-1' should be extracted as 'CD-1'; 'Wistar/ST' should be extracted as 'Wistar/ST'; 'Wistar' should be extracted as 'Wistar'; 'FMR-1 KO' should be extracted as 'FMR-1 KO'."
|
45 |
-
|
46 |
-
|
47 |
-
Here is the instrucrtions for all the <question>s:
|
48 |
-
|
49 |
-
1. Please select the <original sentences> related the "behavior tests" from the <document> for each <question>.
|
50 |
-
2. Please use the <original sentences> to answer the <question> by following the <rule> and <instruction> accroding to the <definition>.
|
51 |
-
3. Please provide <original sentences> coming from the <document>.
|
52 |
-
4. Output the <answer> in the following JSON format:
|
53 |
-
|
54 |
-
{{
|
55 |
-
"Question 1": {{
|
56 |
-
"answer": "{{mice/rats/both}}",
|
57 |
-
"original sentences": []
|
58 |
-
}},
|
59 |
-
"Question 2": {{
|
60 |
-
"answer": "{{postnatal day <int>/gestational day <int>}}",
|
61 |
-
"original sentences": []
|
62 |
-
}},
|
63 |
-
"Question 3": {{
|
64 |
-
"answer": "{{Yes/No}}",
|
65 |
-
"original sentences": []
|
66 |
-
}},
|
67 |
-
"Question 4":
|
68 |
-
{{
|
69 |
-
{{intervention 1}}: {{
|
70 |
-
"answer": "{{intervention 1}}
|
71 |
-
"original sentences": []
|
72 |
-
}}
|
73 |
-
{{intervention 2}}: {{
|
74 |
-
"answer": "{{intervention 2}}",
|
75 |
-
"original sentences": []
|
76 |
-
}}
|
77 |
-
}},
|
78 |
-
"Question 5": {{
|
79 |
-
"answer": "{{genetic chain}}",
|
80 |
-
"original sentences": []
|
81 |
-
}}
|
82 |
-
}}
|
83 |
-
'''
|
84 |
-
|
85 |
-
TEMPLATE_v2 = '''We now have a following <document> in the medical field:
|
86 |
-
|
87 |
-
"""
|
88 |
-
{}
|
89 |
-
"""
|
90 |
-
We have some introduction here:
|
91 |
-
|
92 |
-
1. DOI: The DOI link for the article, usually can be found in the first line of the .txt file for the article. E.g., “DOI: 10.3892/mmr.2019.10397”.
|
93 |
-
2. Citation ID: The number in the file name. E.g., “1134”.
|
94 |
-
3. First author: The last name in the file name. E.g., “Guan”.
|
95 |
-
4. Year: The year in the file name. E.g., “2019”.
|
96 |
-
5. Animal type: The rodent type used in the article, should be one of the choices: mice, rats. E.g., “rats”.
|
97 |
-
6. Exposure age: The age when the animals were exposed to anesthetics, should be mentioned as "PND1", "PND7","postnatal day 7", "Gestational day 21", etc, which should be extract as: 'PND XX' , 'Gestational day xx'. E.g., “PND7”.
|
98 |
-
7. Behavior test: Whether there is any behavior test in the article, should be one of the choices: "Y", "N". "Y" is chosen if there are any of the behavior tests described and done in the article, which mentioned as: "Open field test", "Morris water task", "fear conditioning test", "Dark/light avoidance"; "passive/active avoidance test"; "elevated maze", "Forced swim test", "Object recognition test", "Social interaction/preference“. E.g., “N”.
|
99 |
-
8. Intervention 1 & Intervention 2: Intervention 1 and Intervention 2 are both anesthetic drugs, which listed as: "isoflurane", "sevoflurane", "desflurane", "ketamine", "propofol", "Midazolam", "Nitrous oxide“. If none, put “NA”. E.g., “propofol”.
|
100 |
-
9. Genetic chain: Genetic chain is the genetic type of the animals being used in the article, here is the examples:
|
101 |
-
"C57BL/6", "C57BL/6J" should be extracted as "C57BL/6"; "Sprague Dawley", "Sprague-Dawley", "SD" should be extracted as "Sprague Dawley"; "CD-1" should be extracted as "CD-1"; "Wistar/ST" should be extracted as "Wistar/ST"; "Wistar" should be extracted as "Wistar"; "FMR-1 KO" should be extracted as "FMR-1 KO“. E.g., “Sprague Dawley”.
|
102 |
-
|
103 |
-
We have some <question>s begin with "Question" here:
|
104 |
-
"""
|
105 |
-
{}
|
106 |
-
"""
|
107 |
-
|
108 |
-
Please finish the following task:
|
109 |
-
|
110 |
-
1. Please select the <original sentences> related the each <question> from the <document>.
|
111 |
-
2. Please use the <original sentences> to answer the <question>.
|
112 |
-
3. Please provide <original sentences> coming from the <document>.
|
113 |
-
4. Output the <answer> in the following json format:
|
114 |
-
|
115 |
-
{{
|
116 |
-
"Question 1": {{
|
117 |
-
"question": {{}},
|
118 |
-
"answer": {{}},
|
119 |
-
"original sentences": []
|
120 |
-
}},
|
121 |
-
"Question 2": {{
|
122 |
-
"question": {{}},
|
123 |
-
"answer": {{}},
|
124 |
-
"original sentences": []
|
125 |
-
}},
|
126 |
-
...
|
127 |
-
}}
|
128 |
-
'''
|
129 |
-
|
130 |
-
TEMPLATE_v1 = '''We now have a following <document> in the medical field:
|
131 |
-
|
132 |
-
"""
|
133 |
-
{}
|
134 |
-
"""
|
135 |
-
|
136 |
-
We have some <question>s begin with "Question" here:
|
137 |
-
"""
|
138 |
-
{}
|
139 |
-
"""
|
140 |
-
|
141 |
-
Please finish the following task:
|
142 |
-
|
143 |
-
1. Please select the <original sentences> related the each <question> from the <document>.
|
144 |
-
2. Please use the <original sentences> to answer the <question>.
|
145 |
-
3. Please provide <original sentences> coming from the <document>.
|
146 |
-
4. Output the <answer> in the following json format:
|
147 |
-
|
148 |
-
{{
|
149 |
-
"Question 1": {{
|
150 |
-
"question": {{}},
|
151 |
-
"answer": {{}},
|
152 |
-
"original sentences": []
|
153 |
-
}},
|
154 |
-
"Question 2": {{
|
155 |
-
"question": {{}},
|
156 |
-
"answer": {{}},
|
157 |
-
"original sentences": []
|
158 |
-
}},
|
159 |
-
...
|
160 |
-
}}
|
161 |
-
'''
|
162 |
-
|
163 |
-
QUESTIONS = [
|
164 |
-
"What is the DOI of this study?",
|
165 |
-
"What is the Citation ID of this study?",
|
166 |
-
"What is the First author of this study?",
|
167 |
-
"What is the year of this study?",
|
168 |
-
"What is the animal type of this study?",
|
169 |
-
"What is the exposure age of this study?",
|
170 |
-
"Is there any behavior test done in this study?",
|
171 |
-
"What's the Intervention 1's name of this study?(anesthetics only)",
|
172 |
-
"What's the Intervention 2's name of this study?(anesthetics only)",
|
173 |
-
"What's the genetic chain of this study?",
|
174 |
-
"Input question",
|
175 |
-
]
|
176 |
-
QUESTIONS = [ f'Question {id_ +1 }: {q}' for id_, q in enumerate(QUESTIONS) if 'Input question' not in q]
|
177 |
QUESTIONS = '\n'.join(QUESTIONS)
|
|
|
1 |
+
TEMPLATE_v3 = '''We now have a following <document> in the medical field:
|
2 |
+
|
3 |
+
"""
|
4 |
+
{}
|
5 |
+
"""
|
6 |
+
You are an expert in biomedical research.
|
7 |
+
You are asked to answer the following <question>s based on the <document>, the <question>s and their <instruction>s and <rule>s are as follows:
|
8 |
+
|
9 |
+
- "Question 1":
|
10 |
+
- "question": "What is the <animal type> of this study?"
|
11 |
+
- "instruction": "This task is to find the <animal type> according to the <document>."
|
12 |
+
- "definition":
|
13 |
+
- "animal type": "The rodent type used in the article"
|
14 |
+
- "rule": "<answer> of <animal type> should be one of the two choices {{mice/rats}} or both"
|
15 |
+
|
16 |
+
- "Question 2":
|
17 |
+
- "question": "What is the <exposure age> of this study?"
|
18 |
+
- "instruction": "This task is to find the <exposure age> according to the <document>."
|
19 |
+
- "definition":
|
20 |
+
- "exposure age": "The age when the animals were exposed to anesthetics. There are two kinds of <exposure age>: <postnatal day> and <gestational day>"
|
21 |
+
- "postnatal day": "<postnatal day> means the days after the animals were born. For example, 'postnatal day <int>' means the animals were born for <int> day. 'postnatal day <int>' is sometimes shortened to 'PND <int>' or 'pnd <int>', which still means 'postnatal day <int>', after birth. 'postnatal day <int>' is sometimes shortened to 'p<int>', which still means 'postnatal day <int>', after birth"
|
22 |
+
- "gestational day": "<gestational day> means the days after the animals were pregnant. For example, 'gestational day <int>' means the animals were pregnant for <int> day. 'gestational day <int>' is sometimes abbreviated as 'E <int>', 'E' meaning before birth"
|
23 |
+
- "rule": "<answer> of <exposure age> should be expressed as one of {{'postnatal day <int>'/'gestational day <int>'}}. For Example: "postnatal day 7", "Gestational day 21"
|
24 |
+
|
25 |
+
- "Question 3":
|
26 |
+
- "question": "Is there any <behavior test> done in this study?"
|
27 |
+
- "instruction": "This task is to find whether there are any <behavior test> in the study according to the <document>."
|
28 |
+
- "definition":
|
29 |
+
- "behavior test": "If there are any of the <behavior tests> described and done in the article, which mentioned as: 'Open field test', 'Morris water task', 'fear conditioning test', 'Dark/light avoidance'; 'passive/active avoidance test'; 'elevated maze', 'Forced swim test', 'Object recognition test', 'Social interaction/preference'."
|
30 |
+
- "rule": "<answer> to <behavior test> should be one of the two choices {{Yes/No}}."
|
31 |
+
|
32 |
+
- "Question 4":
|
33 |
+
- "question": "What's the <intervention>s of this study?
|
34 |
+
- "instruction": "This task is to find the <intervention>s according to the <document>."
|
35 |
+
- "definition":
|
36 |
+
- "intervention": "The <intervention>s are anesthetic drugs, which in one of {{"isoflurane"/"sevoflurane"/"desflurane"/"ketamine"/"propofol"/"Midazolam"/"Nitrous oxide“}}."
|
37 |
+
- "rule": "There are one or two <intervention>s in the <document>. Please find them all and answer the <question>. If there is only one <intervention>, the second one is 'None'."
|
38 |
+
|
39 |
+
- "Question 5":
|
40 |
+
- "question": "What's the <genetic chain> of this study?"
|
41 |
+
- "instruction": "This task is to find the <genetic chain> according to the <document>."
|
42 |
+
- "definition":
|
43 |
+
- "genetic chain": "The <genetic chain> is the genetic type of the animals being used in the article."
|
44 |
+
- "rule": "Please do as the following examples: 'C57BL/6', 'C57BL/6J' should be extracted as 'C57BL/6'; 'Sprague Dawley', 'Sprague-Dawley', 'SD' should be extracted as 'Sprague Dawley'; 'CD-1' should be extracted as 'CD-1'; 'Wistar/ST' should be extracted as 'Wistar/ST'; 'Wistar' should be extracted as 'Wistar'; 'FMR-1 KO' should be extracted as 'FMR-1 KO'."
|
45 |
+
|
46 |
+
|
47 |
+
Here is the instrucrtions for all the <question>s:
|
48 |
+
|
49 |
+
1. Please select the <original sentences> related the "behavior tests" from the <document> for each <question>.
|
50 |
+
2. Please use the <original sentences> to answer the <question> by following the <rule> and <instruction> accroding to the <definition>.
|
51 |
+
3. Please provide <original sentences> coming from the <document>.
|
52 |
+
4. Output the <answer> in the following JSON format:
|
53 |
+
|
54 |
+
{{
|
55 |
+
"Question 1": {{
|
56 |
+
"answer": "{{mice/rats/both}}",
|
57 |
+
"original sentences": []
|
58 |
+
}},
|
59 |
+
"Question 2": {{
|
60 |
+
"answer": "{{postnatal day <int>/gestational day <int>}}",
|
61 |
+
"original sentences": []
|
62 |
+
}},
|
63 |
+
"Question 3": {{
|
64 |
+
"answer": "{{Yes/No}}",
|
65 |
+
"original sentences": []
|
66 |
+
}},
|
67 |
+
"Question 4":
|
68 |
+
{{
|
69 |
+
{{intervention 1}}: {{
|
70 |
+
"answer": "{{intervention 1}}",
|
71 |
+
"original sentences": []
|
72 |
+
}}
|
73 |
+
{{intervention 2}}: {{
|
74 |
+
"answer": "{{intervention 2}}",
|
75 |
+
"original sentences": []
|
76 |
+
}}
|
77 |
+
}},
|
78 |
+
"Question 5": {{
|
79 |
+
"answer": "{{genetic chain}}",
|
80 |
+
"original sentences": []
|
81 |
+
}}
|
82 |
+
}}
|
83 |
+
'''
|
84 |
+
|
85 |
+
TEMPLATE_v2 = '''We now have a following <document> in the medical field:
|
86 |
+
|
87 |
+
"""
|
88 |
+
{}
|
89 |
+
"""
|
90 |
+
We have some introduction here:
|
91 |
+
|
92 |
+
1. DOI: The DOI link for the article, usually can be found in the first line of the .txt file for the article. E.g., “DOI: 10.3892/mmr.2019.10397”.
|
93 |
+
2. Citation ID: The number in the file name. E.g., “1134”.
|
94 |
+
3. First author: The last name in the file name. E.g., “Guan”.
|
95 |
+
4. Year: The year in the file name. E.g., “2019”.
|
96 |
+
5. Animal type: The rodent type used in the article, should be one of the choices: mice, rats. E.g., “rats”.
|
97 |
+
6. Exposure age: The age when the animals were exposed to anesthetics, should be mentioned as "PND1", "PND7","postnatal day 7", "Gestational day 21", etc, which should be extract as: 'PND XX' , 'Gestational day xx'. E.g., “PND7”.
|
98 |
+
7. Behavior test: Whether there is any behavior test in the article, should be one of the choices: "Y", "N". "Y" is chosen if there are any of the behavior tests described and done in the article, which mentioned as: "Open field test", "Morris water task", "fear conditioning test", "Dark/light avoidance"; "passive/active avoidance test"; "elevated maze", "Forced swim test", "Object recognition test", "Social interaction/preference“. E.g., “N”.
|
99 |
+
8. Intervention 1 & Intervention 2: Intervention 1 and Intervention 2 are both anesthetic drugs, which listed as: "isoflurane", "sevoflurane", "desflurane", "ketamine", "propofol", "Midazolam", "Nitrous oxide“. If none, put “NA”. E.g., “propofol”.
|
100 |
+
9. Genetic chain: Genetic chain is the genetic type of the animals being used in the article, here is the examples:
|
101 |
+
"C57BL/6", "C57BL/6J" should be extracted as "C57BL/6"; "Sprague Dawley", "Sprague-Dawley", "SD" should be extracted as "Sprague Dawley"; "CD-1" should be extracted as "CD-1"; "Wistar/ST" should be extracted as "Wistar/ST"; "Wistar" should be extracted as "Wistar"; "FMR-1 KO" should be extracted as "FMR-1 KO“. E.g., “Sprague Dawley”.
|
102 |
+
|
103 |
+
We have some <question>s begin with "Question" here:
|
104 |
+
"""
|
105 |
+
{}
|
106 |
+
"""
|
107 |
+
|
108 |
+
Please finish the following task:
|
109 |
+
|
110 |
+
1. Please select the <original sentences> related the each <question> from the <document>.
|
111 |
+
2. Please use the <original sentences> to answer the <question>.
|
112 |
+
3. Please provide <original sentences> coming from the <document>.
|
113 |
+
4. Output the <answer> in the following json format:
|
114 |
+
|
115 |
+
{{
|
116 |
+
"Question 1": {{
|
117 |
+
"question": {{}},
|
118 |
+
"answer": {{}},
|
119 |
+
"original sentences": []
|
120 |
+
}},
|
121 |
+
"Question 2": {{
|
122 |
+
"question": {{}},
|
123 |
+
"answer": {{}},
|
124 |
+
"original sentences": []
|
125 |
+
}},
|
126 |
+
...
|
127 |
+
}}
|
128 |
+
'''
|
129 |
+
|
130 |
+
TEMPLATE_v1 = '''We now have a following <document> in the medical field:
|
131 |
+
|
132 |
+
"""
|
133 |
+
{}
|
134 |
+
"""
|
135 |
+
|
136 |
+
We have some <question>s begin with "Question" here:
|
137 |
+
"""
|
138 |
+
{}
|
139 |
+
"""
|
140 |
+
|
141 |
+
Please finish the following task:
|
142 |
+
|
143 |
+
1. Please select the <original sentences> related the each <question> from the <document>.
|
144 |
+
2. Please use the <original sentences> to answer the <question>.
|
145 |
+
3. Please provide <original sentences> coming from the <document>.
|
146 |
+
4. Output the <answer> in the following json format:
|
147 |
+
|
148 |
+
{{
|
149 |
+
"Question 1": {{
|
150 |
+
"question": {{}},
|
151 |
+
"answer": {{}},
|
152 |
+
"original sentences": []
|
153 |
+
}},
|
154 |
+
"Question 2": {{
|
155 |
+
"question": {{}},
|
156 |
+
"answer": {{}},
|
157 |
+
"original sentences": []
|
158 |
+
}},
|
159 |
+
...
|
160 |
+
}}
|
161 |
+
'''
|
162 |
+
|
163 |
+
QUESTIONS = [
|
164 |
+
"What is the DOI of this study?",
|
165 |
+
"What is the Citation ID of this study?",
|
166 |
+
"What is the First author of this study?",
|
167 |
+
"What is the year of this study?",
|
168 |
+
"What is the animal type of this study?",
|
169 |
+
"What is the exposure age of this study?",
|
170 |
+
"Is there any behavior test done in this study?",
|
171 |
+
"What's the Intervention 1's name of this study?(anesthetics only)",
|
172 |
+
"What's the Intervention 2's name of this study?(anesthetics only)",
|
173 |
+
"What's the genetic chain of this study?",
|
174 |
+
"Input question",
|
175 |
+
]
|
176 |
+
QUESTIONS = [ f'Question {id_ +1 }: {q}' for id_, q in enumerate(QUESTIONS) if 'Input question' not in q]
|
177 |
QUESTIONS = '\n'.join(QUESTIONS)
|