Spaces:

AnunaAI
/

herbal-expert

Sleeping

App Files Community

anushm committed on Oct 24, 2023

Commit

ba60965

•

1 Parent(s): 93f00c1

Upload herbal_expert.py

Browse files

Files changed (1) hide show

herbal_expert.py +74 -74

herbal_expert.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import requests
 import json
 import random
 from langchain.agents import AgentExecutor, LLMSingleActionAgent, AgentOutputParser
 from langchain.prompts import StringPromptTemplate
@@ -16,8 +17,8 @@ import nltk
 from langchain.callbacks.manager import CallbackManagerForLLMRun
 from typing import List, Union, Optional, Any
-ngrok_url = 'https://9c1a-2605-7b80-3d-320-fc74-5877-9733-e99b.ngrok-free.app/'
-#ngrok_url = 'http://localhost:1234/'
 Entrez.email = "[email protected]"
 nltk.download('wordnet')
@@ -32,9 +33,10 @@ def get_num_citations(pmid: str):
     :return: The number of citations for a given pmid
     """
     citations_xml = Entrez.read(
-        Entrez.elink(dbfrom="pubmed", db="pmc", LinkName="pubmed_pubmed_citedin", from_uid=pmid))
-    for i in range(0, len(citations_xml)):
         if len(citations_xml[i]["LinkSetDb"]) > 0:
             pmids_list = [link["Id"] for link in citations_xml[i]["LinkSetDb"][0]["Link"]]
             return len(pmids_list)
@@ -42,7 +44,7 @@ def get_num_citations(pmid: str):
             return 0
-def fetch_pubmed_articles(keywords, max_search=10, max_context=3):
     """
     The fetch_pubmed_articles function takes in a list of keywords and returns the top 3 articles from PubMed that
     are most relevant to those keywords. First the search is done on max_search articles, the list is then sorted by
@@ -56,37 +58,41 @@ def fetch_pubmed_articles(keywords, max_search=10, max_context=3):
     """
     try:
-        search_result = Entrez.esearch(db="pubmed", term=keywords, retmax=max_search)
-        id_list = Entrez.read(search_result)["IdList"]
-        if len(id_list) == 0:
-            search_result = Entrez.esearch(db="pubmed", term=keywords[:4], retmax=max_search)
-            id_list = Entrez.read(search_result)["IdList"]
-        num_citations = [(id, get_num_citations(id)) for id in id_list]
-        top_n_papers = sorted(num_citations, key=lambda x: x[1], reverse=True)[:max_context]
-        print(f"top_{max_context}_papers: ", top_n_papers)
-        top_n_papers = [paper[0] for paper in top_n_papers]
-        fetch_handle = Entrez.efetch(db="pubmed", id=top_n_papers, rettype="medline", retmode="xml")
-        fetched_articles = Entrez.read(fetch_handle)
-        articles = []
-        # somehow only pull natural therapeutic articles
-        for fetched in fetched_articles['PubmedArticle']:
-            title = fetched['MedlineCitation']['Article']['ArticleTitle']
-            abstract = fetched['MedlineCitation']['Article']['Abstract']['AbstractText'][0] if 'Abstract' in fetched[
-                'MedlineCitation']['Article'] else "No Abstract"
-            # pmid = fetched['MedlineCitation']['PMID']
-            articles.append(title + "\n" + abstract)
-        return articles
-    except HTTPError as e:
-        print("HTTPError: ", e)
-        return []
-    except RuntimeError as e:
-        print("RuntimeError: ", e)
-        return []
 def call_model_with_history(messages: list):
@@ -98,31 +104,37 @@ def call_model_with_history(messages: list):
     """
     data = {
         "messages": messages,
-        "stop": ["### Instruction:"], "temperature": 0, "max_tokens": 512, "stream": False
     }
-    response = requests.post(ngrok_url + "v1/chat/completions", headers={"Content-Type": "application/json"}, json=data)
     return json.loads(response.text)['choices'][0]['message']['content']
-# TODO: add ability to pass message history to model
-def format_prompt_and_query(prompt, **kwargs):
     """
     The format_prompt_and_query function takes a prompt and keyword arguments, formats the prompt with the keyword
     arguments, and then calls call_model_with_history with a list of messages containing the formatted prompt.
     :param prompt: Format the prompt with the values in kwargs
     :param **kwargs: Pass a dictionary of key-value pairs to the prompt formatting function
     :return: A list of dictionaries
     """
     formatted_prompt = prompt.format(**kwargs)
-    messages = [
-        {"role": "system", "content": "Perform the instructions to the best of your ability."},
-        {"role": "user", "content": formatted_prompt}
-    ]
     return call_model_with_history(messages)
@@ -144,7 +156,7 @@ class HerbalExpert:
         # qd = Question Decompose, og = Original, qa = Question Asking, ri = Response Improvement
         self.prompts = {
             "qd_prompt": """### Instruction: Identify and list the keywords that capture the essence of the question. List them as a string separated by commas. Focus on the question. Order the keyword by importance. The first keyword should be the most important keyword in the question and the last keyword should be the least important keyword.
-            Question: {input}
             YOUR RESPONSE SHOULD BE A STRING OF COMMA SEPARATED KEYWORDS:
             ### Response: Keywords: """,
@@ -170,11 +182,10 @@ class HerbalExpert:
             ### Response: Improved Answer: """
         }
-    def process_query_words(self, question_words: str, answer_words: str):
         # don't need to be searching for these in pubmed. Should we include: 'supplements', 'supplement'
-        vague_words = ['recommendation', 'recommendations', 'products', 'product']
-        words = question_words.lower().split(",")[:4] + answer_words.lower().split(
-            ",")  # limit question words to 4 (since the number is unbounded)
         final_list = []
         for word in words:
@@ -185,29 +196,24 @@ class HerbalExpert:
         return list(set(final_list))
     def convert_question_into_words(self, question: str):
-        original_answer = format_prompt_and_query(self.prompts["og_answer_prompt"], question=question)
-        print("Original Answer: ", original_answer)
-        question_decompose = format_prompt_and_query(self.prompts["qd_prompt"], input=question)
-        print("Question Decompose: ", question_decompose)
-        original_answer_decompose = format_prompt_and_query(self.prompts["ans_decompose_prompt"],
-                                                            original_answer=original_answer)
-        print("Original Answer Decomposed: ", original_answer_decompose)
-        words = self.process_query_words(question_decompose, original_answer_decompose)
         return words, original_answer
     def query_expert(self, question: str = None):
         question = random.choice(self.default_questions) if question is None else question
-        print("Question: ", question)
         keywords, original_response = self.convert_question_into_words(question)
-        print("Keywords: ", keywords)
-        context = fetch_pubmed_articles(" AND ".join(keywords), max_search=5)
-        print("Context: ", context)
         if len(context) == 0:
             return {
                 "question": question,
@@ -215,20 +221,14 @@ class HerbalExpert:
                 "info": "No context found"
             }
-        contextual_response = format_prompt_and_query(self.prompts["qa_prompt"], question=question, context=context)
-        # if "I don't know" in contextual_response:
-        #     return {
-        #         "question": question,
-        #         "response": original_response,
-        #         "info": "Irrelevant context found"
-        #     }
-        print()
-        print("Contextual Response: ", contextual_response)
-        improved_response = format_prompt_and_query(self.prompts["ri_prompt"], question=question,
                                                     answer=original_response, answer2=contextual_response)
-        print()
-        print("Improved Response: ", improved_response)
         return {
             "question": question,
             "response": improved_response,
@@ -241,5 +241,5 @@ herbal_expert = HerbalExpert()
 if __name__ == '__main__':
     herbal_expert = HerbalExpert()
     answer = herbal_expert.query_expert()
-    print(answer['response'])
     # # return to api? who knows

 import requests
 import json
 import random
+import logging
 from langchain.agents import AgentExecutor, LLMSingleActionAgent, AgentOutputParser
 from langchain.prompts import StringPromptTemplate
 from langchain.callbacks.manager import CallbackManagerForLLMRun
 from typing import List, Union, Optional, Any
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ngrok_url = "https://6d75-2605-7b80-3d-320-cc20-aa68-fd8-3c5e.ngrok-free.app/"
 Entrez.email = "[email protected]"
 nltk.download('wordnet')
     :return: The number of citations for a given pmid
     """
     citations_xml = Entrez.read(
+        Entrez.elink(dbfrom="pubmed", db="pmc", LinkName="pubmed_pubmed_citedin", from_uid=pmid)
+    )
+    for i in range(len(citations_xml)):
         if len(citations_xml[i]["LinkSetDb"]) > 0:
             pmids_list = [link["Id"] for link in citations_xml[i]["LinkSetDb"][0]["Link"]]
             return len(pmids_list)
             return 0
+def fetch_pubmed_context(keywords, max_search=10, max_context=3):
     """
     The fetch_pubmed_context function takes in a list of keywords and returns the top 3 articles from PubMed that
     are most relevant to those keywords. First the search is done on max_search articles, the list is then sorted by
     """
     try:
+        return query_pubmed(
+            keywords, max_search, max_context
+        )
+    except HTTPError as e:
+        logging.error(f"HTTPError: {e}")
+        return []
+    except RuntimeError as e:
+        logging.error(f"RuntimeError: {e}")
+        return []
+def query_pubmed(keywords, max_search, max_context):
+    search_result = Entrez.esearch(db="pubmed", term=keywords, retmax=max_search)
+    id_list = Entrez.read(search_result)["IdList"]
+    if len(id_list) == 0:
+        search_result = Entrez.esearch(db="pubmed", term=keywords[:4], retmax=max_search)
+        id_list = Entrez.read(search_result)["IdList"]
+    num_citations = [(id, get_num_citations(id)) for id in id_list]
+    top_n_papers = sorted(num_citations, key=lambda x: x[1], reverse=True)[:max_context]
+    logging.info(f"top_{max_context}_papers: {top_n_papers}")
+    top_n_papers = [paper[0] for paper in top_n_papers]
+    fetch_handle = Entrez.efetch(db="pubmed", id=top_n_papers, rettype="medline", retmode="xml")
+    fetched_articles = Entrez.read(fetch_handle)
+    articles = []
+    # somehow only pull natural therapeutic articles
+    for fetched in fetched_articles['PubmedArticle']:
+        title = fetched['MedlineCitation']['Article']['ArticleTitle']
+        abstract = fetched['MedlineCitation']['Article'].get('Abstract', {}).get('AbstractText', ["No Abstract"])[0]
+        articles.append(title + "\n" + abstract)
+    return articles
 def call_model_with_history(messages: list):
     """
     data = {
         "messages": messages,
+        "stop": ["### Instruction:"], "temperature": 0, "max_tokens": 512, "stream": False, "repeat_penalty": 1.2
     }
+    response = requests.post(
+        f"{ngrok_url}v1/chat/completions",
+        headers={"Content-Type": "application/json"},
+        json=data,
+    )
     return json.loads(response.text)['choices'][0]['message']['content']
+def format_prompt_and_query(prompt: str, system_role: bool, **kwargs):
     """
     The format_prompt_and_query function takes a prompt and keyword arguments, formats the prompt with the keyword
     arguments, and then calls call_model_with_history with a list of messages containing the formatted prompt.
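+    :param system_role: If True, send the generic "Perform the instructions to the best of your ability." system message; otherwise send the herbal-product recommendation system message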
     :param prompt: Format the prompt with the values in kwargs
     :param **kwargs: Pass a dictionary of key-value pairs to the prompt formatting function
     :return: The content of the model's reply message, as returned by call_model_with_history
     """
     formatted_prompt = prompt.format(**kwargs)
+    messages = []
+    if system_role:
+        messages.append({"role": "system", "content": "Perform the instructions to the best of your ability."})
+    else:
+        messages.append({"role": "system",
+                         "content": "Develop an AI-based system to recommend optimal herbal products for specific health needs. Analyze the chemical composition, structural parameters, and pharmacology of natural medicinal substances found in plants, fungi, and roots. Cross-reference all information with toxicology data and pharmaceutical drugs to mitigate any potential risks, ensuring that the recommendations are safe, effective, and free from toxic chemicals."})
+    messages.append({"role": "user", "content": formatted_prompt})
     return call_model_with_history(messages)
         # qd = Question Decompose, og = Original, qa = Question Asking, ri = Response Improvement
         self.prompts = {
             "qd_prompt": """### Instruction: Identify and list the keywords that capture the essence of the question. List them as a string separated by commas. Focus on the question. Order the keyword by importance. The first keyword should be the most important keyword in the question and the last keyword should be the least important keyword.
+            Question: {question}
             YOUR RESPONSE SHOULD BE A STRING OF COMMA SEPARATED KEYWORDS:
             ### Response: Keywords: """,
             ### Response: Improved Answer: """
         }
+    def process_query_words(self, question_words: str):
         # don't need to be searching for these in pubmed. Should we include: 'supplements', 'supplement'
+        vague_words = ['recommendation', 'recommendations', 'products', 'product', 'scholarly articles', 'academic database']
+        words = question_words.lower().split(",")[:4]
         final_list = []
         for word in words:
         return list(set(final_list))
     def convert_question_into_words(self, question: str):
+        original_answer = format_prompt_and_query(self.prompts["og_answer_prompt"], system_role=False, question=question)
+        logging.info(f"Original Answer: {original_answer}")
+        question_decompose = format_prompt_and_query(self.prompts["qd_prompt"], system_role=True, question=question)
+        logging.info(f"Question Decompose: {question_decompose}")
+        words = self.process_query_words(question_decompose)
         return words, original_answer
     def query_expert(self, question: str = None):
         question = random.choice(self.default_questions) if question is None else question
+        logging.info(f"Question: {question}")
         keywords, original_response = self.convert_question_into_words(question)
+        logging.info(f"Keywords: {keywords}")
+        context = fetch_pubmed_context(" AND ".join(keywords), max_search=5)
+        logging.info(f"Context: {context}")
         if len(context) == 0:
             return {
                 "question": question,
                 "info": "No context found"
             }
+        contextual_response = format_prompt_and_query(self.prompts["qa_prompt"], system_role=False, question=question,
+                                                      context=context)
+        logging.info(f"Contextual Response: {contextual_response}")
+        improved_response = format_prompt_and_query(self.prompts["ri_prompt"], system_role=False, question=question,
                                                     answer=original_response, answer2=contextual_response)
+        logging.info(f"Improved Response: {improved_response}")
         return {
             "question": question,
             "response": improved_response,
 if __name__ == '__main__':
     herbal_expert = HerbalExpert()
     answer = herbal_expert.query_expert()
+    # logging.info(answer['response'])
     # # return to api? who knows