anushm committed on
Commit
ba60965
1 Parent(s): 93f00c1

Upload herbal_expert.py

Browse files
Files changed (1) hide show
  1. herbal_expert.py +74 -74
herbal_expert.py CHANGED
@@ -1,6 +1,7 @@
1
  import requests
2
  import json
3
  import random
 
4
 
5
  from langchain.agents import AgentExecutor, LLMSingleActionAgent, AgentOutputParser
6
  from langchain.prompts import StringPromptTemplate
@@ -16,8 +17,8 @@ import nltk
16
  from langchain.callbacks.manager import CallbackManagerForLLMRun
17
  from typing import List, Union, Optional, Any
18
 
19
- ngrok_url = 'https://9c1a-2605-7b80-3d-320-fc74-5877-9733-e99b.ngrok-free.app/'
20
- #ngrok_url = 'http://localhost:1234/'
21
  Entrez.email = "[email protected]"
22
  nltk.download('wordnet')
23
 
@@ -32,9 +33,10 @@ def get_num_citations(pmid: str):
32
  :return: The number of citations for a given pmid
33
  """
34
  citations_xml = Entrez.read(
35
- Entrez.elink(dbfrom="pubmed", db="pmc", LinkName="pubmed_pubmed_citedin", from_uid=pmid))
 
36
 
37
- for i in range(0, len(citations_xml)):
38
  if len(citations_xml[i]["LinkSetDb"]) > 0:
39
  pmids_list = [link["Id"] for link in citations_xml[i]["LinkSetDb"][0]["Link"]]
40
  return len(pmids_list)
@@ -42,7 +44,7 @@ def get_num_citations(pmid: str):
42
  return 0
43
 
44
 
45
- def fetch_pubmed_articles(keywords, max_search=10, max_context=3):
46
  """
47
  The fetch_pubmed_articles function takes in a list of keywords and returns the top 3 articles from PubMed that
48
  are most relevant to those keywords. First the search is done on max_search articles, the list is then sorted by
@@ -56,37 +58,41 @@ def fetch_pubmed_articles(keywords, max_search=10, max_context=3):
56
  """
57
 
58
  try:
59
- search_result = Entrez.esearch(db="pubmed", term=keywords, retmax=max_search)
60
- id_list = Entrez.read(search_result)["IdList"]
 
 
 
 
 
 
 
61
 
62
- if len(id_list) == 0:
63
- search_result = Entrez.esearch(db="pubmed", term=keywords[:4], retmax=max_search)
64
- id_list = Entrez.read(search_result)["IdList"]
65
 
66
- num_citations = [(id, get_num_citations(id)) for id in id_list]
67
- top_n_papers = sorted(num_citations, key=lambda x: x[1], reverse=True)[:max_context]
68
- print(f"top_{max_context}_papers: ", top_n_papers)
69
 
70
- top_n_papers = [paper[0] for paper in top_n_papers]
71
- fetch_handle = Entrez.efetch(db="pubmed", id=top_n_papers, rettype="medline", retmode="xml")
72
- fetched_articles = Entrez.read(fetch_handle)
73
 
74
- articles = []
75
- # somehow only pull natural therapeutic articles
76
- for fetched in fetched_articles['PubmedArticle']:
77
- title = fetched['MedlineCitation']['Article']['ArticleTitle']
78
- abstract = fetched['MedlineCitation']['Article']['Abstract']['AbstractText'][0] if 'Abstract' in fetched[
79
- 'MedlineCitation']['Article'] else "No Abstract"
80
- # pmid = fetched['MedlineCitation']['PMID']
81
- articles.append(title + "\n" + abstract)
82
 
83
- return articles
84
- except HTTPError as e:
85
- print("HTTPError: ", e)
86
- return []
87
- except RuntimeError as e:
88
- print("RuntimeError: ", e)
89
- return []
 
 
 
 
 
90
 
91
 
92
  def call_model_with_history(messages: list):
@@ -98,31 +104,37 @@ def call_model_with_history(messages: list):
98
  """
99
  data = {
100
  "messages": messages,
101
- "stop": ["### Instruction:"], "temperature": 0, "max_tokens": 512, "stream": False
102
  }
103
 
104
- response = requests.post(ngrok_url + "v1/chat/completions", headers={"Content-Type": "application/json"}, json=data)
 
 
 
 
105
  return json.loads(response.text)['choices'][0]['message']['content']
106
 
107
 
108
- # TODO: add ability to pass message history to model
109
- def format_prompt_and_query(prompt, **kwargs):
110
  """
111
  The format_prompt_and_query function takes a prompt and keyword arguments, formats the prompt with the keyword
112
  arguments, and then calls call_model_with_history with a list of messages containing the formatted prompt.
113
 
 
114
  :param prompt: Format the prompt with the values in kwargs
115
  :param **kwargs: Pass a dictionary of key-value pairs to the prompt formatting function
116
  :return: A list of dictionaries
117
  """
118
 
119
  formatted_prompt = prompt.format(**kwargs)
120
-
121
- messages = [
122
- {"role": "system", "content": "Perform the instructions to the best of your ability."},
123
- {"role": "user", "content": formatted_prompt}
124
- ]
125
-
 
 
126
  return call_model_with_history(messages)
127
 
128
 
@@ -144,7 +156,7 @@ class HerbalExpert:
144
  # qd = Question Decompose, og = Original, qa = Question Asking, ri = Response Improvement
145
  self.prompts = {
146
  "qd_prompt": """### Instruction: Identify and list the keywords that capture the essence of the question. List them as a string separated by commas. Focus on the question. Order the keyword by importance. The first keyword should be the most important keyword in the question and the last keyword should be the least important keyword.
147
- Question: {input}
148
 
149
  YOUR RESPONSE SHOULD BE A STRING OF COMMA SEPARATED KEYWORDS:
150
  ### Response: Keywords: """,
@@ -170,11 +182,10 @@ class HerbalExpert:
170
  ### Response: Improved Answer: """
171
  }
172
 
173
- def process_query_words(self, question_words: str, answer_words: str):
174
  # don't need to be searching for these in pubmed. Should we include: 'supplements', 'supplement'
175
- vague_words = ['recommendation', 'recommendations', 'products', 'product']
176
- words = question_words.lower().split(",")[:4] + answer_words.lower().split(
177
- ",") # limit question words to 4 (since the number is unbounded)
178
 
179
  final_list = []
180
  for word in words:
@@ -185,29 +196,24 @@ class HerbalExpert:
185
  return list(set(final_list))
186
 
187
  def convert_question_into_words(self, question: str):
188
- original_answer = format_prompt_and_query(self.prompts["og_answer_prompt"], question=question)
189
- print("Original Answer: ", original_answer)
190
 
191
- question_decompose = format_prompt_and_query(self.prompts["qd_prompt"], input=question)
192
- print("Question Decompose: ", question_decompose)
193
 
194
- original_answer_decompose = format_prompt_and_query(self.prompts["ans_decompose_prompt"],
195
- original_answer=original_answer)
196
- print("Original Answer Decomposed: ", original_answer_decompose)
197
-
198
- words = self.process_query_words(question_decompose, original_answer_decompose)
199
  return words, original_answer
200
 
201
  def query_expert(self, question: str = None):
202
  question = random.choice(self.default_questions) if question is None else question
203
- print("Question: ", question)
204
 
205
  keywords, original_response = self.convert_question_into_words(question)
206
- print("Keywords: ", keywords)
207
-
208
- context = fetch_pubmed_articles(" AND ".join(keywords), max_search=5)
209
- print("Context: ", context)
210
 
 
 
211
  if len(context) == 0:
212
  return {
213
  "question": question,
@@ -215,20 +221,14 @@ class HerbalExpert:
215
  "info": "No context found"
216
  }
217
 
218
- contextual_response = format_prompt_and_query(self.prompts["qa_prompt"], question=question, context=context)
219
- # if "I don't know" in contextual_response:
220
- # return {
221
- # "question": question,
222
- # "response": original_response,
223
- # "info": "Irrelevant context found"
224
- # }
225
-
226
- print()
227
- print("Contextual Response: ", contextual_response)
228
- improved_response = format_prompt_and_query(self.prompts["ri_prompt"], question=question,
229
  answer=original_response, answer2=contextual_response)
230
- print()
231
- print("Improved Response: ", improved_response)
232
  return {
233
  "question": question,
234
  "response": improved_response,
@@ -241,5 +241,5 @@ herbal_expert = HerbalExpert()
241
  if __name__ == '__main__':
242
  herbal_expert = HerbalExpert()
243
  answer = herbal_expert.query_expert()
244
- print(answer['response'])
245
  # # return to api? who knows
 
1
  import requests
2
  import json
3
  import random
4
+ import logging
5
 
6
  from langchain.agents import AgentExecutor, LLMSingleActionAgent, AgentOutputParser
7
  from langchain.prompts import StringPromptTemplate
 
17
  from langchain.callbacks.manager import CallbackManagerForLLMRun
18
  from typing import List, Union, Optional, Any
19
 
20
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
21
+ ngrok_url = "https://6d75-2605-7b80-3d-320-cc20-aa68-fd8-3c5e.ngrok-free.app/"
22
  Entrez.email = "[email protected]"
23
  nltk.download('wordnet')
24
 
 
33
  :return: The number of citations for a given pmid
34
  """
35
  citations_xml = Entrez.read(
36
+ Entrez.elink(dbfrom="pubmed", db="pmc", LinkName="pubmed_pubmed_citedin", from_uid=pmid)
37
+ )
38
 
39
+ for i in range(len(citations_xml)):
40
  if len(citations_xml[i]["LinkSetDb"]) > 0:
41
  pmids_list = [link["Id"] for link in citations_xml[i]["LinkSetDb"][0]["Link"]]
42
  return len(pmids_list)
 
44
  return 0
45
 
46
 
47
+ def fetch_pubmed_context(keywords, max_search=10, max_context=3):
48
  """
49
  The fetch_pubmed_articles function takes in a list of keywords and returns the top 3 articles from PubMed that
50
  are most relevant to those keywords. First the search is done on max_search articles, the list is then sorted by
 
58
  """
59
 
60
  try:
61
+ return query_pubmed(
62
+ keywords, max_search, max_context
63
+ )
64
+ except HTTPError as e:
65
+ logging.error(f"HTTPError: {e}")
66
+ return []
67
+ except RuntimeError as e:
68
+ logging.error(f"RuntimeError: {e}")
69
+ return []
70
 
 
 
 
71
 
72
+ def query_pubmed(keywords, max_search, max_context):
73
+ search_result = Entrez.esearch(db="pubmed", term=keywords, retmax=max_search)
74
+ id_list = Entrez.read(search_result)["IdList"]
75
 
76
+ if len(id_list) == 0:
77
+ search_result = Entrez.esearch(db="pubmed", term=keywords[:4], retmax=max_search)
78
+ id_list = Entrez.read(search_result)["IdList"]
79
 
80
+ num_citations = [(id, get_num_citations(id)) for id in id_list]
81
+ top_n_papers = sorted(num_citations, key=lambda x: x[1], reverse=True)[:max_context]
82
+ logging.info(f"top_{max_context}_papers: {top_n_papers}")
 
 
 
 
 
83
 
84
+ top_n_papers = [paper[0] for paper in top_n_papers]
85
+ fetch_handle = Entrez.efetch(db="pubmed", id=top_n_papers, rettype="medline", retmode="xml")
86
+ fetched_articles = Entrez.read(fetch_handle)
87
+
88
+ articles = []
89
+ # somehow only pull natural therapeutic articles
90
+ for fetched in fetched_articles['PubmedArticle']:
91
+ title = fetched['MedlineCitation']['Article']['ArticleTitle']
92
+ abstract = fetched['MedlineCitation']['Article'].get('Abstract', {}).get('AbstractText', ["No Abstract"])[0]
93
+ articles.append(title + "\n" + abstract)
94
+
95
+ return articles
96
 
97
 
98
  def call_model_with_history(messages: list):
 
104
  """
105
  data = {
106
  "messages": messages,
107
+ "stop": ["### Instruction:"], "temperature": 0, "max_tokens": 512, "stream": False, "repeat_penalty": 1.2
108
  }
109
 
110
+ response = requests.post(
111
+ f"{ngrok_url}v1/chat/completions",
112
+ headers={"Content-Type": "application/json"},
113
+ json=data,
114
+ )
115
  return json.loads(response.text)['choices'][0]['message']['content']
116
 
117
 
118
+ def format_prompt_and_query(prompt: str, system_role: bool, **kwargs):
 
119
  """
120
  The format_prompt_and_query function takes a prompt and keyword arguments, formats the prompt with the keyword
121
  arguments, and then calls call_model_with_history with a list of messages containing the formatted prompt.
122
 
123
+ :param system_role:
124
  :param prompt: Format the prompt with the values in kwargs
125
  :param **kwargs: Pass a dictionary of key-value pairs to the prompt formatting function
126
  :return: A list of dictionaries
127
  """
128
 
129
  formatted_prompt = prompt.format(**kwargs)
130
+ messages = []
131
+ if system_role:
132
+ messages.append({"role": "system", "content": "Perform the instructions to the best of your ability."})
133
+ else:
134
+ messages.append({"role": "system",
135
+ "content": "Develop an AI-based system to recommend optimal herbal products for specific health needs. Analyze the chemical composition, structural parameters, and pharmacology of natural medicinal substances found in plants, fungi, and roots. Cross-reference all information with toxicology data and pharmaceutical drugs to mitigate any potential risks, ensuring that the recommendations are safe, effective, and free from toxic chemicals."})
136
+
137
+ messages.append({"role": "user", "content": formatted_prompt})
138
  return call_model_with_history(messages)
139
 
140
 
 
156
  # qd = Question Decompose, og = Original, qa = Question Asking, ri = Response Improvement
157
  self.prompts = {
158
  "qd_prompt": """### Instruction: Identify and list the keywords that capture the essence of the question. List them as a string separated by commas. Focus on the question. Order the keyword by importance. The first keyword should be the most important keyword in the question and the last keyword should be the least important keyword.
159
+ Question: {question}
160
 
161
  YOUR RESPONSE SHOULD BE A STRING OF COMMA SEPARATED KEYWORDS:
162
  ### Response: Keywords: """,
 
182
  ### Response: Improved Answer: """
183
  }
184
 
185
+ def process_query_words(self, question_words: str):
186
  # don't need to be searching for these in pubmed. Should we include: 'supplements', 'supplement'
187
+ vague_words = ['recommendation', 'recommendations', 'products', 'product', 'scholarly articles', 'academic database']
188
+ words = question_words.lower().split(",")[:4]
 
189
 
190
  final_list = []
191
  for word in words:
 
196
  return list(set(final_list))
197
 
198
  def convert_question_into_words(self, question: str):
199
+ original_answer = format_prompt_and_query(self.prompts["og_answer_prompt"], system_role=False, question=question)
200
+ logging.info(f"Original Answer: {original_answer}")
201
 
202
+ question_decompose = format_prompt_and_query(self.prompts["qd_prompt"], system_role=True, question=question)
203
+ logging.info(f"Question Decompose: {question_decompose}")
204
 
205
+ words = self.process_query_words(question_decompose)
 
 
 
 
206
  return words, original_answer
207
 
208
  def query_expert(self, question: str = None):
209
  question = random.choice(self.default_questions) if question is None else question
210
+ logging.info(f"Question: {question}")
211
 
212
  keywords, original_response = self.convert_question_into_words(question)
213
+ logging.info(f"Keywords: {keywords}")
 
 
 
214
 
215
+ context = fetch_pubmed_context(" AND ".join(keywords), max_search=5)
216
+ logging.info(f"Context: {context}")
217
  if len(context) == 0:
218
  return {
219
  "question": question,
 
221
  "info": "No context found"
222
  }
223
 
224
+ contextual_response = format_prompt_and_query(self.prompts["qa_prompt"], system_role=False, question=question,
225
+ context=context)
226
+ logging.info(f"Contextual Response: {contextual_response}")
227
+
228
+ improved_response = format_prompt_and_query(self.prompts["ri_prompt"], system_role=False, question=question,
 
 
 
 
 
 
229
  answer=original_response, answer2=contextual_response)
230
+ logging.info(f"Improved Response: {improved_response}")
231
+
232
  return {
233
  "question": question,
234
  "response": improved_response,
 
241
  if __name__ == '__main__':
242
  herbal_expert = HerbalExpert()
243
  answer = herbal_expert.query_expert()
244
+ # logging.info(answer['response'])
245
  # # return to api? who knows