Spaces:
Sleeping
Sleeping
Upload herbal_expert.py
Browse files- herbal_expert.py +74 -74
herbal_expert.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import requests
|
2 |
import json
|
3 |
import random
|
|
|
4 |
|
5 |
from langchain.agents import AgentExecutor, LLMSingleActionAgent, AgentOutputParser
|
6 |
from langchain.prompts import StringPromptTemplate
|
@@ -16,8 +17,8 @@ import nltk
|
|
16 |
from langchain.callbacks.manager import CallbackManagerForLLMRun
|
17 |
from typing import List, Union, Optional, Any
|
18 |
|
19 |
-
|
20 |
-
|
21 |
Entrez.email = "[email protected]"
|
22 |
nltk.download('wordnet')
|
23 |
|
@@ -32,9 +33,10 @@ def get_num_citations(pmid: str):
|
|
32 |
:return: The number of citations for a given pmid
|
33 |
"""
|
34 |
citations_xml = Entrez.read(
|
35 |
-
Entrez.elink(dbfrom="pubmed", db="pmc", LinkName="pubmed_pubmed_citedin", from_uid=pmid)
|
|
|
36 |
|
37 |
-
for i in range(
|
38 |
if len(citations_xml[i]["LinkSetDb"]) > 0:
|
39 |
pmids_list = [link["Id"] for link in citations_xml[i]["LinkSetDb"][0]["Link"]]
|
40 |
return len(pmids_list)
|
@@ -42,7 +44,7 @@ def get_num_citations(pmid: str):
|
|
42 |
return 0
|
43 |
|
44 |
|
45 |
-
def
|
46 |
"""
|
47 |
The fetch_pubmed_articles function takes in a list of keywords and returns the top 3 articles from PubMed that
|
48 |
are most relevant to those keywords. First the search is done on max_search articles, the list is then sorted by
|
@@ -56,37 +58,41 @@ def fetch_pubmed_articles(keywords, max_search=10, max_context=3):
|
|
56 |
"""
|
57 |
|
58 |
try:
|
59 |
-
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
-
if len(id_list) == 0:
|
63 |
-
search_result = Entrez.esearch(db="pubmed", term=keywords[:4], retmax=max_search)
|
64 |
-
id_list = Entrez.read(search_result)["IdList"]
|
65 |
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
title = fetched['MedlineCitation']['Article']['ArticleTitle']
|
78 |
-
abstract = fetched['MedlineCitation']['Article']['Abstract']['AbstractText'][0] if 'Abstract' in fetched[
|
79 |
-
'MedlineCitation']['Article'] else "No Abstract"
|
80 |
-
# pmid = fetched['MedlineCitation']['PMID']
|
81 |
-
articles.append(title + "\n" + abstract)
|
82 |
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
|
92 |
def call_model_with_history(messages: list):
|
@@ -98,31 +104,37 @@ def call_model_with_history(messages: list):
|
|
98 |
"""
|
99 |
data = {
|
100 |
"messages": messages,
|
101 |
-
"stop": ["### Instruction:"], "temperature": 0, "max_tokens": 512, "stream": False
|
102 |
}
|
103 |
|
104 |
-
response = requests.post(
|
|
|
|
|
|
|
|
|
105 |
return json.loads(response.text)['choices'][0]['message']['content']
|
106 |
|
107 |
|
108 |
-
|
109 |
-
def format_prompt_and_query(prompt, **kwargs):
|
110 |
"""
|
111 |
The format_prompt_and_query function takes a prompt and keyword arguments, formats the prompt with the keyword
|
112 |
arguments, and then calls call_model_with_history with a list of messages containing the formatted prompt.
|
113 |
|
|
|
114 |
:param prompt: Format the prompt with the values in kwargs
|
115 |
:param **kwargs: Pass a dictionary of key-value pairs to the prompt formatting function
|
116 |
:return: A list of dictionaries
|
117 |
"""
|
118 |
|
119 |
formatted_prompt = prompt.format(**kwargs)
|
120 |
-
|
121 |
-
|
122 |
-
{"role": "system", "content": "Perform the instructions to the best of your ability."}
|
123 |
-
|
124 |
-
|
125 |
-
|
|
|
|
|
126 |
return call_model_with_history(messages)
|
127 |
|
128 |
|
@@ -144,7 +156,7 @@ class HerbalExpert:
|
|
144 |
# qd = Question Decompose, og = Original, qa = Question Asking, ri = Response Improvement
|
145 |
self.prompts = {
|
146 |
"qd_prompt": """### Instruction: Identify and list the keywords that capture the essence of the question. List them as a string separated by commas. Focus on the question. Order the keyword by importance. The first keyword should be the most important keyword in the question and the last keyword should be the least important keyword.
|
147 |
-
Question: {
|
148 |
|
149 |
YOUR RESPONSE SHOULD BE A STRING OF COMMA SEPARATED KEYWORDS:
|
150 |
### Response: Keywords: """,
|
@@ -170,11 +182,10 @@ class HerbalExpert:
|
|
170 |
### Response: Improved Answer: """
|
171 |
}
|
172 |
|
173 |
-
def process_query_words(self, question_words: str
|
174 |
# don't need to be searching for these in pubmed. Should we include: 'supplements', 'supplement'
|
175 |
-
vague_words = ['recommendation', 'recommendations', 'products', 'product']
|
176 |
-
words = question_words.lower().split(",")[:4]
|
177 |
-
",") # limit question words to 4 (since the number is unbounded)
|
178 |
|
179 |
final_list = []
|
180 |
for word in words:
|
@@ -185,29 +196,24 @@ class HerbalExpert:
|
|
185 |
return list(set(final_list))
|
186 |
|
187 |
def convert_question_into_words(self, question: str):
|
188 |
-
original_answer = format_prompt_and_query(self.prompts["og_answer_prompt"], question=question)
|
189 |
-
|
190 |
|
191 |
-
question_decompose = format_prompt_and_query(self.prompts["qd_prompt"],
|
192 |
-
|
193 |
|
194 |
-
|
195 |
-
original_answer=original_answer)
|
196 |
-
print("Original Answer Decomposed: ", original_answer_decompose)
|
197 |
-
|
198 |
-
words = self.process_query_words(question_decompose, original_answer_decompose)
|
199 |
return words, original_answer
|
200 |
|
201 |
def query_expert(self, question: str = None):
|
202 |
question = random.choice(self.default_questions) if question is None else question
|
203 |
-
|
204 |
|
205 |
keywords, original_response = self.convert_question_into_words(question)
|
206 |
-
|
207 |
-
|
208 |
-
context = fetch_pubmed_articles(" AND ".join(keywords), max_search=5)
|
209 |
-
print("Context: ", context)
|
210 |
|
|
|
|
|
211 |
if len(context) == 0:
|
212 |
return {
|
213 |
"question": question,
|
@@ -215,20 +221,14 @@ class HerbalExpert:
|
|
215 |
"info": "No context found"
|
216 |
}
|
217 |
|
218 |
-
contextual_response = format_prompt_and_query(self.prompts["qa_prompt"], question=question,
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
# "info": "Irrelevant context found"
|
224 |
-
# }
|
225 |
-
|
226 |
-
print()
|
227 |
-
print("Contextual Response: ", contextual_response)
|
228 |
-
improved_response = format_prompt_and_query(self.prompts["ri_prompt"], question=question,
|
229 |
answer=original_response, answer2=contextual_response)
|
230 |
-
|
231 |
-
|
232 |
return {
|
233 |
"question": question,
|
234 |
"response": improved_response,
|
@@ -241,5 +241,5 @@ herbal_expert = HerbalExpert()
|
|
241 |
if __name__ == '__main__':
|
242 |
herbal_expert = HerbalExpert()
|
243 |
answer = herbal_expert.query_expert()
|
244 |
-
|
245 |
# # return to api? who knows
|
|
|
1 |
import requests
|
2 |
import json
|
3 |
import random
|
4 |
+
import logging
|
5 |
|
6 |
from langchain.agents import AgentExecutor, LLMSingleActionAgent, AgentOutputParser
|
7 |
from langchain.prompts import StringPromptTemplate
|
|
|
17 |
from langchain.callbacks.manager import CallbackManagerForLLMRun
|
18 |
from typing import List, Union, Optional, Any
|
19 |
|
20 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
21 |
+
ngrok_url = "https://6d75-2605-7b80-3d-320-cc20-aa68-fd8-3c5e.ngrok-free.app/"
|
22 |
Entrez.email = "[email protected]"
|
23 |
nltk.download('wordnet')
|
24 |
|
|
|
33 |
:return: The number of citations for a given pmid
|
34 |
"""
|
35 |
citations_xml = Entrez.read(
|
36 |
+
Entrez.elink(dbfrom="pubmed", db="pmc", LinkName="pubmed_pubmed_citedin", from_uid=pmid)
|
37 |
+
)
|
38 |
|
39 |
+
for i in range(len(citations_xml)):
|
40 |
if len(citations_xml[i]["LinkSetDb"]) > 0:
|
41 |
pmids_list = [link["Id"] for link in citations_xml[i]["LinkSetDb"][0]["Link"]]
|
42 |
return len(pmids_list)
|
|
|
44 |
return 0
|
45 |
|
46 |
|
47 |
+
def fetch_pubmed_context(keywords, max_search=10, max_context=3):
|
48 |
"""
|
49 |
The fetch_pubmed_articles function takes in a list of keywords and returns the top 3 articles from PubMed that
|
50 |
are most relevant to those keywords. First the search is done on max_search articles, the list is then sorted by
|
|
|
58 |
"""
|
59 |
|
60 |
try:
|
61 |
+
return query_pubmed(
|
62 |
+
keywords, max_search, max_context
|
63 |
+
)
|
64 |
+
except HTTPError as e:
|
65 |
+
logging.error(f"HTTPError: {e}")
|
66 |
+
return []
|
67 |
+
except RuntimeError as e:
|
68 |
+
logging.error(f"RuntimeError: {e}")
|
69 |
+
return []
|
70 |
|
|
|
|
|
|
|
71 |
|
72 |
+
def query_pubmed(keywords, max_search, max_context):
    """
    Search PubMed and return the title and abstract of the most-cited matching articles.

    The query is sent to PubMed through Entrez.esearch. When it matches nothing and it holds
    more than four terms, the search is retried once with only the first four terms. The
    matching ids are ranked by the citation count reported by get_num_citations, and the
    max_context highest ranked articles are downloaded with Entrez.efetch.

    :param keywords: Either a query string whose terms are joined with " AND ", or a list of terms
    :param max_search: Maximum number of PubMed ids requested from each search
    :param max_context: Maximum number of articles returned
    :return: A list of "title\\nabstract" strings; empty when there are no search terms or
        nothing matches
    """

    def search_ids(term):
        # Close the handle once the record has been parsed, as the Entrez examples do.
        handle = Entrez.esearch(db="pubmed", term=term, retmax=max_search)
        try:
            return Entrez.read(handle)["IdList"]
        finally:
            handle.close()

    # Work on a list of terms so the fallback can drop whole terms. Slicing the joined
    # query string (the previous behaviour) kept only its first four characters.
    if isinstance(keywords, str):
        query = keywords
        terms = [term.strip() for term in keywords.split(" AND ")]
    else:
        terms = [str(term).strip() for term in keywords]
        query = " AND ".join(terms)
    terms = [term for term in terms if term]

    if not terms:
        # Nothing to search for; skip the request entirely.
        return []

    id_list = search_ids(query)
    if not id_list and len(terms) > 4:
        id_list = search_ids(" AND ".join(terms[:4]))
    if not id_list:
        return []

    citation_counts = [(pmid, get_num_citations(pmid)) for pmid in id_list]
    top_n_papers = sorted(citation_counts, key=lambda pair: pair[1], reverse=True)[:max_context]
    logging.info("top_%s_papers: %s", max_context, top_n_papers)

    top_ids = [pmid for pmid, _ in top_n_papers]
    if not top_ids:
        # max_context of 0 leaves nothing to fetch.
        return []

    fetch_handle = Entrez.efetch(db="pubmed", id=top_ids, rettype="medline", retmode="xml")
    try:
        fetched_articles = Entrez.read(fetch_handle)
    finally:
        fetch_handle.close()

    articles = []
    # somehow only pull natural therapeutic articles
    for fetched in fetched_articles['PubmedArticle']:
        article = fetched['MedlineCitation']['Article']
        title = article['ArticleTitle']
        # Only the first abstract section is used, as before; a missing or empty
        # abstract falls back to the placeholder instead of raising IndexError.
        sections = article.get('Abstract', {}).get('AbstractText') or ["No Abstract"]
        articles.append(f"{title}\n{sections[0]}")

    return articles
|
96 |
|
97 |
|
98 |
def call_model_with_history(messages: list):
|
|
|
104 |
"""
|
105 |
data = {
|
106 |
"messages": messages,
|
107 |
+
"stop": ["### Instruction:"], "temperature": 0, "max_tokens": 512, "stream": False, "repeat_penalty": 1.2
|
108 |
}
|
109 |
|
110 |
+
response = requests.post(
|
111 |
+
f"{ngrok_url}v1/chat/completions",
|
112 |
+
headers={"Content-Type": "application/json"},
|
113 |
+
json=data,
|
114 |
+
)
|
115 |
return json.loads(response.text)['choices'][0]['message']['content']
|
116 |
|
117 |
|
118 |
+
def format_prompt_and_query(prompt: str, system_role: bool, **kwargs):
    """
    Fill in a prompt template and send it to the model as a two-message conversation.

    The conversation is one system message, chosen by system_role, followed by one user
    message that carries the formatted prompt.

    :param prompt: Template string whose placeholders are filled with str.format
    :param system_role: When truthy the generic "perform the instructions" system message is
        used; otherwise the herbal product recommendation system message is used
    :param **kwargs: Values substituted into the placeholders of prompt
    :return: Whatever call_model_with_history returns for the message list
    """
    # Format first so a missing placeholder value fails before any message is built.
    user_message = {"role": "user", "content": prompt.format(**kwargs)}

    if system_role:
        system_content = "Perform the instructions to the best of your ability."
    else:
        system_content = (
            "Develop an AI-based system to recommend optimal herbal products for specific health needs. "
            "Analyze the chemical composition, structural parameters, and pharmacology of natural medicinal "
            "substances found in plants, fungi, and roots. "
            "Cross-reference all information with toxicology data and pharmaceutical drugs to mitigate any "
            "potential risks, ensuring that the recommendations are safe, effective, and free from toxic chemicals."
        )

    return call_model_with_history([{"role": "system", "content": system_content}, user_message])
|
139 |
|
140 |
|
|
|
156 |
# qd = Question Decompose, og = Original, qa = Question Asking, ri = Response Improvement
|
157 |
self.prompts = {
|
158 |
"qd_prompt": """### Instruction: Identify and list the keywords that capture the essence of the question. List them as a string separated by commas. Focus on the question. Order the keyword by importance. The first keyword should be the most important keyword in the question and the last keyword should be the least important keyword.
|
159 |
+
Question: {question}
|
160 |
|
161 |
YOUR RESPONSE SHOULD BE A STRING OF COMMA SEPARATED KEYWORDS:
|
162 |
### Response: Keywords: """,
|
|
|
182 |
### Response: Improved Answer: """
|
183 |
}
|
184 |
|
185 |
+
def process_query_words(self, question_words: str):
|
186 |
# don't need to be searching for these in pubmed. Should we include: 'supplements', 'supplement'
|
187 |
+
vague_words = ['recommendation', 'recommendations', 'products', 'product', 'scholarly articles', 'academic database']
|
188 |
+
words = question_words.lower().split(",")[:4]
|
|
|
189 |
|
190 |
final_list = []
|
191 |
for word in words:
|
|
|
196 |
return list(set(final_list))
|
197 |
|
198 |
def convert_question_into_words(self, question: str):
    """
    Ask the model for a first answer and for the search keywords of a question.

    The question is sent to the model twice: once with the "og_answer_prompt" template to
    obtain an answer, and once with the "qd_prompt" template to obtain a comma separated
    keyword string, which is then passed through process_query_words.

    :param question: The question asked by the user
    :return: A tuple of (result of process_query_words, original answer text)
    """
    baseline_answer = format_prompt_and_query(
        self.prompts["og_answer_prompt"], system_role=False, question=question
    )
    logging.info(f"Original Answer: {baseline_answer}")

    keyword_text = format_prompt_and_query(
        self.prompts["qd_prompt"], system_role=True, question=question
    )
    logging.info(f"Question Decompose: {keyword_text}")

    return self.process_query_words(keyword_text), baseline_answer
|
207 |
|
208 |
def query_expert(self, question: str = None):
|
209 |
question = random.choice(self.default_questions) if question is None else question
|
210 |
+
logging.info(f"Question: {question}")
|
211 |
|
212 |
keywords, original_response = self.convert_question_into_words(question)
|
213 |
+
logging.info(f"Keywords: {keywords}")
|
|
|
|
|
|
|
214 |
|
215 |
+
context = fetch_pubmed_context(" AND ".join(keywords), max_search=5)
|
216 |
+
logging.info(f"Context: {context}")
|
217 |
if len(context) == 0:
|
218 |
return {
|
219 |
"question": question,
|
|
|
221 |
"info": "No context found"
|
222 |
}
|
223 |
|
224 |
+
contextual_response = format_prompt_and_query(self.prompts["qa_prompt"], system_role=False, question=question,
|
225 |
+
context=context)
|
226 |
+
logging.info(f"Contextual Response: {contextual_response}")
|
227 |
+
|
228 |
+
improved_response = format_prompt_and_query(self.prompts["ri_prompt"], system_role=False, question=question,
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
answer=original_response, answer2=contextual_response)
|
230 |
+
logging.info(f"Improved Response: {improved_response}")
|
231 |
+
|
232 |
return {
|
233 |
"question": question,
|
234 |
"response": improved_response,
|
|
|
241 |
if __name__ == '__main__':
|
242 |
herbal_expert = HerbalExpert()
|
243 |
answer = herbal_expert.query_expert()
|
244 |
+
# logging.info(answer['response'])
|
245 |
# # return to api? who knows
|