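# Spaces: Sleeping
# (The line above appears to be status text captured from the hosting page; it is not part of the program.)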
import requests | |
import json | |
import random | |
import logging | |
from langchain.agents import AgentExecutor, LLMSingleActionAgent, AgentOutputParser | |
from langchain.prompts import StringPromptTemplate | |
from langchain.schema import AgentAction, AgentFinish | |
from langchain.memory import ConversationBufferWindowMemory | |
from langchain import LLMChain | |
from langchain.llms.base import LLM | |
from Bio import Entrez | |
from requests import HTTPError | |
from nltk.stem import WordNetLemmatizer | |
import nltk | |
from langchain.callbacks.manager import CallbackManagerForLLMRun | |
from typing import List, Union, Optional, Any | |
# Configure root logging: INFO level, each record prefixed with timestamp, logger name and level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Base URL (with trailing slash) of the chat-completions server; call_model_with_history
# appends "v1/chat/completions" to it.
ngrok_url = "https://6d75-2605-7b80-3d-320-cc20-aa68-fd8-3c5e.ngrok-free.app/"
# Contact address set on the Entrez module before any query is issued.
# NOTE(review): the value looks like a redacted placeholder — confirm a real address is configured.
Entrez.email = "[email protected]"
# Download the WordNet corpus at import time; presumably required by the WordNetLemmatizer
# that HerbalExpert constructs.
nltk.download('wordnet')
def get_num_citations(pmid: str):
    """
    Return the number of articles that cite the PubMed article ``pmid``.

    The function issues an Entrez ``elink`` query (``pubmed`` -> ``pmc``, link name
    ``pubmed_pubmed_citedin``) and counts the links in the first non-empty
    ``LinkSetDb`` of the result.

    :param pmid: str: PubMed ID of the article whose citations are counted
    :return: The number of linked citing articles; 0 when the result contains no links
    """
    handle = Entrez.elink(dbfrom="pubmed", db="pmc", LinkName="pubmed_pubmed_citedin", from_uid=pmid)
    try:
        link_sets = Entrez.read(handle)
    finally:
        # Release the network handle even when parsing fails.
        handle.close()

    for link_set in link_sets:
        link_set_dbs = link_set["LinkSetDb"]
        if link_set_dbs:
            return len(link_set_dbs[0]["Link"])
    # No link set carried any links (or the result was empty): always return an
    # int so callers can sort on the count.
    return 0
def fetch_pubmed_context(keywords, max_search=10, max_context=3):
    """
    Fetch PubMed articles for ``keywords`` and fall back to an empty list on failure.

    The work is delegated to query_pubmed: up to ``max_search`` articles are searched,
    ranked by number of citations, and the top ``max_context`` are returned. HTTP
    errors and RuntimeErrors raised during the query are logged and swallowed so the
    caller can continue without context.

    :param keywords: Search term used to look up articles in PubMed
    :param max_search: Limit the number of initial search results
    :param max_context: Specify the number of articles to return
    :return: A list of article texts; an empty list when the query fails
    """
    # Bio.Entrez performs its requests through urllib, so HTTP failures surface as
    # urllib.error.HTTPError rather than the requests HTTPError imported by this file.
    from urllib.error import HTTPError as UrllibHTTPError

    try:
        return query_pubmed(keywords, max_search, max_context)
    except (HTTPError, UrllibHTTPError) as e:
        logging.error(f"HTTPError: {e}")
        return []
    except RuntimeError as e:
        logging.error(f"RuntimeError: {e}")
        return []
def _top_keywords_term(keywords, limit=4):
    """Build an ``AND``-joined search term from at most ``limit`` leading keywords."""
    if isinstance(keywords, str):
        parts = keywords.split(" AND ")
    else:
        parts = list(keywords)
    return " AND ".join(parts[:limit])


def _search_pubmed_ids(term, max_search):
    """Run one PubMed ``esearch`` and return the list of matching IDs."""
    handle = Entrez.esearch(db="pubmed", term=term, retmax=max_search)
    try:
        return Entrez.read(handle)["IdList"]
    finally:
        handle.close()


def query_pubmed(keywords, max_search, max_context):
    """
    Search PubMed and return the most-cited matching articles as text.

    The search is run with ``keywords``; when nothing is found it is retried with a
    term built from the first four keywords. The hits are ranked by citation count
    (get_num_citations) and the top ``max_context`` are fetched.

    :param keywords: Search term; either an " AND "-joined string or a sequence of keywords
    :param max_search: Maximum number of search hits to rank
    :param max_context: Number of top-ranked articles to fetch
    :return: A list of strings, each "<title>\\n<first abstract section>"; empty when
        no article matches
    """
    id_list = _search_pubmed_ids(keywords, max_search)
    if not id_list:
        # Retry with the four leading keywords. Slicing the joined string directly
        # would keep only its first four characters.
        fallback_term = _top_keywords_term(keywords)
        if fallback_term != keywords:
            id_list = _search_pubmed_ids(fallback_term, max_search)
    if not id_list:
        return []

    citation_counts = [(paper_id, get_num_citations(paper_id)) for paper_id in id_list]
    top_n_papers = sorted(citation_counts, key=lambda pair: pair[1], reverse=True)[:max_context]
    logging.info(f"top_{max_context}_papers: {top_n_papers}")
    top_ids = [paper_id for paper_id, _ in top_n_papers]
    if not top_ids:
        # Nothing selected (e.g. max_context <= 0): skip the fetch instead of
        # sending an empty ID list.
        return []

    fetch_handle = Entrez.efetch(db="pubmed", id=top_ids, rettype="medline", retmode="xml")
    try:
        fetched_articles = Entrez.read(fetch_handle)
    finally:
        fetch_handle.close()

    articles = []
    # TODO: somehow only pull natural therapeutic articles
    for fetched in fetched_articles['PubmedArticle']:
        article = fetched['MedlineCitation']['Article']
        title = article['ArticleTitle']
        abstract_sections = article.get('Abstract', {}).get('AbstractText', ["No Abstract"])
        # Only the first abstract section is used; guard against an empty section list.
        abstract = abstract_sections[0] if abstract_sections else "No Abstract"
        articles.append(title + "\n" + abstract)
    return articles
def call_model_with_history(messages: list, timeout=None):
    """
    Send a chat history to the model server and return the text of its reply.

    The messages are posted to ``{ngrok_url}v1/chat/completions`` with fixed
    generation settings (temperature 0, at most 512 tokens, no streaming).

    :param messages: list: Pass the history of messages to the model
    :param timeout: Optional timeout in seconds forwarded to ``requests.post``;
        ``None`` (the default) waits indefinitely, as before
    :return: The content of the first choice's message in the server response
    :raises requests.HTTPError: when the server answers with an error status
    """
    payload = {
        "messages": messages,
        "stop": ["### Instruction:"],
        "temperature": 0,
        "max_tokens": 512,
        "stream": False,
        "repeat_penalty": 1.2,
    }
    response = requests.post(
        f"{ngrok_url}v1/chat/completions",
        headers={"Content-Type": "application/json"},
        json=payload,
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a JSON/KeyError on an error page.
    response.raise_for_status()
    return json.loads(response.text)['choices'][0]['message']['content']
def format_prompt_and_query(prompt: str, system_role: bool, **kwargs):
    """
    Fill in a prompt template, wrap it in a two-message chat and query the model.

    :param prompt: Template string; formatted with the values in kwargs
    :param system_role: When true, the generic "perform the instructions" system message
        is used; otherwise the herbal-recommendation system message is used
    :param **kwargs: Key-value pairs substituted into the prompt template
    :return: Whatever call_model_with_history returns for the built messages
    """
    user_content = prompt.format(**kwargs)

    if system_role:
        system_content = "Perform the instructions to the best of your ability."
    else:
        system_content = "Develop an AI-based system to recommend optimal herbal products for specific health needs. Analyze the chemical composition, structural parameters, and pharmacology of natural medicinal substances found in plants, fungi, and roots. Cross-reference all information with toxicology data and pharmaceutical drugs to mitigate any potential risks, ensuring that the recommendations are safe, effective, and free from toxic chemicals."

    conversation = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]
    return call_model_with_history(conversation)
class HerbalExpert:
    """
    Question-answering pipeline for herbal-medicine questions.

    A question is answered once directly by the model, decomposed into keywords,
    enriched with PubMed context found for those keywords, and finally rewritten
    into an improved answer that combines both sources.
    """

    def __init__(self):
        """Create the lemmatizer, the default questions and the prompt templates."""
        self.wnl = WordNetLemmatizer()
        # Questions used by query_expert when the caller supplies none.
        self.default_questions = [
            "How is chamomile traditionally used in herbal medicine?",
            "What are the potential side effects or interactions of consuming echinacea alongside finasteride?",
            "Can you explain the different methods of consuming lavender for health benefits?",
            "Which herbs are commonly known for their anti-inflammatory properties?",
            "I'm experiencing consistent stress and anxiety. What herbs or supplements could help alleviate these symptoms?",
            "Are there any natural herbs that could support better sleep?",
            "What cannabis or hemp products would you recommend for chronic pain relief?",
            "I'm looking to boost my immune system. Are there any specific herbs or supplements that could help?",
            "Which herbs or supplements are recommended for enhancing cognitive functions and memory?",
            "What natural (herbal) medicinal molecule is the best alternative for pharmaceutical drugs, e.g., opiates?"
        ]
        # qd = Question Decompose, og = Original, qa = Question Asking, ri = Response Improvement
        self.prompts = {
            "qd_prompt": """### Instruction: Identify and list the keywords that capture the essence of the question. List them as a string separated by commas. Focus on the question. Order the keyword by importance. The first keyword should be the most important keyword in the question and the last keyword should be the least important keyword.
Question: {question}
YOUR RESPONSE SHOULD BE A STRING OF COMMA SEPARATED KEYWORDS:
### Response: Keywords: """,
            "og_answer_prompt": """### Instruction: Answer the following question to the best of your ability. Question: {question}
### Response: Answer: """,
            "ans_decompose_prompt": """### Instruction: Given the following text, identify the 2 most important keywords that capture the essence of the text. If there's a list of products, choose the top 2 products. Your response should be a list of only 2 keywords separated by commas.
Text: {original_answer}
### Response: Keywords: """,
            "qa_prompt": """### Instruction: Answer the following question using the given context ONLY if the context is relevant to the question. If the context doesn't help answer the question, ONLY respond with "I don't know".
Question: {question}
Context: {context}
### Response: Answer: """,
            "ri_prompt": """### Instruction: You are an caring, intelligent question answering agent. Craft a response that is more safe, informative and intelligent than the original answer and imparts knowledge from both the old answer and from the context ONLY if it helps answer the question.
Question: {question}
Old Answer: {answer}
Context: {answer2}
### Response: Improved Answer: """
        }

    def process_query_words(self, question_words: str):
        """
        Turn the model's comma-separated keyword string into clean search keywords.

        Only the first four comma-separated entries are considered. Each entry is
        lower-cased, stripped of whitespace and double quotes, dropped when empty or
        too vague to search for, and lemmatized. Duplicates are removed while the
        original (importance) order is kept.

        :param question_words: Comma-separated keywords, most important first
        :return: A list of unique lemmatized keywords in order of importance
        """
        # don't need to be searching for these in pubmed. Should we include: 'supplements', 'supplement'
        vague_words = {'recommendation', 'recommendations', 'products', 'product', 'scholarly articles',
                       'academic database'}
        keywords = []
        for raw_word in question_words.lower().split(",")[:4]:
            cleaned = raw_word.strip().strip('"').strip()
            # Empty entries (e.g. from a trailing comma) would produce a malformed
            # "a AND  AND b" search term later on.
            if not cleaned or cleaned in vague_words:
                continue
            lemma = self.wnl.lemmatize(cleaned)
            # De-duplicate without a set so the importance order survives.
            if lemma not in keywords:
                keywords.append(lemma)
        return keywords

    def convert_question_into_words(self, question: str):
        """
        Ask the model for a direct answer and for the question's keywords.

        :param question: The user's question
        :return: A tuple ``(keywords, original_answer)`` where keywords is the list
            produced by process_query_words
        """
        original_answer = format_prompt_and_query(self.prompts["og_answer_prompt"], system_role=False,
                                                  question=question)
        logging.info(f"Original Answer: {original_answer}")
        question_decompose = format_prompt_and_query(self.prompts["qd_prompt"], system_role=True, question=question)
        logging.info(f"Question Decompose: {question_decompose}")
        words = self.process_query_words(question_decompose)
        return words, original_answer

    def query_expert(self, question: Optional[str] = None):
        """
        Answer a question, improving the model's answer with PubMed context when available.

        :param question: The question to answer; a random default question is used when None
        :return: A dict with the keys "question", "response" and "info". "info" is
            "No context found" when the direct answer is returned unchanged, and
            "Success" when the context-improved answer is returned
        """
        question = random.choice(self.default_questions) if question is None else question
        logging.info(f"Question: {question}")
        keywords, original_response = self.convert_question_into_words(question)
        logging.info(f"Keywords: {keywords}")
        context = fetch_pubmed_context(" AND ".join(keywords), max_search=5)
        logging.info(f"Context: {context}")
        if not context:
            return {
                "question": question,
                "response": original_response,
                "info": "No context found"
            }
        # Pass the articles as plain text; formatting the list itself would embed
        # its Python repr (brackets, quotes, escaped newlines) in the prompt.
        contextual_response = format_prompt_and_query(self.prompts["qa_prompt"], system_role=False,
                                                      question=question, context="\n\n".join(context))
        logging.info(f"Contextual Response: {contextual_response}")
        improved_response = format_prompt_and_query(self.prompts["ri_prompt"], system_role=False, question=question,
                                                    answer=original_response, answer2=contextual_response)
        logging.info(f"Improved Response: {improved_response}")
        return {
            "question": question,
            "response": improved_response,
            "info": "Success"
        }
# Shared instance created at import time.
herbal_expert = HerbalExpert()

if __name__ == '__main__':
    # Smoke run: answer one randomly chosen default question with the shared
    # instance (no need to build a second HerbalExpert).
    answer = herbal_expert.query_expert()
    # TODO: decide what to do with the answer (log answer['response'] or return it to an API).