# NOTE: stray Hugging Face Spaces page text ("Spaces / Paused") was pasted at
# the top of this file; it is not valid Python and has been turned into this comment.
from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool | |
import datetime | |
import requests | |
import yaml | |
from tools.final_answer import FinalAnswerTool | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
import time | |
import datetime | |
import random | |
from requests.adapters import HTTPAdapter | |
from urllib3.util.retry import Retry | |
from Gradio_UI import GradioUI | |
# ✅ Tool wrapper function for SmolAgent
# BUG FIX: the @tool decorator was missing, so a plain function (not a smolagents
# Tool) was passed to CodeAgent(tools=[...]); `tool` is already imported above.
@tool
def scrape_drug_reviews_tool(drug_name: str, max_pages: int = 3) -> dict:
    """
    Scrapes reviews from the website Drugs.com for a given drug name.

    Args:
        drug_name: the name of the target drug for which I want to retrieve reviews
        max_pages: the number of pages of reviews from Drugs.com that I want to collect

    Returns:
        On success, a list of {"review": ..., "source": ...} records
        (one per scraped review); on failure, {"error": message}.
    """
    try:
        df = scrape_drugs_com_reviews_requests(drug_name, max_pages)
        # DataFrame rows -> list of plain dicts the agent can consume.
        return df.to_dict(orient="records")
    except Exception as e:
        # Report the failure back to the agent instead of crashing the run.
        return {"error": str(e)}
# Pool of realistic browser User-Agent strings; one is chosen at random per
# request to make the scraper look less like an automated client.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0",
]
# Retry logic wrapper
def requests_retry_session(retries=3, backoff_factor=0.5, status_forcelist=(500, 502, 503, 504), session=None):
    """Return a requests.Session configured to retry transient failures.

    Args:
        retries: attempt budget applied to total/read/connect failures.
        backoff_factor: exponential backoff multiplier between attempts.
        status_forcelist: HTTP status codes that force a retry.
        session: existing Session to configure; a fresh one is created if None.
    """
    sess = session or requests.Session()
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    # Install the same retrying adapter for both URL schemes.
    for scheme in ("http://", "https://"):
        sess.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return sess
# Scraper function using requests
def scrape_drugs_com_reviews_requests(drug_name, max_pages=3, delay=2):
    """Collect user reviews for *drug_name* from Drugs.com listing pages.

    Args:
        drug_name: drug slug as it appears in the Drugs.com comments URL.
        max_pages: maximum number of listing pages to fetch.
        delay: seconds to sleep between successful page fetches (politeness).

    Returns:
        pandas.DataFrame with columns "review" (text or None) and "source" (page URL).
    """
    base_url = f"https://www.drugs.com/comments/{drug_name}/"
    collected = []
    session = requests_retry_session()

    for page in range(1, max_pages + 1):
        page_url = base_url if page == 1 else f"{base_url}?page={page}"
        request_headers = {"User-Agent": random.choice(USER_AGENTS)}
        try:
            resp = session.get(page_url, headers=request_headers, timeout=10)
            resp.raise_for_status()

            soup = BeautifulSoup(resp.text, "html.parser")
            blocks = soup.find_all("div", class_="ddc-comment ddc-box ddc-mgb-2")
            if not blocks:
                # No review markup on this page — assume we ran past the last page.
                print(f"No reviews found on page {page}.")
                break

            for comment in blocks:
                paragraph = comment.find("p")
                text = None
                if paragraph:
                    if paragraph.b:
                        # Drop the bold condition label (e.g. "For Back Pain").
                        paragraph.b.extract()
                    text = paragraph.get_text(strip=True)
                collected.append({
                    "review": text,
                    "source": page_url
                })

            time.sleep(delay)  # Polite delay
        except Exception as exc:
            print(f"Error scraping {page_url}: {exc}")
            continue

    return pd.DataFrame(collected)
final_answer = FinalAnswerTool()

# If the agent does not answer, the model may be overloaded; use another model
# or the following Hugging Face endpoint, which also serves Qwen2.5 Coder:
# model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud'
model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',  # it is possible that this model may be overloaded
    custom_role_conversions=None,
)

# Prompt templates for the agent live in the adjacent YAML file.
with open("prompts.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)

agent = CodeAgent(
    model=model,
    tools=[scrape_drug_reviews_tool, final_answer],  # add your tools here (don't remove final answer)
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name="DrugReviewScraperAgent",
    description="Agent that can scrape drug reviews and analyze causal relations",
    prompt_templates=prompt_templates,
)

GradioUI(agent).launch()