import aiohttp
from bs4 import BeautifulSoup

recipe_prompt = """
You are RecipeExtractorGPT.
Your goal is to extract recipe content from text and return a JSON representation of the useful information.

The JSON should be structured like this:

```
{
  "title": "Scrambled eggs",
  "ingredients": {
    "eggs": "2",
    "butter": "1 tbsp",
    "milk": "1 tbsp",
    "salt": "1 pinch"
  },
  "directions": [
    "Beat eggs, milk, and salt together in a bowl until thoroughly combined.",
    "Heat butter in a large skillet over medium-high heat. Pour egg mixture into the hot skillet; cook and stir until eggs are set, 3 to 5 minutes."
  ],
  "servings": 2,
  "prep_time": 5,
  "cook_time": 5,
  "total_time": 10,
  "tags": [
    "breakfast",
    "eggs",
    "scrambled"
  ],
  "source": "https://recipes.com/scrambled-eggs/"
}
```

The user will provide text content from a web page.
It is not very well structured, but the recipe is in there.
Please look carefully for the useful information about the recipe.
IMPORTANT: Return the result as JSON in a Markdown code block surrounded with three backticks!
"""


async def scrape_page_with_url(url: str, max_length: int = 14000) -> str:
    """
    Given a URL, scrapes the web page and returns its contents, with the URL
    prepended to the beginning of the text.

    Parameters
    ----------
    url:
        The URL to scrape.
    max_length:
        Maximum length of recipe text to process. This prevents the model from
        running out of tokens: 14000 bytes translates to approximately 3200
        tokens.
    """
    contents = await scrape_page(url)
    # Trim the string so that the prompt and reply will fit in the token limit.
    # It would be better to trim by tokens, but that requires the tiktoken
    # package, which can be very slow to load on containerized servers because
    # it needs to download the model from the internet each time the container
    # starts.
    contents = contents[:max_length]
    return f"From: {url}\n\n" + contents


async def scrape_page(url: str) -> str:
    # Asynchronously send an HTTP request to the URL.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status != 200:
                raise aiohttp.ClientError(f"An error occurred: {response.status}")
            html = await response.text()

    # Parse the HTML content using BeautifulSoup.
    soup = BeautifulSoup(html, "html.parser")

    # Remove script and style elements.
    for script in soup(["script", "style"]):
        script.decompose()

    # IDs and class names of elements that are unlikely to contain recipe
    # content (navigation, ads, comments, and so on).
    elements_to_remove = [
        "header",
        "footer",
        "sidebar",
        "nav",
        "menu",
        "ad",
        "advertisement",
        "cookie-banner",
        "popup",
        "social",
        "breadcrumb",
        "pagination",
        "comment",
        "comments",
    ]

    # Remove unwanted elements by ID or class name.
    for element in elements_to_remove:
        for e in soup.find_all(id=element) + soup.find_all(class_=element):
            e.decompose()

    # Extract text from the remaining HTML tags.
    text = " ".join(soup.stripped_strings)

    return text
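

# Sketch of the consuming side, assuming the model followed recipe_prompt and
# wrapped its JSON answer in a Markdown code block fenced with three backticks.
# extract_recipe_json() is a hypothetical helper, not part of the original
# module: it pulls the first fenced block out of the reply and parses it.
import json
import re


def extract_recipe_json(reply: str) -> dict:
    """Parse the JSON recipe out of a reply fenced with three backticks."""
    # Match the first fenced block; the optional "json" language tag is allowed.
    match = re.search(r"```(?:json)?\s*(.*?)```", reply, re.DOTALL)
    if match is None:
        raise ValueError("No Markdown code block found in the model reply.")
    return json.loads(match.group(1))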
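

# Minimal usage sketch (illustrative, not part of the original module):
# asyncio.run() drives the coroutine from synchronous code. The URL below is
# the example source from recipe_prompt, used here as a placeholder.
if __name__ == "__main__":
    import asyncio

    page_text = asyncio.run(
        scrape_page_with_url("https://recipes.com/scrambled-eggs/")
    )
    print(page_text[:500])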