import aiohttp
from bs4 import BeautifulSoup
recipe_prompt = """
You are RecipeExtractorGPT.
Your goal is to extract recipe content from text and return a JSON representation of the useful information.
The JSON should be structured like this:
```
{
    "title": "Scrambled eggs",
    "ingredients": {
        "eggs": "2",
        "butter": "1 tbsp",
        "milk": "1 tbsp",
        "salt": "1 pinch"
    },
    "directions": [
        "Beat eggs, milk, and salt together in a bowl until thoroughly combined.",
        "Heat butter in a large skillet over medium-high heat. Pour egg mixture into the hot skillet; cook and stir until eggs are set, 3 to 5 minutes."
    ],
    "servings": 2,
    "prep_time": 5,
    "cook_time": 5,
    "total_time": 10,
    "tags": [
        "breakfast",
        "eggs",
        "scrambled"
    ],
    "source": "https://recipes.com/scrambled-eggs/"
}
```
The user will provide text content from a web page.
It is not very well structured, but the recipe is in there.
Please look carefully for the useful information about the recipe.
IMPORTANT: Return the result as JSON in a Markdown code block surrounded with three backticks!
"""
async def scrape_page_with_url(url: str, max_length: int = 14000) -> str:
    """
    Given a URL, scrape the web page and return its contents, with the URL
    prepended to the beginning of the text.

    Parameters
    ----------
    url:
        The URL to scrape
    max_length:
        Max length of recipe text to process. This is to prevent the model from
        running out of tokens. 14000 bytes translates to approximately 3200 tokens.
    """
    contents = await scrape_page(url)
    # Trim the string so that the prompt and reply will fit in the token limit. It
    # would be better to trim by tokens, but that requires the tiktoken package,
    # which can be very slow to load on containerized servers, because it needs to
    # download its encoding data from the internet each time the container starts.
    contents = contents[:max_length]
    return f"From: {url}\n\n" + contents
async def scrape_page(url: str) -> str:
    # Asynchronously send an HTTP request to the URL.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status != 200:
                raise aiohttp.ClientError(f"An error occurred: {response.status}")
            html = await response.text()

    # Parse the HTML content using BeautifulSoup.
    soup = BeautifulSoup(html, "html.parser")

    # Remove script and style elements.
    for script in soup(["script", "style"]):
        script.decompose()

    # List of element IDs or class names to remove.
    elements_to_remove = [
        "header",
        "footer",
        "sidebar",
        "nav",
        "menu",
        "ad",
        "advertisement",
        "cookie-banner",
        "popup",
        "social",
        "breadcrumb",
        "pagination",
        "comment",
        "comments",
    ]

    # Remove unwanted elements by ID or class name.
    for element in elements_to_remove:
        for e in soup.find_all(id=element) + soup.find_all(class_=element):
            e.decompose()

    # Extract the visible text from the remaining HTML tags.
    text = " ".join(soup.stripped_strings)
    return text
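
# A minimal usage sketch, assuming this module is run directly; the scrambled-eggs
# URL is the placeholder from the prompt above, not a real endpoint.
if __name__ == "__main__":
    import asyncio

    page_text = asyncio.run(scrape_page_with_url("https://recipes.com/scrambled-eggs/"))
    print(page_text[:500])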