fergos80 committed
Commit b94d719 · verified · 1 Parent(s): 3479e7a

Create utils

Files changed (1)
  1. utils +106 -0
utils ADDED
@@ -0,0 +1,106 @@
import aiohttp
from bs4 import BeautifulSoup

recipe_prompt = """
You are RecipeExtractorGPT.
Your goal is to extract recipe content from text and return a JSON representation of the useful information.

The JSON should be structured like this:

```
{
    "title": "Scrambled eggs",
    "ingredients": {
        "eggs": "2",
        "butter": "1 tbsp",
        "milk": "1 tbsp",
        "salt": "1 pinch"
    },
    "directions": [
        "Beat eggs, milk, and salt together in a bowl until thoroughly combined.",
        "Heat butter in a large skillet over medium-high heat. Pour egg mixture into the hot skillet; cook and stir until eggs are set, 3 to 5 minutes."
    ],
    "servings": 2,
    "prep_time": 5,
    "cook_time": 5,
    "total_time": 10,
    "tags": [
        "breakfast",
        "eggs",
        "scrambled"
    ],
    "source": "https://recipes.com/scrambled-eggs/"
}
```

The user will provide text content from a web page.
It is not very well structured, but the recipe is in there.
Please look carefully for the useful information about the recipe.
IMPORTANT: Return the result as JSON in a Markdown code block surrounded with three backticks!
"""

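# --- Illustrative sketch, not part of the original commit: one way a model
# reply to `recipe_prompt` might be parsed. The prompt asks for the JSON inside
# a Markdown code block, so the payload can be pulled out with a regex and
# `json.loads`. The helper name `extract_recipe_json` is hypothetical.
def extract_recipe_json(reply: str) -> dict:
    """Pull the JSON object out of a ```-fenced code block in a model reply."""
    import json
    import re

    match = re.search(r"```(?:json)?\s*(.*?)\s*```", reply, re.DOTALL)
    if match is None:
        raise ValueError("No Markdown code block found in the model reply")
    return json.loads(match.group(1))
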
async def scrape_page_with_url(url: str, max_length: int = 14000) -> str:
    """
    Given a URL, scrape the web page and return the contents. This also adds the
    URL to the beginning of the text.

    Parameters
    ----------
    url:
        The URL to scrape
    max_length:
        Max length of recipe text to process. This is to prevent the model from
        running out of tokens. 14000 bytes translates to approximately 3200 tokens.
    """
    contents = await scrape_page(url)
    # Trim the string so that the prompt and reply will fit in the token limit. It
    # would be better to trim by tokens, but that requires using the tiktoken
    # package, which can be very slow to load when running on containerized
    # servers, because it needs to download the model from the internet each time
    # the container starts.
    contents = contents[:max_length]
    return f"From: {url}\n\n" + contents

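# --- Illustrative sketch, not part of the original commit: the token-based
# trimming that the comment above describes as the better but slower option.
# It assumes the `tiktoken` package and the "cl100k_base" encoding; the helper
# name `trim_to_tokens` is hypothetical.
def trim_to_tokens(text: str, max_tokens: int = 3200) -> str:
    """Trim text to at most max_tokens tokens instead of a fixed byte count."""
    import tiktoken  # imported lazily because loading an encoding can be slow

    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    return encoding.decode(tokens[:max_tokens])
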
async def scrape_page(url: str) -> str:
    # Asynchronously send an HTTP request to the URL.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status != 200:
                raise aiohttp.ClientError(f"An error occurred: {response.status}")
            html = await response.text()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html, "html.parser")

    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()

    # List of element IDs or class names to remove
    elements_to_remove = [
        "header",
        "footer",
        "sidebar",
        "nav",
        "menu",
        "ad",
        "advertisement",
        "cookie-banner",
        "popup",
        "social",
        "breadcrumb",
        "pagination",
        "comment",
        "comments",
    ]

    # Remove unwanted elements by ID or class name
    for element in elements_to_remove:
        for e in soup.find_all(id=element) + soup.find_all(class_=element):
            e.decompose()

    # Extract text from the remaining HTML tags
    text = " ".join(soup.stripped_strings)

    return text
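

# --- Illustrative usage, not part of the original commit: these coroutines are
# meant to be awaited from async application code, but a plain script can drive
# them with asyncio. The example URL is taken from the prompt above.
if __name__ == "__main__":
    import asyncio

    text = asyncio.run(scrape_page_with_url("https://recipes.com/scrambled-eggs/"))
    print(text[:500])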