fergos80 committed
Commit b94d719 · verified · 1 Parent(s): 3479e7a

Create utils

Files changed (1)
  1. utils +106 -0
utils ADDED
@@ -0,0 +1,106 @@
import aiohttp
from bs4 import BeautifulSoup

recipe_prompt = """
You are RecipeExtractorGPT.
Your goal is to extract recipe content from text and return a JSON representation of the useful information.

The JSON should be structured like this:

```
{
    "title": "Scrambled eggs",
    "ingredients": {
        "eggs": "2",
        "butter": "1 tbsp",
        "milk": "1 tbsp",
        "salt": "1 pinch"
    },
    "directions": [
        "Beat eggs, milk, and salt together in a bowl until thoroughly combined.",
        "Heat butter in a large skillet over medium-high heat. Pour egg mixture into the hot skillet; cook and stir until eggs are set, 3 to 5 minutes."
    ],
    "servings": 2,
    "prep_time": 5,
    "cook_time": 5,
    "total_time": 10,
    "tags": [
        "breakfast",
        "eggs",
        "scrambled"
    ],
    "source": "https://recipes.com/scrambled-eggs/"
}
```

The user will provide text content from a web page.
It is not very well structured, but the recipe is in there.
Please look carefully for the useful information about the recipe.
IMPORTANT: Return the result as JSON in a Markdown code block surrounded with three backticks!
"""

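# --- Illustrative sketch, not part of the original commit: one way a model
# reply to `recipe_prompt` might be parsed. The prompt asks for the JSON inside
# a Markdown code block, so the payload can be pulled out with a regex and
# `json.loads`. The helper name `extract_recipe_json` is hypothetical.
def extract_recipe_json(reply: str) -> dict:
    """Pull the JSON object out of a ```-fenced code block in a model reply."""
    import json
    import re

    match = re.search(r"```(?:json)?\s*(.*?)\s*```", reply, re.DOTALL)
    if match is None:
        raise ValueError("No Markdown code block found in the model reply")
    return json.loads(match.group(1))
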
async def scrape_page_with_url(url: str, max_length: int = 14000) -> str:
    """
    Given a URL, scrape the web page and return the contents. This also adds the
    URL to the beginning of the text.

    Parameters
    ----------
    url:
        The URL to scrape
    max_length:
        Max length of recipe text to process. This is to prevent the model from
        running out of tokens. 14000 bytes translates to approximately 3200 tokens.
    """
    contents = await scrape_page(url)
    # Trim the string so that the prompt and reply will fit in the token limit. It
    # would be better to trim by tokens, but that requires using the tiktoken
    # package, which can be very slow to load when running on containerized
    # servers, because it needs to download the model from the internet each time
    # the container starts.
    contents = contents[:max_length]
    return f"From: {url}\n\n" + contents

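# --- Illustrative sketch, not part of the original commit: the token-based
# trimming that the comment above describes as the better but slower option.
# It assumes the `tiktoken` package and the "cl100k_base" encoding; the helper
# name `trim_to_tokens` is hypothetical.
def trim_to_tokens(text: str, max_tokens: int = 3200) -> str:
    """Trim text to at most max_tokens tokens instead of a fixed byte count."""
    import tiktoken  # imported lazily because loading an encoding can be slow

    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    return encoding.decode(tokens[:max_tokens])
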
async def scrape_page(url: str) -> str:
    # Asynchronously send an HTTP request to the URL.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status != 200:
                raise aiohttp.ClientError(f"An error occurred: {response.status}")
            html = await response.text()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html, "html.parser")

    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()

    # List of element IDs or class names to remove
    elements_to_remove = [
        "header",
        "footer",
        "sidebar",
        "nav",
        "menu",
        "ad",
        "advertisement",
        "cookie-banner",
        "popup",
        "social",
        "breadcrumb",
        "pagination",
        "comment",
        "comments",
    ]

    # Remove unwanted elements by ID or class name
    for element in elements_to_remove:
        for e in soup.find_all(id=element) + soup.find_all(class_=element):
            e.decompose()

    # Extract text from the remaining HTML tags
    text = " ".join(soup.stripped_strings)

    return text
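

# --- Illustrative usage, not part of the original commit: these coroutines are
# meant to be awaited from async application code, but a plain script can drive
# them with asyncio. The example URL is taken from the prompt above.
if __name__ == "__main__":
    import asyncio

    text = asyncio.run(scrape_page_with_url("https://recipes.com/scrambled-eggs/"))
    print(text[:500])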