Update browser_tools.py
browser_tools.py (+52 -52)
import requests
from bs4 import BeautifulSoup
from crewai import Agent, Task
from langchain.tools import tool
import os
from langchain_google_genai import ChatGoogleGenerativeAI


class BrowserTools:

    @tool("Scrape website content")
    def scrape_and_summarize_website(website):
        """Useful to scrape and summarize a website's content"""
        # Fetch the webpage content
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(website, headers=headers)

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop script/style tags, then extract the visible text
        for script in soup(["script", "style"]):
            script.decompose()
        content = soup.get_text(separator="\n")

        # Clean up the text: strip whitespace and drop empty fragments
        lines = (line.strip() for line in content.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        content = '\n'.join(chunk for chunk in chunks if chunk)
        print(content)

        # Split content into 8,000-character chunks so each fits in a single prompt
        content_chunks = [content[i:i + 8000] for i in range(0, len(content), 8000)]

        # Summarize each chunk with a Gemini-backed researcher agent
        summaries = []
        for chunk in content_chunks:
            agent = Agent(
                role='Principal Researcher',
                goal='Do amazing research and summaries based on the content you are working with',
                backstory="You're a Principal Researcher at a big company and you need to do research about a given topic.",
                llm=ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=os.getenv("GOOGLE_API_KEY")),
                allow_delegation=False
            )
            task = Task(
                agent=agent,
                description=f'Analyze and summarize the content below, make sure to include the most relevant information in the summary, return only the summary nothing else.\n\nCONTENT\n----------\n{chunk}'
            )
            summary = task.execute()
            summaries.append(summary)

        return "\n\n".join(summaries)
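For reference, a minimal sketch of how this tool could be handed to a crew, assuming the same crewai version the file targets (one where Task.execute() exists and expected_output is optional) and the same GOOGLE_API_KEY / gemini-1.5-flash setup; the researcher agent, task wording, and URL below are illustrative placeholders, not part of this commit.

import os
from crewai import Agent, Task, Crew
from langchain_google_genai import ChatGoogleGenerativeAI
from browser_tools import BrowserTools

# Gemini-backed LLM, reusing the GOOGLE_API_KEY environment variable from above
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=os.getenv("GOOGLE_API_KEY"))

# Illustrative agent: the @tool-decorated method is passed in the tools list
researcher = Agent(
    role='Website Researcher',
    goal='Summarize the key points of a given website',
    backstory='You research web pages and report concise, accurate summaries.',
    tools=[BrowserTools().scrape_and_summarize_website],
    llm=llm,
    allow_delegation=False
)

# Illustrative task; the URL is a placeholder
task = Task(
    agent=researcher,
    description='Scrape and summarize https://example.com and report its main topics.'
)

crew = Crew(agents=[researcher], tasks=[task])
print(crew.kickoff())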