from bs4 import BeautifulSoup
import os
from ..utils import get_relevant_images, extract_title


class TavilyExtract:

    def __init__(self, link, session=None):
        self.link = link
        self.session = session
        # Imported lazily so the tavily package is only required when this scraper is used.
        from tavily import TavilyClient
        self.tavily_client = TavilyClient(api_key=self.get_api_key())

    def get_api_key(self) -> str:
        """
        Gets the Tavily API key.

        Returns:
            API key (str)
        """
        try:
            api_key = os.environ["TAVILY_API_KEY"]
        except KeyError:
            raise Exception(
                "Tavily API key not found. Please set the TAVILY_API_KEY environment variable.")
        return api_key

    def scrape(self) -> tuple:
        """
        Extracts content from the specified link using the Tavily Python SDK; the title and
        images are extracted with the helper functions from `gpt_researcher/scraper/utils.py`.

        Returns:
            A tuple containing the extracted content, a list of image URLs, and the title of the
            webpage specified by the `self.link` attribute. The Tavily Python SDK is used to
            extract and clean the content from the webpage. If any exception occurs during the
            process, an error message is printed and an empty result is returned.
        """
        try:
            response = self.tavily_client.extract(urls=self.link)
            if response['failed_results']:
                return "", [], ""

            # Fetch the page HTML and parse it into a BeautifulSoup object for the utility functions.
            response_bs = self.session.get(self.link, timeout=4)
            soup = BeautifulSoup(
                response_bs.content, "lxml", from_encoding=response_bs.encoding
            )

            # Since only a single link is provided to tavily_client, the results contain only one entry.
            content = response['results'][0]['raw_content']

            # Get relevant images using the utility function.
            image_urls = get_relevant_images(soup, self.link)

            # Extract the title using the utility function.
            title = extract_title(soup)

            return content, image_urls, title

        except Exception as e:
            print("Error! : " + str(e))
            return "", [], ""