pvanand committed on
Commit
ec971eb
1 Parent(s): 129b060

Update helper_functions_api.py

Browse files
Files changed (1) hide show
  1. helper_functions_api.py +36 -19
helper_functions_api.py CHANGED
@@ -66,6 +66,7 @@ from fuzzy_json import loads
66
  from half_json.core import JSONFixer
67
  from openai import OpenAI
68
  from together import Together
 
69
 
70
  llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
71
  llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
@@ -195,27 +196,43 @@ class Scraper:
195
  print(f"Error fetching page content for {url}: {e}")
196
  return None
197
 
198
- def extract_main_content(html):
199
- if html:
200
- plain_text = ""
201
- soup = BeautifulSoup(html, 'lxml')
202
- for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table']):
203
- plain_text += element.get_text(separator=" ", strip=True) + "\n"
204
- return plain_text
205
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  def process_content(data_format, url, query):
208
- scraper = Scraper()
209
- html_content = scraper.fetch_content(url)
210
- if html_content:
211
- content = extract_main_content(html_content)
212
- if content:
213
- rephrased_content = rephrase_content(
214
- data_format=data_format,
215
- content=limit_tokens(remove_stopwords(content), token_limit=1000),
216
- query=query,
217
- )
218
- return rephrased_content, url
219
  return "", url
220
 
221
  def fetch_and_extract_content(data_format, urls, query):
 
66
  from half_json.core import JSONFixer
67
  from openai import OpenAI
68
  from together import Together
69
+ from urllib.parse import urlparse
70
 
71
  llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
72
  llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
 
196
  print(f"Error fetching page content for {url}: {e}")
197
  return None
198
 
199
def extract_main_content(url):
    """Fetch the main article content of *url* as markdown.

    Validates the URL, then delegates extraction to an external scraping
    service (trafilatura-style parameters). Returns the extracted article
    text, or "" when the URL is missing/invalid, the request fails, or the
    service responds with a non-200 status.
    """
    if not url:
        return ""
    try:
        parsed = urlparse(url)
        # Require both a scheme (http/https) and a network location.
        if not (parsed.scheme and parsed.netloc):
            return ""
        # Query parameters for the extraction endpoint.
        params = {
            "url": url,
            "favor_precision": False,
            "favor_recall": False,
            "output_format": "markdown",
            "target_language": "en",
            "include_tables": True,
            "include_images": False,
            "include_links": False,
            "deduplicate": True,
        }
        # Make request to the FastAPI extraction endpoint; a timeout keeps
        # a slow/unreachable service from hanging the pipeline indefinitely.
        response = requests.get(
            "https://pvanand-web-scraping.hf.space/extract-article",
            params=params,
            timeout=30,
        )
        if response.status_code == 200:
            # .get() guards against a malformed payload missing "article".
            return response.json().get("article", "")
        return ""
    except Exception as e:
        # Narrowed from a bare `except:` — log instead of silently
        # swallowing (matches the file's existing error-print style).
        print(f"Error extracting content for {url}: {e}")
        return ""
226
 
227
def process_content(data_format, url, query):
    """Extract the article at *url* and rephrase it toward *query*.

    Returns a ``(content, url)`` tuple; ``content`` is "" when nothing
    usable could be extracted from the page.
    """
    content = extract_main_content(url)
    # Guard clause: nothing extracted means nothing to rephrase.
    if not content:
        return "", url
    trimmed = limit_tokens(remove_stopwords(content), token_limit=4000)
    rephrased = rephrase_content(
        data_format=data_format,
        content=trimmed,
        query=query,
    )
    return rephrased, url
237
 
238
  def fetch_and_extract_content(data_format, urls, query):