Spaces:
Running
Running
Update helper_functions_api.py
Browse files- helper_functions_api.py +11 -16
helper_functions_api.py
CHANGED
@@ -4,7 +4,7 @@ from mistune.plugins.table import table
|
|
4 |
from jinja2 import Template
|
5 |
import re
|
6 |
import os
|
7 |
-
import
|
8 |
|
9 |
def md_to_html(md_text):
|
10 |
renderer = mistune.HTMLRenderer()
|
@@ -183,20 +183,16 @@ def rephrase_content(data_format, content, query):
|
|
183 |
max_tokens=500,
|
184 |
)
|
185 |
|
186 |
-
class Scraper:
|
187 |
-
def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"):
|
188 |
-
self.session = requests.Session()
|
189 |
-
self.session.headers.update({"User-Agent": user_agent})
|
190 |
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
|
201 |
def extract_main_content(html):
|
202 |
extracted = trafilatura.extract(
|
@@ -215,8 +211,7 @@ def extract_main_content(html):
|
|
215 |
return ""
|
216 |
|
217 |
def process_content(data_format, url, query):
|
218 |
-
|
219 |
-
html_content = scraper.fetch_content(url)
|
220 |
if html_content:
|
221 |
content = extract_main_content(html_content)
|
222 |
if content:
|
|
|
4 |
from jinja2 import Template
|
5 |
import re
|
6 |
import os
|
7 |
+
import hrequests
|
8 |
|
9 |
def md_to_html(md_text):
|
10 |
renderer = mistune.HTMLRenderer()
|
|
|
183 |
max_tokens=500,
|
184 |
)
|
185 |
|
|
|
|
|
|
|
|
|
186 |
|
187 |
+
@retry(tries=3, delay=1)
def _fetch_page_text(url):
    """Request ``url`` once and return the body text for a 200 response.

    Exceptions raised by the request are deliberately NOT caught here, so
    that the ``retry`` decorator can observe them and re-attempt the call.
    NOTE(review): assumes ``retry`` is the decorator from the ``retry``
    package (re-invokes on exception, re-raises after the last try); its
    import is not visible in this hunk -- confirm.

    Returns:
        The response text when the status code is 200, otherwise ``None``.
    """
    response = hrequests.get(url)
    if response.status_code == 200:
        return response.text
    # Non-200 responses are not retried; they yield None.
    return None


def fetch_content(url):
    """Fetch the page at ``url`` and return its text, or ``None`` on failure.

    The request itself lives in ``_fetch_page_text`` so that the retry
    decorator wraps code that can actually raise. Previously the decorator
    wrapped a body that swallowed every exception, so no retry could occur.

    Args:
        url: Address of the page to download.

    Returns:
        The page text for a 200 response; ``None`` for any other status or
        when every attempt raised.
    """
    try:
        return _fetch_page_text(url)
    except Exception as e:
        # Best-effort boundary: process_content only checks the result for
        # truthiness, so report the failure and fall through to None
        # instead of propagating.
        print(f"Error fetching page content for {url}: {e}")
    return None
|
196 |
|
197 |
def extract_main_content(html):
|
198 |
extracted = trafilatura.extract(
|
|
|
211 |
return ""
|
212 |
|
213 |
def process_content(data_format, url, query):
|
214 |
+
html_content = fetch_content(url)
|
|
|
215 |
if html_content:
|
216 |
content = extract_main_content(html_content)
|
217 |
if content:
|