Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -10,6 +10,8 @@ import filetype
|
|
10 |
import requests
|
11 |
import os
|
12 |
import mimetypes
|
|
|
|
|
13 |
|
14 |
# Constants
|
15 |
CHUNK_SIZE = 32000
|
@@ -32,7 +34,6 @@ def clean_text(content):
|
|
32 |
content = re.sub(r'\s+', ' ', content)
|
33 |
return content
|
34 |
|
35 |
-
|
36 |
def split_content(content, chunk_size=CHUNK_SIZE):
|
37 |
"""Splits content into chunks of a specified size."""
|
38 |
chunks = []
|
@@ -92,7 +93,6 @@ def extract_text_from_pptx(pptx_data, clean=True):
|
|
92 |
return text, len(text)
|
93 |
|
94 |
def read_document(file_path, clean=True):
|
95 |
-
|
96 |
with open(file_path, "rb") as f:
|
97 |
file_content = f.read()
|
98 |
|
@@ -154,7 +154,17 @@ def read_document(file_path, clean=True):
|
|
154 |
return extract_text_from_pptx(file_content, clean)
|
155 |
except Exception as e:
|
156 |
return f"Error reading PPTX: {e}", 0
|
157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
else:
|
159 |
try:
|
160 |
content = file_content.decode('utf-8')
|
@@ -165,43 +175,111 @@ def read_document(file_path, clean=True):
|
|
165 |
return f"Error reading file: {e}", 0
|
166 |
|
167 |
def download_and_process_file(url, clean=True):
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
196 |
else:
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
205 |
|
206 |
# --- Gradio Interface ---
|
207 |
|
@@ -212,13 +290,11 @@ iface = gr.Interface(
|
|
212 |
gr.Checkbox(label="Clean Text", value=True),
|
213 |
],
|
214 |
outputs=[
|
215 |
-
gr.Textbox(label="Document Content/Image Markdown"),
|
216 |
gr.Number(label="Document Length (characters)"),
|
217 |
],
|
218 |
-
title="Enhanced File Processor for Hugging Face Chat Tools",
|
219 |
-
description="Enter the URL of
|
220 |
-
"This tool is designed for use with Hugging Face Chat Tools: "
|
221 |
-
"[https://hf.co/chat/tools/66f1a8159d41ad4398ebb711](https://hf.co/chat/tools/66f1a8159d41ad4398ebb711)",
|
222 |
concurrency_limit=None
|
223 |
)
|
224 |
|
|
|
10 |
import requests
|
11 |
import os
|
12 |
import mimetypes
|
13 |
+
from bs4 import BeautifulSoup
|
14 |
+
from urllib.parse import urljoin
|
15 |
|
16 |
# Constants
|
17 |
CHUNK_SIZE = 32000
|
|
|
34 |
content = re.sub(r'\s+', ' ', content)
|
35 |
return content
|
36 |
|
|
|
37 |
def split_content(content, chunk_size=CHUNK_SIZE):
|
38 |
"""Splits content into chunks of a specified size."""
|
39 |
chunks = []
|
|
|
93 |
return text, len(text)
|
94 |
|
95 |
def read_document(file_path, clean=True):
|
|
|
96 |
with open(file_path, "rb") as f:
|
97 |
file_content = f.read()
|
98 |
|
|
|
154 |
return extract_text_from_pptx(file_content, clean)
|
155 |
except Exception as e:
|
156 |
return f"Error reading PPTX: {e}", 0
|
157 |
+
elif mime == "text/html": # Handle HTML content
|
158 |
+
try:
|
159 |
+
soup = BeautifulSoup(file_content, 'html.parser')
|
160 |
+
structured_data = {
|
161 |
+
"Texts": extract_texts(soup),
|
162 |
+
"Links": extract_links(soup, ""),
|
163 |
+
"Images": extract_images(soup, "")
|
164 |
+
}
|
165 |
+
return format_detailed_output(structured_data), 0
|
166 |
+
except Exception as e:
|
167 |
+
return f"Error parsing HTML content: {e}", 0
|
168 |
else:
|
169 |
try:
|
170 |
content = file_content.decode('utf-8')
|
|
|
175 |
return f"Error reading file: {e}", 0
|
176 |
|
177 |
def download_and_process_file(url, clean=True):
    """Download ``url`` and return its content in a form suited to its type.

    Images are not parsed: a Markdown image tag pointing at the URL is
    returned instead. Anything else is written to a temporary file and
    handed to ``read_document``.

    Args:
        url: Address to fetch. ``http://`` is prepended when the value does
            not already start with an HTTP(S) scheme.
        clean: Forwarded unchanged to ``read_document``.

    Returns:
        tuple: ``(text, length)``. For images and for download errors the
        length is ``0``; otherwise the pair is whatever ``read_document``
        returns.
    """
    # Local imports keep this block self-contained; both are stdlib.
    import tempfile
    from urllib.parse import urlparse

    if not url.startswith(("http://", "https://")):
        url = "http://" + url  # Prepend "http://" if not present

    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(url, stream=True, timeout=10) as response:
            response.raise_for_status()  # Raise an exception for bad status codes

            # Name the file after the last *path* segment only, so query
            # strings and fragments never leak into the filename.
            original_filename = os.path.basename(urlparse(url).path)
            # Remove invalid characters; fall back to a fixed name when the
            # URL has no usable segment (e.g. it ends in "/" or is "..").
            safe_filename = re.sub(r'[^\w\-_\. ]', '_', original_filename).strip(". ")
            if not safe_filename:
                safe_filename = "download"

            # Infer the extension from the media type alone: the header may
            # be absent or carry parameters such as "; charset=utf-8".
            content_type = response.headers.get('content-type', '')
            mime_type = content_type.split(';', 1)[0].strip().lower()
            ext = mimetypes.guess_extension(mime_type) if mime_type else None
            if ext and not safe_filename.endswith(ext):
                safe_filename += ext

            # A private temporary directory avoids clobbering files in the
            # working directory and guarantees the download is removed.
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = os.path.join(temp_dir, safe_filename)
                with open(temp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192000):
                        f.write(chunk)

                # Images are returned as Markdown; everything else is parsed.
                kind = filetype.guess(temp_path)
                if kind and kind.mime.startswith('image/'):
                    return f"![]({url})", 0
                return read_document(temp_path, clean)

    except requests.exceptions.MissingSchema:
        return "Error: Invalid URL format. Even after adding 'http://', the URL is still invalid.", 0
    except requests.exceptions.ConnectionError:
        return "Error: Could not connect to the server. Please check your internet connection.", 0
    except requests.exceptions.Timeout:
        return "Error: Connection timed out while trying to fetch the URL.", 0
    except requests.exceptions.RequestException as e:
        return f"Error downloading file: {e}", 0
|
217 |
+
|
218 |
+
# --- Web Page Content Extraction Functions (from previous code) ---
|
219 |
+
|
220 |
+
def extract_texts(soup):
    """Return the text fragments of the parsed document as a list.

    Args:
        soup: Parsed document exposing a ``stripped_strings`` iterable
            (callers in this file pass a ``BeautifulSoup`` object).

    Returns:
        list: Every item yielded by ``soup.stripped_strings``, in order.
    """
    return list(soup.stripped_strings)
|
223 |
+
|
224 |
+
def extract_links(soup, base_url):
    """Collect the hyperlinks found in the parsed document.

    Args:
        soup: Parsed document providing ``find_all``.
        base_url: URL against which relative ``href`` values are resolved.
            With an empty string, relative values are returned unchanged.

    Returns:
        list[dict]: One ``{"Text": ..., "URL": ...}`` entry per ``<a>`` tag
        that has an ``href`` attribute, in document order. ``"Text"`` is
        ``"No Text"`` when the anchor has no visible text.
    """
    links = []
    for anchor in soup.find_all('a', href=True):
        # Surrounding whitespace is not part of the URL; strip it so padded
        # absolute URLs are still recognised as absolute.
        href = anchor['href'].strip()
        if href.startswith(("http://", "https://")):
            full_url = href
        else:
            # Use urljoin to create an absolute URL
            full_url = urljoin(base_url, href)
        link_text = anchor.get_text(strip=True) or "No Text"
        links.append({"Text": link_text, "URL": full_url})
    return links
|
234 |
+
|
235 |
+
def extract_images(soup, base_url):
    """Collect image URLs and their alt text from the parsed document.

    Args:
        soup: Parsed document providing ``find_all``.
        base_url: URL against which relative ``src`` values are resolved.
            With an empty string, relative values are returned unchanged.

    Returns:
        list[dict]: One ``{"Alt Text": ..., "Image URL": ...}`` entry per
        ``<img>`` tag that has a ``src`` attribute, in document order.
        ``"Alt Text"`` is ``"No Alt Text"`` when ``alt`` is missing or blank.
    """
    images = []
    for img in soup.find_all('img', src=True):
        # Surrounding whitespace is not part of the URL; strip it so padded
        # absolute URLs are still recognised as absolute.
        img_url = img['src'].strip()
        if img_url.startswith(("http://", "https://")):
            full_img_url = img_url
        else:
            # Use urljoin to create an absolute URL
            full_img_url = urljoin(base_url, img_url)
        # Treat a blank alt like a missing one, mirroring how extract_links
        # substitutes a placeholder for empty link text.
        alt_text = (img.get('alt') or '').strip() or 'No Alt Text'
        images.append({"Alt Text": alt_text, "Image URL": full_img_url})
    return images
|
245 |
+
|
246 |
+
def fetch_page_content(url):
    """Download the page at ``url`` and return its body as text.

    Failures are reported in-band rather than raised: any ``requests``
    error becomes a string starting with ``"Error fetching the URL: "``.

    Args:
        url: Address of the page to fetch (10-second timeout).

    Returns:
        str: The response body, or the error message described above.
    """
    try:
        reply = requests.get(url, timeout=10)
        # Turn 4xx/5xx statuses into exceptions so they share the error path.
        reply.raise_for_status()
        body = reply.text
        return body
    except requests.exceptions.RequestException as err:
        return f"Error fetching the URL: {err}"
|
254 |
+
|
255 |
+
def format_detailed_output(structured_data):
    """Render extracted page data as a Markdown string.

    Args:
        structured_data: Mapping with three keys:
            ``"Texts"`` (list of strings),
            ``"Links"`` (list of dicts with ``"Text"`` and ``"URL"``), and
            ``"Images"`` (list of dicts with ``"Alt Text"`` and
            ``"Image URL"``).

    Returns:
        str: A heading followed by a Texts, a Links and an Images section.
        An empty section is replaced by a "No ... found." placeholder.

    Raises:
        KeyError: If one of the three top-level keys is missing.
    """
    texts = structured_data["Texts"]
    links = structured_data["Links"]
    images = structured_data["Images"]

    text_section = " ".join(texts) if texts else "No textual content found."

    if links:
        link_section = "\n".join(f"[{link['Text']}]({link['URL']})" for link in links) + "\n"
    else:
        link_section = "No links found.\n"

    if images:
        image_section = "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in images) + "\n"
    else:
        image_section = "No images found.\n"

    # Assemble once with join instead of growing a string piece by piece.
    return "".join([
        "### Structured Page Content\n\n",
        "**Texts:**\n", text_section, "\n\n",
        "**Links:**\n", link_section,
        "**Images:**\n", image_section,
    ])
|
270 |
+
|
271 |
+
def extract_page_content(url):
    """Fetch ``url`` and return its content as structured Markdown.

    Args:
        url: Page address; also used as the base for resolving relative
            link and image URLs.

    Returns:
        str: The Markdown produced by ``format_detailed_output``, or the
        error message from ``fetch_page_content`` when the fetch failed.
    """
    page_content = fetch_page_content(url)
    # fetch_page_content reports failure in-band with this exact prefix.
    # Matching the prefix, rather than "Error" anywhere in the body, keeps
    # pages that merely contain the word "Error" from being misreported.
    if page_content.startswith("Error fetching the URL:"):
        return page_content
    soup = BeautifulSoup(page_content, 'html.parser')
    structured_data = {
        "Texts": extract_texts(soup),
        "Links": extract_links(soup, url),  # Pass the base URL
        "Images": extract_images(soup, url),  # Pass the base URL
    }
    return format_detailed_output(structured_data)
|
283 |
|
284 |
# --- Gradio Interface ---
|
285 |
|
|
|
290 |
gr.Checkbox(label="Clean Text", value=True),
|
291 |
],
|
292 |
outputs=[
|
293 |
+
gr.Textbox(label="Document Content/Image Markdown/Web Page Content"),
|
294 |
gr.Number(label="Document Length (characters)"),
|
295 |
],
|
296 |
+
title="Enhanced File and Web Page Processor for Hugging Face Chat Tools",
|
297 |
+
description="Enter the URL of an image, video, document, or web page. The tool will handle it accordingly: images will be displayed as Markdown, documents will have their text extracted, and web pages will have their content structured and displayed. This tool is designed for use with Hugging Face Chat Tools. \n [https://hf.co/chat/tools/66f1a8159d41ad4398ebb711](https://hf.co/chat/tools/66f1a8159d41ad4398ebb711)",
|
|
|
|
|
298 |
concurrency_limit=None
|
299 |
)
|
300 |
|