Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -34,12 +34,45 @@ def clean_text(content):
|
|
34 |
content = re.sub(r'\s+', ' ', content)
|
35 |
return content
|
36 |
|
37 |
-
def
|
38 |
-
"""
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
# --- Document Reading Functions ---
|
45 |
|
@@ -156,7 +189,7 @@ def read_document(file_path, clean=True, url=""):
|
|
156 |
return f"Error reading PPTX: {e}", 0
|
157 |
elif mime == "text/html": # Handle HTML content
|
158 |
try:
|
159 |
-
soup = BeautifulSoup(file_content, '
|
160 |
structured_data = {
|
161 |
"Texts": extract_texts(soup),
|
162 |
"Links": extract_links(soup, url),
|
@@ -181,15 +214,10 @@ def download_and_process_file(url, clean=True):
|
|
181 |
|
182 |
try:
|
183 |
response = requests.get(url, stream=True, timeout=10)
|
184 |
-
response.raise_for_status() # Raise an exception for bad status codes
|
185 |
-
|
186 |
-
# Generate a safe and unique temporary filename
|
187 |
original_filename = os.path.basename(url)
|
188 |
-
# Remove invalid characters from filename
|
189 |
safe_filename = re.sub(r'[^\w\-_\. ]', '_', original_filename)
|
190 |
temp_filename = f"{safe_filename}"
|
191 |
|
192 |
-
# Infer file extension from content type
|
193 |
content_type = response.headers['content-type']
|
194 |
ext = mimetypes.guess_extension(content_type)
|
195 |
if ext and not temp_filename.endswith(ext): # Append extension if not already present
|
@@ -199,7 +227,6 @@ def download_and_process_file(url, clean=True):
|
|
199 |
for chunk in response.iter_content(chunk_size=8192000):
|
200 |
f.write(chunk)
|
201 |
|
202 |
-
# Check if it's an image type
|
203 |
kind = filetype.guess(temp_filename)
|
204 |
if kind and kind.mime.startswith('image/'):
|
205 |
return f"![]({url})", 0 # Return markdown image syntax if it's an image
|
@@ -215,72 +242,6 @@ def download_and_process_file(url, clean=True):
|
|
215 |
except requests.exceptions.RequestException as e:
|
216 |
return f"Error downloading file: {e}", 0
|
217 |
|
218 |
-
# --- Web Page Content Extraction Functions (from previous code) ---
|
219 |
-
|
220 |
-
def extract_texts(soup):
|
221 |
-
"""Extracts all text content from the soup."""
|
222 |
-
return [text for text in soup.stripped_strings]
|
223 |
-
|
224 |
-
def extract_links(soup, base_url):
|
225 |
-
"""Extracts all valid links from the soup."""
|
226 |
-
links = []
|
227 |
-
for link in soup.find_all('a', href=True):
|
228 |
-
href = link['href']
|
229 |
-
# Use urljoin to create an absolute URL
|
230 |
-
full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
|
231 |
-
link_text = link.get_text(strip=True) or "No Text"
|
232 |
-
links.append({"Text": link_text, "URL": full_url})
|
233 |
-
return links
|
234 |
-
|
235 |
-
def extract_images(soup, base_url):
|
236 |
-
"""Extracts all valid image URLs and their alt text from the soup."""
|
237 |
-
images = []
|
238 |
-
for img in soup.find_all('img', src=True):
|
239 |
-
img_url = img['src']
|
240 |
-
# Use urljoin to create an absolute URL
|
241 |
-
full_img_url = urljoin(base_url, img_url) if not img_url.startswith(("http://", "https://")) else img_url
|
242 |
-
alt_text = img.get('alt', 'No Alt Text')
|
243 |
-
images.append({"Alt Text": alt_text, "Image URL": full_img_url})
|
244 |
-
return images
|
245 |
-
|
246 |
-
def fetch_page_content(url):
|
247 |
-
"""Fetches the content of the page at the given URL."""
|
248 |
-
try:
|
249 |
-
response = requests.get(url, timeout=10)
|
250 |
-
response.raise_for_status()
|
251 |
-
return response.text
|
252 |
-
except requests.exceptions.RequestException as e:
|
253 |
-
return f"Error fetching the URL: {e}"
|
254 |
-
|
255 |
-
def format_detailed_output(structured_data):
|
256 |
-
"""Formats the structured data into a Markdown string."""
|
257 |
-
result = "### Structured Page Content\n\n"
|
258 |
-
result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
|
259 |
-
result += "**Links:**\n"
|
260 |
-
if structured_data["Links"]:
|
261 |
-
result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"]) + "\n"
|
262 |
-
else:
|
263 |
-
result += "No links found.\n"
|
264 |
-
result += "**Images:**\n"
|
265 |
-
if structured_data["Images"]:
|
266 |
-
result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"]) + "\n"
|
267 |
-
else:
|
268 |
-
result += "No images found.\n"
|
269 |
-
return result
|
270 |
-
|
271 |
-
def extract_page_content(url):
|
272 |
-
"""Extracts and formats the content of the page at the given URL."""
|
273 |
-
page_content = fetch_page_content(url)
|
274 |
-
if "Error" in page_content:
|
275 |
-
return page_content
|
276 |
-
soup = BeautifulSoup(page_content, 'html.parser')
|
277 |
-
structured_data = {
|
278 |
-
"Texts": extract_texts(soup),
|
279 |
-
"Links": extract_links(soup, url), # Pass the base URL
|
280 |
-
"Images": extract_images(soup, url) # Pass the base URL
|
281 |
-
}
|
282 |
-
return format_detailed_output(structured_data)
|
283 |
-
|
284 |
# --- Gradio Interface ---
|
285 |
|
286 |
iface = gr.Interface(
|
|
|
34 |
content = re.sub(r'\s+', ' ', content)
|
35 |
return content
|
36 |
|
37 |
+
def extract_texts(soup):
|
38 |
+
"""Extracts all text content from the soup."""
|
39 |
+
return [text for text in soup.stripped_strings]
|
40 |
+
|
41 |
+
def extract_links(soup, base_url):
|
42 |
+
"""Extracts all valid links from the soup."""
|
43 |
+
links = []
|
44 |
+
for link in soup.find_all('a', href=True):
|
45 |
+
href = link['href']
|
46 |
+
full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
|
47 |
+
link_text = link.get_text(strip=True) or "No Text"
|
48 |
+
links.append({"Text": link_text, "URL": full_url})
|
49 |
+
return links
|
50 |
+
|
51 |
+
def extract_images(soup, base_url):
|
52 |
+
"""Extracts all valid image URLs and their alt text from the soup."""
|
53 |
+
images = []
|
54 |
+
for img in soup.find_all('img', src=True):
|
55 |
+
img_url = img['src']
|
56 |
+
full_img_url = urljoin(base_url, img_url) if not img_url.startswith(("http://", "https://")) else img_url
|
57 |
+
alt_text = img.get('alt', 'No Alt Text')
|
58 |
+
images.append({"Alt Text": alt_text, "Image URL": full_img_url})
|
59 |
+
return images
|
60 |
+
|
61 |
+
def format_detailed_output(structured_data):
|
62 |
+
"""Formats the structured data into a Markdown string."""
|
63 |
+
result = "### Structured Page Content\n\n"
|
64 |
+
result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
|
65 |
+
result += "**Links:**\n"
|
66 |
+
if structured_data["Links"]:
|
67 |
+
result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"]) + "\n"
|
68 |
+
else:
|
69 |
+
result += "No links found.\n"
|
70 |
+
result += "**Images:**\n"
|
71 |
+
if structured_data["Images"]:
|
72 |
+
result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"]) + "\n"
|
73 |
+
else:
|
74 |
+
result += "No images found.\n"
|
75 |
+
return result
|
76 |
|
77 |
# --- Document Reading Functions ---
|
78 |
|
|
|
189 |
return f"Error reading PPTX: {e}", 0
|
190 |
elif mime == "text/html": # Handle HTML content
|
191 |
try:
|
192 |
+
soup = BeautifulSoup(file_content, 'lxml')
|
193 |
structured_data = {
|
194 |
"Texts": extract_texts(soup),
|
195 |
"Links": extract_links(soup, url),
|
|
|
214 |
|
215 |
try:
|
216 |
response = requests.get(url, stream=True, timeout=10)
|
|
|
|
|
|
|
217 |
original_filename = os.path.basename(url)
|
|
|
218 |
safe_filename = re.sub(r'[^\w\-_\. ]', '_', original_filename)
|
219 |
temp_filename = f"{safe_filename}"
|
220 |
|
|
|
221 |
content_type = response.headers['content-type']
|
222 |
ext = mimetypes.guess_extension(content_type)
|
223 |
if ext and not temp_filename.endswith(ext): # Append extension if not already present
|
|
|
227 |
for chunk in response.iter_content(chunk_size=8192000):
|
228 |
f.write(chunk)
|
229 |
|
|
|
230 |
kind = filetype.guess(temp_filename)
|
231 |
if kind and kind.mime.startswith('image/'):
|
232 |
return f"![]({url})", 0 # Return markdown image syntax if it's an image
|
|
|
242 |
except requests.exceptions.RequestException as e:
|
243 |
return f"Error downloading file: {e}", 0
|
244 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
245 |
# --- Gradio Interface ---
|
246 |
|
247 |
iface = gr.Interface(
|