Spaces:
Running
Running
File size: 3,952 Bytes
c385238 638b9bc c385238 638b9bc c385238 638b9bc c385238 638b9bc c385238 638b9bc c385238 638b9bc c385238 638b9bc c385238 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
from typing import List, Dict, Optional, Any
import re
from urllib.parse import urlparse
from mimetypes import guess_type
class ContentValidator:
    """
    A validator class for content and URLs in the web crawler.

    This class provides methods to validate URLs, content types, content size
    and raw content so that only acceptable material is processed further.

    Attributes:
        allowed_content_types (List[str]): List of acceptable MIME types
        blocked_domains (set): Set of blocked domains (initially empty; it is
            not consulted by any method of this class)
        max_content_size (int): Maximum allowed content size in bytes (10MB)

    Example:
        >>> validator = ContentValidator()
        >>> validator.is_valid_url("https://example.com")
        True
    """

    # Matches an opening <script ...> tag. ``[^>]*`` (unlike ``.*?``) also
    # spans newlines, so tags whose attributes are split over several lines
    # are detected as well. Compiled once because it is reused on every call.
    _SCRIPT_TAG_RE = re.compile(r"<script[^>]*>", re.IGNORECASE)

    # Content whose stripped length is below this many characters is rejected.
    _MIN_CONTENT_LENGTH = 10

    def __init__(self):
        """Initialize the ContentValidator with default settings."""
        self.allowed_content_types = [
            "text/html",
            "text/plain",
            "application/pdf",
            "application/json",
        ]
        self.blocked_domains = set()
        # 10 MB cap. Original note: the Groq API size limit is 25 MB; a more
        # conservative 10 MB is used here.
        self.max_content_size = 10 * 1024 * 1024

    def is_valid_url(self, url: str) -> bool:
        """
        Validate if the given URL is properly formatted.

        A URL is considered valid when it parses successfully and has both a
        scheme (e.g. ``https``) and a network location (e.g. ``example.com``).

        Args:
            url (str): The URL to validate

        Returns:
            bool: True if URL is valid, False otherwise (including when the
            value cannot be parsed at all)

        Example:
            >>> validator.is_valid_url("https://example.com")
            True
            >>> validator.is_valid_url("not-a-url")
            False
        """
        try:
            parsed = urlparse(url)
        except (ValueError, TypeError, AttributeError):
            # ValueError: malformed URL (e.g. an unbalanced IPv6 bracket).
            # TypeError / AttributeError: the argument is not str or bytes.
            return False
        return bool(parsed.scheme and parsed.netloc)

    def is_allowed_content_type(self, content_type: str) -> bool:
        """
        Check if the content type is in the allowed list.

        The comparison ignores case and any parameters after the media type,
        so a header value such as ``"text/html; charset=utf-8"`` is treated
        as ``"text/html"``.

        Args:
            content_type (str): MIME type (or full Content-Type header value)
                to check

        Returns:
            bool: True if content type is allowed, False otherwise (including
            for non-string input such as None)

        Example:
            >>> validator.is_allowed_content_type("text/html")
            True
            >>> validator.is_allowed_content_type("text/html; charset=utf-8")
            True
            >>> validator.is_allowed_content_type("image/jpeg")
            False
        """
        if not isinstance(content_type, str):
            return False
        # Keep only the media type, dropping parameters such as charset.
        media_type = content_type.split(";", 1)[0].strip().lower()
        allowed = {entry.lower() for entry in self.allowed_content_types}
        return media_type in allowed

    def is_allowed_size(self, content_size: int) -> bool:
        """
        Check if the content size is within the allowed limit.

        Args:
            content_size (int): Size of the content in bytes

        Returns:
            bool: True if content size does not exceed ``max_content_size``,
            False otherwise

        Example:
            >>> validator.is_allowed_size(1024)
            True
            >>> validator.is_allowed_size(1024 * 1024 * 20)  # 20MB
            False
        """
        return content_size <= self.max_content_size

    def validate_content(self, content: str) -> Dict[str, Any]:
        """
        Validate the content for potential issues.

        Two checks are applied, in this order:

        1. The content must not contain an opening ``<script>`` tag
           (case-insensitive, attributes may span several lines).
        2. The content, with surrounding whitespace removed, must be at
           least 10 characters long.

        Args:
            content (str): The content to validate

        Returns:
            Dict[str, Any]: A dictionary with a boolean ``valid`` key and a
            human-readable ``reason`` key

        Example:
            >>> validator.validate_content("<p>Hello, World!</p>")
            {'valid': True, 'reason': 'Content passed validation'}
            >>> validator.validate_content("<script>alert('XSS')</script>")
            {'valid': False, 'reason': 'Contains script tags'}
        """
        # Check for potentially malicious content
        if self._SCRIPT_TAG_RE.search(content):
            return {"valid": False, "reason": "Contains script tags"}
        # Check for minimum content length
        if len(content.strip()) < self._MIN_CONTENT_LENGTH:
            return {"valid": False, "reason": "Content too short"}
        return {"valid": True, "reason": "Content passed validation"}