Spaces:
Sleeping
Sleeping
Jatin Mehra
Add pytest configuration and restructure test files; move tests to core directory and update imports
5b02b7b
from typing import List, Dict, Optional, Any | |
import re | |
from urllib.parse import urlparse | |
from mimetypes import guess_type | |
class ContentValidator: | |
""" | |
A validator class for content and URLs in the web crawler. | |
This class provides methods to validate URLs, content types, and content size | |
to ensure they meet the requirements for processing. | |
Attributes: | |
allowed_content_types (List[str]): List of acceptable MIME types | |
blocked_domains (set): Set of domains that are blocked | |
max_content_size (int): Maximum allowed content size in bytes (10MB) | |
Example: | |
>>> validator = ContentValidator() | |
>>> validator.is_valid_url("https://example.com") | |
True | |
""" | |
def __init__(self): | |
"""Initialize the ContentValidator with default settings.""" | |
self.allowed_content_types = [ | |
"text/html", | |
"text/plain", | |
"application/pdf", | |
"application/json" | |
] | |
self.blocked_domains = set() | |
self.max_content_size = 10 * 1024 * 1024 # 10MB MAX SIZE LIMIT OF GROQ API IS 25MB We are using 10MB | |
def is_valid_url(self, url: str) -> bool: | |
""" | |
Validate if the given URL is properly formatted. | |
Args: | |
url (str): The URL to validate | |
Returns: | |
bool: True if URL is valid, False otherwise | |
Example: | |
>>> validator.is_valid_url("https://example.com") | |
True | |
>>> validator.is_valid_url("not-a-url") | |
False | |
""" | |
try: | |
result = urlparse(url) | |
return all([result.scheme, result.netloc]) | |
except: | |
return False | |
def is_allowed_content_type(self, content_type: str) -> bool: | |
""" | |
Check if the content type is in the allowed list. | |
Args: | |
content_type (str): MIME type to check | |
Returns: | |
bool: True if content type is allowed, False otherwise | |
Example: | |
>>> validator.is_allowed_content_type("text/html") | |
True | |
>>> validator.is_allowed_content_type("image/jpeg") | |
False | |
""" | |
return content_type in self.allowed_content_types | |
def is_allowed_size(self, content_size: int) -> bool: | |
""" | |
Check if the content size is within the allowed limit. | |
Args: | |
content_size (int): Size of the content in bytes | |
Returns: | |
bool: True if content size is within limit, False otherwise | |
Example: | |
>>> validator.is_allowed_size(1024) | |
True | |
>>> validator.is_allowed_size(1024 * 1024 * 20) # 20MB | |
False | |
""" | |
return content_size <= self.max_content_size | |
def validate_content(self, content: str) -> Dict[str, Any]: | |
""" | |
Validate the content for potential issues. | |
This method checks the content for malicious content, length, etc. | |
and returns a dictionary with validation results. | |
Args: | |
content (str): The content to validate | |
Returns: | |
Dict[str, Any]: A dictionary with validation results | |
Example: | |
>>> validator.validate_content("<p>Hello, World!</p>") | |
{'valid': True, 'reason': 'Content passed validation'} | |
>>> validator.validate_content("<script>alert('XSS')</script>") | |
{'valid': False, 'reason': 'Contains script tags'} | |
""" | |
# Check for potentially malicious content | |
if re.search(r"<script.*?>", content, re.I): | |
return {"valid": False, "reason": "Contains script tags"} | |
# Check for minimum content length | |
if len(content.strip()) < 10: | |
return {"valid": False, "reason": "Content too short"} | |
return {"valid": True, "reason": "Content passed validation"} |