Spaces:
Running
Running
"""Shim module exporting the same ElementTree API for lxml and | |
xml.etree backends. | |
When lxml is installed, it is automatically preferred over the built-in | |
xml.etree module. | |
On Python 2.7, the cElementTree module is preferred over the pure-python | |
ElementTree module. | |
Besides exporting a unified interface, this also defines extra functions | |
or subclasses built-in ElementTree classes to add features that are | |
only availble in lxml, like OrderedDict for attributes, pretty_print and | |
iterwalk. | |
""" | |
from fontTools.misc.textTools import tostr | |
XML_DECLARATION = """<?xml version='1.0' encoding='%s'?>""" | |
__all__ = [ | |
# public symbols | |
"Comment", | |
"dump", | |
"Element", | |
"ElementTree", | |
"fromstring", | |
"fromstringlist", | |
"iselement", | |
"iterparse", | |
"parse", | |
"ParseError", | |
"PI", | |
"ProcessingInstruction", | |
"QName", | |
"SubElement", | |
"tostring", | |
"tostringlist", | |
"TreeBuilder", | |
"XML", | |
"XMLParser", | |
"register_namespace", | |
] | |
try: | |
from lxml.etree import * | |
_have_lxml = True | |
except ImportError: | |
try: | |
from xml.etree.cElementTree import * | |
# the cElementTree version of XML function doesn't support | |
# the optional 'parser' keyword argument | |
from xml.etree.ElementTree import XML | |
except ImportError: # pragma: no cover | |
from xml.etree.ElementTree import * | |
_have_lxml = False | |
import sys | |
# dict is always ordered in python >= 3.6 and on pypy | |
PY36 = sys.version_info >= (3, 6) | |
try: | |
import __pypy__ | |
except ImportError: | |
__pypy__ = None | |
_dict_is_ordered = bool(PY36 or __pypy__) | |
del PY36, __pypy__ | |
if _dict_is_ordered: | |
_Attrib = dict | |
else: | |
from collections import OrderedDict as _Attrib | |
if isinstance(Element, type): | |
_Element = Element | |
else: | |
# in py27, cElementTree.Element cannot be subclassed, so | |
# we need to import the pure-python class | |
from xml.etree.ElementTree import Element as _Element | |
class Element(_Element): | |
"""Element subclass that keeps the order of attributes.""" | |
def __init__(self, tag, attrib=_Attrib(), **extra): | |
super(Element, self).__init__(tag) | |
self.attrib = _Attrib() | |
if attrib: | |
self.attrib.update(attrib) | |
if extra: | |
self.attrib.update(extra) | |
def SubElement(parent, tag, attrib=_Attrib(), **extra): | |
"""Must override SubElement as well otherwise _elementtree.SubElement | |
fails if 'parent' is a subclass of Element object. | |
""" | |
element = parent.__class__(tag, attrib, **extra) | |
parent.append(element) | |
return element | |
def _iterwalk(element, events, tag): | |
include = tag is None or element.tag == tag | |
if include and "start" in events: | |
yield ("start", element) | |
for e in element: | |
for item in _iterwalk(e, events, tag): | |
yield item | |
if include: | |
yield ("end", element) | |
def iterwalk(element_or_tree, events=("end",), tag=None): | |
"""A tree walker that generates events from an existing tree as | |
if it was parsing XML data with iterparse(). | |
Drop-in replacement for lxml.etree.iterwalk. | |
""" | |
if iselement(element_or_tree): | |
element = element_or_tree | |
else: | |
element = element_or_tree.getroot() | |
if tag == "*": | |
tag = None | |
for item in _iterwalk(element, events, tag): | |
yield item | |
_ElementTree = ElementTree | |
class ElementTree(_ElementTree): | |
"""ElementTree subclass that adds 'pretty_print' and 'doctype' | |
arguments to the 'write' method. | |
Currently these are only supported for the default XML serialization | |
'method', and not also for "html" or "text", for these are delegated | |
to the base class. | |
""" | |
def write( | |
self, | |
file_or_filename, | |
encoding=None, | |
xml_declaration=False, | |
method=None, | |
doctype=None, | |
pretty_print=False, | |
): | |
if method and method != "xml": | |
# delegate to super-class | |
super(ElementTree, self).write( | |
file_or_filename, | |
encoding=encoding, | |
xml_declaration=xml_declaration, | |
method=method, | |
) | |
return | |
if encoding is not None and encoding.lower() == "unicode": | |
if xml_declaration: | |
raise ValueError( | |
"Serialisation to unicode must not request an XML declaration" | |
) | |
write_declaration = False | |
encoding = "unicode" | |
elif xml_declaration is None: | |
# by default, write an XML declaration only for non-standard encodings | |
write_declaration = encoding is not None and encoding.upper() not in ( | |
"ASCII", | |
"UTF-8", | |
"UTF8", | |
"US-ASCII", | |
) | |
else: | |
write_declaration = xml_declaration | |
if encoding is None: | |
encoding = "ASCII" | |
if pretty_print: | |
# NOTE this will modify the tree in-place | |
_indent(self._root) | |
with _get_writer(file_or_filename, encoding) as write: | |
if write_declaration: | |
write(XML_DECLARATION % encoding.upper()) | |
if pretty_print: | |
write("\n") | |
if doctype: | |
write(_tounicode(doctype)) | |
if pretty_print: | |
write("\n") | |
qnames, namespaces = _namespaces(self._root) | |
_serialize_xml(write, self._root, qnames, namespaces) | |
import io | |
def tostring( | |
element, | |
encoding=None, | |
xml_declaration=None, | |
method=None, | |
doctype=None, | |
pretty_print=False, | |
): | |
"""Custom 'tostring' function that uses our ElementTree subclass, with | |
pretty_print support. | |
""" | |
stream = io.StringIO() if encoding == "unicode" else io.BytesIO() | |
ElementTree(element).write( | |
stream, | |
encoding=encoding, | |
xml_declaration=xml_declaration, | |
method=method, | |
doctype=doctype, | |
pretty_print=pretty_print, | |
) | |
return stream.getvalue() | |
# serialization support | |
import re | |
# Valid XML strings can include any Unicode character, excluding control | |
# characters, the surrogate blocks, FFFE, and FFFF: | |
# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] | |
# Here we reversed the pattern to match only the invalid characters. | |
# For the 'narrow' python builds supporting only UCS-2, which represent | |
# characters beyond BMP as UTF-16 surrogate pairs, we need to pass through | |
# the surrogate block. I haven't found a more elegant solution... | |
UCS2 = sys.maxunicode < 0x10FFFF | |
if UCS2: | |
_invalid_xml_string = re.compile( | |
"[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uFFFE-\uFFFF]" | |
) | |
else: | |
_invalid_xml_string = re.compile( | |
"[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uD800-\uDFFF\uFFFE-\uFFFF]" | |
) | |
def _tounicode(s): | |
"""Test if a string is valid user input and decode it to unicode string | |
using ASCII encoding if it's a bytes string. | |
Reject all bytes/unicode input that contains non-XML characters. | |
Reject all bytes input that contains non-ASCII characters. | |
""" | |
try: | |
s = tostr(s, encoding="ascii", errors="strict") | |
except UnicodeDecodeError: | |
raise ValueError( | |
"Bytes strings can only contain ASCII characters. " | |
"Use unicode strings for non-ASCII characters." | |
) | |
except AttributeError: | |
_raise_serialization_error(s) | |
if s and _invalid_xml_string.search(s): | |
raise ValueError( | |
"All strings must be XML compatible: Unicode or ASCII, " | |
"no NULL bytes or control characters" | |
) | |
return s | |
import contextlib | |
def _get_writer(file_or_filename, encoding): | |
# returns text write method and release all resources after using | |
try: | |
write = file_or_filename.write | |
except AttributeError: | |
# file_or_filename is a file name | |
f = open( | |
file_or_filename, | |
"w", | |
encoding="utf-8" if encoding == "unicode" else encoding, | |
errors="xmlcharrefreplace", | |
) | |
with f: | |
yield f.write | |
else: | |
# file_or_filename is a file-like object | |
# encoding determines if it is a text or binary writer | |
if encoding == "unicode": | |
# use a text writer as is | |
yield write | |
else: | |
# wrap a binary writer with TextIOWrapper | |
detach_buffer = False | |
if isinstance(file_or_filename, io.BufferedIOBase): | |
buf = file_or_filename | |
elif isinstance(file_or_filename, io.RawIOBase): | |
buf = io.BufferedWriter(file_or_filename) | |
detach_buffer = True | |
else: | |
# This is to handle passed objects that aren't in the | |
# IOBase hierarchy, but just have a write method | |
buf = io.BufferedIOBase() | |
buf.writable = lambda: True | |
buf.write = write | |
try: | |
# TextIOWrapper uses this methods to determine | |
# if BOM (for UTF-16, etc) should be added | |
buf.seekable = file_or_filename.seekable | |
buf.tell = file_or_filename.tell | |
except AttributeError: | |
pass | |
wrapper = io.TextIOWrapper( | |
buf, | |
encoding=encoding, | |
errors="xmlcharrefreplace", | |
newline="\n", | |
) | |
try: | |
yield wrapper.write | |
finally: | |
# Keep the original file open when the TextIOWrapper and | |
# the BufferedWriter are destroyed | |
wrapper.detach() | |
if detach_buffer: | |
buf.detach() | |
from xml.etree.ElementTree import _namespace_map | |
def _namespaces(elem): | |
# identify namespaces used in this tree | |
# maps qnames to *encoded* prefix:local names | |
qnames = {None: None} | |
# maps uri:s to prefixes | |
namespaces = {} | |
def add_qname(qname): | |
# calculate serialized qname representation | |
try: | |
qname = _tounicode(qname) | |
if qname[:1] == "{": | |
uri, tag = qname[1:].rsplit("}", 1) | |
prefix = namespaces.get(uri) | |
if prefix is None: | |
prefix = _namespace_map.get(uri) | |
if prefix is None: | |
prefix = "ns%d" % len(namespaces) | |
else: | |
prefix = _tounicode(prefix) | |
if prefix != "xml": | |
namespaces[uri] = prefix | |
if prefix: | |
qnames[qname] = "%s:%s" % (prefix, tag) | |
else: | |
qnames[qname] = tag # default element | |
else: | |
qnames[qname] = qname | |
except TypeError: | |
_raise_serialization_error(qname) | |
# populate qname and namespaces table | |
for elem in elem.iter(): | |
tag = elem.tag | |
if isinstance(tag, QName): | |
if tag.text not in qnames: | |
add_qname(tag.text) | |
elif isinstance(tag, str): | |
if tag not in qnames: | |
add_qname(tag) | |
elif tag is not None and tag is not Comment and tag is not PI: | |
_raise_serialization_error(tag) | |
for key, value in elem.items(): | |
if isinstance(key, QName): | |
key = key.text | |
if key not in qnames: | |
add_qname(key) | |
if isinstance(value, QName) and value.text not in qnames: | |
add_qname(value.text) | |
text = elem.text | |
if isinstance(text, QName) and text.text not in qnames: | |
add_qname(text.text) | |
return qnames, namespaces | |
def _serialize_xml(write, elem, qnames, namespaces, **kwargs): | |
tag = elem.tag | |
text = elem.text | |
if tag is Comment: | |
write("<!--%s-->" % _tounicode(text)) | |
elif tag is ProcessingInstruction: | |
write("<?%s?>" % _tounicode(text)) | |
else: | |
tag = qnames[_tounicode(tag) if tag is not None else None] | |
if tag is None: | |
if text: | |
write(_escape_cdata(text)) | |
for e in elem: | |
_serialize_xml(write, e, qnames, None) | |
else: | |
write("<" + tag) | |
if namespaces: | |
for uri, prefix in sorted( | |
namespaces.items(), key=lambda x: x[1] | |
): # sort on prefix | |
if prefix: | |
prefix = ":" + prefix | |
write(' xmlns%s="%s"' % (prefix, _escape_attrib(uri))) | |
attrs = elem.attrib | |
if attrs: | |
# try to keep existing attrib order | |
if len(attrs) <= 1 or type(attrs) is _Attrib: | |
items = attrs.items() | |
else: | |
# if plain dict, use lexical order | |
items = sorted(attrs.items()) | |
for k, v in items: | |
if isinstance(k, QName): | |
k = _tounicode(k.text) | |
else: | |
k = _tounicode(k) | |
if isinstance(v, QName): | |
v = qnames[_tounicode(v.text)] | |
else: | |
v = _escape_attrib(v) | |
write(' %s="%s"' % (qnames[k], v)) | |
if text is not None or len(elem): | |
write(">") | |
if text: | |
write(_escape_cdata(text)) | |
for e in elem: | |
_serialize_xml(write, e, qnames, None) | |
write("</" + tag + ">") | |
else: | |
write("/>") | |
if elem.tail: | |
write(_escape_cdata(elem.tail)) | |
def _raise_serialization_error(text): | |
raise TypeError("cannot serialize %r (type %s)" % (text, type(text).__name__)) | |
def _escape_cdata(text): | |
# escape character data | |
try: | |
text = _tounicode(text) | |
# it's worth avoiding do-nothing calls for short strings | |
if "&" in text: | |
text = text.replace("&", "&") | |
if "<" in text: | |
text = text.replace("<", "<") | |
if ">" in text: | |
text = text.replace(">", ">") | |
return text | |
except (TypeError, AttributeError): | |
_raise_serialization_error(text) | |
def _escape_attrib(text): | |
# escape attribute value | |
try: | |
text = _tounicode(text) | |
if "&" in text: | |
text = text.replace("&", "&") | |
if "<" in text: | |
text = text.replace("<", "<") | |
if ">" in text: | |
text = text.replace(">", ">") | |
if '"' in text: | |
text = text.replace('"', """) | |
if "\n" in text: | |
text = text.replace("\n", " ") | |
return text | |
except (TypeError, AttributeError): | |
_raise_serialization_error(text) | |
def _indent(elem, level=0): | |
# From http://effbot.org/zone/element-lib.htm#prettyprint | |
i = "\n" + level * " " | |
if len(elem): | |
if not elem.text or not elem.text.strip(): | |
elem.text = i + " " | |
if not elem.tail or not elem.tail.strip(): | |
elem.tail = i | |
for elem in elem: | |
_indent(elem, level + 1) | |
if not elem.tail or not elem.tail.strip(): | |
elem.tail = i | |
else: | |
if level and (not elem.tail or not elem.tail.strip()): | |
elem.tail = i | |