File size: 11,421 Bytes
0ad74ed |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 |
"""A tree representation of a linear markdown-it token stream.
This module is not part of upstream JavaScript markdown-it.
"""
from __future__ import annotations
from collections.abc import Generator, Sequence
import textwrap
from typing import Any, NamedTuple, TypeVar, overload
from .token import Token
class _NesterTokens(NamedTuple):
    """The pair of tokens that delimit a container node.

    `SyntaxTreeNode.__init__` fills this with the first and the last token
    of a nested token sequence; the tokens in between become child nodes.
    """

    # First token of the nested sequence (the "_open" token of the pair).
    opening: Token
    # Last token of the nested sequence (the "_close" token of the pair).
    closing: Token
# Type variable bound to `SyntaxTreeNode`. Annotating `self` with it lets the
# methods below be typed as returning nodes of the caller's own (sub)class.
_NodeType = TypeVar("_NodeType", bound="SyntaxTreeNode")
class SyntaxTreeNode:
    """A Markdown syntax tree node.

    A class that can be used to construct a tree representation of a linear
    `markdown-it-py` token stream.

    Each node in the tree represents either:
      - root of the Markdown document
      - a single unnested `Token`
      - a `Token` "_open" and "_close" token pair, and the tokens nested in
        between
    """

    def __init__(
        self, tokens: Sequence[Token] = (), *, create_root: bool = True
    ) -> None:
        """Initialize a `SyntaxTreeNode` from a token stream.

        If `create_root` is True, create a root node for the document and
        build the tree described by `tokens` underneath it.

        If `create_root` is False, `tokens` must describe exactly one node:
        either a single unnested token, or an opening token followed by its
        nested tokens and the closing token.

        Raises `ValueError` if `create_root` is False and `tokens` is empty,
        if a lone token has non-zero nesting, or if opening and closing
        tokens in the stream are unbalanced.
        """
        # Only nodes representing an unnested token have self.token
        self.token: Token | None = None

        # Only containers have nester tokens
        self.nester_tokens: _NesterTokens | None = None

        # Root node does not have self.parent
        self._parent: Any = None

        # Empty list unless a non-empty container, or unnested token that has
        # children (i.e. inline or img)
        self._children: list[Any] = []

        if create_root:
            self._set_children_from_tokens(tokens)
            return

        if not tokens:
            raise ValueError(
                "Can only create root from empty token sequence."
                " Set `create_root=True`."
            )
        elif len(tokens) == 1:
            # A single token must be self-contained (nesting == 0).
            inline_token = tokens[0]
            if inline_token.nesting:
                raise ValueError(
                    "Unequal nesting level at the start and end of token stream."
                )
            self.token = inline_token
            if inline_token.children:
                self._set_children_from_tokens(inline_token.children)
        else:
            # First/last tokens are the delimiters; everything in between
            # becomes the children of this container node.
            self.nester_tokens = _NesterTokens(tokens[0], tokens[-1])
            self._set_children_from_tokens(tokens[1:-1])

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self.type})"

    @overload
    def __getitem__(self: _NodeType, item: int) -> _NodeType:
        ...

    @overload
    def __getitem__(self: _NodeType, item: slice) -> list[_NodeType]:
        ...

    def __getitem__(
        self: _NodeType, item: int | slice
    ) -> _NodeType | list[_NodeType]:
        """Index or slice into the list of child nodes."""
        return self.children[item]

    def to_tokens(self: _NodeType) -> list[Token]:
        """Recover the linear token stream.

        Tokens are emitted in document order: an unnested node contributes
        its token, a container contributes its opening token, the tokens of
        its children, then its closing token. The root contributes only the
        tokens of its children.
        """

        def recursive_collect_tokens(
            node: _NodeType, token_list: list[Token]
        ) -> None:
            # Use the structural `is_root` test rather than comparing the
            # `type` string: a real token whose type is literally "root"
            # (or a "root_open"/"root_close" pair) must not be mistaken for
            # the synthetic document root and dropped from the output.
            if node.is_root:
                for child in node.children:
                    recursive_collect_tokens(child, token_list)
            elif node.token:
                # Children of an unnested token live in `token.children`,
                # so the token alone carries them.
                token_list.append(node.token)
            else:
                assert node.nester_tokens
                token_list.append(node.nester_tokens.opening)
                for child in node.children:
                    recursive_collect_tokens(child, token_list)
                token_list.append(node.nester_tokens.closing)

        tokens: list[Token] = []
        recursive_collect_tokens(self, tokens)
        return tokens

    @property
    def children(self: _NodeType) -> list[_NodeType]:
        """The list of direct child nodes (the live list, not a copy)."""
        return self._children

    @children.setter
    def children(self: _NodeType, value: list[_NodeType]) -> None:
        self._children = value

    @property
    def parent(self: _NodeType) -> _NodeType | None:
        """The parent node, or `None` if no parent has been set."""
        return self._parent  # type: ignore

    @parent.setter
    def parent(self: _NodeType, value: _NodeType | None) -> None:
        self._parent = value

    @property
    def is_root(self) -> bool:
        """Is the node a special root node?"""
        return not (self.token or self.nester_tokens)

    @property
    def is_nested(self) -> bool:
        """Is this node nested?

        Returns `True` if the node represents a `Token` pair and tokens in the
        sequence between them, where `Token.nesting` of the first `Token` in
        the pair is 1 and nesting of the other `Token` is -1.
        """
        return bool(self.nester_tokens)

    @property
    def siblings(self: _NodeType) -> Sequence[_NodeType]:
        """Get siblings of the node.

        Gets the whole group of siblings, including self.
        """
        if not self.parent:
            return [self]
        return self.parent.children

    @property
    def type(self) -> str:
        """Get a string type of the represented syntax.

        - "root" for root nodes
        - `Token.type` if the node represents an unnested token
        - `Token.type` of the opening token, with "_open" suffix stripped, if
            the node represents a nester token pair
        """
        if self.is_root:
            return "root"
        if self.token:
            return self.token.type
        assert self.nester_tokens
        return _removesuffix(self.nester_tokens.opening.type, "_open")

    @property
    def next_sibling(self: _NodeType) -> _NodeType | None:
        """Get the next node in the sequence of siblings.

        Returns `None` if this is the last sibling.
        """
        # Fetch the sibling sequence once instead of on every access.
        siblings = self.siblings
        self_index = siblings.index(self)
        if self_index + 1 < len(siblings):
            return siblings[self_index + 1]
        return None

    @property
    def previous_sibling(self: _NodeType) -> _NodeType | None:
        """Get the previous node in the sequence of siblings.

        Returns `None` if this is the first sibling.
        """
        siblings = self.siblings
        self_index = siblings.index(self)
        if self_index - 1 >= 0:
            return siblings[self_index - 1]
        return None

    def _add_child(
        self,
        tokens: Sequence[Token],
    ) -> None:
        """Make a child node for `self`.

        The child is built from `tokens` with the same class as `self`, so
        subclasses produce trees of their own type.
        """
        child = type(self)(tokens, create_root=False)
        child.parent = self
        self.children.append(child)

    def _set_children_from_tokens(self, tokens: Sequence[Token]) -> None:
        """Convert the token stream to a tree structure and set the resulting
        nodes as children of `self`.

        Raises `ValueError` if a closing token appears where a new node
        should start, or if an opening token is never closed.
        """
        # Reverse once so that consuming from the front is a cheap `pop()`.
        reversed_tokens = list(reversed(tokens))
        while reversed_tokens:
            token = reversed_tokens.pop()

            if not token.nesting:
                self._add_child([token])
                continue
            if token.nesting != 1:
                raise ValueError("Invalid token nesting")

            # Collect everything up to and including the matching closer,
            # tracking depth with the running sum of `nesting` values.
            nested_tokens = [token]
            nesting = 1
            while reversed_tokens and nesting:
                token = reversed_tokens.pop()
                nested_tokens.append(token)
                nesting += token.nesting
            if nesting:
                raise ValueError(f"unclosed tokens starting {nested_tokens[0]}")

            self._add_child(nested_tokens)

    def pretty(
        self, *, indent: int = 2, show_text: bool = False, _current: int = 0
    ) -> str:
        """Create an XML style string of the tree.

        `indent` is the number of spaces added per tree level. If
        `show_text` is True, the content of "text" and "text_special" nodes
        is printed, indented, below their tag. `_current` is the indentation
        of this node and is used by the recursion.
        """
        prefix = " " * _current
        text = prefix + f"<{self.type}"
        # The root has no backing token, so it has no attrs/content to read.
        if not self.is_root and self.attrs:
            text += " " + " ".join(f"{k}={v!r}" for k, v in self.attrs.items())
        text += ">"
        if (
            show_text
            and not self.is_root
            and self.type in ("text", "text_special")
            and self.content
        ):
            text += "\n" + textwrap.indent(self.content, prefix + " " * indent)
        for child in self.children:
            text += "\n" + child.pretty(
                indent=indent, show_text=show_text, _current=_current + indent
            )
        return text

    def walk(
        self: _NodeType, *, include_self: bool = True
    ) -> Generator[_NodeType, None, None]:
        """Recursively yield all descendant nodes in the tree starting at self.

        The order mimics the order of the underlying linear token
        stream (i.e. depth first). `self` is yielded first unless
        `include_self` is False.
        """
        if include_self:
            yield self
        for child in self.children:
            yield from child.walk(include_self=True)

    # NOTE:
    # The values of the properties defined below directly map to properties
    # of the underlying `Token`s. A root node does not translate to a `Token`
    # object, so calling these property getters on a root node will raise an
    # `AttributeError`.
    #
    # There is no mapping for `Token.nesting` because the `is_nested` property
    # provides that data, and can be called on any node type, including root.

    def _attribute_token(self) -> Token:
        """Return the `Token` that is used as the data source for the
        properties defined below.

        This is the node's own token for unnested nodes and the opening
        token for containers. Raises `AttributeError` for a root node.
        """
        if self.token:
            return self.token
        if self.nester_tokens:
            return self.nester_tokens.opening
        raise AttributeError("Root node does not have the accessed attribute")

    @property
    def tag(self) -> str:
        """HTML tag name, e.g. "p"."""
        return self._attribute_token().tag

    @property
    def attrs(self) -> dict[str, str | int | float]:
        """HTML attributes."""
        return self._attribute_token().attrs

    def attrGet(self, name: str) -> None | str | int | float:
        """Get the value of attribute `name`, or `None` if it does not exist."""
        return self._attribute_token().attrGet(name)

    @property
    def map(self) -> tuple[int, int] | None:
        """Source map info. Format: `tuple[ line_begin, line_end ]`

        Returns `None` when the underlying token has no (or an empty) map.
        """
        map_ = self._attribute_token().map
        if map_:
            # Type ignore because `Token`s attribute types are not perfect
            return tuple(map_)  # type: ignore
        return None

    @property
    def level(self) -> int:
        """Nesting level, the same as `state.level`."""
        return self._attribute_token().level

    @property
    def content(self) -> str:
        """In a case of self-closing tag (code, html, fence, etc.), it
        has contents of this tag."""
        return self._attribute_token().content

    @property
    def markup(self) -> str:
        """'*' or '_' for emphasis, fence string for fence, etc."""
        return self._attribute_token().markup

    @property
    def info(self) -> str:
        """Fence infostring."""
        return self._attribute_token().info

    @property
    def meta(self) -> dict[Any, Any]:
        """A place for plugins to store arbitrary data."""
        return self._attribute_token().meta

    @property
    def block(self) -> bool:
        """True for block-level tokens, false for inline tokens."""
        return self._attribute_token().block

    @property
    def hidden(self) -> bool:
        """If it's true, ignore this element when rendering.

        Used for tight lists to hide paragraphs.
        """
        return self._attribute_token().hidden
def _removesuffix(string: str, suffix: str) -> str:
"""Remove a suffix from a string.
Replace this with str.removesuffix() from stdlib when minimum Python
version is 3.9.
"""
if suffix and string.endswith(suffix):
return string[: -len(suffix)]
return string
|