# tree.py
"""A tree representation of a linear markdown-it token stream.

This module is not part of upstream JavaScript markdown-it.
"""
from __future__ import annotations

import textwrap
from collections.abc import Generator, Sequence
from typing import Any, NamedTuple, TypeVar, overload

from .token import Token
class _NesterTokens(NamedTuple):
    """An "_open"/"_close" `Token` pair that delimits one container node."""

    # The "_open" token (nesting == 1)
    opening: Token
    # The matching "_close" token (nesting == -1)
    closing: Token
# Type variable bound to SyntaxTreeNode so that tree-navigation methods
# (children, siblings, walk, ...) return the caller's own subclass type.
_NodeType = TypeVar("_NodeType", bound="SyntaxTreeNode")
class SyntaxTreeNode:
    """A Markdown syntax tree node.

    A class that can be used to construct a tree representation of a linear
    `markdown-it-py` token stream.

    Each node in the tree represents either:

    - root of the Markdown document
    - a single unnested `Token`
    - a `Token` "_open" and "_close" token pair, and the tokens nested in
        between
    """

    def __init__(
        self, tokens: Sequence[Token] = (), *, create_root: bool = True
    ) -> None:
        """Initialize a `SyntaxTreeNode` from a token stream.

        If `create_root` is True, create a root node for the document.

        :param tokens: a linear markdown-it token stream. For a non-root
            node this must be either a single unnested token or a balanced
            "_open" ... "_close" span.
        :param create_root: whether ``self`` becomes the document root.
        :raises ValueError: if the token stream is empty (non-root), or its
            nesting is unbalanced.
        """
        # Only nodes representing an unnested token have self.token
        self.token: Token | None = None
        # Only containers have nester tokens
        self.nester_tokens: _NesterTokens | None = None
        # Root node does not have self.parent
        self._parent: Any = None
        # Empty list unless a non-empty container, or unnested token that has
        # children (i.e. inline or img)
        self._children: list[Any] = []

        if create_root:
            self._set_children_from_tokens(tokens)
            return

        if not tokens:
            raise ValueError(
                "Can only create root from empty token sequence."
                " Set `create_root=True`."
            )
        elif len(tokens) == 1:
            # A single token: it must be unnested (nesting == 0); its own
            # `children` (e.g. an "inline" token's) become this node's children.
            inline_token = tokens[0]
            if inline_token.nesting:
                raise ValueError(
                    "Unequal nesting level at the start and end of token stream."
                )
            self.token = inline_token
            if inline_token.children:
                self._set_children_from_tokens(inline_token.children)
        else:
            # First and last tokens are the "_open"/"_close" pair; everything
            # in between becomes the subtree.
            self.nester_tokens = _NesterTokens(tokens[0], tokens[-1])
            self._set_children_from_tokens(tokens[1:-1])

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self.type})"

    @overload
    def __getitem__(self: _NodeType, item: int) -> _NodeType: ...

    @overload
    def __getitem__(self: _NodeType, item: slice) -> list[_NodeType]: ...

    def __getitem__(self: _NodeType, item: int | slice) -> _NodeType | list[_NodeType]:
        # Index/slice directly into the child list.
        return self.children[item]

    def to_tokens(self: _NodeType) -> list[Token]:
        """Recover the linear token stream."""

        def recursive_collect_tokens(node: _NodeType, token_list: list[Token]) -> None:
            # Root emits no token of its own; an unnested node emits its
            # single token; a container emits opening, children, closing.
            if node.type == "root":
                for child in node.children:
                    recursive_collect_tokens(child, token_list)
            elif node.token:
                token_list.append(node.token)
            else:
                assert node.nester_tokens
                token_list.append(node.nester_tokens.opening)
                for child in node.children:
                    recursive_collect_tokens(child, token_list)
                token_list.append(node.nester_tokens.closing)

        tokens: list[Token] = []
        recursive_collect_tokens(self, tokens)
        return tokens

    @property
    def children(self: _NodeType) -> list[_NodeType]:
        # NOTE: the setter replaces the list wholesale; it does not update
        # the `parent` of the assigned nodes.
        return self._children

    @children.setter
    def children(self: _NodeType, value: list[_NodeType]) -> None:
        self._children = value

    @property
    def parent(self: _NodeType) -> _NodeType | None:
        return self._parent  # type: ignore

    @parent.setter
    def parent(self: _NodeType, value: _NodeType | None) -> None:
        self._parent = value

    @property
    def is_root(self) -> bool:
        """Is the node a special root node?"""
        # Root is the only node kind with neither a token nor a token pair.
        return not (self.token or self.nester_tokens)

    @property
    def is_nested(self) -> bool:
        """Is this node nested?.

        Returns `True` if the node represents a `Token` pair and tokens in the
        sequence between them, where `Token.nesting` of the first `Token` in
        the pair is 1 and nesting of the other `Token` is -1.
        """
        return bool(self.nester_tokens)

    @property
    def siblings(self: _NodeType) -> Sequence[_NodeType]:
        """Get siblings of the node.

        Gets the whole group of siblings, including self.
        """
        # A parentless node (root) is its own sibling group.
        if not self.parent:
            return [self]
        return self.parent.children

    @property
    def type(self) -> str:
        """Get a string type of the represented syntax.

        - "root" for root nodes
        - `Token.type` if the node represents an unnested token
        - `Token.type` of the opening token, with "_open" suffix stripped, if
            the node represents a nester token pair
        """
        if self.is_root:
            return "root"
        if self.token:
            return self.token.type
        assert self.nester_tokens
        return self.nester_tokens.opening.type.removesuffix("_open")

    @property
    def next_sibling(self: _NodeType) -> _NodeType | None:
        """Get the next node in the sequence of siblings.

        Returns `None` if this is the last sibling.
        """
        # list.index uses identity here (no __eq__ defined), so this finds
        # this exact node.
        self_index = self.siblings.index(self)
        if self_index + 1 < len(self.siblings):
            return self.siblings[self_index + 1]
        return None

    @property
    def previous_sibling(self: _NodeType) -> _NodeType | None:
        """Get the previous node in the sequence of siblings.

        Returns `None` if this is the first sibling.
        """
        self_index = self.siblings.index(self)
        if self_index - 1 >= 0:
            return self.siblings[self_index - 1]
        return None

    def _add_child(
        self,
        tokens: Sequence[Token],
    ) -> None:
        """Make a child node for `self`."""
        # type(self) keeps subclasses homogeneous throughout the tree.
        child = type(self)(tokens, create_root=False)
        child.parent = self
        self.children.append(child)

    def _set_children_from_tokens(self, tokens: Sequence[Token]) -> None:
        """Convert the token stream to a tree structure and set the resulting
        nodes as children of `self`.

        :raises ValueError: on invalid nesting (a "_close" with no matching
            "_open", or an "_open" that is never closed).
        """
        # Reverse once so each token can be consumed with an O(1) pop().
        reversed_tokens = list(reversed(tokens))
        while reversed_tokens:
            token = reversed_tokens.pop()
            if not token.nesting:
                self._add_child([token])
                continue
            if token.nesting != 1:
                raise ValueError("Invalid token nesting")
            # Collect tokens until the nesting counter returns to zero,
            # i.e. until the matching "_close" token is found.
            nested_tokens = [token]
            nesting = 1
            while reversed_tokens and nesting:
                token = reversed_tokens.pop()
                nested_tokens.append(token)
                nesting += token.nesting
            if nesting:
                raise ValueError(f"unclosed tokens starting {nested_tokens[0]}")
            self._add_child(nested_tokens)

    def pretty(
        self, *, indent: int = 2, show_text: bool = False, _current: int = 0
    ) -> str:
        """Create an XML style string of the tree.

        :param indent: number of spaces per nesting level.
        :param show_text: also render the content of "text"/"text_special"
            nodes.
        :param _current: internal recursion state — current indent width.
        """
        prefix = " " * _current
        text = prefix + f"<{self.type}"
        if not self.is_root and self.attrs:
            text += " " + " ".join(f"{k}={v!r}" for k, v in self.attrs.items())
        text += ">"
        if (
            show_text
            and not self.is_root
            and self.type in ("text", "text_special")
            and self.content
        ):
            text += "\n" + textwrap.indent(self.content, prefix + " " * indent)
        for child in self.children:
            text += "\n" + child.pretty(
                indent=indent, show_text=show_text, _current=_current + indent
            )
        return text

    def walk(
        self: _NodeType, *, include_self: bool = True
    ) -> Generator[_NodeType, None, None]:
        """Recursively yield all descendant nodes in the tree starting at self.

        The order mimics the order of the underlying linear token
        stream (i.e. depth first).
        """
        if include_self:
            yield self
        for child in self.children:
            yield from child.walk(include_self=True)

    # NOTE:
    # The values of the properties defined below directly map to properties
    # of the underlying `Token`s. A root node does not translate to a `Token`
    # object, so calling these property getters on a root node will raise an
    # `AttributeError`.
    #
    # There is no mapping for `Token.nesting` because the `is_nested` property
    # provides that data, and can be called on any node type, including root.

    def _attribute_token(self) -> Token:
        """Return the `Token` that is used as the data source for the
        properties defined below.

        :raises AttributeError: if called on a root node, which has no token.
        """
        if self.token:
            return self.token
        if self.nester_tokens:
            return self.nester_tokens.opening
        raise AttributeError("Root node does not have the accessed attribute")

    @property
    def tag(self) -> str:
        """html tag name, e.g. \"p\" """
        return self._attribute_token().tag

    @property
    def attrs(self) -> dict[str, str | int | float]:
        """Html attributes."""
        return self._attribute_token().attrs

    def attrGet(self, name: str) -> None | str | int | float:
        """Get the value of attribute `name`, or null if it does not exist."""
        return self._attribute_token().attrGet(name)

    @property
    def map(self) -> tuple[int, int] | None:
        """Source map info. Format: `tuple[ line_begin, line_end ]`"""
        map_ = self._attribute_token().map
        if map_:
            # Type ignore because `Token`s attribute types are not perfect
            return tuple(map_)  # type: ignore
        return None

    @property
    def level(self) -> int:
        """nesting level, the same as `state.level`"""
        return self._attribute_token().level

    @property
    def content(self) -> str:
        """In a case of self-closing tag (code, html, fence, etc.), it
        has contents of this tag."""
        return self._attribute_token().content

    @property
    def markup(self) -> str:
        """'*' or '_' for emphasis, fence string for fence, etc."""
        return self._attribute_token().markup

    @property
    def info(self) -> str:
        """fence infostring"""
        return self._attribute_token().info

    @property
    def meta(self) -> dict[Any, Any]:
        """A place for plugins to store an arbitrary data."""
        return self._attribute_token().meta

    @property
    def block(self) -> bool:
        """True for block-level tokens, false for inline tokens."""
        return self._attribute_token().block

    @property
    def hidden(self) -> bool:
        """If it's true, ignore this element when rendering.
        Used for tight lists to hide paragraphs."""
        return self._attribute_token().hidden