utils.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313
  1. """Utilities for parsing source text"""
  2. from __future__ import annotations
  3. import re
  4. from re import Match
  5. from typing import TypeVar
  6. import unicodedata
  7. from .entities import entities
  8. def charCodeAt(src: str, pos: int) -> int | None:
  9. """
  10. Returns the Unicode value of the character at the specified location.
  11. @param - index The zero-based index of the desired character.
  12. If there is no character at the specified index, NaN is returned.
  13. This was added for compatibility with python
  14. """
  15. try:
  16. return ord(src[pos])
  17. except IndexError:
  18. return None
  19. def charStrAt(src: str, pos: int) -> str | None:
  20. """
  21. Returns the Unicode value of the character at the specified location.
  22. @param - index The zero-based index of the desired character.
  23. If there is no character at the specified index, NaN is returned.
  24. This was added for compatibility with python
  25. """
  26. try:
  27. return src[pos]
  28. except IndexError:
  29. return None
  30. _ItemTV = TypeVar("_ItemTV")
  31. def arrayReplaceAt(
  32. src: list[_ItemTV], pos: int, newElements: list[_ItemTV]
  33. ) -> list[_ItemTV]:
  34. """
  35. Remove element from array and put another array at those position.
  36. Useful for some operations with tokens
  37. """
  38. return src[:pos] + newElements + src[pos + 1 :]
  39. def isValidEntityCode(c: int) -> bool:
  40. # broken sequence
  41. if c >= 0xD800 and c <= 0xDFFF:
  42. return False
  43. # never used
  44. if c >= 0xFDD0 and c <= 0xFDEF:
  45. return False
  46. if ((c & 0xFFFF) == 0xFFFF) or ((c & 0xFFFF) == 0xFFFE):
  47. return False
  48. # control codes
  49. if c >= 0x00 and c <= 0x08:
  50. return False
  51. if c == 0x0B:
  52. return False
  53. if c >= 0x0E and c <= 0x1F:
  54. return False
  55. if c >= 0x7F and c <= 0x9F:
  56. return False
  57. # out of range
  58. return not (c > 0x10FFFF)
  59. def fromCodePoint(c: int) -> str:
  60. """Convert ordinal to unicode.
  61. Note, in the original Javascript two string characters were required,
  62. for codepoints larger than `0xFFFF`.
  63. But Python 3 can represent any unicode codepoint in one character.
  64. """
  65. return chr(c)
  66. # UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
  67. # ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
  68. UNESCAPE_ALL_RE = re.compile(
  69. r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
  70. re.IGNORECASE,
  71. )
  72. DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})")
  73. DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", re.IGNORECASE)
  74. def replaceEntityPattern(match: str, name: str) -> str:
  75. """Convert HTML entity patterns,
  76. see https://spec.commonmark.org/0.30/#entity-references
  77. """
  78. if name in entities:
  79. return entities[name]
  80. code: None | int = None
  81. if pat := DIGITAL_ENTITY_BASE10_RE.fullmatch(name):
  82. code = int(pat.group(1), 10)
  83. elif pat := DIGITAL_ENTITY_BASE16_RE.fullmatch(name):
  84. code = int(pat.group(1), 16)
  85. if code is not None and isValidEntityCode(code):
  86. return fromCodePoint(code)
  87. return match
  88. def unescapeAll(string: str) -> str:
  89. def replacer_func(match: Match[str]) -> str:
  90. escaped = match.group(1)
  91. if escaped:
  92. return escaped
  93. entity = match.group(2)
  94. return replaceEntityPattern(match.group(), entity)
  95. if "\\" not in string and "&" not in string:
  96. return string
  97. return UNESCAPE_ALL_RE.sub(replacer_func, string)
  98. ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""
  99. ESCAPE_CHAR = re.compile(r"\\([" + ESCAPABLE + r"])")
  100. def stripEscape(string: str) -> str:
  101. """Strip escape \\ characters"""
  102. return ESCAPE_CHAR.sub(r"\1", string)
  103. def escapeHtml(raw: str) -> str:
  104. """Replace special characters "&", "<", ">" and '"' to HTML-safe sequences."""
  105. # like html.escape, but without escaping single quotes
  106. raw = raw.replace("&", "&amp;") # Must be done first!
  107. raw = raw.replace("<", "&lt;")
  108. raw = raw.replace(">", "&gt;")
  109. raw = raw.replace('"', "&quot;")
  110. return raw
  111. # //////////////////////////////////////////////////////////////////////////////
  112. REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]")
  113. def escapeRE(string: str) -> str:
  114. string = REGEXP_ESCAPE_RE.sub("\\$&", string)
  115. return string
  116. # //////////////////////////////////////////////////////////////////////////////
  117. def isSpace(code: int | None) -> bool:
  118. """Check if character code is a whitespace."""
  119. return code in (0x09, 0x20)
  120. def isStrSpace(ch: str | None) -> bool:
  121. """Check if character is a whitespace."""
  122. return ch in ("\t", " ")
  123. MD_WHITESPACE = {
  124. 0x09, # \t
  125. 0x0A, # \n
  126. 0x0B, # \v
  127. 0x0C, # \f
  128. 0x0D, # \r
  129. 0x20, # space
  130. 0xA0,
  131. 0x1680,
  132. 0x202F,
  133. 0x205F,
  134. 0x3000,
  135. }
  136. def isWhiteSpace(code: int) -> bool:
  137. r"""Zs (unicode class) || [\t\f\v\r\n]"""
  138. if code >= 0x2000 and code <= 0x200A:
  139. return True
  140. return code in MD_WHITESPACE
  141. # //////////////////////////////////////////////////////////////////////////////
  142. def isPunctChar(ch: str) -> bool:
  143. """Check if character is a punctuation character."""
  144. return unicodedata.category(ch).startswith(("P", "S"))
  145. MD_ASCII_PUNCT = {
  146. 0x21, # /* ! */
  147. 0x22, # /* " */
  148. 0x23, # /* # */
  149. 0x24, # /* $ */
  150. 0x25, # /* % */
  151. 0x26, # /* & */
  152. 0x27, # /* ' */
  153. 0x28, # /* ( */
  154. 0x29, # /* ) */
  155. 0x2A, # /* * */
  156. 0x2B, # /* + */
  157. 0x2C, # /* , */
  158. 0x2D, # /* - */
  159. 0x2E, # /* . */
  160. 0x2F, # /* / */
  161. 0x3A, # /* : */
  162. 0x3B, # /* ; */
  163. 0x3C, # /* < */
  164. 0x3D, # /* = */
  165. 0x3E, # /* > */
  166. 0x3F, # /* ? */
  167. 0x40, # /* @ */
  168. 0x5B, # /* [ */
  169. 0x5C, # /* \ */
  170. 0x5D, # /* ] */
  171. 0x5E, # /* ^ */
  172. 0x5F, # /* _ */
  173. 0x60, # /* ` */
  174. 0x7B, # /* { */
  175. 0x7C, # /* | */
  176. 0x7D, # /* } */
  177. 0x7E, # /* ~ */
  178. }
  179. def isMdAsciiPunct(ch: int) -> bool:
  180. """Markdown ASCII punctuation characters.
  181. ::
  182. !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \\, ], ^, _, `, {, |, }, or ~
  183. See http://spec.commonmark.org/0.15/#ascii-punctuation-character
  184. Don't confuse with unicode punctuation !!! It lacks some chars in ascii range.
  185. """
  186. return ch in MD_ASCII_PUNCT
  187. def normalizeReference(string: str) -> str:
  188. """Helper to unify [reference labels]."""
  189. # Trim and collapse whitespace
  190. #
  191. string = re.sub(r"\s+", " ", string.strip())
  192. # In node v10 'ẞ'.toLowerCase() === 'Ṿ', which is presumed to be a bug
  193. # fixed in v12 (couldn't find any details).
  194. #
  195. # So treat this one as a special case
  196. # (remove this when node v10 is no longer supported).
  197. #
  198. # if ('ẞ'.toLowerCase() === 'Ṿ') {
  199. # str = str.replace(/ẞ/g, 'ß')
  200. # }
  201. # .toLowerCase().toUpperCase() should get rid of all differences
  202. # between letter variants.
  203. #
  204. # Simple .toLowerCase() doesn't normalize 125 code points correctly,
  205. # and .toUpperCase doesn't normalize 6 of them (list of exceptions:
  206. # İ, ϴ, ẞ, Ω, K, Å - those are already uppercased, but have differently
  207. # uppercased versions).
  208. #
  209. # Here's an example showing how it happens. Lets take greek letter omega:
  210. # uppercase U+0398 (Θ), U+03f4 (ϴ) and lowercase U+03b8 (θ), U+03d1 (ϑ)
  211. #
  212. # Unicode entries:
  213. # 0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8
  214. # 03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398
  215. # 03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398
  216. # 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8
  217. #
  218. # Case-insensitive comparison should treat all of them as equivalent.
  219. #
  220. # But .toLowerCase() doesn't change ϑ (it's already lowercase),
  221. # and .toUpperCase() doesn't change ϴ (already uppercase).
  222. #
  223. # Applying first lower then upper case normalizes any character:
  224. # '\u0398\u03f4\u03b8\u03d1'.toLowerCase().toUpperCase() === '\u0398\u0398\u0398\u0398'
  225. #
  226. # Note: this is equivalent to unicode case folding; unicode normalization
  227. # is a different step that is not required here.
  228. #
  229. # Final result should be uppercased, because it's later stored in an object
  230. # (this avoid a conflict with Object.prototype members,
  231. # most notably, `__proto__`)
  232. #
  233. return string.lower().upper()
  234. LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
  235. LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)
  236. def isLinkOpen(string: str) -> bool:
  237. return bool(LINK_OPEN_RE.search(string))
  238. def isLinkClose(string: str) -> bool:
  239. return bool(LINK_CLOSE_RE.search(string))