syntax.py 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822
  1. from .exceptions import EmailSyntaxError
  2. from .types import ValidatedEmail
  3. from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \
  4. DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_DOT_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \
  5. DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS
  6. import re
  7. import unicodedata
  8. import idna # implements IDNA 2008; Python's codec is only IDNA 2003
  9. import ipaddress
  10. from typing import Optional, Tuple, TypedDict, Union
  11. def split_email(email: str) -> Tuple[Optional[str], str, str, bool]:
  12. # Return the display name, unescaped local part, and domain part
  13. # of the address, and whether the local part was quoted. If no
  14. # display name was present and angle brackets do not surround
  15. # the address, display name will be None; otherwise, it will be
  16. # set to the display name or the empty string if there were
  17. # angle brackets but no display name.
  18. # Typical email addresses have a single @-sign and no quote
  19. # characters, but the awkward "quoted string" local part form
  20. # (RFC 5321 4.1.2) allows @-signs and escaped quotes to appear
  21. # in the local part if the local part is quoted.
  22. # A `display name <addr>` format is also present in MIME messages
  23. # (RFC 5322 3.4) and this format is also often recognized in
  24. # mail UIs. It's not allowed in SMTP commands or in typical web
  25. # login forms, but parsing it has been requested, so it's done
  26. # here as a convenience. It's implemented in the spirit but not
  27. # the letter of RFC 5322 3.4 because MIME messages allow newlines
  28. # and comments as a part of the CFWS rule, but this is typically
  29. # not allowed in mail UIs (although comment syntax was requested
  30. # once too).
  31. #
  32. # Display names are either basic characters (the same basic characters
  33. # permitted in email addresses, but periods are not allowed and spaces
  34. # are allowed; see RFC 5322 Appendix A.1.2), or or a quoted string with
  35. # the same rules as a quoted local part. (Multiple quoted strings might
  36. # be allowed? Unclear.) Optional space (RFC 5322 3.4 CFWS) and then the
  37. # email address follows in angle brackets.
  38. #
  39. # An initial quote is ambiguous between starting a display name or
  40. # a quoted local part --- fun.
  41. #
  42. # We assume the input string is already stripped of leading and
  43. # trailing CFWS.
  44. def split_string_at_unquoted_special(text: str, specials: Tuple[str, ...]) -> Tuple[str, str]:
  45. # Split the string at the first character in specials (an @-sign
  46. # or left angle bracket) that does not occur within quotes and
  47. # is not followed by a Unicode combining character.
  48. # If no special character is found, raise an error.
  49. inside_quote = False
  50. escaped = False
  51. left_part = ""
  52. for i, c in enumerate(text):
  53. # < plus U+0338 (Combining Long Solidus Overlay) normalizes to
  54. # ≮ U+226E (Not Less-Than), and it would be confusing to treat
  55. # the < as the start of "<email>" syntax in that case. Likewise,
  56. # if anything combines with an @ or ", we should probably not
  57. # treat it as a special character.
  58. if unicodedata.normalize("NFC", text[i:])[0] != c:
  59. left_part += c
  60. elif inside_quote:
  61. left_part += c
  62. if c == '\\' and not escaped:
  63. escaped = True
  64. elif c == '"' and not escaped:
  65. # The only way to exit the quote is an unescaped quote.
  66. inside_quote = False
  67. escaped = False
  68. else:
  69. escaped = False
  70. elif c == '"':
  71. left_part += c
  72. inside_quote = True
  73. elif c in specials:
  74. # When unquoted, stop before a special character.
  75. break
  76. else:
  77. left_part += c
  78. # No special symbol found. The special symbols always
  79. # include an at-sign, so this always indicates a missing
  80. # at-sign. The other symbol is optional.
  81. if len(left_part) == len(text):
  82. # The full-width at-sign might occur in CJK contexts.
  83. # We can't accept it because we only accept addresess
  84. # that are actually valid. But if this is common we
  85. # may want to consider accepting and normalizing full-
  86. # width characters for the other special symbols (and
  87. # full-width dot is already accepted in internationalized
  88. # domains) with a new option.
  89. # See https://news.ycombinator.com/item?id=42235268.
  90. if "@" in text:
  91. raise EmailSyntaxError("The email address has the \"full-width\" at-sign (@) character instead of a regular at-sign.")
  92. # Check another near-homoglyph for good measure because
  93. # homoglyphs in place of required characters could be
  94. # very confusing. We may want to consider checking for
  95. # homoglyphs anywhere we look for a special symbol.
  96. if "﹫" in text:
  97. raise EmailSyntaxError('The email address has the "small commercial at" character instead of a regular at-sign.')
  98. raise EmailSyntaxError("An email address must have an @-sign.")
  99. # The right part is whatever is left.
  100. right_part = text[len(left_part):]
  101. return left_part, right_part
  102. def unquote_quoted_string(text: str) -> Tuple[str, bool]:
  103. # Remove surrounding quotes and unescape escaped backslashes
  104. # and quotes. Escapes are parsed liberally. I think only
  105. # backslashes and quotes can be escaped but we'll allow anything
  106. # to be.
  107. quoted = False
  108. escaped = False
  109. value = ""
  110. for i, c in enumerate(text):
  111. if quoted:
  112. if escaped:
  113. value += c
  114. escaped = False
  115. elif c == '\\':
  116. escaped = True
  117. elif c == '"':
  118. if i != len(text) - 1:
  119. raise EmailSyntaxError("Extra character(s) found after close quote: "
  120. + ", ".join(safe_character_display(c) for c in text[i + 1:]))
  121. break
  122. else:
  123. value += c
  124. elif i == 0 and c == '"':
  125. quoted = True
  126. else:
  127. value += c
  128. return value, quoted
  129. # Split the string at the first unquoted @-sign or left angle bracket.
  130. left_part, right_part = split_string_at_unquoted_special(email, ("@", "<"))
  131. # If the right part starts with an angle bracket,
  132. # then the left part is a display name and the rest
  133. # of the right part up to the final right angle bracket
  134. # is the email address, .
  135. if right_part.startswith("<"):
  136. # Remove space between the display name and angle bracket.
  137. left_part = left_part.rstrip()
  138. # Unquote and unescape the display name.
  139. display_name, display_name_quoted = unquote_quoted_string(left_part)
  140. # Check that only basic characters are present in a
  141. # non-quoted display name.
  142. if not display_name_quoted:
  143. bad_chars = {
  144. safe_character_display(c)
  145. for c in display_name
  146. if (not ATEXT_RE.match(c) and c != ' ') or c == '.'
  147. }
  148. if bad_chars:
  149. raise EmailSyntaxError("The display name contains invalid characters when not quoted: " + ", ".join(sorted(bad_chars)) + ".")
  150. # Check for other unsafe characters.
  151. check_unsafe_chars(display_name, allow_space=True)
  152. # Check that the right part ends with an angle bracket
  153. # but allow spaces after it, I guess.
  154. if ">" not in right_part:
  155. raise EmailSyntaxError("An open angle bracket at the start of the email address has to be followed by a close angle bracket at the end.")
  156. right_part = right_part.rstrip(" ")
  157. if right_part[-1] != ">":
  158. raise EmailSyntaxError("There can't be anything after the email address.")
  159. # Remove the initial and trailing angle brackets.
  160. addr_spec = right_part[1:].rstrip(">")
  161. # Split the email address at the first unquoted @-sign.
  162. local_part, domain_part = split_string_at_unquoted_special(addr_spec, ("@",))
  163. # Otherwise there is no display name. The left part is the local
  164. # part and the right part is the domain.
  165. else:
  166. display_name = None
  167. local_part, domain_part = left_part, right_part
  168. if domain_part.startswith("@"):
  169. domain_part = domain_part[1:]
  170. # Unquote the local part if it is quoted.
  171. local_part, is_quoted_local_part = unquote_quoted_string(local_part)
  172. return display_name, local_part, domain_part, is_quoted_local_part
  173. def get_length_reason(addr: str, limit: int) -> str:
  174. """Helper function to return an error message related to invalid length."""
  175. diff = len(addr) - limit
  176. suffix = "s" if diff > 1 else ""
  177. return f"({diff} character{suffix} too many)"
  178. def safe_character_display(c: str) -> str:
  179. # Return safely displayable characters in quotes.
  180. if c == '\\':
  181. return f"\"{c}\"" # can't use repr because it escapes it
  182. if unicodedata.category(c)[0] in ("L", "N", "P", "S"):
  183. return repr(c)
  184. # Construct a hex string in case the unicode name doesn't exist.
  185. if ord(c) < 0xFFFF:
  186. h = f"U+{ord(c):04x}".upper()
  187. else:
  188. h = f"U+{ord(c):08x}".upper()
  189. # Return the character name or, if it has no name, the hex string.
  190. return unicodedata.name(c, h)
class LocalPartValidationResult(TypedDict):
    """Dictionary shape returned by validate_email_local_part."""
    # The validated local part; re-quoted and escaped when it is only
    # valid in quoted-string form.
    local_part: str
    # The same string as local_part when SMTPUTF8 is not required,
    # otherwise None.
    ascii_local_part: Optional[str]
    # Whether the local part requires the SMTPUTF8 extension.
    smtputf8: bool
  195. def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False,
  196. quoted_local_part: bool = False, strict: bool = False) -> LocalPartValidationResult:
  197. """Validates the syntax of the local part of an email address."""
  198. if len(local) == 0:
  199. if not allow_empty_local:
  200. raise EmailSyntaxError("There must be something before the @-sign.")
  201. # The caller allows an empty local part. Useful for validating certain
  202. # Postfix aliases.
  203. return {
  204. "local_part": local,
  205. "ascii_local_part": local,
  206. "smtputf8": False,
  207. }
  208. # Check the length of the local part by counting characters.
  209. # (RFC 5321 4.5.3.1.1)
  210. # We're checking the number of characters here. If the local part
  211. # is ASCII-only, then that's the same as bytes (octets). If it's
  212. # internationalized, then the UTF-8 encoding may be longer, but
  213. # that may not be relevant. We will check the total address length
  214. # instead.
  215. if strict and len(local) > LOCAL_PART_MAX_LENGTH:
  216. reason = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH)
  217. raise EmailSyntaxError(f"The email address is too long before the @-sign {reason}.")
  218. # Check the local part against the non-internationalized regular expression.
  219. # Most email addresses match this regex so it's probably fastest to check this first.
  220. # (RFC 5322 3.2.3)
  221. # All local parts matching the dot-atom rule are also valid as a quoted string
  222. # so if it was originally quoted (quoted_local_part is True) and this regex matches,
  223. # it's ok.
  224. # (RFC 5321 4.1.2 / RFC 5322 3.2.4).
  225. if DOT_ATOM_TEXT.match(local):
  226. # It's valid. And since it's just the permitted ASCII characters,
  227. # it's normalized and safe. If the local part was originally quoted,
  228. # the quoting was unnecessary and it'll be returned as normalized to
  229. # non-quoted form.
  230. # Return the local part and flag that SMTPUTF8 is not needed.
  231. return {
  232. "local_part": local,
  233. "ascii_local_part": local,
  234. "smtputf8": False,
  235. }
  236. # The local part failed the basic dot-atom check. Try the extended character set
  237. # for internationalized addresses. It's the same pattern but with additional
  238. # characters permitted.
  239. # RFC 6531 section 3.3.
  240. valid: Optional[str] = None
  241. requires_smtputf8 = False
  242. if DOT_ATOM_TEXT_INTL.match(local):
  243. # But international characters in the local part may not be permitted.
  244. if not allow_smtputf8:
  245. # Check for invalid characters against the non-internationalized
  246. # permitted character set.
  247. # (RFC 5322 3.2.3)
  248. bad_chars = {
  249. safe_character_display(c)
  250. for c in local
  251. if not ATEXT_RE.match(c)
  252. }
  253. if bad_chars:
  254. raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(bad_chars)) + ".")
  255. # Although the check above should always find something, fall back to this just in case.
  256. raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.")
  257. # It's valid.
  258. valid = "dot-atom"
  259. requires_smtputf8 = True
  260. # There are no dot-atom syntax restrictions on quoted local parts, so
  261. # if it was originally quoted, it is probably valid. More characters
  262. # are allowed, like @-signs, spaces, and quotes, and there are no
  263. # restrictions on the placement of dots, as in dot-atom local parts.
  264. elif quoted_local_part:
  265. # Check for invalid characters in a quoted string local part.
  266. # (RFC 5321 4.1.2. RFC 5322 lists additional permitted *obsolete*
  267. # characters which are *not* allowed here. RFC 6531 section 3.3
  268. # extends the range to UTF8 strings.)
  269. bad_chars = {
  270. safe_character_display(c)
  271. for c in local
  272. if not QTEXT_INTL.match(c)
  273. }
  274. if bad_chars:
  275. raise EmailSyntaxError("The email address contains invalid characters in quotes before the @-sign: " + ", ".join(sorted(bad_chars)) + ".")
  276. # See if any characters are outside of the ASCII range.
  277. bad_chars = {
  278. safe_character_display(c)
  279. for c in local
  280. if not (32 <= ord(c) <= 126)
  281. }
  282. if bad_chars:
  283. requires_smtputf8 = True
  284. # International characters in the local part may not be permitted.
  285. if not allow_smtputf8:
  286. raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(bad_chars)) + ".")
  287. # It's valid.
  288. valid = "quoted"
  289. # If the local part matches the internationalized dot-atom form or was quoted,
  290. # perform additional checks for Unicode strings.
  291. if valid:
  292. # Check that the local part is a valid, safe, and sensible Unicode string.
  293. # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked
  294. # by DOT_ATOM_TEXT_INTL and QTEXT_INTL. Other characters may be permitted by the
  295. # email specs, but they may not be valid, safe, or sensible Unicode strings.
  296. # See the function for rationale.
  297. check_unsafe_chars(local, allow_space=(valid == "quoted"))
  298. # Try encoding to UTF-8. Failure is possible with some characters like
  299. # surrogate code points, but those are checked above. Still, we don't
  300. # want to have an unhandled exception later.
  301. try:
  302. local.encode("utf8")
  303. except ValueError as e:
  304. raise EmailSyntaxError("The email address contains an invalid character.") from e
  305. # If this address passes only by the quoted string form, re-quote it
  306. # and backslash-escape quotes and backslashes (removing any unnecessary
  307. # escapes). Per RFC 5321 4.1.2, "all quoted forms MUST be treated as equivalent,
  308. # and the sending system SHOULD transmit the form that uses the minimum quoting possible."
  309. if valid == "quoted":
  310. local = '"' + re.sub(r'(["\\])', r'\\\1', local) + '"'
  311. return {
  312. "local_part": local,
  313. "ascii_local_part": local if not requires_smtputf8 else None,
  314. "smtputf8": requires_smtputf8,
  315. }
  316. # It's not a valid local part. Let's find out why.
  317. # (Since quoted local parts are all valid or handled above, these checks
  318. # don't apply in those cases.)
  319. # Check for invalid characters.
  320. # (RFC 5322 3.2.3, plus RFC 6531 3.3)
  321. bad_chars = {
  322. safe_character_display(c)
  323. for c in local
  324. if not ATEXT_INTL_DOT_RE.match(c)
  325. }
  326. if bad_chars:
  327. raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".")
  328. # Check for dot errors imposted by the dot-atom rule.
  329. # (RFC 5322 3.2.3)
  330. check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False)
  331. # All of the reasons should already have been checked, but just in case
  332. # we have a fallback message.
  333. raise EmailSyntaxError("The email address contains invalid characters before the @-sign.")
  334. def check_unsafe_chars(s: str, allow_space: bool = False) -> None:
  335. # Check for unsafe characters or characters that would make the string
  336. # invalid or non-sensible Unicode.
  337. bad_chars = set()
  338. for i, c in enumerate(s):
  339. category = unicodedata.category(c)
  340. if category[0] in ("L", "N", "P", "S"):
  341. # Letters, numbers, punctuation, and symbols are permitted.
  342. pass
  343. elif category[0] == "M":
  344. # Combining character in first position would combine with something
  345. # outside of the email address if concatenated, so they are not safe.
  346. # We also check if this occurs after the @-sign, which would not be
  347. # sensible because it would modify the @-sign.
  348. if i == 0:
  349. bad_chars.add(c)
  350. elif category == "Zs":
  351. # Spaces outside of the ASCII range are not specifically disallowed in
  352. # internationalized addresses as far as I can tell, but they violate
  353. # the spirit of the non-internationalized specification that email
  354. # addresses do not contain ASCII spaces when not quoted. Excluding
  355. # ASCII spaces when not quoted is handled directly by the atom regex.
  356. #
  357. # In quoted-string local parts, spaces are explicitly permitted, and
  358. # the ASCII space has category Zs, so we must allow it here, and we'll
  359. # allow all Unicode spaces to be consistent.
  360. if not allow_space:
  361. bad_chars.add(c)
  362. elif category[0] == "Z":
  363. # The two line and paragraph separator characters (in categories Zl and Zp)
  364. # are not specifically disallowed in internationalized addresses
  365. # as far as I can tell, but they violate the spirit of the non-internationalized
  366. # specification that email addresses do not contain line breaks when not quoted.
  367. bad_chars.add(c)
  368. elif category[0] == "C":
  369. # Control, format, surrogate, private use, and unassigned code points (C)
  370. # are all unsafe in various ways. Control and format characters can affect
  371. # text rendering if the email address is concatenated with other text.
  372. # Bidirectional format characters are unsafe, even if used properly, because
  373. # they cause an email address to render as a different email address.
  374. # Private use characters do not make sense for publicly deliverable
  375. # email addresses.
  376. bad_chars.add(c)
  377. else:
  378. # All categories should be handled above, but in case there is something new
  379. # to the Unicode specification in the future, reject all other categories.
  380. bad_chars.add(c)
  381. if bad_chars:
  382. raise EmailSyntaxError("The email address contains unsafe characters: "
  383. + ", ".join(safe_character_display(c) for c in sorted(bad_chars)) + ".")
  384. def check_dot_atom(label: str, start_descr: str, end_descr: str, is_hostname: bool) -> None:
  385. # RFC 5322 3.2.3
  386. if label.endswith("."):
  387. raise EmailSyntaxError(end_descr.format("period"))
  388. if label.startswith("."):
  389. raise EmailSyntaxError(start_descr.format("period"))
  390. if ".." in label:
  391. raise EmailSyntaxError("An email address cannot have two periods in a row.")
  392. if is_hostname:
  393. # RFC 952
  394. if label.endswith("-"):
  395. raise EmailSyntaxError(end_descr.format("hyphen"))
  396. if label.startswith("-"):
  397. raise EmailSyntaxError(start_descr.format("hyphen"))
  398. if ".-" in label or "-." in label:
  399. raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.")
  400. def uts46_valid_char(char: str) -> bool:
  401. # By exhaustively searching for characters rejected by
  402. # for c in (chr(i) for i in range(0x110000)):
  403. # idna.uts46_remap(c, std3_rules=False, transitional=False)
  404. # I found the following rules are pretty close.
  405. c = ord(char)
  406. if 0x80 <= c <= 0x9f:
  407. # 8-bit ASCII range.
  408. return False
  409. elif ((0x2010 <= c <= 0x2060 and not (0x2024 <= c <= 0x2026) and not (0x2028 <= c <= 0x202E))
  410. or c in (0x00AD, 0x2064, 0xFF0E)
  411. or 0x200B <= c <= 0x200D
  412. or 0x1BCA0 <= c <= 0x1BCA3):
  413. # Characters that are permitted but fall into one of the
  414. # tests below.
  415. return True
  416. elif unicodedata.category(chr(c)) in ("Cf", "Cn", "Co", "Cs", "Zs", "Zl", "Zp"):
  417. # There are a bunch of Zs characters including regular space
  418. # that are allowed by UTS46 but are not allowed in domain
  419. # names anyway.
  420. #
  421. # There are some Cn (unassigned) characters that the idna
  422. # package doesn't reject but we can, I think.
  423. return False
  424. elif "002E" in unicodedata.decomposition(chr(c)).split(" "):
  425. # Characters that decompose into a sequence with a dot.
  426. return False
  427. return True
class DomainNameValidationResult(TypedDict):
    """Dictionary shape declared as the return type of validate_email_domain_name."""
    # NOTE(review): presumably the IDNA ASCII-encoded form of the domain
    # computed by validate_email_domain_name — confirm against its return.
    ascii_domain: str
    # NOTE(review): presumably the normalized internationalized (Unicode)
    # form of the domain — confirm against validate_email_domain_name.
    domain: str
  431. def validate_email_domain_name(domain: str, test_environment: bool = False, globally_deliverable: bool = True) -> DomainNameValidationResult:
  432. """Validates the syntax of the domain part of an email address."""
  433. # Check for invalid characters.
  434. # (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses)
  435. bad_chars = {
  436. safe_character_display(c)
  437. for c in domain
  438. if not ATEXT_HOSTNAME_INTL.match(c)
  439. }
  440. if bad_chars:
  441. raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")
  442. # Check for unsafe characters.
  443. # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked
  444. # by DOT_ATOM_TEXT_INTL. Other characters may be permitted by the email specs, but
  445. # they may not be valid, safe, or sensible Unicode strings.
  446. check_unsafe_chars(domain)
  447. # Reject characters that would be rejected by UTS-46 normalization next but
  448. # with an error message under our control.
  449. bad_chars = {
  450. safe_character_display(c) for c in domain
  451. if not uts46_valid_char(c)
  452. }
  453. if bad_chars:
  454. raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")
  455. # Perform UTS-46 normalization, which includes casefolding, NFC normalization,
  456. # and converting all label separators (the period/full stop, fullwidth full stop,
  457. # ideographic full stop, and halfwidth ideographic full stop) to regular dots.
  458. # It will also raise an exception if there is an invalid character in the input,
  459. # such as "⒈" which is invalid because it would expand to include a dot and
  460. # U+1FEF which normalizes to a backtick, which is not an allowed hostname character.
  461. # Since several characters *are* normalized to a dot, this has to come before
  462. # checks related to dots, like check_dot_atom which comes next.
  463. original_domain = domain
  464. try:
  465. domain = idna.uts46_remap(domain, std3_rules=False, transitional=False)
  466. except idna.IDNAError as e:
  467. raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e
  468. # Check for invalid characters after Unicode normalization which are not caught
  469. # by uts46_remap (see tests for examples).
  470. bad_chars = {
  471. safe_character_display(c)
  472. for c in domain
  473. if not ATEXT_HOSTNAME_INTL.match(c)
  474. }
  475. if bad_chars:
  476. raise EmailSyntaxError("The part after the @-sign contains invalid characters after Unicode normalization: " + ", ".join(sorted(bad_chars)) + ".")
  477. # The domain part is made up dot-separated "labels." Each label must
  478. # have at least one character and cannot start or end with dashes, which
  479. # means there are some surprising restrictions on periods and dashes.
  480. # Check that before we do IDNA encoding because the IDNA library gives
  481. # unfriendly errors for these cases, but after UTS-46 normalization because
  482. # it can insert periods and hyphens (from fullwidth characters).
  483. # (RFC 952, RFC 1123 2.1, RFC 5322 3.2.3)
  484. check_dot_atom(domain, 'An email address cannot have a {} immediately after the @-sign.', 'An email address cannot end with a {}.', is_hostname=True)
  485. # Check for RFC 5890's invalid R-LDH labels, which are labels that start
  486. # with two characters other than "xn" and two dashes.
  487. for label in domain.split("."):
  488. if re.match(r"(?!xn)..--", label, re.I):
  489. raise EmailSyntaxError("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.")
  490. if DOT_ATOM_TEXT_HOSTNAME.match(domain):
  491. # This is a valid non-internationalized domain.
  492. ascii_domain = domain
  493. else:
  494. # If international characters are present in the domain name, convert
  495. # the domain to IDNA ASCII. If internationalized characters are present,
  496. # the MTA must either support SMTPUTF8 or the mail client must convert the
  497. # domain name to IDNA before submission.
  498. #
  499. # For ASCII-only domains, the transformation does nothing and is safe to
  500. # apply. However, to ensure we don't rely on the idna library for basic
  501. # syntax checks, we don't use it if it's not needed.
  502. #
  503. # idna.encode also checks the domain name length after encoding but it
  504. # doesn't give a nice error, so we call the underlying idna.alabel method
  505. # directly. idna.alabel checks label length and doesn't give great messages,
  506. # but we can't easily go to lower level methods.
  507. try:
  508. ascii_domain = ".".join(
  509. idna.alabel(label).decode("ascii")
  510. for label in domain.split(".")
  511. )
  512. except idna.IDNAError as e:
  513. # Some errors would have already been raised by idna.uts46_remap.
  514. raise EmailSyntaxError(f"The part after the @-sign is invalid ({e}).") from e
  515. # Check the syntax of the string returned by idna.encode.
  516. # It should never fail.
  517. if not DOT_ATOM_TEXT_HOSTNAME.match(ascii_domain):
  518. raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.")
  519. # Check the length of the domain name in bytes.
  520. # (RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2)
  521. # We're checking the number of bytes ("octets") here, which can be much
  522. # higher than the number of characters in internationalized domains,
  523. # on the assumption that the domain may be transmitted without SMTPUTF8
  524. # as IDNA ASCII. (This is also checked by idna.encode, so this exception
  525. # is never reached for internationalized domains.)
  526. if len(ascii_domain) > DOMAIN_MAX_LENGTH:
  527. if ascii_domain == original_domain:
  528. reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH)
  529. raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.")
  530. else:
  531. diff = len(ascii_domain) - DOMAIN_MAX_LENGTH
  532. s = "" if diff == 1 else "s"
  533. raise EmailSyntaxError(f"The email address is too long after the @-sign ({diff} byte{s} too many after IDNA encoding).")
  534. # Also check the label length limit.
  535. # (RFC 1035 2.3.1)
  536. for label in ascii_domain.split("."):
  537. if len(label) > DNS_LABEL_LENGTH_LIMIT:
  538. reason = get_length_reason(label, limit=DNS_LABEL_LENGTH_LIMIT)
  539. raise EmailSyntaxError(f"After the @-sign, periods cannot be separated by so many characters {reason}.")
  540. if globally_deliverable:
  541. # All publicly deliverable addresses have domain names with at least
  542. # one period, at least for gTLDs created since 2013 (per the ICANN Board
  543. # New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en).
  544. # We'll consider the lack of a period a syntax error
  545. # since that will match people's sense of what an email address looks
  546. # like. We'll skip this in test environments to allow '@test' email
  547. # addresses.
  548. if "." not in ascii_domain and not (ascii_domain == "test" and test_environment):
  549. raise EmailSyntaxError("The part after the @-sign is not valid. It should have a period.")
  550. # We also know that all TLDs currently end with a letter.
  551. if not DOMAIN_NAME_REGEX.search(ascii_domain):
  552. raise EmailSyntaxError("The part after the @-sign is not valid. It is not within a valid top-level domain.")
  553. # Check special-use and reserved domain names.
  554. # Some might fail DNS-based deliverability checks, but that
  555. # can be turned off, so we should fail them all sooner.
  556. # See the references in __init__.py.
  557. from . import SPECIAL_USE_DOMAIN_NAMES
  558. for d in SPECIAL_USE_DOMAIN_NAMES:
  559. # See the note near the definition of SPECIAL_USE_DOMAIN_NAMES.
  560. if d == "test" and test_environment:
  561. continue
  562. if ascii_domain == d or ascii_domain.endswith("." + d):
  563. raise EmailSyntaxError("The part after the @-sign is a special-use or reserved name that cannot be used with email.")
  564. # We may have been given an IDNA ASCII domain to begin with. Check
  565. # that the domain actually conforms to IDNA. It could look like IDNA
  566. # but not be actual IDNA. For ASCII-only domains, the conversion out
  567. # of IDNA just gives the same thing back.
  568. #
  569. # This gives us the canonical internationalized form of the domain,
  570. # which we return to the caller as a part of the normalized email
  571. # address.
  572. try:
  573. domain_i18n = idna.decode(ascii_domain.encode('ascii'))
  574. except idna.IDNAError as e:
  575. raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).") from e
  576. # Check that this normalized domain name has not somehow become
  577. # an invalid domain name. All of the checks before this point
  578. # using the idna package probably guarantee that we now have
  579. # a valid international domain name in most respects. But it
  580. # doesn't hurt to re-apply some tests to be sure. See the similar
  581. # tests above.
  582. # Check for invalid and unsafe characters. We have no test
  583. # case for this.
  584. bad_chars = {
  585. safe_character_display(c)
  586. for c in domain_i18n
  587. if not ATEXT_HOSTNAME_INTL.match(c)
  588. }
  589. if bad_chars:
  590. raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")
  591. check_unsafe_chars(domain_i18n)
  592. # Check that it can be encoded back to IDNA ASCII. We have no test
  593. # case for this.
  594. try:
  595. idna.encode(domain_i18n)
  596. except idna.IDNAError as e:
  597. raise EmailSyntaxError(f"The part after the @-sign became invalid after normalizing to international characters ({e}).") from e
  598. # Return the IDNA ASCII-encoded form of the domain, which is how it
  599. # would be transmitted on the wire (except when used with SMTPUTF8
  600. # possibly), as well as the canonical Unicode form of the domain,
  601. # which is better for display purposes. This should also take care
  602. # of RFC 6532 section 3.1's suggestion to apply Unicode NFC
  603. # normalization to addresses.
  604. return {
  605. "ascii_domain": ascii_domain,
  606. "domain": domain_i18n,
  607. }
  608. def validate_email_length(addrinfo: ValidatedEmail) -> None:
  609. # There are three forms of the email address whose length must be checked:
  610. #
  611. # 1) The original email address string. Since callers may continue to use
  612. # this string, even though we recommend using the normalized form, we
  613. # should not pass validation when the original input is not valid. This
  614. # form is checked first because it is the original input.
  615. # 2) The normalized email address. We perform Unicode NFC normalization of
  616. # the local part, we normalize the domain to internationalized characters
  617. # (if originally IDNA ASCII) which also includes Unicode normalization,
  618. # and we may remove quotes in quoted local parts. We recommend that
  619. # callers use this string, so it must be valid.
  620. # 3) The email address with the IDNA ASCII representation of the domain
  621. # name, since this string may be used with email stacks that don't
  622. # support UTF-8. Since this is the least likely to be used by callers,
  623. # it is checked last. Note that ascii_email will only be set if the
  624. # local part is ASCII, but conceivably the caller may combine a
  625. # internationalized local part with an ASCII domain, so we check this
  626. # on that combination also. Since we only return the normalized local
  627. # part, we use that (and not the unnormalized local part).
  628. #
  629. # In all cases, the length is checked in UTF-8 because the SMTPUTF8
  630. # extension to SMTP validates the length in bytes.
  631. addresses_to_check = [
  632. (addrinfo.original, None),
  633. (addrinfo.normalized, "after normalization"),
  634. ((addrinfo.ascii_local_part or addrinfo.local_part or "") + "@" + addrinfo.ascii_domain, "when the part after the @-sign is converted to IDNA ASCII"),
  635. ]
  636. for addr, reason in addresses_to_check:
  637. addr_len = len(addr)
  638. addr_utf8_len = len(addr.encode("utf8"))
  639. diff = addr_utf8_len - EMAIL_MAX_LENGTH
  640. if diff > 0:
  641. if reason is None and addr_len == addr_utf8_len:
  642. # If there is no normalization or transcoding,
  643. # we can give a simple count of the number of
  644. # characters over the limit.
  645. reason = get_length_reason(addr, limit=EMAIL_MAX_LENGTH)
  646. elif reason is None:
  647. # If there is no normalization but there is
  648. # some transcoding to UTF-8, we can compute
  649. # the minimum number of characters over the
  650. # limit by dividing the number of bytes over
  651. # the limit by the maximum number of bytes
  652. # per character.
  653. mbpc = max(len(c.encode("utf8")) for c in addr)
  654. mchars = max(1, diff // mbpc)
  655. suffix = "s" if diff > 1 else ""
  656. if mchars == diff:
  657. reason = f"({diff} character{suffix} too many)"
  658. else:
  659. reason = f"({mchars}-{diff} character{suffix} too many)"
  660. else:
  661. # Since there is normalization, the number of
  662. # characters in the input that need to change is
  663. # impossible to know.
  664. suffix = "s" if diff > 1 else ""
  665. reason += f" ({diff} byte{suffix} too many)"
  666. raise EmailSyntaxError(f"The email address is too long {reason}.")
  667. class DomainLiteralValidationResult(TypedDict):
  668. domain_address: Union[ipaddress.IPv4Address, ipaddress.IPv6Address]
  669. domain: str
  670. def validate_email_domain_literal(domain_literal: str) -> DomainLiteralValidationResult:
  671. # This is obscure domain-literal syntax. Parse it and return
  672. # a compressed/normalized address.
  673. # RFC 5321 4.1.3 and RFC 5322 3.4.1.
  674. addr: Union[ipaddress.IPv4Address, ipaddress.IPv6Address]
  675. # Try to parse the domain literal as an IPv4 address.
  676. # There is no tag for IPv4 addresses, so we can never
  677. # be sure if the user intends an IPv4 address.
  678. if re.match(r"^[0-9\.]+$", domain_literal):
  679. try:
  680. addr = ipaddress.IPv4Address(domain_literal)
  681. except ValueError as e:
  682. raise EmailSyntaxError(f"The address in brackets after the @-sign is not valid: It is not an IPv4 address ({e}) or is missing an address literal tag.") from e
  683. # Return the IPv4Address object and the domain back unchanged.
  684. return {
  685. "domain_address": addr,
  686. "domain": f"[{addr}]",
  687. }
  688. # If it begins with "IPv6:" it's an IPv6 address.
  689. if domain_literal.startswith("IPv6:"):
  690. try:
  691. addr = ipaddress.IPv6Address(domain_literal[5:])
  692. except ValueError as e:
  693. raise EmailSyntaxError(f"The IPv6 address in brackets after the @-sign is not valid ({e}).") from e
  694. # Return the IPv6Address object and construct a normalized
  695. # domain literal.
  696. return {
  697. "domain_address": addr,
  698. "domain": f"[IPv6:{addr.compressed}]",
  699. }
  700. # Nothing else is valid.
  701. if ":" not in domain_literal:
  702. raise EmailSyntaxError("The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.")
  703. # The tag (the part before the colon) has character restrictions,
  704. # but since it must come from a registry of tags (in which only "IPv6" is defined),
  705. # there's no need to check the syntax of the tag. See RFC 5321 4.1.2.
  706. # Check for permitted ASCII characters. This actually doesn't matter
  707. # since there will be an exception after anyway.
  708. bad_chars = {
  709. safe_character_display(c)
  710. for c in domain_literal
  711. if not DOMAIN_LITERAL_CHARS.match(c)
  712. }
  713. if bad_chars:
  714. raise EmailSyntaxError("The part after the @-sign contains invalid characters in brackets: " + ", ".join(sorted(bad_chars)) + ".")
  715. # There are no other domain literal tags.
  716. # https://www.iana.org/assignments/address-literal-tags/address-literal-tags.xhtml
  717. raise EmailSyntaxError("The part after the @-sign contains an invalid address literal tag in brackets.")