"""Regexps to match html elements"""
import re

# --- Attribute building blocks -------------------------------------------
# All fragments are raw strings so that regex escapes read exactly as the
# regex engine sees them (no doubled backslashes).

# Attribute name: a letter, "_" or ":" followed by letters, digits,
# ":", ".", "_" or "-".
attr_name = r"[a-zA-Z_:][a-zA-Z0-9:._-]*"

# Unquoted value: one or more characters that are not a quote, "=", "<", ">",
# a backtick, or anything in the \x00-\x20 range (controls and space).
unquoted = r"""[^"'=<>`\x00-\x20]+"""
single_quoted = r"'[^']*'"
double_quoted = r'"[^"]*"'

attr_value = "(?:" + "|".join([unquoted, single_quoted, double_quoted]) + ")"

# One attribute: mandatory leading whitespace, a name, and an optional
# "= value" part (whitespace is allowed around the "=").
attribute = r"(?:\s+" + attr_name + r"(?:\s*=\s*" + attr_value + ")?)"

# --- Tag-level fragments -------------------------------------------------

# Opening tag: "<name", any number of attributes, optional whitespace and an
# optional "/" before the closing ">".
open_tag = r"<[A-Za-z][A-Za-z0-9\-]*" + attribute + r"*\s*\/?>"

# Closing tag: "</name", optional whitespace, ">".
close_tag = r"<\/[A-Za-z][A-Za-z0-9\-]*\s*>"

# Comment: either the degenerate forms "<!-->" / "<!--->", or "<!--" followed
# by a body that never contains "-->" before the terminating "-->".
comment = r"<!---?>|<!--(?:[^-]|-[^-]|--[^>])*-->"

# Processing instruction: "<?" ... "?>", non-greedy, may span newlines.
processing = r"<[?][\s\S]*?[?]>"

# Declaration: "<!" followed by a letter, up to the first ">".
declaration = r"<![A-Za-z][^>]*>"

# CDATA section: "<![CDATA[" ... "]]>", non-greedy, may span newlines.
cdata = r"<!\[CDATA\[[\s\S]*?\]\]>"

# --- Compiled patterns ---------------------------------------------------

# Matches any one of the constructs above. The pattern is anchored with "^",
# so it only matches at the position where matching starts.
HTML_TAG_RE = re.compile(
    "^(?:"
    + "|".join([open_tag, close_tag, comment, processing, declaration, cdata])
    + ")"
)

# Same anchoring, but restricted to opening and closing tags only.
HTML_OPEN_CLOSE_TAG_STR = "^(?:" + open_tag + "|" + close_tag + ")"
HTML_OPEN_CLOSE_TAG_RE = re.compile(HTML_OPEN_CLOSE_TAG_STR)