html.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. """Compare two HTML documents."""
  2. import re
  3. from html.parser import HTMLParser
  4. # ASCII whitespace is U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020
  5. # SPACE.
  6. # https://infra.spec.whatwg.org/#ascii-whitespace
  7. ASCII_WHITESPACE = re.compile(r'[\t\n\f\r ]+')
  8. def normalize_whitespace(string):
  9. return ASCII_WHITESPACE.sub(' ', string)
  10. class Element:
  11. def __init__(self, name, attributes):
  12. self.name = name
  13. self.attributes = sorted(attributes)
  14. self.children = []
  15. def append(self, element):
  16. if isinstance(element, str):
  17. element = normalize_whitespace(element)
  18. if self.children:
  19. if isinstance(self.children[-1], str):
  20. self.children[-1] += element
  21. self.children[-1] = normalize_whitespace(self.children[-1])
  22. return
  23. elif self.children:
  24. # removing last children if it is only whitespace
  25. # this can result in incorrect dom representations since
  26. # whitespace between inline tags like <span> is significant
  27. if isinstance(self.children[-1], str):
  28. if self.children[-1].isspace():
  29. self.children.pop()
  30. if element:
  31. self.children.append(element)
  32. def finalize(self):
  33. def rstrip_last_element(children):
  34. if children:
  35. if isinstance(children[-1], str):
  36. children[-1] = children[-1].rstrip()
  37. if not children[-1]:
  38. children.pop()
  39. children = rstrip_last_element(children)
  40. return children
  41. rstrip_last_element(self.children)
  42. for i, child in enumerate(self.children):
  43. if isinstance(child, str):
  44. self.children[i] = child.strip()
  45. elif hasattr(child, 'finalize'):
  46. child.finalize()
  47. def __eq__(self, element):
  48. if not hasattr(element, 'name') or self.name != element.name:
  49. return False
  50. if len(self.attributes) != len(element.attributes):
  51. return False
  52. if self.attributes != element.attributes:
  53. # attributes without a value is same as attribute with value that
  54. # equals the attributes name:
  55. # <input checked> == <input checked="checked">
  56. for i in range(len(self.attributes)):
  57. attr, value = self.attributes[i]
  58. other_attr, other_value = element.attributes[i]
  59. if value is None:
  60. value = attr
  61. if other_value is None:
  62. other_value = other_attr
  63. if attr != other_attr or value != other_value:
  64. return False
  65. return self.children == element.children
  66. def __hash__(self):
  67. return hash((self.name, *self.attributes))
  68. def _count(self, element, count=True):
  69. if not isinstance(element, str):
  70. if self == element:
  71. return 1
  72. if isinstance(element, RootElement):
  73. if self.children == element.children:
  74. return 1
  75. i = 0
  76. for child in self.children:
  77. # child is text content and element is also text content, then
  78. # make a simple "text" in "text"
  79. if isinstance(child, str):
  80. if isinstance(element, str):
  81. if count:
  82. i += child.count(element)
  83. elif element in child:
  84. return 1
  85. else:
  86. i += child._count(element, count=count)
  87. if not count and i:
  88. return i
  89. return i
  90. def __contains__(self, element):
  91. return self._count(element, count=False) > 0
  92. def count(self, element):
  93. return self._count(element, count=True)
  94. def __getitem__(self, key):
  95. return self.children[key]
  96. def __str__(self):
  97. output = '<%s' % self.name
  98. for key, value in self.attributes:
  99. if value:
  100. output += ' %s="%s"' % (key, value)
  101. else:
  102. output += ' %s' % key
  103. if self.children:
  104. output += '>\n'
  105. output += ''.join(str(c) for c in self.children)
  106. output += '\n</%s>' % self.name
  107. else:
  108. output += '>'
  109. return output
  110. def __repr__(self):
  111. return str(self)
  112. class RootElement(Element):
  113. def __init__(self):
  114. super().__init__(None, ())
  115. def __str__(self):
  116. return ''.join(str(c) for c in self.children)
  117. class HTMLParseError(Exception):
  118. pass
  119. class Parser(HTMLParser):
  120. # https://html.spec.whatwg.org/#void-elements
  121. SELF_CLOSING_TAGS = {
  122. 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta',
  123. 'param', 'source', 'track', 'wbr',
  124. # Deprecated tags
  125. 'frame', 'spacer',
  126. }
  127. def __init__(self):
  128. super().__init__()
  129. self.root = RootElement()
  130. self.open_tags = []
  131. self.element_positions = {}
  132. def error(self, msg):
  133. raise HTMLParseError(msg, self.getpos())
  134. def format_position(self, position=None, element=None):
  135. if not position and element:
  136. position = self.element_positions[element]
  137. if position is None:
  138. position = self.getpos()
  139. if hasattr(position, 'lineno'):
  140. position = position.lineno, position.offset
  141. return 'Line %d, Column %d' % position
  142. @property
  143. def current(self):
  144. if self.open_tags:
  145. return self.open_tags[-1]
  146. else:
  147. return self.root
  148. def handle_startendtag(self, tag, attrs):
  149. self.handle_starttag(tag, attrs)
  150. if tag not in self.SELF_CLOSING_TAGS:
  151. self.handle_endtag(tag)
  152. def handle_starttag(self, tag, attrs):
  153. # Special case handling of 'class' attribute, so that comparisons of DOM
  154. # instances are not sensitive to ordering of classes.
  155. attrs = [
  156. (name, ' '.join(sorted(value for value in ASCII_WHITESPACE.split(value) if value)))
  157. if name == "class"
  158. else (name, value)
  159. for name, value in attrs
  160. ]
  161. element = Element(tag, attrs)
  162. self.current.append(element)
  163. if tag not in self.SELF_CLOSING_TAGS:
  164. self.open_tags.append(element)
  165. self.element_positions[element] = self.getpos()
  166. def handle_endtag(self, tag):
  167. if not self.open_tags:
  168. self.error("Unexpected end tag `%s` (%s)" % (
  169. tag, self.format_position()))
  170. element = self.open_tags.pop()
  171. while element.name != tag:
  172. if not self.open_tags:
  173. self.error("Unexpected end tag `%s` (%s)" % (
  174. tag, self.format_position()))
  175. element = self.open_tags.pop()
  176. def handle_data(self, data):
  177. self.current.append(data)
  178. def parse_html(html):
  179. """
  180. Take a string that contains *valid* HTML and turn it into a Python object
  181. structure that can be easily compared against other HTML on semantic
  182. equivalence. Syntactical differences like which quotation is used on
  183. arguments will be ignored.
  184. """
  185. parser = Parser()
  186. parser.feed(html)
  187. parser.close()
  188. document = parser.root
  189. document.finalize()
  190. # Removing ROOT element if it's not necessary
  191. if len(document.children) == 1:
  192. if not isinstance(document.children[0], str):
  193. document = document.children[0]
  194. return document