summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rwxr-xr-x  runtests.py  17
1 files changed, 10 insertions, 7 deletions
diff --git a/runtests.py b/runtests.py
index 83c331d..8a37f6d 100755
--- a/runtests.py
+++ b/runtests.py
@@ -115,7 +115,7 @@ class MyHTMLParser(HTMLParser):
def handle_decl(self, data):
self.output += '<!' + data + '>'
self.last = "decl"
- def handle_unknown_decl(self, data):
+ def unknown_decl(self, data):
self.output += '<!' + data + '>'
self.last = "decl"
def handle_pi(self,data):
@@ -174,15 +174,18 @@ def normalize_html(html):
* Attributes are sorted and lowercased.
* References are converted to unicode, except that '<', '>', '&', and
'"' are rendered using entities.
-
- Known limitations:
-
- * HTMLParser just swallows CDATA.
- * HTMLParser seems to treat unknown declarations as comments.
"""
+ html_chunk_re = re.compile("(\<!\[CDATA\[.*?\]\]\>|\<[^>]*\>|[^<]+)")
try:
parser = MyHTMLParser()
- parser.feed(html.decode(encoding='UTF-8'))
+ # We work around HTMLParser's limitations parsing CDATA
+ # by breaking the input into chunks and passing CDATA chunks
+ # through verbatim.
+ for chunk in re.finditer(html_chunk_re, html):
+ if chunk.group(0)[:8] == "<![CDATA":
+ parser.output += chunk.group(0)
+ else:
+ parser.feed(chunk.group(0).decode(encoding='UTF-8'))
parser.close()
return parser.output
except HTMLParseError as e: