1 files changed, 10 insertions, 7 deletions
diff --git a/runtests.py b/runtests.py
index 83c331d..8a37f6d 100755
--- a/runtests.py
+++ b/runtests.py
@@ -115,7 +115,7 @@ class MyHTMLParser(HTMLParser):
     def handle_decl(self, data):
         self.output += '<!' + data + '>'
         self.last = "decl"
-    def handle_unknown_decl(self, data):
+    def unknown_decl(self, data):
         self.output += '<!' + data + '>'
         self.last = "decl"
     def handle_pi(self,data):
@@ -174,15 +174,18 @@ def normalize_html(html):
     * Attributes are sorted and lowercased.
     * References are converted to unicode, except that '<', '>', '&', and
       '"' are rendered using entities.
-
-    Known limitations:
-
-    * HTMLParser just swallows CDATA.
-    * HTMLParser seems to treat unknown declarations as comments.
     """
+    html_chunk_re = re.compile("(\<!\[CDATA\[.*?\]\]\>|\<[^>]*\>|[^<]+)")
     try:
         parser = MyHTMLParser()
-        parser.feed(html.decode(encoding='UTF-8'))
+        # We work around HTMLParser's limitations parsing CDATA
+        # by breaking the input into chunks and passing CDATA chunks
+        # through verbatim.
+        for chunk in re.finditer(html_chunk_re, html):
+            if chunk.group(0)[:8] == "<![CDATA":
+                parser.output += chunk.group(0)
+            else:
+                parser.feed(chunk.group(0).decode(encoding='UTF-8'))
         parser.close()
         return parser.output
     except HTMLParseError as e: