From a8d97a098742413d0ffdc3602d1798df6e4f00a1 Mon Sep 17 00:00:00 2001
From: John MacFarlane
Date: Mon, 22 Dec 2014 23:38:56 -0700
Subject: Fixed normalization bug, added more doctests for normalization.

* The tests test for removal of whitespace around block-level tags.
* Previously whitespace wasn't removed before an initial block-level
  tag; this commit fixes that.
* Also revised wording so it's clear that whitespace is removed on
  both sides of block-level tags.

Closes #246 in a slightly different way.
---
 test/normalize.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/test/normalize.py b/test/normalize.py
index 5b4803b..894a837 100644
--- a/test/normalize.py
+++ b/test/normalize.py
@@ -32,7 +32,7 @@ class MyHTMLParser(HTMLParser):
     def handle_endtag(self, tag):
         if tag == "pre":
             self.in_pre = False
-        if self.is_block_tag(tag):
+        elif self.is_block_tag(tag):
             self.output = self.output.rstrip()
         self.output += "</" + tag + ">"
         self.last_tag = tag
@@ -40,6 +40,8 @@ class MyHTMLParser(HTMLParser):
     def handle_starttag(self, tag, attrs):
         if tag == "pre":
             self.in_pre = True
+        if self.is_block_tag(tag):
+            self.output = self.output.rstrip()
         self.output += "<" + tag
         # For now we don't strip out 'extra' attributes, because of
         # raw HTML test cases.
@@ -125,11 +127,23 @@ def normalize_html(html):
         >>> normalize_html("<p>a  \t\nb</p>")
         u'<p>a b</p>'
 
-    * Outer whitespace (outside block-level tags) is removed.
+    * Whitespace surrounding block-level tags is removed.
+
+        >>> normalize_html("<p>a  b</p>")
+        u'<p>a b</p>'
+
+        >>> normalize_html(" <p>a  b</p>")
+        u'<p>a b</p>'
 
-        >>> normalize_html("<p>a  b</p>  ")
+        >>> normalize_html("<p>a  b</p> ")
         u'<p>a b</p>'
 
+        >>> normalize_html("\n\t<p>\n\t\ta  b\t\t</p>\n\t")
+        u'<p>a b</p>'
+
+        >>> normalize_html("<i>a  b</i> ")
+        u'<i>a b</i> '
+
     * Self-closing tags are converted to open tags.
 
         >>> normalize_html("<br />")
-- 
cgit v1.2.3