summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorVicent Marti <tanoku@gmail.com>2014-09-09 04:00:36 +0200
committerVicent Marti <tanoku@gmail.com>2014-09-09 04:00:36 +0200
commit9d86d2f32303ae0048f6a5daa552bacceb9b12ea (patch)
tree96a8cb5e53aa166044cd27091d1d5d4468c077bc
parentd21ef7b5db11075e038e60732682dfd8a5cf6a13 (diff)
Update the spec with better entity handling
-rw-r--r--Makefile4
-rw-r--r--spec.txt22
-rw-r--r--src/html/houdini_html_u.c2
3 files changed, 15 insertions, 13 deletions
diff --git a/Makefile b/Makefile
index b5e487d..5d13272 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
-CFLAGS=-g -pg -O3 -Wall -Wextra -std=c99 -Isrc $(OPTFLAGS)
-LDFLAGS=-g -pg -O3 -Wall -Werror
+CFLAGS=-g -O3 -Wall -Wextra -std=c99 -Isrc $(OPTFLAGS)
+LDFLAGS=-g -O3 -Wall -Werror
SRCDIR=src
DATADIR=data
diff --git a/spec.txt b/spec.txt
index ebd6d98..112dccc 100644
--- a/spec.txt
+++ b/spec.txt
@@ -3762,20 +3762,20 @@ as the "unknown codepoint" character (`0xFFFD`)
[Hexadecimal entities](#hexadecimal-entities) <a id="hexadecimal-entities"></a>
consist of `&#` + either `X` or `x` + a string of 1-8 hexadecimal digits
-+ `;`.
++ `;`. They will also be parsed and turned into their corresponding UTF-8 values in the AST.
.
-&#x1; &#X22; &#XD06; &#xcab;
+&#X22; &#XD06; &#xcab;
.
-<p>&#x1; &#X22; &#XD06; &#xcab;</p>
+<p>&quot; ആ ಫ</p>
.
Here are some nonentities:
.
-&nbsp &x; &#; &#x; &#123456789; &ThisIsWayTooLongToBeAnEntityIsntIt; &hi?;
+&nbsp &x; &#; &#x; &ThisIsWayTooLongToBeAnEntityIsntIt; &hi?;
.
-<p>&amp;nbsp &amp;x; &amp;#; &amp;#x; &amp;#123456789; &amp;ThisIsWayTooLongToBeAnEntityIsntIt; &amp;hi?;</p>
+<p>&amp;nbsp &amp;x; &amp;#; &amp;#x; &amp;ThisIsWayTooLongToBeAnEntityIsntIt; &amp;hi?;</p>
.
Although HTML5 does accept some entities without a trailing semicolon
@@ -3808,7 +3808,7 @@ code blocks, including raw HTML, URLs, [link titles](#link-title), and
.
[foo](/f&ouml;&ouml; "f&ouml;&ouml;")
.
-<p><a href="/f&ouml;&ouml;" title="f&ouml;&ouml;">foo</a></p>
+<p><a href="/f%C3%B6%C3%B6" title="föö">foo</a></p>
.
.
@@ -3816,7 +3816,7 @@ code blocks, including raw HTML, URLs, [link titles](#link-title), and
[foo]: /f&ouml;&ouml; "f&ouml;&ouml;"
.
-<p><a href="/f&ouml;&ouml;" title="f&ouml;&ouml;">foo</a></p>
+<p><a href="/f%C3%B6%C3%B6" title="föö">foo</a></p>
.
.
@@ -3824,7 +3824,7 @@ code blocks, including raw HTML, URLs, [link titles](#link-title), and
foo
```
.
-<pre><code class="language-f&ouml;&ouml;">foo
+<pre><code class="language-föö">foo
</code></pre>
.
@@ -4817,12 +4817,14 @@ in Markdown:
<p><a href="foo):">link</a></p>
.
-URL-escaping and entities should be left alone inside the destination:
+URL-escaping should be left alone inside the destination, as all URL-escaped characters
+are also valid URL characters. HTML entities in the destination will be parsed into their UTF-8
+codepoints, as usual, and optionally URL-escaped when written as HTML.
.
[link](foo%20b&auml;)
.
-<p><a href="foo%20b&auml;">link</a></p>
+<p><a href="foo%20b%C3%A4">link</a></p>
.
Note that, because titles can often be parsed as destinations,
diff --git a/src/html/houdini_html_u.c b/src/html/houdini_html_u.c
index 762f980..b8e2d8d 100644
--- a/src/html/houdini_html_u.c
+++ b/src/html/houdini_html_u.c
@@ -24,7 +24,7 @@ houdini_unescape_ent(strbuf *ob, const uint8_t *src, size_t size)
codepoint = (codepoint * 16) + ((src[i] | 32) % 39 - 9);
}
- if (i < size && src[i] == ';') {
+ if (i < size && src[i] == ';' && codepoint) {
utf8proc_encode_char(codepoint, ob);
return i + 1;
}