From ff9c0dcecd1314b820bf7d2584990c26c0e28909 Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Mon, 24 Nov 2014 20:10:49 +0100 Subject: Validate UTF-8 input Invalid UTF-8 byte sequences are replaced with the Unicode replacement character U+FFFD. Fixes #213. --- api_test/main.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 117 insertions(+), 7 deletions(-) (limited to 'api_test') diff --git a/api_test/main.c b/api_test/main.c index 06d9be2..2d65a46 100644 --- a/api_test/main.c +++ b/api_test/main.c @@ -8,6 +8,8 @@ #include "harness.h" +#define UTF8_REPL "\xEF\xBF\xBD" + static const cmark_node_type node_types[] = { CMARK_NODE_DOCUMENT, CMARK_NODE_BLOCK_QUOTE, @@ -31,10 +33,25 @@ static const cmark_node_type node_types[] = { }; static const int num_node_types = sizeof(node_types) / sizeof(*node_types); +static void +test_md_to_html(test_batch_runner *runner, const char *markdown, + const char *expected_html, const char *msg); + static void test_content(test_batch_runner *runner, cmark_node_type type, int allowed_content); +static void +test_char(test_batch_runner *runner, int valid, const char *utf8, + const char *msg); + +static void +test_incomplete_char(test_batch_runner *runner, const char *utf8, + const char *msg); + +static void +test_continuation_byte(test_batch_runner *runner, const char *utf8); + static void constructor(test_batch_runner *runner) { @@ -436,13 +453,8 @@ test_content(test_batch_runner *runner, cmark_node_type type, static void parser(test_batch_runner *runner) { - static const char markdown[] = "No newline"; - cmark_node *doc = cmark_parse_document(markdown, sizeof(markdown) - 1); - char *html = cmark_render_html(doc); - STR_EQ(runner, html, "

<p>No newline</p>

\n", - "document without trailing newline"); - free(html); - cmark_node_destroy(doc); + test_md_to_html(runner, "No newline", "

<p>No newline</p>

\n", + "document without trailing newline"); } static void @@ -475,6 +487,103 @@ render_html(test_batch_runner *runner) cmark_node_destroy(doc); } +static void +utf8(test_batch_runner *runner) +{ + // Ranges + test_char(runner, 1, "\x01", "valid utf8 01"); + test_char(runner, 1, "\x7F", "valid utf8 7F"); + test_char(runner, 0, "\x80", "invalid utf8 80"); + test_char(runner, 0, "\xBF", "invalid utf8 BF"); + test_char(runner, 0, "\xC0\x80", "invalid utf8 C080"); + test_char(runner, 0, "\xC1\xBF", "invalid utf8 C1BF"); + test_char(runner, 1, "\xC2\x80", "valid utf8 C280"); + test_char(runner, 1, "\xDF\xBF", "valid utf8 DFBF"); + test_char(runner, 0, "\xE0\x80\x80", "invalid utf8 E08080"); + test_char(runner, 0, "\xE0\x9F\xBF", "invalid utf8 E09FBF"); + test_char(runner, 1, "\xE0\xA0\x80", "valid utf8 E0A080"); + test_char(runner, 1, "\xED\x9F\xBF", "valid utf8 ED9FBF"); + test_char(runner, 0, "\xED\xA0\x80", "invalid utf8 EDA080"); + test_char(runner, 0, "\xED\xBF\xBF", "invalid utf8 EDBFBF"); + test_char(runner, 0, "\xF0\x80\x80\x80", "invalid utf8 F0808080"); + test_char(runner, 0, "\xF0\x8F\xBF\xBF", "invalid utf8 F08FBFBF"); + test_char(runner, 1, "\xF0\x90\x80\x80", "valid utf8 F0908080"); + test_char(runner, 1, "\xF4\x8F\xBF\xBF", "valid utf8 F48FBFBF"); + test_char(runner, 0, "\xF4\x90\x80\x80", "invalid utf8 F4908080"); + test_char(runner, 0, "\xF7\xBF\xBF\xBF", "invalid utf8 F7BFBFBF"); + test_char(runner, 0, "\xF8", "invalid utf8 F8"); + test_char(runner, 0, "\xFF", "invalid utf8 FF"); + + // Incomplete byte sequences at end of input + test_incomplete_char(runner, "\xE0\xA0", "invalid utf8 E0A0"); + test_incomplete_char(runner, "\xF0\x90\x80", "invalid utf8 F09080"); + + // Invalid continuation bytes + test_continuation_byte(runner, "\xC2\x80"); + test_continuation_byte(runner, "\xE0\xA0\x80"); + test_continuation_byte(runner, "\xF0\x90\x80\x80"); +} + +static void +test_char(test_batch_runner *runner, int valid, const char *utf8, + const char *msg) +{ + 
char buf[20]; + sprintf(buf, "((((%s))))", utf8); + + if (valid) { + char expected[30]; + sprintf(expected, "

<p>((((%s))))</p>

\n", utf8); + test_md_to_html(runner, buf, expected, msg); + } + else { + test_md_to_html(runner, buf, "

<p>((((" UTF8_REPL "))))</p>

\n", + msg); + } +} + +static void +test_incomplete_char(test_batch_runner *runner, const char *utf8, + const char *msg) +{ + char buf[20]; + sprintf(buf, "----%s", utf8); + test_md_to_html(runner, buf, "

<p>----" UTF8_REPL "</p>

\n", msg); +} + +static void +test_continuation_byte(test_batch_runner *runner, const char *utf8) +{ + int len = strlen(utf8); + + for (int pos = 1; pos < len; ++pos) { + char buf[20]; + sprintf(buf, "((((%s))))", utf8); + buf[4+pos] = '\x20'; + + char expected[50]; + strcpy(expected, "

<p>((((" UTF8_REPL "\x20"); + for (int i = pos + 1; i < len; ++i) { + strcat(expected, UTF8_REPL); + } + strcat(expected, "))))</p>

\n"); + + char *html = cmark_markdown_to_html(buf, strlen(buf)); + STR_EQ(runner, html, expected, + "invalid utf8 continuation byte %d/%d", pos, len); + free(html); + } +} + +static void +test_md_to_html(test_batch_runner *runner, const char *markdown, + const char *expected_html, const char *msg) +{ + char *html = cmark_markdown_to_html(markdown, strlen(markdown)); + STR_EQ(runner, html, expected_html, msg); + free(html); +} + int main() { int retval; test_batch_runner *runner = test_batch_runner_new(); @@ -486,6 +595,7 @@ int main() { hierarchy(runner); parser(runner); render_html(runner); + utf8(runner); test_print_summary(runner); retval = test_ok(runner) ? 0 : 1; -- cgit v1.2.3 From a5ba5add1d72874fd40168eac54ed39e7b82bf49 Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Mon, 24 Nov 2014 22:46:20 +0100 Subject: Add test for input containing null character --- api_test/main.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'api_test') diff --git a/api_test/main.c b/api_test/main.c index 2d65a46..9931581 100644 --- a/api_test/main.c +++ b/api_test/main.c @@ -522,6 +522,14 @@ utf8(test_batch_runner *runner) test_continuation_byte(runner, "\xC2\x80"); test_continuation_byte(runner, "\xE0\xA0\x80"); test_continuation_byte(runner, "\xF0\x90\x80\x80"); + + // Test string containing null character + static const char string_with_null[] = "((((\0))))"; + char *html = cmark_markdown_to_html(string_with_null, + sizeof(string_with_null) - 1); + STR_EQ(runner, html, "

<p>((((" UTF8_REPL "))))</p>

\n", + "utf8 with U+0000"); + free(html); } static void -- cgit v1.2.3