diff options
author | Nick Wellnhofer <wellnhofer@aevum.de> | 2014-11-24 20:10:49 +0100 |
---|---|---|
committer | Nick Wellnhofer <wellnhofer@aevum.de> | 2014-11-24 20:14:00 +0100 |
commit | ff9c0dcecd1314b820bf7d2584990c26c0e28909 (patch) | |
tree | 15c10ee0e2ccac5fbb66e71dfc4792750342ecc8 /api_test | |
parent | 82b360db50dfcb889e4488dfb3e5bcfc52d91857 (diff) |
Validate UTF-8 input
Invalid UTF-8 byte sequences are replaced with the Unicode replacement
character U+FFFD. Fixes #213.
Diffstat (limited to 'api_test')
-rw-r--r-- | api_test/main.c | 124 |
1 files changed, 117 insertions, 7 deletions
diff --git a/api_test/main.c b/api_test/main.c index 06d9be2..2d65a46 100644 --- a/api_test/main.c +++ b/api_test/main.c @@ -8,6 +8,8 @@ #include "harness.h" +#define UTF8_REPL "\xEF\xBF\xBD" + static const cmark_node_type node_types[] = { CMARK_NODE_DOCUMENT, CMARK_NODE_BLOCK_QUOTE, @@ -32,10 +34,25 @@ static const cmark_node_type node_types[] = { static const int num_node_types = sizeof(node_types) / sizeof(*node_types); static void +test_md_to_html(test_batch_runner *runner, const char *markdown, + const char *expected_html, const char *msg); + +static void test_content(test_batch_runner *runner, cmark_node_type type, int allowed_content); static void +test_char(test_batch_runner *runner, int valid, const char *utf8, + const char *msg); + +static void +test_incomplete_char(test_batch_runner *runner, const char *utf8, + const char *msg); + +static void +test_continuation_byte(test_batch_runner *runner, const char *utf8); + +static void constructor(test_batch_runner *runner) { for (int i = 0; i < num_node_types; ++i) { @@ -436,13 +453,8 @@ test_content(test_batch_runner *runner, cmark_node_type type, static void parser(test_batch_runner *runner) { - static const char markdown[] = "No newline"; - cmark_node *doc = cmark_parse_document(markdown, sizeof(markdown) - 1); - char *html = cmark_render_html(doc); - STR_EQ(runner, html, "<p>No newline</p>\n", - "document without trailing newline"); - free(html); - cmark_node_destroy(doc); + test_md_to_html(runner, "No newline", "<p>No newline</p>\n", + "document without trailing newline"); } static void @@ -475,6 +487,103 @@ render_html(test_batch_runner *runner) cmark_node_destroy(doc); } +static void +utf8(test_batch_runner *runner) +{ + // Ranges + test_char(runner, 1, "\x01", "valid utf8 01"); + test_char(runner, 1, "\x7F", "valid utf8 7F"); + test_char(runner, 0, "\x80", "invalid utf8 80"); + test_char(runner, 0, "\xBF", "invalid utf8 BF"); + test_char(runner, 0, "\xC0\x80", "invalid utf8 C080"); + test_char(runner, 0, "\xC1\xBF", "invalid utf8 C1BF"); + test_char(runner, 1, "\xC2\x80", "valid utf8 C280"); + test_char(runner, 1, "\xDF\xBF", "valid utf8 DFBF"); + test_char(runner, 0, "\xE0\x80\x80", "invalid utf8 E08080"); + test_char(runner, 0, "\xE0\x9F\xBF", "invalid utf8 E09FBF"); + test_char(runner, 1, "\xE0\xA0\x80", "valid utf8 E0A080"); + test_char(runner, 1, "\xED\x9F\xBF", "valid utf8 ED9FBF"); + test_char(runner, 0, "\xED\xA0\x80", "invalid utf8 EDA080"); + test_char(runner, 0, "\xED\xBF\xBF", "invalid utf8 EDBFBF"); + test_char(runner, 0, "\xF0\x80\x80\x80", "invalid utf8 F0808080"); + test_char(runner, 0, "\xF0\x8F\xBF\xBF", "invalid utf8 F08FBFBF"); + test_char(runner, 1, "\xF0\x90\x80\x80", "valid utf8 F0908080"); + test_char(runner, 1, "\xF4\x8F\xBF\xBF", "valid utf8 F48FBFBF"); + test_char(runner, 0, "\xF4\x90\x80\x80", "invalid utf8 F4908080"); + test_char(runner, 0, "\xF7\xBF\xBF\xBF", "invalid utf8 F7BFBFBF"); + test_char(runner, 0, "\xF8", "invalid utf8 F8"); + test_char(runner, 0, "\xFF", "invalid utf8 FF"); + + // Incomplete byte sequences at end of input + test_incomplete_char(runner, "\xE0\xA0", "invalid utf8 E0A0"); + test_incomplete_char(runner, "\xF0\x90\x80", "invalid utf8 F09080"); + + // Invalid continuation bytes + test_continuation_byte(runner, "\xC2\x80"); + test_continuation_byte(runner, "\xE0\xA0\x80"); + test_continuation_byte(runner, "\xF0\x90\x80\x80"); +} + +static void +test_char(test_batch_runner *runner, int valid, const char *utf8, + const char *msg) +{ + char buf[20]; + sprintf(buf, "((((%s))))", utf8); + + if (valid) { + char expected[30]; + sprintf(expected, "<p>((((%s))))</p>\n", utf8); + test_md_to_html(runner, buf, expected, msg); + } + else { + test_md_to_html(runner, buf, "<p>((((" UTF8_REPL "))))</p>\n", + msg); + } +} + +static void +test_incomplete_char(test_batch_runner *runner, const char *utf8, + const char *msg) +{ + char buf[20]; + sprintf(buf, "----%s", utf8); + test_md_to_html(runner, buf, "<p>----" UTF8_REPL "</p>\n", msg); +} + +static void +test_continuation_byte(test_batch_runner *runner, const char *utf8) +{ + int len = strlen(utf8); + + for (int pos = 1; pos < len; ++pos) { + char buf[20]; + sprintf(buf, "((((%s))))", utf8); + buf[4+pos] = '\x20'; + + char expected[50]; + strcpy(expected, "<p>((((" UTF8_REPL "\x20"); + for (int i = pos + 1; i < len; ++i) { + strcat(expected, UTF8_REPL); + } + strcat(expected, "))))</p>\n"); + + char *html = cmark_markdown_to_html(buf, strlen(buf)); + STR_EQ(runner, html, expected, + "invalid utf8 continuation byte %d/%d", pos, len); + free(html); + } +} + +static void +test_md_to_html(test_batch_runner *runner, const char *markdown, + const char *expected_html, const char *msg) +{ + char *html = cmark_markdown_to_html(markdown, strlen(markdown)); + STR_EQ(runner, html, expected_html, msg); + free(html); +} + int main() { int retval; test_batch_runner *runner = test_batch_runner_new(); @@ -486,6 +595,7 @@ int main() { hierarchy(runner); parser(runner); render_html(runner); + utf8(runner); test_print_summary(runner); retval = test_ok(runner) ? 0 : 1; |