summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- api_test/main.c 124
-rw-r--r-- src/utf8.c 65
-rw-r--r-- src/utf8.h 1
3 files changed, 179 insertions, 11 deletions
diff --git a/api_test/main.c b/api_test/main.c
index 06d9be2..2d65a46 100644
--- a/api_test/main.c
+++ b/api_test/main.c
@@ -8,6 +8,8 @@
#include "harness.h"
+#define UTF8_REPL "\xEF\xBF\xBD"
+
static const cmark_node_type node_types[] = {
CMARK_NODE_DOCUMENT,
CMARK_NODE_BLOCK_QUOTE,
@@ -32,10 +34,25 @@ static const cmark_node_type node_types[] = {
static const int num_node_types = sizeof(node_types) / sizeof(*node_types);
static void
+test_md_to_html(test_batch_runner *runner, const char *markdown,
+ const char *expected_html, const char *msg);
+
+static void
test_content(test_batch_runner *runner, cmark_node_type type,
int allowed_content);
static void
+test_char(test_batch_runner *runner, int valid, const char *utf8,
+ const char *msg);
+
+static void
+test_incomplete_char(test_batch_runner *runner, const char *utf8,
+ const char *msg);
+
+static void
+test_continuation_byte(test_batch_runner *runner, const char *utf8);
+
+static void
constructor(test_batch_runner *runner)
{
for (int i = 0; i < num_node_types; ++i) {
@@ -436,13 +453,8 @@ test_content(test_batch_runner *runner, cmark_node_type type,
static void
parser(test_batch_runner *runner)
{
- static const char markdown[] = "No newline";
- cmark_node *doc = cmark_parse_document(markdown, sizeof(markdown) - 1);
- char *html = cmark_render_html(doc);
- STR_EQ(runner, html, "<p>No newline</p>\n",
- "document without trailing newline");
- free(html);
- cmark_node_destroy(doc);
+ test_md_to_html(runner, "No newline", "<p>No newline</p>\n",
+ "document without trailing newline");
}
static void
@@ -475,6 +487,103 @@ render_html(test_batch_runner *runner)
cmark_node_destroy(doc);
}
+// UTF-8 test group. Feeds byte sequences through the Markdown-to-HTML
+// helpers below: range boundaries for each sequence length, sequences
+// cut off at the end of the input, and sequences with a corrupted
+// continuation byte.
+static void
+utf8(test_batch_runner *runner)
+{
+    // Range boundaries. `valid` is forwarded to test_char unchanged:
+    // 1 expects the bytes to come out as-is, 0 expects UTF8_REPL.
+    static const struct {
+        int valid;
+        const char *bytes;
+        const char *msg;
+    } ranges[] = {
+        { 1, "\x01",             "valid utf8 01" },
+        { 1, "\x7F",             "valid utf8 7F" },
+        { 0, "\x80",             "invalid utf8 80" },
+        { 0, "\xBF",             "invalid utf8 BF" },
+        { 0, "\xC0\x80",         "invalid utf8 C080" },
+        { 0, "\xC1\xBF",         "invalid utf8 C1BF" },
+        { 1, "\xC2\x80",         "valid utf8 C280" },
+        { 1, "\xDF\xBF",         "valid utf8 DFBF" },
+        { 0, "\xE0\x80\x80",     "invalid utf8 E08080" },
+        { 0, "\xE0\x9F\xBF",     "invalid utf8 E09FBF" },
+        { 1, "\xE0\xA0\x80",     "valid utf8 E0A080" },
+        { 1, "\xED\x9F\xBF",     "valid utf8 ED9FBF" },
+        { 0, "\xED\xA0\x80",     "invalid utf8 EDA080" },
+        { 0, "\xED\xBF\xBF",     "invalid utf8 EDBFBF" },
+        { 0, "\xF0\x80\x80\x80", "invalid utf8 F0808080" },
+        { 0, "\xF0\x8F\xBF\xBF", "invalid utf8 F08FBFBF" },
+        { 1, "\xF0\x90\x80\x80", "valid utf8 F0908080" },
+        { 1, "\xF4\x8F\xBF\xBF", "valid utf8 F48FBFBF" },
+        { 0, "\xF4\x90\x80\x80", "invalid utf8 F4908080" },
+        { 0, "\xF7\xBF\xBF\xBF", "invalid utf8 F7BFBFBF" },
+        { 0, "\xF8",             "invalid utf8 F8" },
+        { 0, "\xFF",             "invalid utf8 FF" },
+    };
+    const int num_ranges = sizeof(ranges) / sizeof(*ranges);
+
+    for (int i = 0; i < num_ranges; ++i) {
+        test_char(runner, ranges[i].valid, ranges[i].bytes,
+                  ranges[i].msg);
+    }
+
+    // Multi-byte sequences that stop short at the end of the input.
+    test_incomplete_char(runner, "\xE0\xA0", "invalid utf8 E0A0");
+    test_incomplete_char(runner, "\xF0\x90\x80", "invalid utf8 F09080");
+
+    // Valid 2-, 3- and 4-byte sequences whose continuation bytes are
+    // overwritten one position at a time by the helper.
+    test_continuation_byte(runner, "\xC2\x80");
+    test_continuation_byte(runner, "\xE0\xA0\x80");
+    test_continuation_byte(runner, "\xF0\x90\x80\x80");
+}
+
+// Renders "((((" + utf8 + "))))" as Markdown and checks the HTML.
+//
+// runner: batch runner that records the result.
+// valid:  non-zero if the bytes are expected to appear unchanged in the
+//         output; zero if they are expected to be replaced by a single
+//         UTF8_REPL (U+FFFD encoded as UTF-8).
+// utf8:   NUL-terminated bytes of one character under test. Callers in
+//         this file pass at most four bytes.
+// msg:    description reported with the test result.
+static void
+test_char(test_batch_runner *runner, int valid, const char *utf8,
+          const char *msg)
+{
+    // snprintf bounds the write, so an unexpectedly long input is
+    // truncated instead of overrunning the stack buffer.
+    char buf[20];
+    snprintf(buf, sizeof(buf), "((((%s))))", utf8);
+
+    if (valid) {
+        char expected[30];
+        snprintf(expected, sizeof(expected), "<p>((((%s))))</p>\n", utf8);
+        test_md_to_html(runner, buf, expected, msg);
+    }
+    else {
+        test_md_to_html(runner, buf, "<p>((((" UTF8_REPL "))))</p>\n",
+                        msg);
+    }
+}
+
+// Renders "----" + utf8 as Markdown, with the sequence under test as
+// the last bytes of the input, and expects it to be replaced by a
+// single UTF8_REPL in the HTML.
+//
+// runner: batch runner that records the result.
+// utf8:   NUL-terminated, truncated multi-byte sequence.
+// msg:    description reported with the test result.
+static void
+test_incomplete_char(test_batch_runner *runner, const char *utf8,
+                     const char *msg)
+{
+    // Bounded write: a too-long input is truncated rather than
+    // overflowing buf.
+    char buf[20];
+    snprintf(buf, sizeof(buf), "----%s", utf8);
+    test_md_to_html(runner, buf, "<p>----" UTF8_REPL "</p>\n", msg);
+}
+
+// For each continuation position of the multi-byte sequence `utf8`,
+// overwrites that byte with a space and checks the rendered HTML.
+//
+// Expected output per position: one UTF8_REPL for the bytes before the
+// space, the space itself, then one UTF8_REPL for every byte left after
+// it.
+//
+// runner: batch runner that records the result.
+// utf8:   NUL-terminated multi-byte sequence. Callers in this file pass
+//         2 to 4 bytes.
+static void
+test_continuation_byte(test_batch_runner *runner, const char *utf8)
+{
+    // Kept as int because it is printed with %d below.
+    int len = (int)strlen(utf8);
+
+    for (int pos = 1; pos < len; ++pos) {
+        char buf[20];
+        int written = snprintf(buf, sizeof(buf), "((((%s))))", utf8);
+        if (written < 0 || (size_t)written >= sizeof(buf)) {
+            // Input does not fit; buf[4+pos] could land outside
+            // the buffer, so report and stop instead.
+            fprintf(stderr,
+                    "test_continuation_byte: input too long\n");
+            return;
+        }
+        buf[4+pos] = '\x20';
+
+        // The check above limits len to 11 bytes, so the longest
+        // expected string is 47 bytes and always fits.
+        char expected[50];
+        int used = snprintf(expected, sizeof(expected), "%s",
+                            "<p>((((" UTF8_REPL "\x20");
+        for (int i = pos + 1; i < len; ++i) {
+            used += snprintf(expected + used,
+                             sizeof(expected) - (size_t)used,
+                             "%s", UTF8_REPL);
+        }
+        snprintf(expected + used, sizeof(expected) - (size_t)used,
+                 "%s", "))))</p>\n");
+
+        char *html = cmark_markdown_to_html(buf, strlen(buf));
+        STR_EQ(runner, html, expected,
+               "invalid utf8 continuation byte %d/%d", pos, len);
+        free(html);
+    }
+}
+
+// Converts `markdown` to HTML and compares the result with
+// `expected_html`, recording the outcome on `runner`.
+//
+// runner:        batch runner that records the result.
+// markdown:      NUL-terminated Markdown input.
+// expected_html: exact HTML the conversion should produce.
+// msg:           plain description reported with the test result.
+static void
+test_md_to_html(test_batch_runner *runner, const char *markdown,
+                const char *expected_html, const char *msg)
+{
+    char *html = cmark_markdown_to_html(markdown, strlen(markdown));
+    // STR_EQ takes a printf-style format; pass msg as data so a '%'
+    // in a description is never read as a conversion.
+    STR_EQ(runner, html, expected_html, "%s", msg);
+    free(html);
+}
+
int main() {
int retval;
test_batch_runner *runner = test_batch_runner_new();
@@ -486,6 +595,7 @@ int main() {
hierarchy(runner);
parser(runner);
render_html(runner);
+ utf8(runner);
test_print_summary(runner);
retval = test_ok(runner) ? 0 : 1;
diff --git a/src/utf8.c b/src/utf8.c
index b343175..e4ea8e2 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -28,7 +28,7 @@ static void encode_unknown(strbuf *buf)
strbuf_put(buf, repl, 3);
}
-int utf8proc_charlen(const uint8_t *str, int str_len)
+static int utf8proc_charlen(const uint8_t *str, int str_len)
{
int length, i;
@@ -51,6 +51,64 @@ int utf8proc_charlen(const uint8_t *str, int str_len)
return length;
}
+// Validate a single UTF-8 character according to RFC 3629.
+//
+// The sequence length comes from utf8proc_charlen(); a result of zero
+// or less from that call is returned unchanged. Otherwise returns the
+// length if the sequence passes the checks below and the negated
+// length if it fails one.
+static int utf8proc_valid(const uint8_t *str, int str_len)
+{
+    const int len = utf8proc_charlen(str, str_len);
+    int rejected = 0;
+
+    if (len <= 0)
+        return len;
+
+    if (len == 1) {
+        // ASCII NUL is technically valid but rejected for
+        // security reasons.
+        rejected = (str[0] == 0x00);
+    }
+    else if (len == 2) {
+        // Lead bytes below 0xC2 are overlong encodings.
+        rejected = (str[0] < 0xC2);
+    }
+    else if (len == 3) {
+        // 0xE0 followed by < 0xA0 is overlong;
+        // 0xED followed by >= 0xA0 is a surrogate.
+        rejected = (str[0] == 0xE0 && str[1] < 0xA0)
+                || (str[0] == 0xED && str[1] >= 0xA0);
+    }
+    else if (len == 4) {
+        // 0xF0 followed by < 0x90 is overlong; anything past
+        // 0xF4 0x8F is above 0x10FFFF.
+        rejected = (str[0] == 0xF0 && str[1] < 0x90)
+                || str[0] > 0xF4
+                || (str[0] == 0xF4 && str[1] >= 0x90);
+    }
+
+    return rejected ? -len : len;
+}
+
void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)
{
static const uint8_t whitespace[] = " ";
@@ -60,7 +118,8 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)
while (i < size) {
size_t org = i;
- while (i < size && line[i] != '\t' && line[i] < 0x80) {
+ while (i < size && line[i] != '\t' && line[i] != '\0'
+ && line[i] < 0x80) {
i++; tab++;
}
@@ -76,7 +135,7 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)
i += 1;
tab += numspaces;
} else {
- int charlen = utf8proc_charlen(line + i, size - i);
+ int charlen = utf8proc_valid(line + i, size - i);
if (charlen >= 0) {
strbuf_put(ob, line + i, charlen);
diff --git a/src/utf8.h b/src/utf8.h
index 319e39a..7df1573 100644
--- a/src/utf8.h
+++ b/src/utf8.h
@@ -11,7 +11,6 @@ extern "C" {
void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, int len);
void utf8proc_encode_char(int32_t uc, cmark_strbuf *buf);
int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst);
-int utf8proc_charlen(const uint8_t *str, int str_len);
void utf8proc_detab(cmark_strbuf *dest, const uint8_t *line, size_t size);
int utf8proc_is_space(int32_t uc);
int utf8proc_is_punctuation(int32_t uc);