From 82b360db50dfcb889e4488dfb3e5bcfc52d91857 Mon Sep 17 00:00:00 2001
From: Nick Wellnhofer
Date: Mon, 24 Nov 2014 18:23:13 +0100
Subject: Off-by-one error in utf8proc_detab

---
 src/utf8.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/utf8.c b/src/utf8.c
index e144c72..b343175 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -60,7 +60,7 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)
 	while (i < size) {
 		size_t org = i;
 
-		while (i < size && line[i] != '\t' && line[i] <= 0x80) {
+		while (i < size && line[i] != '\t' && line[i] < 0x80) {
 			i++; tab++;
 		}
 
-- 
cgit v1.2.3


From ff9c0dcecd1314b820bf7d2584990c26c0e28909 Mon Sep 17 00:00:00 2001
From: Nick Wellnhofer
Date: Mon, 24 Nov 2014 20:10:49 +0100
Subject: Validate UTF-8 input

Invalid UTF-8 byte sequences are replaced with the Unicode replacement
character U+FFFD.

Fixes #213.
---
 src/utf8.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 src/utf8.h | 1 -
 2 files changed, 62 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/utf8.c b/src/utf8.c
index b343175..e4ea8e2 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -28,7 +28,7 @@ static void encode_unknown(strbuf *buf)
 	strbuf_put(buf, repl, 3);
 }
 
-int utf8proc_charlen(const uint8_t *str, int str_len)
+static int utf8proc_charlen(const uint8_t *str, int str_len)
 {
 	int length, i;
 
@@ -51,6 +51,64 @@ int utf8proc_charlen(const uint8_t *str, int str_len)
 	return length;
 }
 
+// Validate a single UTF-8 character according to RFC 3629.
+static int utf8proc_valid(const uint8_t *str, int str_len)
+{
+	int length = utf8proc_charlen(str, str_len);
+
+	if (length <= 0)
+		return length;
+
+	switch (length) {
+	case 1:
+		if (str[0] == 0x00) {
+			// ASCII NUL is technically valid but rejected
+			// for security reasons.
+			return -length;
+		}
+		break;
+
+	case 2:
+		if (str[0] < 0xC2) {
+			// Overlong
+			return -length;
+		}
+		break;
+
+	case 3:
+		if (str[0] == 0xE0) {
+			if (str[1] < 0xA0) {
+				// Overlong
+				return -length;
+			}
+		}
+		else if (str[0] == 0xED) {
+			if (str[1] >= 0xA0) {
+				// Surrogate
+				return -length;
+			}
+		}
+		break;
+
+	case 4:
+		if (str[0] == 0xF0) {
+			if (str[1] < 0x90) {
+				// Overlong
+				return -length;
+			}
+		}
+		else if (str[0] >= 0xF4) {
+			if (str[0] > 0xF4 || str[1] >= 0x90) {
+				// Above 0x10FFFF
+				return -length;
+			}
+		}
+		break;
+	}
+
+	return length;
+}
+
 void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)
 {
 	static const uint8_t whitespace[] = " ";
@@ -60,7 +118,8 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)
 	while (i < size) {
 		size_t org = i;
 
-		while (i < size && line[i] != '\t' && line[i] < 0x80) {
+		while (i < size && line[i] != '\t' && line[i] != '\0'
+		       && line[i] < 0x80) {
 			i++; tab++;
 		}
 
@@ -76,7 +135,7 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)
 			i += 1;
 			tab += numspaces;
 		} else {
-			int charlen = utf8proc_charlen(line + i, size - i);
+			int charlen = utf8proc_valid(line + i, size - i);
 
 			if (charlen >= 0) {
 				strbuf_put(ob, line + i, charlen);
diff --git a/src/utf8.h b/src/utf8.h
index 319e39a..7df1573 100644
--- a/src/utf8.h
+++ b/src/utf8.h
@@ -11,7 +11,6 @@ extern "C" {
 void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, int len);
 void utf8proc_encode_char(int32_t uc, cmark_strbuf *buf);
 int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst);
-int utf8proc_charlen(const uint8_t *str, int str_len);
 void utf8proc_detab(cmark_strbuf *dest, const uint8_t *line, size_t size);
 int utf8proc_is_space(int32_t uc);
 int utf8proc_is_punctuation(int32_t uc);
-- 
cgit v1.2.3