From e216094e2192c05ddbd0988458eb8c0012e7baf8 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Tue, 2 Sep 2014 01:10:54 +0200 Subject: lol --- src/utf8.c | 221 ++++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 129 insertions(+), 92 deletions(-) (limited to 'src/utf8.c') diff --git a/src/utf8.c b/src/utf8.c index 4bb3b35..1a5df9e 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -2,105 +2,142 @@ #include "bstrlib.h" #include "debug.h" -#define advance(s) \ - s++; \ - check(*s >> 6 == 0x02, "UTF-8 decode error on byte %x", *s); - -// Reads a unicode code point from a UTF8-encoded string, and -// puts it in the pointer n. If something illegal -// is encountered, 0xFFFD is emitted. -// Returns a pointer to next position in string, or NULL if no -// more characters remain. -extern unsigned char * from_utf8(unsigned char * s, unsigned int *n) +static const int8_t utf8proc_utf8class[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 }; + +ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len) { - int x = 0; - - if (*s == 0) { - return NULL; - } else if (*s < 0x80) { - x = *s; - } else if (*s >> 5 == 0x06) { - x = *s & 0x1F; - advance(s); - x = (x << 6) + (*s & 0x3F); - } else if (*s >> 4 == 0x0E) { - x = *s & 0x0F; - advance(s); - x = (x << 6) + (*s & 0x3F); - advance(s); - x = (x << 6) + (*s & 0x3F); - } else if (*s >> 3 == 0x1E) { - x = *s & 0x07; - advance(s); - x = (x << 6) + (*s & 0x3F); - advance(s); - x = (x << 6) + (*s & 0x3F); - advance(s); - x = (x << 6) + (*s & 0x3F); - } else if (*s >> 2 == 0x3E) { - x = *s & 0x03; - advance(s); - x = (x << 6) + (*s & 0x3F); - advance(s); - x = (x << 6) + (*s & 0x3F); - advance(s); - x = (x << 6) + (*s & 0x3F); - advance(s); - x = (x << 6) + (*s & 0x3F); - } else { - log_err("UTF-8 decode error on byte %x", *s); - goto error; - } - *n = x; - s++; - return s; - error: - *n = 0xFFFD; - return s; + ssize_t length, i; + + if (!str_len) + return 0; + + length = utf8proc_utf8class[str[0]]; + + if (!length) + return -1; + + if (str_len >= 0 && length > str_len) + return -1; + + for (i = 1; i < length; i++) { + if ((str[i] & 0xC0) != 0x80) + return -1; + } + + return length; +} + +ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst) +{ + ssize_t length; + int32_t uc = -1; + + *dst = -1; + length = utf8proc_charlen(str, str_len); + if (length < 0) + return -1; + + switch (length) { + case 1: + uc = str[0]; + break; + case 2: + uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F); + if (uc < 0x80) uc = -1; + break; + case 3: + uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6) + + (str[2] & 0x3F); + if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) || + (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1; + break; + case 4: + uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12) + + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F); + if (uc < 0x10000 || uc >= 0x110000) uc = -1; + break; + } + + if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE)) + return -1; + + *dst = uc; + return length; } -// Converts the unicode code point c to UTF-8, -// putting the result in dest. Returns 0 on success, -1 on error. -extern int to_utf8(unsigned int c, bstring dest) +void utf8_encode_char(int32_t uc, gh_buf *buf) { - if (c < 0x80) { - bconchar(dest, c); - } else if (c < 0x800) { - bconchar(dest, 192 + c/64); - bconchar(dest, 128 + c%64); - } else if (c - 0xd800u < 0x800) { - goto error; - } else if (c < 0x10000) { - bconchar(dest, 224 + c / 4096); - bconchar(dest, 128 + c /64%64); - bconchar(dest, 128 + c%64); - } else if (c < 0x110000) { - bconchar(dest, 240 + c/262144); - bconchar(dest, 128 + c/4096%64); - bconchar(dest, 128 + c/64%64); - bconchar(dest, 128 + c%64); - } else { - goto error; - } - return 0; -error: - return -1; + char dst[4]; + int len = 0; + + if (uc < 0x00) { + assert(false); + } else if (uc < 0x80) { + dst[0] = uc; + len = 1; + } else if (uc < 0x800) { + dst[0] = 0xC0 + (uc >> 6); + dst[1] = 0x80 + (uc & 0x3F); + len = 2; + } else if (uc == 0xFFFF) { + dst[0] = 0xFF; + return 1; + } else if (uc == 0xFFFE) { + dst[0] = 0xFE; + len = 1; + } else if (uc < 0x10000) { + dst[0] = 0xE0 + (uc >> 12); + dst[1] = 0x80 + ((uc >> 6) & 0x3F); + dst[2] = 0x80 + (uc & 0x3F); + len = 3; + } else if (uc < 0x110000) { + dst[0] = 0xF0 + (uc >> 18); + dst[1] = 0x80 + ((uc >> 12) & 0x3F); + dst[2] = 0x80 + ((uc >> 6) & 0x3F); + dst[3] = 0x80 + (uc & 0x3F); + len = 4; + } else { + assert(false); + } + + gh_buf_put(buf, dst, len); } +void utf8proc_case_fold(gh_buf *dest, const unsigned char *str, int len) +{ + int32_t c; + #define bufpush(x) \ - check(to_utf8(x, buf) == 0, "UTF-8 encode error on code point %04x", x) + utf8proc_encode_char(x, dest) -// Returns the case-folded version of the source string, or NULL on error. -extern bstring case_fold(bstring source) -{ - unsigned char * s = source->data; - unsigned int c = 0; - bstring buf = bfromcstr(""); - while ((s = from_utf8(s, &c))) { -#include "case_fold_switch.c" - } - return buf; -error: - return NULL; + while (len > 0) { + ssize_t char_len = utf8proc_iterate(str, len, &c); + + if (char_len < 0) { + bufpush(0xFFFD); + continue; + } + +#include "case_fold_switch.inc" + + str += char_len; + len -= char_len; + } } -- cgit v1.2.3 From 582674e662d1f8757350c51486a5e0a837195e15 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Tue, 2 Sep 2014 13:18:04 +0200 Subject: ffffix --- src/utf8.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'src/utf8.c') diff --git a/src/utf8.c b/src/utf8.c index 1a5df9e..e3f8dd3 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -1,6 +1,8 @@ #include -#include "bstrlib.h" -#include "debug.h" +#include +#include + +#include "stmd.h" static const int8_t utf8proc_utf8class[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -- cgit v1.2.3 From c28af79329264a7cf331a1b1c414919e4ed9e9f9 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Tue, 2 Sep 2014 13:37:34 +0200 Subject: It buiiiilds --- src/utf8.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'src/utf8.c') diff --git a/src/utf8.c b/src/utf8.c index e3f8dd3..32c78a4 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "stmd.h" @@ -83,9 +84,9 @@ ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst) return length; } -void utf8_encode_char(int32_t uc, gh_buf *buf) +void utf8proc_encode_char(int32_t uc, gh_buf *buf) { - char dst[4]; + unsigned char dst[4]; int len = 0; if (uc < 0x00) { @@ -99,7 +100,7 @@ void utf8_encode_char(int32_t uc, gh_buf *buf) len = 2; } else if (uc == 0xFFFF) { dst[0] = 0xFF; - return 1; + len = 1; } else if (uc == 0xFFFE) { dst[0] = 0xFE; len = 1; -- cgit v1.2.3 From 543c2c94d71adee42c7bd2f8027d75c87ed8120d Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Thu, 4 Sep 2014 18:38:14 +0200 Subject: Rename to strbuf --- src/utf8.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/utf8.c') diff --git a/src/utf8.c b/src/utf8.c index 32c78a4..cebd872 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -84,7 +84,7 @@ ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst) return length; } -void utf8proc_encode_char(int32_t uc, gh_buf *buf) +void utf8proc_encode_char(int32_t uc, strbuf *buf) { unsigned char dst[4]; int len = 0; @@ -119,10 +119,10 @@ void utf8proc_encode_char(int32_t uc, gh_buf *buf) assert(false); } - gh_buf_put(buf, dst, len); + strbuf_put(buf, dst, len); } -void utf8proc_case_fold(gh_buf *dest, const unsigned char *str, int len) +void utf8proc_case_fold(strbuf *dest, const unsigned char *str, int len) { int32_t c; -- cgit v1.2.3 From 61e3e606e64221eaa5cf3d83dc598d5a42818d10 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Sat, 6 Sep 2014 20:48:05 +0200 Subject: UTF8-aware detabbing and entity handling --- src/utf8.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 6 deletions(-) (limited to 'src/utf8.c') diff --git a/src/utf8.c b/src/utf8.c index cebd872..12d7ba5 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -3,7 +3,7 @@ #include #include -#include "stmd.h" +#include "utf8.h" static const int8_t utf8proc_utf8class[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -23,6 +23,12 @@ static const int8_t utf8proc_utf8class[256] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 }; +static void encode_unknown(strbuf *buf) +{ + static const unsigned char repl[] = {239, 191, 189}; + strbuf_put(buf, repl, 3); +} + ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len) { ssize_t length, i; @@ -46,6 +52,46 @@ ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len) return length; } +void utf8proc_detab(strbuf *ob, const unsigned char *line, size_t size) +{ + static const unsigned char whitespace[] = " "; + + size_t i = 0, tab = 0; + + while (i < size) { + size_t org = i; + + while (i < size && line[i] != '\t' && line[i] <= 0x80) { + i++; tab++; + } + + if (i > org) + strbuf_put(ob, line + org, i - org); + + if (i >= size) + break; + + if (line[i] == '\t') { + int numspaces = 4 - (tab % 4); + strbuf_put(ob, whitespace, numspaces); + i += 1; + tab += numspaces; + } else { + ssize_t charlen = utf8proc_charlen(line + i, size - i); + + if (charlen < 0) { + encode_unknown(ob); + i++; + } else { + strbuf_put(ob, line + i, charlen); + i += charlen; + } + + tab += 1; + } + } +} + ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst) { ssize_t length; @@ -89,9 +135,9 @@ void utf8proc_encode_char(int32_t uc, strbuf *buf) unsigned char dst[4]; int len = 0; - if (uc < 0x00) { - assert(false); - } else if (uc < 0x80) { + assert(uc >= 0); + + if (uc < 0x80) { dst[0] = uc; len = 1; } else if (uc < 0x800) { @@ -116,7 +162,8 @@ void utf8proc_encode_char(int32_t uc, strbuf *buf) dst[3] = 0x80 + (uc & 0x3F); len = 4; } else { - assert(false); + encode_unknown(buf); + return; } strbuf_put(buf, dst, len); @@ -133,7 +180,7 @@ void utf8proc_case_fold(strbuf *dest, const unsigned char *str, int len) ssize_t char_len = utf8proc_iterate(str, len, &c); if (char_len < 0) { - bufpush(0xFFFD); + encode_unknown(dest); continue; } -- cgit v1.2.3 From 94a79a605f3e76a43f1f87a5044f6761b99e5ca5 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Wed, 10 Sep 2014 18:33:27 +0200 Subject: Cleanup reference implementation --- src/utf8.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'src/utf8.c') diff --git a/src/utf8.c b/src/utf8.c index 12d7ba5..c65aec6 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -25,7 +25,7 @@ static const int8_t utf8proc_utf8class[256] = { static void encode_unknown(strbuf *buf) { - static const unsigned char repl[] = {239, 191, 189}; + static const uint8_t repl[] = {239, 191, 189}; strbuf_put(buf, repl, 3); } @@ -52,9 +52,9 @@ ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len) return length; } -void utf8proc_detab(strbuf *ob, const unsigned char *line, size_t size) +void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size) { - static const unsigned char whitespace[] = " "; + static const uint8_t whitespace[] = " "; size_t i = 0, tab = 0; @@ -132,7 +132,7 @@ ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst) void utf8proc_encode_char(int32_t uc, strbuf *buf) { - unsigned char dst[4]; + uint8_t dst[4]; int len = 0; assert(uc >= 0); @@ -169,7 +169,7 @@ void utf8proc_encode_char(int32_t uc, strbuf *buf) strbuf_put(buf, dst, len); } -void utf8proc_case_fold(strbuf *dest, const unsigned char *str, int len) +void utf8proc_case_fold(strbuf *dest, const uint8_t *str, int len) { int32_t c; -- cgit v1.2.3 From c47e3a34adac00a262f72c6d17a1c87deefa33c4 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Wed, 10 Sep 2014 19:39:03 +0200 Subject: Fix infinite loop when case folding invalid UTF8 chars --- src/utf8.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'src/utf8.c') diff --git a/src/utf8.c b/src/utf8.c index c65aec6..1b0224b 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -29,9 +29,9 @@ static void encode_unknown(strbuf *buf) strbuf_put(buf, repl, 3); } -ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len) +int utf8proc_charlen(const uint8_t *str, int str_len) { - ssize_t length, i; + int length, i; if (!str_len) return 0; @@ -42,11 +42,11 @@ ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len) return -1; if (str_len >= 0 && length > str_len) - return -1; + return -str_len; for (i = 1; i < length; i++) { if ((str[i] & 0xC0) != 0x80) - return -1; + return -i; } return length; @@ -77,7 +77,7 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size) i += 1; tab += numspaces; } else { - ssize_t charlen = utf8proc_charlen(line + i, size - i); + int charlen = utf8proc_charlen(line + i, size - i); if (charlen < 0) { encode_unknown(ob); @@ -92,9 +92,9 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size) } } -ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst) +int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst) { - ssize_t length; + int length; int32_t uc = -1; *dst = -1; @@ -177,15 +177,15 @@ void utf8proc_case_fold(strbuf *dest, const uint8_t *str, int len) utf8proc_encode_char(x, dest) while (len > 0) { - ssize_t char_len = utf8proc_iterate(str, len, &c); + int char_len = utf8proc_iterate(str, len, &c); - if (char_len < 0) { + if (char_len >= 0) { +#include "case_fold_switch.inc" + } else { encode_unknown(dest); - continue; + char_len = -char_len; } -#include "case_fold_switch.inc" - str += char_len; len -= char_len; } -- cgit v1.2.3 From 79e7a4bbf7055e33b346564db769f03e85f98988 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Wed, 10 Sep 2014 19:40:40 +0200 Subject: Improve invalid UTF8 codepoint skipping --- src/utf8.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'src/utf8.c') diff --git a/src/utf8.c b/src/utf8.c index 1b0224b..6b34831 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -79,14 +79,14 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size) } else { int charlen = utf8proc_charlen(line + i, size - i); - if (charlen < 0) { - encode_unknown(ob); - i++; - } else { + if (charlen >= 0) { strbuf_put(ob, line + i, charlen); - i += charlen; + } else { + encode_unknown(ob); + charlen = -charlen; } + i += charlen; tab += 1; } } -- cgit v1.2.3