From 26537124a4070f7869db67317b90e08916050c8f Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Thu, 11 Jun 2015 16:43:34 -0700 Subject: Renamed utf8proc_detab as utf8proc_check, removed detabbing function. Now it just replaces bad UTF-8 sequences and NUL bytes. This restores benchmarks to near their previous levels. --- src/utf8.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) (limited to 'src/utf8.c') diff --git a/src/utf8.c b/src/utf8.c index f572042..ffe6652 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -116,53 +116,42 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len) return length; } -void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, bufsize_t size) +void utf8proc_check(cmark_strbuf *ob, const uint8_t *line, bufsize_t size) { - static const uint8_t whitespace[] = " "; - - bufsize_t i = 0, tab = 0; + bufsize_t i = 0; while (i < size) { bufsize_t org = i; int charlen = 0; - while (i < size && line[i] != '\t') { - if (line[i] >= 0x80) { + while (i < size) { + if (line[i] < 0x80 && line[i] != 0) { + i++; + } else if (line[i] >= 0x80) { charlen = utf8proc_valid(line + i, size - i); if (charlen < 0) { charlen = -charlen; break; } i += charlen; - } else if (line[i] == '\0') { + } else if (line[i] == 0) { // ASCII NUL is technically valid but rejected // for security reasons. charlen = 1; break; - } else { - i++; } - - tab++; } - if (i > org) + if (i > org) { cmark_strbuf_put(ob, line + org, i - org); + } - if (i >= size) + if (i >= size) { break; - - if (line[i] == '\t') { - int numspaces = 4 - (tab % 4); - cmark_strbuf_put(ob, whitespace, numspaces); - i += 1; - tab += numspaces; } else { // Invalid UTF-8 encode_unknown(ob); - i += charlen; - tab += 1; } } } -- cgit v1.2.3