From 26537124a4070f7869db67317b90e08916050c8f Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Thu, 11 Jun 2015 16:43:34 -0700 Subject: Renamed utf8proc_detab as utf8proc_check, removed detabbing function. Now it just replaces bad UTF-8 sequences and NUL bytes. This restores benchmarks to near their previous levels. --- src/blocks.c | 2 +- src/utf8.c | 31 ++++++++++--------------------- src/utf8.h | 2 +- 3 files changed, 12 insertions(+), 23 deletions(-) (limited to 'src') diff --git a/src/blocks.c b/src/blocks.c index 06f6dcb..08f2e63 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -619,7 +619,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte cmark_chunk input; bool maybe_lazy; - cmark_strbuf_put(parser->curline, buffer, bytes); + utf8proc_check(parser->curline, buffer, bytes); parser->offset = 0; parser->column = 0; parser->blank = false; diff --git a/src/utf8.c b/src/utf8.c index f572042..ffe6652 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -116,53 +116,42 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len) return length; } -void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, bufsize_t size) +void utf8proc_check(cmark_strbuf *ob, const uint8_t *line, bufsize_t size) { - static const uint8_t whitespace[] = " "; - - bufsize_t i = 0, tab = 0; + bufsize_t i = 0; while (i < size) { bufsize_t org = i; int charlen = 0; - while (i < size && line[i] != '\t') { - if (line[i] >= 0x80) { + while (i < size) { + if (line[i] < 0x80 && line[i] != 0) { + i++; + } else if (line[i] >= 0x80) { charlen = utf8proc_valid(line + i, size - i); if (charlen < 0) { charlen = -charlen; break; } i += charlen; - } else if (line[i] == '\0') { + } else if (line[i] == 0) { // ASCII NUL is technically valid but rejected // for security reasons. 
charlen = 1; break; - } else { - i++; } - - tab++; } - if (i > org) + if (i > org) { cmark_strbuf_put(ob, line + org, i - org); + } - if (i >= size) + if (i >= size) { break; - - if (line[i] == '\t') { - int numspaces = 4 - (tab % 4); - cmark_strbuf_put(ob, whitespace, numspaces); - i += 1; - tab += numspaces; } else { // Invalid UTF-8 encode_unknown(ob); - i += charlen; - tab += 1; } } } diff --git a/src/utf8.h b/src/utf8.h index ed1d7ee..9f1a4ec 100644 --- a/src/utf8.h +++ b/src/utf8.h @@ -11,7 +11,7 @@ extern "C" { void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, bufsize_t len); void utf8proc_encode_char(int32_t uc, cmark_strbuf *buf); int utf8proc_iterate(const uint8_t *str, bufsize_t str_len, int32_t *dst); -void utf8proc_detab(cmark_strbuf *dest, const uint8_t *line, bufsize_t size); +void utf8proc_check(cmark_strbuf *dest, const uint8_t *line, bufsize_t size); int utf8proc_is_space(int32_t uc); int utf8proc_is_punctuation(int32_t uc); -- cgit v1.2.3