summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author John MacFarlane <jgm@berkeley.edu> 2015-06-11 16:43:34 -0700
committer John MacFarlane <jgm@berkeley.edu> 2015-06-16 17:48:05 -0700
commit 26537124a4070f7869db67317b90e08916050c8f (patch)
tree 376f5f2f78e01fe2c7b09c9f7cfe471fbeeac60e
parent 7f491b0bdf8e206458d284938efa8a0890c9d352 (diff)
Renamed utf8proc_detab as utf8proc_check, removed detabbing function.
Now it just replaces bad UTF-8 sequences and NUL bytes. This restores benchmarks to near their previous levels.
-rw-r--r-- src/blocks.c | 2
-rw-r--r-- src/utf8.c | 31
-rw-r--r-- src/utf8.h | 2
3 files changed, 12 insertions, 23 deletions
diff --git a/src/blocks.c b/src/blocks.c
index 06f6dcb..08f2e63 100644
--- a/src/blocks.c
+++ b/src/blocks.c
@@ -619,7 +619,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
cmark_chunk input;
bool maybe_lazy;
- cmark_strbuf_put(parser->curline, buffer, bytes);
+ utf8proc_check(parser->curline, buffer, bytes);
parser->offset = 0;
parser->column = 0;
parser->blank = false;
diff --git a/src/utf8.c b/src/utf8.c
index f572042..ffe6652 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -116,53 +116,42 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
return length;
}
-void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
+void utf8proc_check(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
{
- static const uint8_t whitespace[] = " ";
-
- bufsize_t i = 0, tab = 0;
+ bufsize_t i = 0;
while (i < size) {
bufsize_t org = i;
int charlen = 0;
- while (i < size && line[i] != '\t') {
- if (line[i] >= 0x80) {
+ while (i < size) {
+ if (line[i] < 0x80 && line[i] != 0) {
+ i++;
+ } else if (line[i] >= 0x80) {
charlen = utf8proc_valid(line + i, size - i);
if (charlen < 0) {
charlen = -charlen;
break;
}
i += charlen;
- } else if (line[i] == '\0') {
+ } else if (line[i] == 0) {
// ASCII NUL is technically valid but rejected
// for security reasons.
charlen = 1;
break;
- } else {
- i++;
}
-
- tab++;
}
- if (i > org)
+ if (i > org) {
cmark_strbuf_put(ob, line + org, i - org);
+ }
- if (i >= size)
+ if (i >= size) {
break;
-
- if (line[i] == '\t') {
- int numspaces = 4 - (tab % 4);
- cmark_strbuf_put(ob, whitespace, numspaces);
- i += 1;
- tab += numspaces;
} else {
// Invalid UTF-8
encode_unknown(ob);
-
i += charlen;
- tab += 1;
}
}
}
diff --git a/src/utf8.h b/src/utf8.h
index ed1d7ee..9f1a4ec 100644
--- a/src/utf8.h
+++ b/src/utf8.h
@@ -11,7 +11,7 @@ extern "C" {
void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, bufsize_t len);
void utf8proc_encode_char(int32_t uc, cmark_strbuf *buf);
int utf8proc_iterate(const uint8_t *str, bufsize_t str_len, int32_t *dst);
-void utf8proc_detab(cmark_strbuf *dest, const uint8_t *line, bufsize_t size);
+void utf8proc_check(cmark_strbuf *dest, const uint8_t *line, bufsize_t size);
int utf8proc_is_space(int32_t uc);
int utf8proc_is_punctuation(int32_t uc);