summary | refs | log | tree | commit | diff
path: root/src/utf8.c
diff options
context:
space:
mode:
author: John MacFarlane <jgm@berkeley.edu> 2015-06-11 16:43:34 -0700
committer: John MacFarlane <jgm@berkeley.edu> 2015-06-16 17:48:05 -0700
commit: 26537124a4070f7869db67317b90e08916050c8f (patch)
tree: 376f5f2f78e01fe2c7b09c9f7cfe471fbeeac60e /src/utf8.c
parent: 7f491b0bdf8e206458d284938efa8a0890c9d352 (diff)
Renamed utf8proc_detab as utf8proc_check, removed detabbing function.
Now it just replaces bad UTF-8 sequences and NUL bytes. This restores benchmarks to near their previous levels.
Diffstat (limited to 'src/utf8.c')
-rw-r--r-- src/utf8.c 31
1 files changed, 10 insertions, 21 deletions
diff --git a/src/utf8.c b/src/utf8.c
index f572042..ffe6652 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -116,53 +116,42 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
return length;
}
-void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
+void utf8proc_check(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
{
- static const uint8_t whitespace[] = " ";
-
- bufsize_t i = 0, tab = 0;
+ bufsize_t i = 0;
while (i < size) {
bufsize_t org = i;
int charlen = 0;
- while (i < size && line[i] != '\t') {
- if (line[i] >= 0x80) {
+ while (i < size) {
+ if (line[i] < 0x80 && line[i] != 0) {
+ i++;
+ } else if (line[i] >= 0x80) {
charlen = utf8proc_valid(line + i, size - i);
if (charlen < 0) {
charlen = -charlen;
break;
}
i += charlen;
- } else if (line[i] == '\0') {
+ } else if (line[i] == 0) {
// ASCII NUL is technically valid but rejected
// for security reasons.
charlen = 1;
break;
- } else {
- i++;
}
-
- tab++;
}
- if (i > org)
+ if (i > org) {
cmark_strbuf_put(ob, line + org, i - org);
+ }
- if (i >= size)
+ if (i >= size) {
break;
-
- if (line[i] == '\t') {
- int numspaces = 4 - (tab % 4);
- cmark_strbuf_put(ob, whitespace, numspaces);
- i += 1;
- tab += numspaces;
} else {
// Invalid UTF-8
encode_unknown(ob);
-
i += charlen;
- tab += 1;
}
}
}