summary | refs | log | tree | commit | diff
path: root/src/utf8.c
diff options
context:
space:
mode:
author John MacFarlane <jgm@berkeley.edu> 2015-06-09 10:54:58 -0700
committer John MacFarlane <jgm@berkeley.edu> 2015-06-09 10:54:58 -0700
commit 54d1249c2caebf45a24d691dc765fb93c9a5e594 (patch)
tree 75af766e702d5899959b91ae7bd99e186e846283 /src/utf8.c
parent bc14d869323650e936c7143dcf941b28ccd5b57d (diff)
parent 38f6ac470d3b597446d4663a00efbe6ebce8ee5e (diff)
Merge pull request #58 from nwellnhof/optimize_utf8proc_detab
Further optimize utf8proc_valid
Diffstat (limited to 'src/utf8.c')
-rw-r--r-- src/utf8.c 71
1 files changed, 34 insertions, 37 deletions
diff --git a/src/utf8.c b/src/utf8.c
index a4449dd..4b85714 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -54,9 +54,11 @@ static int utf8proc_charlen(const uint8_t *str, bufsize_t str_len)
}
// Validate a single UTF-8 character according to RFC 3629.
+// Assumes a multi-byte UTF-8 sequence.
static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
{
int length = utf8proc_utf8class[str[0]];
+ assert(length != 1);
if (!length)
return -1;
@@ -64,53 +66,48 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
if ((bufsize_t)length > str_len)
return -str_len;
- switch (length) {
- case 2:
- if ((str[1] & 0xC0) != 0x80)
- return -1;
+ if ((str[1] & 0xC0) != 0x80)
+ return -1;
+
+ if (length == 2) {
if (str[0] < 0xC2) {
// Overlong
return -length;
}
- break;
-
- case 3:
- if ((str[1] & 0xC0) != 0x80)
- return -1;
+ }
+ else {
if ((str[2] & 0xC0) != 0x80)
return -2;
- if (str[0] == 0xE0) {
- if (str[1] < 0xA0) {
- // Overlong
- return -length;
- }
- } else if (str[0] == 0xED) {
- if (str[1] >= 0xA0) {
- // Surrogate
- return -length;
- }
- }
- break;
- case 4:
- if ((str[1] & 0xC0) != 0x80)
- return -1;
- if ((str[2] & 0xC0) != 0x80)
- return -2;
- if ((str[3] & 0xC0) != 0x80)
- return -3;
- if (str[0] == 0xF0) {
- if (str[1] < 0x90) {
- // Overlong
- return -length;
+ if (length == 3) {
+ if (str[0] == 0xE0) {
+ if (str[1] < 0xA0) {
+ // Overlong
+ return -length;
+ }
+ } else if (str[0] == 0xED) {
+ if (str[1] >= 0xA0) {
+ // Surrogate
+ return -length;
+ }
}
- } else if (str[0] >= 0xF4) {
- if (str[0] > 0xF4 || str[1] >= 0x90) {
- // Above 0x10FFFF
- return -length;
+ }
+ else {
+ if ((str[3] & 0xC0) != 0x80)
+ return -3;
+
+ if (str[0] == 0xF0) {
+ if (str[1] < 0x90) {
+ // Overlong
+ return -length;
+ }
+ } else if (str[0] >= 0xF4) {
+ if (str[0] > 0xF4 || str[1] >= 0x90) {
+ // Above 0x10FFFF
+ return -length;
+ }
}
}
- break;
}
return length;