#include #include "bstrlib.h" #include "debug.h" #define advance(s) \ s++; \ check(*s >> 6 == 0x02, "UTF-8 decode error on byte %x", *s); // Reads a unicode code point from a UTF8-encoded string, and // puts it in the pointer n. If something illegal // is encountered, 0xFFFD is emitted. // Returns a pointer to next position in string, or NULL if no // more characters remain. extern unsigned char * from_utf8(unsigned char * s, unsigned int *n) { int x = 0; if (*s == 0) { return NULL; } else if (*s < 0x80) { x = *s; } else if (*s >> 5 == 0x06) { x = *s & 0x1F; advance(s); x = (x << 6) + (*s & 0x3F); } else if (*s >> 4 == 0x0E) { x = *s & 0x0F; advance(s); x = (x << 6) + (*s & 0x3F); advance(s); x = (x << 6) + (*s & 0x3F); } else if (*s >> 3 == 0x1E) { x = *s & 0x07; advance(s); x = (x << 6) + (*s & 0x3F); advance(s); x = (x << 6) + (*s & 0x3F); advance(s); x = (x << 6) + (*s & 0x3F); } else if (*s >> 2 == 0x3E) { x = *s & 0x03; advance(s); x = (x << 6) + (*s & 0x3F); advance(s); x = (x << 6) + (*s & 0x3F); advance(s); x = (x << 6) + (*s & 0x3F); advance(s); x = (x << 6) + (*s & 0x3F); } else { log_err("UTF-8 decode error on byte %x", *s); goto error; } *n = x; s++; return s; error: *n = 0xFFFD; return s; } // Converts the unicode code point c to UTF-8, // putting the result in dest. Returns 0 on success, -1 on error. extern int to_utf8(unsigned int c, bstring dest) { if (c < 0x80) { bconchar(dest, c); } else if (c < 0x800) { bconchar(dest, 192 + c/64); bconchar(dest, 128 + c%64); } else if (c - 0xd800u < 0x800) { goto error; } else if (c < 0x10000) { bconchar(dest, 224 + c / 4096); bconchar(dest, 128 + c /64%64); bconchar(dest, 128 + c%64); } else if (c < 0x110000) { bconchar(dest, 240 + c/262144); bconchar(dest, 128 + c/4096%64); bconchar(dest, 128 + c/64%64); bconchar(dest, 128 + c%64); } else { goto error; } return 0; error: return -1; } #define bufpush(x) \ check(to_utf8(x, buf) == 0, "UTF-8 encode error on code point %04x", x) // Returns the case-folded version of the source string, or NULL on error. extern bstring case_fold(bstring source) { unsigned char * s = source->data; unsigned int c = 0; bstring buf = bfromcstr(""); while ((s = from_utf8(s, &c))) { #include "case_fold_switch.c" } return buf; error: return NULL; }