summaryrefslogtreecommitdiff
path: root/src/utf8.c
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2014-07-21 22:29:16 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2014-08-13 22:56:32 -0700
commit870e63be7360b5a0097a27656048e853bc720464 (patch)
treee8f19ee2d62e529115cb71dcda5f3298cca7d389 /src/utf8.c
parent650ad87f35f4405a2ca8270d2b2835daa442e5f1 (diff)
Initial commit
Diffstat (limited to 'src/utf8.c')
-rw-r--r--src/utf8.c106
1 files changed, 106 insertions, 0 deletions
diff --git a/src/utf8.c b/src/utf8.c
new file mode 100644
index 0000000..4bb3b35
--- /dev/null
+++ b/src/utf8.c
@@ -0,0 +1,106 @@
+#include <stdlib.h>
+#include "bstrlib.h"
+#include "debug.h"
+
+#define advance(s) \
+ s++; \
+ check(*s >> 6 == 0x02, "UTF-8 decode error on byte %x", *s);
+
+// Reads a unicode code point from a UTF8-encoded string, and
+// puts it in the pointer n. If something illegal
+// is encountered, 0xFFFD is emitted.
+// Returns a pointer to next position in string, or NULL if no
+// more characters remain.
+extern unsigned char * from_utf8(unsigned char * s, unsigned int *n)
+{
+ int x = 0;
+
+ if (*s == 0) {
+ return NULL;
+ } else if (*s < 0x80) {
+ x = *s;
+ } else if (*s >> 5 == 0x06) {
+ x = *s & 0x1F;
+ advance(s);
+ x = (x << 6) + (*s & 0x3F);
+ } else if (*s >> 4 == 0x0E) {
+ x = *s & 0x0F;
+ advance(s);
+ x = (x << 6) + (*s & 0x3F);
+ advance(s);
+ x = (x << 6) + (*s & 0x3F);
+ } else if (*s >> 3 == 0x1E) {
+ x = *s & 0x07;
+ advance(s);
+ x = (x << 6) + (*s & 0x3F);
+ advance(s);
+ x = (x << 6) + (*s & 0x3F);
+ advance(s);
+ x = (x << 6) + (*s & 0x3F);
+ } else if (*s >> 2 == 0x3E) {
+ x = *s & 0x03;
+ advance(s);
+ x = (x << 6) + (*s & 0x3F);
+ advance(s);
+ x = (x << 6) + (*s & 0x3F);
+ advance(s);
+ x = (x << 6) + (*s & 0x3F);
+ advance(s);
+ x = (x << 6) + (*s & 0x3F);
+ } else {
+ log_err("UTF-8 decode error on byte %x", *s);
+ goto error;
+ }
+ *n = x;
+ s++;
+ return s;
+ error:
+ *n = 0xFFFD;
+ return s;
+}
+
+// Converts the unicode code point c to UTF-8,
+// putting the result in dest. Returns 0 on success, -1 on error.
+extern int to_utf8(unsigned int c, bstring dest)
+{
+ if (c < 0x80) {
+ bconchar(dest, c);
+ } else if (c < 0x800) {
+ bconchar(dest, 192 + c/64);
+ bconchar(dest, 128 + c%64);
+ } else if (c - 0xd800u < 0x800) {
+ goto error;
+ } else if (c < 0x10000) {
+ bconchar(dest, 224 + c / 4096);
+ bconchar(dest, 128 + c /64%64);
+ bconchar(dest, 128 + c%64);
+ } else if (c < 0x110000) {
+ bconchar(dest, 240 + c/262144);
+ bconchar(dest, 128 + c/4096%64);
+ bconchar(dest, 128 + c/64%64);
+ bconchar(dest, 128 + c%64);
+ } else {
+ goto error;
+ }
+ return 0;
+error:
+ return -1;
+}
+
+#define bufpush(x) \
+ check(to_utf8(x, buf) == 0, "UTF-8 encode error on code point %04x", x)
+
+// Returns the case-folded version of the source string, or NULL on error.
+extern bstring case_fold(bstring source)
+{
+ unsigned char * s = source->data;
+ unsigned int c = 0;
+ bstring buf = bfromcstr("");
+ while ((s = from_utf8(s, &c))) {
+#include "case_fold_switch.c"
+ }
+ return buf;
+error:
+ return NULL;
+}
+