summaryrefslogtreecommitdiff
path: root/src/utf8.c
blob: 4bb3b35ae38870fb44ce310a484becc283bc7369 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#include <stdlib.h>
#include "bstrlib.h"
#include "debug.h"

// Steps the byte pointer s forward by one and verifies that the byte it
// now points at is a UTF-8 continuation byte (top two bits are 10).
// On a mismatch the check() macro from debug.h handles the failure with
// the message below; from_utf8 relies on that transferring control to its
// local `error` label (NOTE(review): confirm against debug.h).
//
// Wrapped in do { } while (0) so that `advance(p);` is a single statement
// and stays correct under an un-braced if/else or loop.
#define advance(s) \
  do { \
    (s)++; \
    check(*(s) >> 6 == 0x02, "UTF-8 decode error on byte %x", *(s)); \
  } while (0)

// Decodes one code point from the NUL-terminated UTF-8 string s and
// stores it in *n.
//
// Returns a pointer to the position where decoding should resume, or
// NULL (leaving *n untouched) when s points at the terminating NUL.
//
// On malformed input an error is logged, *n is set to U+FFFD and the
// returned pointer is:
//   - one past the byte, when the byte is not a valid lead byte. The
//     pointer must move here, otherwise a caller that loops on this
//     function would be handed the same bad byte forever;
//   - the offending byte itself, when a continuation byte was expected
//     but something else was found. That byte is decoded on its own by
//     the next call, and a terminating NUL is never stepped over.
//
// NOTE(review): overlong encodings, surrogate code points and the
// obsolete 5-byte form (lead byte 111110xx) are decoded as-is rather
// than replaced with U+FFFD; tighten separately if strict conformance
// is required.
extern unsigned char * from_utf8(unsigned char * s, unsigned int *n)
{
  unsigned int x = 0;
  int trailing = 0;  // number of continuation bytes still expected

  if (*s == 0) {
    return NULL;
  }

  // Classify the lead byte: keep its payload bits and note how many
  // continuation bytes follow.
  if (*s < 0x80) {
    x = *s;
  } else if (*s >> 5 == 0x06) {   // 110xxxxx
    x = *s & 0x1F;
    trailing = 1;
  } else if (*s >> 4 == 0x0E) {   // 1110xxxx
    x = *s & 0x0F;
    trailing = 2;
  } else if (*s >> 3 == 0x1E) {   // 11110xxx
    x = *s & 0x07;
    trailing = 3;
  } else if (*s >> 2 == 0x3E) {   // 111110xx
    x = *s & 0x03;
    trailing = 4;
  } else {
    log_err("UTF-8 decode error on byte %x", *s);
    s++;  // skip the bad lead byte so the caller makes progress
    goto error;
  }

  // Each continuation byte contributes its low six bits. advance() bails
  // out through the error path if the next byte is not a continuation.
  while (trailing-- > 0) {
    advance(s);
    x = (x << 6) + (*s & 0x3F);
  }

  *n = x;
  return s + 1;
 error:
  *n = 0xFFFD;
  return s;
}

// Encodes the Unicode code point c as UTF-8 and appends the bytes to dest.
//
// Returns 0 on success and -1 on error. Errors are:
//   - c is a surrogate (U+D800..U+DFFF) or lies above U+10FFFF;
//   - appending to dest fails (bconchar reports a non-zero status).
//
// The code point is validated and fully encoded before dest is touched,
// so an invalid c never modifies dest. If an append fails part-way
// through a multi-byte sequence, the bytes already appended remain.
extern int to_utf8(unsigned int c, bstring dest)
{
  unsigned char bytes[4];
  int len = 0;
  int i;

  if (c < 0x80) {
    bytes[len++] = (unsigned char)c;
  } else if (c < 0x800) {
    bytes[len++] = (unsigned char)(0xC0 | (c >> 6));
    bytes[len++] = (unsigned char)(0x80 | (c & 0x3F));
  } else if (c - 0xd800u < 0x800) {
    // Unsigned wrap-around makes this a single range test for
    // U+D800..U+DFFF.
    return -1;
  } else if (c < 0x10000) {
    bytes[len++] = (unsigned char)(0xE0 | (c >> 12));
    bytes[len++] = (unsigned char)(0x80 | ((c >> 6) & 0x3F));
    bytes[len++] = (unsigned char)(0x80 | (c & 0x3F));
  } else if (c < 0x110000) {
    bytes[len++] = (unsigned char)(0xF0 | (c >> 18));
    bytes[len++] = (unsigned char)(0x80 | ((c >> 12) & 0x3F));
    bytes[len++] = (unsigned char)(0x80 | ((c >> 6) & 0x3F));
    bytes[len++] = (unsigned char)(0x80 | (c & 0x3F));
  } else {
    return -1;
  }

  for (i = 0; i < len; i++) {
    // bstrlib reports success as BSTR_OK, which is 0.
    if (bconchar(dest, (char)bytes[i]) != 0) {
      return -1;
    }
  }
  return 0;
}

// Appends code point cp, UTF-8 encoded, to the bstring named `buf` in the
// enclosing scope. A non-zero result from to_utf8 is handed to the
// check() macro from debug.h together with the message below
// (NOTE(review): check() presumably logs and jumps to the enclosing
// function's `error` label - confirm against debug.h).
// Note that cp is expanded twice, so it must be free of side effects.
#define bufpush(cp) \
  check(to_utf8((cp), buf) == 0, \
        "UTF-8 encode error on code point  %04x", (cp))

// Builds and returns a new bstring holding the case-folded form of
// source. The result is freshly allocated here and handed to the caller.
//
// source->data is walked one code point at a time with from_utf8, so
// processing stops at the first NUL byte. The per-code-point mapping is
// supplied by the textually included case_fold_switch.c, which works on
// the locals `c` and `buf` (their names must therefore not change).
//
// Returns NULL on error: source or its data is NULL, the output buffer
// cannot be allocated, or control reaches the `error` label (no goto is
// visible here, so it is presumably reached through the check() inside
// bufpush - confirm against debug.h). The partially built buffer is
// released before returning NULL.
extern bstring case_fold(bstring source)
{
  unsigned char * s = NULL;
  unsigned int c = 0;
  bstring buf = NULL;

  if (source == NULL || source->data == NULL) {
    return NULL;
  }
  s = source->data;

  buf = bfromcstr("");
  if (buf == NULL) {
    return NULL;
  }

  while ((s = from_utf8(s, &c))) {
#include "case_fold_switch.c"
  }
  return buf;
error:
  // Release the partial result instead of leaking it.
  bdestroy(buf);
  return NULL;
}