1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
|
#include <stdlib.h>
#include "bstrlib.h"
#include "debug.h"
// Steps the byte pointer s forward by one and verifies that the byte it now
// addresses is a UTF-8 continuation byte (top two bits are 10).
//
// The verification is delegated to check(), which is not defined in this
// file (presumably it comes from debug.h and logs the message, then jumps to
// an `error` label in the enclosing function -- confirm against debug.h).
//
// The body is wrapped in do { ... } while (0) so the macro expands to a
// single statement and stays correct in an unbraced if/else. The argument is
// evaluated more than once, so it must be a plain lvalue without side effects.
#define advance(s) \
    do { \
        (s)++; \
        check(*(s) >> 6 == 0x02, "UTF-8 decode error on byte %x", *(s)); \
    } while (0)
// Decodes one Unicode code point from the NUL-terminated, UTF-8 encoded
// byte string s and stores it in *n.
//
// Returns a pointer to the position at which decoding should resume, or
// NULL when s already points at the terminating NUL (in that case *n is
// left untouched).
//
// Illegal input never stops the scan; U+FFFD is stored in *n instead:
//   - a byte that cannot start a sequence (a stray continuation byte, or
//     0xF8..0xFF, which would start the obsolete 5+ byte forms) is skipped,
//     so the returned pointer always moves forward and a caller looping on
//     the return value cannot spin on the same byte;
//   - a sequence cut short by a non-continuation byte leaves the returned
//     pointer on that byte, so it is decoded again as the start of the next
//     character (a NUL there ends the scan on the following call);
//   - a complete sequence that is overlong, encodes a surrogate
//     (U+D800..U+DFFF) or exceeds U+10FFFF is consumed as a whole.
extern unsigned char * from_utf8(unsigned char * s, unsigned int *n)
{
    unsigned int cp = 0;       // code point assembled so far
    unsigned int smallest = 0; // lowest value that needs this sequence length
    int trailing = 0;          // continuation bytes still to be read

    if (*s == 0) {
        return NULL;
    } else if (*s < 0x80) {
        cp = *s;
    } else if (*s >> 5 == 0x06) {          // 110xxxxx: two-byte sequence
        cp = *s & 0x1F;
        trailing = 1;
        smallest = 0x80;
    } else if (*s >> 4 == 0x0E) {          // 1110xxxx: three-byte sequence
        cp = *s & 0x0F;
        trailing = 2;
        smallest = 0x800;
    } else if (*s >> 3 == 0x1E) {          // 11110xxx: four-byte sequence
        cp = *s & 0x07;
        trailing = 3;
        smallest = 0x10000;
    } else {
        log_err("UTF-8 decode error on byte %x", *s);
        // Step over the offending byte; returning s unchanged would make
        // the caller decode the very same byte forever.
        s++;
        goto error;
    }

    while (trailing-- > 0) {
        // advance() is expected to jump to `error` (through check) when the
        // next byte is not a continuation byte; s then addresses that byte.
        advance(s);
        cp = (cp << 6) + (*s & 0x3F);
    }
    s++;

    // Well-formed bit pattern, but not a legal encoding of a scalar value.
    if (cp < smallest || cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) {
        log_err("UTF-8 decode error on code point %x", cp);
        goto error;
    }

    *n = cp;
    return s;
error:
    *n = 0xFFFD;
    return s;
}
// Encodes the Unicode code point c as UTF-8 and appends the resulting
// one to four bytes to dest.
//
// Returns 0 on success and -1 on error. Errors are:
//   - c is a surrogate (U+D800..U+DFFF) or lies above U+10FFFF; nothing is
//     appended in that case;
//   - appending to dest fails (bconchar returns a nonzero result); bytes
//     appended before the failure remain in dest.
extern int to_utf8(unsigned int c, bstring dest)
{
    unsigned char seq[4]; // encoded bytes, assembled before any append
    int len = 0;
    int i;

    if (c < 0x80) {
        seq[len++] = (unsigned char)c;
    } else if (c < 0x800) {
        seq[len++] = (unsigned char)(0xC0 | (c >> 6));
        seq[len++] = (unsigned char)(0x80 | (c & 0x3F));
    } else if (c - 0xd800u < 0x800) {
        // Unsigned wrap-around makes this true only for U+D800..U+DFFF.
        return -1;
    } else if (c < 0x10000) {
        seq[len++] = (unsigned char)(0xE0 | (c >> 12));
        seq[len++] = (unsigned char)(0x80 | ((c >> 6) & 0x3F));
        seq[len++] = (unsigned char)(0x80 | (c & 0x3F));
    } else if (c < 0x110000) {
        seq[len++] = (unsigned char)(0xF0 | (c >> 18));
        seq[len++] = (unsigned char)(0x80 | ((c >> 12) & 0x3F));
        seq[len++] = (unsigned char)(0x80 | ((c >> 6) & 0x3F));
        seq[len++] = (unsigned char)(0x80 | (c & 0x3F));
    } else {
        return -1;
    }

    for (i = 0; i < len; i++) {
        // Propagate append failures instead of reporting success.
        if (bconchar(dest, (char)seq[i]) != 0) {
            return -1;
        }
    }
    return 0;
}
// Appends the UTF-8 encoding of code point x to the bstring named `buf`,
// which must be in scope wherever the macro is expanded (it is not a macro
// parameter). A nonzero result from to_utf8 is handed to check() together
// with the offending code point for the error message.
// NOTE(review): check() is not defined in this file; presumably it logs the
// message and jumps to an `error` label in the enclosing function -- confirm
// against debug.h. The expansion carries no trailing semicolon.
#define bufpush(x) \
check(to_utf8(x, buf) == 0, "UTF-8 encode error on code point %04x", x)
// Returns a newly created bstring holding the case-folded version of
// source, or NULL on error (NULL source, failure to create the result
// string, or a failure raised inside the folding step). The caller owns the
// returned string.
//
// The input is decoded one code point at a time with from_utf8, so the scan
// ends at the first NUL byte in source->data.
extern bstring case_fold(bstring source)
{
    unsigned char * s = NULL;
    unsigned int c = 0;
    bstring buf = NULL;

    if (source == NULL || source->data == NULL) {
        return NULL;
    }
    s = source->data;

    buf = bfromcstr("");
    if (buf == NULL) {
        return NULL;
    }

    while ((s = from_utf8(s, &c))) {
        // The included fragment is not visible here; presumably it is a
        // switch on c that uses bufpush() to append the folded code
        // point(s) to buf and reaches `error` on failure -- confirm.
#include "case_fold_switch.c"
    }
    return buf;
error:
    // Release the partially built result so a failed fold does not leak it.
    bdestroy(buf);
    return NULL;
}
|