diff options
| author | John MacFarlane <jgm@berkeley.edu> | 2015-06-09 09:54:55 -0700 | 
|---|---|---|
| committer | John MacFarlane <jgm@berkeley.edu> | 2015-06-09 09:54:55 -0700 | 
| commit | bc14d869323650e936c7143dcf941b28ccd5b57d (patch) | |
| tree | e46c0ec079ff6e62c35bd7de544f4a794caeaa27 /src/utf8.c | |
| parent | a173d0bb746b1afc6a4942a2536c9008da35b572 (diff) | |
| parent | 8d997c85ee1452480ed3d821ce0642f7e6e5b9e6 (diff) | |
Merge pull request #57 from nwellnhof/optimize_utf8proc_detab
Optimize utf8proc_detab
Diffstat (limited to 'src/utf8.c')
| -rw-r--r-- | src/utf8.c | 62 | 
1 file changed, 40 insertions, 22 deletions
| @@ -56,21 +56,18 @@ static int utf8proc_charlen(const uint8_t *str, bufsize_t str_len)  // Validate a single UTF-8 character according to RFC 3629.  static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)  { -	int length = utf8proc_charlen(str, str_len); +	int length = utf8proc_utf8class[str[0]]; -	if (length <= 0) -		return length; +	if (!length) +		return -1; -	switch (length) { -	case 1: -		if (str[0] == 0x00) { -			// ASCII NUL is technically valid but rejected -			// for security reasons. -			return -length; -		} -		break; +	if ((bufsize_t)length > str_len) +		return -str_len; +	switch (length) {  	case 2: +		if ((str[1] & 0xC0) != 0x80) +			return -1;  		if (str[0] < 0xC2) {  			// Overlong  			return -length; @@ -78,6 +75,10 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)  		break;  	case 3: +		if ((str[1] & 0xC0) != 0x80) +			return -1; +		if ((str[2] & 0xC0) != 0x80) +			return -2;  		if (str[0] == 0xE0) {  			if (str[1] < 0xA0) {  				// Overlong @@ -92,6 +93,12 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)  		break;  	case 4: +		if ((str[1] & 0xC0) != 0x80) +			return -1; +		if ((str[2] & 0xC0) != 0x80) +			return -2; +		if ((str[3] & 0xC0) != 0x80) +			return -3;  		if (str[0] == 0xF0) {  			if (str[1] < 0x90) {  				// Overlong @@ -117,10 +124,27 @@ void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)  	while (i < size) {  		bufsize_t org = i; +		int charlen = 0; + +		while (i < size && line[i] != '\t') { +			if (line[i] >= 0x80) { +				charlen = utf8proc_valid(line + i, size - i); +				if (charlen < 0) { +					charlen = -charlen; +					break; +				} +				i += charlen; +			} +			else if (line[i] == '\0') { +				// ASCII NUL is technically valid but rejected +				// for security reasons. 
+				charlen = 1; +				break; +			} +			else { +				i++; +			} -		while (i < size && line[i] != '\t' && line[i] != '\0' -		       && line[i] < 0x80) { -			i++;  			tab++;  		} @@ -136,14 +160,8 @@ void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)  			i += 1;  			tab += numspaces;  		} else { -			int charlen = utf8proc_valid(line + i, size - i); - -			if (charlen >= 0) { -				cmark_strbuf_put(ob, line + i, charlen); -			} else { -				encode_unknown(ob); -				charlen = -charlen; -			} +			// Invalid UTF-8 +			encode_unknown(ob);  			i += charlen;  			tab += 1; | 
