From 8ac509f8bf0fe9f9f0b277cb612f9deb5bd072a5 Mon Sep 17 00:00:00 2001
From: Nick Wellnhofer <wellnhofer@aevum.de>
Date: Tue, 9 Jun 2015 17:59:37 +0200
Subject: Optimize utf8proc_detab

Handle valid UTF-8 chars inside the main loop and avoid a call to
strbuf_put for every UTF-8 char.

Results in an 8% speedup in the UTF-8-heavy "make bench" on my system.
---
 src/utf8.c | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

(limited to 'src/utf8.c')

diff --git a/src/utf8.c b/src/utf8.c
index ba1d873..248a199 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -62,14 +62,6 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
 		return length;
 
 	switch (length) {
-	case 1:
-		if (str[0] == 0x00) {
-			// ASCII NUL is technically valid but rejected
-			// for security reasons.
-			return -length;
-		}
-		break;
-
 	case 2:
 		if (str[0] < 0xC2) {
 			// Overlong
@@ -117,10 +109,27 @@ void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
 
 	while (i < size) {
 		bufsize_t org = i;
+		int charlen = 0;
+
+		while (i < size && line[i] != '\t') {
+			if (line[i] >= 0x80) {
+				charlen = utf8proc_valid(line + i, size - i);
+				if (charlen < 0) {
+					charlen = -charlen;
+					break;
+				}
+				i += charlen;
+			}
+			else if (line[i] == '\0') {
+				// ASCII NUL is technically valid but rejected
+				// for security reasons.
+				charlen = 1;
+				break;
+			}
+			else {
+				i++;
+			}
 
-		while (i < size && line[i] != '\t' && line[i] != '\0'
-		       && line[i] < 0x80) {
-			i++;
 			tab++;
 		}
 
@@ -136,14 +145,8 @@ void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
 			i += 1;
 			tab += numspaces;
 		} else {
-			int charlen = utf8proc_valid(line + i, size - i);
-
-			if (charlen >= 0) {
-				cmark_strbuf_put(ob, line + i, charlen);
-			} else {
-				encode_unknown(ob);
-				charlen = -charlen;
-			}
+			// Invalid UTF-8
+			encode_unknown(ob);
 
 			i += charlen;
 			tab += 1;
-- 
cgit v1.2.3


From 8d997c85ee1452480ed3d821ce0642f7e6e5b9e6 Mon Sep 17 00:00:00 2001
From: Nick Wellnhofer <wellnhofer@aevum.de>
Date: Tue, 9 Jun 2015 18:26:04 +0200
Subject: Roll utf8proc_charlen into utf8proc_valid

Speeds up "make bench" by another percent.
---
 src/utf8.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

(limited to 'src/utf8.c')

diff --git a/src/utf8.c b/src/utf8.c
index 248a199..a4449dd 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -56,13 +56,18 @@ static int utf8proc_charlen(const uint8_t *str, bufsize_t str_len)
 // Validate a single UTF-8 character according to RFC 3629.
 static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
 {
-	int length = utf8proc_charlen(str, str_len);
+	int length = utf8proc_utf8class[str[0]];
 
-	if (length <= 0)
-		return length;
+	if (!length)
+		return -1;
+
+	if ((bufsize_t)length > str_len)
+		return -str_len;
 
 	switch (length) {
 	case 2:
+		if ((str[1] & 0xC0) != 0x80)
+			return -1;
 		if (str[0] < 0xC2) {
 			// Overlong
 			return -length;
@@ -70,6 +75,10 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
 		break;
 
 	case 3:
+		if ((str[1] & 0xC0) != 0x80)
+			return -1;
+		if ((str[2] & 0xC0) != 0x80)
+			return -2;
 		if (str[0] == 0xE0) {
 			if (str[1] < 0xA0) {
 				// Overlong
@@ -84,6 +93,12 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
 		break;
 
 	case 4:
+		if ((str[1] & 0xC0) != 0x80)
+			return -1;
+		if ((str[2] & 0xC0) != 0x80)
+			return -2;
+		if ((str[3] & 0xC0) != 0x80)
+			return -3;
 		if (str[0] == 0xF0) {
 			if (str[1] < 0x90) {
 				// Overlong
-- 
cgit v1.2.3