From 8ac509f8bf0fe9f9f0b277cb612f9deb5bd072a5 Mon Sep 17 00:00:00 2001
From: Nick Wellnhofer <wellnhofer@aevum.de>
Date: Tue, 9 Jun 2015 17:59:37 +0200
Subject: Optimize utf8proc_detab

Handle valid UTF-8 chars inside the main loop and avoid a call to
strbuf_put for every UTF-8 char.

Results in an 8% speedup in the UTF-8-heavy "make bench" on my system.
---
 src/utf8.c | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

(limited to 'src/utf8.c')

diff --git a/src/utf8.c b/src/utf8.c
index ba1d873..248a199 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -62,14 +62,6 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
 		return length;
 
 	switch (length) {
-	case 1:
-		if (str[0] == 0x00) {
-			// ASCII NUL is technically valid but rejected
-			// for security reasons.
-			return -length;
-		}
-		break;
-
 	case 2:
 		if (str[0] < 0xC2) {
 			// Overlong
@@ -117,10 +109,27 @@ void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
 
 	while (i < size) {
 		bufsize_t org = i;
+		int charlen = 0;
+
+		while (i < size && line[i] != '\t') {
+			if (line[i] >= 0x80) {
+				charlen = utf8proc_valid(line + i, size - i);
+				if (charlen < 0) {
+					charlen = -charlen;
+					break;
+				}
+				i += charlen;
+			}
+			else if (line[i] == '\0') {
+				// ASCII NUL is technically valid but rejected
+				// for security reasons.
+				charlen = 1;
+				break;
+			}
+			else {
+				i++;
+			}
 
-		while (i < size && line[i] != '\t' && line[i] != '\0'
-		       && line[i] < 0x80) {
-			i++;
 			tab++;
 		}
 
@@ -136,14 +145,8 @@ void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
 			i += 1;
 			tab += numspaces;
 		} else {
-			int charlen = utf8proc_valid(line + i, size - i);
-
-			if (charlen >= 0) {
-				cmark_strbuf_put(ob, line + i, charlen);
-			} else {
-				encode_unknown(ob);
-				charlen = -charlen;
-			}
+			// Invalid UTF-8
+			encode_unknown(ob);
 
 			i += charlen;
 			tab += 1;
-- 
cgit v1.2.3