From 26537124a4070f7869db67317b90e08916050c8f Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Thu, 11 Jun 2015 16:43:34 -0700
Subject: Renamed utf8proc_detab to utf8proc_check, removed detabbing logic.

Now it just replaces bad UTF-8 sequences and NUL bytes.

This restores benchmarks to near their previous levels.
---
 src/utf8.c | 31 ++++++++++---------------------
 1 file changed, 10 insertions(+), 21 deletions(-)

(limited to 'src/utf8.c')

diff --git a/src/utf8.c b/src/utf8.c
index f572042..ffe6652 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -116,53 +116,42 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)
 	return length;
 }
 
-void utf8proc_detab(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
+void utf8proc_check(cmark_strbuf *ob, const uint8_t *line, bufsize_t size)
 {
-	static const uint8_t whitespace[] = "    ";
-
-	bufsize_t i = 0, tab = 0;
+	bufsize_t i = 0;
 
 	while (i < size) {
 		bufsize_t org = i;
 		int charlen = 0;
 
-		while (i < size && line[i] != '\t') {
-			if (line[i] >= 0x80) {
+		while (i < size) {
+			if (line[i] < 0x80 && line[i] != 0) {
+				i++;
+			} else if (line[i] >= 0x80) {
 				charlen = utf8proc_valid(line + i, size - i);
 				if (charlen < 0) {
 					charlen = -charlen;
 					break;
 				}
 				i += charlen;
-			} else if (line[i] == '\0') {
+			} else if (line[i] == 0) {
 				// ASCII NUL is technically valid but rejected
 				// for security reasons.
 				charlen = 1;
 				break;
-			} else {
-				i++;
 			}
-
-			tab++;
 		}
 
-		if (i > org)
+		if (i > org) {
 			cmark_strbuf_put(ob, line + org, i - org);
+		}
 
-		if (i >= size)
+		if (i >= size) {
 			break;
-
-		if (line[i] == '\t') {
-			int numspaces = 4 - (tab % 4);
-			cmark_strbuf_put(ob, whitespace, numspaces);
-			i += 1;
-			tab += numspaces;
 		} else {
 			// Invalid UTF-8
 			encode_unknown(ob);
-
 			i += charlen;
-			tab += 1;
 		}
 	}
 }
-- 
cgit v1.2.3