UTF8-aware detabbing and entity handling

author: Vicent Marti <tanoku@gmail.com> 2014-09-06 20:48:05 +0200
committer: Vicent Marti <tanoku@gmail.com> 2014-09-09 03:39:16 +0200
commit: 61e3e606e64221eaa5cf3d83dc598d5a42818d10 (patch)
tree: 1dfb6309c0e0fd7de8094c9da6497992b156350c
parent: 278b89d092cae8fe9cdd6346c69512886d36abbd (diff)
8 files changed, 95 insertions, 86 deletions
diff --git a/Makefile b/Makefile
index 0d2eb8b..b5e487d 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
-CFLAGS=-g -O3 -Wall -Wextra -std=c99 -Isrc $(OPTFLAGS)
-LDFLAGS=-g -O3 -Wall -Werror
+CFLAGS=-g -pg -O3 -Wall -Wextra -std=c99 -Isrc $(OPTFLAGS)
+LDFLAGS=-g -pg -O3 -Wall -Werror
 SRCDIR=src
 DATADIR=data
 
@@ -41,11 +41,11 @@ testjs: spec.txt
 benchjs:
 	node js/bench.js ${BENCHINP}
 
-HTML_OBJ=$(SRCDIR)/html/html.o $(SRCDIR)/html/houdini_href_e.o $(SRCDIR)/html/houdini_html_e.o
+HTML_OBJ=$(SRCDIR)/html/html.o $(SRCDIR)/html/houdini_href_e.o $(SRCDIR)/html/houdini_html_e.o $(SRCDIR)/html/houdini_html_u.o
 STMD_OBJ=$(SRCDIR)/inlines.o $(SRCDIR)/buffer.o $(SRCDIR)/blocks.o $(SRCDIR)/scanners.c $(SRCDIR)/print.o $(SRCDIR)/utf8.o
 
-$(PROG): $(SRCDIR)/main.c $(HTML_OBJ) $(STMD_OBJ)
-	$(CC) $(LDFLAGS) -o $@ $^
+$(PROG): $(SRCDIR)/html/html_unescape.h $(SRCDIR)/case_fold_switch.inc $(HTML_OBJ) $(STMD_OBJ) $(SRCDIR)/main.c
+	$(CC) $(LDFLAGS) -o $@ $(HTML_OBJ) $(STMD_OBJ) $(SRCDIR)/main.c
 
 $(SRCDIR)/scanners.c: $(SRCDIR)/scanners.re
 	re2c --case-insensitive -bis $< > $@ || (rm $@ && false)
@@ -53,6 +53,9 @@ $(SRCDIR)/scanners.c: $(SRCDIR)/scanners.re
 $(SRCDIR)/case_fold_switch.inc: $(DATADIR)/CaseFolding-3.2.0.txt
 	perl mkcasefold.pl < $< > $@
 
+$(SRCDIR)/html/html_unescape.h: $(SRCDIR)/html/html_unescape.gperf
+	gperf -I -t -N find_entity -H hash_entity -K entity -C -l --null-strings -m5 $< > $@
+
 .PHONY: leakcheck clean fuzztest dingus upload
 
 dingus:
diff --git a/src/blocks.c b/src/blocks.c
index f671b5e..8c7d49c 100644
--- a/src/blocks.c
+++ b/src/blocks.c
@@ -5,6 +5,8 @@
 #include <ctype.h>
 
 #include "stmd.h"
+#include "utf8.h"
+#include "html/houdini.h"
 #include "scanners.h"
 #include "uthash.h"
 
@@ -184,7 +186,7 @@ static void finalize(node_block* b, int line_number)
 			firstlinelen = strbuf_strchr(&b->string_content, '\n', 0);
 
 			strbuf_init(&b->attributes.fenced_code_data.info, 0);
-			strbuf_set(
+			houdini_unescape_html_f(
 				&b->attributes.fenced_code_data.info,
 				b->string_content.ptr,
 				firstlinelen
@@ -369,31 +371,6 @@ static int lists_match(struct ListData list_data,
 			list_data.bullet_char == item_data.bullet_char);
 }
 
-static void expand_tabs(strbuf *ob, const unsigned char *line, size_t size)
-{
-	size_t  i = 0, tab = 0;
-
-	while (i < size) {
-		size_t org = i;
-
-		while (i < size && line[i] != '\t') {
-			i++; tab++;
-		}
-
-		if (i > org)
-			strbuf_put(ob, line + org, i - org);
-
-		if (i >= size)
-			break;
-
-		do {
-			strbuf_putc(ob, ' '); tab++;
-		} while (tab % 4);
-
-		i++;
-	}
-}
-
 static node_block *finalize_document(node_block *document, int linenum)
 {
 	while (document != document->top) {
@@ -415,7 +392,7 @@ extern node_block *stmd_parse_file(FILE *f)
 	node_block *document = make_document();
 
 	while (fgets((char *)buffer, sizeof(buffer), f)) {
-		expand_tabs(&line, buffer, strlen((char *)buffer));
+		utf8proc_detab(&line, buffer, strlen((char *)buffer));
 		incorporate_line(&line, linenum, &document);
 		strbuf_clear(&line);
 		linenum++;
@@ -436,10 +413,10 @@ extern node_block *stmd_parse_document(const unsigned char *buffer, size_t len)
 		const unsigned char *eol = memchr(buffer, '\n', end - buffer);
 
 		if (!eol) {
-			expand_tabs(&line, buffer, end - buffer);
+			utf8proc_detab(&line, buffer, end - buffer);
 			buffer = end;
 		} else {
-			expand_tabs(&line, buffer, (eol - buffer) + 1);
+			utf8proc_detab(&line, buffer, (eol - buffer) + 1);
 			buffer += (eol - buffer) + 1;
 		}
 
diff --git a/src/html/houdini.h b/src/html/houdini.h
index 1e54d20..5fd690d 100644
--- a/src/html/houdini.h
+++ b/src/html/houdini.h
@@ -25,9 +25,11 @@ extern "C" {
 #define HOUDINI_ESCAPED_SIZE(x) (((x) * 12) / 10)
 #define HOUDINI_UNESCAPED_SIZE(x) (x)
 
+extern size_t houdini_unescape_ent(strbuf *ob, const uint8_t *src, size_t size);
 extern int houdini_escape_html(strbuf *ob, const uint8_t *src, size_t size);
 extern int houdini_escape_html0(strbuf *ob, const uint8_t *src, size_t size, int secure);
 extern int houdini_unescape_html(strbuf *ob, const uint8_t *src, size_t size);
+extern void houdini_unescape_html_f(strbuf *ob, const uint8_t *src, size_t size);
 extern int houdini_escape_xml(strbuf *ob, const uint8_t *src, size_t size);
 extern int houdini_escape_uri(strbuf *ob, const uint8_t *src, size_t size);
 extern int houdini_escape_url(strbuf *ob, const uint8_t *src, size_t size);
diff --git a/src/html/html.c b/src/html/html.c
index 758ec80..595dfcd 100644
--- a/src/html/html.c
+++ b/src/html/html.c
@@ -166,7 +166,6 @@ void inlines_to_html(strbuf *html, node_inl* ils)
 				break;
 
 			case INL_RAW_HTML:
-			case INL_ENTITY:
 				strbuf_put(html,
 						ils->content.literal.data,
 						ils->content.literal.len);
diff --git a/src/inlines.c b/src/inlines.c
index 6b17027..7b27150 100644
--- a/src/inlines.c
+++ b/src/inlines.c
@@ -5,6 +5,8 @@
 #include <ctype.h>
 
 #include "stmd.h"
+#include "html/houdini.h"
+#include "utf8.h"
 #include "uthash.h"
 #include "scanners.h"
 
@@ -176,7 +178,6 @@ inline static node_inl* make_simple(int t)
 #define make_str(s) make_literal(INL_STRING, s)
 #define make_code(s) make_literal(INL_CODE, s)
 #define make_raw_html(s) make_literal(INL_RAW_HTML, s)
-#define make_entity(s) make_literal(INL_ENTITY, s)
 #define make_linebreak() make_simple(INL_LINEBREAK)
 #define make_softbreak() make_simple(INL_SOFTBREAK)
 #define make_emph(contents) make_inlines(INL_EMPH, contents)
@@ -191,7 +192,6 @@ extern void free_inlines(node_inl* e)
 			case INL_STRING:
 			case INL_RAW_HTML:
 			case INL_CODE:
-			case INL_ENTITY:
 				chunk_free(&e->content.literal);
 				break;
 			case INL_LINEBREAK:
@@ -540,45 +540,34 @@ static node_inl* handle_backslash(subject *subj)
 // Assumes the subject has an '&' character at the current position.
 static node_inl* handle_entity(subject* subj)
 {
-	int match;
-	node_inl *result;
-	match = scan_entity(&subj->input, subj->pos);
-	if (match) {
-		result = make_entity(chunk_dup(&subj->input, subj->pos, match));
-		subj->pos += match;
-	} else {
-		advance(subj);
-		result = make_str(chunk_literal("&"));
-	}
-	return result;
+	strbuf ent = GH_BUF_INIT;
+	size_t len;
+	/* Step over the '&'; only the text after it is handed to houdini_unescape_ent. */
+	advance(subj);
+
+	len = houdini_unescape_ent(&ent,
+		subj->input.data + subj->pos,
+		subj->input.len - subj->pos
+	);
+	/* A zero length is treated as "no entity here": emit the '&' as a literal string. */
+	if (len == 0)
+		return make_str(chunk_literal("&"));
+	/* Otherwise skip len input bytes and wrap whatever was written to ent in a string node. */
+	subj->pos += len;
+	return make_str(chunk_buf_detach(&ent));
 }
 
 // Like make_str, but parses entities.
 // Returns an inline sequence consisting of str and entity elements.
 static node_inl *make_str_with_entities(chunk *content)
 {
-	node_inl *result = NULL;
-	node_inl *new;
-	int searchpos;
-	char c;
-	subject subj;
-
-	subject_from_chunk(&subj, content, NULL);
+	strbuf unescaped = GH_BUF_INIT; /* receives the unescaped copy, if one is produced */
 
-	while ((c = peek_char(&subj))) {
-		switch (c) {
-			case '&':
-				new = handle_entity(&subj);
-				break;
-			default:
-				searchpos = chunk_strchr(&subj.input, '&', subj.pos);
-				new = make_str(chunk_dup(&subj.input, subj.pos, searchpos - subj.pos));
-				subj.pos = searchpos;
-		}
-		result = append_inlines(result, new);
+	if (houdini_unescape_html(&unescaped, content->data, (size_t)content->len)) {
+		return make_str(chunk_buf_detach(&unescaped)); /* nonzero result: use the buffer's contents */
+	} else {
+		return make_str(*content); /* zero result: wrap the input chunk unchanged */
 	}
-
-	return result;
 }
 
 // Destructively unescape a string: remove backslashes before punctuation chars.
@@ -611,9 +600,9 @@ static unsigned char *clean_url(chunk *url, int is_email)
 		strbuf_puts(&buf, "mailto:");
 
 	if (url->data[0] == '<' && url->data[url->len - 1] == '>') {
-		strbuf_put(&buf, url->data + 1, url->len - 2);
+		houdini_unescape_html_f(&buf, url->data + 1, url->len - 2);
 	} else {
-		strbuf_put(&buf, url->data, url->len);
+		houdini_unescape_html_f(&buf, url->data, url->len);
 	}
 
 	unescape_buffer(&buf);
@@ -636,9 +625,9 @@ static unsigned char *clean_title(chunk *title)
 	if ((first == '\'' && last == '\'') ||
 		(first == '(' && last == ')') ||
 		(first == '"' && last == '"')) {
-		strbuf_set(&buf, title->data + 1, title->len - 2);
+		houdini_unescape_html_f(&buf, title->data + 1, title->len - 2);
 	} else {
-		strbuf_set(&buf, title->data, title->len);
+		houdini_unescape_html_f(&buf, title->data, title->len);
 	}
 
 	unescape_buffer(&buf);
diff --git a/src/print.c b/src/print.c
index 0ff86fa..9240dac 100644
--- a/src/print.c
+++ b/src/print.c
@@ -145,11 +145,6 @@ extern void print_inlines(node_inl* ils, int indent)
 			print_str(ils->content.literal.data, ils->content.literal.len);
 			putchar('\n');
 			break;
-		case INL_ENTITY:
-			printf("entity ");
-			print_str(ils->content.literal.data, ils->content.literal.len);
-			putchar('\n');
-			break;
 		case INL_LINK:
 		case INL_IMAGE:
 			printf("%s url=", ils->tag == INL_LINK ? "link" : "image");
diff --git a/src/stmd.h b/src/stmd.h
index be65371..c80eeda 100644
--- a/src/stmd.h
+++ b/src/stmd.h
@@ -17,7 +17,6 @@ struct node_inl {
 		INL_LINEBREAK,
 		INL_CODE,
 		INL_RAW_HTML,
-		INL_ENTITY,
 		INL_EMPH,
 		INL_STRONG,
 		INL_LINK,
@@ -133,6 +132,4 @@ void print_blocks(node_block* blk, int indent);
 void blocks_to_html(strbuf *html, node_block *b, bool tight);
 void inlines_to_html(strbuf *html, node_inl *b);
 
-void utf8proc_case_fold(strbuf *dest, const unsigned char *str, int len);
-
 #endif
diff --git a/src/utf8.c b/src/utf8.c
index cebd872..12d7ba5 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -3,7 +3,7 @@
 #include <unistd.h>
 #include <assert.h>
 
-#include "stmd.h"
+#include "utf8.h"
 
 static const int8_t utf8proc_utf8class[256] = {
 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -23,6 +23,12 @@ static const int8_t utf8proc_utf8class[256] = {
 	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 	4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 };
 
+static void encode_unknown(strbuf *buf)
+{
+	static const unsigned char replacement[] = {0xEF, 0xBF, 0xBD}; /* U+FFFD encoded as UTF-8 */
+	strbuf_put(buf, replacement, sizeof(replacement));
+}
+
 ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len)
 {
 	ssize_t length, i;
@@ -46,6 +52,46 @@ ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len)
 	return length;
 }
 
+void utf8proc_detab(strbuf *ob, const unsigned char *line, size_t size)
+{
+	static const unsigned char whitespace[] = "    ";
+	/* Append `line` to `ob`: tabs expand to the next 4-column stop, undecodable bytes become U+FFFD. */
+	size_t i = 0, tab = 0; /* i: input byte offset; tab: output column */
+
+	while (i < size) {
+		size_t org = i;
+		/* Copy a run of plain ASCII; bytes >= 0x80 (a stray 0x80 included) take the UTF-8 path below. */
+		while (i < size && line[i] != '\t' && line[i] < 0x80) {
+			i++; tab++;
+		}
+
+		if (i > org)
+			strbuf_put(ob, line + org, i - org);
+
+		if (i >= size)
+			break;
+
+		if (line[i] == '\t') {
+			int numspaces = 4 - (tab % 4);
+			strbuf_put(ob, whitespace, numspaces);
+			i += 1;
+			tab += numspaces;
+		} else {
+			ssize_t charlen = utf8proc_charlen(line + i, size - i);
+			/* On a negative length, emit U+FFFD and skip a single byte before retrying. */
+			if (charlen < 0) {
+				encode_unknown(ob);
+				i++;
+			} else {
+				strbuf_put(ob, line + i, charlen);
+				i += charlen;
+			}
+			/* Either way the column advances by one, regardless of the byte length. */
+			tab += 1;
+		}
+	}
+}
+
 ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst)
 {
 	ssize_t length;
@@ -89,9 +135,9 @@ void utf8proc_encode_char(int32_t uc, strbuf *buf)
 	unsigned char dst[4];
 	int len = 0;
 
-	if (uc < 0x00) {
-		assert(false);
-	} else if (uc < 0x80) {
+	assert(uc >= 0);
+
+	if (uc < 0x80) {
 		dst[0] = uc;
 		len = 1;
 	} else if (uc < 0x800) {
@@ -116,7 +162,8 @@ void utf8proc_encode_char(int32_t uc, strbuf *buf)
 		dst[3] = 0x80 + (uc & 0x3F);
 		len = 4;
 	} else {
-		assert(false);
+		encode_unknown(buf);
+		return;
 	}
 
 	strbuf_put(buf, dst, len);
@@ -133,7 +180,7 @@ void utf8proc_case_fold(strbuf *dest, const unsigned char *str, int len)
 		ssize_t char_len = utf8proc_iterate(str, len, &c);
 
 		if (char_len < 0) {
-			bufpush(0xFFFD);
+			encode_unknown(dest);
 			continue;
 		}
author	Vicent Marti <tanoku@gmail.com>	2014-09-06 20:48:05 +0200
committer	Vicent Marti <tanoku@gmail.com>	2014-09-09 03:39:16 +0200
commit	61e3e606e64221eaa5cf3d83dc598d5a42818d10 (patch)
tree	1dfb6309c0e0fd7de8094c9da6497992b156350c
parent	278b89d092cae8fe9cdd6346c69512886d36abbd (diff)