Added `CMARK_OPT_VALIDATE_UTF8` option.

Also added the command-line option `--validate-utf8`. This option causes cmark to check for valid UTF-8, replacing invalid sequences with the replacement character, U+FFFD. Reinstated the API tests for UTF-8.
author: John MacFarlane <jgm@berkeley.edu> 2015-06-16 22:21:55 -0700
committer: John MacFarlane <jgm@berkeley.edu> 2015-06-16 22:22:56 -0700
commit: 04726a7089e44e7ff4e6c552524841579a1053da (patch)
tree: bc1657e17edb1d264a0cfe8adfe3c378b7baa8ae
parent: fb7af2f0d6ca845b33364c6ce9a704a458e31ff9 (diff)
6 files changed, 36 insertions, 9 deletions
diff --git a/api_test/main.c b/api_test/main.c
index 871b3a5..666af4d 100644
--- a/api_test/main.c
+++ b/api_test/main.c
@@ -658,7 +658,7 @@ test_continuation_byte(test_batch_runner *runner, const char *utf8)
 		strcat(expected, "))))</p>\n");
 
 		char *html = cmark_markdown_to_html(buf, strlen(buf),
-						    CMARK_OPT_DEFAULT);
+						    CMARK_OPT_VALIDATE_UTF8);
 		STR_EQ(runner, html, expected,
 		       "invalid utf8 continuation byte %d/%d", pos, len);
 		free(html);
@@ -718,7 +718,7 @@ test_md_to_html(test_batch_runner *runner, const char *markdown,
 		const char *expected_html, const char *msg)
 {
 	char *html = cmark_markdown_to_html(markdown, strlen(markdown),
-					    CMARK_OPT_DEFAULT);
+					    CMARK_OPT_VALIDATE_UTF8);
 	STR_EQ(runner, html, expected_html, msg);
 	free(html);
 }
@@ -737,7 +737,7 @@ int main() {
 	hierarchy(runner);
 	parser(runner);
 	render_html(runner);
-	// utf8(runner);
+	utf8(runner);
 	line_endings(runner);
 	numeric_entities(runner);
 	test_cplusplus(runner);
diff --git a/man/man1/cmark.1 b/man/man1/cmark.1
index 2e08b38..8c1c2c7 100644
--- a/man/man1/cmark.1
+++ b/man/man1/cmark.1
@@ -35,6 +35,9 @@ hard wrapping is disabled, regardless of the value given with \-\-width.
 .B \-\-normalize
 Consolidate adjacent text nodes.
 .TP 12n
+.B \-\-validate-utf8
+Validate UTF-8, replacing illegal sequences with U+FFFD.
+.TP 12n
 .B \-\-smart
 Use smart punctuation.  Straight double and single quotes will
 be rendered as curly quotes, depending on their position.
diff --git a/man/man3/cmark.3 b/man/man3/cmark.3
index 82c34cd..aa254b0 100644
--- a/man/man3/cmark.3
+++ b/man/man3/cmark.3
@@ -1,4 +1,4 @@
-.TH cmark 3 "June 07, 2015" "LOCAL" "Library Functions Manual"
+.TH cmark 3 "June 16, 2015" "LOCAL" "Library Functions Manual"
 .SH
 NAME
 .PP
@@ -403,10 +403,10 @@ Streaming interface:
 cmark_parser *parser = cmark_parser_new(CMARK_OPT_DEFAULT);
 FILE *fp = fopen("myfile.md", "r");
 while ((bytes = fread(buffer, 1, sizeof(buffer), fp)) > 0) {
-       cmark_parser_feed(parser, buffer, bytes);
-       if (bytes < sizeof(buffer)) {
-           break;
-       }
+	   cmark_parser_feed(parser, buffer, bytes);
+	   if (bytes < sizeof(buffer)) {
+	       break;
+	   }
 }
 document = cmark_parser_finish(parser);
 cmark_parser_free(parser);
@@ -539,6 +539,19 @@ Normalize tree by consolidating adjacent text nodes.
 .PP
 Convert straight quotes to curly, \-\-\- to em dashes, \-\- to en dashes.
 
+.PP
+.nf
+\fC
+.RS 0n
+#define CMARK_OPT_VALIDATE_UTF8 16
+.RE
+\f[]
+.fi
+
+.PP
+Validate UTF\-8 in the input before parsing, replacing illegal
+sequences with the replacement character U+FFFD.
+
 .SS
 Version information
 
diff --git a/src/blocks.c b/src/blocks.c
index 95a87c3..17288df 100644
--- a/src/blocks.c
+++ b/src/blocks.c
@@ -631,7 +631,11 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
 	cmark_chunk input;
 	bool maybe_lazy;
 
-	cmark_strbuf_put(parser->curline, buffer, bytes);
+	if (parser->options & CMARK_OPT_VALIDATE_UTF8) {
+		utf8proc_check(parser->curline, buffer, bytes);
+	} else {
+		cmark_strbuf_put(parser->curline, buffer, bytes);
+	}
 	parser->offset = 0;
 	parser->column = 0;
 	parser->blank = false;
diff --git a/src/cmark.h b/src/cmark.h
index d86e13e..4ff1ca0 100644
--- a/src/cmark.h
+++ b/src/cmark.h
@@ -506,6 +506,11 @@ char *cmark_render_commonmark(cmark_node *root, int options, int width);
  */
 #define CMARK_OPT_SMART 8
 
+/** Validate UTF-8 in the input before parsing, replacing illegal
+ * sequences with the replacement character U+FFFD.
+ */
+#define CMARK_OPT_VALIDATE_UTF8 16
+
 /**
  * ## Version information
  */
diff --git a/src/main.c b/src/main.c
index c23071f..5dc97b2 100644
--- a/src/main.c
+++ b/src/main.c
@@ -92,6 +92,8 @@ int main(int argc, char *argv[])
 			options |= CMARK_OPT_SMART;
 		} else if (strcmp(argv[i], "--normalize") == 0) {
 			options |= CMARK_OPT_NORMALIZE;
+		} else if (strcmp(argv[i], "--validate-utf8") == 0) {
+			options |= CMARK_OPT_VALIDATE_UTF8;
 		} else if ((strcmp(argv[i], "--help") == 0) ||
 		           (strcmp(argv[i], "-h") == 0)) {
 			print_usage();
author	John MacFarlane <jgm@berkeley.edu>	2015-06-16 22:21:55 -0700
committer	John MacFarlane <jgm@berkeley.edu>	2015-06-16 22:22:56 -0700
commit	04726a7089e44e7ff4e6c552524841579a1053da (patch)
tree	bc1657e17edb1d264a0cfe8adfe3c378b7baa8ae
parent	fb7af2f0d6ca845b33364c6ce9a704a458e31ff9 (diff)