summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author John MacFarlane <jgm@berkeley.edu> 2015-06-16 22:21:55 -0700
committer John MacFarlane <jgm@berkeley.edu> 2015-06-16 22:22:56 -0700
commit 04726a7089e44e7ff4e6c552524841579a1053da (patch)
tree bc1657e17edb1d264a0cfe8adfe3c378b7baa8ae
parent fb7af2f0d6ca845b33364c6ce9a704a458e31ff9 (diff)
Added `CMARK_OPT_VALIDATE_UTF8` option.
Also added the command-line option `--validate-utf8`. This option causes cmark to check the input for valid UTF-8, replacing invalid sequences with the replacement character, U+FFFD. Reinstated the API tests for UTF-8.
-rw-r--r-- api_test/main.c 6
-rw-r--r-- man/man1/cmark.1 3
-rw-r--r-- man/man3/cmark.3 23
-rw-r--r-- src/blocks.c 6
-rw-r--r-- src/cmark.h 5
-rw-r--r-- src/main.c 2
6 files changed, 36 insertions, 9 deletions
diff --git a/api_test/main.c b/api_test/main.c
index 871b3a5..666af4d 100644
--- a/api_test/main.c
+++ b/api_test/main.c
@@ -658,7 +658,7 @@ test_continuation_byte(test_batch_runner *runner, const char *utf8)
strcat(expected, "))))</p>\n");
char *html = cmark_markdown_to_html(buf, strlen(buf),
- CMARK_OPT_DEFAULT);
+ CMARK_OPT_VALIDATE_UTF8);
STR_EQ(runner, html, expected,
"invalid utf8 continuation byte %d/%d", pos, len);
free(html);
@@ -718,7 +718,7 @@ test_md_to_html(test_batch_runner *runner, const char *markdown,
const char *expected_html, const char *msg)
{
char *html = cmark_markdown_to_html(markdown, strlen(markdown),
- CMARK_OPT_DEFAULT);
+ CMARK_OPT_VALIDATE_UTF8);
STR_EQ(runner, html, expected_html, msg);
free(html);
}
@@ -737,7 +737,7 @@ int main() {
hierarchy(runner);
parser(runner);
render_html(runner);
- // utf8(runner);
+ utf8(runner);
line_endings(runner);
numeric_entities(runner);
test_cplusplus(runner);
diff --git a/man/man1/cmark.1 b/man/man1/cmark.1
index 2e08b38..8c1c2c7 100644
--- a/man/man1/cmark.1
+++ b/man/man1/cmark.1
@@ -35,6 +35,9 @@ hard wrapping is disabled, regardless of the value given with \-\-width.
.B \-\-normalize
Consolidate adjacent text nodes.
.TP 12n
+.B \-\-validate-utf8
+Validate UTF-8, replacing illegal sequences with U+FFFD.
+.TP 12n
.B \-\-smart
Use smart punctuation. Straight double and single quotes will
be rendered as curly quotes, depending on their position.
diff --git a/man/man3/cmark.3 b/man/man3/cmark.3
index 82c34cd..aa254b0 100644
--- a/man/man3/cmark.3
+++ b/man/man3/cmark.3
@@ -1,4 +1,4 @@
-.TH cmark 3 "June 07, 2015" "LOCAL" "Library Functions Manual"
+.TH cmark 3 "June 16, 2015" "LOCAL" "Library Functions Manual"
.SH
NAME
.PP
@@ -403,10 +403,10 @@ Streaming interface:
cmark_parser *parser = cmark_parser_new(CMARK_OPT_DEFAULT);
FILE *fp = fopen("myfile.md", "r");
while ((bytes = fread(buffer, 1, sizeof(buffer), fp)) > 0) {
- cmark_parser_feed(parser, buffer, bytes);
- if (bytes < sizeof(buffer)) {
- break;
- }
+ cmark_parser_feed(parser, buffer, bytes);
+ if (bytes < sizeof(buffer)) {
+ break;
+ }
}
document = cmark_parser_finish(parser);
cmark_parser_free(parser);
@@ -539,6 +539,19 @@ Normalize tree by consolidating adjacent text nodes.
.PP
Convert straight quotes to curly, \-\-\- to em dashes, \-\- to en dashes.
+.PP
+.nf
+\fC
+.RS 0n
+#define CMARK_OPT_VALIDATE_UTF8 16
+.RE
+\f[]
+.fi
+
+.PP
+Validate UTF\-8 in the input before parsing, replacing illegal
+sequences with the replacement character U+FFFD.
+
.SS
Version information
diff --git a/src/blocks.c b/src/blocks.c
index 95a87c3..17288df 100644
--- a/src/blocks.c
+++ b/src/blocks.c
@@ -631,7 +631,11 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
cmark_chunk input;
bool maybe_lazy;
- cmark_strbuf_put(parser->curline, buffer, bytes);
+ if (parser->options & CMARK_OPT_VALIDATE_UTF8) {
+ utf8proc_check(parser->curline, buffer, bytes);
+ } else {
+ cmark_strbuf_put(parser->curline, buffer, bytes);
+ }
parser->offset = 0;
parser->column = 0;
parser->blank = false;
diff --git a/src/cmark.h b/src/cmark.h
index d86e13e..4ff1ca0 100644
--- a/src/cmark.h
+++ b/src/cmark.h
@@ -506,6 +506,11 @@ char *cmark_render_commonmark(cmark_node *root, int options, int width);
*/
#define CMARK_OPT_SMART 8
+/** Validate UTF-8 in the input before parsing, replacing illegal
+ * sequences with the replacement character U+FFFD.
+ */
+#define CMARK_OPT_VALIDATE_UTF8 16
+
/**
* ## Version information
*/
diff --git a/src/main.c b/src/main.c
index c23071f..5dc97b2 100644
--- a/src/main.c
+++ b/src/main.c
@@ -92,6 +92,8 @@ int main(int argc, char *argv[])
options |= CMARK_OPT_SMART;
} else if (strcmp(argv[i], "--normalize") == 0) {
options |= CMARK_OPT_NORMALIZE;
+ } else if (strcmp(argv[i], "--validate-utf8") == 0) {
+ options |= CMARK_OPT_VALIDATE_UTF8;
} else if ((strcmp(argv[i], "--help") == 0) ||
(strcmp(argv[i], "-h") == 0)) {
print_usage();