From 04726a7089e44e7ff4e6c552524841579a1053da Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Tue, 16 Jun 2015 22:21:55 -0700 Subject: Added `CMARK_OPT_VALIDATE_UTF8` option. Also command line option `--validate-utf8`. This option causes cmark to check for valid UTF-8, replacing invalid sequences with the replacement character, U+FFFD. Reinstated api tests for utf8. --- api_test/main.c | 6 +++--- man/man1/cmark.1 | 3 +++ man/man3/cmark.3 | 23 ++++++++++++++++++----- src/blocks.c | 6 +++++- src/cmark.h | 5 +++++ src/main.c | 2 ++ 6 files changed, 36 insertions(+), 9 deletions(-) diff --git a/api_test/main.c b/api_test/main.c index 871b3a5..666af4d 100644 --- a/api_test/main.c +++ b/api_test/main.c @@ -658,7 +658,7 @@ test_continuation_byte(test_batch_runner *runner, const char *utf8) strcat(expected, "))))</p>\n"); char *html = cmark_markdown_to_html(buf, strlen(buf), - CMARK_OPT_DEFAULT); + CMARK_OPT_VALIDATE_UTF8); STR_EQ(runner, html, expected, "invalid utf8 continuation byte %d/%d", pos, len); free(html); @@ -718,7 +718,7 @@ test_md_to_html(test_batch_runner *runner, const char *markdown, const char *expected_html, const char *msg) { char *html = cmark_markdown_to_html(markdown, strlen(markdown), - CMARK_OPT_DEFAULT); + CMARK_OPT_VALIDATE_UTF8); STR_EQ(runner, html, expected_html, msg); free(html); } @@ -737,7 +737,7 @@ int main() { hierarchy(runner); parser(runner); render_html(runner); - // utf8(runner); + utf8(runner); line_endings(runner); numeric_entities(runner); test_cplusplus(runner); diff --git a/man/man1/cmark.1 b/man/man1/cmark.1 index 2e08b38..8c1c2c7 100644 --- a/man/man1/cmark.1 +++ b/man/man1/cmark.1 @@ -35,6 +35,9 @@ hard wrapping is disabled, regardless of the value given with \-\-width. .B \-\-normalize Consolidate adjacent text nodes. .TP 12n +.B \-\-validate-utf8 +Validate UTF-8, replacing illegal sequences with U+FFFD. +.TP 12n .B \-\-smart Use smart punctuation. Straight double and single quotes will be rendered as curly quotes, depending on their position. diff --git a/man/man3/cmark.3 b/man/man3/cmark.3 index 82c34cd..aa254b0 100644 --- a/man/man3/cmark.3 +++ b/man/man3/cmark.3 @@ -1,4 +1,4 @@ -.TH cmark 3 "June 07, 2015" "LOCAL" "Library Functions Manual" +.TH cmark 3 "June 16, 2015" "LOCAL" "Library Functions Manual" .SH NAME .PP @@ -403,10 +403,10 @@ Streaming interface: cmark_parser *parser = cmark_parser_new(CMARK_OPT_DEFAULT); FILE *fp = fopen("myfile.md", "r"); while ((bytes = fread(buffer, 1, sizeof(buffer), fp)) > 0) { - cmark_parser_feed(parser, buffer, bytes); - if (bytes < sizeof(buffer)) { - break; - } + cmark_parser_feed(parser, buffer, bytes); + if (bytes < sizeof(buffer)) { + break; + } } document = cmark_parser_finish(parser); cmark_parser_free(parser); @@ -539,6 +539,19 @@ Normalize tree by consolidating adjacent text nodes. 
.PP Convert straight quotes to curly, \-\-\- to em dashes, \-\- to en dashes. +.PP +.nf +\fC +.RS 0n +#define CMARK_OPT_VALIDATE_UTF8 16 +.RE +\f[] +.fi + +.PP +Validate UTF\-8 in the input before parsing, replacing illegal +sequences with the replacement character U+FFFD. + .SS Version information diff --git a/src/blocks.c b/src/blocks.c index 95a87c3..17288df 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -631,7 +631,11 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte cmark_chunk input; bool maybe_lazy; - cmark_strbuf_put(parser->curline, buffer, bytes); + if (parser->options & CMARK_OPT_VALIDATE_UTF8) { + utf8proc_check(parser->curline, buffer, bytes); + } else { + cmark_strbuf_put(parser->curline, buffer, bytes); + } parser->offset = 0; parser->column = 0; parser->blank = false; diff --git a/src/cmark.h b/src/cmark.h index d86e13e..4ff1ca0 100644 --- a/src/cmark.h +++ b/src/cmark.h @@ -506,6 +506,11 @@ char *cmark_render_commonmark(cmark_node *root, int options, int width); */ #define CMARK_OPT_SMART 8 +/** Validate UTF-8 in the input before parsing, replacing illegal + * sequences with the replacement character U+FFFD. + */ +#define CMARK_OPT_VALIDATE_UTF8 16 + /** * ## Version information */ diff --git a/src/main.c b/src/main.c index c23071f..5dc97b2 100644 --- a/src/main.c +++ b/src/main.c @@ -92,6 +92,8 @@ int main(int argc, char *argv[]) options |= CMARK_OPT_SMART; } else if (strcmp(argv[i], "--normalize") == 0) { options |= CMARK_OPT_NORMALIZE; + } else if (strcmp(argv[i], "--validate-utf8") == 0) { + options |= CMARK_OPT_VALIDATE_UTF8; } else if ((strcmp(argv[i], "--help") == 0) || (strcmp(argv[i], "-h") == 0)) { print_usage(); -- cgit v1.2.3