summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
author John MacFarlane <jgm@berkeley.edu> 2015-06-16 22:21:55 -0700
committer John MacFarlane <jgm@berkeley.edu> 2015-06-16 22:22:56 -0700
commit 04726a7089e44e7ff4e6c552524841579a1053da (patch)
tree bc1657e17edb1d264a0cfe8adfe3c378b7baa8ae /src
parent fb7af2f0d6ca845b33364c6ce9a704a458e31ff9 (diff)
Added `CMARK_OPT_VALIDATE_UTF8` option.
Also added the command-line option `--validate-utf8`. This option causes cmark to check for valid UTF-8, replacing invalid sequences with the replacement character, U+FFFD. Reinstated the API tests for UTF-8.
Diffstat (limited to 'src')
-rw-r--r-- src/blocks.c 6
-rw-r--r-- src/cmark.h 5
-rw-r--r-- src/main.c 2
3 files changed, 12 insertions, 1 deletion
diff --git a/src/blocks.c b/src/blocks.c
index 95a87c3..17288df 100644
--- a/src/blocks.c
+++ b/src/blocks.c
@@ -631,7 +631,11 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
cmark_chunk input;
bool maybe_lazy;
- cmark_strbuf_put(parser->curline, buffer, bytes);
+ if (parser->options & CMARK_OPT_VALIDATE_UTF8) {
+ utf8proc_check(parser->curline, buffer, bytes);
+ } else {
+ cmark_strbuf_put(parser->curline, buffer, bytes);
+ }
parser->offset = 0;
parser->column = 0;
parser->blank = false;
diff --git a/src/cmark.h b/src/cmark.h
index d86e13e..4ff1ca0 100644
--- a/src/cmark.h
+++ b/src/cmark.h
@@ -506,6 +506,11 @@ char *cmark_render_commonmark(cmark_node *root, int options, int width);
*/
#define CMARK_OPT_SMART 8
+/** Validate UTF-8 in the input before parsing, replacing illegal
+ * sequences with the replacement character U+FFFD.
+ */
+#define CMARK_OPT_VALIDATE_UTF8 16
+
/**
* ## Version information
*/
diff --git a/src/main.c b/src/main.c
index c23071f..5dc97b2 100644
--- a/src/main.c
+++ b/src/main.c
@@ -92,6 +92,8 @@ int main(int argc, char *argv[])
options |= CMARK_OPT_SMART;
} else if (strcmp(argv[i], "--normalize") == 0) {
options |= CMARK_OPT_NORMALIZE;
+ } else if (strcmp(argv[i], "--validate-utf8") == 0) {
+ options |= CMARK_OPT_VALIDATE_UTF8;
} else if ((strcmp(argv[i], "--help") == 0) ||
(strcmp(argv[i], "-h") == 0)) {
print_usage();