From ac39623d667999cfae1444b46508a9a423b0df1b Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 13 Jul 2015 09:21:35 -0700 Subject: Added `CMARK_OPT_SAFE` option and `--safe` command-line flag. * Added `CMARK_OPT_SAFE`. This option disables rendering of raw HTML and potentially dangerous links. * Added `--safe` option in command-line program. * Updated `cmark.3` man page. * Added `scan_dangerous_url` to scanners. * In HTML, suppress rendering of raw HTML and potentially dangerous links if `CMARK_OPT_SAFE`. Dangerous URLs are those that begin with `javascript:`, `vbscript:`, `file:`, or `data:` (except for `image/png`, `image/gif`, `image/jpeg`, or `image/webp` mime types). * Added `api_test` for `OPT_CMARK_SAFE`. * Rewrote `README.md` on security. --- README.md | 15 ++- api_test/main.c | 16 +++ man/man1/cmark.1 | 8 ++ man/man3/cmark.3 | 18 +++- src/cmark.h | 8 ++ src/html.c | 38 +++++-- src/main.c | 3 + src/scanners.c | 315 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/scanners.h | 2 + src/scanners.re | 14 +++ 10 files changed, 422 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index c0ca22d..5cfbb10 100644 --- a/README.md +++ b/README.md @@ -139,11 +139,16 @@ Usage Instructions for the use of the command line program and library can be found in the man pages in the `man` subdirectory. -**A note on security:** -This library does not attempt to sanitize link attributes or -raw HTML. If you use it in applications that accept -untrusted user input, you must run the output through an HTML -sanitizer to protect against +Security +-------- + +By default, the library will pass through raw HTML and potentially +dangerous links (`javascript:`, `vbscript:`, `data:`, `file:`). 
+ +It is recommended that users either disable this potentially unsafe +feature by using the option `CMARK_OPT_SAFE` (or `--safe` with the +command-line program), or run the output through an HTML sanitizer +to protect against [XSS attacks](http://en.wikipedia.org/wiki/Cross-site_scripting). Contributing diff --git a/api_test/main.c b/api_test/main.c index 104371c..dfb5483 100644 --- a/api_test/main.c +++ b/api_test/main.c @@ -713,6 +713,21 @@ numeric_entities(test_batch_runner *runner) "Max hexadecimal entity length"); } +static void +test_safe(test_batch_runner *runner) +{ + // Test safe mode + static const char raw_html[] = + "<div>\nhi\n</div>\n\n<a>hi</a>\n[link](JAVAscript:alert('hi'))\n![image](file:my.js)\n"; + char *html = cmark_markdown_to_html(raw_html, + sizeof(raw_html) - 1, + CMARK_OPT_DEFAULT | + CMARK_OPT_SAFE); + STR_EQ(runner, html, "<!-- raw HTML omitted -->\n<p><!-- raw HTML omitted -->hi<!-- raw HTML omitted -->\n<a href=\"\">link</a>\n<img src=\"\" alt=\"image\" /></p>\n", + "input with raw HTML and dangerous links"); + free(html); +} + static void test_md_to_html(test_batch_runner *runner, const char *markdown, const char *expected_html, const char *msg) @@ -741,6 +756,7 @@ int main() { line_endings(runner); numeric_entities(runner); test_cplusplus(runner); + test_safe(runner); test_print_summary(runner); retval = test_ok(runner) ? 0 : 1; diff --git a/man/man1/cmark.1 b/man/man1/cmark.1 index 64fa697..8dd9165 100644 --- a/man/man1/cmark.1 +++ b/man/man1/cmark.1 @@ -45,6 +45,14 @@ be rendered as curly quotes, depending on their position. \f[C]\-\-\-\f[] will be rendered as an em-dash. \f[C]...\f[] will be rendered as ellipses. .TP 12n +.B \-\-safe +Do not render raw HTML or potentially dangerous URLs. +(Raw HTML is replaced by a placeholder comment; potentially +dangerous URLs are replaced by empty strings.) Dangerous +URLs are those that begin with `javascript:`, `vbscript:`, +`file:`, or `data:` (except for `image/png`, `image/gif`, +`image/jpeg`, or `image/webp` mime types). +.TP 12n .B \-\-help Print usage information. .TP 12n diff --git a/man/man3/cmark.3 b/man/man3/cmark.3 index 288fadc..1359fcc 100644 --- a/man/man3/cmark.3 +++ b/man/man3/cmark.3 @@ -1,4 +1,4 @@ -.TH cmark 3 "July 12, 2015" "LOCAL" "Library Functions Manual" +.TH cmark 3 "July 13, 2015" "LOCAL" "Library Functions Manual" .SH NAME .PP @@ -569,6 +569,22 @@ dashes. Validate UTF\-8 in the input before parsing, replacing illegal sequences with the replacement character U+FFFD. +.PP +.nf +\fC +.RS 0n +#define CMARK_OPT_SAFE 32 +.RE +\f[] +.fi + +.PP +Suppress raw HTML and unsafe links (\f[C]javascript:\f[], +\f[C]vbscript:\f[], \f[C]file:\f[], and \f[C]data:\f[], except for +\f[C]image/png\f[], \f[C]image/gif\f[], \f[C]image/jpeg\f[], or +\f[C]image/webp\f[] mime types). Raw HTML is replaced by a placeholder +HTML comment. Unsafe links are replaced by empty strings. 
+ .SS Version information diff --git a/src/cmark.h b/src/cmark.h index 7ae6d36..4a85f26 100644 --- a/src/cmark.h +++ b/src/cmark.h @@ -516,6 +516,14 @@ char *cmark_render_latex(cmark_node *root, int options, int width); */ #define CMARK_OPT_VALIDATE_UTF8 16 +/** Suppress raw HTML and unsafe links (`javascript:`, `vbscript:`, + * `file:`, and `data:`, except for `image/png`, `image/gif`, + * `image/jpeg`, or `image/webp` mime types). Raw HTML is replaced + * by a placeholder HTML comment. Unsafe links are replaced by + * empty strings. + */ +#define CMARK_OPT_SAFE 32 + /** * ## Version information */ diff --git a/src/html.c b/src/html.c index 8cf8835..48a80d6 100644 --- a/src/html.c +++ b/src/html.c @@ -8,6 +8,7 @@ #include "node.h" #include "buffer.h" #include "houdini.h" +#include "scanners.h" // Functions to convert cmark_nodes to HTML strings. @@ -174,7 +175,13 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, case CMARK_NODE_HTML: cr(html); - cmark_strbuf_put(html, node->as.literal.data, node->as.literal.len); + if (options & CMARK_OPT_SAFE) { + cmark_strbuf_puts(html, "<!-- raw HTML omitted -->"); + } else { + cmark_strbuf_put(html, node->as.literal.data, + node->as.literal.len); + } + cr(html); break; case CMARK_NODE_HRULE: @@ -228,7 +235,12 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, break; case CMARK_NODE_INLINE_HTML: - cmark_strbuf_put(html, node->as.literal.data, node->as.literal.len); + if (options & CMARK_OPT_SAFE) { + cmark_strbuf_puts(html, "<!-- raw HTML omitted -->"); + } else { + cmark_strbuf_put(html, node->as.literal.data, + node->as.literal.len); + } break; case CMARK_NODE_STRONG: @@ -250,15 +262,19 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, case CMARK_NODE_LINK: if (entering) { cmark_strbuf_puts(html, "<a href=\""); - houdini_escape_href(html, node->as.link.url.data, - node->as.link.url.len); + if (!((options & CMARK_OPT_SAFE) && + scan_dangerous_url(&node->as.link.url, 0))) { + houdini_escape_href(html, + node->as.link.url.data, + node->as.link.url.len); + } if (node->as.link.title.len) { 
cmark_strbuf_puts(html, "\" title=\""); - escape_html(html, node->as.link.title.data, - node->as.link.title.len); + escape_html(html, + node->as.link.title.data, + node->as.link.title.len); } - cmark_strbuf_puts(html, "\">"); } else { cmark_strbuf_puts(html, "</a>"); @@ -268,9 +284,13 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, case CMARK_NODE_IMAGE: if (entering) { cmark_strbuf_puts(html, "<img src=\""); - houdini_escape_href(html, node->as.link.url.data, - node->as.link.url.len); + if (!((options & CMARK_OPT_SAFE) && + scan_dangerous_url(&node->as.link.url, 0))) { + houdini_escape_href(html, + node->as.link.url.data, + node->as.link.url.len); + } cmark_strbuf_puts(html, "\" alt=\""); state->plain = node; } else { diff --git a/src/main.c b/src/main.c index 26e42ca..7fae7e4 100644 --- a/src/main.c +++ b/src/main.c @@ -28,6 +28,7 @@ void print_usage() printf(" --width WIDTH Specify wrap width (default 0 = nowrap)\n"); printf(" --sourcepos Include source position attribute\n"); printf(" --hardbreaks Treat newlines as hard line breaks\n"); + printf(" --safe Suppress raw HTML and dangerous URLs\n"); printf(" --smart Use smart punctuation\n"); printf(" --normalize Consolidate adjacent text nodes\n"); printf(" --help, -h Print usage information\n"); @@ -93,6 +94,8 @@ int main(int argc, char *argv[]) options |= CMARK_OPT_HARDBREAKS; } else if (strcmp(argv[i], "--smart") == 0) { options |= CMARK_OPT_SMART; + } else if (strcmp(argv[i], "--safe") == 0) { + options |= CMARK_OPT_SAFE; } else if (strcmp(argv[i], "--normalize") == 0) { options |= CMARK_OPT_NORMALIZE; } else if (strcmp(argv[i], "--validate-utf8") == 0) { diff --git a/src/scanners.c b/src/scanners.c index b3963a3..75fdb46 100644 --- a/src/scanners.c +++ b/src/scanners.c @@ -20730,3 +20730,318 @@ yy2270: } } + +// Returns positive value if a URL begins in a way that is potentially +// dangerous, with javascript:, vbscript:, file:, or data:, otherwise 0. 
+bufsize_t _scan_dangerous_url(const unsigned char *p) +{ + const unsigned char *marker = NULL; + const unsigned char *start = p; + +{ + unsigned char yych; + unsigned int yyaccept = 0; + yych = *(marker = p); + if (yych <= 'f') { + if (yych <= 'I') { + if (yych <= 'C') { + if (yych != '\n') goto yy2278; + } else { + if (yych <= 'D') goto yy2274; + if (yych == 'F') goto yy2277; + goto yy2278; + } + } else { + if (yych <= 'V') { + if (yych <= 'J') goto yy2275; + if (yych <= 'U') goto yy2278; + goto yy2276; + } else { + if (yych == 'd') goto yy2274; + if (yych <= 'e') goto yy2278; + goto yy2277; + } + } + } else { + if (yych <= 0xDF) { + if (yych <= 'u') { + if (yych == 'j') goto yy2275; + goto yy2278; + } else { + if (yych <= 'v') goto yy2276; + if (yych <= 0x7F) goto yy2278; + if (yych >= 0xC2) goto yy2279; + } + } else { + if (yych <= 0xEF) { + if (yych <= 0xE0) goto yy2281; + if (yych == 0xED) goto yy2286; + goto yy2282; + } else { + if (yych <= 0xF0) goto yy2283; + if (yych <= 0xF3) goto yy2284; + if (yych <= 0xF4) goto yy2285; + } + } + } +yy2273: + { return 0; } +yy2274: + yyaccept = 0; + yych = *(marker = ++p); + if (yych == 'A') goto yy2308; + if (yych == 'a') goto yy2308; + goto yy2273; +yy2275: + yyaccept = 0; + yych = *(marker = ++p); + if (yych == 'A') goto yy2299; + if (yych == 'a') goto yy2299; + goto yy2273; +yy2276: + yyaccept = 0; + yych = *(marker = ++p); + if (yych == 'B') goto yy2292; + if (yych == 'b') goto yy2292; + goto yy2273; +yy2277: + yyaccept = 0; + yych = *(marker = ++p); + if (yych == 'I') goto yy2287; + if (yych == 'i') goto yy2287; + goto yy2273; +yy2278: + yych = *++p; + goto yy2273; +yy2279: + yych = *++p; + if (yych <= 0x7F) goto yy2280; + if (yych <= 0xBF) goto yy2278; +yy2280: + p = marker; + if (yyaccept == 0) { + goto yy2273; + } else { + goto yy2291; + } +yy2281: + yych = *++p; + if (yych <= 0x9F) goto yy2280; + if (yych <= 0xBF) goto yy2279; + goto yy2280; +yy2282: + yych = *++p; + if (yych <= 0x7F) goto yy2280; + if (yych <= 
0xBF) goto yy2279; + goto yy2280; +yy2283: + yych = *++p; + if (yych <= 0x8F) goto yy2280; + if (yych <= 0xBF) goto yy2282; + goto yy2280; +yy2284: + yych = *++p; + if (yych <= 0x7F) goto yy2280; + if (yych <= 0xBF) goto yy2282; + goto yy2280; +yy2285: + yych = *++p; + if (yych <= 0x7F) goto yy2280; + if (yych <= 0x8F) goto yy2282; + goto yy2280; +yy2286: + yych = *++p; + if (yych <= 0x7F) goto yy2280; + if (yych <= 0x9F) goto yy2279; + goto yy2280; +yy2287: + yych = *++p; + if (yych == 'L') goto yy2288; + if (yych != 'l') goto yy2280; +yy2288: + yych = *++p; + if (yych == 'E') goto yy2289; + if (yych != 'e') goto yy2280; +yy2289: + yych = *++p; + if (yych != ':') goto yy2280; +yy2290: + ++p; +yy2291: + { return (bufsize_t)(p - start); } +yy2292: + yych = *++p; + if (yych == 'S') goto yy2293; + if (yych != 's') goto yy2280; +yy2293: + yych = *++p; + if (yych == 'C') goto yy2294; + if (yych != 'c') goto yy2280; +yy2294: + yych = *++p; + if (yych == 'R') goto yy2295; + if (yych != 'r') goto yy2280; +yy2295: + yych = *++p; + if (yych == 'I') goto yy2296; + if (yych != 'i') goto yy2280; +yy2296: + yych = *++p; + if (yych == 'P') goto yy2297; + if (yych != 'p') goto yy2280; +yy2297: + yych = *++p; + if (yych == 'T') goto yy2298; + if (yych != 't') goto yy2280; +yy2298: + yych = *++p; + if (yych == ':') goto yy2290; + goto yy2280; +yy2299: + yych = *++p; + if (yych == 'V') goto yy2300; + if (yych != 'v') goto yy2280; +yy2300: + yych = *++p; + if (yych == 'A') goto yy2301; + if (yych != 'a') goto yy2280; +yy2301: + yych = *++p; + if (yych == 'S') goto yy2302; + if (yych != 's') goto yy2280; +yy2302: + yych = *++p; + if (yych == 'C') goto yy2303; + if (yych != 'c') goto yy2280; +yy2303: + yych = *++p; + if (yych == 'R') goto yy2304; + if (yych != 'r') goto yy2280; +yy2304: + yych = *++p; + if (yych == 'I') goto yy2305; + if (yych != 'i') goto yy2280; +yy2305: + yych = *++p; + if (yych == 'P') goto yy2306; + if (yych != 'p') goto yy2280; +yy2306: + yych = *++p; + if (yych 
== 'T') goto yy2307; + if (yych != 't') goto yy2280; +yy2307: + yych = *++p; + if (yych == ':') goto yy2290; + goto yy2280; +yy2308: + yych = *++p; + if (yych == 'T') goto yy2309; + if (yych != 't') goto yy2280; +yy2309: + yych = *++p; + if (yych == 'A') goto yy2310; + if (yych != 'a') goto yy2280; +yy2310: + yych = *++p; + if (yych != ':') goto yy2280; + yyaccept = 1; + yych = *(marker = ++p); + if (yych == 'I') goto yy2312; + if (yych != 'i') goto yy2291; +yy2312: + yych = *++p; + if (yych == 'M') goto yy2313; + if (yych != 'm') goto yy2280; +yy2313: + yych = *++p; + if (yych == 'A') goto yy2314; + if (yych != 'a') goto yy2280; +yy2314: + yych = *++p; + if (yych == 'G') goto yy2315; + if (yych != 'g') goto yy2280; +yy2315: + yych = *++p; + if (yych == 'E') goto yy2316; + if (yych != 'e') goto yy2280; +yy2316: + yych = *++p; + if (yych != '/') goto yy2280; + yych = *++p; + if (yych <= 'W') { + if (yych <= 'J') { + if (yych == 'G') goto yy2319; + if (yych <= 'I') goto yy2280; + goto yy2320; + } else { + if (yych == 'P') goto yy2318; + if (yych <= 'V') goto yy2280; + goto yy2321; + } + } else { + if (yych <= 'j') { + if (yych == 'g') goto yy2319; + if (yych <= 'i') goto yy2280; + goto yy2320; + } else { + if (yych <= 'p') { + if (yych <= 'o') goto yy2280; + } else { + if (yych == 'w') goto yy2321; + goto yy2280; + } + } + } +yy2318: + yych = *++p; + if (yych == 'N') goto yy2329; + if (yych == 'n') goto yy2329; + goto yy2280; +yy2319: + yych = *++p; + if (yych == 'I') goto yy2328; + if (yych == 'i') goto yy2328; + goto yy2280; +yy2320: + yych = *++p; + if (yych == 'P') goto yy2326; + if (yych == 'p') goto yy2326; + goto yy2280; +yy2321: + yych = *++p; + if (yych == 'E') goto yy2322; + if (yych != 'e') goto yy2280; +yy2322: + yych = *++p; + if (yych == 'B') goto yy2323; + if (yych != 'b') goto yy2280; +yy2323: + yych = *++p; + if (yych == 'P') goto yy2324; + if (yych != 'p') goto yy2280; +yy2324: + ++p; + { return 0; } +yy2326: + yych = *++p; + if (yych == 'E') goto 
yy2327; + if (yych != 'e') goto yy2280; +yy2327: + yych = *++p; + if (yych == 'G') goto yy2324; + if (yych == 'g') goto yy2324; + goto yy2280; +yy2328: + yych = *++p; + if (yych == 'F') goto yy2324; + if (yych == 'f') goto yy2324; + goto yy2280; +yy2329: + ++p; + if ((yych = *p) == 'G') goto yy2324; + if (yych == 'g') goto yy2324; + goto yy2280; +} + +} + diff --git a/src/scanners.h b/src/scanners.h index db8eeb8..a6a71bf 100644 --- a/src/scanners.h +++ b/src/scanners.h @@ -26,6 +26,7 @@ bufsize_t _scan_hrule(const unsigned char *p); bufsize_t _scan_open_code_fence(const unsigned char *p); bufsize_t _scan_close_code_fence(const unsigned char *p); bufsize_t _scan_entity(const unsigned char *p); +bufsize_t _scan_dangerous_url(const unsigned char *p); #define scan_scheme(c, n) _scan_at(&_scan_scheme, c, n) #define scan_autolink_uri(c, n) _scan_at(&_scan_autolink_uri, c, n) @@ -47,6 +48,7 @@ bufsize_t _scan_entity(const unsigned char *p); #define scan_open_code_fence(c, n) _scan_at(&_scan_open_code_fence, c, n) #define scan_close_code_fence(c, n) _scan_at(&_scan_close_code_fence, c, n) #define scan_entity(c, n) _scan_at(&_scan_entity, c, n) +#define scan_dangerous_url(c, n) _scan_at(&_scan_dangerous_url, c, n) #ifdef __cplusplus } diff --git a/src/scanners.re b/src/scanners.re index efa6731..fbe3283 100644 --- a/src/scanners.re +++ b/src/scanners.re @@ -315,3 +315,17 @@ bufsize_t _scan_entity(const unsigned char *p) .? { return 0; } */ } + +// Returns positive value if a URL begins in a way that is potentially +// dangerous, with javascript:, vbscript:, file:, or data:, otherwise 0. +bufsize_t _scan_dangerous_url(const unsigned char *p) +{ + const unsigned char *marker = NULL; + const unsigned char *start = p; +/*!re2c + 'data:image/' ('png'|'gif'|'jpeg'|'webp') { return 0; } + 'javascript:' | 'vbscript:' | 'file:' | 'data:' { return (bufsize_t)(p - start); } + .? { return 0; } +*/ +} + -- cgit v1.2.3