From 01cb5c9563cc257e14a0093843d87621563d961f Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 17 Jan 2016 14:28:53 -0800 Subject: Improved escaping in commonmark renderer. We try not to escape punctuation unless we absolutely have to. So, `)` and `.` are no longer escaped whenever they occur after digits; now they are only escaped if they are genuinely in a position where they'd cause a list item. This required a couple changes to render.c. - `renderer->begin_content` is only set to false AFTER a string of digits at the beginning of the line. (This is slightly unprincipled.) - We never break before a numeral (also slightly unprincipled). --- src/commonmark.c | 11 ++++++++--- src/render.c | 18 +++++++++++++++--- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/commonmark.c b/src/commonmark.c index 3eac076..4fb9cec 100644 --- a/src/commonmark.c +++ b/src/commonmark.c @@ -24,6 +24,8 @@ static inline void outc(cmark_renderer *renderer, cmark_escaping escape, int32_t c, unsigned char nextc) { bool needs_escaping = false; char encoded[20]; + bool follows_digit = renderer->buffer->size > 0 && + cmark_isdigit(renderer->buffer->ptr[renderer->buffer->size - 1]); needs_escaping = escape != LITERAL && @@ -31,9 +33,12 @@ static inline void outc(cmark_renderer *renderer, cmark_escaping escape, (c == '*' || c == '_' || c == '[' || c == ']' || c == '#' || c == '<' || c == '>' || c == '\\' || c == '`' || c == '!' || (c == '&' && isalpha(nextc)) || (c == '!' && nextc == '[') || - (renderer->begin_content && (c == '-' || c == '+' || c == '=')) || - ((c == '.' || c == ')') && - isdigit(renderer->buffer->ptr[renderer->buffer->size - 1])))) || + (renderer->begin_content && (c == '-' || c == '+' || c == '=') && + // begin_content doesn't get set to false til we've passed digits + // at the beginning of line, so... + !follows_digit) || + (renderer->begin_content && (c == '.' 
|| c == ')') && follows_digit && + (nextc == 0 || cmark_isspace(nextc))))) || (escape == URL && (c == '`' || c == '<' || c == '>' || isspace(c) || c == '\\' || c == ')' || c == '(')) || (escape == TITLE && diff --git a/src/render.c b/src/render.c index 898a9e2..2c941bf 100755 --- a/src/render.c +++ b/src/render.c @@ -23,6 +23,7 @@ static void S_out(cmark_renderer *renderer, const char *source, bool wrap, unsigned char nextc; int32_t c; int i = 0; + int last_nonspace; int len; cmark_chunk remainder = cmark_chunk_literal(""); int k = renderer->buffer->size - 1; @@ -63,15 +64,20 @@ static void S_out(cmark_renderer *renderer, const char *source, bool wrap, nextc = source[i + len]; if (c == 32 && wrap) { if (!renderer->begin_line) { + last_nonspace = renderer->buffer->size; cmark_strbuf_putc(renderer->buffer, ' '); renderer->column += 1; renderer->begin_line = false; renderer->begin_content = false; - renderer->last_breakable = renderer->buffer->size - 1; // skip following spaces while (source[i + 1] == ' ') { i++; } + // We don't allow breaks that make a digit the first character + // because this causes problems with commonmark output. + if (!cmark_isdigit(source[i + 1])) { + renderer->last_breakable = last_nonspace; + } } } else if (c == 10) { @@ -83,11 +89,17 @@ static void S_out(cmark_renderer *renderer, const char *source, bool wrap, } else if (escape == LITERAL) { cmark_render_code_point(renderer, c); renderer->begin_line = false; - renderer->begin_content = false; + // we don't set 'begin_content' to false til we've + // finished parsing a digit. 
Reason: in commonmark + // we need to escape a potential list marker after + // a digit: + renderer->begin_content = renderer->begin_content && + cmark_isdigit(c) == 1; } else { (renderer->outc)(renderer, escape, c, nextc); renderer->begin_line = false; - renderer->begin_content = false; + renderer->begin_content = renderer->begin_content && + cmark_isdigit(c) == 1; } // If adding the character went beyond width, look for an -- cgit v1.2.3