From 2da7c3f21e2b70cfd08d0f193eeaa6f00e9eb1b8 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 14 Dec 2014 18:21:04 -0800 Subject: Improved rules for emphasis and strong emphasis. This improves parsing of emphasis around punctuation. Background: http://talk.commonmark.org/t/emphasis-inside-strong-broken-in-js-implementation-when-parenthesis-involved/903/6 The basic idea of the change is that if the delimiter is part of a delimiter clump that has punctuation to the left and a normal character (non-space, non-punctuation) to the right, it can only be an opener. If it has punctuation to the right and a normal character (non-space, non-punctuation) to the left, it can only be a closer. This handles cases like **Gomphocarpus (*Gomphocarpus physocarpus*, syn. *Asclepias physocarpa*)** and **foo "*bar*" foo** better than before. The spec section on Emphasis and Strong Emphasis has been extensively revised. The C and JS implementations have been brought up to date, and all tests pass. --- js/lib/inlines.js | 12 ++- spec.txt | 242 ++++++++++++++++++++++++++++++++++++++++++++++++++---- src/inlines.c | 14 +++- 3 files changed, 244 insertions(+), 24 deletions(-) diff --git a/js/lib/inlines.js b/js/lib/inlines.js index c799d0d..297d31f 100644 --- a/js/lib/inlines.js +++ b/js/lib/inlines.js @@ -41,6 +41,8 @@ var HTMLTAG = "(?:" + OPENTAG + "|" + CLOSETAG + "|" + HTMLCOMMENT + "|" + PROCESSINGINSTRUCTION + "|" + DECLARATION + "|" + CDATA + ")"; var ENTITY = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});"; +var rePunctuation = new RegExp(/^[\u2000-\u206F\u2E00-\u2E7F\\'!"#\$%&\(\)\*\+,\-\.\/:;<=>\?@\[\]\^_`\{\|\}~]/); + var reHtmlTag = new RegExp('^' + HTMLTAG, 'i'); var reLinkTitle = new RegExp( @@ -227,8 +229,14 @@ var scanDelims = function(cc) { char_after = fromCodePoint(cc_after); } - var can_open = numdelims > 0 && !(/\s/.test(char_after)); - var can_close = numdelims > 0 && !(/\s/.test(char_before)); + var can_open = numdelims > 0 && 
!(/\s/.test(char_after)) && + !(rePunctuation.test(char_after) && + !(/\s/.test(char_before)) && + !(rePunctuation.test(char_before))); + var can_close = numdelims > 0 && !(/\s/.test(char_before)) && + !(rePunctuation.test(char_before) && + !(/\s/.test(char_after)) && + !(rePunctuation.test(char_after))); if (cc === C_UNDERSCORE) { can_open = can_open && !((/[a-z0-9]/i).test(char_before)); can_close = can_close && !((/[a-z0-9]/i).test(char_after)); diff --git a/spec.txt b/spec.txt index bb7e620..cdf79ae 100644 --- a/spec.txt +++ b/spec.txt @@ -4390,36 +4390,107 @@ internal emphasis: foo*bar*baz no emphasis: foo_bar_baz ``` -The following rules capture all of these patterns, while allowing -for efficient parsing strategies that do not backtrack: +The rules given below capture all of these patterns, while allowing +for efficient parsing strategies that do not backtrack. + +First, some definitions. A [delimiter run](@delimiter-run) is either +a sequence of one or more `*` characters that is not preceded or +followed by a `*` character, or a sequence of one or more `_` +characters that is not preceded or followed by a `_` character. + +A [left-flanking delimiter run](@right-facing-delimiter-run) is +a [delimiter run](#delimiter-run) that is (a) not followed by [unicode +whitespace](#unicode-whitespace), and (b) either not followed by a +[punctuation character](#punctuation-character), or +preceded by [unicode whitespace](#unicode-whitespace) or +a [punctuation character](#punctuation-character). + +A [right-flanking delimiter run](@left-facing-delimiter-run) is +a [delimiter run](#delimiter-run) that is (a) not preceded by [unicode +whitespace](#unicode-whitespace), and (b) either not preceded by a +[punctuation character](#punctuation-character), or +followed by [unicode whitespace](#unicode-whitespace) or +a [punctuation character](#punctuation-character). + +Here are some examples of delimiter runs. 
+ + - left-flanking but not right-flanking: + + ``` + ***abc + _abc + **"abc" + _"abc" + ``` + + - right-flanking but not left-flanking: + + ``` + abc*** + abc_ + "abc"** + "abc"_ + ``` + + - Both left and right-flanking: + + ``` + abc***def + "abc"_"def" + ``` + + - Neither left nor right-flanking: + + ``` + abc *** def + a _ b + ``` + +(The idea of distinguishing left-flanking and right-flanking +delimiter runs based on the character before and the character +after comes from Roopesh Chander's +[vfmd](http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags). +vfmd uses the terminology "emphasis indicator string" instead of "delimiter +run," and its rules for distinguishing left- and right-flanking runs +are a bit more complex than the ones given here.) + +The following rules define emphasis and strong emphasis: 1. A single `*` character [can open emphasis](@can-open-emphasis) - iff it is not followed by [unicode whitespace](#unicode-whitespace). + iff it is part of a + [left-flanking delimiter run](#right-facing-delimiter-run). 2. A single `_` character [can open emphasis](#can-open-emphasis) iff - it is not followed by [unicode whitespace](#unicode-whitespace) - and it is not preceded by an ASCII alphanumeric character. + it is part of a + [left-flanking delimiter run](#right-facing-delimiter-run) + and is not preceded by an ASCII alphanumeric character. 3. A single `*` character [can close emphasis](@can-close-emphasis) - iff it is not preceded by [unicode whitespace](#unicode-whitespace). + iff it is part of a + [right-flanking delimiter run](#left-facing-delimiter-run). -4. A single `_` character [can close emphasis](#can-close-emphasis) iff - it is not preceded by [unicode whitespace](#unicode-whitespace) +4. A single `_` character [can close emphasis](#can-close-emphasis) + iff it is part of a + [right-flanking delimiter run](#left-facing-delimiter-run) and it is not followed by an ASCII alphanumeric character. 5. 
A double `**` [can open strong emphasis](@can-open-strong-emphasis) - iff it is not followed by [unicode whitespace](#unicode-whitespace). + iff it is part of a + [left-flanking delimiter run](#right-facing-delimiter-run). 6. A double `__` [can open strong emphasis](#can-open-strong-emphasis) - iff it is not followed by [unicode whitespace](#unicode-whitespace) - and it is not preceded by an ASCII alphanumeric character. + iff it is part of a + [left-flanking delimiter run](#right-facing-delimiter-run) + and is not preceded by an ASCII alphanumeric character. 7. A double `**` [can close strong emphasis](@can-close-strong-emphasis) - iff it is not preceded by [unicode whitespace](#unicode-whitespace). + iff it is part of a + [right-flanking delimiter run](#left-facing-delimiter-run). 8. A double `__` [can close strong emphasis](#can-close-strong-emphasis) - iff it is not preceded by [unicode whitespace](#unicode-whitespace) - and it is not followed by an ASCII alphanumeric character. + iff it is part of a + [right-flanking delimiter run](#left-facing-delimiter-run) + and is not followed by an ASCII alphanumeric character. 9. Emphasis begins with a delimiter that [can open emphasis](#can-open-emphasis) and ends with a delimiter that [can close @@ -4487,7 +4558,8 @@ Rule 1: . This is not emphasis, because the opening `*` is followed by -whitespace: +whitespace, and hence not part of a [left-flanking delimiter +run](#right-facing-delimiter-run): . a * foo bar* @@ -4495,6 +4567,16 @@ a * foo bar*

a * foo bar*

. +This is not emphasis, because the opening `*` is preceded +by an alphanumeric and followed by punctuation, and hence +not part of a [left-flanking delimiter run](#right-facing-delimiter-run): + +. +a*"foo"* +. +

a*"foo"*

+. + Unicode nonbreaking spaces count as whitespace, too: . @@ -4534,6 +4616,15 @@ _ foo bar_

_ foo bar_

. +This is not emphasis, because the opening `_` is preceded +by an alphanumeric and followed by punctuation: + +. +a_"foo"_ +. +

a_"foo"_

+. + Emphasis with `_` is not allowed inside ASCII words: . @@ -4558,6 +4649,15 @@ But it is permitted inside non-ASCII words: Rule 3: +This is not emphasis, because the closing delimiter does +not match the opening delimiter: + +. +_foo* +. +

_foo*

+. + This is not emphasis, because the closing `*` is preceded by whitespace: @@ -4567,6 +4667,26 @@ whitespace:

*foo bar *

. +This is not emphasis, because the second `*` is +preceded by punctuation and followed by an alphanumeric +(hence it is not part of a [right-flanking delimiter +run](#left-facing-delimiter-run)): + +. +*(*foo) +. +

*(*foo)

+. + +The point of this restriction is more easily appreciated +with this example: + +. +*(*foo*)* +. +

(foo)

+. + Intraword emphasis with `*` is allowed: . @@ -4587,7 +4707,24 @@ _foo bar _

_foo bar _

. -Intraword emphasis: +This is not emphasis, because the second `_` is +preceded by punctuation and followed by an alphanumeric: + +. +_(_foo) +. +

_(_foo)

+. + +This is emphasis within emphasis: + +. +_(_foo_)_ +. +

(foo)

+. + +Intraword emphasis is disallowed for `_`: . _foo_bar @@ -4624,6 +4761,16 @@ followed by whitespace:

** foo bar**

. +This is not strong emphasis, because the opening `**` is preceded +by an alphanumeric and followed by punctuation, and hence +not part of a [left-flanking delimiter run](#right-facing-delimiter-run): + +. +a**"foo"** +. +

a**"foo"**

+. + Intraword strong emphasis with `**` is permitted: . @@ -4649,7 +4796,16 @@ __ foo bar__

__ foo bar__

. -Intraword emphasis examples: +This is not strong emphasis, because the opening `__` is preceded +by an alphanumeric and followed by punctuation: + +. +a__"foo"__ +. +

a__"foo"__

+. + +Intraword strong emphasis is forbidden with `__`: . foo__bar__ @@ -4689,6 +4845,38 @@ by whitespace: (Nor can it be interpreted as an emphasized `*foo bar *`, because of Rule 11.) +This is not strong emphasis, because the second `**` is +preceded by punctuation and followed by an alphanumeric: + +. +**(**foo) +. +

**(**foo)

+. + +The point of this restriction is more easily appreciated +with these examples: + +. +*(**foo**)* +. +

(foo)

+. + +. +**Gomphocarpus (*Gomphocarpus physocarpus*, syn. +*Asclepias physocarpa*)** +. +

Gomphocarpus (Gomphocarpus physocarpus, syn. +Asclepias physocarpa)

+. + +. +**foo "*bar*" foo** +. +

foo "bar" foo

+. + Intraword emphasis: . @@ -4708,7 +4896,25 @@ __foo bar __

__foo bar __

. -Intraword strong emphasis examples: +This is not strong emphasis, because the second `__` is +preceded by punctuation and followed by an alphanumeric: + +. +__(__foo) +. +

__(__foo)

+. + +The point of this restriction is more easily appreciated +with this example: + +. +_(__foo__)_ +. +

(foo)

+. + +Intraword strong emphasis is forbidden with `__`: . __foo__bar diff --git a/src/inlines.c b/src/inlines.c index f63fabe..3f69837 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -261,7 +261,7 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close) } len = utf8proc_iterate(subj->input.data + before_char_pos, subj->pos - before_char_pos, &before_char); - if (len == 0) { + if (len == -1) { before_char = 10; } } @@ -273,11 +273,17 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close) len = utf8proc_iterate(subj->input.data + subj->pos, subj->input.len - subj->pos, &after_char); - if (len == 0) { + if (len == -1) { after_char = 10; } - *can_open = numdelims > 0 && !utf8proc_is_space(after_char); - *can_close = numdelims > 0 && !utf8proc_is_space(before_char); + *can_open = numdelims > 0 && !utf8proc_is_space(after_char) && + !(utf8proc_is_punctuation(after_char) && + !utf8proc_is_space(before_char) && + !utf8proc_is_punctuation(before_char)); + *can_close = numdelims > 0 && !utf8proc_is_space(before_char) && + !(utf8proc_is_punctuation(before_char) && + !utf8proc_is_space(after_char) && + !utf8proc_is_punctuation(after_char)); if (c == '_') { *can_open = *can_open && !(before_char < 128 && isalnum((char)before_char)); -- cgit v1.2.3