From 158bbebe1a0eede2122feecd6f6b5aee9a53468d Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 3 Nov 2014 17:36:01 -0800 Subject: Removed artificial rule for emph/strong markers. Previously there was a rule that nothing in a string of more than 3 `*` or `_` characters could close or start emphasis. This was artificial and led to strange asymmetries, e.g. you could have `*a *b**` emph within emph but not `**a **b****` strong within strong. The new parsing strategy makes it easy to remove this limitation. Spec, js, and c implementations have been updated. Spec might need some further grooming. --- js/lib/inlines.js | 40 ++++++++++----------- spec.txt | 104 +++++++++++++++++++++++++++++------------------------- src/inlines.c | 14 +++++--- 3 files changed, 82 insertions(+), 76 deletions(-) diff --git a/js/lib/inlines.js b/js/lib/inlines.js index 5fde099..4f1f16a 100644 --- a/js/lib/inlines.js +++ b/js/lib/inlines.js @@ -235,8 +235,8 @@ var scanDelims = function(cc) { char_after = fromCodePoint(cc_after); } - var can_open = numdelims > 0 && numdelims <= 3 && !(/\s/.test(char_after)); - var can_close = numdelims > 0 && numdelims <= 3 && !(/\s/.test(char_before)); + var can_open = numdelims > 0 && !(/\s/.test(char_after)); + var can_close = numdelims > 0 && !(/\s/.test(char_before)); if (cc === C_UNDERSCORE) { can_open = can_open && !((/[a-z0-9]/i).test(char_before)); can_close = can_close && !((/[a-z0-9]/i).test(char_after)); @@ -265,6 +265,7 @@ var parseEmphasis = function(cc,inlines) { var res = this.scanDelims(cc); var numdelims = res.numdelims; + var usedelims; if (numdelims === 0) { this.pos = startpos; @@ -279,41 +280,36 @@ var parseEmphasis = function(cc,inlines) { if (opener.cc === cc) { // we have a match! 
- if (opener.numdelims <= numdelims) { // all openers used - - this.pos += opener.numdelims; - var X; - switch (opener.numdelims) { - case 3: - X = function(x) { return Strong([Emph(x)]); }; - break; - case 2: - X = Strong; - break; - case 1: - default: - X = Emph; - break; - } + if (numdelims < 3 || opener.numdelims < 3) { + usedelims = numdelims <= opener.numdelims ? numdelims : opener.numdelims; + } else { // numdelims >= 3 && opener.numdelims >= 3 + usedelims = numdelims % 2 === 0 ? 2 : 1; + } + var X = usedelims === 1 ? Emph : Strong; + + if (opener.numdelims == usedelims) { // all openers used + + this.pos += usedelims; inlines[opener.pos] = X(inlines.slice(opener.pos + 1)); inlines.splice(opener.pos + 1, inlines.length - (opener.pos + 1)); // Remove entries after this, to prevent overlapping nesting: this.emphasis_openers = opener.previous; return true; - } else if (opener.numdelims > numdelims) { // only some openers used + } else if (opener.numdelims > usedelims) { // only some openers used - this.pos += numdelims; - opener.numdelims -= numdelims; + this.pos += usedelims; + opener.numdelims -= usedelims; inlines[opener.pos].c = inlines[opener.pos].c.slice(0, opener.numdelims); - var X = numdelims === 2 ? Strong : Emph; inlines[opener.pos + 1] = X(inlines.slice(opener.pos + 1)); inlines.splice(opener.pos + 2, inlines.length - (opener.pos + 2)); // Remove entries after this, to prevent overlapping nesting: this.emphasis_openers = opener; return true; + } else { // usedelims > opener.numdelims, should never happen + throw new Error("Logic error: usedelims > opener.numdelims"); } } diff --git a/spec.txt b/spec.txt index 1bbd287..3eabb31 100644 --- a/spec.txt +++ b/spec.txt @@ -4250,60 +4250,52 @@ for efficient parsing strategies that do not backtrack: 1. 
A single `*` character [can open emphasis](#can-open-emphasis) iff - (a) it is not part of a sequence of four or more unescaped `*`s, - (b) it is not followed by whitespace, and - (c) either it is not followed by a `*` character or it is + (a) it is not followed by whitespace, and + (b) either it is not followed by a `*` character or it is followed immediately by emphasis or strong emphasis. 2. A single `_` character [can open emphasis](#can-open-emphasis) iff - (a) it is not part of a sequence of four or more unescaped `_`s, - (b) it is not followed by whitespace, - (c) it is not preceded by an ASCII alphanumeric character, and - (d) either it is not followed by a `_` character or it is + (a) it is not followed by whitespace, + (b) it is not preceded by an ASCII alphanumeric character, and + (c) either it is not followed by a `_` character or it is followed immediately by emphasis or strong emphasis. 3. A single `*` character [can close emphasis](#can-close-emphasis) iff - (a) it is not part of a sequence of four or more unescaped `*`s, and - (b) it is not preceded by whitespace. + (a) it is not preceded by whitespace. 4. A single `_` character [can close emphasis](#can-close-emphasis) iff - (a) it is not part of a sequence of four or more unescaped `_`s, - (b) it is not preceded by whitespace, and - (c) it is not followed by an ASCII alphanumeric character. + (a) it is not preceded by whitespace, and + (b) it is not followed by an ASCII alphanumeric character. 5. A double `**` [can open strong emphasis](#can-open-strong-emphasis) iff - (a) it is not part of a sequence of four or more unescaped `*`s, - (b) it is not followed by whitespace, and - (c) either it is not followed by a `*` character or it is + (a) it is not followed by whitespace, and + (b) either it is not followed by a `*` character or it is followed immediately by emphasis. 6. 
A double `__` [can open strong emphasis](#can-open-strong-emphasis) iff - (a) it is not part of a sequence of four or more unescaped `_`s, - (b) it is not followed by whitespace, and - (c) it is not preceded by an ASCII alphanumeric character, and - (d) either it is not followed by a `_` character or it is + (a) it is not followed by whitespace, and + (b) it is not preceded by an ASCII alphanumeric character, and + (c) either it is not followed by a `_` character or it is followed immediately by emphasis. 7. A double `**` [can close strong emphasis](#can-close-strong-emphasis) iff - (a) it is not part of a sequence of four or more unescaped `*`s, and - (b) it is not preceded by whitespace. + (a) it is not preceded by whitespace. 8. A double `__` [can close strong emphasis](#can-close-strong-emphasis) iff - (a) it is not part of a sequence of four or more unescaped `_`s, - (b) it is not preceded by whitespace, and - (c) it is not followed by an ASCII alphanumeric character. + (a) it is not preceded by whitespace, and + (b) it is not followed by an ASCII alphanumeric character. 9. Emphasis begins with a delimiter that [can open emphasis](#can-open-emphasis) and ends with a delimiter that [can close @@ -4544,19 +4536,13 @@ and __foo bar __

and __foo bar __

. -The rules imply that a sequence of four or more unescaped `*` or -`_` characters will always be parsed as a literal string: - -. -****hi**** -. -

****hi****

-. +The rules imply that a sequence of `*` or `_` characters +surrounded by whitespace will be parsed as a literal string: . -_____hi_____ +foo ******** . -

_____hi_____

+

foo ********

. . @@ -4827,8 +4813,7 @@ the internal delimiters [can close emphasis](#can-close-emphasis), while in the cases with spaces, they cannot. Note that you cannot nest emphasis directly inside emphasis -using the same delimeter, or strong emphasis directly inside -strong emphasis: +using the same delimiter: . **foo** @@ -4836,22 +4821,25 @@ strong emphasis:

foo

. +For this, you need to switch delimiters: + . -****foo**** +*_foo_* . -

****foo****

+

foo

. -For these nestings, you need to switch delimiters: +Strong within strong is possible without switching +delimiters: . -*_foo_* +****foo**** . -

foo

+

foo

. . -**__foo__** +____foo____ .

foo

. @@ -4890,21 +4878,19 @@ similarly for `_` and `__`):

foo bar**

. -The following contains no strong emphasis, because the opening -delimiter is closed by the first `*` before `bar`: - . -*foo**bar*** +*foo**** . -

foobar**

+

foo***

. -However, a string of four or more `****` can never close emphasis: +The following contains no strong emphasis, because the opening +delimiter is closed by the first `*` before `bar`: . -*foo**** +*foo**bar*** . -

*foo****

+

foobar**

. We retain symmetry in these cases: @@ -4927,6 +4913,26 @@ We retain symmetry in these cases:

foo bar

. +. +**foo*** + +***foo** +. +

foo*

+

*foo

+. + +. +**foo **bar**** + +****foo** bar** +. +

foo bar

+

foo bar

+. + + + More cases with mismatched delimiters: . diff --git a/src/inlines.c b/src/inlines.c index 9216979..e747dfd 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -297,8 +297,8 @@ static int scan_delims(subject* subj, unsigned char c, bool * can_open, bool * c advance(subj); } char_after = peek_char(subj); - *can_open = numdelims > 0 && numdelims <= 3 && !isspace(char_after); - *can_close = numdelims > 0 && numdelims <= 3 && !isspace(char_before); + *can_open = numdelims > 0 && !isspace(char_after); + *can_close = numdelims > 0 && !isspace(char_before); if (c == '_') { *can_open = *can_open && !isalnum(char_before); *can_close = *can_close && !isalnum(char_after); @@ -324,6 +324,7 @@ static node_inl* handle_strong_emph(subject* subj, unsigned char c, node_inl **l bool can_open, can_close; int numdelims; int useDelims; + int openerDelims; inline_stack * istack; node_inl * inl; node_inl * emph; @@ -347,9 +348,12 @@ static node_inl* handle_strong_emph(subject* subj, unsigned char c, node_inl **l } // calculate the actual number of delimeters used from this closer - useDelims = istack->delim_count; - if (useDelims == 3) useDelims = numdelims == 3 ? 1 : numdelims; - else if (useDelims > numdelims) useDelims = 1; + openerDelims = istack->delim_count; + if (numdelims < 3 || openerDelims < 3) { + useDelims = numdelims <= openerDelims ? numdelims : openerDelims; + } else { // (numdelims >= 3 && openerDelims >= 3) + useDelims = numdelims % 2 == 0 ? 2 : 1; + } if (istack->delim_count == useDelims) { -- cgit v1.2.3