From 2da7c3f21e2b70cfd08d0f193eeaa6f00e9eb1b8 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Sun, 14 Dec 2014 18:21:04 -0800
Subject: Improved rules for emphasis and strong emphasis.

This improves parsing of emphasis around punctuation.

Background:
http://talk.commonmark.org/t/emphasis-inside-strong-broken-in-js-implementation-when-parenthesis-involved/903/6

The basic idea of the change is that if the delimiter is part of
a delimiter clump that has punctuation to the left and a normal
character (non-space, non-punctuation) to the right, it can only
be an opener.  If it has punctuation to the right and a normal
character (non-space, non-punctuation) to the left, it can only be a closer.

This handles cases like

    **Gomphocarpus (*Gomphocarpus physocarpus*, syn. *Asclepias
    physocarpa*)**

and

    **foo "*bar*" foo**

better than before.

The spec section on Emphasis and Strong Emphasis has been extensively
revised.  The C and JS implementations have been brought up to date,
and all tests pass.
---
 js/lib/inlines.js |  12 ++-
 spec.txt          | 242 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 src/inlines.c     |  14 +++-
 3 files changed, 244 insertions(+), 24 deletions(-)

diff --git a/js/lib/inlines.js b/js/lib/inlines.js
index c799d0d..297d31f 100644
--- a/js/lib/inlines.js
+++ b/js/lib/inlines.js
@@ -41,6 +41,8 @@ var HTMLTAG = "(?:" + OPENTAG + "|" + CLOSETAG + "|" + HTMLCOMMENT + "|" +
         PROCESSINGINSTRUCTION + "|" + DECLARATION + "|" + CDATA + ")";
 var ENTITY = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});";
 
+var rePunctuation = new RegExp(/^[\u2000-\u206F\u2E00-\u2E7F\\'!"#\$%&\(\)\*\+,\-\.\/:;<=>\?@\[\]\^_`\{\|\}~]/);
+
 var reHtmlTag = new RegExp('^' + HTMLTAG, 'i');
 
 var reLinkTitle = new RegExp(
@@ -227,8 +229,14 @@ var scanDelims = function(cc) {
         char_after = fromCodePoint(cc_after);
     }
 
-    var can_open = numdelims > 0 && !(/\s/.test(char_after));
-    var can_close = numdelims > 0 && !(/\s/.test(char_before));
+    var can_open = numdelims > 0 && !(/\s/.test(char_after)) &&
+            !(rePunctuation.test(char_after) &&
+             !(/\s/.test(char_before)) &&
+             !(rePunctuation.test(char_before)));
+    var can_close = numdelims > 0 && !(/\s/.test(char_before)) &&
+            !(rePunctuation.test(char_before) &&
+              !(/\s/.test(char_after)) &&
+              !(rePunctuation.test(char_after)));
     if (cc === C_UNDERSCORE) {
         can_open = can_open && !((/[a-z0-9]/i).test(char_before));
         can_close = can_close && !((/[a-z0-9]/i).test(char_after));
diff --git a/spec.txt b/spec.txt
index bb7e620..cdf79ae 100644
--- a/spec.txt
+++ b/spec.txt
@@ -4390,36 +4390,107 @@ internal emphasis: foo*bar*baz
 no emphasis: foo_bar_baz
 ```
 
-The following rules capture all of these patterns, while allowing
-for efficient parsing strategies that do not backtrack:
+The rules given below capture all of these patterns, while allowing
+for efficient parsing strategies that do not backtrack.
+
+First, some definitions.  A [delimiter run](@delimiter-run) is either
+a sequence of one or more `*` characters that is not preceded or
+followed by a `*` character, or a sequence of one or more `_`
+characters that is not preceded or followed by a `_` character.
+
+A [left-flanking delimiter run](@right-facing-delimiter-run) is
+a [delimiter run](#delimiter-run) that is (a) not followed by [unicode
+whitespace](#unicode-whitespace), and (b) either not followed by a
+[punctuation character](#punctuation-character), or
+preceded by [unicode whitespace](#unicode-whitespace) or
+a [punctuation character](#punctuation-character).
+
+A [right-flanking delimiter run](@left-facing-delimiter-run) is
+a [delimiter run](#delimiter-run) that is (a) not preceded by [unicode
+whitespace](#unicode-whitespace), and (b) either not preceded by a
+[punctuation character](#punctuation-character), or
+followed by [unicode whitespace](#unicode-whitespace) or
+a [punctuation character](#punctuation-character).
+
+Here are some examples of delimiter runs.
+
+  - left-flanking but not right-flanking:
+
+    ```
+    ***abc
+      _abc
+    **"abc"
+     _"abc"
+    ```
+
+  - right-flanking but not left-flanking:
+
+    ```
+    abc***
+      abc_
+    "abc"**
+    "abc"_
+    ```
+
+  - Both left- and right-flanking:
+
+    ```
+    abc***def
+    "abc"_"def"
+    ```
+
+  - Neither left- nor right-flanking:
+
+    ```
+    abc *** def
+    a _ b
+    ```
+
+(The idea of distinguishing left-flanking and right-flanking
+delimiter runs based on the character before and the character
+after comes from Roopesh Chander's
+[vfmd](http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags).
+vfmd uses the terminology "emphasis indicator string" instead of "delimiter
+run," and its rules for distinguishing left- and right-flanking runs
+are a bit more complex than the ones given here.)
+
+The following rules define emphasis and strong emphasis:
 
 1.  A single `*` character [can open emphasis](@can-open-emphasis)
-    iff it is not followed by [unicode whitespace](#unicode-whitespace).
+    iff it is part of a
+    [left-flanking delimiter run](#right-facing-delimiter-run).
 
 2.  A single `_` character [can open emphasis](#can-open-emphasis) iff
-    it is not followed by [unicode whitespace](#unicode-whitespace)
-    and it is not preceded by an ASCII alphanumeric character.
+    it is part of a
+    [left-flanking delimiter run](#right-facing-delimiter-run)
+    and is not preceded by an ASCII alphanumeric character.
 
 3.  A single `*` character [can close emphasis](@can-close-emphasis)
-    iff it is not preceded by [unicode whitespace](#unicode-whitespace).
+    iff it is part of a
+    [right-flanking delimiter run](#left-facing-delimiter-run).
 
-4.  A single `_` character [can close emphasis](#can-close-emphasis) iff
-    it is not preceded by [unicode whitespace](#unicode-whitespace)
+4.  A single `_` character [can close emphasis](#can-close-emphasis)
+    iff it is part of a
+    [right-flanking delimiter run](#left-facing-delimiter-run)
     and it is not followed by an ASCII alphanumeric character.
 
 5.  A double `**` [can open strong emphasis](@can-open-strong-emphasis)
-    iff it is not followed by [unicode whitespace](#unicode-whitespace).
+    iff it is part of a
+    [left-flanking delimiter run](#right-facing-delimiter-run).
 
 6.  A double `__` [can open strong emphasis](#can-open-strong-emphasis)
-    iff it is not followed by [unicode whitespace](#unicode-whitespace)
-    and it is not preceded by an ASCII alphanumeric character.
+    iff it is part of a
+    [left-flanking delimiter run](#right-facing-delimiter-run)
+    and is not preceded by an ASCII alphanumeric character.
 
 7.  A double `**` [can close strong emphasis](@can-close-strong-emphasis)
-    iff it is not preceded by [unicode whitespace](#unicode-whitespace).
+    iff it is part of a
+    [right-flanking delimiter run](#left-facing-delimiter-run).
 
 8.  A double `__` [can close strong emphasis](#can-close-strong-emphasis)
-    iff it is not preceded by [unicode whitespace](#unicode-whitespace)
-    and it is not followed by an ASCII alphanumeric character.
+    iff it is part of a
+    [right-flanking delimiter run](#left-facing-delimiter-run)
+    and is not followed by an ASCII alphanumeric character.
 
 9.  Emphasis begins with a delimiter that [can open
     emphasis](#can-open-emphasis) and ends with a delimiter that [can close
@@ -4487,7 +4558,8 @@ Rule 1:
 .
 
 This is not emphasis, because the opening `*` is followed by
-whitespace:
+whitespace, and hence not part of a [left-flanking delimiter
+run](#right-facing-delimiter-run):
 
 .
 a * foo bar*
@@ -4495,6 +4567,16 @@ a * foo bar*
 <p>a * foo bar*</p>
 .
 
+This is not emphasis, because the opening `*` is preceded
+by an alphanumeric and followed by punctuation, and hence
+not part of a [left-flanking delimiter run](#right-facing-delimiter-run):
+
+.
+a*"foo"*
+.
+<p>a*&quot;foo&quot;*</p>
+.
+
 Unicode nonbreaking spaces count as whitespace, too:
 
 .
@@ -4534,6 +4616,15 @@ _ foo bar_
 <p>_ foo bar_</p>
 .
 
+This is not emphasis, because the opening `_` is preceded
+by an alphanumeric and followed by punctuation:
+
+.
+a_"foo"_
+.
+<p>a_&quot;foo&quot;_</p>
+.
+
 Emphasis with `_` is not allowed inside ASCII words:
 
 .
@@ -4558,6 +4649,15 @@ But it is permitted inside non-ASCII words:
 
 Rule 3:
 
+This is not emphasis, because the closing delimiter does
+not match the opening delimiter:
+
+.
+_foo*
+.
+<p>_foo*</p>
+.
+
 This is not emphasis, because the closing `*` is preceded by
 whitespace:
 
@@ -4567,6 +4667,26 @@ whitespace:
 <p>*foo bar *</p>
 .
 
+This is not emphasis, because the second `*` is
+preceded by punctuation and followed by an alphanumeric
+(hence it is not part of a [right-flanking delimiter
+run](#left-facing-delimiter-run)):
+
+.
+*(*foo)
+.
+<p>*(*foo)</p>
+.
+
+The point of this restriction is more easily appreciated
+with this example:
+
+.
+*(*foo*)*
+.
+<p><em>(<em>foo</em>)</em></p>
+.
+
 Intraword emphasis with `*` is allowed:
 
 .
@@ -4587,7 +4707,24 @@ _foo bar _
 <p>_foo bar _</p>
 .
 
-Intraword emphasis:
+This is not emphasis, because the second `_` is
+preceded by punctuation and followed by an alphanumeric:
+
+.
+_(_foo)
+.
+<p>_(_foo)</p>
+.
+
+This is emphasis within emphasis:
+
+.
+_(_foo_)_
+.
+<p><em>(<em>foo</em>)</em></p>
+.
+
+Intraword emphasis is disallowed for `_`:
 
 .
 _foo_bar
@@ -4624,6 +4761,16 @@ followed by whitespace:
 <p>** foo bar**</p>
 .
 
+This is not strong emphasis, because the opening `**` is preceded
+by an alphanumeric and followed by punctuation, and hence
+not part of a [left-flanking delimiter run](#right-facing-delimiter-run):
+
+.
+a**"foo"**
+.
+<p>a**&quot;foo&quot;**</p>
+.
+
 Intraword strong emphasis with `**` is permitted:
 
 .
@@ -4649,7 +4796,16 @@ __ foo bar__
 <p>__ foo bar__</p>
 .
 
-Intraword emphasis examples:
+This is not strong emphasis, because the opening `__` is preceded
+by an alphanumeric and followed by punctuation:
+
+.
+a__"foo"__
+.
+<p>a__&quot;foo&quot;__</p>
+.
+
+Intraword strong emphasis is forbidden with `__`:
 
 .
 foo__bar__
@@ -4689,6 +4845,38 @@ by whitespace:
 (Nor can it be interpreted as an emphasized `*foo bar *`, because of
 Rule 11.)
 
+This is not strong emphasis, because the second `**` is
+preceded by punctuation and followed by an alphanumeric:
+
+.
+**(**foo)
+.
+<p>**(**foo)</p>
+.
+
+The point of this restriction is more easily appreciated
+with these examples:
+
+.
+*(**foo**)*
+.
+<p><em>(<strong>foo</strong>)</em></p>
+.
+
+.
+**Gomphocarpus (*Gomphocarpus physocarpus*, syn.
+*Asclepias physocarpa*)**
+.
+<p><strong>Gomphocarpus (<em>Gomphocarpus physocarpus</em>, syn.
+<em>Asclepias physocarpa</em>)</strong></p>
+.
+
+.
+**foo "*bar*" foo**
+.
+<p><strong>foo &quot;<em>bar</em>&quot; foo</strong></p>
+.
+
 Intraword emphasis:
 
 .
@@ -4708,7 +4896,25 @@ __foo bar __
 <p>__foo bar __</p>
 .
 
-Intraword strong emphasis examples:
+This is not strong emphasis, because the second `__` is
+preceded by punctuation and followed by an alphanumeric:
+
+.
+__(__foo)
+.
+<p>__(__foo)</p>
+.
+
+The point of this restriction is more easily appreciated
+with this example:
+
+.
+_(__foo__)_
+.
+<p><em>(<strong>foo</strong>)</em></p>
+.
+
+Intraword strong emphasis is forbidden with `__`:
 
 .
 __foo__bar
diff --git a/src/inlines.c b/src/inlines.c
index f63fabe..3f69837 100644
--- a/src/inlines.c
+++ b/src/inlines.c
@@ -261,7 +261,7 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close)
 		}
 		len = utf8proc_iterate(subj->input.data + before_char_pos,
 				 subj->pos - before_char_pos, &before_char);
-		if (len == 0) {
+		if (len == -1) {
 			before_char = 10;
 		}
 	}
@@ -273,11 +273,17 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close)
 
 	len = utf8proc_iterate(subj->input.data + subj->pos,
 			 subj->input.len - subj->pos, &after_char);
-	if (len == 0) {
+	if (len == -1) {
 		after_char = 10;
 	}
-	*can_open = numdelims > 0 && !utf8proc_is_space(after_char);
-	*can_close = numdelims > 0 && !utf8proc_is_space(before_char);
+	*can_open = numdelims > 0 && !utf8proc_is_space(after_char) &&
+		!(utf8proc_is_punctuation(after_char) &&
+		  !utf8proc_is_space(before_char) &&
+		  !utf8proc_is_punctuation(before_char));
+	*can_close = numdelims > 0 && !utf8proc_is_space(before_char) &&
+		!(utf8proc_is_punctuation(before_char) &&
+		  !utf8proc_is_space(after_char) &&
+		  !utf8proc_is_punctuation(after_char));
 	if (c == '_') {
 		*can_open = *can_open &&
 			!(before_char < 128 && isalnum((char)before_char));
-- 
cgit v1.2.3