summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <fiddlosopher@gmail.com>2014-10-24 11:36:32 -0700
committerJohn MacFarlane <fiddlosopher@gmail.com>2014-10-24 11:36:32 -0700
commite931841b27faa7de3d755cb3a6083acf7a464143 (patch)
treee0f6806dbc39d0cb01187adf9b53f97168cd0b6c
parent8b1a5abc80685c1caa2ca825e468902356989aa9 (diff)
parent9e30ca443275356c1693ab785d700c280d5dfa8c (diff)
Merge branch 'emphstack'
Conflicts: Makefile js/lib/inlines.js
-rw-r--r--Makefile8
-rw-r--r--TODO7
-rw-r--r--js/lib/inlines.js151
-rw-r--r--leakcheck.md1561
-rw-r--r--spec.txt110
-rw-r--r--src/blocks.c199
-rw-r--r--src/buffer.c11
-rw-r--r--src/inlines.c457
-rw-r--r--src/main.c5
-rw-r--r--src/print.c34
-rw-r--r--src/references.c5
-rw-r--r--src/utf8.c39
12 files changed, 2100 insertions, 487 deletions
diff --git a/Makefile b/Makefile
index 4a01824..e8a37d7 100644
--- a/Makefile
+++ b/Makefile
@@ -38,6 +38,9 @@ js/stmd.js: js/lib/index.js ${JSMODULES}
testjs: spec.txt
node js/test.js
+jshint:
+ jshint ${JSMODULES}
+
benchjs:
node js/bench.js ${BENCHINP}
@@ -57,14 +60,13 @@ $(SRCDIR)/case_fold_switch.inc: $(DATADIR)/CaseFolding-3.2.0.txt
$(SRCDIR)/html/html_unescape.h: $(SRCDIR)/html/html_unescape.gperf
gperf -I -t -N find_entity -H hash_entity -K entity -C -l --null-strings -m5 $< > $@
-.PHONY: leakcheck clean fuzztest dingus upload
+.PHONY: leakcheck clean fuzztest dingus upload jshint test testjs benchjs
dingus: js/stmd.js
cd js && echo "Starting dingus server at http://localhost:9000" && python -m SimpleHTTPServer 9000
leakcheck: $(PROG)
- # TODO produce leaktest.md that tests everything
- cat leaktest.md | valgrind --leak-check=full --dsymutil=yes $(PROG)
+ cat leakcheck.md | valgrind --leak-check=full --dsymutil=yes $(PROG)
operf: $(PROG)
operf $(PROG) <bench.md >/dev/null
diff --git a/TODO b/TODO
index 8b13789..be547e8 100644
--- a/TODO
+++ b/TODO
@@ -1 +1,8 @@
+- leakcheck reveals leak in new stmd code
+ Create a function to remove and free a stack entry
+ Use a while loop to remove and free all stack entries from top to the
+ one we're matching.
+- use name other than subj->last_emphasis
+
+- in js: make a proper stack (linked list) rather than using an array?
diff --git a/js/lib/inlines.js b/js/lib/inlines.js
index 34f1560..5fde099 100644
--- a/js/lib/inlines.js
+++ b/js/lib/inlines.js
@@ -262,93 +262,81 @@ var Str = function(s) {
// Attempt to parse emphasis or strong emphasis.
var parseEmphasis = function(cc,inlines) {
var startpos = this.pos;
- var c ;
- var first_close = 0;
- c = fromCodePoint(cc);
- var numdelims;
- var numclosedelims;
- var delimpos;
-
- // Get opening delimiters.
- res = this.scanDelims(cc);
- numdelims = res.numdelims;
+ var res = this.scanDelims(cc);
+ var numdelims = res.numdelims;
if (numdelims === 0) {
this.pos = startpos;
return false;
}
- if (numdelims >= 4 || !res.can_open) {
- this.pos += numdelims;
- inlines.push(Str(this.subject.slice(startpos, startpos + numdelims)));
- return true;
- }
+ if (res.can_close) {
- this.pos += numdelims;
+ // Walk the stack and find a matching opener, if possible
+ var opener = this.emphasis_openers;
+ while (opener) {
- var delims_to_match = numdelims;
-
- var current = [];
- var firstend;
- var firstpos;
- var state = 0;
- var can_close = false;
- var can_open = false;
- var last_emphasis_closer = null;
- while (this.last_emphasis_closer[c] >= this.pos) {
- res = this.scanDelims(cc);
- numclosedelims = res.numdelims;
-
- if (res.can_close) {
- if (last_emphasis_closer === null ||
- last_emphasis_closer < this.pos) {
- last_emphasis_closer = this.pos;
- }
- if (numclosedelims === 3 && delims_to_match === 3) {
- delims_to_match -= 3;
- this.pos += 3;
- current = [{t: 'Strong', c: [{t: 'Emph', c: current}]}];
- } else if (numclosedelims >= 2 && delims_to_match >= 2) {
- delims_to_match -= 2;
- this.pos += 2;
- firstend = current.length;
- firstpos = this.pos;
- current = [{t: 'Strong', c: current}];
- } else if (numclosedelims >= 1 && delims_to_match >= 1) {
- delims_to_match -= 1;
- this.pos += 1;
- firstend = current.length;
- firstpos = this.pos;
- current = [{t: 'Emph', c: current}];
- } else {
- if (!(this.parseInline(current,true))) {
- break;
- }
- }
- if (delims_to_match === 0) {
- Array.prototype.push.apply(inlines, current);
- return true;
+ if (opener.cc === cc) { // we have a match!
+
+ if (opener.numdelims <= numdelims) { // all openers used
+
+ this.pos += opener.numdelims;
+ var X;
+ switch (opener.numdelims) {
+ case 3:
+ X = function(x) { return Strong([Emph(x)]); };
+ break;
+ case 2:
+ X = Strong;
+ break;
+ case 1:
+ default:
+ X = Emph;
+ break;
}
- } else if (!(this.parseInline(current,true))) {
- break;
+ inlines[opener.pos] = X(inlines.slice(opener.pos + 1));
+ inlines.splice(opener.pos + 1, inlines.length - (opener.pos + 1));
+ // Remove entries after this, to prevent overlapping nesting:
+ this.emphasis_openers = opener.previous;
+ return true;
+
+ } else if (opener.numdelims > numdelims) { // only some openers used
+
+ this.pos += numdelims;
+ opener.numdelims -= numdelims;
+ inlines[opener.pos].c =
+ inlines[opener.pos].c.slice(0, opener.numdelims);
+ var X = numdelims === 2 ? Strong : Emph;
+ inlines[opener.pos + 1] = X(inlines.slice(opener.pos + 1));
+ inlines.splice(opener.pos + 2, inlines.length - (opener.pos + 2));
+ // Remove entries after this, to prevent overlapping nesting:
+ this.emphasis_openers = opener;
+ return true;
+
+ }
+
}
+ opener = opener.previous;
+ }
}
- // we didn't match emphasis: fallback
- inlines.push(Str(this.subject.slice(startpos,
- startpos + delims_to_match)));
- if (delims_to_match < numdelims) {
- Array.prototype.push.apply(inlines, current.slice(0,firstend));
- this.pos = firstpos;
- } else { // delims_to_match === numdelims
- this.pos = startpos + delims_to_match;
- }
+ // If we're here, we didn't match a closer.
+
+ this.pos += numdelims;
+ inlines.push(Str(this.subject.slice(startpos, startpos + numdelims)));
+
+ if (res.can_open) {
- if (last_emphasis_closer) {
- this.last_emphasis_closer[c] = last_emphasis_closer;
+ // Add entry to stack for this opener
+ this.emphasis_openers = { cc: cc,
+ numdelims: numdelims,
+ pos: inlines.length - 1,
+ previous: this.emphasis_openers };
}
+
return true;
+
};
// Attempt to parse link title (sans quotes), returning the string
@@ -629,18 +617,11 @@ var parseReference = function(s, refmap) {
};
// Parse the next inline element in subject, advancing subject position.
-// If memoize is set, memoize the result.
// On success, add the result to the inlines list, and return true.
// On failure, return false.
-var parseInline = function(inlines, memoize) {
+var parseInline = function(inlines) {
var startpos = this.pos;
var origlen = inlines.length;
- var memoized = memoize && this.memo[startpos];
- if (memoized) {
- this.pos = memoized.endpos;
- Array.prototype.push.apply(inlines, memoized.inline);
- return true;
- }
var c = this.peek();
if (c === -1) {
@@ -683,10 +664,6 @@ var parseInline = function(inlines, memoize) {
inlines.push({t: 'Str', c: fromCodePoint(c)});
}
- if (memoize) {
- this.memo[startpos] = { inline: inlines.slice(origlen),
- endpos: this.pos };
- }
return true;
};
@@ -695,10 +672,9 @@ var parseInlines = function(s, refmap) {
this.subject = s;
this.pos = 0;
this.refmap = refmap || {};
- this.memo = {};
- this.last_emphasis_closer = { '*': s.length, '_': s.length };
+ this.emphasis_openers = null;
var inlines = [];
- while (this.parseInline(inlines, false)) {
+ while (this.parseInline(inlines)) {
}
return inlines;
};
@@ -708,10 +684,9 @@ function InlineParser(){
return {
subject: '',
label_nest_level: 0, // used by parseLinkLabel method
- last_emphasis_closer: null, // used by parseEmphasis method
+ emphasis_openers: null, // used by parseEmphasis method
pos: 0,
refmap: {},
- memo: {},
match: match,
peek: peek,
spnl: spnl,
diff --git a/leakcheck.md b/leakcheck.md
new file mode 100644
index 0000000..06716e1
--- /dev/null
+++ b/leakcheck.md
@@ -0,0 +1,1561 @@
+→foo→baz→→bim
+
+ a→a
+ ὐ→a
+
+- `one
+- two`
+
+***
+---
+___
+
++++
+
+===
+
+--
+**
+__
+
+ ***
+ ***
+ ***
+
+ ***
+
+Foo
+ ***
+
+_____________________________________
+
+ - - -
+
+ ** * ** * ** * **
+
+- - - -
+
+- - - -
+
+_ _ _ _ a
+
+a------
+
+ *-*
+
+- foo
+***
+- bar
+
+Foo
+***
+bar
+
+Foo
+---
+bar
+
+* Foo
+* * *
+* Bar
+
+- Foo
+- * * *
+
+# foo
+## foo
+### foo
+#### foo
+##### foo
+###### foo
+
+####### foo
+
+#5 bolt
+
+\## foo
+
+# foo *bar* \*baz\*
+
+# foo
+
+ ### foo
+ ## foo
+ # foo
+
+ # foo
+
+foo
+ # bar
+
+## foo ##
+ ### bar ###
+
+# foo ##################################
+##### foo ##
+
+### foo ###
+
+### foo ### b
+
+### foo \###
+## foo \#\##
+# foo \#
+
+****
+## foo
+****
+
+Foo bar
+# baz
+Bar foo
+
+##
+#
+### ###
+
+Foo *bar*
+=========
+
+Foo *bar*
+---------
+
+Foo
+-------------------------
+
+Foo
+=
+
+ Foo
+---
+
+ Foo
+-----
+
+ Foo
+ ===
+
+ Foo
+ ---
+
+ Foo
+---
+
+Foo
+ ----
+
+Foo
+ ---
+
+Foo
+= =
+
+Foo
+--- -
+
+Foo
+-----
+
+Foo\
+----
+
+`Foo
+----
+`
+
+<a title="a lot
+---
+of dashes"/>
+
+> Foo
+---
+
+Foo
+Bar
+---
+
+Foo
+Bar
+===
+
+---
+Foo
+---
+Bar
+---
+Baz
+
+
+====
+
+ a simple
+ indented code block
+
+ <a/>
+ *hi*
+
+ - one
+
+ chunk1
+
+ chunk2
+
+
+
+ chunk3
+
+ chunk1
+
+ chunk2
+
+Foo
+ bar
+
+
+ foo
+bar
+
+# Header
+ foo
+Header
+------
+ foo
+----
+
+ foo
+ bar
+
+
+
+ foo
+
+
+
+ foo
+
+```
+<
+ >
+```
+
+~~~
+<
+ >
+~~~
+
+```
+aaa
+~~~
+```
+
+~~~
+aaa
+```
+~~~
+
+````
+aaa
+```
+``````
+
+~~~~
+aaa
+~~~
+~~~~
+
+```
+
+`````
+
+```
+aaa
+
+```
+
+
+```
+
+```
+```
+
+ ```
+ aaa
+aaa
+```
+
+ ```
+aaa
+ aaa
+aaa
+ ```
+
+ ```
+ aaa
+ aaa
+ aaa
+ ```
+
+ ```
+ aaa
+ ```
+
+``` ```
+aaa
+
+~~~~~~
+aaa
+~~~ ~~
+
+foo
+```
+bar
+```
+baz
+
+foo
+---
+~~~
+bar
+~~~
+# baz
+
+```ruby
+def foo(x)
+ return 3
+end
+```
+
+~~~~ ruby startline=3 $%@#$
+def foo(x)
+ return 3
+end
+~~~~~~~
+
+````;
+````
+
+``` aa ```
+foo
+
+```
+``` aaa
+```
+
+<table>
+ <tr>
+ <td>
+ hi
+ </td>
+ </tr>
+</table>
+
+okay.
+
+ <div>
+ *hello*
+ <foo><a>
+
+<DIV CLASS="foo">
+
+*Markdown*
+
+</DIV>
+
+<div></div>
+``` c
+int x = 33;
+```
+
+<!-- Foo
+bar
+ baz -->
+
+<?php
+ echo 'foo'
+?>
+
+<![CDATA[
+function matchwo(a,b)
+{
+if (a < b && a < 0) then
+ {
+ return 1;
+ }
+else
+ {
+ return 0;
+ }
+}
+]]>
+
+ <!-- foo -->
+
+ <!-- foo -->
+
+Foo
+<div>
+bar
+</div>
+
+<div>
+bar
+</div>
+*foo*
+
+<div class
+foo
+
+<div>
+
+*Emphasized* text.
+
+</div>
+
+<div>
+*Emphasized* text.
+</div>
+
+<table>
+
+<tr>
+
+<td>
+Hi
+</td>
+
+</tr>
+
+</table>
+
+[foo]: /url "title"
+
+[foo]
+
+ [foo]:
+ /url
+ 'the title'
+
+[foo]
+
+[Foo*bar\]]:my_(url) 'title (with parens)'
+
+[Foo*bar\]]
+
+[Foo bar]:
+<my url>
+'title'
+
+[Foo bar]
+
+[foo]:
+/url
+
+[foo]
+
+[foo]:
+
+[foo]
+
+[foo]
+
+[foo]: url
+
+[foo]
+
+[foo]: first
+[foo]: second
+
+[FOO]: /url
+
+[Foo]
+
+[ΑΓΩ]: /φου
+
+[αγω]
+
+[foo]: /url
+
+[foo]: /url "title" ok
+
+ [foo]: /url "title"
+
+[foo]
+
+```
+[foo]: /url
+```
+
+[foo]
+
+Foo
+[bar]: /baz
+
+[bar]
+
+# [Foo]
+[foo]: /url
+> bar
+
+[foo]: /foo-url "foo"
+[bar]: /bar-url
+ "bar"
+[baz]: /baz-url
+
+[foo],
+[bar],
+[baz]
+
+[foo]
+
+> [foo]: /url
+
+aaa
+
+bbb
+
+aaa
+bbb
+
+ccc
+ddd
+
+aaa
+
+
+bbb
+
+ aaa
+ bbb
+
+aaa
+ bbb
+ ccc
+
+ aaa
+bbb
+
+ aaa
+bbb
+
+aaa
+bbb
+
+
+
+aaa
+
+
+# aaa
+
+
+
+> # Foo
+> bar
+> baz
+
+># Foo
+>bar
+> baz
+
+ > # Foo
+ > bar
+ > baz
+
+ > # Foo
+ > bar
+ > baz
+
+> # Foo
+> bar
+baz
+
+> bar
+baz
+> foo
+
+> foo
+---
+
+> - foo
+- bar
+
+> foo
+ bar
+
+> ```
+foo
+```
+
+>
+
+>
+>
+>
+
+>
+> foo
+>
+
+> foo
+
+> bar
+
+> foo
+> bar
+
+> foo
+>
+> bar
+
+foo
+> bar
+
+> aaa
+***
+> bbb
+
+> bar
+baz
+
+> bar
+
+baz
+
+> bar
+>
+baz
+
+> > > foo
+bar
+
+>>> foo
+> bar
+>>baz
+
+> code
+
+> not code
+
+A paragraph
+with two lines.
+
+ indented code
+
+> A block quote.
+
+1. A paragraph
+ with two lines.
+
+ indented code
+
+ > A block quote.
+
+- one
+
+ two
+
+- one
+
+ two
+
+ - one
+
+ two
+
+ - one
+
+ two
+
+ > > 1. one
+>>
+>> two
+
+>>- one
+>>
+ > > two
+
+- foo
+
+ bar
+
+- foo
+
+
+ bar
+
+- ```
+ foo
+
+
+ bar
+ ```
+
+1. foo
+
+ ```
+ bar
+ ```
+
+ baz
+
+ > bam
+
+- foo
+
+ bar
+
+ 10. foo
+
+ bar
+
+ indented code
+
+paragraph
+
+ more code
+
+1. indented code
+
+ paragraph
+
+ more code
+
+1. indented code
+
+ paragraph
+
+ more code
+
+ foo
+
+bar
+
+- foo
+
+ bar
+
+- foo
+
+ bar
+
+ 1. A paragraph
+ with two lines.
+
+ indented code
+
+ > A block quote.
+
+ 1. A paragraph
+ with two lines.
+
+ indented code
+
+ > A block quote.
+
+ 1. A paragraph
+ with two lines.
+
+ indented code
+
+ > A block quote.
+
+ 1. A paragraph
+ with two lines.
+
+ indented code
+
+ > A block quote.
+
+ 1. A paragraph
+with two lines.
+
+ indented code
+
+ > A block quote.
+
+ 1. A paragraph
+ with two lines.
+
+> 1. > Blockquote
+continued here.
+
+> 1. > Blockquote
+> continued here.
+
+- foo
+ - bar
+ - baz
+
+- foo
+ - bar
+ - baz
+
+10) foo
+ - bar
+
+10) foo
+ - bar
+
+- - foo
+
+1. - 2. foo
+
+- foo
+-
+- bar
+
+-
+
+- foo
+- bar
++ baz
+
+1. foo
+2. bar
+3) baz
+
+- foo
+
+- bar
+
+
+- baz
+
+- foo
+
+
+ bar
+- baz
+
+- foo
+ - bar
+ - baz
+
+
+ bim
+
+- foo
+- bar
+
+
+- baz
+- bim
+
+- foo
+
+ notcode
+
+- foo
+
+
+ code
+
+- a
+ - b
+ - c
+ - d
+ - e
+ - f
+- g
+
+- a
+- b
+
+- c
+
+* a
+*
+
+* c
+
+- a
+- b
+
+ c
+- d
+
+- a
+- b
+
+ [ref]: /url
+- d
+
+- a
+- ```
+ b
+
+
+ ```
+- c
+
+- a
+ - b
+
+ c
+- d
+
+* a
+ > b
+ >
+* c
+
+- a
+ > b
+ ```
+ c
+ ```
+- d
+
+- a
+
+- a
+ - b
+
+* foo
+ * bar
+
+ baz
+
+- a
+ - b
+ - c
+
+- d
+ - e
+ - f
+
+`hi`lo`
+
+\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~
+
+\→\A\a\ \3\φ\«
+
+\*not emphasized*
+\<br/> not a tag
+\[not a link](/foo)
+\`not code`
+1\. not a list
+\* not a list
+\# not a header
+\[foo]: /url "not a reference"
+
+\\*emphasis*
+
+foo\
+bar
+
+`` \[\` ``
+
+ \[\]
+
+~~~
+\[\]
+~~~
+
+<http://example.com?find=\*>
+
+<a href="/bar\/)">
+
+[foo](/bar\* "ti\*tle")
+
+[foo]
+
+[foo]: /bar\* "ti\*tle"
+
+``` foo\+bar
+foo
+```
+
+&nbsp; &amp; &copy; &AElig; &Dcaron; &frac34; &HilbertSpace; &DifferentialD; &ClockwiseContourIntegral;
+
+&#35; &#1234; &#992; &#98765432;
+
+&#X22; &#XD06; &#xcab;
+
+&nbsp &x; &#; &#x; &ThisIsWayTooLongToBeAnEntityIsntIt; &hi?;
+
+&copy
+
+&MadeUpEntity;
+
+<a href="&ouml;&ouml;.html">
+
+[foo](/f&ouml;&ouml; "f&ouml;&ouml;")
+
+[foo]
+
+[foo]: /f&ouml;&ouml; "f&ouml;&ouml;"
+
+``` f&ouml;&ouml;
+foo
+```
+
+`f&ouml;&ouml;`
+
+ f&ouml;f&ouml;
+
+`foo`
+
+`` foo ` bar ``
+
+` `` `
+
+``
+foo
+``
+
+`foo bar
+ baz`
+
+`foo `` bar`
+
+`foo\`bar`
+
+*foo`*`
+
+[not a `link](/foo`)
+
+<http://foo.bar.`baz>`
+
+<a href="`">`
+
+```foo``
+
+`foo
+
+*foo bar*
+
+_foo bar_
+
+**foo bar**
+
+__foo bar__
+
+*foo
+bar*
+
+_foo
+bar_
+
+**foo
+bar**
+
+__foo
+bar__
+
+*foo [bar](/url)*
+
+_foo [bar](/url)_
+
+**foo [bar](/url)**
+
+__foo [bar](/url)__
+
+*foo [bar*](/url)
+
+_foo [bar_](/url)
+
+**<a href="**">
+
+__<a href="__">
+
+*a `*`*
+
+_a `_`_
+
+**a<http://foo.bar?q=**>
+
+__a<http://foo.bar?q=__>
+
+and * foo bar*
+
+_ foo bar_
+
+and ** foo bar**
+
+__ foo bar__
+
+and *foo bar *
+
+and _foo bar _
+
+and **foo bar **
+
+and __foo bar __
+
+****hi****
+
+_____hi_____
+
+Sign here: _________
+
+** is not an empty emphasis
+
+**** is not an empty strong emphasis
+
+*here is a \**
+
+__this is a double underscore (`__`)__
+
+*_*
+
+_*_
+
+*__*
+
+_**_
+
+foo*bar*baz
+
+foo_bar_baz
+
+foo__bar__baz
+
+_foo_bar_baz_
+
+11*15*32
+
+11_15_32
+
+_foo_bar_baz_
+
+__foo__bar__baz__
+
+***foo bar***
+
+___foo bar___
+
+***foo** bar*
+
+___foo__ bar_
+
+***foo* bar**
+
+___foo_ bar__
+
+*foo **bar***
+
+_foo __bar___
+
+**foo *bar***
+
+__foo _bar___
+
+*foo **bar***
+
+_foo __bar___
+
+*foo *bar* baz*
+
+_foo _bar_ baz_
+
+**foo **bar** baz**
+
+__foo __bar__ baz__
+
+*foo **bar** baz*
+
+_foo __bar__ baz_
+
+**foo *bar* baz**
+
+__foo _bar_ baz__
+
+**foo, *bar*, baz**
+
+__foo, _bar_, baz__
+
+*foo**bar**baz*
+
+**foo*bar*baz**
+
+**foo**
+
+****foo****
+
+*_foo_*
+
+**__foo__**
+
+*foo**
+
+*foo *bar**
+
+**foo***
+
+***foo* bar***
+
+***foo** bar***
+
+*foo**bar***
+
+*foo****
+
+*foo**
+
+**foo*
+
+*foo *bar**
+
+**foo* bar*
+
+*bar***
+
+***foo*
+
+**bar***
+
+***foo**
+
+***foo *bar*
+
+[link](/uri "title")
+
+[link](/uri)
+
+[link]()
+
+[link](<>)
+
+[link](/my uri)
+
+[link](</my uri>)
+
+[link](foo
+bar)
+
+[link]((foo)and(bar))
+
+[link](foo(and(bar)))
+
+[link](foo(and\(bar\)))
+
+[link](<foo(and(bar))>)
+
+[link](foo\)\:)
+
+[link](foo%20b&auml;)
+
+[link]("title")
+
+[link](/url "title")
+[link](/url 'title')
+[link](/url (title))
+
+[link](/url "title \"&quot;")
+
+[link](/url "title "and" title")
+
+[link](/url 'title "and" title')
+
+[link]( /uri
+ "title" )
+
+[link] (/uri)
+
+[foo <bar attr="](baz)">
+
+[foo][bar]
+
+[bar]: /url "title"
+
+[*foo\!*][bar]
+
+[bar]: /url "title"
+
+[foo][BaR]
+
+[bar]: /url "title"
+
+[Толпой][Толпой] is a Russian word.
+
+[ТОЛПОЙ]: /url
+
+[Foo
+ bar]: /url
+
+[Baz][Foo bar]
+
+[foo] [bar]
+
+[bar]: /url "title"
+
+[foo]
+[bar]
+
+[bar]: /url "title"
+
+[foo]: /url1
+
+[foo]: /url2
+
+[bar][foo]
+
+[bar][foo\!]
+
+[foo!]: /url
+
+[foo][]
+
+[foo]: /url "title"
+
+[*foo* bar][]
+
+[*foo* bar]: /url "title"
+
+[Foo][]
+
+[foo]: /url "title"
+
+[foo]
+[]
+
+[foo]: /url "title"
+
+[foo]
+
+[foo]: /url "title"
+
+[*foo* bar]
+
+[*foo* bar]: /url "title"
+
+[[*foo* bar]]
+
+[*foo* bar]: /url "title"
+
+[Foo]
+
+[foo]: /url "title"
+
+\[foo]
+
+[foo]: /url "title"
+
+[foo*]: /url
+
+*[foo*]
+
+[foo`]: /url
+
+[foo`]`
+
+[[[foo]]]
+
+[[[foo]]]: /url
+
+[[[foo]]]
+
+[[[foo]]]: /url1
+[foo]: /url2
+
+[\[foo]
+
+[\[foo]: /url
+
+[foo][bar]
+
+[foo]: /url1
+[bar]: /url2
+
+[foo][bar][baz]
+
+[baz]: /url
+
+[foo][bar][baz]
+
+[baz]: /url1
+[bar]: /url2
+
+[foo][bar][baz]
+
+[baz]: /url1
+[foo]: /url2
+
+![foo](/url "title")
+
+![foo *bar*]
+
+[foo *bar*]: train.jpg "train & tracks"
+
+![foo *bar*][]
+
+[foo *bar*]: train.jpg "train & tracks"
+
+![foo *bar*][foobar]
+
+[FOOBAR]: train.jpg "train & tracks"
+
+![foo](train.jpg)
+
+My ![foo bar](/path/to/train.jpg "title" )
+
+![foo](<url>)
+
+![](/url)
+
+![foo] [bar]
+
+[bar]: /url
+
+![foo] [bar]
+
+[BAR]: /url
+
+![foo][]
+
+[foo]: /url "title"
+
+![*foo* bar][]
+
+[*foo* bar]: /url "title"
+
+![Foo][]
+
+[foo]: /url "title"
+
+![foo]
+[]
+
+[foo]: /url "title"
+
+![foo]
+
+[foo]: /url "title"
+
+![*foo* bar]
+
+[*foo* bar]: /url "title"
+
+![[foo]]
+
+[[foo]]: /url "title"
+
+![Foo]
+
+[foo]: /url "title"
+
+\!\[foo]
+
+[foo]: /url "title"
+
+\![foo]
+
+[foo]: /url "title"
+
+<http://foo.bar.baz>
+
+<http://foo.bar.baz?q=hello&id=22&boolean>
+
+<irc://foo.bar:2233/baz>
+
+<MAILTO:FOO@BAR.BAZ>
+
+<http://foo.bar/baz bim>
+
+<foo@bar.example.com>
+
+<foo+special@Bar.baz-bar0.com>
+
+<>
+
+<heck://bing.bong>
+
+< http://foo.bar >
+
+<foo.bar.baz>
+
+<localhost:5001/foo>
+
+http://example.com
+
+foo@bar.example.com
+
+<a><bab><c2c>
+
+<a/><b2/>
+
+<a /><b2
+data="foo" >
+
+<a foo="bar" bam = 'baz <em>"</em>'
+_boolean zoop:33=zoop:33 />
+
+<33> <__>
+
+<a h*#ref="hi">
+
+<a href="hi'> <a href=hi'>
+
+< a><
+foo><bar/ >
+
+<a href='bar'title=title>
+
+</a>
+</foo >
+
+</a href="foo">
+
+foo <!-- this is a
+comment - with hyphen -->
+
+foo <!-- not a comment -- two hyphens -->
+
+foo <?php echo $a; ?>
+
+foo <!ELEMENT br EMPTY>
+
+foo <![CDATA[>&<]]>
+
+<a href="&ouml;">
+
+<a href="\*">
+
+<a href="\"">
+
+foo
+baz
+
+foo\
+baz
+
+foo
+baz
+
+foo
+ bar
+
+foo\
+ bar
+
+*foo
+bar*
+
+*foo\
+bar*
+
+`code
+span`
+
+`code\
+span`
+
+<a href="foo
+bar">
+
+<a href="foo\
+bar">
+
+foo
+baz
+
+foo
+ baz
+
+hello $.;'there
+
+Foo χρῆν
+
+Multiple spaces
+
diff --git a/spec.txt b/spec.txt
index 0c09c43..3f10459 100644
--- a/spec.txt
+++ b/spec.txt
@@ -4095,21 +4095,39 @@ for efficient parsing strategies that do not backtrack:
(c) it is not followed by an ASCII alphanumeric character.
9. Emphasis begins with a delimiter that [can open
- emphasis](#can-open-emphasis) and includes inlines parsed
- sequentially until a delimiter that [can close
+ emphasis](#can-open-emphasis) and ends with a delimiter that [can close
emphasis](#can-close-emphasis), and that uses the same
- character (`_` or `*`) as the opening delimiter, is reached.
+ character (`_` or `*`) as the opening delimiter. The inlines
+ between the open delimiter and the closing delimiter are the
+ contents of the emphasis inline.
10. Strong emphasis begins with a delimiter that [can open strong
- emphasis](#can-open-strong-emphasis) and includes inlines parsed
- sequentially until a delimiter that [can close strong
- emphasis](#can-close-strong-emphasis), and that uses the
- same character (`_` or `*`) as the opening delimiter, is reached.
+ emphasis](#can-open-strong-emphasis) and ends with a delimiter that
+ [can close strong emphasis](#can-close-strong-emphasis), and that uses the
+ same character (`_` or `*`) as the opening delimiter. The inlines
+ between the open delimiter and the closing delimiter are the
+ contents of the strong emphasis inline.
-11. In case of ambiguity, strong emphasis takes precedence. Thus,
- `**foo**` is `<strong>foo</strong>`, not `<em><em>foo</em></em>`,
- and `***foo***` is `<strong><em>foo</em></strong>`, not
- `<em><strong>foo</strong></em>` or `<em><em><em>foo</em></em></em>`.
+Where rules 1--10 above are compatible with multiple parsings,
+the following principles resolve ambiguity:
+
+11. An interpretation `<strong>...</strong>` is always preferred to
+ `<em><em>...</em></em>`.
+
+12. An interpretation `<strong><em>...</em></strong>` is always
+ preferred to `<em><strong>...</strong></em>`.
+
+13. When two potential emphasis or strong emphasis spans overlap,
+ the first takes precedence. Thus, for example, `*foo _bar* baz_`
+ is parsed as `<em>foo _bar</em> baz_` rather than
+ `*foo <em>bar* baz</em>`.
+
+14. Inline code spans, links, images, and HTML tags group more tightly
+ than emphasis. So, when there is a choice between an interpretation
+ that contains one of these elements and one that does not, the
+ former always wins. Thus, for example, `*[foo*](bar)` is
+ parsed as `*<a href="bar">foo*</a>` rather than as
+ `<em>[foo</em>](bar)`.
These rules can be illustrated through a series of examples.
@@ -4689,6 +4707,15 @@ We retain symmetry in these cases:
<p><em><em>foo</em> bar</em></p>
.
+Note that this is not a case of strong emphasis,
+since the interior `*` closes regular emphasis:
+
+.
+**foo bar* baz**
+.
+<p><em><em>foo bar</em> baz</em>*</p>
+.
+
More cases with mismatched delimiters:
.
@@ -4721,6 +4748,67 @@ More cases with mismatched delimiters:
<p>***foo <em>bar</em></p>
.
+The following case illustrates rule 13:
+
+.
+*foo _bar* baz_
+.
+<p><em>foo _bar</em> baz_</p>
+.
+
+The following cases illustrate rule 14:
+
+.
+*[foo*](bar)
+.
+<p>*<a href="bar">foo*</a></p>
+.
+
+.
+*![foo*](bar)
+.
+<p>*<img src="bar" alt="foo*" /></p>
+.
+
+.
+*<img src="foo" title="*"/>
+.
+<p>*<img src="foo" title="*"/></p>
+.
+
+.
+*a`a*`
+.
+<p>*a<code>a*</code></p>
+.
+
+Here is a tricky case that can be a performance problem with some
+parsers:
+
+.
+*a **a *a **a *a **a *a **a
+*a **a *a **a *a **a *a **a
+*a **a *a **a *a **a *a **a
+*a **a *a **a *a **a *a **a
+*a **a *a **a *a **a *a **a
+*a **a *a **a *a **a *a **a
+*a **a *a **a *a **a *a **a
+*a **a *a **a *a **a *a **a
+*a **a *a **a *a **a *a **a
+*a **a *a **a *a **a *a **a
+.
+<p>*a **a *a **a *a **a *a **a
+*a **a *a **a *a **a *a **a
+*a **a *a **a *a **a *a **a
+*a **a *a **a *a **a *a **a
+*a **a *a **a *a **a *a **a
+*a **a *a **a *a **a *a **a
+*a **a *a **a *a **a *a **a
+*a **a *a **a *a **a *a **a
+*a **a *a **a *a **a *a **a
+*a **a *a **a *a **a *a **a</p>
+.
+
## Links
A link contains a [link label](#link-label) (the visible text),
diff --git a/src/blocks.c b/src/blocks.c
index 5b38116..c0c7e23 100644
--- a/src/blocks.c
+++ b/src/blocks.c
@@ -47,13 +47,13 @@ bool is_blank(strbuf *s, int offset)
{
while (offset < s->size) {
switch (s->ptr[offset]) {
- case '\n':
- return true;
- case ' ':
- offset++;
- break;
- default:
- return false;
+ case '\n':
+ return true;
+ case ' ':
+ offset++;
+ break;
+ default:
+ return false;
}
}
@@ -63,17 +63,17 @@ bool is_blank(strbuf *s, int offset)
static inline bool can_contain(int parent_type, int child_type)
{
return ( parent_type == BLOCK_DOCUMENT ||
- parent_type == BLOCK_BQUOTE ||
- parent_type == BLOCK_LIST_ITEM ||
- (parent_type == BLOCK_LIST && child_type == BLOCK_LIST_ITEM) );
+ parent_type == BLOCK_BQUOTE ||
+ parent_type == BLOCK_LIST_ITEM ||
+ (parent_type == BLOCK_LIST && child_type == BLOCK_LIST_ITEM) );
}
static inline bool accepts_lines(int block_type)
{
return (block_type == BLOCK_PARAGRAPH ||
- block_type == BLOCK_ATX_HEADER ||
- block_type == BLOCK_INDENTED_CODE ||
- block_type == BLOCK_FENCED_CODE);
+ block_type == BLOCK_ATX_HEADER ||
+ block_type == BLOCK_INDENTED_CODE ||
+ block_type == BLOCK_FENCED_CODE);
}
static void add_line(node_block* node_block, chunk *ch, int offset)
@@ -156,77 +156,77 @@ static void finalize(node_block* b, int line_number)
}
switch (b->tag) {
- case BLOCK_PARAGRAPH:
- pos = 0;
- while (strbuf_at(&b->string_content, 0) == '[' &&
- (pos = parse_reference_inline(&b->string_content, b->top->as.document.refmap))) {
-
- strbuf_drop(&b->string_content, pos);
- }
- if (is_blank(&b->string_content, 0)) {
- b->tag = BLOCK_REFERENCE_DEF;
- }
- break;
+ case BLOCK_PARAGRAPH:
+ pos = 0;
+ while (strbuf_at(&b->string_content, 0) == '[' &&
+ (pos = parse_reference_inline(&b->string_content, b->top->as.document.refmap))) {
- case BLOCK_INDENTED_CODE:
- remove_trailing_blank_lines(&b->string_content);
- strbuf_putc(&b->string_content, '\n');
- break;
-
- case BLOCK_FENCED_CODE:
- // first line of contents becomes info
- firstlinelen = strbuf_strchr(&b->string_content, '\n', 0);
-
- strbuf_init(&b->as.code.info, 0);
- houdini_unescape_html_f(
- &b->as.code.info,
- b->string_content.ptr,
- firstlinelen
+ strbuf_drop(&b->string_content, pos);
+ }
+ if (is_blank(&b->string_content, 0)) {
+ b->tag = BLOCK_REFERENCE_DEF;
+ }
+ break;
+
+ case BLOCK_INDENTED_CODE:
+ remove_trailing_blank_lines(&b->string_content);
+ strbuf_putc(&b->string_content, '\n');
+ break;
+
+ case BLOCK_FENCED_CODE:
+ // first line of contents becomes info
+ firstlinelen = strbuf_strchr(&b->string_content, '\n', 0);
+
+ strbuf_init(&b->as.code.info, 0);
+ houdini_unescape_html_f(
+ &b->as.code.info,
+ b->string_content.ptr,
+ firstlinelen
);
- strbuf_drop(&b->string_content, firstlinelen + 1);
+ strbuf_drop(&b->string_content, firstlinelen + 1);
- strbuf_trim(&b->as.code.info);
- strbuf_unescape(&b->as.code.info);
- break;
+ strbuf_trim(&b->as.code.info);
+ strbuf_unescape(&b->as.code.info);
+ break;
- case BLOCK_LIST: // determine tight/loose status
- b->as.list.tight = true; // tight by default
- item = b->children;
+ case BLOCK_LIST: // determine tight/loose status
+ b->as.list.tight = true; // tight by default
+ item = b->children;
- while (item) {
- // check for non-final non-empty list item ending with blank line:
- if (item->last_line_blank && item->next) {
+ while (item) {
+ // check for non-final non-empty list item ending with blank line:
+ if (item->last_line_blank && item->next) {
+ b->as.list.tight = false;
+ break;
+ }
+ // recurse into children of list item, to see if there are
+ // spaces between them:
+ subitem = item->children;
+ while (subitem) {
+ if (ends_with_blank_line(subitem) &&
+ (item->next || subitem->next)) {
b->as.list.tight = false;
break;
}
- // recurse into children of list item, to see if there are
- // spaces between them:
- subitem = item->children;
- while (subitem) {
- if (ends_with_blank_line(subitem) &&
- (item->next || subitem->next)) {
- b->as.list.tight = false;
- break;
- }
- subitem = subitem->next;
- }
- if (!(b->as.list.tight)) {
- break;
- }
- item = item->next;
+ subitem = subitem->next;
}
+ if (!(b->as.list.tight)) {
+ break;
+ }
+ item = item->next;
+ }
- break;
+ break;
- default:
- break;
+ default:
+ break;
}
}
// Add a node_block as child of another. Return pointer to child.
static node_block* add_child(node_block* parent,
- int block_type, int start_line, int start_column)
+ int block_type, int start_line, int start_column)
{
assert(parent);
@@ -276,14 +276,14 @@ void stmd_free_nodes(node_block *e)
void process_inlines(node_block* cur, reference_map *refmap)
{
switch (cur->tag) {
- case BLOCK_PARAGRAPH:
- case BLOCK_ATX_HEADER:
- case BLOCK_SETEXT_HEADER:
- cur->inline_content = parse_inlines(&cur->string_content, refmap);
- break;
-
- default:
- break;
+ case BLOCK_PARAGRAPH:
+ case BLOCK_ATX_HEADER:
+ case BLOCK_SETEXT_HEADER:
+ cur->inline_content = parse_inlines(&cur->string_content, refmap);
+ break;
+
+ default:
+ break;
}
node_block *child = cur->children;
@@ -355,9 +355,9 @@ static int parse_list_marker(chunk *input, int pos, struct ListData ** dataptr)
static int lists_match(struct ListData *list_data, struct ListData *item_data)
{
return (list_data->list_type == item_data->list_type &&
- list_data->delimiter == item_data->delimiter &&
- // list_data->marker_offset == item_data.marker_offset &&
- list_data->bullet_char == item_data->bullet_char);
+ list_data->delimiter == item_data->delimiter &&
+ // list_data->marker_offset == item_data.marker_offset &&
+ list_data->bullet_char == item_data->bullet_char);
}
static node_block *finalize_document(node_block *document, int linenum)
@@ -486,7 +486,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
} else if (container->tag == BLOCK_LIST_ITEM) {
if (indent >= container->as.list.marker_offset +
- container->as.list.padding) {
+ container->as.list.padding) {
offset += container->as.list.marker_offset +
container->as.list.padding;
} else if (blank) {
@@ -506,7 +506,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
}
} else if (container->tag == BLOCK_ATX_HEADER ||
- container->tag == BLOCK_SETEXT_HEADER) {
+ container->tag == BLOCK_SETEXT_HEADER) {
// a header can never contain more than one line
all_matched = false;
@@ -550,7 +550,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
// unless last matched container is code node_block, try new container starts:
while (container->tag != BLOCK_FENCED_CODE && container->tag != BLOCK_INDENTED_CODE &&
- container->tag != BLOCK_HTML) {
+ container->tag != BLOCK_HTML) {
first_nonspace = offset;
while (peek_at(&input, first_nonspace) == ' ')
@@ -603,17 +603,17 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
// note, we don't adjust offset because the tag is part of the text
} else if (container->tag == BLOCK_PARAGRAPH &&
- (lev = scan_setext_header_line(&input, first_nonspace)) &&
- // check that there is only one line in the paragraph:
- strbuf_strrchr(&container->string_content, '\n',
- strbuf_len(&container->string_content) - 2) < 0) {
+ (lev = scan_setext_header_line(&input, first_nonspace)) &&
+ // check that there is only one line in the paragraph:
+ strbuf_strrchr(&container->string_content, '\n',
+ strbuf_len(&container->string_content) - 2) < 0) {
container->tag = BLOCK_SETEXT_HEADER;
container->as.header.level = lev;
offset = input.len - 1;
} else if (!(container->tag == BLOCK_PARAGRAPH && !all_matched) &&
- (matched = scan_hrule(&input, first_nonspace))) {
+ (matched = scan_hrule(&input, first_nonspace))) {
// it's only now that we know the line is not part of a setext header:
container = add_child(container, BLOCK_HRULE, line_number, first_nonspace + 1);
@@ -646,16 +646,16 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
data->marker_offset = indent;
if (container->tag != BLOCK_LIST ||
- !lists_match(&container->as.list, data)) {
+ !lists_match(&container->as.list, data)) {
container = add_child(container, BLOCK_LIST, line_number,
- first_nonspace + 1);
+ first_nonspace + 1);
memcpy(&container->as.list, data, sizeof(*data));
}
// add the list item
container = add_child(container, BLOCK_LIST_ITEM, line_number,
- first_nonspace + 1);
+ first_nonspace + 1);
/* TODO: static */
memcpy(&container->as.list, data, sizeof(*data));
free(data);
@@ -684,11 +684,11 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
// lists or breaking out of lists. we also don't set last_line_blank
// on an empty list item.
container->last_line_blank = (blank &&
- container->tag != BLOCK_BQUOTE &&
- container->tag != BLOCK_FENCED_CODE &&
- !(container->tag == BLOCK_LIST_ITEM &&
- container->children == NULL &&
- container->start_line == line_number));
+ container->tag != BLOCK_BQUOTE &&
+ container->tag != BLOCK_FENCED_CODE &&
+ !(container->tag == BLOCK_LIST_ITEM &&
+ container->children == NULL &&
+ container->start_line == line_number));
node_block *cont = container;
while (cont->parent) {
@@ -697,10 +697,10 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
}
if (cur != last_matched_container &&
- container == last_matched_container &&
- !blank &&
- cur->tag == BLOCK_PARAGRAPH &&
- strbuf_len(&cur->string_content) > 0) {
+ container == last_matched_container &&
+ !blank &&
+ cur->tag == BLOCK_PARAGRAPH &&
+ strbuf_len(&cur->string_content) > 0) {
add_line(cur, &input, offset);
@@ -721,7 +721,7 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
matched = 0;
if (indent <= 3 &&
- peek_at(&input, first_nonspace) == container->as.code.fence_char) {
+ peek_at(&input, first_nonspace) == container->as.code.fence_char) {
int fence_len = scan_close_code_fence(&input, first_nonspace);
if (fence_len > container->as.code.fence_length)
matched = 1;
@@ -767,4 +767,3 @@ static void incorporate_line(strbuf *line, int line_number, node_block** curptr)
*curptr = container;
}
}
-
diff --git a/src/buffer.c b/src/buffer.c
index 7c2b86b..2e32720 100644
--- a/src/buffer.c
+++ b/src/buffer.c
@@ -15,8 +15,8 @@
unsigned char strbuf__initbuf[1];
unsigned char strbuf__oom[1];
-#define ENSURE_SIZE(b, d) \
- if ((d) > buf->asize && strbuf_grow(b, (d)) < 0)\
+#define ENSURE_SIZE(b, d) \
+ if ((d) > buf->asize && strbuf_grow(b, (d)) < 0) \
return -1;
void strbuf_init(strbuf *buf, int initial_size)
@@ -111,8 +111,8 @@ int strbuf_set(strbuf *buf, const unsigned char *data, int len)
int strbuf_sets(strbuf *buf, const char *string)
{
return strbuf_set(buf,
- (const unsigned char *)string,
- string ? strlen(string) : 0);
+ (const unsigned char *)string,
+ string ? strlen(string) : 0);
}
int strbuf_putc(strbuf *buf, int c)
@@ -155,7 +155,7 @@ int strbuf_vprintf(strbuf *buf, const char *format, va_list ap)
(char *)buf->ptr + buf->size,
buf->asize - buf->size,
format, args
- );
+ );
if (len < 0) {
free(buf->ptr);
@@ -351,4 +351,3 @@ extern void strbuf_unescape(strbuf *buf)
strbuf_truncate(buf, w);
}
-
diff --git a/src/inlines.c b/src/inlines.c
index 71d75e9..07a75f9 100644
--- a/src/inlines.c
+++ b/src/inlines.c
@@ -10,11 +10,19 @@
#include "scanners.h"
#include "inlines.h"
+typedef struct InlineStack {
+ struct InlineStack *previous;
+ node_inl *first_inline;
+ int delim_count;
+ char delim_char;
+} inline_stack;
+
typedef struct Subject {
chunk input;
int pos;
int label_nestlevel;
reference_map *refmap;
+ inline_stack *emphasis_openers;
} subject;
static node_inl *parse_chunk_inlines(chunk *chunk, reference_map *refmap);
@@ -108,26 +116,26 @@ extern void free_inlines(node_inl* e)
node_inl * next;
while (e != NULL) {
switch (e->tag){
- case INL_STRING:
- case INL_RAW_HTML:
- case INL_CODE:
- chunk_free(&e->content.literal);
- break;
- case INL_LINEBREAK:
- case INL_SOFTBREAK:
- break;
- case INL_LINK:
- case INL_IMAGE:
- free(e->content.linkable.url);
- free(e->content.linkable.title);
- free_inlines(e->content.linkable.label);
- break;
- case INL_EMPH:
- case INL_STRONG:
- free_inlines(e->content.inlines);
- break;
- default:
- break;
+ case INL_STRING:
+ case INL_RAW_HTML:
+ case INL_CODE:
+ chunk_free(&e->content.literal);
+ break;
+ case INL_LINEBREAK:
+ case INL_SOFTBREAK:
+ break;
+ case INL_LINK:
+ case INL_IMAGE:
+ free(e->content.linkable.url);
+ free(e->content.linkable.title);
+ free_inlines(e->content.linkable.label);
+ break;
+ case INL_EMPH:
+ case INL_STRONG:
+ free_inlines(e->content.inlines);
+ break;
+ default:
+ break;
}
next = e->next;
free(e);
@@ -158,6 +166,7 @@ static void subject_from_buf(subject *e, strbuf *buffer, reference_map *refmap)
e->pos = 0;
e->label_nestlevel = 0;
e->refmap = refmap;
+ e->emphasis_openers = NULL;
chunk_rtrim(&e->input);
}
@@ -170,6 +179,7 @@ static void subject_from_chunk(subject *e, chunk *chunk, reference_map *refmap)
e->pos = 0;
e->label_nestlevel = 0;
e->refmap = refmap;
+ e->emphasis_openers = NULL;
chunk_rtrim(&e->input);
}
@@ -262,12 +272,11 @@ static node_inl* handle_backticks(subject *subj)
}
// Scan ***, **, or * and return number scanned, or 0.
-// Don't advance position.
+// Advances position.
static int scan_delims(subject* subj, char c, bool * can_open, bool * can_close)
{
int numdelims = 0;
char char_before, char_after;
- int startpos = subj->pos;
char_before = subj->pos == 0 ? '\n' : peek_at(subj, subj->pos - 1);
while (peek_char(subj) == c) {
@@ -281,135 +290,106 @@ static int scan_delims(subject* subj, char c, bool * can_open, bool * can_close)
*can_open = *can_open && !isalnum(char_before);
*can_close = *can_close && !isalnum(char_after);
}
- subj->pos = startpos;
return numdelims;
}
+static void free_openers(subject* subj, inline_stack* istack)
+{
+ inline_stack * tempstack;
+ while (subj->emphasis_openers != istack) {
+ tempstack = subj->emphasis_openers;
+ subj->emphasis_openers = subj->emphasis_openers->previous;
+ free(tempstack);
+ }
+}
+
// Parse strong/emph or a fallback.
// Assumes the subject has '_' or '*' at the current position.
-static node_inl* handle_strong_emph(subject* subj, char c)
+static node_inl* handle_strong_emph(subject* subj, char c, node_inl **last)
{
bool can_open, can_close;
- node_inl * result = NULL;
- node_inl ** last = malloc(sizeof(node_inl *));
- node_inl * new;
- node_inl * il;
- node_inl * first_head = NULL;
- node_inl * first_close = NULL;
- int first_close_delims = 0;
int numdelims;
-
- *last = NULL;
+ int useDelims;
+ inline_stack * istack;
+ node_inl * inl;
+ node_inl * emph;
+ node_inl * inl_text;
numdelims = scan_delims(subj, c, &can_open, &can_close);
- subj->pos += numdelims;
-
- new = make_str(chunk_dup(&subj->input, subj->pos - numdelims, numdelims));
- *last = new;
- first_head = new;
- result = new;
-
- if (!can_open || numdelims == 0) {
- goto done;
- }
-
- switch (numdelims) {
- case 1:
- while (true) {
- numdelims = scan_delims(subj, c, &can_open, &can_close);
- if (numdelims >= 1 && can_close) {
- subj->pos += 1;
- first_head->tag = INL_EMPH;
- chunk_free(&first_head->content.literal);
- first_head->content.inlines = first_head->next;
- first_head->next = NULL;
- goto done;
- } else {
- if (!parse_inline(subj, last)) {
- goto done;
- }
- }
- }
- break;
- case 2:
- while (true) {
- numdelims = scan_delims(subj, c, &can_open, &can_close);
- if (numdelims >= 2 && can_close) {
- subj->pos += 2;
- first_head->tag = INL_STRONG;
- chunk_free(&first_head->content.literal);
- first_head->content.inlines = first_head->next;
- first_head->next = NULL;
- goto done;
- } else {
- if (!parse_inline(subj, last)) {
- goto done;
- }
- }
- }
- break;
- case 3:
- while (true) {
- numdelims = scan_delims(subj, c, &can_open, &can_close);
- if (can_close && numdelims >= 1 && numdelims <= 3 &&
- numdelims != first_close_delims) {
- new = make_str(chunk_dup(&subj->input, subj->pos, numdelims));
- append_inlines(*last, new);
- *last = new;
- if (first_close_delims == 1 && numdelims > 2) {
- numdelims = 2;
- } else if (first_close_delims == 2) {
- numdelims = 1;
- } else if (numdelims == 3) {
- // If we opened with ***, we interpret it as ** followed by *
- // giving us <strong><em>
- numdelims = 1;
- }
- subj->pos += numdelims;
- if (first_close) {
- first_head->tag = first_close_delims == 1 ? INL_STRONG : INL_EMPH;
- chunk_free(&first_head->content.literal);
- first_head->content.inlines =
- make_inlines(first_close_delims == 1 ? INL_EMPH : INL_STRONG,
- first_head->next);
-
- il = first_head->next;
- while (il->next && il->next != first_close) {
- il = il->next;
- }
- il->next = NULL;
-
- first_head->content.inlines->next = first_close->next;
-
- il = first_head->content.inlines;
- while (il->next && il->next != *last) {
- il = il->next;
- }
- il->next = NULL;
- free_inlines(*last);
-
- first_close->next = NULL;
- free_inlines(first_close);
- first_head->next = NULL;
- goto done;
- } else {
- first_close = *last;
- first_close_delims = numdelims;
- }
- } else {
- if (!parse_inline(subj, last)) {
- goto done;
- }
- }
- }
- break;
- default:
- goto done;
+
+ if (can_close)
+ {
+ // walk the stack and find a matching opener, if there is one
+ istack = subj->emphasis_openers;
+ while (true)
+ {
+ if (istack == NULL)
+ goto cannotClose;
+
+ if (istack->delim_char == c)
+ break;
+
+ istack = istack->previous;
+ }
+
+ // calculate the actual number of delimiters used from this closer
+ useDelims = istack->delim_count;
+ if (useDelims == 3) useDelims = numdelims == 3 ? 1 : numdelims;
+ else if (useDelims > numdelims) useDelims = 1;
+
+ if (istack->delim_count == useDelims)
+ {
+ // the opener is completely used up - remove the stack entry and reuse the inline element
+ inl = istack->first_inline;
+ inl->tag = useDelims == 1 ? INL_EMPH : INL_STRONG;
+ chunk_free(&inl->content.literal);
+ inl->content.inlines = inl->next;
+ inl->next = NULL;
+
+ // remove this opener and all later ones from stack:
+ free_openers(subj, istack->previous);
+ *last = inl;
+ }
+ else
+ {
+ // the opener will only partially be used - stack entry remains (truncated) and a new inline is added.
+ inl = istack->first_inline;
+ istack->delim_count -= useDelims;
+ inl->content.literal.len = istack->delim_count;
+
+ emph = useDelims == 1 ? make_emph(inl->next) : make_strong(inl->next);
+ inl->next = emph;
+
+ // remove all later openers from stack:
+ free_openers(subj, istack);
+
+ *last = emph;
+ }
+
+ // if the closer was not fully used, move back a char or two and try again.
+ if (useDelims < numdelims)
+ {
+ subj->pos = subj->pos - numdelims + useDelims;
+ return handle_strong_emph(subj, c, last);
+ }
+
+ return NULL; // make_str(chunk_literal(""));
+ }
+
+cannotClose:
+ inl_text = make_str(chunk_dup(&subj->input, subj->pos - numdelims, numdelims));
+
+ if (can_open)
+ {
+ istack = (inline_stack*)malloc(sizeof(inline_stack));
+ istack->delim_count = numdelims;
+ istack->delim_char = c;
+ istack->first_inline = inl_text;
+ istack->previous = subj->emphasis_openers;
+ subj->emphasis_openers = istack;
}
-done:
- free(last);
- return result;
+ return inl_text;
}
// Parse backslash-escape or just a backslash, returning an inline.
@@ -438,9 +418,9 @@ static node_inl* handle_entity(subject* subj)
advance(subj);
len = houdini_unescape_ent(&ent,
- subj->input.data + subj->pos,
- subj->input.len - subj->pos
- );
+ subj->input.data + subj->pos,
+ subj->input.len - subj->pos
+ );
if (len == 0)
return make_str(chunk_literal("&"));
@@ -513,8 +493,8 @@ unsigned char *clean_title(chunk *title)
// remove surrounding quotes if any:
if ((first == '\'' && last == '\'') ||
- (first == '(' && last == ')') ||
- (first == '"' && last == '"')) {
+ (first == '(' && last == ')') ||
+ (first == '"' && last == '"')) {
houdini_unescape_html_f(&buf, title->data + 1, title->len - 2);
} else {
houdini_unescape_html_f(&buf, title->data, title->len);
@@ -542,7 +522,7 @@ static node_inl* handle_pointy_brace(subject* subj)
return make_autolink(
make_str_with_entities(&contents),
contents, 0
- );
+ );
}
// next try to match an email autolink
@@ -552,9 +532,9 @@ static node_inl* handle_pointy_brace(subject* subj)
subj->pos += matchlen;
return make_autolink(
- make_str_with_entities(&contents),
- contents, 1
- );
+ make_str_with_entities(&contents),
+ contents, 1
+ );
}
// finally, try to match an html tag
@@ -598,30 +578,30 @@ static int link_label(subject* subj, chunk *raw_label)
char c;
while ((c = peek_char(subj)) && (c != ']' || nestlevel > 0)) {
switch (c) {
- case '`':
- tmp = handle_backticks(subj);
- free_inlines(tmp);
- break;
- case '<':
- tmp = handle_pointy_brace(subj);
- free_inlines(tmp);
- break;
- case '[': // nested []
- nestlevel++;
- advance(subj);
- break;
- case ']': // nested []
- nestlevel--;
- advance(subj);
- break;
- case '\\':
- advance(subj);
- if (ispunct(peek_char(subj))) {
- advance(subj);
- }
- break;
- default:
+ case '`':
+ tmp = handle_backticks(subj);
+ free_inlines(tmp);
+ break;
+ case '<':
+ tmp = handle_pointy_brace(subj);
+ free_inlines(tmp);
+ break;
+ case '[': // nested []
+ nestlevel++;
+ advance(subj);
+ break;
+ case ']': // nested []
+ nestlevel--;
+ advance(subj);
+ break;
+ case '\\':
+ advance(subj);
+ if (ispunct(peek_char(subj))) {
advance(subj);
+ }
+ break;
+ default:
+ advance(subj);
}
}
if (c == ']') {
@@ -657,8 +637,8 @@ static node_inl* handle_left_bracket(subject* subj)
if (found_label) {
if (peek_char(subj) == '(' &&
- ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) &&
- ((n = scan_link_url(&subj->input, subj->pos + 1 + sps)) > -1)) {
+ ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) &&
+ ((n = scan_link_url(&subj->input, subj->pos + 1 + sps)) > -1)) {
// try to parse an explicit link:
starturl = subj->pos + 1 + sps; // after (
@@ -684,8 +664,8 @@ static node_inl* handle_left_bracket(subject* subj)
subj->pos = endlabel;
lab = parse_chunk_inlines(&rawlabel, subj->refmap);
result = append_inlines(make_str(chunk_literal("[")),
- append_inlines(lab,
- make_str(chunk_literal("]"))));
+ append_inlines(lab,
+ make_str(chunk_literal("]"))));
return result;
}
} else {
@@ -714,7 +694,7 @@ static node_inl* handle_left_bracket(subject* subj)
subj->pos = endlabel;
lab = parse_chunk_inlines(&rawlabel, subj->refmap);
result = append_inlines(make_str(chunk_literal("[")),
- append_inlines(lab, make_str(chunk_literal("]"))));
+ append_inlines(lab, make_str(chunk_literal("]"))));
}
return result;
}
@@ -736,8 +716,8 @@ static node_inl* handle_newline(subject *subj)
advance(subj);
}
if (nlpos > 1 &&
- peek_at(subj, nlpos - 1) == ' ' &&
- peek_at(subj, nlpos - 2) == ' ') {
+ peek_at(subj, nlpos - 1) == ' ' &&
+ peek_at(subj, nlpos - 2) == ' ') {
return make_linebreak();
} else {
return make_softbreak();
@@ -754,9 +734,22 @@ extern node_inl* parse_inlines_while(subject* subj, int (*f)(subject*))
{
node_inl* result = NULL;
node_inl** last = &result;
+ node_inl* first = NULL;
while ((*f)(subj) && parse_inline(subj, last)) {
+ if (!first) {
+ first = *last;
+ }
+ }
+
+ inline_stack* istack = subj->emphasis_openers;
+ inline_stack* temp;
+ while (istack != NULL) {
+ temp = istack->previous;
+ free(istack);
+ istack = temp;
}
- return result;
+
+ return first;
}
node_inl *parse_chunk_inlines(chunk *chunk, reference_map *refmap)
@@ -812,69 +805,62 @@ static int parse_inline(subject* subj, node_inl ** last)
return 0;
}
switch(c){
- case '\n':
- new = handle_newline(subj);
- break;
- case '`':
- new = handle_backticks(subj);
- break;
- case '\\':
- new = handle_backslash(subj);
- break;
- case '&':
- new = handle_entity(subj);
- break;
- case '<':
- new = handle_pointy_brace(subj);
- break;
- case '_':
- if (subj->pos > 0) {
- unsigned char prev = peek_at(subj, subj->pos - 1);
- if (isalnum(prev) || prev == '_') {
- new = make_str(chunk_literal("_"));
- advance(subj);
- break;
- }
- }
-
- new = handle_strong_emph(subj, '_');
- break;
- case '*':
- new = handle_strong_emph(subj, '*');
- break;
- case '[':
+ case '\n':
+ new = handle_newline(subj);
+ break;
+ case '`':
+ new = handle_backticks(subj);
+ break;
+ case '\\':
+ new = handle_backslash(subj);
+ break;
+ case '&':
+ new = handle_entity(subj);
+ break;
+ case '<':
+ new = handle_pointy_brace(subj);
+ break;
+ case '_':
+ new = handle_strong_emph(subj, '_', last);
+ break;
+ case '*':
+ new = handle_strong_emph(subj, '*', last);
+ break;
+ case '[':
+ new = handle_left_bracket(subj);
+ break;
+ case '!':
+ advance(subj);
+ if (peek_char(subj) == '[') {
new = handle_left_bracket(subj);
- break;
- case '!':
- advance(subj);
- if (peek_char(subj) == '[') {
- new = handle_left_bracket(subj);
- if (new != NULL && new->tag == INL_LINK) {
- new->tag = INL_IMAGE;
- } else {
- new = append_inlines(make_str(chunk_literal("!")), new);
- }
+ if (new != NULL && new->tag == INL_LINK) {
+ new->tag = INL_IMAGE;
} else {
- new = make_str(chunk_literal("!"));
- }
- break;
- default:
- endpos = subject_find_special_char(subj);
- contents = chunk_dup(&subj->input, subj->pos, endpos - subj->pos);
- subj->pos = endpos;
-
- // if we're at a newline, strip trailing spaces.
- if (peek_char(subj) == '\n') {
- chunk_rtrim(&contents);
+ new = append_inlines(make_str(chunk_literal("!")), new);
}
+ } else {
+ new = make_str(chunk_literal("!"));
+ }
+ break;
+ default:
+ endpos = subject_find_special_char(subj);
+ contents = chunk_dup(&subj->input, subj->pos, endpos - subj->pos);
+ subj->pos = endpos;
+
+ // if we're at a newline, strip trailing spaces.
+ if (peek_char(subj) == '\n') {
+ chunk_rtrim(&contents);
+ }
- new = make_str(contents);
+ new = make_str(contents);
}
if (*last == NULL) {
*last = new;
- } else {
+ } else if (new) {
append_inlines(*last, new);
+ *last = new;
}
+
return 1;
}
@@ -890,8 +876,8 @@ void spnl(subject* subj)
{
bool seen_newline = false;
while (peek_char(subj) == ' ' ||
- (!seen_newline &&
- (seen_newline = peek_char(subj) == '\n'))) {
+ (!seen_newline &&
+ (seen_newline = peek_char(subj) == '\n'))) {
advance(subj);
}
}
@@ -958,4 +944,3 @@ int parse_reference_inline(strbuf *input, reference_map *refmap)
reference_create(refmap, &lab, &url, &title);
return subj.pos;
}
-
diff --git a/src/main.c b/src/main.c
index 76a0e12..99d14f8 100644
--- a/src/main.c
+++ b/src/main.c
@@ -38,7 +38,7 @@ int main(int argc, char *argv[])
printf(" - CommonMark converter (c) 2014 John MacFarlane\n");
exit(0);
} else if ((strcmp(argv[i], "--help") == 0) ||
- (strcmp(argv[i], "-h") == 0)) {
+ (strcmp(argv[i], "-h") == 0)) {
print_usage();
exit(0);
} else if (strcmp(argv[i], "--ast") == 0) {
@@ -61,7 +61,7 @@ int main(int argc, char *argv[])
if (fp == NULL) {
fprintf(stderr, "Error opening file %s: %s\n",
- argv[files[i]], strerror(errno));
+ argv[files[i]], strerror(errno));
exit(1);
}
@@ -74,4 +74,3 @@ int main(int argc, char *argv[])
return 0;
}
-
diff --git a/src/print.c b/src/print.c
index 83f8daa..f3bd8e5 100644
--- a/src/print.c
+++ b/src/print.c
@@ -16,17 +16,17 @@ static void print_str(const unsigned char *s, int len)
unsigned char c = s[i];
switch (c) {
- case '\n':
- printf("\\n");
- break;
- case '"':
- printf("\\\"");
- break;
- case '\\':
- printf("\\\\");
- break;
- default:
- putchar((int)c);
+ case '\n':
+ printf("\\n");
+ break;
+ case '"':
+ printf("\\\"");
+ break;
+ case '\\':
+ printf("\\\\");
+ break;
+ default:
+ putchar((int)c);
}
}
putchar('"');
@@ -116,13 +116,13 @@ static void print_blocks(node_block* b, int indent)
data = &(b->as.list);
if (data->list_type == ordered) {
printf("list (type=ordered tight=%s start=%d delim=%s)\n",
- (data->tight ? "true" : "false"),
- data->start,
- (data->delimiter == parens ? "parens" : "period"));
+ (data->tight ? "true" : "false"),
+ data->start,
+ (data->delimiter == parens ? "parens" : "period"));
} else {
printf("list (type=bullet tight=%s bullet_char=%c)\n",
- (data->tight ? "true" : "false"),
- data->bullet_char);
+ (data->tight ? "true" : "false"),
+ data->bullet_char);
}
print_blocks(b->children, indent + 2);
break;
@@ -148,7 +148,7 @@ static void print_blocks(node_block* b, int indent)
break;
case BLOCK_FENCED_CODE:
printf("fenced_code length=%d info=",
- b->as.code.fence_length);
+ b->as.code.fence_length);
print_str(b->as.code.info.ptr, -1);
putchar(' ');
print_str(b->string_content.ptr, -1);
diff --git a/src/references.c b/src/references.c
index 3e54b48..975bf81 100644
--- a/src/references.c
+++ b/src/references.c
@@ -55,7 +55,7 @@ static void add_reference(reference_map *map, reference* ref)
while (t) {
if (t->hash == ref->hash &&
- !strcmp((char *)t->label, (char *)ref->label)) {
+ !strcmp((char *)t->label, (char *)ref->label)) {
reference_free(ref);
return;
}
@@ -105,7 +105,7 @@ reference* reference_lookup(reference_map *map, chunk *label)
while (ref) {
if (ref->hash == hash &&
- !strcmp((char *)ref->label, (char *)norm))
+ !strcmp((char *)ref->label, (char *)norm))
break;
ref = ref->next;
}
@@ -138,4 +138,3 @@ reference_map *reference_map_new(void)
memset(map, 0x0, sizeof(reference_map));
return map;
}
-
diff --git a/src/utf8.c b/src/utf8.c
index 6b34831..8a786b7 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -103,24 +103,24 @@ int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst)
return -1;
switch (length) {
- case 1:
- uc = str[0];
- break;
- case 2:
- uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);
- if (uc < 0x80) uc = -1;
- break;
- case 3:
- uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6)
- + (str[2] & 0x3F);
- if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
- (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
- break;
- case 4:
- uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
- + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F);
- if (uc < 0x10000 || uc >= 0x110000) uc = -1;
- break;
+ case 1:
+ uc = str[0];
+ break;
+ case 2:
+ uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);
+ if (uc < 0x80) uc = -1;
+ break;
+ case 3:
+ uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6)
+ + (str[2] & 0x3F);
+ if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
+ (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
+ break;
+ case 4:
+ uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
+ + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F);
+ if (uc < 0x10000 || uc >= 0x110000) uc = -1;
+ break;
}
if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE))
@@ -173,7 +173,7 @@ void utf8proc_case_fold(strbuf *dest, const uint8_t *str, int len)
{
int32_t c;
-#define bufpush(x) \
+#define bufpush(x) \
utf8proc_encode_char(x, dest)
while (len > 0) {
@@ -190,4 +190,3 @@ void utf8proc_case_fold(strbuf *dest, const uint8_t *str, int len)
len -= char_len;
}
}
-