From ead4f4b77b23874d275863180cf4fd7ebebd38cd Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 16:19:06 -0800 Subject: Improved fuzztest. --- Makefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index fe781d6..afe939e 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,7 @@ JSMODULES=$(wildcard js/lib/*.js) SPEC=spec.txt SITE=_site BUILDDIR=build +FUZZCHARS=2000000 # for fuzztest PROG?=$(BUILDDIR)/src/cmark .PHONY: all spec leakcheck clean fuzztest dingus upload jshint test testjs benchjs update-site upload-site check @@ -71,8 +72,10 @@ operf: $(PROG) operf $(PROG) <$(BENCHINP) >/dev/null fuzztest: - for i in `seq 1 10`; do \ - time cat /dev/urandom | head -c 500000 | iconv -f latin1 -t utf-8 | tee fuzz-$$i.txt | $(PROG) > /dev/null && rm fuzz-$$i.txt ; done + { for i in `seq 1 10`; do \ + cat /dev/urandom | head -c $(FUZZCHARS) | iconv -f latin1 -t utf-8 | tee fuzz-$$i.txt | \ + /usr/bin/env time -p $(PROG) >/dev/null && rm fuzz-$$i.txt ; \ + done } 2>&1 | grep user update-site: spec.html make -C $(SITE) update -- cgit v1.2.3 From 535afc08febf42fb984e7115bf7a947985690e41 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 20:24:32 -0800 Subject: Spec: better system for designating definitions of terms. `[foo](@foo)` turns into an anchor with id `foo` that links to itself and has class `definition`. This allows us to remove the manual `<a>` tags in `spec.txt`. A simple regex substitution in the Makefile handles the transformation. I believe this addresses some of the concerns in #141 by giving the definitions a prominent appearance. Note that we want definitions to link to themselves, to allow users to quickly generate a link to the relevant bit of the spec.
--- Makefile | 5 +- spec.txt | 198 +++++++++++++++++++++++++++++----------------------------- template.html | 1 + 3 files changed, 103 insertions(+), 101 deletions(-) diff --git a/Makefile b/Makefile index afe939e..5f9cb34 100644 --- a/Makefile +++ b/Makefile @@ -36,7 +36,10 @@ spec.md: $(SPEC) perl spec2md.pl < $< > $@ spec.html: spec.md template.html - pandoc --no-highlight --number-sections --template template.html -s --toc -S $< > $@ # | perl -pe 's/␣/ <\/span>/g' > $@ + pandoc --no-highlight --number-sections --template template.html -s --toc -S $< | \ + perl -pe 's/a href="@([^"]*)"/a id="\1" href="#\1" class="definition"/g' > $@ + + # | perl -pe 's/␣/ <\/span>/g' > $@ spec.pdf: spec.md template.tex specfilter.hs pandoc -s $< --template template.tex \ diff --git a/spec.txt b/spec.txt index f6d90c0..ab8d75b 100644 --- a/spec.txt +++ b/spec.txt @@ -191,11 +191,11 @@ In the examples, the `→` character is used to represent tabs. # Preprocessing -A [line](#line) +A [line](@line) is a sequence of zero or more [characters](#character) followed by a line ending (CR, LF, or CRLF) or by the end of file. -A [character](#character) is a unicode code point. +A [character](@character) is a unicode code point. This spec does not specify an encoding; it thinks of lines as composed of characters rather than bytes. A conforming parser may be limited to a certain encoding. @@ -221,15 +221,15 @@ Tabs in lines are expanded to spaces, with a tab stop of 4 characters: Line endings are replaced by newline characters (LF). A line containing no characters, or a line containing only spaces (after -tab expansion), is called a [blank line](#blank-line). - +tab expansion), is called a [blank line](@blank-line). 
# Blocks and inlines -We can think of a document as a sequence of [blocks](#block)---structural elements like paragraphs, block quotations, +We can think of a document as a sequence of +[blocks](@block)---structural +elements like paragraphs, block quotations, lists, headers, rules, and code blocks. Blocks can contain other -blocks, or they can contain [inline](#inline) content: +blocks, or they can contain [inline](@inline) content: words, spaces, links, emphasized text, images, and inline code. ## Precedence @@ -260,9 +260,9 @@ one block element does not affect the inline parsing of any other. ## Container blocks and leaf blocks We can divide blocks into two types: -[container blocks](#container-block), -which can contain other blocks, and [leaf blocks](#leaf-block), - which cannot. +[container blocks](@container-block), +which can contain other blocks, and [leaf blocks](@leaf-block), +which cannot. # Leaf blocks @@ -274,7 +274,7 @@ Markdown document. A line consisting of 0-3 spaces of indentation, followed by a sequence of three or more matching `-`, `_`, or `*` characters, each followed optionally by any number of spaces, forms a [horizontal -rule](#horizontal-rule). +rule](@horizontal-rule). . *** @@ -474,7 +474,7 @@ If you want a horizontal rule in a list item, use a different bullet: ## ATX headers -An [ATX header](#atx-header) +An [ATX header](@atx-header) consists of a string of characters, parsed as inline content, between an opening sequence of 1--6 unescaped `#` characters and an optional closing sequence of any number of `#` characters. The opening sequence @@ -672,14 +672,14 @@ ATX headers can be empty: ## Setext headers -A [setext header](#setext-header) +A [setext header](@setext-header) consists of a line of text, containing at least one nonspace character, with no more than 3 spaces indentation, followed by a [setext header underline](#setext-header-underline). 
The line of text must be one that, were it not followed by the setext header underline, would be interpreted as part of a paragraph: it cannot be a code block, header, blockquote, horizontal rule, or list. A [setext header -underline](#setext-header-underline) +underline](@setext-header-underline) is a sequence of `=` characters or a sequence of `-` characters, with no more than 3 spaces indentation and any number of trailing spaces. The header is a level 1 header if `=` characters are used, and @@ -943,10 +943,10 @@ use backslash escapes: ## Indented code blocks -An [indented code block](#indented-code-block) - is composed of one or more +An [indented code block](@indented-code-block) +is composed of one or more [indented chunks](#indented-chunk) separated by blank lines. -An [indented chunk](#indented-chunk) +An [indented chunk](@indented-chunk) is a sequence of non-blank lines, each indented four or more spaces. An indented code block cannot interrupt a paragraph, so if it occurs before or after a paragraph, there must be an @@ -1096,16 +1096,16 @@ Trailing spaces are included in the code block's content: ## Fenced code blocks -A [code fence](#code-fence) is a sequence +A [code fence](@code-fence) is a sequence of at least three consecutive backtick characters (`` ` ``) or tildes (`~`). (Tildes and backticks cannot be mixed.) -A [fenced code block](#fenced-code-block) +A [fenced code block](@fenced-code-block) begins with a code fence, indented no more than three spaces. The line with the opening code fence may optionally contain some text following the code fence; this is trimmed of leading and trailing -spaces and called the [info string](#info-string). - The info string may not contain any backtick +spaces and called the [info string](@info-string). +The info string may not contain any backtick characters. (The reason for this restriction is that otherwise some inline code would be incorrectly interpreted as the beginning of a fenced code block.) 
@@ -1458,7 +1458,7 @@ Closing code fences cannot have info strings: ## HTML blocks -An [HTML block tag](#html-block-tag) is +An [HTML block tag](@html-block-tag) is an [open tag](#open-tag) or [closing tag](#closing-tag) whose tag name is one of the following (case-insensitive): `article`, `header`, `aside`, `hgroup`, `blockquote`, `hr`, `iframe`, @@ -1469,7 +1469,7 @@ name is one of the following (case-insensitive): `tr`, `form`, `ul`, `h1`, `h2`, `h3`, `h4`, `h5`, `h6`, `video`, `script`, `style`. -An [HTML block](#html-block) begins with an +An [HTML block](@html-block) begins with an [HTML block tag](#html-block-tag), [HTML comment](#html-comment), [processing instruction](#processing-instruction), [declaration](#declaration), or [CDATA section](#cdata-section). @@ -1743,8 +1743,8 @@ So there is no important loss of expressive power with the new rule. ## Link reference definitions -A [link reference definition](#link-reference-definition) - consists of a [link +A [link reference definition](@link-reference-definition) +consists of a [link label](#link-label), indented up to three spaces, followed by a colon (`:`), optional blank space (including up to one newline), a [link destination](#link-destination), optional @@ -1969,7 +1969,7 @@ are defined: ## Paragraphs A sequence of non-blank lines that cannot be interpreted as other -kinds of blocks forms a [paragraph](#paragraph). +kinds of blocks forms a [paragraph](@paragraph). The contents of the paragraph are the result of parsing the paragraph's raw content as inlines. The paragraph's raw content is formed by concatenating the lines and removing initial and final @@ -2113,12 +2113,11 @@ these constructions. 
(A recipe is provided below in the section entitled ## Block quotes -A [block quote marker](#block-quote-marker) +A [block quote marker](@block-quote-marker) consists of 0-3 spaces of initial indent, plus (a) the character `>` together with a following space, or (b) a single character `>` not followed by a space. -The following rules define [block quotes](#block-quote): - +The following rules define [block quotes](@block-quote): 1. **Basic case.** If a string of lines *Ls* constitute a sequence of blocks *Bs*, then the result of prepending a [block quote @@ -2131,8 +2130,8 @@ The following rules define [block quotes](#block-quote): more lines in which the next non-space character after the [block quote marker](#block-quote-marker) is [paragraph continuation text](#paragraph-continuation-text) is a block quote with *Bs* as - its content. - [Paragraph continuation text](#paragraph-continuation-text) is text + its content. + [Paragraph continuation text](@paragraph-continuation-text) is text that will be parsed as part of the content of a paragraph, but does not occur at the beginning of the paragraph. @@ -2475,19 +2474,18 @@ the `>`: ## List items -A [list marker](#list-marker) is a +A [list marker](@list-marker) is a [bullet list marker](#bullet-list-marker) or an [ordered list marker](#ordered-list-marker). -A [bullet list marker](#bullet-list-marker) +A [bullet list marker](@bullet-list-marker) is a `-`, `+`, or `*` character. -An [ordered list marker](#ordered-list-marker) +An [ordered list marker](@ordered-list-marker) is a sequence of one of more digits (`0-9`), followed by either a `.` character or a `)` character. -The following rules define [list items](#list-item): +The following rules define [list items](@list-item): 1. 
**Basic case.** If a sequence of lines *Ls* constitute a sequence of blocks *Bs* starting with a non-space character and not separated @@ -2938,8 +2936,9 @@ Four spaces indent gives a code block: some or all of the indentation from one or more lines in which the next non-space character after the indentation is [paragraph continuation text](#paragraph-continuation-text) is a - list item with the same contents and attributes. + list item with the same contents and attributes. The unindented + lines are called + [lazy continuation lines](@lazy-continuation-line). Here is an example with [lazy continuation lines](#lazy-continuation-line): @@ -3316,33 +3315,33 @@ takes four spaces (a common case), but diverge in other cases. ## Lists -A [list](#list) is a sequence of one or more +A [list](@list) is a sequence of one or more list items [of the same type](#of-the-same-type). The list items may be separated by single [blank lines](#blank-line), but two blank lines end all containing lists. -Two list items are [of the same type](#of-the-same-type) - if they begin with a [list +Two list items are [of the same type](@of-the-same-type) +if they begin with a [list marker](#list-marker) of the same type. Two list markers are of the same type if (a) they are bullet list markers using the same character (`-`, `+`, or `*`) or (b) they are ordered list numbers with the same delimiter (either `.` or `)`). -A list is an [ordered list](#ordered-list) +A list is an [ordered list](@ordered-list) if its constituent list items begin with [ordered list markers](#ordered-list-marker), and a [bullet -list](#bullet-list) if its constituent list +list](@bullet-list) if its constituent list items begin with [bullet list markers](#bullet-list-marker). -The [start number](#start-number) +The [start number](@start-number) of an [ordered list](#ordered-list) is determined by the list number of its initial list item. The numbers of subsequent list items are disregarded. 
-A list is [loose](#loose) if it any of its constituent +A list is [loose](@loose) if it any of its constituent list items are separated by blank lines, or if any of its constituent list items directly contain two block-level elements with a blank line -between them. Otherwise a list is [tight](#tight). +between them. Otherwise a list is [tight](@tight). (The difference in HTML output is that paragraphs in a loose list are wrapped in `

` tags, while paragraphs in a tight list are not.) @@ -3420,8 +3419,8 @@ blank lines: Second, we are attracted to a -> [principle of uniformity](#principle-of-uniformity): id="principle-of-uniformity"> if a span of text has a certain +> [principle of uniformity](@principle-of-uniformity): +> if a span of text has a certain > meaning, it will continue to have the same meaning when put into a list > item. @@ -3947,7 +3946,7 @@ the entities when generating HTML, and simplifies the job of implementations targetting other languages, as these will only need to handle the unicode chars and need not be HTML-entity aware. -[Named entities](#name-entities) consist of `&` +[Named entities](@name-entities) consist of `&` + any of the valid HTML5 entity names + `;`. The [following document](http://www.whatwg.org/specs/web-apps/current-work/multipage/entities.json) is used as an authoritative source of the valid entity names and their @@ -3964,7 +3963,7 @@ always need to be written as entities for security reasons.

  & © Æ Ď ¾ ℋ ⅆ ∲

. -[Decimal entities](#decimal-entities) +[Decimal entities](@decimal-entities) consist of `&#` + a string of 1--8 arabic digits + `;`. Again, these entities need to be recognised and tranformed into their corresponding UTF8 codepoints. Invalid Unicode codepoints will be written as the @@ -3976,7 +3975,7 @@ UTF8 codepoints. Invalid Unicode codepoints will be written as the

# Ӓ Ϡ �

. -[Hexadecimal entities](#hexadecimal-entities) +[Hexadecimal entities](@hexadecimal-entities) consist of `&#` + either `X` or `x` + a string of 1-8 hexadecimal digits + `;`. They will also be parsed and turned into their corresponding UTF8 values in the AST. @@ -4063,7 +4062,7 @@ Entities are treated as literal text in code spans and code blocks: ## Code span -A [backtick string](#backtick-string) +A [backtick string](@backtick-string) is a string of one or more backtick characters (`` ` ``) that is neither preceded nor followed by a backtick. @@ -4247,31 +4246,31 @@ no emphasis: foo_bar_baz The following rules capture all of these patterns, while allowing for efficient parsing strategies that do not backtrack: -1. A single `*` character [can open emphasis](#can-open-emphasis) - iff it is not followed by +1. A single `*` character [can open emphasis](@can-open-emphasis) + iff it is not followed by whitespace. 2. A single `_` character [can open emphasis](#can-open-emphasis) iff it is not followed by whitespace and it is not preceded by an ASCII alphanumeric character. -3. A single `*` character [can close emphasis](#can-close-emphasis) - iff it is not preceded by whitespace. +3. A single `*` character [can close emphasis](@can-close-emphasis) + iff it is not preceded by whitespace. 4. A single `_` character [can close emphasis](#can-close-emphasis) iff it is not preceded by whitespace and it is not followed by an ASCII alphanumeric character. -5. A double `**` [can open strong emphasis](#can-open-strong-emphasis) - iff it is not followed by +5. A double `**` [can open strong emphasis](@can-open-strong-emphasis) + iff it is not followed by whitespace. 6. A double `__` [can open strong emphasis](#can-open-strong-emphasis) iff it is not followed by whitespace and it is not preceded by an ASCII alphanumeric character. -7. A double `**` [can close strong emphasis](#can-close-strong-emphasis) - iff it is not preceded by +7. 
A double `**` [can close strong emphasis](@can-close-strong-emphasis) + iff it is not preceded by whitespace. 8. A double `__` [can close strong emphasis](#can-close-strong-emphasis) @@ -5119,7 +5118,7 @@ and title are given immediately after the label. In [reference links](#reference-links) the destination and title are defined elsewhere in the document. -A [link label](#link-label) consists of +A [link label](@link-label) consists of - an opening `[`, followed by - zero or more backtick code spans, autolinks, HTML tags, link labels, @@ -5134,7 +5133,7 @@ These rules are motivated by the following intuitive ideas: but less tightly than `<>` or `` ` ``. - Link labels may contain material in matching square brackets. -A [link destination](#link-destination) +A [link destination](@link-destination) consists of either - a sequence of zero or more characters between an opening `<` and a @@ -5147,7 +5146,7 @@ consists of either a balanced pair of unescaped parentheses that is not itself inside a balanced pair of unescaped paretheses. -A [link title](#link-title) consists of either +A [link title](@link-title) consists of either - a sequence of zero or more characters between straight double-quote characters (`"`), including a `"` character only if it is @@ -5160,7 +5159,7 @@ A [link title](#link-title) consists of either - a sequence of zero or more characters between matching parentheses (`(...)`), including a `)` character only if it is backslash-escaped. -An [inline link](#inline-link) +An [inline link](@inline-link) consists of a [link label](#link-label) followed immediately by a left parenthesis `(`, optional whitespace, an optional [link destination](#link-destination), @@ -5366,16 +5365,15 @@ an HTML tag: . 
-There are three kinds of [reference links](#reference-link): - +There are three kinds of [reference links](@reference-link): -A [full reference link](#full-reference-link) +A [full reference link](@full-reference-link) consists of a [link label](#link-label), optional whitespace, and another [link label](#link-label) that [matches](#matches) a [link reference definition](#link-reference-definition) elsewhere in the document. -One label [matches](#matches) +One label [matches](@matches) another just in case their normalized forms are equal. To normalize a label, perform the *unicode case fold* and collapse consecutive internal whitespace to a single space. If there are multiple matching reference @@ -5482,8 +5480,8 @@ labels define equivalent inline content:

[bar][foo!]

. -A [collapsed reference link](#collapsed-reference-link) - consists of a [link +A [collapsed reference link](@collapsed-reference-link) +consists of a [link label](#link-label) that [matches](#matches) a [link reference definition](#link-reference-definition) elsewhere in the document, optional whitespace, and the string `[]`. The contents of the @@ -5530,8 +5528,8 @@ between the two sets of brackets:

foo

. -A [shortcut reference link](#shortcut-reference-link) - consists of a [link +A [shortcut reference link](@shortcut-reference-link) +consists of a [link label](#link-label) that [matches](#matches) a [link reference definition](#link-reference-definition) elsewhere in the document and is not followed by `[]` or a link label. @@ -5870,18 +5868,18 @@ Autolinks are absolute URIs and email addresses inside `<` and `>`. They are parsed as links, with the URL or email address as the link label. -A [URI autolink](#uri-autolink) +A [URI autolink](@uri-autolink) consists of `<`, followed by an [absolute URI](#absolute-uri) not containing `<`, followed by `>`. It is parsed as a link to the URI, with the URI as the link's label. -An [absolute URI](#absolute-uri), +An [absolute URI](@absolute-uri), for these purposes, consists of a [scheme](#scheme) followed by a colon (`:`) followed by zero or more characters other than ASCII whitespace and control characters, `<`, and `>`. If the URI includes these characters, you must use percent-encoding (e.g. `%20` for a space). -The following [schemes](#scheme) +The following [schemes](@scheme) are recognized (case-insensitive): `coap`, `doi`, `javascript`, `aaa`, `aaas`, `about`, `acap`, `cap`, `cid`, `crid`, `data`, `dav`, `dict`, `dns`, `file`, `ftp`, `geo`, `go`, @@ -5943,12 +5941,12 @@ Spaces are not allowed in autolinks:

<http://foo.bar/baz bim>

. -An [email autolink](#email-autolink) +An [email autolink](@email-autolink) consists of `<`, followed by an [email address](#email-address), followed by `>`. The link's label is the email address, and the URL is `mailto:` followed by the email address. -An [email address](#email-address), +An [email address](@email-address), for these purposes, is anything that matches the [non-normative regex from the HTML5 spec](http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#e-mail-state-%28type=email%29): @@ -6023,67 +6021,67 @@ so custom tags (and even, say, DocBook tags) may be used. Here is the grammar for tags: -A [tag name](#tag-name) consists of an ASCII letter +A [tag name](@tag-name) consists of an ASCII letter followed by zero or more ASCII letters or digits. -An [attribute](#attribute) consists of whitespace, -an **attribute name**, and an optional **attribute value -specification**. +An [attribute](@attribute) consists of whitespace, +an [attribute name](#attribute-name), and an optional +[attribute value specification](#attribute-value-specification). -An [attribute name](#attribute-name) +An [attribute name](@attribute-name) consists of an ASCII letter, `_`, or `:`, followed by zero or more ASCII letters, digits, `_`, `.`, `:`, or `-`. (Note: This is the XML specification restricted to ASCII. HTML5 is laxer.) -An [attribute value specification](#attribute-value-specification) - consists of optional whitespace, +An [attribute value specification](@attribute-value-specification) +consists of optional whitespace, a `=` character, optional whitespace, and an [attribute value](#attribute-value). -An [attribute value](#attribute-value) +An [attribute value](@attribute-value) consists of an [unquoted attribute value](#unquoted-attribute-value), a [single-quoted attribute value](#single-quoted-attribute-value), or a [double-quoted attribute value](#double-quoted-attribute-value). 
-An [unquoted attribute value](#unquoted-attribute-value) - is a nonempty string of characters not +An [unquoted attribute value](@unquoted-attribute-value) +is a nonempty string of characters not including spaces, `"`, `'`, `=`, `<`, `>`, or `` ` ``. -A [single-quoted attribute value](#single-quoted-attribute-value) - consists of `'`, zero or more +A [single-quoted attribute value](@single-quoted-attribute-value) +consists of `'`, zero or more characters not including `'`, and a final `'`. -A [double-quoted attribute value](#double-quoted-attribute-value) - consists of `"`, zero or more +A [double-quoted attribute value](@double-quoted-attribute-value) +consists of `"`, zero or more characters not including `"`, and a final `"`. -An [open tag](#open-tag) consists of a `<` character, +An [open tag](@open-tag) consists of a `<` character, a [tag name](#tag-name), zero or more [attributes](#attribute), optional whitespace, an optional `/` character, and a `>` character. -A [closing tag](#closing-tag) consists of the +A [closing tag](@closing-tag) consists of the string ``. -An [HTML comment](#html-comment) consists of the +An [HTML comment](@html-comment) consists of the string ``. -A [processing instruction](#processing-instruction) - consists of the string ``, and the string `?>`. -A [declaration](#declaration) consists of the +A [declaration](@declaration) consists of the string ``, and the character `>`. -A [CDATA section](#cdata-section) consists of +A [CDATA section](@cdata-section) consists of the string ``, and the string `]]>`. 
-An [HTML tag](#html-tag) consists of an [open +An [HTML tag](@html-tag) consists of an [open tag](#open-tag), a [closing tag](#closing-tag), an [HTML comment](#html-comment), a [processing instruction](#processing-instruction), an [element type @@ -6252,7 +6250,7 @@ Backslash escapes do not work in HTML attributes: A line break (not in a code span or HTML tag) that is preceded by two or more spaces is parsed as a [hard line -break](#hard-line-break) (rendered +break](@hard-line-break) (rendered in HTML as a `
` tag): . diff --git a/template.html b/template.html index bc5ba26..9ae92f8 100644 --- a/template.html +++ b/template.html @@ -17,6 +17,7 @@ h1 { font-size: 140%; font-weight: bold; border-top: 1px solid gray; padding-top h2 { font-size: 120%; font-weight: bold; } h3 { font-size: 110%; font-weight: bold; } h4 { font-size: 100%; font-weight: bold; } +a.definition { font-weight: bold; } span.space { position: relative; } span.space:after { content: ""; -- cgit v1.2.3 From faacf7065f0303a5b0d5c241e06bcfd8c827651c Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 20:52:36 -0800 Subject: Make spaces visible using CSS. Closes #49. I didn't want to actually insert the characters, since I want the code samples to be cut/pasteable. But this CSS trick seems to work. --- Makefile | 5 +++-- spec2md.pl | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 5f9cb34..1b2d510 100644 --- a/Makefile +++ b/Makefile @@ -37,9 +37,10 @@ spec.md: $(SPEC) spec.html: spec.md template.html pandoc --no-highlight --number-sections --template template.html -s --toc -S $< | \ - perl -pe 's/a href="@([^"]*)"/a id="\1" href="#\1" class="definition"/g' > $@ + perl -pe 's/a href="@([^"]*)"/a id="\1" href="#\1" class="definition"/g' | \ + perl -pe 's/␣/<span class="space"> <\/span>/g' \ + > $@ - # | perl -pe 's/␣/<span class="space"> <\/span>/g' > $@ spec.pdf: spec.md template.tex specfilter.hs pandoc -s $< --template template.tex \ diff --git a/spec2md.pl b/spec2md.pl index f93aad8..313f86f 100644 --- a/spec2md.pl +++ b/spec2md.pl @@ -29,7 +29,7 @@ while (<STDIN>) { $section = $match[0]; } if ($stage != 0) { - # $_ =~ s/ /␣/g; + $_ =~ s/ /␣/g; } print $_; } -- cgit v1.2.3 From ac1c9301f00bdb6cceaf1c49e18f96bab5ca9396 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 21:01:18 -0800 Subject: template.html changes needed to make visible spaces work.
--- template.html | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/template.html b/template.html index 9ae92f8..f9723c6 100644 --- a/template.html +++ b/template.html @@ -20,11 +20,11 @@ h4 { font-size: 100%; font-weight: bold; } a.definition { font-weight: bold; } span.space { position: relative; } span.space:after { - content: ""; + content: "·"; position: absolute; /* create a mark that indicates a space (trick from D. Greenspan) */ - top: 3px; bottom: 3px; left: 1px; right: 1px; - border: 1px solid #999; + top: 0px; bottom: 7px; left: 1px; right: 1px; + color: #AAA; } div.example { overflow: hidden; } p { text-align: justify; } @@ -44,9 +44,6 @@ div.example > pre.markdown { clear:left; } pre.tree { font-weight: bold; color: #777; } pre.markdown { background-color: #D3E1E4; } pre.html { background-color: #C9CaCE; } -pre.html span.space:after { - border: 1px solid #666; -} #watermark { position:fixed; bottom:0px; -- cgit v1.2.3 From 98dfa2d659f844a927d8570c1becbbed2d1834ef Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 07:26:39 -0800 Subject: Simplified link_label parser. It now just scans for an unescaped `]`. No `[` characters are permitted in labels. Backticks don't have their usual meaning in labels. This accords with the behavior of some of the main Markdown parsers: marked, sundown, discount, kramdown, showdown, Markdown.pl, PHP Markdown. 
--- src/inlines.c | 58 ++++++++-------------------------------------------------- 1 file changed, 8 insertions(+), 50 deletions(-) diff --git a/src/inlines.c b/src/inlines.c index 773027e..9197ee0 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -22,7 +22,6 @@ typedef struct OpenerStack { typedef struct Subject { chunk input; int pos; - int label_nestlevel; reference_map *refmap; opener_stack *openers; } subject; @@ -195,7 +194,6 @@ static void subject_from_buf(subject *e, strbuf *buffer, reference_map *refmap) e->input.len = buffer->size; e->input.alloc = 0; e->pos = 0; - e->label_nestlevel = 0; e->refmap = refmap; e->openers = NULL; @@ -208,7 +206,6 @@ static void subject_from_chunk(subject *e, chunk *chunk, reference_map *refmap) e->input.len = chunk->len; e->input.alloc = 0; e->pos = 0; - e->label_nestlevel = 0; e->refmap = refmap; e->openers = NULL; @@ -601,69 +598,30 @@ static node_inl* handle_pointy_brace(subject* subj) } // Parse a link label. Returns 1 if successful. -// Unless raw_label is null, it is set to point to the raw contents of the []. -// Assumes the subject has a '[' character at the current position. -// Returns 0 and does not advance if no matching ] is found. -// Note the precedence: code backticks have precedence over label bracket -// markers, which have precedence over *, _, and other inline formatting -// markers. So, 2 below contains a link while 1 does not: -// 1. [a link `with a ](/url)` character -// 2. [a link *with emphasized ](/url) text* +// Note: unescaped brackets are not allowed in labels. +// The label begins with `[` and ends with the first `]` character +// encountered. Backticks in labels do not start code spans. 
static int link_label(subject* subj, chunk *raw_label) { - int nestlevel = 0; - node_inl* tmp = NULL; int startpos = subj->pos; - if (subj->label_nestlevel) { - // if we've already checked to the end of the subject - // for a label, even with a different starting [, we - // know we won't find one here and we can just return. - // Note: nestlevel 1 would be: [foo [bar] - // nestlevel 2 would be: [foo [bar [baz] - subj->label_nestlevel--; - return 0; - } - advance(subj); // advance past [ unsigned char c; - while ((c = peek_char(subj)) && (c != ']' || nestlevel > 0)) { - switch (c) { - case '`': - tmp = handle_backticks(subj); - free_inlines(tmp); - break; - case '<': - tmp = handle_pointy_brace(subj); - free_inlines(tmp); - break; - case '[': // nested [] - nestlevel++; - advance(subj); - break; - case ']': // nested [] - nestlevel--; - advance(subj); - break; - case '\\': + while ((c = peek_char(subj)) && c != '[' && c != ']') { + if (c == '\\') { advance(subj); if (ispunct(peek_char(subj))) { advance(subj); } - break; - default: - advance(subj); } + advance(subj); } - if (nestlevel == 0 && c == ']') { + + if (c == ']') { // match found *raw_label = chunk_dup(&subj->input, startpos + 1, subj->pos - (startpos + 1)); - subj->label_nestlevel = 0; advance(subj); // advance past ] return 1; } else { - if (c == 0) { - subj->label_nestlevel = nestlevel; - } subj->pos = startpos; // rewind return 0; } -- cgit v1.2.3 From c84db152b53edaa6373bcb89a96b5b30830f8185 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 08:15:49 -0800 Subject: Initial steps towards link parsing. 
--- src/inlines.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 114 insertions(+), 17 deletions(-) diff --git a/src/inlines.c b/src/inlines.c index 9197ee0..bf76e1a 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -627,25 +627,120 @@ static int link_label(subject* subj, chunk *raw_label) } } -// Parse a link or the link portion of an image, or return a fallback. -static node_inl* handle_left_bracket(subject* subj) +// Return a link, an image, or a literal close bracket. +static node_inl* handle_close_bracket(subject* subj) { + int initial_pos; + int starturl, endurl, starttitle, endtitle, endall; + int n; + int sps; + chunk url, title; + opener_stack *ostack = subj->openers; + node_inl *link_text = NULL; + node_inl *tmp = NULL; + + advance(subj); // advance past ] + initial_pos = subj->pos; + + // look through stack of openers for a [ or ! + while (ostack) { + if (ostack->delim_char == '[' || ostack->delim_char == '!') { + break; + } + ostack = ostack->previous; + } + + if (ostack == NULL) { + return make_str(chunk_literal("]")); + } + + // If we got here, we matched a potential link/image text. + link_text = ostack->first_inline->next; + + // Now we check to see if it's a link/image. + + + if (peek_char(subj) == '(' && + ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) && + ((n = scan_link_url(&subj->input, subj->pos + 1 + sps)) > -1)) { + + // try to parse an explicit link: + starturl = subj->pos + 1 + sps; // after ( + endurl = starturl + n; + starttitle = endurl + scan_spacechars(&subj->input, endurl); + + // ensure there are spaces btw url and title + endtitle = (starttitle == endurl) ? 
starttitle : + starttitle + scan_link_title(&subj->input, starttitle); + + endall = endtitle + scan_spacechars(&subj->input, endtitle); + + if (peek_at(subj, endall) == ')') { + subj->pos = endall + 1; + + url = chunk_dup(&subj->input, starturl, endurl - starturl); + title = chunk_dup(&subj->input, starttitle, endtitle - starttitle); + + tmp = link_text->next; + ostack->first_inline->content.literal = chunk_literal("X"); // TODO a kludge + ostack->first_inline->next = make_link(link_text, url, title); + return make_str(chunk_literal("X")); + } else { + goto noMatch; + } + } else { + goto noMatch; // for now + } + + // if found, check to see if we have a target: + // - followed by (inline link) + // - followed by [link label] that matches + // - followed by [], and our brackets have a label that matches + // - our brackets have a label that matches + + // if no target, remove the matching opener from the stack and return literal ]. + // if yes target, remove the matching opener and any later openers. + // return a link or an image. + + /* + chunk rawlabel_tmp; + chunk reflabel; + + // Check for reference link. 
+ // First, see if there's another label: + subj->pos = subj->pos + scan_spacechars(&subj->input, endlabel); + reflabel = rawlabel; + + // if followed by a nonempty link label, we change reflabel to it: + if (peek_char(subj) == '[' && link_label(subj, &rawlabel_tmp)) { + if (rawlabel_tmp.len > 0) + reflabel = rawlabel_tmp; + } else { + subj->pos = endlabel; + } + + // lookup rawlabel in subject->reference_map: + ref = reference_lookup(subj->refmap, &reflabel); + if (ref != NULL) { // found + lab = parse_chunk_inlines(&rawlabel, NULL); + result = make_ref_link(lab, ref); + } else { + goto noMatch; + } + return result; + node_inl *lab = NULL; node_inl *result = NULL; reference *ref; - int n; - int sps; int found_label; - int endlabel, startpos, starturl, endurl, starttitle, endtitle, endall; chunk rawlabel; - chunk url, title; - startpos = subj->pos; found_label = link_label(subj, &rawlabel); endlabel = subj->pos; - if (found_label) { + if (found_label) + { if (peek_char(subj) == '(' && ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) && ((n = scan_link_url(&subj->input, subj->pos + 1 + sps)) > -1)) { @@ -700,10 +795,11 @@ static node_inl* handle_left_bracket(subject* subj) return result; } } + */ noMatch: // If we fall through to here, it means we didn't match a link: - subj->pos = startpos + 1; // advance past [ - return make_str(chunk_literal("[")); + subj->pos = initial_pos; + return make_str(chunk_literal("]")); } // Parse a hard or soft linebreak, returning an inline. 
@@ -824,17 +920,18 @@ static int parse_inline(subject* subj, node_inl ** last) new = handle_strong_emph(subj, '*', last); break; case '[': - new = handle_left_bracket(subj); + advance(subj); + new = make_str(chunk_literal("[")); + subj->openers = push_opener(subj, 1, '[', new); + break; + case ']': + new = handle_close_bracket(subj); break; case '!': advance(subj); if (peek_char(subj) == '[') { - new = handle_left_bracket(subj); - if (new != NULL && new->tag == INL_LINK) { - new->tag = INL_IMAGE; - } else { - new = append_inlines(make_str(chunk_literal("!")), new); - } + new = make_str(chunk_literal("![")); + subj->openers = push_opener(subj, 1, '!', new); } else { new = make_str(chunk_literal("!")); } -- cgit v1.2.3 From 2750fd33414ebc396ee67dad730b93b1a7b64264 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 10:41:22 -0800 Subject: Got inline links working. --- src/inlines.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/src/inlines.c b/src/inlines.c index bf76e1a..1a7b7a7 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -72,12 +72,6 @@ inline static node_inl* make_autolink(node_inl* label, chunk url, int is_email) return make_link_(label, clean_autolink(&url, is_email), NULL); } -// Create an inline with a linkable string value. -inline static node_inl* make_link(node_inl* label, chunk url, chunk title) -{ - return make_link_(label, clean_url(&url), clean_title(&title)); -} - inline static node_inl* make_inlines(int t, node_inl* contents) { node_inl * e = calloc(1, sizeof(*e)); @@ -628,16 +622,18 @@ static int link_label(subject* subj, chunk *raw_label) } // Return a link, an image, or a literal close bracket. 
-static node_inl* handle_close_bracket(subject* subj) +static node_inl* handle_close_bracket(subject* subj, node_inl **last) { int initial_pos; int starturl, endurl, starttitle, endtitle, endall; int n; int sps; + bool is_image = false; chunk url, title; opener_stack *ostack = subj->openers; - node_inl *link_text = NULL; - node_inl *tmp = NULL; + node_inl *link_text; + node_inl *tmp; + node_inl *inl; advance(subj); // advance past ] initial_pos = subj->pos; @@ -655,6 +651,7 @@ static node_inl* handle_close_bracket(subject* subj) } // If we got here, we matched a potential link/image text. + is_image = ostack->delim_char == '!'; link_text = ostack->first_inline->next; // Now we check to see if it's a link/image. @@ -682,9 +679,20 @@ static node_inl* handle_close_bracket(subject* subj) title = chunk_dup(&subj->input, starttitle, endtitle - starttitle); tmp = link_text->next; - ostack->first_inline->content.literal = chunk_literal("X"); // TODO a kludge - ostack->first_inline->next = make_link(link_text, url, title); - return make_str(chunk_literal("X")); + inl = ostack->first_inline; + inl->tag = is_image ? INL_IMAGE : INL_LINK; + chunk_free(&inl->content.literal); + inl->content.linkable.label = link_text; + inl->content.linkable.url = clean_url(&url); + inl->content.linkable.title = clean_title(&title); + chunk_free(&url); + chunk_free(&title); + inl->next = NULL; + + // remove this opener and all later ones from stack: + free_openers(subj, ostack->previous); + *last = inl; + return NULL; } else { goto noMatch; } @@ -925,7 +933,7 @@ static int parse_inline(subject* subj, node_inl ** last) subj->openers = push_opener(subj, 1, '[', new); break; case ']': - new = handle_close_bracket(subj); + new = handle_close_bracket(subj, last); break; case '!': advance(subj); -- cgit v1.2.3 From ea81ce0001cb842586af381f98f43e10caa8a8dc Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 11:35:09 -0800 Subject: Got ref links working, but with deallocation issues. 
--- src/inlines.c | 170 ++++++++++++++++------------------------------------------ 1 file changed, 47 insertions(+), 123 deletions(-) diff --git a/src/inlines.c b/src/inlines.c index 1a7b7a7..3e3ef0a 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -628,12 +628,15 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) int starturl, endurl, starttitle, endtitle, endall; int n; int sps; + reference *ref; bool is_image = false; - chunk url, title; + chunk urlchunk, titlechunk; + unsigned char *url, *title; opener_stack *ostack = subj->openers; node_inl *link_text; node_inl *tmp; node_inl *inl; + chunk raw_label; advance(subj); // advance past ] initial_pos = subj->pos; @@ -656,7 +659,7 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) // Now we check to see if it's a link/image. - + // First, look for an inline link. if (peek_char(subj) == '(' && ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) && ((n = scan_link_url(&subj->input, subj->pos + 1 + sps)) > -1)) { @@ -675,139 +678,60 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) if (peek_at(subj, endall) == ')') { subj->pos = endall + 1; - url = chunk_dup(&subj->input, starturl, endurl - starturl); - title = chunk_dup(&subj->input, starttitle, endtitle - starttitle); - - tmp = link_text->next; - inl = ostack->first_inline; - inl->tag = is_image ? 
INL_IMAGE : INL_LINK; - chunk_free(&inl->content.literal); - inl->content.linkable.label = link_text; - inl->content.linkable.url = clean_url(&url); - inl->content.linkable.title = clean_title(&title); - chunk_free(&url); - chunk_free(&title); - inl->next = NULL; - - // remove this opener and all later ones from stack: - free_openers(subj, ostack->previous); - *last = inl; - return NULL; - } else { - goto noMatch; - } - } else { - goto noMatch; // for now - } + urlchunk = chunk_dup(&subj->input, starturl, endurl - starturl); + titlechunk = chunk_dup(&subj->input, starttitle, endtitle - starttitle); + url = clean_url(&urlchunk); + title = clean_title(&titlechunk); + chunk_free(&urlchunk); + chunk_free(&titlechunk); + goto match; - // if found, check to see if we have a target: - // - followed by (inline link) - // - followed by [link label] that matches - // - followed by [], and our brackets have a label that matches - // - our brackets have a label that matches - - // if no target, remove the matching opener from the stack and return literal ]. - // if yes target, remove the matching opener and any later openers. - // return a link or an image. - - /* - chunk rawlabel_tmp; - chunk reflabel; - - // Check for reference link. 
- // First, see if there's another label: - subj->pos = subj->pos + scan_spacechars(&subj->input, endlabel); - reflabel = rawlabel; - - // if followed by a nonempty link label, we change reflabel to it: - if (peek_char(subj) == '[' && link_label(subj, &rawlabel_tmp)) { - if (rawlabel_tmp.len > 0) - reflabel = rawlabel_tmp; - } else { - subj->pos = endlabel; - } - - // lookup rawlabel in subject->reference_map: - ref = reference_lookup(subj->refmap, &reflabel); - if (ref != NULL) { // found - lab = parse_chunk_inlines(&rawlabel, NULL); - result = make_ref_link(lab, ref); } else { goto noMatch; } - return result; - - node_inl *lab = NULL; - node_inl *result = NULL; - reference *ref; - int found_label; - - chunk rawlabel; - - found_label = link_label(subj, &rawlabel); - endlabel = subj->pos; - - if (found_label) - { - if (peek_char(subj) == '(' && - ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) && - ((n = scan_link_url(&subj->input, subj->pos + 1 + sps)) > -1)) { - - // try to parse an explicit link: - starturl = subj->pos + 1 + sps; // after ( - endurl = starturl + n; - starttitle = endurl + scan_spacechars(&subj->input, endurl); - - // ensure there are spaces btw url and title - endtitle = (starttitle == endurl) ? starttitle : - starttitle + scan_link_title(&subj->input, starttitle); - - endall = endtitle + scan_spacechars(&subj->input, endtitle); - - if (peek_at(subj, endall) == ')') { - subj->pos = endall + 1; + } - url = chunk_dup(&subj->input, starturl, endurl - starturl); - title = chunk_dup(&subj->input, starttitle, endtitle - starttitle); - lab = parse_chunk_inlines(&rawlabel, NULL); + // Next, look for a following [link label] that matches in refmap. 
+ // skip spaces + subj->pos = subj->pos + scan_spacechars(&subj->input, subj->pos); + raw_label = chunk_literal(""); + if (!link_label(subj, &raw_label) || raw_label.len == 0) { + chunk_free(&raw_label); + raw_label = chunk_dup(&subj->input, ostack->position, initial_pos - ostack->position - 1); + } - return make_link(lab, url, title); - } else { - goto noMatch; - } - } else { - chunk rawlabel_tmp; - chunk reflabel; - - // Check for reference link. - // First, see if there's another label: - subj->pos = subj->pos + scan_spacechars(&subj->input, endlabel); - reflabel = rawlabel; - - // if followed by a nonempty link label, we change reflabel to it: - if (peek_char(subj) == '[' && link_label(subj, &rawlabel_tmp)) { - if (rawlabel_tmp.len > 0) - reflabel = rawlabel_tmp; - } else { - subj->pos = endlabel; - } + log_info("looking up '%s'", chunk_to_cstr(&raw_label)); + ref = reference_lookup(subj->refmap, &raw_label); + chunk_free(&raw_label); - // lookup rawlabel in subject->reference_map: - ref = reference_lookup(subj->refmap, &reflabel); - if (ref != NULL) { // found - lab = parse_chunk_inlines(&rawlabel, NULL); - result = make_ref_link(lab, ref); - } else { - goto noMatch; - } - return result; - } + if (ref != NULL) { // found + log_info("ref found url{%s} title{%s}", ref->url, ref->title); + url = ref->url; + title = ref->title; + goto match; + } else { + goto noMatch; } - */ + noMatch: // If we fall through to here, it means we didn't match a link: subj->pos = initial_pos; return make_str(chunk_literal("]")); + +match: + tmp = link_text->next; + inl = ostack->first_inline; + inl->tag = is_image ? 
INL_IMAGE : INL_LINK; + chunk_free(&inl->content.literal); + inl->content.linkable.label = link_text; + inl->content.linkable.url = url; + inl->content.linkable.title = title; + inl->next = NULL; + + // remove this opener and all later ones from stack: + free_openers(subj, ostack->previous); + *last = inl; + return NULL; } // Parse a hard or soft linebreak, returning an inline. -- cgit v1.2.3 From 18207addd5d922a9ca1ec6e83a895108f13e3c25 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 12:05:19 -0800 Subject: Fixed allocation issue. --- src/html/html.c | 4 ++-- src/inlines.c | 12 ++++-------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/html/html.c b/src/html/html.c index 5f08506..ea83992 100644 --- a/src/html/html.c +++ b/src/html/html.c @@ -125,7 +125,7 @@ static void inlines_to_plain_html(strbuf *html, node_inl* ils) case INL_LINK: case INL_IMAGE: - children = ils->content.inlines; + children = ils->content.linkable.label; visit_children = true; rstack = push_inline(rstack, ils->next, ""); break; @@ -197,7 +197,7 @@ static void inlines_to_html(strbuf *html, node_inl* ils) } strbuf_puts(html, "\">"); - children = ils->content.inlines; + children = ils->content.linkable.label; rstack = push_inline(rstack, ils->next, ""); break; diff --git a/src/inlines.c b/src/inlines.c index 3e3ef0a..937c33f 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -634,7 +634,6 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) unsigned char *url, *title; opener_stack *ostack = subj->openers; node_inl *link_text; - node_inl *tmp; node_inl *inl; chunk raw_label; @@ -696,18 +695,16 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) subj->pos = subj->pos + scan_spacechars(&subj->input, subj->pos); raw_label = chunk_literal(""); if (!link_label(subj, &raw_label) || raw_label.len == 0) { - chunk_free(&raw_label); + // chunk_free(&raw_label); raw_label = chunk_dup(&subj->input, ostack->position, initial_pos 
- ostack->position - 1); } - log_info("looking up '%s'", chunk_to_cstr(&raw_label)); ref = reference_lookup(subj->refmap, &raw_label); chunk_free(&raw_label); if (ref != NULL) { // found - log_info("ref found url{%s} title{%s}", ref->url, ref->title); - url = ref->url; - title = ref->title; + url = bufdup(ref->url); + title = bufdup(ref->title); goto match; } else { goto noMatch; @@ -719,7 +716,6 @@ noMatch: return make_str(chunk_literal("]")); match: - tmp = link_text->next; inl = ostack->first_inline; inl->tag = is_image ? INL_IMAGE : INL_LINK; chunk_free(&inl->content.literal); @@ -727,10 +723,10 @@ match: inl->content.linkable.url = url; inl->content.linkable.title = title; inl->next = NULL; + *last = inl; // remove this opener and all later ones from stack: free_openers(subj, ostack->previous); - *last = inl; return NULL; } -- cgit v1.2.3 From c6f95684f90a4d1efd2185984b1aa2931591efb4 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 12:08:36 -0800 Subject: Fixed problem with images. --- src/inlines.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/inlines.c b/src/inlines.c index 937c33f..a3d848d 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -858,6 +858,7 @@ static int parse_inline(subject* subj, node_inl ** last) case '!': advance(subj); if (peek_char(subj) == '[') { + advance(subj); new = make_str(chunk_literal("![")); subj->openers = push_opener(subj, 1, '!', new); } else { -- cgit v1.2.3 From 7c44ac85bfa68e756d9a32635b114444512b683d Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 12:18:04 -0800 Subject: Fixed backslash-escape inside link label. Down to 8 failures, all cases where the spec will need to be changed to reflect lack of priority of links over emphasis. 
--- src/inlines.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/inlines.c b/src/inlines.c index a3d848d..4628e32 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -607,8 +607,9 @@ static int link_label(subject* subj, chunk *raw_label) if (ispunct(peek_char(subj))) { advance(subj); } + } else { + advance(subj); } - advance(subj); } if (c == ']') { // match found @@ -699,7 +700,12 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) raw_label = chunk_dup(&subj->input, ostack->position, initial_pos - ostack->position - 1); } - ref = reference_lookup(subj->refmap, &raw_label); + // TODO - document this hard length limit in READE; also impose for creation of refs + if (raw_label.len < 1000) { + ref = reference_lookup(subj->refmap, &raw_label); + } else { + ref = NULL; + } chunk_free(&raw_label); if (ref != NULL) { // found -- cgit v1.2.3 From db596350ac569436d568790410facef14d47670f Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 13:41:28 -0800 Subject: Disallow links inside links and images inside images. 
--- src/inlines.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/inlines.c b/src/inlines.c index 4628e32..069544b 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -634,6 +634,8 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) chunk urlchunk, titlechunk; unsigned char *url, *title; opener_stack *ostack = subj->openers; + opener_stack *closer_above; + opener_stack *tempstack; node_inl *link_text; node_inl *inl; chunk raw_label; @@ -700,7 +702,7 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) raw_label = chunk_dup(&subj->input, ostack->position, initial_pos - ostack->position - 1); } - // TODO - document this hard length limit in READE; also impose for creation of refs + // TODO - document this hard length limit in spec; also impose for creation of refs if (raw_label.len < 1000) { ref = reference_lookup(subj->refmap, &raw_label); } else { @@ -731,8 +733,30 @@ match: inl->next = NULL; *last = inl; - // remove this opener and all later ones from stack: + // remove this opener and all later ones: free_openers(subj, ostack->previous); + + // remove earlier ones of the same kind + // (so, no links in links, and no images in images): + // (This code can be removed if we decide to allow links + // inside links and images inside images): + ostack = subj->openers; + closer_above = NULL; + while (ostack != NULL) { + tempstack = ostack->previous; + if (ostack->delim_char == (is_image ? '!' : '[')) { + free(ostack); + if (closer_above) { + closer_above->previous = tempstack; + } else { + subj->openers = tempstack; + } + } else { + closer_above = ostack; + } + ostack = tempstack; + } + return NULL; } -- cgit v1.2.3 From d352e22ff937548fb02f79043f47d2143050c63e Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 13:43:56 -0800 Subject: Removed some unused code. 
--- src/inlines.c | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/src/inlines.c b/src/inlines.c index 069544b..7a7ca02 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -26,11 +26,9 @@ typedef struct Subject { opener_stack *openers; } subject; -static node_inl *parse_chunk_inlines(chunk *chunk, reference_map *refmap); static node_inl *parse_inlines_from_subject(subject* subj); static int parse_inline(subject* subj, node_inl ** last); -static void subject_from_chunk(subject *e, chunk *chunk, reference_map *refmap); static void subject_from_buf(subject *e, strbuf *buffer, reference_map *refmap); static int subject_find_special_char(subject *subj); @@ -62,11 +60,6 @@ static inline node_inl *make_link_(node_inl *label, unsigned char *url, unsigned return e; } -inline static node_inl* make_ref_link(node_inl* label, reference *ref) -{ - return make_link_(label, bufdup(ref->url), bufdup(ref->title)); -} - inline static node_inl* make_autolink(node_inl* label, chunk url, int is_email) { return make_link_(label, clean_autolink(&url, is_email), NULL); @@ -194,18 +187,6 @@ static void subject_from_buf(subject *e, strbuf *buffer, reference_map *refmap) chunk_rtrim(&e->input); } -static void subject_from_chunk(subject *e, chunk *chunk, reference_map *refmap) -{ - e->input.data = chunk->data; - e->input.len = chunk->len; - e->input.alloc = 0; - e->pos = 0; - e->refmap = refmap; - e->openers = NULL; - - chunk_rtrim(&e->input); -} - inline static int isbacktick(int c) { return (c == '`'); @@ -803,13 +784,6 @@ extern node_inl* parse_inlines_from_subject(subject* subj) return first; } -node_inl *parse_chunk_inlines(chunk *chunk, reference_map *refmap) -{ - subject subj; - subject_from_chunk(&subj, chunk, refmap); - return parse_inlines_from_subject(&subj); -} - static int subject_find_special_char(subject *subj) { // "\n\\`&_*[] Date: Sat, 8 Nov 2014 15:15:20 -0800 Subject: Added MAX_LINK_LABEL_LENGTH to cmark.h. 
Use in link label parsing and reference lookup. --- src/cmark.h | 2 ++ src/inlines.c | 24 ++++++++++++++---------- src/references.c | 3 +++ 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/src/cmark.h b/src/cmark.h index ff2f9a2..a135fa9 100644 --- a/src/cmark.h +++ b/src/cmark.h @@ -10,6 +10,8 @@ #define VERSION "0.1" #define CODE_INDENT 4 +#define MAX_LINK_LABEL_LENGTH 1000 + struct node_inl { enum { INL_STRING, diff --git a/src/inlines.c b/src/inlines.c index 7a7ca02..0527d92 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -579,17 +579,24 @@ static node_inl* handle_pointy_brace(subject* subj) static int link_label(subject* subj, chunk *raw_label) { int startpos = subj->pos; + int length = 0; advance(subj); // advance past [ unsigned char c; while ((c = peek_char(subj)) && c != '[' && c != ']') { if (c == '\\') { advance(subj); + length++; if (ispunct(peek_char(subj))) { advance(subj); + length++; } } else { advance(subj); + length++; + } + if (length > MAX_LINK_LABEL_LENGTH) { + goto noMatch; } } @@ -597,10 +604,12 @@ static int link_label(subject* subj, chunk *raw_label) *raw_label = chunk_dup(&subj->input, startpos + 1, subj->pos - (startpos + 1)); advance(subj); // advance past ] return 1; - } else { - subj->pos = startpos; // rewind - return 0; } + + noMatch: + subj->pos = startpos; // rewind + return 0; + } // Return a link, an image, or a literal close bracket. 
@@ -679,16 +688,11 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) subj->pos = subj->pos + scan_spacechars(&subj->input, subj->pos); raw_label = chunk_literal(""); if (!link_label(subj, &raw_label) || raw_label.len == 0) { - // chunk_free(&raw_label); + chunk_free(&raw_label); raw_label = chunk_dup(&subj->input, ostack->position, initial_pos - ostack->position - 1); } - // TODO - document this hard length limit in spec; also impose for creation of refs - if (raw_label.len < 1000) { - ref = reference_lookup(subj->refmap, &raw_label); - } else { - ref = NULL; - } + ref = reference_lookup(subj->refmap, &raw_label); chunk_free(&raw_label); if (ref != NULL) { // found diff --git a/src/references.c b/src/references.c index 5ba4b24..def4dd8 100644 --- a/src/references.c +++ b/src/references.c @@ -100,6 +100,9 @@ reference* reference_lookup(reference_map *map, chunk *label) unsigned char *norm; unsigned int hash; + if (label->len > MAX_LINK_LABEL_LENGTH) + return NULL; + if (map == NULL) return NULL; -- cgit v1.2.3 From 7c1733bbf56cc99d053e99ecb16356586a4fc61a Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 9 Nov 2014 12:21:25 -0800 Subject: runtests.pl: distinguish error status from failures. --- runtests.pl | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/runtests.pl b/runtests.pl index ae1195e..09f0ba1 100644 --- a/runtests.pl +++ b/runtests.pl @@ -19,6 +19,7 @@ if (!(@PROG && defined $SPEC)) { my $passed = 0; my $failed = 0; my $skipped = 0; +my $errored = 0; # Markdown implementations vary on insignificant whitespace. # Some leave blanks between block elements, others don't. 
@@ -63,6 +64,7 @@ sub tidy return $out; } +# return 0 for passing test, -1 for failing, positive for error sub dotest { my $markdown = $_[0]; @@ -79,13 +81,14 @@ sub dotest $actual = do { local $/; <$out>; }; close $out; waitpid($pid, 0); + my $exit_status = $?; $html = &tidy($html); $actual = &tidy($actual); $actual =~ s/\'/'/g; if ($actual eq $html) { print colored("✓", "green"); - return 1; + return 0; } else { print colored("\n✘ $testname", "red"); print "\n"; @@ -99,7 +102,11 @@ sub dotest print $actual; print "\n"; print color "black"; - return 0; + if ($exit_status == 0) { + return -1; + } else { + return $exit_status; + } } } @@ -111,6 +118,7 @@ my $linenum = 0; my $exampleline = 0; my @secnums = (); my $secheading; +my $testresult; open(SPEC, "< $SPEC"); while (<SPEC>) { @@ -123,11 +131,13 @@ while (<SPEC>) { if ($stage == 0) { $example++; if (!$PATT || $secheading =~ /$PATT/) { - if (&dotest($markdown, $html, - "Example $example (line $exampleline)")) { + $testresult = &dotest($markdown, $html, "Example $example (line $exampleline)"); + if ($testresult == 0) { $passed++; - } else { + } elsif ($testresult == -1) { $failed++; + } else { + $errored++; } } else { $skipped++; @@ -161,6 +171,6 @@ while (<SPEC>) { } print "\n"; -print STDERR colored("$passed tests passed, $failed failed, $skipped skipped.", "bold"); +print STDERR colored("$passed tests passed, $failed failed, $errored errored, $skipped skipped.", "bold"); print STDERR "\n"; exit $failed; -- cgit v1.2.3 From 014d2d0699d8875e766afcf01580c4a2ea093131 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 22:43:39 -0800 Subject: Restored priority of links over emphasis grouping. Now when we encounter (possibly) closing `*` or `_` delimiters, we simply add them to the delimiters stack. This stack gets processed by `process_emphasis` either (a) when a link is created (in which case only the inlines created by the link are processed) or (b) at the end of processing a run of inlines. 
--- src/inlines.c | 267 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 159 insertions(+), 108 deletions(-) diff --git a/src/inlines.c b/src/inlines.c index 0527d92..a1ecf01 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -11,19 +11,23 @@ #include "inlines.h" #include "debug.h" -typedef struct OpenerStack { - struct OpenerStack *previous; + +typedef struct DelimiterStack { + struct DelimiterStack *previous; + struct DelimiterStack *next; node_inl *first_inline; int delim_count; unsigned char delim_char; int position; -} opener_stack; + bool can_open; + bool can_close; +} delimiter_stack; typedef struct Subject { chunk input; int pos; reference_map *refmap; - opener_stack *openers; + delimiter_stack *delimiters; } subject; static node_inl *parse_inlines_from_subject(subject* subj); @@ -182,7 +186,7 @@ static void subject_from_buf(subject *e, strbuf *buffer, reference_map *refmap) e->input.alloc = 0; e->pos = 0; e->refmap = refmap; - e->openers = NULL; + e->delimiters = NULL; chunk_rtrim(&e->input); } @@ -296,30 +300,57 @@ static int scan_delims(subject* subj, unsigned char c, bool * can_open, bool * c return numdelims; } -static void free_openers(subject* subj, opener_stack* istack) +/* +static void print_delimiters(subject *subj) +{ + delimiter_stack *tempstack; + tempstack = subj->delimiters; + while (tempstack != NULL) { + printf("Item at %p: %d %d %d %d next(%p) prev(%p)\n", + tempstack, tempstack->delim_count, tempstack->delim_char, + tempstack->can_open, tempstack->can_close, + tempstack->next, tempstack->previous); + tempstack = tempstack->previous; + } +} +*/ + +static void remove_delimiter(subject *subj, delimiter_stack *stack) { - opener_stack * tempstack; - while (subj->openers != istack) { - tempstack = subj->openers; - subj->openers = subj->openers->previous; - free(tempstack); + if (stack->previous != NULL) { + stack->previous->next = stack->next; } + if (stack->next == NULL) { + // top of stack + subj->delimiters = 
stack->previous; + } else { + stack->next->previous = stack->previous; + } + free(stack); } -static opener_stack * push_opener(subject *subj, - int numdelims, - unsigned char c, - node_inl *inl_text) +static delimiter_stack * push_delimiter(subject *subj, + int numdelims, + unsigned char c, + bool can_open, + bool can_close, + node_inl *inl_text) { - opener_stack *istack = - (opener_stack*)malloc(sizeof(opener_stack)); + delimiter_stack *istack = + (delimiter_stack*)malloc(sizeof(delimiter_stack)); if (istack == NULL) { return NULL; } istack->delim_count = numdelims; istack->delim_char = c; + istack->can_open = can_open; + istack->can_close = can_close; istack->first_inline = inl_text; - istack->previous = subj->openers; + istack->previous = subj->delimiters; + istack->next = NULL; + if (istack->previous != NULL) { + istack->previous->next = istack; + } istack->position = subj->pos; return istack; } @@ -328,91 +359,119 @@ static opener_stack * push_opener(subject *subj, // Assumes the subject has '_' or '*' at the current position. 
static node_inl* handle_strong_emph(subject* subj, unsigned char c, node_inl **last) { - bool can_open, can_close; int numdelims; - int useDelims; - int openerDelims; - opener_stack * istack; - node_inl * inl; - node_inl * emph; node_inl * inl_text; + bool can_open, can_close; numdelims = scan_delims(subj, c, &can_open, &can_close); - if (can_close) - { - // walk the stack and find a matching opener, if there is one - istack = subj->openers; - while (true) - { - if (istack == NULL) - goto cannotClose; + inl_text = make_str(chunk_dup(&subj->input, subj->pos - numdelims, numdelims)); - if (istack->delim_char == c) - break; + if (can_open || can_close) { + subj->delimiters = push_delimiter(subj, numdelims, c, can_open, can_close, + inl_text); + } - istack = istack->previous; - } + return inl_text; +} - // calculate the actual number of delimeters used from this closer - openerDelims = istack->delim_count; - if (numdelims < 3 || openerDelims < 3) { - useDelims = numdelims <= openerDelims ? numdelims : openerDelims; - } else { // (numdelims >= 3 && openerDelims >= 3) - useDelims = numdelims % 2 == 0 ? 2 : 1; - } +static void process_emphasis(subject *subj, delimiter_stack *stack_bottom) +{ + delimiter_stack *closer = subj->delimiters; + delimiter_stack *opener, *tempstack, *nextstack; + int use_delims; + node_inl *inl, *tmp, *emph; + + // move back to first relevant delim. + while (closer != NULL && closer->previous != stack_bottom) { + closer = closer->previous; + } - if (istack->delim_count == useDelims) - { - // the opener is completely used up - remove the stack entry and reuse the inline element - inl = istack->first_inline; - inl->tag = useDelims == 1 ? 
INL_EMPH : INL_STRONG; - chunk_free(&inl->content.literal); - inl->content.inlines = inl->next; - inl->next = NULL; - - // remove this opener and all later ones from stack: - free_openers(subj, istack->previous); - *last = inl; + // now move forward, looking for closers, and handling each + while (closer != NULL) { + if (closer->can_close && + (closer->delim_char == '*' || closer->delim_char == '_')) { + // Now look backwards for first matching opener: + opener = closer->previous; + while (opener != NULL && opener != stack_bottom) { + if (opener->delim_char == closer->delim_char && + opener->can_open) { + break; + } + opener = opener->previous; + } + if (opener != NULL && opener != stack_bottom) { + // calculate the actual number of delimeters used from this closer + if (closer->delim_count < 3 || opener->delim_count < 3) { + use_delims = closer->delim_count <= opener->delim_count ? + closer->delim_count : opener->delim_count; + } else { // closer and opener both have >= 3 delims + use_delims = closer->delim_count % 2 == 0 ? 2 : 1; } - else - { - // the opener will only partially be used - stack entry remains (truncated) and a new inline is added. - inl = istack->first_inline; - istack->delim_count -= useDelims; - inl->content.literal.len = istack->delim_count; - emph = useDelims == 1 ? make_emph(inl->next) : make_strong(inl->next); - inl->next = emph; + inl = opener->first_inline; - // remove all later openers from stack: - free_openers(subj, istack); + // remove used delimiters from stack elements and associated inlines. 
+ opener->delim_count -= use_delims; + closer->delim_count -= use_delims; + inl->content.literal.len = opener->delim_count; + closer->first_inline->content.literal.len = closer->delim_count; - *last = emph; + // free delimiters between opener and closer + tempstack = closer->previous; + while (tempstack != NULL && tempstack != opener) { + nextstack = tempstack->previous; + remove_delimiter(subj, tempstack); + tempstack = nextstack; } - // if the closer was not fully used, move back a char or two and try again. - if (useDelims < numdelims) - { - subj->pos = subj->pos - numdelims + useDelims; - return NULL; + // create new emph or strong, and splice it in to our inlines + // between the opener and closer + emph = use_delims == 1 ? make_emph(inl->next) : make_strong(inl->next); + emph->next = closer->first_inline; + inl->next = emph; + tmp = emph->content.inlines; + while (tmp->next != NULL && tmp->next != closer->first_inline) { + tmp = tmp->next; + } + tmp->next = NULL; + + // if opener has 0 delims, remove it and its associated inline + if (opener->delim_count == 0) { + // replace empty opener inline with emph + chunk_free(&(inl->content.literal)); + inl->tag = emph->tag; + inl->next = emph->next; + inl->content.inlines = emph->content.inlines; + free(emph); + emph = inl; + // remove opener from stack + remove_delimiter(subj, opener); } - return NULL; // make_str(chunk_literal("")); - } - - cannotClose: - inl_text = make_str(chunk_dup(&subj->input, subj->pos - numdelims, numdelims)); - - if (can_open) - { - subj->openers = push_opener(subj, - numdelims, - c, - inl_text); + // if closer has 0 delims, remove it and its associated inline + if (closer->delim_count == 0) { + // remove empty closer inline + tmp = closer->first_inline; + emph->next = tmp->next; + tmp->next = NULL; + free_inlines(tmp); + // remove closer from stack + tempstack = closer->next; + remove_delimiter(subj, closer); + closer = tempstack; + } + } else { + closer = closer->next; + } + } else { + 
closer = closer->next; } - - return inl_text; + } + // free all delimiters in stack down to stack_bottom: + while (subj->delimiters != stack_bottom) { + remove_delimiter(subj, subj->delimiters); + } } // Parse backslash-escape or just a backslash, returning an inline. @@ -623,9 +682,9 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) bool is_image = false; chunk urlchunk, titlechunk; unsigned char *url, *title; - opener_stack *ostack = subj->openers; - opener_stack *closer_above; - opener_stack *tempstack; + delimiter_stack *ostack; + delimiter_stack *closer_above; + delimiter_stack *tempstack; node_inl *link_text; node_inl *inl; chunk raw_label; @@ -633,7 +692,8 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) advance(subj); // advance past ] initial_pos = subj->pos; - // look through stack of openers for a [ or ! + // look through stack of delimiters for a [ or ! + ostack = subj->delimiters; while (ostack) { if (ostack->delim_char == '[' || ostack->delim_char == '!') { break; @@ -713,19 +773,18 @@ match: inl->tag = is_image ? INL_IMAGE : INL_LINK; chunk_free(&inl->content.literal); inl->content.linkable.label = link_text; + process_emphasis(subj, ostack->previous); inl->content.linkable.url = url; inl->content.linkable.title = title; inl->next = NULL; *last = inl; - // remove this opener and all later ones: - free_openers(subj, ostack->previous); - - // remove earlier ones of the same kind + // process_emphasis will remove this delimiter and all later ones. 
+ // Now we also remove earlier ones of the same kind // (so, no links in links, and no images in images): // (This code can be removed if we decide to allow links // inside links and images inside images): - ostack = subj->openers; + ostack = subj->delimiters; closer_above = NULL; while (ostack != NULL) { tempstack = ostack->previous; @@ -734,7 +793,7 @@ match: if (closer_above) { closer_above->previous = tempstack; } else { - subj->openers = tempstack; + subj->delimiters = tempstack; } } else { closer_above = ostack; @@ -777,13 +836,7 @@ extern node_inl* parse_inlines_from_subject(subject* subj) } } - opener_stack* istack = subj->openers; - opener_stack* temp; - while (istack != NULL) { - temp = istack->previous; - free(istack); - istack = temp; - } + process_emphasis(subj, NULL); return first; } @@ -849,16 +902,14 @@ static int parse_inline(subject* subj, node_inl ** last) case '<': new = handle_pointy_brace(subj); break; - case '_': - new = handle_strong_emph(subj, '_', last); - break; case '*': - new = handle_strong_emph(subj, '*', last); + case '_': + new = handle_strong_emph(subj, c, last); break; case '[': advance(subj); new = make_str(chunk_literal("[")); - subj->openers = push_opener(subj, 1, '[', new); + subj->delimiters = push_delimiter(subj, 1, '[', true, false, new); break; case ']': new = handle_close_bracket(subj, last); @@ -868,7 +919,7 @@ static int parse_inline(subject* subj, node_inl ** last) if (peek_char(subj) == '[') { advance(subj); new = make_str(chunk_literal("![")); - subj->openers = push_opener(subj, 1, '!', new); + subj->delimiters = push_delimiter(subj, 1, '!', false, true, new); } else { new = make_str(chunk_literal("!")); } -- cgit v1.2.3