From 98dfa2d659f844a927d8570c1becbbed2d1834ef Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 07:26:39 -0800 Subject: Simplified link_label parser. It now just scans for an unescaped `]`. No `[` characters are permitted in labels. Backticks don't have their usual meaning in labels. This accords with the behavior of some of the main Markdown parsers: marked, sundown, discount, kramdown, showdown, Markdown.pl, PHP Markdown. --- src/inlines.c | 58 ++++++++-------------------------------------------------- 1 file changed, 8 insertions(+), 50 deletions(-) (limited to 'src/inlines.c') diff --git a/src/inlines.c b/src/inlines.c index 773027e..9197ee0 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -22,7 +22,6 @@ typedef struct OpenerStack { typedef struct Subject { chunk input; int pos; - int label_nestlevel; reference_map *refmap; opener_stack *openers; } subject; @@ -195,7 +194,6 @@ static void subject_from_buf(subject *e, strbuf *buffer, reference_map *refmap) e->input.len = buffer->size; e->input.alloc = 0; e->pos = 0; - e->label_nestlevel = 0; e->refmap = refmap; e->openers = NULL; @@ -208,7 +206,6 @@ static void subject_from_chunk(subject *e, chunk *chunk, reference_map *refmap) e->input.len = chunk->len; e->input.alloc = 0; e->pos = 0; - e->label_nestlevel = 0; e->refmap = refmap; e->openers = NULL; @@ -601,69 +598,30 @@ static node_inl* handle_pointy_brace(subject* subj) } // Parse a link label. Returns 1 if successful. -// Unless raw_label is null, it is set to point to the raw contents of the []. -// Assumes the subject has a '[' character at the current position. -// Returns 0 and does not advance if no matching ] is found. -// Note the precedence: code backticks have precedence over label bracket -// markers, which have precedence over *, _, and other inline formatting -// markers. So, 2 below contains a link while 1 does not: -// 1. [a link `with a ](/url)` character -// 2. [a link *with emphasized ](/url) text* +// Note: unescaped brackets are not allowed in labels. +// The label begins with `[` and ends with the first `]` character +// encountered. Backticks in labels do not start code spans. static int link_label(subject* subj, chunk *raw_label) { - int nestlevel = 0; - node_inl* tmp = NULL; int startpos = subj->pos; - if (subj->label_nestlevel) { - // if we've already checked to the end of the subject - // for a label, even with a different starting [, we - // know we won't find one here and we can just return. - // Note: nestlevel 1 would be: [foo [bar] - // nestlevel 2 would be: [foo [bar [baz] - subj->label_nestlevel--; - return 0; - } - advance(subj); // advance past [ unsigned char c; - while ((c = peek_char(subj)) && (c != ']' || nestlevel > 0)) { - switch (c) { - case '`': - tmp = handle_backticks(subj); - free_inlines(tmp); - break; - case '<': - tmp = handle_pointy_brace(subj); - free_inlines(tmp); - break; - case '[': // nested [] - nestlevel++; - advance(subj); - break; - case ']': // nested [] - nestlevel--; - advance(subj); - break; - case '\\': + while ((c = peek_char(subj)) && c != '[' && c != ']') { + if (c == '\\') { advance(subj); if (ispunct(peek_char(subj))) { advance(subj); } - break; - default: - advance(subj); } + advance(subj); } - if (nestlevel == 0 && c == ']') { + + if (c == ']') { // match found *raw_label = chunk_dup(&subj->input, startpos + 1, subj->pos - (startpos + 1)); - subj->label_nestlevel = 0; advance(subj); // advance past ] return 1; } else { - if (c == 0) { - subj->label_nestlevel = nestlevel; - } subj->pos = startpos; // rewind return 0; } -- cgit v1.2.3 From c84db152b53edaa6373bcb89a96b5b30830f8185 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 08:15:49 -0800 Subject: Initial steps towards link parsing. --- src/inlines.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 114 insertions(+), 17 deletions(-) (limited to 'src/inlines.c') diff --git a/src/inlines.c b/src/inlines.c index 9197ee0..bf76e1a 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -627,25 +627,120 @@ static int link_label(subject* subj, chunk *raw_label) } } -// Parse a link or the link portion of an image, or return a fallback. -static node_inl* handle_left_bracket(subject* subj) +// Return a link, an image, or a literal close bracket. +static node_inl* handle_close_bracket(subject* subj) { + int initial_pos; + int starturl, endurl, starttitle, endtitle, endall; + int n; + int sps; + chunk url, title; + opener_stack *ostack = subj->openers; + node_inl *link_text = NULL; + node_inl *tmp = NULL; + + advance(subj); // advance past ] + initial_pos = subj->pos; + + // look through stack of openers for a [ or ! + while (ostack) { + if (ostack->delim_char == '[' || ostack->delim_char == '!') { + break; + } + ostack = ostack->previous; + } + + if (ostack == NULL) { + return make_str(chunk_literal("]")); + } + + // If we got here, we matched a potential link/image text. + link_text = ostack->first_inline->next; + + // Now we check to see if it's a link/image. + + + if (peek_char(subj) == '(' && + ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) && + ((n = scan_link_url(&subj->input, subj->pos + 1 + sps)) > -1)) { + + // try to parse an explicit link: + starturl = subj->pos + 1 + sps; // after ( + endurl = starturl + n; + starttitle = endurl + scan_spacechars(&subj->input, endurl); + + // ensure there are spaces btw url and title + endtitle = (starttitle == endurl) ? starttitle : + starttitle + scan_link_title(&subj->input, starttitle); + + endall = endtitle + scan_spacechars(&subj->input, endtitle); + + if (peek_at(subj, endall) == ')') { + subj->pos = endall + 1; + + url = chunk_dup(&subj->input, starturl, endurl - starturl); + title = chunk_dup(&subj->input, starttitle, endtitle - starttitle); + + tmp = link_text->next; + ostack->first_inline->content.literal = chunk_literal("X"); // TODO a kludge + ostack->first_inline->next = make_link(link_text, url, title); + return make_str(chunk_literal("X")); + } else { + goto noMatch; + } + } else { + goto noMatch; // for now + } + + // if found, check to see if we have a target: + // - followed by (inline link) + // - followed by [link label] that matches + // - followed by [], and our brackets have a label that matches + // - our brackets have a label that matches + + // if no target, remove the matching opener from the stack and return literal ]. + // if yes target, remove the matching opener and any later openers. + // return a link or an image. + + /* + chunk rawlabel_tmp; + chunk reflabel; + + // Check for reference link. + // First, see if there's another label: + subj->pos = subj->pos + scan_spacechars(&subj->input, endlabel); + reflabel = rawlabel; + + // if followed by a nonempty link label, we change reflabel to it: + if (peek_char(subj) == '[' && link_label(subj, &rawlabel_tmp)) { + if (rawlabel_tmp.len > 0) + reflabel = rawlabel_tmp; + } else { + subj->pos = endlabel; + } + + // lookup rawlabel in subject->reference_map: + ref = reference_lookup(subj->refmap, &reflabel); + if (ref != NULL) { // found + lab = parse_chunk_inlines(&rawlabel, NULL); + result = make_ref_link(lab, ref); + } else { + goto noMatch; + } + return result; + node_inl *lab = NULL; node_inl *result = NULL; reference *ref; - int n; - int sps; int found_label; - int endlabel, startpos, starturl, endurl, starttitle, endtitle, endall; chunk rawlabel; - chunk url, title; - startpos = subj->pos; found_label = link_label(subj, &rawlabel); endlabel = subj->pos; - if (found_label) { + if (found_label) + { if (peek_char(subj) == '(' && ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) && ((n = scan_link_url(&subj->input, subj->pos + 1 + sps)) > -1)) { @@ -700,10 +795,11 @@ static node_inl* handle_left_bracket(subject* subj) return result; } } + */ noMatch: // If we fall through to here, it means we didn't match a link: - subj->pos = startpos + 1; // advance past [ - return make_str(chunk_literal("[")); + subj->pos = initial_pos; + return make_str(chunk_literal("]")); } // Parse a hard or soft linebreak, returning an inline. @@ -824,17 +920,18 @@ static int parse_inline(subject* subj, node_inl ** last) new = handle_strong_emph(subj, '*', last); break; case '[': - new = handle_left_bracket(subj); + advance(subj); + new = make_str(chunk_literal("[")); + subj->openers = push_opener(subj, 1, '[', new); + break; + case ']': + new = handle_close_bracket(subj); break; case '!': advance(subj); if (peek_char(subj) == '[') { - new = handle_left_bracket(subj); - if (new != NULL && new->tag == INL_LINK) { - new->tag = INL_IMAGE; - } else { - new = append_inlines(make_str(chunk_literal("!")), new); - } + new = make_str(chunk_literal("![")); + subj->openers = push_opener(subj, 1, '!', new); } else { new = make_str(chunk_literal("!")); } -- cgit v1.2.3 From 2750fd33414ebc396ee67dad730b93b1a7b64264 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 10:41:22 -0800 Subject: Got inline links working. --- src/inlines.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'src/inlines.c') diff --git a/src/inlines.c b/src/inlines.c index bf76e1a..1a7b7a7 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -72,12 +72,6 @@ inline static node_inl* make_autolink(node_inl* label, chunk url, int is_email) return make_link_(label, clean_autolink(&url, is_email), NULL); } -// Create an inline with a linkable string value. -inline static node_inl* make_link(node_inl* label, chunk url, chunk title) -{ - return make_link_(label, clean_url(&url), clean_title(&title)); -} - inline static node_inl* make_inlines(int t, node_inl* contents) { node_inl * e = calloc(1, sizeof(*e)); @@ -628,16 +622,18 @@ static int link_label(subject* subj, chunk *raw_label) } // Return a link, an image, or a literal close bracket. -static node_inl* handle_close_bracket(subject* subj) +static node_inl* handle_close_bracket(subject* subj, node_inl **last) { int initial_pos; int starturl, endurl, starttitle, endtitle, endall; int n; int sps; + bool is_image = false; chunk url, title; opener_stack *ostack = subj->openers; - node_inl *link_text = NULL; - node_inl *tmp = NULL; + node_inl *link_text; + node_inl *tmp; + node_inl *inl; advance(subj); // advance past ] initial_pos = subj->pos; @@ -655,6 +651,7 @@ static node_inl* handle_close_bracket(subject* subj) } // If we got here, we matched a potential link/image text. + is_image = ostack->delim_char == '!'; link_text = ostack->first_inline->next; // Now we check to see if it's a link/image. @@ -682,9 +679,20 @@ static node_inl* handle_close_bracket(subject* subj) title = chunk_dup(&subj->input, starttitle, endtitle - starttitle); tmp = link_text->next; - ostack->first_inline->content.literal = chunk_literal("X"); // TODO a kludge - ostack->first_inline->next = make_link(link_text, url, title); - return make_str(chunk_literal("X")); + inl = ostack->first_inline; + inl->tag = is_image ? INL_IMAGE : INL_LINK; + chunk_free(&inl->content.literal); + inl->content.linkable.label = link_text; + inl->content.linkable.url = clean_url(&url); + inl->content.linkable.title = clean_title(&title); + chunk_free(&url); + chunk_free(&title); + inl->next = NULL; + + // remove this opener and all later ones from stack: + free_openers(subj, ostack->previous); + *last = inl; + return NULL; } else { goto noMatch; } @@ -925,7 +933,7 @@ static int parse_inline(subject* subj, node_inl ** last) subj->openers = push_opener(subj, 1, '[', new); break; case ']': - new = handle_close_bracket(subj); + new = handle_close_bracket(subj, last); break; case '!': advance(subj); -- cgit v1.2.3 From ea81ce0001cb842586af381f98f43e10caa8a8dc Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 11:35:09 -0800 Subject: Got ref links working, but with deallocation issues. --- src/inlines.c | 170 ++++++++++++++++------------------------------------------ 1 file changed, 47 insertions(+), 123 deletions(-) (limited to 'src/inlines.c') diff --git a/src/inlines.c b/src/inlines.c index 1a7b7a7..3e3ef0a 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -628,12 +628,15 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) int starturl, endurl, starttitle, endtitle, endall; int n; int sps; + reference *ref; bool is_image = false; - chunk url, title; + chunk urlchunk, titlechunk; + unsigned char *url, *title; opener_stack *ostack = subj->openers; node_inl *link_text; node_inl *tmp; node_inl *inl; + chunk raw_label; advance(subj); // advance past ] initial_pos = subj->pos; @@ -656,7 +659,7 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) // Now we check to see if it's a link/image. - + // First, look for an inline link. if (peek_char(subj) == '(' && ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) && ((n = scan_link_url(&subj->input, subj->pos + 1 + sps)) > -1)) { @@ -675,139 +678,60 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) if (peek_at(subj, endall) == ')') { subj->pos = endall + 1; - url = chunk_dup(&subj->input, starturl, endurl - starturl); - title = chunk_dup(&subj->input, starttitle, endtitle - starttitle); - - tmp = link_text->next; - inl = ostack->first_inline; - inl->tag = is_image ? INL_IMAGE : INL_LINK; - chunk_free(&inl->content.literal); - inl->content.linkable.label = link_text; - inl->content.linkable.url = clean_url(&url); - inl->content.linkable.title = clean_title(&title); - chunk_free(&url); - chunk_free(&title); - inl->next = NULL; - - // remove this opener and all later ones from stack: - free_openers(subj, ostack->previous); - *last = inl; - return NULL; - } else { - goto noMatch; - } - } else { - goto noMatch; // for now - } + urlchunk = chunk_dup(&subj->input, starturl, endurl - starturl); + titlechunk = chunk_dup(&subj->input, starttitle, endtitle - starttitle); + url = clean_url(&urlchunk); + title = clean_title(&titlechunk); + chunk_free(&urlchunk); + chunk_free(&titlechunk); + goto match; - // if found, check to see if we have a target: - // - followed by (inline link) - // - followed by [link label] that matches - // - followed by [], and our brackets have a label that matches - // - our brackets have a label that matches - - // if no target, remove the matching opener from the stack and return literal ]. - // if yes target, remove the matching opener and any later openers. - // return a link or an image. - - /* - chunk rawlabel_tmp; - chunk reflabel; - - // Check for reference link. - // First, see if there's another label: - subj->pos = subj->pos + scan_spacechars(&subj->input, endlabel); - reflabel = rawlabel; - - // if followed by a nonempty link label, we change reflabel to it: - if (peek_char(subj) == '[' && link_label(subj, &rawlabel_tmp)) { - if (rawlabel_tmp.len > 0) - reflabel = rawlabel_tmp; - } else { - subj->pos = endlabel; - } - - // lookup rawlabel in subject->reference_map: - ref = reference_lookup(subj->refmap, &reflabel); - if (ref != NULL) { // found - lab = parse_chunk_inlines(&rawlabel, NULL); - result = make_ref_link(lab, ref); } else { goto noMatch; } - return result; - - node_inl *lab = NULL; - node_inl *result = NULL; - reference *ref; - int found_label; - - chunk rawlabel; - - found_label = link_label(subj, &rawlabel); - endlabel = subj->pos; - - if (found_label) - { - if (peek_char(subj) == '(' && - ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) && - ((n = scan_link_url(&subj->input, subj->pos + 1 + sps)) > -1)) { - - // try to parse an explicit link: - starturl = subj->pos + 1 + sps; // after ( - endurl = starturl + n; - starttitle = endurl + scan_spacechars(&subj->input, endurl); - - // ensure there are spaces btw url and title - endtitle = (starttitle == endurl) ? starttitle : - starttitle + scan_link_title(&subj->input, starttitle); - - endall = endtitle + scan_spacechars(&subj->input, endtitle); - - if (peek_at(subj, endall) == ')') { - subj->pos = endall + 1; + } - url = chunk_dup(&subj->input, starturl, endurl - starturl); - title = chunk_dup(&subj->input, starttitle, endtitle - starttitle); - lab = parse_chunk_inlines(&rawlabel, NULL); + // Next, look for a following [link label] that matches in refmap. + // skip spaces + subj->pos = subj->pos + scan_spacechars(&subj->input, subj->pos); + raw_label = chunk_literal(""); + if (!link_label(subj, &raw_label) || raw_label.len == 0) { + chunk_free(&raw_label); + raw_label = chunk_dup(&subj->input, ostack->position, initial_pos - ostack->position - 1); + } - return make_link(lab, url, title); - } else { - goto noMatch; - } - } else { - chunk rawlabel_tmp; - chunk reflabel; - - // Check for reference link. - // First, see if there's another label: - subj->pos = subj->pos + scan_spacechars(&subj->input, endlabel); - reflabel = rawlabel; - - // if followed by a nonempty link label, we change reflabel to it: - if (peek_char(subj) == '[' && link_label(subj, &rawlabel_tmp)) { - if (rawlabel_tmp.len > 0) - reflabel = rawlabel_tmp; - } else { - subj->pos = endlabel; - } + log_info("looking up '%s'", chunk_to_cstr(&raw_label)); + ref = reference_lookup(subj->refmap, &raw_label); + chunk_free(&raw_label); - // lookup rawlabel in subject->reference_map: - ref = reference_lookup(subj->refmap, &reflabel); - if (ref != NULL) { // found - lab = parse_chunk_inlines(&rawlabel, NULL); - result = make_ref_link(lab, ref); - } else { - goto noMatch; - } - return result; - } + if (ref != NULL) { // found + log_info("ref found url{%s} title{%s}", ref->url, ref->title); + url = ref->url; + title = ref->title; + goto match; + } else { + goto noMatch; } - */ + noMatch: // If we fall through to here, it means we didn't match a link: subj->pos = initial_pos; return make_str(chunk_literal("]")); + +match: + tmp = link_text->next; + inl = ostack->first_inline; + inl->tag = is_image ? INL_IMAGE : INL_LINK; + chunk_free(&inl->content.literal); + inl->content.linkable.label = link_text; + inl->content.linkable.url = url; + inl->content.linkable.title = title; + inl->next = NULL; + + // remove this opener and all later ones from stack: + free_openers(subj, ostack->previous); + *last = inl; + return NULL; } // Parse a hard or soft linebreak, returning an inline. -- cgit v1.2.3 From 18207addd5d922a9ca1ec6e83a895108f13e3c25 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 12:05:19 -0800 Subject: Fixed allocation issue. --- src/inlines.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'src/inlines.c') diff --git a/src/inlines.c b/src/inlines.c index 3e3ef0a..937c33f 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -634,7 +634,6 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) unsigned char *url, *title; opener_stack *ostack = subj->openers; node_inl *link_text; - node_inl *tmp; node_inl *inl; chunk raw_label; @@ -696,18 +695,16 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) subj->pos = subj->pos + scan_spacechars(&subj->input, subj->pos); raw_label = chunk_literal(""); if (!link_label(subj, &raw_label) || raw_label.len == 0) { - chunk_free(&raw_label); + // chunk_free(&raw_label); raw_label = chunk_dup(&subj->input, ostack->position, initial_pos - ostack->position - 1); } - log_info("looking up '%s'", chunk_to_cstr(&raw_label)); ref = reference_lookup(subj->refmap, &raw_label); chunk_free(&raw_label); if (ref != NULL) { // found - log_info("ref found url{%s} title{%s}", ref->url, ref->title); - url = ref->url; - title = ref->title; + url = bufdup(ref->url); + title = bufdup(ref->title); goto match; } else { goto noMatch; @@ -719,7 +716,6 @@ noMatch: return make_str(chunk_literal("]")); match: - tmp = link_text->next; inl = ostack->first_inline; inl->tag = is_image ? INL_IMAGE : INL_LINK; chunk_free(&inl->content.literal); @@ -727,10 +723,10 @@ match: inl->content.linkable.url = url; inl->content.linkable.title = title; inl->next = NULL; + *last = inl; // remove this opener and all later ones from stack: free_openers(subj, ostack->previous); - *last = inl; return NULL; } -- cgit v1.2.3 From c6f95684f90a4d1efd2185984b1aa2931591efb4 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 12:08:36 -0800 Subject: Fixed problem with images. --- src/inlines.c | 1 + 1 file changed, 1 insertion(+) (limited to 'src/inlines.c') diff --git a/src/inlines.c b/src/inlines.c index 937c33f..a3d848d 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -858,6 +858,7 @@ static int parse_inline(subject* subj, node_inl ** last) case '!': advance(subj); if (peek_char(subj) == '[') { + advance(subj); new = make_str(chunk_literal("![")); subj->openers = push_opener(subj, 1, '!', new); } else { -- cgit v1.2.3 From 7c44ac85bfa68e756d9a32635b114444512b683d Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 12:18:04 -0800 Subject: Fixed backslash-escape inside link label. Down to 8 failures, all cases where the spec will need to be changed to reflect lack of priority of links over emphasis. --- src/inlines.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'src/inlines.c') diff --git a/src/inlines.c b/src/inlines.c index a3d848d..4628e32 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -607,8 +607,9 @@ static int link_label(subject* subj, chunk *raw_label) if (ispunct(peek_char(subj))) { advance(subj); } + } else { + advance(subj); } - advance(subj); } if (c == ']') { // match found @@ -699,7 +700,12 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) raw_label = chunk_dup(&subj->input, ostack->position, initial_pos - ostack->position - 1); } - ref = reference_lookup(subj->refmap, &raw_label); + // TODO - document this hard length limit in READE; also impose for creation of refs + if (raw_label.len < 1000) { + ref = reference_lookup(subj->refmap, &raw_label); + } else { + ref = NULL; + } chunk_free(&raw_label); if (ref != NULL) { // found -- cgit v1.2.3 From db596350ac569436d568790410facef14d47670f Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 13:41:28 -0800 Subject: Disallow links inside links and images inside images. --- src/inlines.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'src/inlines.c') diff --git a/src/inlines.c b/src/inlines.c index 4628e32..069544b 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -634,6 +634,8 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) chunk urlchunk, titlechunk; unsigned char *url, *title; opener_stack *ostack = subj->openers; + opener_stack *closer_above; + opener_stack *tempstack; node_inl *link_text; node_inl *inl; chunk raw_label; @@ -700,7 +702,7 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) raw_label = chunk_dup(&subj->input, ostack->position, initial_pos - ostack->position - 1); } - // TODO - document this hard length limit in READE; also impose for creation of refs + // TODO - document this hard length limit in spec; also impose for creation of refs if (raw_label.len < 1000) { ref = reference_lookup(subj->refmap, &raw_label); } else { @@ -731,8 +733,30 @@ match: inl->next = NULL; *last = inl; - // remove this opener and all later ones from stack: + // remove this opener and all later ones: free_openers(subj, ostack->previous); + + // remove earlier ones of the same kind + // (so, no links in links, and no images in images): + // (This code can be removed if we decide to allow links + // inside links and images inside images): + ostack = subj->openers; + closer_above = NULL; + while (ostack != NULL) { + tempstack = ostack->previous; + if (ostack->delim_char == (is_image ? '!' : '[')) { + free(ostack); + if (closer_above) { + closer_above->previous = tempstack; + } else { + subj->openers = tempstack; + } + } else { + closer_above = ostack; + } + ostack = tempstack; + } + return NULL; } -- cgit v1.2.3 From d352e22ff937548fb02f79043f47d2143050c63e Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 13:43:56 -0800 Subject: Removed some unused code. --- src/inlines.c | 26 -------------------------- 1 file changed, 26 deletions(-) (limited to 'src/inlines.c') diff --git a/src/inlines.c b/src/inlines.c index 069544b..7a7ca02 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -26,11 +26,9 @@ typedef struct Subject { opener_stack *openers; } subject; -static node_inl *parse_chunk_inlines(chunk *chunk, reference_map *refmap); static node_inl *parse_inlines_from_subject(subject* subj); static int parse_inline(subject* subj, node_inl ** last); -static void subject_from_chunk(subject *e, chunk *chunk, reference_map *refmap); static void subject_from_buf(subject *e, strbuf *buffer, reference_map *refmap); static int subject_find_special_char(subject *subj); @@ -62,11 +60,6 @@ static inline node_inl *make_link_(node_inl *label, unsigned char *url, unsigned return e; } -inline static node_inl* make_ref_link(node_inl* label, reference *ref) -{ - return make_link_(label, bufdup(ref->url), bufdup(ref->title)); -} - inline static node_inl* make_autolink(node_inl* label, chunk url, int is_email) { return make_link_(label, clean_autolink(&url, is_email), NULL); @@ -194,18 +187,6 @@ static void subject_from_buf(subject *e, strbuf *buffer, reference_map *refmap) chunk_rtrim(&e->input); } -static void subject_from_chunk(subject *e, chunk *chunk, reference_map *refmap) -{ - e->input.data = chunk->data; - e->input.len = chunk->len; - e->input.alloc = 0; - e->pos = 0; - e->refmap = refmap; - e->openers = NULL; - - chunk_rtrim(&e->input); -} - inline static int isbacktick(int c) { return (c == '`'); @@ -803,13 +784,6 @@ extern node_inl* parse_inlines_from_subject(subject* subj) return first; } -node_inl *parse_chunk_inlines(chunk *chunk, reference_map *refmap) -{ - subject subj; - subject_from_chunk(&subj, chunk, refmap); - return parse_inlines_from_subject(&subj); -} - static int subject_find_special_char(subject *subj) { // "\n\\`&_*[] Date: Sat, 8 Nov 2014 15:15:20 -0800 Subject: Added MAX_LINK_LABEL_LENGTH to cmark.h. Use in link label parsing and reference lookup. --- src/inlines.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'src/inlines.c') diff --git a/src/inlines.c b/src/inlines.c index 7a7ca02..0527d92 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -579,17 +579,24 @@ static node_inl* handle_pointy_brace(subject* subj) static int link_label(subject* subj, chunk *raw_label) { int startpos = subj->pos; + int length = 0; advance(subj); // advance past [ unsigned char c; while ((c = peek_char(subj)) && c != '[' && c != ']') { if (c == '\\') { advance(subj); + length++; if (ispunct(peek_char(subj))) { advance(subj); + length++; } } else { advance(subj); + length++; + } + if (length > MAX_LINK_LABEL_LENGTH) { + goto noMatch; } } @@ -597,10 +604,12 @@ static int link_label(subject* subj, chunk *raw_label) *raw_label = chunk_dup(&subj->input, startpos + 1, subj->pos - (startpos + 1)); advance(subj); // advance past ] return 1; - } else { - subj->pos = startpos; // rewind - return 0; } + + noMatch: + subj->pos = startpos; // rewind + return 0; + } // Return a link, an image, or a literal close bracket. @@ -679,16 +688,11 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) subj->pos = subj->pos + scan_spacechars(&subj->input, subj->pos); raw_label = chunk_literal(""); if (!link_label(subj, &raw_label) || raw_label.len == 0) { - // chunk_free(&raw_label); + chunk_free(&raw_label); raw_label = chunk_dup(&subj->input, ostack->position, initial_pos - ostack->position - 1); } - // TODO - document this hard length limit in spec; also impose for creation of refs - if (raw_label.len < 1000) { - ref = reference_lookup(subj->refmap, &raw_label); - } else { - ref = NULL; - } + ref = reference_lookup(subj->refmap, &raw_label); chunk_free(&raw_label); if (ref != NULL) { // found -- cgit v1.2.3 From 014d2d0699d8875e766afcf01580c4a2ea093131 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 8 Nov 2014 22:43:39 -0800 Subject: Restored priority of links over emphasis grouping. Now when we encounter (possibly) closing `*` or `_` delimiters, we simply add them to the delimiters stack. This gets processed by `process_emphasis` either (a) when a link is created (in which case only the inlines created by the link are processed) or at the end of processing a run of inlines. --- src/inlines.c | 267 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 159 insertions(+), 108 deletions(-) (limited to 'src/inlines.c') diff --git a/src/inlines.c b/src/inlines.c index 0527d92..a1ecf01 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -11,19 +11,23 @@ #include "inlines.h" #include "debug.h" -typedef struct OpenerStack { - struct OpenerStack *previous; + +typedef struct DelimiterStack { + struct DelimiterStack *previous; + struct DelimiterStack *next; node_inl *first_inline; int delim_count; unsigned char delim_char; int position; -} opener_stack; + bool can_open; + bool can_close; +} delimiter_stack; typedef struct Subject { chunk input; int pos; reference_map *refmap; - opener_stack *openers; + delimiter_stack *delimiters; } subject; static node_inl *parse_inlines_from_subject(subject* subj); @@ -182,7 +186,7 @@ static void subject_from_buf(subject *e, strbuf *buffer, reference_map *refmap) e->input.alloc = 0; e->pos = 0; e->refmap = refmap; - e->openers = NULL; + e->delimiters = NULL; chunk_rtrim(&e->input); } @@ -296,30 +300,57 @@ static int scan_delims(subject* subj, unsigned char c, bool * can_open, bool * c return numdelims; } -static void free_openers(subject* subj, opener_stack* istack) +/* +static void print_delimiters(subject *subj) +{ + delimiter_stack *tempstack; + tempstack = subj->delimiters; + while (tempstack != NULL) { + printf("Item at %p: %d %d %d %d next(%p) prev(%p)\n", + tempstack, tempstack->delim_count, tempstack->delim_char, + tempstack->can_open, tempstack->can_close, + tempstack->next, tempstack->previous); + tempstack = tempstack->previous; + } +} +*/ + +static void remove_delimiter(subject *subj, delimiter_stack *stack) { - opener_stack * tempstack; - while (subj->openers != istack) { - tempstack = subj->openers; - subj->openers = subj->openers->previous; - free(tempstack); + if (stack->previous != NULL) { + stack->previous->next = stack->next; } + if (stack->next == NULL) { + // top of stack + subj->delimiters = stack->previous; + } else { + stack->next->previous = stack->previous; + } + free(stack); } -static opener_stack * push_opener(subject *subj, - int numdelims, - unsigned char c, - node_inl *inl_text) +static delimiter_stack * push_delimiter(subject *subj, + int numdelims, + unsigned char c, + bool can_open, + bool can_close, + node_inl *inl_text) { - opener_stack *istack = - (opener_stack*)malloc(sizeof(opener_stack)); + delimiter_stack *istack = + (delimiter_stack*)malloc(sizeof(delimiter_stack)); if (istack == NULL) { return NULL; } istack->delim_count = numdelims; istack->delim_char = c; + istack->can_open = can_open; + istack->can_close = can_close; istack->first_inline = inl_text; - istack->previous = subj->openers; + istack->previous = subj->delimiters; + istack->next = NULL; + if (istack->previous != NULL) { + istack->previous->next = istack; + } istack->position = subj->pos; return istack; } @@ -328,91 +359,119 @@ static opener_stack * push_opener(subject *subj, // Assumes the subject has '_' or '*' at the current position. static node_inl* handle_strong_emph(subject* subj, unsigned char c, node_inl **last) { - bool can_open, can_close; int numdelims; - int useDelims; - int openerDelims; - opener_stack * istack; - node_inl * inl; - node_inl * emph; node_inl * inl_text; + bool can_open, can_close; numdelims = scan_delims(subj, c, &can_open, &can_close); - if (can_close) - { - // walk the stack and find a matching opener, if there is one - istack = subj->openers; - while (true) - { - if (istack == NULL) - goto cannotClose; + inl_text = make_str(chunk_dup(&subj->input, subj->pos - numdelims, numdelims)); - if (istack->delim_char == c) - break; + if (can_open || can_close) { + subj->delimiters = push_delimiter(subj, numdelims, c, can_open, can_close, + inl_text); + } - istack = istack->previous; - } + return inl_text; +} - // calculate the actual number of delimeters used from this closer - openerDelims = istack->delim_count; - if (numdelims < 3 || openerDelims < 3) { - useDelims = numdelims <= openerDelims ? numdelims : openerDelims; - } else { // (numdelims >= 3 && openerDelims >= 3) - useDelims = numdelims % 2 == 0 ? 2 : 1; - } +static void process_emphasis(subject *subj, delimiter_stack *stack_bottom) +{ + delimiter_stack *closer = subj->delimiters; + delimiter_stack *opener, *tempstack, *nextstack; + int use_delims; + node_inl *inl, *tmp, *emph; + + // move back to first relevant delim. + while (closer != NULL && closer->previous != stack_bottom) { + closer = closer->previous; + } - if (istack->delim_count == useDelims) - { - // the opener is completely used up - remove the stack entry and reuse the inline element - inl = istack->first_inline; - inl->tag = useDelims == 1 ? INL_EMPH : INL_STRONG; - chunk_free(&inl->content.literal); - inl->content.inlines = inl->next; - inl->next = NULL; - - // remove this opener and all later ones from stack: - free_openers(subj, istack->previous); - *last = inl; + // now move forward, looking for closers, and handling each + while (closer != NULL) { + if (closer->can_close && + (closer->delim_char == '*' || closer->delim_char == '_')) { + // Now look backwards for first matching opener: + opener = closer->previous; + while (opener != NULL && opener != stack_bottom) { + if (opener->delim_char == closer->delim_char && + opener->can_open) { + break; + } + opener = opener->previous; + } + if (opener != NULL && opener != stack_bottom) { + // calculate the actual number of delimeters used from this closer + if (closer->delim_count < 3 || opener->delim_count < 3) { + use_delims = closer->delim_count <= opener->delim_count ? + closer->delim_count : opener->delim_count; + } else { // closer and opener both have >= 3 delims + use_delims = closer->delim_count % 2 == 0 ? 2 : 1; } - else - { - // the opener will only partially be used - stack entry remains (truncated) and a new inline is added. - inl = istack->first_inline; - istack->delim_count -= useDelims; - inl->content.literal.len = istack->delim_count; - emph = useDelims == 1 ? make_emph(inl->next) : make_strong(inl->next); - inl->next = emph; + inl = opener->first_inline; - // remove all later openers from stack: - free_openers(subj, istack); + // remove used delimiters from stack elements and associated inlines. + opener->delim_count -= use_delims; + closer->delim_count -= use_delims; + inl->content.literal.len = opener->delim_count; + closer->first_inline->content.literal.len = closer->delim_count; - *last = emph; + // free delimiters between opener and closer + tempstack = closer->previous; + while (tempstack != NULL && tempstack != opener) { + nextstack = tempstack->previous; + remove_delimiter(subj, tempstack); + tempstack = nextstack; } - // if the closer was not fully used, move back a char or two and try again. - if (useDelims < numdelims) - { - subj->pos = subj->pos - numdelims + useDelims; - return NULL; + // create new emph or strong, and splice it in to our inlines + // between the opener and closer + emph = use_delims == 1 ? make_emph(inl->next) : make_strong(inl->next); + emph->next = closer->first_inline; + inl->next = emph; + tmp = emph->content.inlines; + while (tmp->next != NULL && tmp->next != closer->first_inline) { + tmp = tmp->next; + } + tmp->next = NULL; + + // if opener has 0 delims, remove it and its associated inline + if (opener->delim_count == 0) { + // replace empty opener inline with emph + chunk_free(&(inl->content.literal)); + inl->tag = emph->tag; + inl->next = emph->next; + inl->content.inlines = emph->content.inlines; + free(emph); + emph = inl; + // remove opener from stack + remove_delimiter(subj, opener); } - return NULL; // make_str(chunk_literal("")); - } - - cannotClose: - inl_text = make_str(chunk_dup(&subj->input, subj->pos - numdelims, numdelims)); - - if (can_open) - { - subj->openers = push_opener(subj, - numdelims, - c, - inl_text); + // if closer has 0 delims, remove it and its associated inline + if (closer->delim_count == 0) { + // remove empty closer inline + tmp = closer->first_inline; + emph->next = tmp->next; + tmp->next = NULL; + free_inlines(tmp); + // remove closer from stack + tempstack = closer->next; + remove_delimiter(subj, closer); + closer = tempstack; + } + } else { + closer = closer->next; + } + } else { + closer = closer->next; } - - return inl_text; + } + // free all delimiters in stack down to stack_bottom: + while (subj->delimiters != stack_bottom) { + remove_delimiter(subj, subj->delimiters); + } } // Parse backslash-escape or just a backslash, returning an inline. @@ -623,9 +682,9 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) bool is_image = false; chunk urlchunk, titlechunk; unsigned char *url, *title; - opener_stack *ostack = subj->openers; - opener_stack *closer_above; - opener_stack *tempstack; + delimiter_stack *ostack; + delimiter_stack *closer_above; + delimiter_stack *tempstack; node_inl *link_text; node_inl *inl; chunk raw_label; @@ -633,7 +692,8 @@ static node_inl* handle_close_bracket(subject* subj, node_inl **last) advance(subj); // advance past ] initial_pos = subj->pos; - // look through stack of openers for a [ or ! + // look through stack of delimiters for a [ or ! + ostack = subj->delimiters; while (ostack) { if (ostack->delim_char == '[' || ostack->delim_char == '!') { break; @@ -713,19 +773,18 @@ match: inl->tag = is_image ? INL_IMAGE : INL_LINK; chunk_free(&inl->content.literal); inl->content.linkable.label = link_text; + process_emphasis(subj, ostack->previous); inl->content.linkable.url = url; inl->content.linkable.title = title; inl->next = NULL; *last = inl; - // remove this opener and all later ones: - free_openers(subj, ostack->previous); - - // remove earlier ones of the same kind + // process_emphasis will remove this delimiter and all later ones. + // Now we also remove earlier ones of the same kind // (so, no links in links, and no images in images): // (This code can be removed if we decide to allow links // inside links and images inside images): - ostack = subj->openers; + ostack = subj->delimiters; closer_above = NULL; while (ostack != NULL) { tempstack = ostack->previous; @@ -734,7 +793,7 @@ match: if (closer_above) { closer_above->previous = tempstack; } else { - subj->openers = tempstack; + subj->delimiters = tempstack; } } else { closer_above = ostack; @@ -777,13 +836,7 @@ extern node_inl* parse_inlines_from_subject(subject* subj) } } - opener_stack* istack = subj->openers; - opener_stack* temp; - while (istack != NULL) { - temp = istack->previous; - free(istack); - istack = temp; - } + process_emphasis(subj, NULL); return first; } @@ -849,16 +902,14 @@ static int parse_inline(subject* subj, node_inl ** last) case '<': new = handle_pointy_brace(subj); break; - case '_': - new = handle_strong_emph(subj, '_', last); - break; case '*': - new = handle_strong_emph(subj, '*', last); + case '_': + new = handle_strong_emph(subj, c, last); break; case '[': advance(subj); new = make_str(chunk_literal("[")); - subj->openers = push_opener(subj, 1, '[', new); + subj->delimiters = push_delimiter(subj, 1, '[', true, false, new); break; case ']': new = handle_close_bracket(subj, last); @@ -868,7 +919,7 @@ static int parse_inline(subject* subj, node_inl ** last) if (peek_char(subj) == '[') { advance(subj); new = make_str(chunk_literal("![")); - subj->openers = push_opener(subj, 1, '!', new); + subj->delimiters = push_delimiter(subj, 1, '!', false, true, new); } else { new = make_str(chunk_literal("!")); } -- cgit v1.2.3