From 7423feff96dc1331b3578440e417224e96dac448 Mon Sep 17 00:00:00 2001 From: Yuki Izumi Date: Mon, 17 Jul 2017 15:20:12 +1000 Subject: Fix URL scanner. --- src/inlines.c | 90 ++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 38 deletions(-) (limited to 'src') diff --git a/src/inlines.c b/src/inlines.c index b864e12..f223baf 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -695,8 +695,8 @@ static cmark_node *handle_entity(subject *subj) { return make_str(subj->mem, cmark_chunk_buf_detach(&ent)); } -// Clean a URL: remove surrounding whitespace and surrounding <>, -// and remove \ that escape punctuation. +// Clean a URL: remove surrounding whitespace, and remove \ that escape +// punctuation. cmark_chunk cmark_clean_url(cmark_mem *mem, cmark_chunk *url) { cmark_strbuf buf = CMARK_BUF_INIT(mem); @@ -707,11 +707,7 @@ cmark_chunk cmark_clean_url(cmark_mem *mem, cmark_chunk *url) { return result; } - if (url->data[0] == '<' && url->data[url->len - 1] == '>') { - houdini_unescape_html_f(&buf, url->data + 1, url->len - 2); - } else { - houdini_unescape_html_f(&buf, url->data, url->len); - } + houdini_unescape_html_f(&buf, url->data, url->len); cmark_strbuf_unescape(&buf); return cmark_chunk_buf_detach(&buf); @@ -824,10 +820,43 @@ noMatch: subj->pos = startpos; // rewind return 0; } -static bufsize_t manual_scan_link_url(cmark_chunk *input, bufsize_t offset) { + +static bufsize_t manual_scan_link_url_2(cmark_chunk *input, bufsize_t offset, cmark_chunk *output) { bufsize_t i = offset; size_t nb_p = 0; + while (i < input->len) { + if (input->data[i] == '\\' && + i + 1 < input-> len && + cmark_ispunct(input->data[i+1])) + i += 2; + else if (input->data[i] == '(') { + ++nb_p; + ++i; + } else if (input->data[i] == ')') { + if (nb_p == 0) + break; + --nb_p; + ++i; + } else if (cmark_isspace(input->data[i])) + break; + else + ++i; + } + + if (i >= input->len) + return -1; + + { + cmark_chunk result = {input->data + offset, i - offset, 0}; + *output = result; + } + return i - offset; +} + +static bufsize_t manual_scan_link_url(cmark_chunk *input, bufsize_t offset, cmark_chunk *output) { + bufsize_t i = offset; + if (i < input->len && input->data[i] == '<') { ++i; while (i < input->len) { @@ -836,42 +865,30 @@ static bufsize_t manual_scan_link_url(cmark_chunk *input, bufsize_t offset) { break; } else if (input->data[i] == '\\') i += 2; - else if (cmark_isspace(input->data[i])) - return -1; + else if (cmark_isspace(input->data[i]) || input->data[i] == '<') + return manual_scan_link_url_2(input, offset, output); else ++i; } } else { - while (i < input->len) { - if (input->data[i] == '\\' && - i + 1 < input-> len && - cmark_ispunct(input->data[i+1])) - i += 2; - else if (input->data[i] == '(') { - ++nb_p; - ++i; - } else if (input->data[i] == ')') { - if (nb_p == 0) - break; - --nb_p; - ++i; - } else if (cmark_isspace(input->data[i])) - break; - else - ++i; - } + return manual_scan_link_url_2(input, offset, output); } if (i >= input->len) return -1; + + { + cmark_chunk result = {input->data + offset + 1, i - 2 - offset, 0}; + *output = result; + } return i - offset; } + // Return a link, an image, or a literal close bracket. static cmark_node *handle_close_bracket(subject *subj) { bufsize_t initial_pos, after_link_text_pos; - bufsize_t starturl, endurl, starttitle, endtitle, endall; - bufsize_t n; - bufsize_t sps; + bufsize_t endurl, starttitle, endtitle, endall; + bufsize_t sps, n; cmark_reference *ref = NULL; cmark_chunk url_chunk, title_chunk; cmark_chunk url, title; @@ -907,11 +924,10 @@ static cmark_node *handle_close_bracket(subject *subj) { // First, look for an inline link. if (peek_char(subj) == '(' && ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) && - ((n = manual_scan_link_url(&subj->input, subj->pos + 1 + sps)) > -1)) { + ((n = manual_scan_link_url(&subj->input, subj->pos + 1 + sps, &url_chunk)) > -1)) { // try to parse an explicit link: - starturl = subj->pos + 1 + sps; // after ( - endurl = starturl + n; + endurl = subj->pos + 1 + sps + n; starttitle = endurl + scan_spacechars(&subj->input, endurl); // ensure there are spaces btw url and title @@ -924,7 +940,6 @@ static cmark_node *handle_close_bracket(subject *subj) { if (peek_at(subj, endall) == ')') { subj->pos = endall + 1; - url_chunk = cmark_chunk_dup(&subj->input, starturl, endurl - starturl); title_chunk = cmark_chunk_dup(&subj->input, starttitle, endtitle - starttitle); url = cmark_clean_url(subj->mem, &url_chunk); @@ -1213,9 +1228,8 @@ bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, // parse link url: spnl(&subj); - matchlen = manual_scan_link_url(&subj.input, subj.pos); - if (matchlen > 0) { - url = cmark_chunk_dup(&subj.input, subj.pos, matchlen); + if ((matchlen = manual_scan_link_url(&subj.input, subj.pos, &url)) > -1 && + url.len > 0) { subj.pos += matchlen; } else { return 0; -- cgit v1.2.3