diff options
author | Yuki Izumi <ashe@kivikakk.ee> | 2017-08-09 15:56:21 +1000 |
---|---|---|
committer | Yuki Izumi <ashe@kivikakk.ee> | 2017-08-09 15:56:21 +1000 |
commit | 919cdb2c583163411b3b15b2eae0ce72cf2d7981 (patch) | |
tree | 558beeb51433ae15ed2e86b6d440f43215d81bd8 | |
parent | 61b51fa7c8ec635eee19a16c6aa38c39093a0572 (diff) |
Add sourcepos info for inlines
-rw-r--r-- | api_test/main.c | 97 | ||||
-rw-r--r-- | src/inlines.c | 118 | ||||
-rw-r--r-- | src/iterator.c | 1 |
3 files changed, 167 insertions, 49 deletions
diff --git a/api_test/main.c b/api_test/main.c index d720234..08f3c98 100644 --- a/api_test/main.c +++ b/api_test/main.c @@ -552,9 +552,9 @@ static void render_xml(test_batch_runner *runner) { STR_EQ(runner, xml, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n" "<paragraph sourcepos=\"1:1-1:9\">\n" - " <text>foo </text>\n" - " <emph>\n" - " <text>bar</text>\n" + " <text sourcepos=\"1:1-1:4\">foo </text>\n" + " <emph sourcepos=\"1:5-1:9\">\n" + " <text sourcepos=\"1:6-1:8\">bar</text>\n" " </emph>\n" "</paragraph>\n", "render first paragraph with source pos"); @@ -883,6 +883,95 @@ static void test_feed_across_line_ending(test_batch_runner *runner) { cmark_node_free(document); } +static void source_pos(test_batch_runner *runner) { + static const char markdown[] = + "Hi *there*.\n" + "\n" + "Hello &ldquo; <http://www.google.com>\n" + "there `hi` -- [okay](www.google.com (ok)).\n" + "\n" + "> 1. Okay.\n" + "> Sure.\n" + ">\n" + "> 2. Yes, okay.\n" + "> ![ok](hi \"yes\")\n"; + + cmark_node *doc = cmark_parse_document(markdown, sizeof(markdown) - 1, CMARK_OPT_DEFAULT); + char *xml = cmark_render_xml(doc, CMARK_OPT_DEFAULT | CMARK_OPT_SOURCEPOS); + STR_EQ(runner, xml, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n" + "<document sourcepos=\"1:1-10:20\" xmlns=\"http://commonmark.org/xml/1.0\">\n" + " <paragraph sourcepos=\"1:1-1:11\">\n" + " <text sourcepos=\"1:1-1:3\">Hi </text>\n" + " <emph sourcepos=\"1:4-1:10\">\n" + " <text sourcepos=\"1:5-1:9\">there</text>\n" + " </emph>\n" + " <text sourcepos=\"1:11-1:11\">.</text>\n" + " </paragraph>\n" + " <paragraph sourcepos=\"3:1-4:42\">\n" + " <text sourcepos=\"3:1-3:14\">Hello “ </text>\n" + " <link sourcepos=\"3:15-3:37\" destination=\"http://www.google.com\" title=\"\">\n" + " <text sourcepos=\"3:16-3:36\">http://www.google.com</text>\n" + " </link>\n" + " <softbreak />\n" + " <text sourcepos=\"4:1-4:6\">there </text>\n" + " <code 
sourcepos=\"4:8-4:9\">hi</code>\n" + " <text sourcepos=\"4:11-4:14\"> -- </text>\n" + " <link sourcepos=\"4:15-4:41\" destination=\"www.google.com\" title=\"ok\">\n" + " <text sourcepos=\"4:16-4:19\">okay</text>\n" + " </link>\n" + " <text sourcepos=\"4:42-4:42\">.</text>\n" + " </paragraph>\n" + " <block_quote sourcepos=\"6:1-10:20\">\n" + " <list sourcepos=\"6:3-10:20\" type=\"ordered\" start=\"1\" delim=\"period\" tight=\"false\">\n" + " <item sourcepos=\"6:3-8:1\">\n" + " <paragraph sourcepos=\"6:6-7:10\">\n" + " <text sourcepos=\"6:6-6:10\">Okay.</text>\n" + " <softbreak />\n" + " <text sourcepos=\"7:6-7:10\">Sure.</text>\n" + " </paragraph>\n" + " </item>\n" + " <item sourcepos=\"9:3-10:20\">\n" + " <paragraph sourcepos=\"9:6-10:20\">\n" + " <text sourcepos=\"9:6-9:15\">Yes, okay.</text>\n" + " <softbreak />\n" + " <image sourcepos=\"10:6-10:20\" destination=\"hi\" title=\"yes\">\n" + " <text sourcepos=\"10:8-10:9\">ok</text>\n" + " </image>\n" + " </paragraph>\n" + " </item>\n" + " </list>\n" + " </block_quote>\n" + "</document>\n", + "sourcepos are as expected"); + free(xml); + cmark_node_free(doc); +} + +static void ref_source_pos(test_batch_runner *runner) { + static const char markdown[] = + "Let's try [reference] links.\n" + "\n" + "[reference]: https://github.com (GitHub)\n"; + + cmark_node *doc = cmark_parse_document(markdown, sizeof(markdown) - 1, CMARK_OPT_DEFAULT); + char *xml = cmark_render_xml(doc, CMARK_OPT_DEFAULT | CMARK_OPT_SOURCEPOS); + STR_EQ(runner, xml, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n" + "<document sourcepos=\"1:1-3:40\" xmlns=\"http://commonmark.org/xml/1.0\">\n" + " <paragraph sourcepos=\"1:1-1:28\">\n" + " <text sourcepos=\"1:1-1:10\">Let's try </text>\n" + " <link sourcepos=\"1:11-1:21\" destination=\"https://github.com\" title=\"GitHub\">\n" + " <text sourcepos=\"1:12-1:20\">reference</text>\n" + " </link>\n" + " <text sourcepos=\"1:22-1:28\"> links.</text>\n" + " 
</paragraph>\n" + "</document>\n", + "sourcepos are as expected"); + free(xml); + cmark_node_free(doc); +} + int main() { int retval; test_batch_runner *runner = test_batch_runner_new(); @@ -908,6 +997,8 @@ int main() { test_cplusplus(runner); test_safe(runner); test_feed_across_line_ending(runner); + source_pos(runner); + ref_source_pos(runner); test_print_summary(runner); retval = test_ok(runner) ? 0 : 1; diff --git a/src/inlines.c b/src/inlines.c index 6bf82e2..3f1b9ed 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -22,9 +22,9 @@ static const char *LEFTSINGLEQUOTE = "\xE2\x80\x98"; static const char *RIGHTSINGLEQUOTE = "\xE2\x80\x99"; // Macros for creating various kinds of simple. -#define make_str(mem, s) make_literal(mem, CMARK_NODE_TEXT, s) -#define make_code(mem, s) make_literal(mem, CMARK_NODE_CODE, s) -#define make_raw_html(mem, s) make_literal(mem, CMARK_NODE_HTML_INLINE, s) +#define make_str(subj, sc, ec, s) make_literal(subj, CMARK_NODE_TEXT, sc, ec, s) +#define make_code(subj, sc, ec, s) make_literal(subj, CMARK_NODE_CODE, sc, ec, s) +#define make_raw_html(subj, sc, ec, s) make_literal(subj, CMARK_NODE_HTML_INLINE, sc, ec, s) #define make_linebreak(mem) make_simple(mem, CMARK_NODE_LINEBREAK) #define make_softbreak(mem) make_simple(mem, CMARK_NODE_SOFTBREAK) #define make_emph(mem) make_simple(mem, CMARK_NODE_EMPH) @@ -55,7 +55,10 @@ typedef struct bracket { typedef struct { cmark_mem *mem; cmark_chunk input; + int line; bufsize_t pos; + int block_offset; + int column_offset; cmark_reference_map *refmap; delimiter *last_delim; bracket *last_bracket; @@ -72,17 +75,22 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener, static int parse_inline(subject *subj, cmark_node *parent, int options); -static void subject_from_buf(cmark_mem *mem, subject *e, cmark_strbuf *buffer, - cmark_reference_map *refmap); +static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e, + cmark_strbuf *buffer, cmark_reference_map 
*refmap); static bufsize_t subject_find_special_char(subject *subj, int options); // Create an inline with a literal string value. -static CMARK_INLINE cmark_node *make_literal(cmark_mem *mem, cmark_node_type t, +static CMARK_INLINE cmark_node *make_literal(subject *subj, cmark_node_type t, + int start_column, int end_column, cmark_chunk s) { - cmark_node *e = (cmark_node *)mem->calloc(1, sizeof(*e)); - cmark_strbuf_init(mem, &e->content, 0); - e->type = t; + cmark_node *e = (cmark_node *)subj->mem->calloc(1, sizeof(*e)); + cmark_strbuf_init(subj->mem, &e->content, 0); + e->type = (uint16_t)t; e->as.literal = s; + e->start_line = e->end_line = subj->line; + // columns are 1 based. + e->start_column = start_column + 1 + subj->column_offset + subj->block_offset; + e->end_column = end_column + 1 + subj->column_offset + subj->block_offset; return e; } @@ -95,14 +103,15 @@ static CMARK_INLINE cmark_node *make_simple(cmark_mem *mem, cmark_node_type t) { } // Like make_str, but parses entities. 
-static cmark_node *make_str_with_entities(cmark_mem *mem, +static cmark_node *make_str_with_entities(subject *subj, + int start_column, int end_column, cmark_chunk *content) { - cmark_strbuf unescaped = CMARK_BUF_INIT(mem); + cmark_strbuf unescaped = CMARK_BUF_INIT(subj->mem); if (houdini_unescape_html(&unescaped, content->data, content->len)) { - return make_str(mem, cmark_chunk_buf_detach(&unescaped)); + return make_str(subj, start_column, end_column, cmark_chunk_buf_detach(&unescaped)); } else { - return make_str(mem, *content); + return make_str(subj, start_column, end_column, *content); } } @@ -140,23 +149,30 @@ static cmark_chunk cmark_clean_autolink(cmark_mem *mem, cmark_chunk *url, return cmark_chunk_buf_detach(&buf); } -static CMARK_INLINE cmark_node *make_autolink(cmark_mem *mem, cmark_chunk url, - int is_email) { - cmark_node *link = make_simple(mem, CMARK_NODE_LINK); - link->as.link.url = cmark_clean_autolink(mem, &url, is_email); +static CMARK_INLINE cmark_node *make_autolink(subject *subj, + int start_column, int end_column, + cmark_chunk url, int is_email) { + cmark_node *link = make_simple(subj->mem, CMARK_NODE_LINK); + link->as.link.url = cmark_clean_autolink(subj->mem, &url, is_email); link->as.link.title = cmark_chunk_literal(""); - cmark_node_append_child(link, make_str_with_entities(mem, &url)); + link->start_line = link->end_line = subj->line; + link->start_column = start_column + 1; + link->end_column = end_column + 1; + cmark_node_append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url)); return link; } -static void subject_from_buf(cmark_mem *mem, subject *e, cmark_strbuf *buffer, - cmark_reference_map *refmap) { +static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e, + cmark_strbuf *buffer, cmark_reference_map *refmap) { int i; e->mem = mem; e->input.data = buffer->ptr; e->input.len = buffer->size; e->input.alloc = 0; + e->line = line_number; e->pos = 0; + e->block_offset 
= block_offset; + e->column_offset = 0; e->refmap = refmap; e->last_delim = NULL; e->last_bracket = NULL; @@ -277,7 +293,7 @@ static cmark_node *handle_backticks(subject *subj) { if (endpos == 0) { // not found subj->pos = startpos; // rewind - return make_str(subj->mem, openticks); + return make_str(subj, subj->pos, subj->pos, openticks); } else { cmark_strbuf buf = CMARK_BUF_INIT(subj->mem); @@ -286,7 +302,7 @@ static cmark_node *handle_backticks(subject *subj) { cmark_strbuf_trim(&buf); cmark_strbuf_normalize_whitespace(&buf); - return make_code(subj->mem, cmark_chunk_buf_detach(&buf)); + return make_code(subj, startpos, endpos - openticks.len - 1, cmark_chunk_buf_detach(&buf)); } } @@ -443,7 +459,7 @@ static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) { contents = cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims); } - inl_text = make_str(subj->mem, contents); + inl_text = make_str(subj, subj->pos - numdelims, subj->pos - 1, contents); if ((can_open || can_close) && (!(c == '\'' || c == '"') || smart)) { push_delimiter(subj, c, can_open, can_close, inl_text); @@ -459,7 +475,7 @@ static cmark_node *handle_hyphen(subject *subj, bool smart) { advance(subj); if (!smart || peek_char(subj) != '-') { - return make_str(subj->mem, cmark_chunk_literal("-")); + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("-")); } while (smart && peek_char(subj) == '-') { @@ -492,7 +508,7 @@ static cmark_node *handle_hyphen(subject *subj, bool smart) { cmark_strbuf_puts(&buf, ENDASH); } - return make_str(subj->mem, cmark_chunk_buf_detach(&buf)); + return make_str(subj, startpos, subj->pos - 1, cmark_chunk_buf_detach(&buf)); } // Assumes we have a period at the current position. 
@@ -502,12 +518,12 @@ static cmark_node *handle_period(subject *subj, bool smart) { advance(subj); if (peek_char(subj) == '.') { advance(subj); - return make_str(subj->mem, cmark_chunk_literal(ELLIPSES)); + return make_str(subj, subj->pos - 3, subj->pos - 1, cmark_chunk_literal(ELLIPSES)); } else { - return make_str(subj->mem, cmark_chunk_literal("..")); + return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal("..")); } } else { - return make_str(subj->mem, cmark_chunk_literal(".")); + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal(".")); } } @@ -643,6 +659,10 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener, } cmark_node_insert_after(opener_inl, emph); + emph->start_line = emph->end_line = subj->line; + emph->start_column = opener_inl->start_column + subj->column_offset; + emph->end_column = closer_inl->end_column + subj->column_offset; + // if opener has 0 characters, remove it and its associated inline if (opener_num_chars == 0) { cmark_node_free(opener_inl); @@ -669,11 +689,11 @@ static cmark_node *handle_backslash(subject *subj) { if (cmark_ispunct( nextchar)) { // only ascii symbols and newline can be escaped advance(subj); - return make_str(subj->mem, cmark_chunk_dup(&subj->input, subj->pos - 1, 1)); + return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_dup(&subj->input, subj->pos - 1, 1)); } else if (!is_eof(subj) && skip_line_end(subj)) { return make_linebreak(subj->mem); } else { - return make_str(subj->mem, cmark_chunk_literal("\\")); + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("\\")); } } @@ -689,10 +709,10 @@ static cmark_node *handle_entity(subject *subj) { subj->input.len - subj->pos); if (len == 0) - return make_str(subj->mem, cmark_chunk_literal("&")); + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("&")); subj->pos += len; - return make_str(subj->mem, cmark_chunk_buf_detach(&ent)); + return make_str(subj, subj->pos - 1 - 
len, subj->pos - 1, cmark_chunk_buf_detach(&ent)); } // Clean a URL: remove surrounding whitespace, and remove \ that escape @@ -751,7 +771,7 @@ static cmark_node *handle_pointy_brace(subject *subj) { contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1); subj->pos += matchlen; - return make_autolink(subj->mem, contents, 0); + return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 0); } // next try to match an email autolink @@ -760,7 +780,7 @@ static cmark_node *handle_pointy_brace(subject *subj) { contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1); subj->pos += matchlen; - return make_autolink(subj->mem, contents, 1); + return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 1); } // finally, try to match an html tag @@ -768,11 +788,11 @@ static cmark_node *handle_pointy_brace(subject *subj) { if (matchlen > 0) { contents = cmark_chunk_dup(&subj->input, subj->pos - 1, matchlen + 1); subj->pos += matchlen; - return make_raw_html(subj->mem, contents); + return make_raw_html(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents); } // if nothing matches, just return the opening <: - return make_str(subj->mem, cmark_chunk_literal("<")); + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("<")); } // Parse a link label. Returns 1 if successful. @@ -908,13 +928,13 @@ static cmark_node *handle_close_bracket(subject *subj) { opener = subj->last_bracket; if (opener == NULL) { - return make_str(subj->mem, cmark_chunk_literal("]")); + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]")); } if (!opener->active) { // take delimiter off stack pop_bracket(subj); - return make_str(subj->mem, cmark_chunk_literal("]")); + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]")); } // If we got here, we matched a potential link/image text. 
@@ -991,12 +1011,15 @@ noMatch: // If we fall through to here, it means we didn't match a link: pop_bracket(subj); // remove this opener from delimiter list subj->pos = initial_pos; - return make_str(subj->mem, cmark_chunk_literal("]")); + return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]")); match: inl = make_simple(subj->mem, is_image ? CMARK_NODE_IMAGE : CMARK_NODE_LINK); inl->as.link.url = url; inl->as.link.title = title; + inl->start_line = inl->end_line = subj->line; + inl->start_column = opener->inl_text->start_column; + inl->end_column = subj->pos + subj->column_offset + subj->block_offset; cmark_node_insert_before(opener->inl_text, inl); // Add link text: tmp = opener->inl_text->next; @@ -1043,6 +1066,8 @@ static cmark_node *handle_newline(subject *subj) { if (peek_at(subj, subj->pos) == '\n') { advance(subj); } + ++subj->line; + subj->column_offset = -subj->pos; // skip spaces at beginning of line skip_spaces(subj); if (nlpos > 1 && peek_at(subj, nlpos - 1) == ' ' && @@ -1102,7 +1127,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { cmark_node *new_inl = NULL; cmark_chunk contents; unsigned char c; - bufsize_t endpos; + bufsize_t startpos, endpos; c = peek_char(subj); if (c == 0) { return 0; @@ -1138,7 +1163,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { break; case '[': advance(subj); - new_inl = make_str(subj->mem, cmark_chunk_literal("[")); + new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("[")); push_bracket(subj, false, new_inl); break; case ']': @@ -1148,15 +1173,16 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { advance(subj); if (peek_char(subj) == '[') { advance(subj); - new_inl = make_str(subj->mem, cmark_chunk_literal("![")); + new_inl = make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal("![")); push_bracket(subj, true, new_inl); } else { - new_inl = make_str(subj->mem, 
cmark_chunk_literal("!")); + new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("!")); } break; default: endpos = subject_find_special_char(subj, options); contents = cmark_chunk_dup(&subj->input, subj->pos, endpos - subj->pos); + startpos = subj->pos; subj->pos = endpos; // if we're at a newline, strip trailing spaces. @@ -1164,7 +1190,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { cmark_chunk_rtrim(&contents); } - new_inl = make_str(subj->mem, contents); + new_inl = make_str(subj, startpos, endpos - 1, contents); } if (new_inl != NULL) { cmark_node_append_child(parent, new_inl); @@ -1177,7 +1203,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { extern void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent, cmark_reference_map *refmap, int options) { subject subj; - subject_from_buf(mem, &subj, &parent->content, refmap); + subject_from_buf(mem, parent->start_line, parent->start_column - 1, &subj, &parent->content, refmap); cmark_chunk_rtrim(&subj.input); while (!is_eof(&subj) && parse_inline(&subj, parent, options)) @@ -1216,7 +1242,7 @@ bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, bufsize_t matchlen = 0; bufsize_t beforetitle; - subject_from_buf(mem, &subj, input, NULL); + subject_from_buf(mem, -1, 0, &subj, input, NULL); // parse label: if (!link_label(&subj, &lab) || lab.len == 0) diff --git a/src/iterator.c b/src/iterator.c index 24423a2..f5cd802 100644 --- a/src/iterator.c +++ b/src/iterator.c @@ -106,6 +106,7 @@ void cmark_consolidate_text_nodes(cmark_node *root) { while (tmp && tmp->type == CMARK_NODE_TEXT) { cmark_iter_next(iter); // advance pointer cmark_strbuf_put(&buf, tmp->as.literal.data, tmp->as.literal.len); + cur->end_column = tmp->end_column; next = tmp->next; cmark_node_free(tmp); tmp = next; |