summary refs log tree commit diff
diff options
context:
space:
mode:
author Yuki Izumi <ashe@kivikakk.ee> 2017-08-09 15:56:21 +1000
committer Yuki Izumi <ashe@kivikakk.ee> 2017-08-09 15:56:21 +1000
commit 919cdb2c583163411b3b15b2eae0ce72cf2d7981 (patch)
tree 558beeb51433ae15ed2e86b6d440f43215d81bd8
parent 61b51fa7c8ec635eee19a16c6aa38c39093a0572 (diff)
Add sourcepos info for inlines
-rw-r--r-- api_test/main.c 97
-rw-r--r-- src/inlines.c 118
-rw-r--r-- src/iterator.c 1
3 files changed, 167 insertions, 49 deletions
diff --git a/api_test/main.c b/api_test/main.c
index d720234..08f3c98 100644
--- a/api_test/main.c
+++ b/api_test/main.c
@@ -552,9 +552,9 @@ static void render_xml(test_batch_runner *runner) {
STR_EQ(runner, xml, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
"<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n"
"<paragraph sourcepos=\"1:1-1:9\">\n"
- " <text>foo </text>\n"
- " <emph>\n"
- " <text>bar</text>\n"
+ " <text sourcepos=\"1:1-1:4\">foo </text>\n"
+ " <emph sourcepos=\"1:5-1:9\">\n"
+ " <text sourcepos=\"1:6-1:8\">bar</text>\n"
" </emph>\n"
"</paragraph>\n",
"render first paragraph with source pos");
@@ -883,6 +883,95 @@ static void test_feed_across_line_ending(test_batch_runner *runner) {
cmark_node_free(document);
}
+static void source_pos(test_batch_runner *runner) {
+ static const char markdown[] =
+ "Hi *there*.\n"
+ "\n"
+ "Hello &ldquo; <http://www.google.com>\n"
+ "there `hi` -- [okay](www.google.com (ok)).\n"
+ "\n"
+ "> 1. Okay.\n"
+ "> Sure.\n"
+ ">\n"
+ "> 2. Yes, okay.\n"
+ "> ![ok](hi \"yes\")\n";
+
+ cmark_node *doc = cmark_parse_document(markdown, sizeof(markdown) - 1, CMARK_OPT_DEFAULT);
+ char *xml = cmark_render_xml(doc, CMARK_OPT_DEFAULT | CMARK_OPT_SOURCEPOS);
+ STR_EQ(runner, xml, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+ "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n"
+ "<document sourcepos=\"1:1-10:20\" xmlns=\"http://commonmark.org/xml/1.0\">\n"
+ " <paragraph sourcepos=\"1:1-1:11\">\n"
+ " <text sourcepos=\"1:1-1:3\">Hi </text>\n"
+ " <emph sourcepos=\"1:4-1:10\">\n"
+ " <text sourcepos=\"1:5-1:9\">there</text>\n"
+ " </emph>\n"
+ " <text sourcepos=\"1:11-1:11\">.</text>\n"
+ " </paragraph>\n"
+ " <paragraph sourcepos=\"3:1-4:42\">\n"
+ " <text sourcepos=\"3:1-3:14\">Hello “ </text>\n"
+ " <link sourcepos=\"3:15-3:37\" destination=\"http://www.google.com\" title=\"\">\n"
+ " <text sourcepos=\"3:16-3:36\">http://www.google.com</text>\n"
+ " </link>\n"
+ " <softbreak />\n"
+ " <text sourcepos=\"4:1-4:6\">there </text>\n"
+ " <code sourcepos=\"4:8-4:9\">hi</code>\n"
+ " <text sourcepos=\"4:11-4:14\"> -- </text>\n"
+ " <link sourcepos=\"4:15-4:41\" destination=\"www.google.com\" title=\"ok\">\n"
+ " <text sourcepos=\"4:16-4:19\">okay</text>\n"
+ " </link>\n"
+ " <text sourcepos=\"4:42-4:42\">.</text>\n"
+ " </paragraph>\n"
+ " <block_quote sourcepos=\"6:1-10:20\">\n"
+ " <list sourcepos=\"6:3-10:20\" type=\"ordered\" start=\"1\" delim=\"period\" tight=\"false\">\n"
+ " <item sourcepos=\"6:3-8:1\">\n"
+ " <paragraph sourcepos=\"6:6-7:10\">\n"
+ " <text sourcepos=\"6:6-6:10\">Okay.</text>\n"
+ " <softbreak />\n"
+ " <text sourcepos=\"7:6-7:10\">Sure.</text>\n"
+ " </paragraph>\n"
+ " </item>\n"
+ " <item sourcepos=\"9:3-10:20\">\n"
+ " <paragraph sourcepos=\"9:6-10:20\">\n"
+ " <text sourcepos=\"9:6-9:15\">Yes, okay.</text>\n"
+ " <softbreak />\n"
+ " <image sourcepos=\"10:6-10:20\" destination=\"hi\" title=\"yes\">\n"
+ " <text sourcepos=\"10:8-10:9\">ok</text>\n"
+ " </image>\n"
+ " </paragraph>\n"
+ " </item>\n"
+ " </list>\n"
+ " </block_quote>\n"
+ "</document>\n",
+ "sourcepos are as expected");
+ free(xml);
+ cmark_node_free(doc);
+}
+
+static void ref_source_pos(test_batch_runner *runner) {
+ static const char markdown[] =
+ "Let's try [reference] links.\n"
+ "\n"
+ "[reference]: https://github.com (GitHub)\n";
+
+ cmark_node *doc = cmark_parse_document(markdown, sizeof(markdown) - 1, CMARK_OPT_DEFAULT);
+ char *xml = cmark_render_xml(doc, CMARK_OPT_DEFAULT | CMARK_OPT_SOURCEPOS);
+ STR_EQ(runner, xml, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+ "<!DOCTYPE document SYSTEM \"CommonMark.dtd\">\n"
+ "<document sourcepos=\"1:1-3:40\" xmlns=\"http://commonmark.org/xml/1.0\">\n"
+ " <paragraph sourcepos=\"1:1-1:28\">\n"
+ " <text sourcepos=\"1:1-1:10\">Let's try </text>\n"
+ " <link sourcepos=\"1:11-1:21\" destination=\"https://github.com\" title=\"GitHub\">\n"
+ " <text sourcepos=\"1:12-1:20\">reference</text>\n"
+ " </link>\n"
+ " <text sourcepos=\"1:22-1:28\"> links.</text>\n"
+ " </paragraph>\n"
+ "</document>\n",
+ "sourcepos are as expected");
+ free(xml);
+ cmark_node_free(doc);
+}
+
int main() {
int retval;
test_batch_runner *runner = test_batch_runner_new();
@@ -908,6 +997,8 @@ int main() {
test_cplusplus(runner);
test_safe(runner);
test_feed_across_line_ending(runner);
+ source_pos(runner);
+ ref_source_pos(runner);
test_print_summary(runner);
retval = test_ok(runner) ? 0 : 1;
diff --git a/src/inlines.c b/src/inlines.c
index 6bf82e2..3f1b9ed 100644
--- a/src/inlines.c
+++ b/src/inlines.c
@@ -22,9 +22,9 @@ static const char *LEFTSINGLEQUOTE = "\xE2\x80\x98";
static const char *RIGHTSINGLEQUOTE = "\xE2\x80\x99";
// Macros for creating various kinds of simple.
-#define make_str(mem, s) make_literal(mem, CMARK_NODE_TEXT, s)
-#define make_code(mem, s) make_literal(mem, CMARK_NODE_CODE, s)
-#define make_raw_html(mem, s) make_literal(mem, CMARK_NODE_HTML_INLINE, s)
+#define make_str(subj, sc, ec, s) make_literal(subj, CMARK_NODE_TEXT, sc, ec, s)
+#define make_code(subj, sc, ec, s) make_literal(subj, CMARK_NODE_CODE, sc, ec, s)
+#define make_raw_html(subj, sc, ec, s) make_literal(subj, CMARK_NODE_HTML_INLINE, sc, ec, s)
#define make_linebreak(mem) make_simple(mem, CMARK_NODE_LINEBREAK)
#define make_softbreak(mem) make_simple(mem, CMARK_NODE_SOFTBREAK)
#define make_emph(mem) make_simple(mem, CMARK_NODE_EMPH)
@@ -55,7 +55,10 @@ typedef struct bracket {
typedef struct {
cmark_mem *mem;
cmark_chunk input;
+ int line;
bufsize_t pos;
+ int block_offset;
+ int column_offset;
cmark_reference_map *refmap;
delimiter *last_delim;
bracket *last_bracket;
@@ -72,17 +75,22 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener,
static int parse_inline(subject *subj, cmark_node *parent, int options);
-static void subject_from_buf(cmark_mem *mem, subject *e, cmark_strbuf *buffer,
- cmark_reference_map *refmap);
+static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e,
+ cmark_strbuf *buffer, cmark_reference_map *refmap);
static bufsize_t subject_find_special_char(subject *subj, int options);
// Create an inline with a literal string value.
-static CMARK_INLINE cmark_node *make_literal(cmark_mem *mem, cmark_node_type t,
+static CMARK_INLINE cmark_node *make_literal(subject *subj, cmark_node_type t,
+ int start_column, int end_column,
cmark_chunk s) {
- cmark_node *e = (cmark_node *)mem->calloc(1, sizeof(*e));
- cmark_strbuf_init(mem, &e->content, 0);
- e->type = t;
+ cmark_node *e = (cmark_node *)subj->mem->calloc(1, sizeof(*e));
+ cmark_strbuf_init(subj->mem, &e->content, 0);
+ e->type = (uint16_t)t;
e->as.literal = s;
+ e->start_line = e->end_line = subj->line;
+ // columns are 1 based.
+ e->start_column = start_column + 1 + subj->column_offset + subj->block_offset;
+ e->end_column = end_column + 1 + subj->column_offset + subj->block_offset;
return e;
}
@@ -95,14 +103,15 @@ static CMARK_INLINE cmark_node *make_simple(cmark_mem *mem, cmark_node_type t) {
}
// Like make_str, but parses entities.
-static cmark_node *make_str_with_entities(cmark_mem *mem,
+static cmark_node *make_str_with_entities(subject *subj,
+ int start_column, int end_column,
cmark_chunk *content) {
- cmark_strbuf unescaped = CMARK_BUF_INIT(mem);
+ cmark_strbuf unescaped = CMARK_BUF_INIT(subj->mem);
if (houdini_unescape_html(&unescaped, content->data, content->len)) {
- return make_str(mem, cmark_chunk_buf_detach(&unescaped));
+ return make_str(subj, start_column, end_column, cmark_chunk_buf_detach(&unescaped));
} else {
- return make_str(mem, *content);
+ return make_str(subj, start_column, end_column, *content);
}
}
@@ -140,23 +149,30 @@ static cmark_chunk cmark_clean_autolink(cmark_mem *mem, cmark_chunk *url,
return cmark_chunk_buf_detach(&buf);
}
-static CMARK_INLINE cmark_node *make_autolink(cmark_mem *mem, cmark_chunk url,
- int is_email) {
- cmark_node *link = make_simple(mem, CMARK_NODE_LINK);
- link->as.link.url = cmark_clean_autolink(mem, &url, is_email);
+static CMARK_INLINE cmark_node *make_autolink(subject *subj,
+ int start_column, int end_column,
+ cmark_chunk url, int is_email) {
+ cmark_node *link = make_simple(subj->mem, CMARK_NODE_LINK);
+ link->as.link.url = cmark_clean_autolink(subj->mem, &url, is_email);
link->as.link.title = cmark_chunk_literal("");
- cmark_node_append_child(link, make_str_with_entities(mem, &url));
+ link->start_line = link->end_line = subj->line;
+ link->start_column = start_column + 1;
+ link->end_column = end_column + 1;
+ cmark_node_append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url));
return link;
}
-static void subject_from_buf(cmark_mem *mem, subject *e, cmark_strbuf *buffer,
- cmark_reference_map *refmap) {
+static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e,
+ cmark_strbuf *buffer, cmark_reference_map *refmap) {
int i;
e->mem = mem;
e->input.data = buffer->ptr;
e->input.len = buffer->size;
e->input.alloc = 0;
+ e->line = line_number;
e->pos = 0;
+ e->block_offset = block_offset;
+ e->column_offset = 0;
e->refmap = refmap;
e->last_delim = NULL;
e->last_bracket = NULL;
@@ -277,7 +293,7 @@ static cmark_node *handle_backticks(subject *subj) {
if (endpos == 0) { // not found
subj->pos = startpos; // rewind
- return make_str(subj->mem, openticks);
+ return make_str(subj, subj->pos, subj->pos, openticks);
} else {
cmark_strbuf buf = CMARK_BUF_INIT(subj->mem);
@@ -286,7 +302,7 @@ static cmark_node *handle_backticks(subject *subj) {
cmark_strbuf_trim(&buf);
cmark_strbuf_normalize_whitespace(&buf);
- return make_code(subj->mem, cmark_chunk_buf_detach(&buf));
+ return make_code(subj, startpos, endpos - openticks.len - 1, cmark_chunk_buf_detach(&buf));
}
}
@@ -443,7 +459,7 @@ static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) {
contents = cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims);
}
- inl_text = make_str(subj->mem, contents);
+ inl_text = make_str(subj, subj->pos - numdelims, subj->pos - 1, contents);
if ((can_open || can_close) && (!(c == '\'' || c == '"') || smart)) {
push_delimiter(subj, c, can_open, can_close, inl_text);
@@ -459,7 +475,7 @@ static cmark_node *handle_hyphen(subject *subj, bool smart) {
advance(subj);
if (!smart || peek_char(subj) != '-') {
- return make_str(subj->mem, cmark_chunk_literal("-"));
+ return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("-"));
}
while (smart && peek_char(subj) == '-') {
@@ -492,7 +508,7 @@ static cmark_node *handle_hyphen(subject *subj, bool smart) {
cmark_strbuf_puts(&buf, ENDASH);
}
- return make_str(subj->mem, cmark_chunk_buf_detach(&buf));
+ return make_str(subj, startpos, subj->pos - 1, cmark_chunk_buf_detach(&buf));
}
// Assumes we have a period at the current position.
@@ -502,12 +518,12 @@ static cmark_node *handle_period(subject *subj, bool smart) {
advance(subj);
if (peek_char(subj) == '.') {
advance(subj);
- return make_str(subj->mem, cmark_chunk_literal(ELLIPSES));
+ return make_str(subj, subj->pos - 3, subj->pos - 1, cmark_chunk_literal(ELLIPSES));
} else {
- return make_str(subj->mem, cmark_chunk_literal(".."));
+ return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal(".."));
}
} else {
- return make_str(subj->mem, cmark_chunk_literal("."));
+ return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("."));
}
}
@@ -643,6 +659,10 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener,
}
cmark_node_insert_after(opener_inl, emph);
+ emph->start_line = emph->end_line = subj->line;
+ emph->start_column = opener_inl->start_column + subj->column_offset;
+ emph->end_column = closer_inl->end_column + subj->column_offset;
+
// if opener has 0 characters, remove it and its associated inline
if (opener_num_chars == 0) {
cmark_node_free(opener_inl);
@@ -669,11 +689,11 @@ static cmark_node *handle_backslash(subject *subj) {
if (cmark_ispunct(
nextchar)) { // only ascii symbols and newline can be escaped
advance(subj);
- return make_str(subj->mem, cmark_chunk_dup(&subj->input, subj->pos - 1, 1));
+ return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_dup(&subj->input, subj->pos - 1, 1));
} else if (!is_eof(subj) && skip_line_end(subj)) {
return make_linebreak(subj->mem);
} else {
- return make_str(subj->mem, cmark_chunk_literal("\\"));
+ return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("\\"));
}
}
@@ -689,10 +709,10 @@ static cmark_node *handle_entity(subject *subj) {
subj->input.len - subj->pos);
if (len == 0)
- return make_str(subj->mem, cmark_chunk_literal("&"));
+ return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("&"));
subj->pos += len;
- return make_str(subj->mem, cmark_chunk_buf_detach(&ent));
+ return make_str(subj, subj->pos - 1 - len, subj->pos - 1, cmark_chunk_buf_detach(&ent));
}
// Clean a URL: remove surrounding whitespace, and remove \ that escape
@@ -751,7 +771,7 @@ static cmark_node *handle_pointy_brace(subject *subj) {
contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1);
subj->pos += matchlen;
- return make_autolink(subj->mem, contents, 0);
+ return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 0);
}
// next try to match an email autolink
@@ -760,7 +780,7 @@ static cmark_node *handle_pointy_brace(subject *subj) {
contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1);
subj->pos += matchlen;
- return make_autolink(subj->mem, contents, 1);
+ return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 1);
}
// finally, try to match an html tag
@@ -768,11 +788,11 @@ static cmark_node *handle_pointy_brace(subject *subj) {
if (matchlen > 0) {
contents = cmark_chunk_dup(&subj->input, subj->pos - 1, matchlen + 1);
subj->pos += matchlen;
- return make_raw_html(subj->mem, contents);
+ return make_raw_html(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents);
}
// if nothing matches, just return the opening <:
- return make_str(subj->mem, cmark_chunk_literal("<"));
+ return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("<"));
}
// Parse a link label. Returns 1 if successful.
@@ -908,13 +928,13 @@ static cmark_node *handle_close_bracket(subject *subj) {
opener = subj->last_bracket;
if (opener == NULL) {
- return make_str(subj->mem, cmark_chunk_literal("]"));
+ return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
}
if (!opener->active) {
// take delimiter off stack
pop_bracket(subj);
- return make_str(subj->mem, cmark_chunk_literal("]"));
+ return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
}
// If we got here, we matched a potential link/image text.
@@ -991,12 +1011,15 @@ noMatch:
// If we fall through to here, it means we didn't match a link:
pop_bracket(subj); // remove this opener from delimiter list
subj->pos = initial_pos;
- return make_str(subj->mem, cmark_chunk_literal("]"));
+ return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
match:
inl = make_simple(subj->mem, is_image ? CMARK_NODE_IMAGE : CMARK_NODE_LINK);
inl->as.link.url = url;
inl->as.link.title = title;
+ inl->start_line = inl->end_line = subj->line;
+ inl->start_column = opener->inl_text->start_column;
+ inl->end_column = subj->pos + subj->column_offset + subj->block_offset;
cmark_node_insert_before(opener->inl_text, inl);
// Add link text:
tmp = opener->inl_text->next;
@@ -1043,6 +1066,8 @@ static cmark_node *handle_newline(subject *subj) {
if (peek_at(subj, subj->pos) == '\n') {
advance(subj);
}
+ ++subj->line;
+ subj->column_offset = -subj->pos;
// skip spaces at beginning of line
skip_spaces(subj);
if (nlpos > 1 && peek_at(subj, nlpos - 1) == ' ' &&
@@ -1102,7 +1127,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) {
cmark_node *new_inl = NULL;
cmark_chunk contents;
unsigned char c;
- bufsize_t endpos;
+ bufsize_t startpos, endpos;
c = peek_char(subj);
if (c == 0) {
return 0;
@@ -1138,7 +1163,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) {
break;
case '[':
advance(subj);
- new_inl = make_str(subj->mem, cmark_chunk_literal("["));
+ new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("["));
push_bracket(subj, false, new_inl);
break;
case ']':
@@ -1148,15 +1173,16 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) {
advance(subj);
if (peek_char(subj) == '[') {
advance(subj);
- new_inl = make_str(subj->mem, cmark_chunk_literal("!["));
+ new_inl = make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal("!["));
push_bracket(subj, true, new_inl);
} else {
- new_inl = make_str(subj->mem, cmark_chunk_literal("!"));
+ new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("!"));
}
break;
default:
endpos = subject_find_special_char(subj, options);
contents = cmark_chunk_dup(&subj->input, subj->pos, endpos - subj->pos);
+ startpos = subj->pos;
subj->pos = endpos;
// if we're at a newline, strip trailing spaces.
@@ -1164,7 +1190,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) {
cmark_chunk_rtrim(&contents);
}
- new_inl = make_str(subj->mem, contents);
+ new_inl = make_str(subj, startpos, endpos - 1, contents);
}
if (new_inl != NULL) {
cmark_node_append_child(parent, new_inl);
@@ -1177,7 +1203,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) {
extern void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent,
cmark_reference_map *refmap, int options) {
subject subj;
- subject_from_buf(mem, &subj, &parent->content, refmap);
+ subject_from_buf(mem, parent->start_line, parent->start_column - 1, &subj, &parent->content, refmap);
cmark_chunk_rtrim(&subj.input);
while (!is_eof(&subj) && parse_inline(&subj, parent, options))
@@ -1216,7 +1242,7 @@ bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input,
bufsize_t matchlen = 0;
bufsize_t beforetitle;
- subject_from_buf(mem, &subj, input, NULL);
+ subject_from_buf(mem, -1, 0, &subj, input, NULL);
// parse label:
if (!link_label(&subj, &lab) || lab.len == 0)
diff --git a/src/iterator.c b/src/iterator.c
index 24423a2..f5cd802 100644
--- a/src/iterator.c
+++ b/src/iterator.c
@@ -106,6 +106,7 @@ void cmark_consolidate_text_nodes(cmark_node *root) {
while (tmp && tmp->type == CMARK_NODE_TEXT) {
cmark_iter_next(iter); // advance pointer
cmark_strbuf_put(&buf, tmp->as.literal.data, tmp->as.literal.len);
+ cur->end_column = tmp->end_column;
next = tmp->next;
cmark_node_free(tmp);
tmp = next;