summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMathieu Duponchelle <MathieuDuponchelle@users.noreply.github.com>2016-12-20 22:00:17 +0100
committerJohn MacFarlane <jgm@berkeley.edu>2016-12-20 16:00:17 -0500
commit9e643720ec903f3b448bd2589a0c02c2514805ae (patch)
tree5ee8793c56f141821b0039920c2f7cd0b8b544f9 /src
parent29c46c5aeda66e9c454ac8d802e65692d0bab566 (diff)
More sourcepos! (#169)
* open_new_blocks: always create child before advancing offset * Source map * Extent's typology * In-depth python bindings
Diffstat (limited to 'src')
-rw-r--r--src/CMakeLists.txt2
-rw-r--r--src/blocks.c186
-rw-r--r--src/cmark.c5
-rw-r--r--src/cmark.h60
-rw-r--r--src/inlines.c143
-rw-r--r--src/inlines.h11
-rw-r--r--src/parser.h4
-rw-r--r--src/source_map.c293
-rw-r--r--src/source_map.h66
9 files changed, 704 insertions, 66 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 0cb6530..b75c0c7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -18,6 +18,7 @@ set(HEADERS
houdini.h
cmark_ctype.h
render.h
+ source_map.h
)
set(LIBRARY_SOURCES
cmark.c
@@ -40,6 +41,7 @@ set(LIBRARY_SOURCES
houdini_html_e.c
houdini_html_u.c
cmark_ctype.c
+ source_map.c
${HEADERS}
)
diff --git a/src/blocks.c b/src/blocks.c
index 99dd082..1c1d160 100644
--- a/src/blocks.c
+++ b/src/blocks.c
@@ -28,6 +28,10 @@
#define MIN(x, y) ((x < y) ? x : y)
#endif
+#ifndef MAX /* larger of x and y; each argument may be evaluated twice, so pass no side effects */
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+#endif
+
#define peek_at(i, n) (i)->data[n]
static bool S_last_line_blank(const cmark_node *node) {
@@ -93,6 +97,7 @@ cmark_parser *cmark_parser_new_with_mem(int options, cmark_mem *mem) {
parser->root = document;
parser->current = document;
parser->line_number = 0;
+ parser->line_offset = 0;
parser->offset = 0;
parser->column = 0;
parser->first_nonspace = 0;
@@ -103,6 +108,7 @@ cmark_parser *cmark_parser_new_with_mem(int options, cmark_mem *mem) {
parser->last_line_length = 0;
parser->options = options;
parser->last_buffer_ended_with_cr = false;
+ parser->source_map = source_map_new(mem);
return parser;
}
@@ -116,6 +122,7 @@ void cmark_parser_free(cmark_parser *parser) {
cmark_mem *mem = parser->mem;
cmark_strbuf_free(&parser->curline);
cmark_strbuf_free(&parser->linebuf);
+ source_map_free(parser->source_map);
cmark_reference_map_free(parser->refmap);
mem->free(parser);
}
@@ -255,10 +262,13 @@ static cmark_node *finalize(cmark_parser *parser, cmark_node *b) {
switch (S_type(b)) {
case CMARK_NODE_PARAGRAPH:
+ source_map_start_cursor(parser->source_map, parser->last_paragraph_extent);
while (cmark_strbuf_at(node_content, 0) == '[' &&
(pos = cmark_parse_reference_inline(parser->mem, node_content,
- parser->refmap))) {
-
+ parser->refmap, parser->root,
+ parser->source_map))) {
+ parser->last_paragraph_extent = parser->source_map->cursor;
+ source_map_start_cursor(parser->source_map, parser->last_paragraph_extent);
cmark_strbuf_drop(node_content, pos);
}
if (is_blank(node_content, 0)) {
@@ -266,7 +276,6 @@ static cmark_node *finalize(cmark_parser *parser, cmark_node *b) {
cmark_node_free(b);
}
break;
-
case CMARK_NODE_CODE_BLOCK:
if (!b->as.code.fenced) { // indented code
remove_trailing_blank_lines(node_content);
@@ -361,21 +370,32 @@ static cmark_node *add_child(cmark_parser *parser, cmark_node *parent,
// Walk through node and all children, recursively, parsing
// string content into inline content where appropriate.
-static void process_inlines(cmark_mem *mem, cmark_node *root,
- cmark_reference_map *refmap, int options) {
- cmark_iter *iter = cmark_iter_new(root);
+static void process_inlines(cmark_parser *parser) { // parse inlines for every inline-containing node under parser->root
+ cmark_iter *iter = cmark_iter_new(parser->root);
cmark_node *cur;
cmark_event_type ev_type;
+ cmark_source_extent *cur_extent = parser->source_map->head; // extent list is consumed front to back alongside the tree walk
while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
cur = cmark_iter_get_node(iter);
if (ev_type == CMARK_EVENT_ENTER) {
if (contains_inlines(S_type(cur))) {
- cmark_parse_inlines(mem, cur, refmap, options);
+ while (cur_extent && cur_extent->node != cur) { // stitch each extent that precedes the first one attached to cur
+ cur_extent = source_map_stitch_extent(parser->source_map, cur_extent, parser->root, parser->line_offset)->next;
+ }
+
+ assert(cur_extent); // cur is expected to own an extent; this check disappears when NDEBUG is defined
+
+ source_map_start_cursor(parser->source_map, cur_extent); // presumably sets source_map->cursor to cur's extent - confirm in source_map.c
+ cmark_parse_inlines(parser->mem, cur, parser->refmap, parser->options, parser->source_map, parser->line_offset);
}
}
}
+ while (cur_extent) { // stitch the extents left over once the traversal is done
+ cur_extent = source_map_stitch_extent(parser->source_map, cur_extent, parser->root, parser->line_offset)->next;
+ }
+
cmark_iter_free(iter);
}
@@ -482,7 +502,10 @@ static cmark_node *finalize_document(cmark_parser *parser) {
}
finalize(parser, parser->root);
- process_inlines(parser->mem, parser->root, parser->refmap, parser->options);
+
+ process_inlines(parser);
+
+ assert(source_map_check(parser->source_map, parser->line_offset));
return parser->root;
}
@@ -524,6 +547,7 @@ void cmark_parser_feed(cmark_parser *parser, const char *buffer, size_t len) {
static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer,
size_t len, bool eof) {
const unsigned char *end = buffer + len;
+ const unsigned char *skipped;
static const uint8_t repl[] = {239, 191, 189};
if (parser->last_buffer_ended_with_cr && *buffer == '\n') {
@@ -534,6 +558,7 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer,
while (buffer < end) {
const unsigned char *eol;
bufsize_t chunk_len;
+ bufsize_t linebuf_size = 0;
bool process = false;
for (eol = buffer; eol < end; ++eol) {
if (S_is_line_end_char(*eol)) {
@@ -551,6 +576,7 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer,
chunk_len = (eol - buffer);
if (process) {
if (parser->linebuf.size > 0) {
+ linebuf_size = cmark_strbuf_len(&parser->linebuf);
cmark_strbuf_put(&parser->linebuf, buffer, chunk_len);
S_process_line(parser, parser->linebuf.ptr, parser->linebuf.size);
cmark_strbuf_clear(&parser->linebuf);
@@ -569,6 +595,8 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer,
}
buffer += chunk_len;
+ skipped = buffer;
+
if (buffer < end) {
if (*buffer == '\0') {
// skip over NULL
@@ -584,6 +612,11 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer,
buffer++;
}
}
+ chunk_len += buffer - skipped;
+ chunk_len += linebuf_size;
+
+ if (process)
+ parser->line_offset += chunk_len;
}
}
@@ -643,11 +676,13 @@ static void S_find_first_nonspace(cmark_parser *parser, cmark_chunk *input) {
// indicates a number of columns; otherwise, a number of bytes.
// If advancing a certain number of columns partially consumes
// a tab character, parser->partially_consumed_tab is set to true.
-static void S_advance_offset(cmark_parser *parser, cmark_chunk *input,
- bufsize_t count, bool columns) {
+static void S_advance_offset(cmark_parser *parser, cmark_node *container, cmark_extent_type type,
+ cmark_chunk *input, bufsize_t count, bool columns) {
char c;
int chars_to_tab;
int chars_to_advance;
+ int initial_pos = parser->offset + parser->line_offset;
+
while (count > 0 && (c = peek_at(input, parser->offset))) {
if (c == '\t') {
chars_to_tab = TAB_STOP - (parser->column % TAB_STOP);
@@ -670,6 +705,8 @@ static void S_advance_offset(cmark_parser *parser, cmark_chunk *input,
count -= 1;
}
}
+
+ source_map_append_extent(parser->source_map, initial_pos, parser->offset + parser->line_offset, container, type);
}
static bool S_last_child_is_open(cmark_node *container) {
@@ -677,7 +714,7 @@ static bool S_last_child_is_open(cmark_node *container) {
(container->last_child->flags & CMARK_NODE__OPEN);
}
-static bool parse_block_quote_prefix(cmark_parser *parser, cmark_chunk *input) {
+static bool parse_block_quote_prefix(cmark_parser *parser, cmark_chunk *input, cmark_node *container) {
bool res = false;
bufsize_t matched = 0;
@@ -685,10 +722,10 @@ static bool parse_block_quote_prefix(cmark_parser *parser, cmark_chunk *input) {
parser->indent <= 3 && peek_at(input, parser->first_nonspace) == '>';
if (matched) {
- S_advance_offset(parser, input, parser->indent + 1, true);
+ S_advance_offset(parser, container, CMARK_EXTENT_OPENER, input, parser->indent + 1, true);
if (S_is_space_or_tab(peek_at(input, parser->offset))) {
- S_advance_offset(parser, input, 1, true);
+ S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, 1, true);
}
res = true;
@@ -702,7 +739,7 @@ static bool parse_node_item_prefix(cmark_parser *parser, cmark_chunk *input,
if (parser->indent >=
container->as.list.marker_offset + container->as.list.padding) {
- S_advance_offset(parser, input, container->as.list.marker_offset +
+ S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, container->as.list.marker_offset +
container->as.list.padding,
true);
res = true;
@@ -710,7 +747,7 @@ static bool parse_node_item_prefix(cmark_parser *parser, cmark_chunk *input,
// if container->first_child is NULL, then the opening line
// of the list item was blank after the list marker; in this
// case, we are done with the list item.
- S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
+ S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, parser->first_nonspace - parser->offset,
false);
res = true;
}
@@ -724,10 +761,10 @@ static bool parse_code_block_prefix(cmark_parser *parser, cmark_chunk *input,
if (!container->as.code.fenced) { // indented
if (parser->indent >= CODE_INDENT) {
- S_advance_offset(parser, input, CODE_INDENT, true);
+ S_advance_offset(parser, container, CMARK_EXTENT_OPENER, input, CODE_INDENT, true);
res = true;
} else if (parser->blank) {
- S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
+ S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, parser->first_nonspace - parser->offset,
false);
res = true;
}
@@ -743,14 +780,14 @@ static bool parse_code_block_prefix(cmark_parser *parser, cmark_chunk *input,
// closing fence - and since we're at
// the end of a line, we can stop processing it:
*should_continue = false;
- S_advance_offset(parser, input, matched, false);
+ S_advance_offset(parser, container, CMARK_EXTENT_OPENER, input, matched, false);
parser->current = finalize(parser, container);
} else {
// skip opt. spaces of fence parser->offset
int i = container->as.code.fence_offset;
while (i > 0 && S_is_space_or_tab(peek_at(input, parser->offset))) {
- S_advance_offset(parser, input, 1, true);
+ S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, 1, true);
i--;
}
res = true;
@@ -807,7 +844,7 @@ static cmark_node *check_open_blocks(cmark_parser *parser, cmark_chunk *input,
switch (cont_type) {
case CMARK_NODE_BLOCK_QUOTE:
- if (!parse_block_quote_prefix(parser, input))
+ if (!parse_block_quote_prefix(parser, input, container))
goto done;
break;
case CMARK_NODE_ITEM:
@@ -867,29 +904,26 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container,
indented = parser->indent >= CODE_INDENT;
if (!indented && peek_at(input, parser->first_nonspace) == '>') {
+ *container = add_child(parser, *container, CMARK_NODE_BLOCK_QUOTE,
+ parser->first_nonspace + 1);
- bufsize_t blockquote_startpos = parser->first_nonspace;
-
- S_advance_offset(parser, input,
+ S_advance_offset(parser, *container, CMARK_EXTENT_OPENER, input,
parser->first_nonspace + 1 - parser->offset, false);
// optional following character
if (S_is_space_or_tab(peek_at(input, parser->offset))) {
- S_advance_offset(parser, input, 1, true);
+ S_advance_offset(parser, *container, CMARK_EXTENT_BLANK, input, 1, true);
}
- *container = add_child(parser, *container, CMARK_NODE_BLOCK_QUOTE,
- blockquote_startpos + 1);
} else if (!indented && (matched = scan_atx_heading_start(
input, parser->first_nonspace))) {
bufsize_t hashpos;
int level = 0;
- bufsize_t heading_startpos = parser->first_nonspace;
- S_advance_offset(parser, input,
+ *container = add_child(parser, *container, CMARK_NODE_HEADING,
+ parser->first_nonspace + 1);
+ S_advance_offset(parser, *container, CMARK_EXTENT_OPENER, input,
parser->first_nonspace + matched - parser->offset,
false);
- *container = add_child(parser, *container, CMARK_NODE_HEADING,
- heading_startpos + 1);
hashpos = cmark_chunk_strchr(input, '#', parser->first_nonspace);
@@ -911,7 +945,7 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container,
(*container)->as.code.fence_offset =
(int8_t)(parser->first_nonspace - parser->offset);
(*container)->as.code.info = cmark_chunk_literal("");
- S_advance_offset(parser, input,
+ S_advance_offset(parser, *container, CMARK_EXTENT_OPENER, input,
parser->first_nonspace + matched - parser->offset,
false);
@@ -931,14 +965,14 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container,
(*container)->type = (uint16_t)CMARK_NODE_HEADING;
(*container)->as.heading.level = lev;
(*container)->as.heading.setext = true;
- S_advance_offset(parser, input, input->len - 1 - parser->offset, false);
+ S_advance_offset(parser, *container, CMARK_EXTENT_CLOSER, input, input->len - 1 - parser->offset, false);
} else if (!indented &&
!(cont_type == CMARK_NODE_PARAGRAPH && !all_matched) &&
(matched = scan_thematic_break(input, parser->first_nonspace))) {
// it's only now that we know the line is not part of a setext heading:
*container = add_child(parser, *container, CMARK_NODE_THEMATIC_BREAK,
parser->first_nonspace + 1);
- S_advance_offset(parser, input, input->len - 1 - parser->offset, false);
+ S_advance_offset(parser, *container, CMARK_EXTENT_CONTENT, input, input->len - 1 - parser->offset, false);
} else if ((!indented || cont_type == CMARK_NODE_LIST) &&
(matched = parse_list_marker(
parser->mem, input, parser->first_nonspace,
@@ -946,20 +980,37 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container,
// Note that we can have new list items starting with >= 4
// spaces indent, as long as the list container is still open.
+ cmark_node *list = NULL;
+ cmark_node *item = NULL;
+ cmark_source_extent *save_source_map_tail;
int i = 0;
+ if (cont_type != CMARK_NODE_LIST ||
+ !lists_match(&((*container)->as.list), data)) {
+ *container = add_child(parser, *container, CMARK_NODE_LIST,
+ parser->first_nonspace + 1);
+ list = *container;
+
+ }
+
+ // add the list item
+ *container = add_child(parser, *container, CMARK_NODE_ITEM,
+ parser->first_nonspace + 1);
+ item = *container;
+
// compute padding:
- S_advance_offset(parser, input,
+ S_advance_offset(parser, *container, CMARK_EXTENT_OPENER, input,
parser->first_nonspace + matched - parser->offset,
false);
save_partially_consumed_tab = parser->partially_consumed_tab;
save_offset = parser->offset;
save_column = parser->column;
+ save_source_map_tail = parser->source_map->tail;
while (parser->column - save_column <= 5 &&
S_is_space_or_tab(peek_at(input, parser->offset))) {
- S_advance_offset(parser, input, 1, true);
+ S_advance_offset(parser, *container, CMARK_EXTENT_BLANK, input, 1, true);
}
i = parser->column - save_column;
@@ -969,9 +1020,14 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container,
data->padding = matched + 1;
parser->offset = save_offset;
parser->column = save_column;
+ if (save_source_map_tail) {
+ cmark_source_extent *tmp_extent;
+ for (tmp_extent = save_source_map_tail->next; tmp_extent; tmp_extent = source_map_free_extent(parser->source_map, tmp_extent));
+ }
+
parser->partially_consumed_tab = save_partially_consumed_tab;
if (i > 0) {
- S_advance_offset(parser, input, 1, true);
+ S_advance_offset(parser, *container, CMARK_EXTENT_BLANK, input, 1, true);
}
} else {
data->padding = matched + i;
@@ -982,22 +1038,14 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container,
data->marker_offset = parser->indent;
- if (cont_type != CMARK_NODE_LIST ||
- !lists_match(&((*container)->as.list), data)) {
- *container = add_child(parser, *container, CMARK_NODE_LIST,
- parser->first_nonspace + 1);
-
- memcpy(&((*container)->as.list), data, sizeof(*data));
- }
-
- // add the list item
- *container = add_child(parser, *container, CMARK_NODE_ITEM,
- parser->first_nonspace + 1);
/* TODO: static */
- memcpy(&((*container)->as.list), data, sizeof(*data));
+ if (list)
+ memcpy(&(list->as.list), data, sizeof(*data));
+ if (item)
+ memcpy(&(item->as.list), data, sizeof(*data));
+
parser->mem->free(data);
} else if (indented && !maybe_lazy && !parser->blank) {
- S_advance_offset(parser, input, CODE_INDENT, true);
*container = add_child(parser, *container, CMARK_NODE_CODE_BLOCK,
parser->offset + 1);
(*container)->as.code.fenced = false;
@@ -1006,6 +1054,7 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container,
(*container)->as.code.fence_offset = 0;
(*container)->as.code.info = cmark_chunk_literal("");
+ S_advance_offset(parser, *container, CMARK_EXTENT_OPENER, input, CODE_INDENT, true);
} else {
break;
}
@@ -1070,6 +1119,11 @@ static void add_text_to_container(cmark_parser *parser, cmark_node *container,
}
if (S_type(container) == CMARK_NODE_CODE_BLOCK) {
+ source_map_append_extent(parser->source_map,
+ parser->offset + parser->line_offset,
+ parser->line_offset + input->len,
+ container,
+ CMARK_EXTENT_CONTENT);
add_line(container, input, parser);
} else if (S_type(container) == CMARK_NODE_HTML_BLOCK) {
add_line(container, input, parser);
@@ -1110,22 +1164,43 @@ static void add_text_to_container(cmark_parser *parser, cmark_node *container,
container = finalize(parser, container);
assert(parser->current != NULL);
}
+ source_map_append_extent(parser->source_map,
+ parser->offset + parser->line_offset,
+ parser->line_offset + input->len,
+ container,
+ CMARK_EXTENT_CONTENT);
} else if (parser->blank) {
- // ??? do nothing
+ source_map_append_extent(parser->source_map,
+ parser->line_offset + parser->offset,
+ parser->line_offset + input->len,
+ container,
+ CMARK_EXTENT_BLANK);
} else if (accepts_lines(S_type(container))) {
+ bufsize_t initial_len = input->len;
+ bool chopped = false;
+
if (S_type(container) == CMARK_NODE_HEADING &&
container->as.heading.setext == false) {
chop_trailing_hashtags(input);
+ chopped = true;
}
- S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
+ S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, parser->first_nonspace - parser->offset,
false);
add_line(container, input, parser);
+
+ if (chopped)
+ source_map_append_extent(parser->source_map,
+ MAX(parser->line_offset + parser->offset, parser->line_offset + input->len),
+ parser->line_offset + initial_len,
+ container,
+ CMARK_EXTENT_CLOSER);
} else {
// create paragraph container for line
container = add_child(parser, container, CMARK_NODE_PARAGRAPH,
parser->first_nonspace + 1);
- S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
+ S_advance_offset(parser, container, CMARK_EXTENT_OPENER, input, parser->first_nonspace - parser->offset,
false);
+ parser->last_paragraph_extent = parser->source_map->tail;
add_line(container, input, parser);
}
@@ -1187,6 +1262,7 @@ finished:
cmark_node *cmark_parser_finish(cmark_parser *parser) {
if (parser->linebuf.size) {
S_process_line(parser, parser->linebuf.ptr, parser->linebuf.size);
+ parser->line_offset += parser->linebuf.size;
cmark_strbuf_clear(&parser->linebuf);
}
@@ -1205,3 +1281,9 @@ cmark_node *cmark_parser_finish(cmark_parser *parser) {
#endif
return parser->root;
}
+
+/* Return the head of the extent list held by parser->source_map,
+ * i.e. the first extent of the parser's source map. */
+cmark_source_extent *cmark_parser_get_first_source_extent(cmark_parser *parser) {
+ cmark_source_extent *first = parser->source_map->head;
+
+ return first;
+}
diff --git a/src/cmark.c b/src/cmark.c
index 0d3bc16..2ef6cb4 100644
--- a/src/cmark.c
+++ b/src/cmark.c
@@ -24,6 +24,11 @@ static void *xrealloc(void *ptr, size_t size) {
return new_ptr;
}
+void cmark_default_mem_free(void *ptr)
+{
+ free(ptr); /* plain C library free(), the same function DEFAULT_MEM_ALLOCATOR installs; exported as a convenience for bindings */
+}
+
cmark_mem DEFAULT_MEM_ALLOCATOR = {xcalloc, xrealloc, free};
char *cmark_markdown_to_html(const char *text, size_t len, int options) {
diff --git a/src/cmark.h b/src/cmark.h
index 6ed7eb0..034f0e6 100644
--- a/src/cmark.h
+++ b/src/cmark.h
@@ -2,6 +2,7 @@
#define CMARK_H
#include <stdio.h>
+#include <stdint.h>
#include <cmark_export.h>
#include <cmark_version.h>
@@ -65,6 +66,21 @@ typedef enum {
CMARK_NODE_LAST_INLINE = CMARK_NODE_IMAGE,
} cmark_node_type;
+typedef enum { /* kind of source span recorded by an extent of the parser's source map */
+ CMARK_EXTENT_NONE,
+ CMARK_EXTENT_OPENER, /* opening syntax, e.g. block quote '>', ATX '#' run, list marker, emphasis opener */
+ CMARK_EXTENT_CLOSER, /* closing syntax, e.g. setext underline, trailing '#' run of a heading, emphasis closer */
+ CMARK_EXTENT_BLANK, /* whitespace, including blank lines and trimmed trailing space */
+ CMARK_EXTENT_CONTENT, /* the node's own text, e.g. code block lines or inline text */
+ CMARK_EXTENT_PUNCTUATION, /* link and image delimiters such as '[' and ']' */
+ CMARK_EXTENT_LINK_DESTINATION, /* destination (URL) of an inline link */
+ CMARK_EXTENT_LINK_TITLE, /* title of an inline link */
+ CMARK_EXTENT_LINK_LABEL, /* label of a reference-style link */
+ CMARK_EXTENT_REFERENCE_DESTINATION, /* presumably the destination of a link reference definition - confirm in inlines.c */
+ CMARK_EXTENT_REFERENCE_LABEL, /* presumably the label of a link reference definition - confirm in inlines.c */
+ CMARK_EXTENT_REFERENCE_TITLE, /* presumably the title of a link reference definition - confirm in inlines.c */
+} cmark_extent_type;
+
/* For backwards compatibility: */
#define CMARK_NODE_HEADER CMARK_NODE_HEADING
#define CMARK_NODE_HRULE CMARK_NODE_THEMATIC_BREAK
@@ -86,6 +102,7 @@ typedef enum {
typedef struct cmark_node cmark_node;
typedef struct cmark_parser cmark_parser;
typedef struct cmark_iter cmark_iter;
+typedef struct cmark_source_extent cmark_source_extent;
/**
* ## Custom memory allocator support
@@ -100,6 +117,11 @@ typedef struct cmark_mem {
void (*free)(void *);
} cmark_mem;
+/** Convenience function for bindings.
+ */
+CMARK_EXPORT
+void cmark_default_mem_free(void *ptr);
+
/**
* ## Creating and Destroying Nodes
*/
@@ -477,6 +499,11 @@ void cmark_parser_feed(cmark_parser *parser, const char *buffer, size_t len);
CMARK_EXPORT
cmark_node *cmark_parser_finish(cmark_parser *parser);
+/** Return a pointer to the first extent of the parser's source map
+ */
+CMARK_EXPORT
+cmark_source_extent *cmark_parser_get_first_source_extent(cmark_parser *parser);
+
/** Parse a CommonMark document in 'buffer' of length 'len'.
* Returns a pointer to a tree of nodes. The memory allocated for
* the node tree should be released using 'cmark_node_free'
@@ -492,6 +519,39 @@ cmark_node *cmark_parse_document(const char *buffer, size_t len, int options);
CMARK_EXPORT
cmark_node *cmark_parse_file(FILE *f, int options);
+/**
+ * ## Source map API
+ */
+
+/** Return the index, in bytes, of the start of this extent.
+ */
+CMARK_EXPORT
+uint64_t cmark_source_extent_get_start(cmark_source_extent *extent);
+
+/** Return the index, in bytes, of the stop of this extent. The stop
+ * index is exclusive: it is not included in the extent.
+ */
+CMARK_EXPORT
+uint64_t cmark_source_extent_get_stop(cmark_source_extent *extent);
+
+/** Return the extent immediately following 'extent'.
+ */
+CMARK_EXPORT
+cmark_source_extent *cmark_source_extent_get_next(cmark_source_extent *extent);
+
+/** Return the extent immediately preceding 'extent'.
+ */
+CMARK_EXPORT
+cmark_source_extent *cmark_source_extent_get_previous(cmark_source_extent *extent);
+
+/** Return the node 'extent' maps to.
+ */
+CMARK_EXPORT
+cmark_node *cmark_source_extent_get_node(cmark_source_extent *extent);
+
+/** Return the type of 'extent'.
+ */
+CMARK_EXPORT
+cmark_extent_type cmark_source_extent_get_type(cmark_source_extent *extent);
+
+/** Return a string representation of 'extent'.
+ * NOTE(review): the function name suggests this is the string form of the
+ * extent's type rather than of the whole extent; confirm and reword.
+ */
+CMARK_EXPORT
+const char *cmark_source_extent_get_type_string(cmark_source_extent *extent);
+
+
/**
* ## Rendering
*/
diff --git a/src/inlines.c b/src/inlines.c
index c9cccfe..96da28b 100644
--- a/src/inlines.c
+++ b/src/inlines.c
@@ -13,6 +13,10 @@
#include "scanners.h"
#include "inlines.h"
+#ifndef MIN /* smaller of x and y; each argument may be evaluated twice, so pass no side effects */
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+#endif
+
static const char *EMDASH = "\xE2\x80\x94";
static const char *ENDASH = "\xE2\x80\x93";
static const char *ELLIPSES = "\xE2\x80\xA6";
@@ -39,6 +43,7 @@ typedef struct delimiter {
unsigned char delim_char;
bool can_open;
bool can_close;
+ cmark_source_extent *extent;
} delimiter;
typedef struct bracket {
@@ -49,6 +54,7 @@ typedef struct bracket {
bool image;
bool active;
bool bracket_after;
+ cmark_source_extent *extent;
} bracket;
typedef struct {
@@ -60,6 +66,7 @@ typedef struct {
bracket *last_bracket;
bufsize_t backticks[MAXBACKTICKS + 1];
bool scanned_for_backticks;
+ cmark_source_map *source_map;
} subject;
static CMARK_INLINE bool S_is_line_end_char(char c) {
@@ -72,7 +79,7 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener,
static int parse_inline(subject *subj, cmark_node *parent, int options);
static void subject_from_buf(cmark_mem *mem, subject *e, cmark_strbuf *buffer,
- cmark_reference_map *refmap);
+ cmark_reference_map *refmap, cmark_source_map *source_map);
static bufsize_t subject_find_special_char(subject *subj, int options);
// Create an inline with a literal string value.
@@ -148,7 +155,7 @@ static CMARK_INLINE cmark_node *make_autolink(cmark_mem *mem, cmark_chunk url,
}
static void subject_from_buf(cmark_mem *mem, subject *e, cmark_strbuf *buffer,
- cmark_reference_map *refmap) {
+ cmark_reference_map *refmap, cmark_source_map *source_map) {
int i;
e->mem = mem;
e->input.data = buffer->ptr;
@@ -158,6 +165,8 @@ static void subject_from_buf(cmark_mem *mem, subject *e, cmark_strbuf *buffer,
e->refmap = refmap;
e->last_delim = NULL;
e->last_bracket = NULL;
+ e->source_map = source_map;
+
for (i=0; i <= MAXBACKTICKS; i++) {
e->backticks[i] = 0;
}
@@ -404,6 +413,7 @@ static void push_delimiter(subject *subj, unsigned char c, bool can_open,
if (delim->previous != NULL) {
delim->previous->next = delim;
}
+ delim->extent = NULL;
subj->last_delim = delim;
}
@@ -419,11 +429,12 @@ static void push_bracket(subject *subj, bool image, cmark_node *inl_text) {
b->previous_delimiter = subj->last_delim;
b->position = subj->pos;
b->bracket_after = false;
+ b->extent = NULL;
subj->last_bracket = b;
}
// Assumes the subject has a c at the current position.
-static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) {
+static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart, bool *pushed) {
bufsize_t numdelims;
cmark_node *inl_text;
bool can_open, can_close;
@@ -444,6 +455,9 @@ static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) {
if ((can_open || can_close) && (!(c == '\'' || c == '"') || smart)) {
push_delimiter(subj, c, can_open, can_close, inl_text);
+ *pushed = true;
+ } else {
+ *pushed = false;
}
return inl_text;
@@ -607,6 +621,7 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener,
bufsize_t opener_num_chars = opener_inl->as.literal.len;
bufsize_t closer_num_chars = closer_inl->as.literal.len;
cmark_node *tmp, *tmpnext, *emph;
+ cmark_source_extent *tmp_extent;
// calculate the actual number of characters used from this closer
if (closer_num_chars < 3 || opener_num_chars < 3) {
@@ -642,9 +657,28 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener,
}
cmark_node_insert_after(opener_inl, emph);
+ tmp_extent = closer->extent->prev;
+
+ source_map_insert_extent(subj->source_map,
+ opener->extent,
+ opener->extent->stop - use_delims,
+ opener->extent->stop,
+ emph,
+ CMARK_EXTENT_OPENER);
+ opener->extent->stop -= use_delims;
+
+ source_map_insert_extent(subj->source_map,
+ tmp_extent,
+ closer->extent->start,
+ closer->extent->start + use_delims,
+ emph,
+ CMARK_EXTENT_CLOSER);
+ closer->extent->start += use_delims;
+
// if opener has 0 characters, remove it and its associated inline
if (opener_num_chars == 0) {
cmark_node_free(opener_inl);
+ source_map_free_extent(subj->source_map, opener->extent);
remove_delimiter(subj, opener);
}
@@ -654,6 +688,7 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener,
cmark_node_free(closer_inl);
// remove closer from list
tmp_delim = closer->next;
+ source_map_free_extent(subj->source_map, closer->extent);
remove_delimiter(subj, closer);
closer = tmp_delim;
}
@@ -876,6 +911,8 @@ static cmark_node *handle_close_bracket(subject *subj) {
int found_label;
cmark_node *tmp, *tmpnext;
bool is_image;
+ bool is_inline = false;
+ bool is_shortcut = false;
advance(subj); // advance past ]
initial_pos = subj->pos;
@@ -926,6 +963,7 @@ static cmark_node *handle_close_bracket(subject *subj) {
title = cmark_clean_title(subj->mem, &title_chunk);
cmark_chunk_free(subj->mem, &url_chunk);
cmark_chunk_free(subj->mem, &title_chunk);
+ is_inline = true;
goto match;
} else {
@@ -948,6 +986,7 @@ static cmark_node *handle_close_bracket(subject *subj) {
cmark_chunk_free(subj->mem, &raw_label);
raw_label = cmark_chunk_dup(&subj->input, opener->position,
initial_pos - opener->position - 1);
+ is_shortcut = true;
found_label = true;
}
@@ -977,6 +1016,28 @@ match:
cmark_node_insert_before(opener->inl_text, inl);
// Add link text:
tmp = opener->inl_text->next;
+ assert(opener->extent);
+
+ opener->extent->node = inl;
+ opener->extent->type = CMARK_EXTENT_PUNCTUATION;
+
+ source_map_splice_extent(subj->source_map, initial_pos - 1, initial_pos, inl, CMARK_EXTENT_PUNCTUATION);
+ if (is_inline) {
+ source_map_splice_extent(subj->source_map, after_link_text_pos, starturl, inl, CMARK_EXTENT_PUNCTUATION);
+ source_map_splice_extent(subj->source_map, starturl, endurl, inl, CMARK_EXTENT_LINK_DESTINATION);
+ if (endtitle != starttitle) {
+ source_map_splice_extent(subj->source_map, endurl, starttitle, inl, CMARK_EXTENT_BLANK);
+ source_map_splice_extent(subj->source_map, starttitle, endtitle, inl, CMARK_EXTENT_LINK_TITLE);
+ source_map_splice_extent(subj->source_map, endtitle, subj->pos, inl, CMARK_EXTENT_BLANK);
+ } else {
+ source_map_splice_extent(subj->source_map, endurl, subj->pos, inl, CMARK_EXTENT_BLANK);
+ }
+ } else if (!is_shortcut) {
+ source_map_splice_extent(subj->source_map, initial_pos, initial_pos + 1, inl, CMARK_EXTENT_PUNCTUATION);
+ source_map_splice_extent(subj->source_map, initial_pos + 1, subj->pos - 1, inl, CMARK_EXTENT_LINK_LABEL);
+ source_map_splice_extent(subj->source_map, subj->pos - 1, subj->pos, inl, CMARK_EXTENT_PUNCTUATION);
+ }
+
while (tmp) {
tmpnext = tmp->next;
cmark_node_append_child(inl, tmp);
@@ -1080,6 +1141,10 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) {
cmark_chunk contents;
unsigned char c;
bufsize_t endpos;
+ bufsize_t startpos = subj->pos;
+ bool add_extent_to_last_bracket = false;
+ bool add_extent_to_last_delimiter = false;
+
c = peek_char(subj);
if (c == 0) {
return 0;
@@ -1105,7 +1170,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) {
case '_':
case '\'':
case '"':
- new_inl = handle_delim(subj, c, (options & CMARK_OPT_SMART) != 0);
+ new_inl = handle_delim(subj, c, (options & CMARK_OPT_SMART) != 0, &add_extent_to_last_delimiter);
break;
case '-':
new_inl = handle_hyphen(subj, (options & CMARK_OPT_SMART) != 0);
@@ -1117,6 +1182,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) {
advance(subj);
new_inl = make_str(subj->mem, cmark_chunk_literal("["));
push_bracket(subj, false, new_inl);
+ add_extent_to_last_bracket = true;
break;
case ']':
new_inl = handle_close_bracket(subj);
@@ -1127,6 +1193,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) {
advance(subj);
new_inl = make_str(subj->mem, cmark_chunk_literal("!["));
push_bracket(subj, true, new_inl);
+ add_extent_to_last_bracket = true;
} else {
new_inl = make_str(subj->mem, cmark_chunk_literal("!"));
}
@@ -1143,7 +1210,17 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) {
new_inl = make_str(subj->mem, contents);
}
+
if (new_inl != NULL) {
+ cmark_source_extent *extent;
+
+ extent = source_map_splice_extent(subj->source_map, startpos, subj->pos, new_inl, CMARK_EXTENT_CONTENT);
+
+ if (add_extent_to_last_bracket)
+ subj->last_bracket->extent = extent;
+ else if (add_extent_to_last_delimiter)
+ subj->last_delim->extent = extent;
+
cmark_node_append_child(parent, new_inl);
}
@@ -1152,9 +1229,11 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) {
// Parse inlines from parent's string_content, adding as children of parent.
extern void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent,
- cmark_reference_map *refmap, int options) {
+ cmark_reference_map *refmap, int options,
+ cmark_source_map *source_map, uint64_t total_length) {
subject subj;
- subject_from_buf(mem, &subj, &parent->content, refmap);
+ subject_from_buf(mem, &subj, &parent->content, refmap, source_map);
+ bufsize_t initial_len = subj.input.len;
cmark_chunk_rtrim(&subj.input);
while (!is_eof(&subj) && parse_inline(&subj, parent, options))
@@ -1168,6 +1247,13 @@ extern void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent,
while (subj.last_bracket) {
pop_bracket(&subj);
}
+
+ source_map_insert_extent(source_map,
+ source_map->cursor,
+ source_map->cursor->stop,
+ MIN(source_map->cursor->stop + initial_len - subj.input.len, total_length),
+ parent,
+ CMARK_EXTENT_BLANK);
}
// Parse zero or more space characters, including at most one newline.
@@ -1183,22 +1269,30 @@ static void spnl(subject *subj) {
// Return 0 if no reference found, otherwise position of subject
// after reference is parsed.
bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input,
- cmark_reference_map *refmap) {
+ cmark_reference_map *refmap,
+ cmark_node *root,
+ cmark_source_map *source_map) {
subject subj;
+ cmark_node *container = source_map->cursor->node;
+ cmark_source_extent *tmp_extent = source_map->cursor;
cmark_chunk lab;
cmark_chunk url;
cmark_chunk title;
bufsize_t matchlen = 0;
- bufsize_t beforetitle;
+ bufsize_t starttitle, endtitle;
+ bufsize_t endlabel;
+ bufsize_t starturl, endurl;
- subject_from_buf(mem, &subj, input, NULL);
+ subject_from_buf(mem, &subj, input, NULL, source_map);
// parse label:
if (!link_label(&subj, &lab) || lab.len == 0)
return 0;
+ endlabel = subj.pos - 1;
+
// colon:
if (peek_char(&subj) == ':') {
advance(&subj);
@@ -1208,6 +1302,7 @@ bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input,
// parse link url:
spnl(&subj);
+ starturl = subj.pos;
matchlen = manual_scan_link_url(&subj.input, subj.pos);
if (matchlen > 0) {
url = cmark_chunk_dup(&subj.input, subj.pos, matchlen);
@@ -1217,22 +1312,29 @@ bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input,
}
// parse optional link_title
- beforetitle = subj.pos;
+ endurl = subj.pos;
spnl(&subj);
+ starttitle = subj.pos;
matchlen = scan_link_title(&subj.input, subj.pos);
if (matchlen) {
title = cmark_chunk_dup(&subj.input, subj.pos, matchlen);
subj.pos += matchlen;
} else {
- subj.pos = beforetitle;
+ subj.pos = endurl;
+ starttitle = endurl;
+ endtitle = endurl;
title = cmark_chunk_literal("");
}
+ endtitle = subj.pos;
+
// parse final spaces and newline:
skip_spaces(&subj);
if (!skip_line_end(&subj)) {
if (matchlen) { // try rewinding before title
- subj.pos = beforetitle;
+ subj.pos = endurl;
+ starttitle = endurl;
+ endtitle = endurl;
skip_spaces(&subj);
if (!skip_line_end(&subj)) {
return 0;
@@ -1243,5 +1345,22 @@ bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input,
}
// insert reference into refmap
cmark_reference_create(refmap, &lab, &url, &title);
+
+ // Mark the extents of the reference
+ source_map_splice_extent(source_map, 0, 1, root, CMARK_EXTENT_PUNCTUATION);
+ source_map_splice_extent(source_map, 1, endlabel, root, CMARK_EXTENT_REFERENCE_LABEL);
+ source_map_splice_extent(source_map, endlabel, endlabel + 2, root, CMARK_EXTENT_PUNCTUATION);
+ source_map_splice_extent(source_map, endlabel + 2, starturl, root, CMARK_EXTENT_BLANK);
+ source_map_splice_extent(source_map, starturl, endurl, root, CMARK_EXTENT_REFERENCE_DESTINATION);
+ source_map_splice_extent(source_map, endurl, starttitle, root, CMARK_EXTENT_BLANK);
+ source_map_splice_extent(source_map, starttitle, endtitle, root, CMARK_EXTENT_REFERENCE_TITLE);
+ source_map_splice_extent(source_map, endtitle, subj.pos, root, CMARK_EXTENT_BLANK);
+
+ while (tmp_extent != source_map->cursor) {
+ if (tmp_extent->node == container)
+ tmp_extent->node = root;
+ tmp_extent = tmp_extent->next;
+ }
+
return subj.pos;
}
diff --git a/src/inlines.h b/src/inlines.h
index 52be768..8de31b1 100644
--- a/src/inlines.h
+++ b/src/inlines.h
@@ -1,6 +1,10 @@
#ifndef CMARK_INLINES_H
#define CMARK_INLINES_H
+#include "chunk.h"
+#include "references.h"
+#include "source_map.h"
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -9,10 +13,13 @@ cmark_chunk cmark_clean_url(cmark_mem *mem, cmark_chunk *url);
cmark_chunk cmark_clean_title(cmark_mem *mem, cmark_chunk *title);
void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent,
- cmark_reference_map *refmap, int options);
+ cmark_reference_map *refmap, int options,
+ cmark_source_map *source_map, uint64_t total_length);
bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input,
- cmark_reference_map *refmap);
+ cmark_reference_map *refmap,
+ cmark_node *root,
+ cmark_source_map *source_map);
#ifdef __cplusplus
}
diff --git a/src/parser.h b/src/parser.h
index 0c5033b..b28a8a7 100644
--- a/src/parser.h
+++ b/src/parser.h
@@ -5,6 +5,7 @@
#include "node.h"
#include "buffer.h"
#include "memory.h"
+#include "source_map.h"
#ifdef __cplusplus
extern "C" {
@@ -27,9 +28,12 @@ struct cmark_parser {
bool partially_consumed_tab;
cmark_strbuf curline;
bufsize_t last_line_length;
+ bufsize_t line_offset;
cmark_strbuf linebuf;
int options;
bool last_buffer_ended_with_cr;
+ cmark_source_map *source_map;
+ cmark_source_extent *last_paragraph_extent;
};
#ifdef __cplusplus
diff --git a/src/source_map.c b/src/source_map.c
new file mode 100644
index 0000000..db01a21
--- /dev/null
+++ b/src/source_map.c
@@ -0,0 +1,293 @@
+#include <assert.h>
+
+#include "source_map.h"
+
+// Create an empty source map that allocates through `mem`.
+// Relies on mem->calloc returning zeroed storage, so the list pointers,
+// both cursors and the cursor offset all start out as 0/NULL.
+cmark_source_map *source_map_new(cmark_mem *mem) {
+  cmark_source_map *map = (cmark_source_map *)mem->calloc(1, sizeof(*map));
+
+  map->mem = mem;
+
+  return map;
+}
+
+// Release every extent owned by the map, then the map itself.
+void source_map_free(cmark_source_map *self) {
+  cmark_source_extent *extent = self->head;
+
+  // source_map_free_extent unlinks the extent and returns its successor.
+  while (extent)
+    extent = source_map_free_extent(self, extent);
+
+  self->mem->free(self);
+}
+
+// Allocate an extent spanning start..stop for `node` and link it at the tail
+// of the map. Asserts that the span is well ordered and does not begin
+// before the current tail ends. Returns the new extent.
+cmark_source_extent *source_map_append_extent(cmark_source_map *self,
+                                              uint64_t start, uint64_t stop,
+                                              cmark_node *node,
+                                              cmark_extent_type type) {
+  cmark_source_extent *extent;
+
+  assert(start <= stop);
+  assert(!self->tail || self->tail->stop <= start);
+
+  extent = (cmark_source_extent *)self->mem->calloc(1, sizeof(*extent));
+
+  extent->start = start;
+  extent->stop = stop;
+  extent->node = node;
+  extent->type = type;
+  extent->prev = self->tail;
+  extent->next = NULL;
+
+  // An empty map gets its head set; otherwise chain after the old tail.
+  if (self->head)
+    self->tail->next = extent;
+  else
+    self->head = extent;
+
+  self->tail = extent;
+
+  return extent;
+}
+
+// Insert a new extent spanning start..stop immediately after `previous`.
+// An empty span (start == stop) inserts nothing and returns `previous`
+// unchanged; otherwise the newly created extent is returned.
+cmark_source_extent *source_map_insert_extent(cmark_source_map *self,
+                                              cmark_source_extent *previous,
+                                              uint64_t start, uint64_t stop,
+                                              cmark_node *node,
+                                              cmark_extent_type type) {
+  cmark_source_extent *created;
+  cmark_source_extent *following;
+
+  if (start == stop)
+    return previous;
+
+  following = previous->next;
+
+  created = (cmark_source_extent *)self->mem->calloc(1, sizeof(*created));
+  created->start = start;
+  created->stop = stop;
+  created->node = node;
+  created->type = type;
+  created->prev = previous;
+  created->next = following;
+
+  previous->next = created;
+
+  // Inserting after the last extent makes the new one the tail.
+  if (following)
+    following->prev = created;
+  else
+    self->tail = created;
+
+  return created;
+}
+
+// Unlink `extent` from the map and free it. Any map pointer that referred to
+// it is redirected: head and next_cursor move to its successor, tail and
+// cursor move to its predecessor. Returns the extent that followed it.
+cmark_source_extent *source_map_free_extent(cmark_source_map *self,
+                                            cmark_source_extent *extent) {
+  cmark_source_extent *before = extent->prev;
+  cmark_source_extent *after = extent->next;
+
+  if (before)
+    before->next = after;
+  if (after)
+    after->prev = before;
+
+  if (self->tail == extent)
+    self->tail = before;
+  if (self->head == extent)
+    self->head = after;
+
+  if (self->cursor == extent)
+    self->cursor = before;
+  if (self->next_cursor == extent)
+    self->next_cursor = after;
+
+  self->mem->free(extent);
+
+  return after;
+}
+
+// Fill the gap that follows `extent` with a CMARK_EXTENT_BLANK extent owned
+// by `node`.
+//
+// Empty extents (start == stop) encountered from `extent` onwards are freed
+// first. The gap then runs from the surviving extent's stop to the start of
+// its successor, or to `total_length` when it has no successor.
+//
+// Returns the inserted blank extent, or the surviving extent when there was
+// no gap to fill. If the surviving extent was itself empty (and therefore
+// freed) and nothing was inserted, its predecessor is returned instead,
+// which may be NULL.
+cmark_source_extent *source_map_stitch_extent(cmark_source_map *self,
+                                              cmark_source_extent *extent,
+                                              cmark_node *node,
+                                              uint64_t total_length) {
+  cmark_source_extent *next_extent = extent->next;
+  cmark_source_extent *res;
+
+  // Skip over (and drop) empty extents as long as something follows them.
+  while (next_extent && extent->start == extent->stop) {
+    source_map_free_extent(self, extent);
+    extent = next_extent;
+    next_extent = extent->next;
+  }
+
+  res = source_map_insert_extent(self, extent, extent->stop,
+                                 next_extent ? next_extent->start
+                                             : total_length,
+                                 node, CMARK_EXTENT_BLANK);
+
+  // Only a trailing extent can still be empty here.
+  if (extent->start == extent->stop) {
+    // source_map_insert_extent returns `extent` itself when the gap is empty;
+    // never hand a freed extent back to the caller.
+    if (res == extent)
+      res = extent->prev;
+    source_map_free_extent(self, extent);
+  }
+
+  return res;
+}
+
+// Insert an extent for `node` at the cursor. `start` and `stop` are relative
+// positions; self->cursor_offset is added to obtain the positions stored in
+// the map.
+//
+// If the span fits before next_cursor (or there is no next_cursor) it is
+// inserted whole after the cursor. If it straddles next_cursor's start, only
+// the part before it is inserted; the cursor then steps over the following
+// extent(s), growing cursor_offset by their lengths, and the remainder is
+// spliced recursively. Returns the cursor after the final insertion.
+cmark_source_extent *source_map_splice_extent(cmark_source_map *self,
+                                              uint64_t start, uint64_t stop,
+                                              cmark_node *node,
+                                              cmark_extent_type type) {
+  cmark_source_extent *boundary = self->next_cursor;
+  uint64_t abs_start = start + self->cursor_offset;
+  uint64_t abs_stop = stop + self->cursor_offset;
+
+  // Whole span fits before the boundary, or there is no boundary at all.
+  if (!boundary ||
+      (abs_start < boundary->start && abs_stop <= boundary->start)) {
+    self->cursor = source_map_insert_extent(self, self->cursor, abs_start,
+                                            abs_stop, node, type);
+    return self->cursor;
+  }
+
+  // Span straddles the boundary: emit the leading part now.
+  if (abs_start < boundary->start) {
+    uint64_t remainder_start = boundary->start - self->cursor_offset;
+
+    self->cursor = source_map_insert_extent(self, self->cursor, abs_start,
+                                            boundary->start, node, type);
+
+    if (remainder_start == stop)
+      return self->cursor;
+
+    start = remainder_start;
+  }
+
+  // Step the cursor past every extent the remaining span starts at or after.
+  while (self->next_cursor &&
+         start + self->cursor_offset >= self->next_cursor->start) {
+    self->cursor_offset += self->next_cursor->stop - self->next_cursor->start;
+    self->cursor = self->cursor->next;
+    self->next_cursor = self->cursor->next;
+  }
+
+  return source_map_splice_extent(self, start, stop, node, type);
+}
+
+// Position the map's cursor on `cursor`, or on the head of the map when
+// `cursor` is NULL. next_cursor becomes the extent after it and
+// cursor_offset its stop position. Returns false (leaving the cursor NULL)
+// when there is no extent to start from.
+bool source_map_start_cursor(cmark_source_map *self,
+                             cmark_source_extent *cursor) {
+  cmark_source_extent *origin = cursor ? cursor : self->head;
+
+  self->cursor = origin;
+
+  if (!origin)
+    return false;
+
+  self->cursor_offset = origin->stop;
+  self->next_cursor = origin->next;
+
+  return true;
+}
+
+// Debugging aid: print one line per extent to stdout, in list order, as
+// "start:stop - <node type>, <extent type> (<node address>)".
+void source_map_pretty_print(cmark_source_map *self) {
+  cmark_source_extent *tmp;
+
+  for (tmp = self->head; tmp; tmp = tmp->next) {
+    // uint64_t is not `unsigned long` on every ABI (e.g. 64-bit Windows and
+    // 32-bit targets), so widen explicitly to match the conversion specifier.
+    printf("%llu:%llu - %s, %s (%p)\n",
+           (unsigned long long)tmp->start,
+           (unsigned long long)tmp->stop,
+           cmark_node_get_type_string(tmp->node),
+           cmark_source_extent_get_type_string(tmp),
+           (void *)tmp->node);
+  }
+}
+
+// Verify that the extents tile 0..total_length exactly: each extent must
+// start where the previous one stopped, none may be empty, and the last one
+// must stop at total_length. An empty map is valid only for total_length 0.
+bool source_map_check(cmark_source_map *self, uint64_t total_length) {
+  uint64_t expected_start = 0;
+  cmark_source_extent *extent;
+
+  for (extent = self->head; extent; extent = extent->next) {
+    if (extent->start != expected_start || extent->start == extent->stop)
+      return false;
+
+    expected_start = extent->stop;
+  }
+
+  return expected_start == total_length;
+}
+
+
+// Return the start position of `extent`, or 0 when `extent` is NULL.
+uint64_t cmark_source_extent_get_start(cmark_source_extent *extent) {
+  if (extent == NULL)
+    return 0;
+
+  return extent->start;
+}
+
+// Return the stop position of `extent`, or 0 when `extent` is NULL.
+uint64_t cmark_source_extent_get_stop(cmark_source_extent *extent) {
+  if (extent == NULL)
+    return 0;
+
+  return extent->stop;
+}
+
+// Return the node `extent` is attributed to, or NULL when `extent` is NULL.
+cmark_node *cmark_source_extent_get_node(cmark_source_extent *extent) {
+  if (extent == NULL)
+    return NULL;
+
+  return extent->node;
+}
+
+// Return the extent following `extent` in the map, or NULL when `extent` is
+// NULL or is the last one.
+cmark_source_extent *cmark_source_extent_get_next(cmark_source_extent *extent) {
+  if (extent == NULL)
+    return NULL;
+
+  return extent->next;
+}
+
+// Return the extent preceding `extent` in the map, or NULL when `extent` is
+// NULL or is the first one.
+cmark_source_extent *
+cmark_source_extent_get_previous(cmark_source_extent *extent) {
+  if (extent == NULL)
+    return NULL;
+
+  return extent->prev;
+}
+
+// Return the type of `extent`, or CMARK_EXTENT_NONE when `extent` is NULL.
+cmark_extent_type cmark_source_extent_get_type(cmark_source_extent *extent) {
+  if (extent == NULL)
+    return CMARK_EXTENT_NONE;
+
+  return extent->type;
+}
+
+// Return a static, human-readable name for the type of `extent`.
+// A NULL extent, CMARK_EXTENT_NONE and any value not listed below all map
+// to "unknown". The switch deliberately has no default label so compilers
+// can still warn when a new cmark_extent_type value is left unhandled.
+const char *cmark_source_extent_get_type_string(cmark_source_extent *extent) {
+  if (extent == NULL)
+    return "unknown";
+
+  switch (extent->type) {
+  case CMARK_EXTENT_NONE:
+    return "unknown";
+  case CMARK_EXTENT_OPENER:
+    return "opener";
+  case CMARK_EXTENT_CLOSER:
+    return "closer";
+  case CMARK_EXTENT_BLANK:
+    return "blank";
+  case CMARK_EXTENT_CONTENT:
+    return "content";
+  case CMARK_EXTENT_PUNCTUATION:
+    return "punctuation";
+  case CMARK_EXTENT_LINK_DESTINATION:
+    return "link_destination";
+  case CMARK_EXTENT_LINK_TITLE:
+    return "link_title";
+  case CMARK_EXTENT_LINK_LABEL:
+    return "link_label";
+  case CMARK_EXTENT_REFERENCE_DESTINATION:
+    return "reference_destination";
+  case CMARK_EXTENT_REFERENCE_LABEL:
+    return "reference_label";
+  case CMARK_EXTENT_REFERENCE_TITLE:
+    return "reference_title";
+  }
+
+  return "unknown";
+}
diff --git a/src/source_map.h b/src/source_map.h
new file mode 100644
index 0000000..619a073
--- /dev/null
+++ b/src/source_map.h
@@ -0,0 +1,66 @@
+#ifndef CMARK_SOURCE_MAP_H
+#define CMARK_SOURCE_MAP_H
+
+#include "cmark.h"
+#include "config.h"
+
+typedef struct _cmark_source_map // doubly linked list of source extents plus an insertion cursor
+{
+ cmark_source_extent *head; // first extent in the list, NULL while the map is empty
+ cmark_source_extent *tail; // last extent in the list
+ cmark_source_extent *cursor; // extent after which source_map_splice_extent inserts
+ cmark_source_extent *next_cursor; // extent following the cursor; splices are split at its start
+ uint64_t cursor_offset; // added to the start/stop given to source_map_splice_extent
+ cmark_mem *mem; // allocator used for the map and for every extent
+} cmark_source_map;
+
+struct cmark_source_extent // one span of the source, attributed to a node
+{
+ uint64_t start; // position where the span begins
+ uint64_t stop; // position where the span ends; source_map_check expects the next extent to start here
+ struct cmark_source_extent *next; // following extent in the map, NULL for the tail
+ struct cmark_source_extent *prev; // preceding extent in the map, NULL for the head
+ cmark_node *node; // node this span is attributed to
+ cmark_extent_type type; // what the span represents (opener, content, blank, ...)
+};
+
+cmark_source_map * source_map_new (cmark_mem *mem);
+
+void source_map_free (cmark_source_map *self);
+
+bool source_map_check (cmark_source_map *self,
+ uint64_t total_length);
+
+void source_map_pretty_print (cmark_source_map *self);
+
+cmark_source_extent * source_map_append_extent(cmark_source_map *self,
+ uint64_t start,
+ uint64_t stop,
+ cmark_node *node,
+ cmark_extent_type type);
+
+cmark_source_extent * source_map_insert_extent(cmark_source_map *self,
+ cmark_source_extent *previous,
+ uint64_t start,
+ uint64_t stop,
+ cmark_node *node,
+ cmark_extent_type type);
+
+cmark_source_extent * source_map_free_extent (cmark_source_map *self,
+ cmark_source_extent *extent);
+
+cmark_source_extent * source_map_stitch_extent(cmark_source_map *self,
+ cmark_source_extent *extent,
+ cmark_node *node,
+ uint64_t total_length);
+
+cmark_source_extent * source_map_splice_extent(cmark_source_map *self,
+ uint64_t start,
+ uint64_t stop,
+ cmark_node *node,
+ cmark_extent_type type);
+
+bool source_map_start_cursor (cmark_source_map *self,
+ cmark_source_extent *cursor);
+
+#endif