summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Nick Wellnhofer <wellnhofer@aevum.de> 2020-01-19 00:51:02 +0100
committer: John MacFarlane <jgm@berkeley.edu> 2020-01-23 08:25:54 -0800
commit: b0a4cfa36e99c27dd2b20be8f8888fa7721bad58 (patch)
tree: 528ace24d0526b0dd647bcd774f348e677b78a9f
parent: 75b48c5938f5984dbcf79a579d15c9cbd6447d12 (diff)
Use C string instead of chunk for literal text
Use zero-terminated C strings and a separate length field instead of cmark_chunks. Literal inline text will now be copied from the parent block's content buffer, slowing the benchmark down by 10-15%. The node struct never references memory of other nodes now, fixing #309. Node accessors don't have to check for delayed creation of C strings, so parsing and iterating all literals using the public API should actually be faster than before.
-rw-r--r-- api_test/main.c 19
-rw-r--r-- src/blocks.c 3
-rw-r--r-- src/commonmark.c 3
-rw-r--r-- src/inlines.c 59
-rw-r--r-- src/iterator.c 5
-rw-r--r-- src/node.c 7
-rw-r--r-- src/node.h 8
7 files changed, 71 insertions, 33 deletions
diff --git a/api_test/main.c b/api_test/main.c
index e7fccbd..994ee39 100644
--- a/api_test/main.c
+++ b/api_test/main.c
@@ -243,6 +243,21 @@ static void accessors(test_batch_runner *runner) {
cmark_node_free(doc);
}
+static void free_parent(test_batch_runner *runner) {
+ static const char markdown[] = "text\n";
+
+ cmark_node *doc =
+ cmark_parse_document(markdown, sizeof(markdown) - 1, CMARK_OPT_DEFAULT);
+
+ cmark_node *para = cmark_node_first_child(doc);
+ cmark_node *text = cmark_node_first_child(para);
+ cmark_node_unlink(text);
+ cmark_node_free(doc);
+ STR_EQ(runner, cmark_node_get_literal(text), "text",
+ "inline content after freeing parent block");
+ cmark_node_free(text);
+}
+
static void node_check(test_batch_runner *runner) {
// Construct an incomplete tree.
cmark_node *doc = cmark_node_new(CMARK_NODE_DOCUMENT);
@@ -381,9 +396,6 @@ static void create_tree(test_batch_runner *runner) {
free(html);
cmark_node_free(doc);
-
- // TODO: Test that the contents of an unlinked inline are valid
- // after the parent block was destroyed. This doesn't work so far.
cmark_node_free(emph);
}
@@ -1031,6 +1043,7 @@ int main() {
version(runner);
constructor(runner);
accessors(runner);
+ free_parent(runner);
node_check(runner);
iterator(runner);
iterator_delete(runner);
diff --git a/src/blocks.c b/src/blocks.c
index 5214f47..9970cc9 100644
--- a/src/blocks.c
+++ b/src/blocks.c
@@ -322,7 +322,8 @@ static cmark_node *finalize(cmark_parser *parser, cmark_node *b) {
break;
case CMARK_NODE_HTML_BLOCK:
- b->as.literal = cmark_chunk_buf_detach(node_content);
+ b->as.literal.len = node_content->size;
+ b->as.literal.data = cmark_strbuf_detach(node_content);
break;
case CMARK_NODE_LIST: // determine tight/loose status
diff --git a/src/commonmark.c b/src/commonmark.c
index 89aef5b..41bfa52 100644
--- a/src/commonmark.c
+++ b/src/commonmark.c
@@ -146,8 +146,7 @@ static bool is_autolink(cmark_node *node) {
if (strcmp((const char *)url, "mailto:") == 0) {
url += 7;
}
- return strncmp((const char *)url, (char *)link_text->as.literal.data,
- link_text->as.literal.len) == 0;
+ return strcmp((const char *)url, (char *)link_text->as.literal.data) == 0;
}
// if node is a block node, returns node.
diff --git a/src/inlines.c b/src/inlines.c
index 7d584ca..2c13546 100644
--- a/src/inlines.c
+++ b/src/inlines.c
@@ -22,9 +22,6 @@ static const char *LEFTSINGLEQUOTE = "\xE2\x80\x98";
static const char *RIGHTSINGLEQUOTE = "\xE2\x80\x99";
// Macros for creating various kinds of simple.
-#define make_str(subj, sc, ec, s) make_literal(subj, CMARK_NODE_TEXT, sc, ec, s)
-#define make_code(subj, sc, ec, s) make_literal(subj, CMARK_NODE_CODE, sc, ec, s)
-#define make_raw_html(subj, sc, ec, s) make_literal(subj, CMARK_NODE_HTML_INLINE, sc, ec, s)
#define make_linebreak(mem) make_simple(mem, CMARK_NODE_LINEBREAK)
#define make_softbreak(mem) make_simple(mem, CMARK_NODE_SOFTBREAK)
#define make_emph(mem) make_simple(mem, CMARK_NODE_EMPH)
@@ -81,12 +78,10 @@ static bufsize_t subject_find_special_char(subject *subj, int options);
// Create an inline with a literal string value.
static CMARK_INLINE cmark_node *make_literal(subject *subj, cmark_node_type t,
- int start_column, int end_column,
- cmark_chunk s) {
+ int start_column, int end_column) {
cmark_node *e = (cmark_node *)subj->mem->calloc(1, sizeof(*e));
cmark_strbuf_init(subj->mem, &e->content, 0);
e->type = (uint16_t)t;
- e->as.literal = s;
e->start_line = e->end_line = subj->line;
// columns are 1 based.
e->start_column = start_column + 1 + subj->column_offset + subj->block_offset;
@@ -102,6 +97,23 @@ static CMARK_INLINE cmark_node *make_simple(cmark_mem *mem, cmark_node_type t) {
return e;
}
+static cmark_node *make_str(subject *subj, int sc, int ec, cmark_chunk s) {
+ cmark_node *e = make_literal(subj, CMARK_NODE_TEXT, sc, ec);
+ e->as.literal.data = (unsigned char *)subj->mem->realloc(NULL, s.len + 1);
+ memcpy(e->as.literal.data, s.data, s.len);
+ e->as.literal.data[s.len] = 0;
+ e->as.literal.len = s.len;
+ return e;
+}
+
+static cmark_node *make_str_from_buf(subject *subj, int sc, int ec,
+ cmark_strbuf *buf) {
+ cmark_node *e = make_literal(subj, CMARK_NODE_TEXT, sc, ec);
+ e->as.literal.len = buf->size;
+ e->as.literal.data = cmark_strbuf_detach(buf);
+ return e;
+}
+
// Like make_str, but parses entities.
static cmark_node *make_str_with_entities(subject *subj,
int start_column, int end_column,
@@ -109,7 +121,7 @@ static cmark_node *make_str_with_entities(subject *subj,
cmark_strbuf unescaped = CMARK_BUF_INIT(subj->mem);
if (houdini_unescape_html(&unescaped, content->data, content->len)) {
- return make_str(subj, start_column, end_column, cmark_chunk_buf_detach(&unescaped));
+ return make_str_from_buf(subj, start_column, end_column, &unescaped);
} else {
return make_str(subj, start_column, end_column, *content);
}
@@ -368,7 +380,10 @@ static cmark_node *handle_backticks(subject *subj, int options) {
endpos - startpos - openticks.len);
S_normalize_code(&buf);
- cmark_node *node = make_code(subj, startpos, endpos - openticks.len - 1, cmark_chunk_buf_detach(&buf));
+ cmark_node *node = make_literal(subj, CMARK_NODE_CODE, startpos,
+ endpos - openticks.len - 1);
+ node->as.literal.len = buf.size;
+ node->as.literal.data = cmark_strbuf_detach(&buf);
adjust_subj_node_newlines(subj, node, endpos - startpos, openticks.len, options);
return node;
}
@@ -579,7 +594,7 @@ static cmark_node *handle_hyphen(subject *subj, bool smart) {
cmark_strbuf_puts(&buf, ENDASH);
}
- return make_str(subj, startpos, subj->pos - 1, cmark_chunk_buf_detach(&buf));
+ return make_str_from_buf(subj, startpos, subj->pos - 1, &buf);
}
// Assumes we have a period at the current position.
@@ -656,19 +671,15 @@ static void process_emphasis(subject *subj, delimiter *stack_bottom) {
closer = closer->next;
}
} else if (closer->delim_char == '\'') {
- cmark_chunk_free(subj->mem, &closer->inl_text->as.literal);
- closer->inl_text->as.literal = cmark_chunk_literal(RIGHTSINGLEQUOTE);
+ cmark_node_set_literal(closer->inl_text, RIGHTSINGLEQUOTE);
if (opener_found) {
- cmark_chunk_free(subj->mem, &opener->inl_text->as.literal);
- opener->inl_text->as.literal = cmark_chunk_literal(LEFTSINGLEQUOTE);
+ cmark_node_set_literal(opener->inl_text, LEFTSINGLEQUOTE);
}
closer = closer->next;
} else if (closer->delim_char == '"') {
- cmark_chunk_free(subj->mem, &closer->inl_text->as.literal);
- closer->inl_text->as.literal = cmark_chunk_literal(RIGHTDOUBLEQUOTE);
+ cmark_node_set_literal(closer->inl_text, RIGHTDOUBLEQUOTE);
if (opener_found) {
- cmark_chunk_free(subj->mem, &opener->inl_text->as.literal);
- opener->inl_text->as.literal = cmark_chunk_literal(LEFTDOUBLEQUOTE);
+ cmark_node_set_literal(opener->inl_text, LEFTDOUBLEQUOTE);
}
closer = closer->next;
}
@@ -709,7 +720,9 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener,
opener_num_chars -= use_delims;
closer_num_chars -= use_delims;
opener_inl->as.literal.len = opener_num_chars;
+ opener_inl->as.literal.data[opener_num_chars] = 0;
closer_inl->as.literal.len = closer_num_chars;
+ closer_inl->as.literal.data[closer_num_chars] = 0;
// free delimiters between opener and closer
delim = closer->previous;
@@ -785,7 +798,7 @@ static cmark_node *handle_entity(subject *subj) {
return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("&"));
subj->pos += len;
- return make_str(subj, subj->pos - 1 - len, subj->pos - 1, cmark_chunk_buf_detach(&ent));
+ return make_str_from_buf(subj, subj->pos - 1 - len, subj->pos - 1, &ent);
}
// Clean a URL: remove surrounding whitespace, and remove \ that escape
@@ -853,9 +866,15 @@ static cmark_node *handle_pointy_brace(subject *subj, int options) {
// finally, try to match an html tag
matchlen = scan_html_tag(&subj->input, subj->pos);
if (matchlen > 0) {
- contents = cmark_chunk_dup(&subj->input, subj->pos - 1, matchlen + 1);
+ unsigned char *src = subj->input.data + subj->pos - 1;
+ bufsize_t len = matchlen + 1;
subj->pos += matchlen;
- cmark_node *node = make_raw_html(subj, subj->pos - matchlen - 1, subj->pos - 1, contents);
+ cmark_node *node = make_literal(subj, CMARK_NODE_HTML_INLINE,
+ subj->pos - matchlen - 1, subj->pos - 1);
+ node->as.literal.data = (unsigned char *)subj->mem->realloc(NULL, len + 1);
+ memcpy(node->as.literal.data, src, len);
+ node->as.literal.data[len] = 0;
+ node->as.literal.len = len;
adjust_subj_node_newlines(subj, node, matchlen, 1, options);
return node;
}
diff --git a/src/iterator.c b/src/iterator.c
index f5cd802..cd7db8e 100644
--- a/src/iterator.c
+++ b/src/iterator.c
@@ -111,8 +111,9 @@ void cmark_consolidate_text_nodes(cmark_node *root) {
cmark_node_free(tmp);
tmp = next;
}
- cmark_chunk_free(iter->mem, &cur->as.literal);
- cur->as.literal = cmark_chunk_buf_detach(&buf);
+ iter->mem->free(cur->as.literal.data);
+ cur->as.literal.len = buf.size;
+ cur->as.literal.data = cmark_strbuf_detach(&buf);
}
}
diff --git a/src/node.c b/src/node.c
index 1e1f0e0..931bd46 100644
--- a/src/node.c
+++ b/src/node.c
@@ -116,7 +116,7 @@ static void S_free_nodes(cmark_node *e) {
case CMARK_NODE_HTML_INLINE:
case CMARK_NODE_CODE:
case CMARK_NODE_HTML_BLOCK:
- cmark_chunk_free(NODE_MEM(e), &e->as.literal);
+ NODE_MEM(e)->free(e->as.literal.data);
break;
case CMARK_NODE_LINK:
case CMARK_NODE_IMAGE:
@@ -295,7 +295,7 @@ const char *cmark_node_get_literal(cmark_node *node) {
case CMARK_NODE_TEXT:
case CMARK_NODE_HTML_INLINE:
case CMARK_NODE_CODE:
- return cmark_chunk_to_cstr(NODE_MEM(node), &node->as.literal);
+ return node->as.literal.data ? (char *)node->as.literal.data : "";
case CMARK_NODE_CODE_BLOCK:
return (char *)node->as.code.literal;
@@ -317,7 +317,8 @@ int cmark_node_set_literal(cmark_node *node, const char *content) {
case CMARK_NODE_TEXT:
case CMARK_NODE_HTML_INLINE:
case CMARK_NODE_CODE:
- cmark_chunk_set_cstr(NODE_MEM(node), &node->as.literal, content);
+ node->as.literal.len = cmark_set_cstr(NODE_MEM(node),
+ &node->as.literal.data, content);
return 1;
case CMARK_NODE_CODE_BLOCK:
diff --git a/src/node.h b/src/node.h
index b557f12..fbf449c 100644
--- a/src/node.h
+++ b/src/node.h
@@ -10,7 +10,11 @@ extern "C" {
#include "cmark.h"
#include "buffer.h"
-#include "chunk.h"
+
+typedef struct {
+ unsigned char *data;
+ bufsize_t len;
+} cmark_literal;
typedef struct {
cmark_list_type list_type;
@@ -72,7 +76,7 @@ struct cmark_node {
uint16_t flags;
union {
- cmark_chunk literal;
+ cmark_literal literal;
cmark_list list;
cmark_code code;
cmark_heading heading;