diff options
author | Nick Wellnhofer <wellnhofer@aevum.de> | 2016-12-31 02:25:16 +0100 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2016-12-30 18:25:16 -0700 |
commit | 4fbe344df43ed7f60a3d3a53981088334cb709fc (patch) | |
tree | f7f8264d06cb31024f169fda11c5d72821510c51 | |
parent | c4c1d59ca29aceb1c5919908ac97d9476264fd96 (diff) |
Change types for source map offsets (#174)
* Improve strbuf guarantees
Introduce BUFSIZE_MAX macro and make sure that the strbuf implementation
can handle strings up to this size.
* Abort early if document size exceeds internal limit
* Change types for source map offsets
Switch to size_t for the public API, making the public headers
C89-compatible again.
Switch to bufsize_t internally, reducing memory usage and improving
performance on 32-bit platforms.
* Make parser return NULL on internal index overflow
Make S_parser_feed set an error and ignore subsequent chunks if the
total input document size exceeds an internal limit. Make
cmark_parser_finish return NULL if an error was encountered. Add
public API functions to retrieve error code and error message.
strbuf overflow in renderers and OOM in parser or renderers still
cause an abort.
-rw-r--r-- | api_test/main.c | 37 | ||||
-rw-r--r-- | src/blocks.c | 48 | ||||
-rw-r--r-- | src/buffer.c | 32 | ||||
-rw-r--r-- | src/buffer.h | 20 | ||||
-rw-r--r-- | src/cmark.c | 3 | ||||
-rw-r--r-- | src/cmark.h | 32 | ||||
-rw-r--r-- | src/inlines.c | 2 | ||||
-rw-r--r-- | src/inlines.h | 2 | ||||
-rw-r--r-- | src/main.c | 5 | ||||
-rw-r--r-- | src/parser.h | 3 | ||||
-rw-r--r-- | src/source_map.c | 22 | ||||
-rw-r--r-- | src/source_map.h | 23 | ||||
-rw-r--r-- | test/cmark.py | 2 |
13 files changed, 187 insertions, 44 deletions
diff --git a/api_test/main.c b/api_test/main.c index 17e1582..61291dc 100644 --- a/api_test/main.c +++ b/api_test/main.c @@ -5,6 +5,7 @@ #define CMARK_NO_SHORT_NAMES #include "cmark.h" #include "node.h" +#include "parser.h" #include "harness.h" #include "cplusplus.h" @@ -883,6 +884,41 @@ static void test_feed_across_line_ending(test_batch_runner *runner) { cmark_node_free(document); } +static cmark_node *S_parse_with_fake_total(bufsize_t fake_total, + const char *str, + cmark_err_type *err) { + cmark_parser *parser = cmark_parser_new(CMARK_OPT_DEFAULT); + parser->total_bytes = fake_total; + cmark_parser_feed(parser, str, strlen(str)); + cmark_node *doc = cmark_parser_finish(parser); + *err = cmark_parser_get_error(parser); + cmark_parser_free(parser); + return doc; +} + +static void test_bufsize_overflow(test_batch_runner *runner) { + cmark_node *doc; + cmark_err_type err; + + doc = S_parse_with_fake_total(BUFSIZE_MAX, "a", &err); + OK(runner, doc == NULL, "parse 1 byte after BUFSIZE_MAX bytes fails"); + INT_EQ(runner, err, CMARK_ERR_INPUT_TOO_LARGE, + "parse 1 byte after BUFSIZE_MAX bytes error code"); + + doc = S_parse_with_fake_total(BUFSIZE_MAX - 9, "0123456789", &err); + OK(runner, doc == NULL, "parse 10 byte after BUFSIZE_MAX-9 bytes fails"); + INT_EQ(runner, err, CMARK_ERR_INPUT_TOO_LARGE, + "parse 10 byte after BUFSIZE_MAX-9 bytes error code"); + + doc = S_parse_with_fake_total(BUFSIZE_MAX - 1, "a", &err); + OK(runner, doc != NULL, "parse 1 byte after BUFSIZE_MAX-1 bytes"); + cmark_node_free(doc); + + doc = S_parse_with_fake_total(BUFSIZE_MAX - 10, "0123456789", &err); + OK(runner, doc != NULL, "parse 10 byte after BUFSIZE_MAX-10 bytes"); + cmark_node_free(doc); +} + int main() { int retval; test_batch_runner *runner = test_batch_runner_new(); @@ -908,6 +944,7 @@ int main() { test_cplusplus(runner); test_safe(runner); test_feed_across_line_ending(runner); + test_bufsize_overflow(runner); test_print_summary(runner); retval = test_ok(runner) ? 0 : 1; diff --git a/src/blocks.c b/src/blocks.c index 1c1d160..c680535 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -96,6 +96,8 @@ cmark_parser *cmark_parser_new_with_mem(int options, cmark_mem *mem) { parser->refmap = cmark_reference_map_new(mem); parser->root = document; parser->current = document; + parser->error_code = CMARK_ERR_NONE; + parser->total_bytes = 0; parser->line_number = 0; parser->line_offset = 0; parser->offset = 0; @@ -550,6 +552,20 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer, const unsigned char *skipped; static const uint8_t repl[] = {239, 191, 189}; + if (parser->error_code) { + return; + } + + // Limit maximum document size to BUFSIZE_MAX. This makes sure that we + // never create strbufs larger than BUFSIZE_MAX. Unfortunately, the + // public API doesn't have an error reporting mechanism, so all we can + // do is to abort. + if (len > (size_t)(BUFSIZE_MAX - parser->total_bytes)) { + parser->error_code = CMARK_ERR_INPUT_TOO_LARGE; + return; + } + parser->total_bytes += (bufsize_t)len; + if (parser->last_buffer_ended_with_cr && *buffer == '\n') { // skip NL if last buffer ended with CR ; see #117 buffer++; @@ -1266,14 +1282,19 @@ cmark_node *cmark_parser_finish(cmark_parser *parser) { cmark_strbuf_clear(&parser->linebuf); } + cmark_strbuf_clear(&parser->curline); + + if (parser->error_code) { + cmark_node_free(parser->root); + return NULL; + } + finalize_document(parser); if (parser->options & CMARK_OPT_NORMALIZE) { cmark_consolidate_text_nodes(parser->root); } - cmark_strbuf_free(&parser->curline); - #if CMARK_DEBUG_NODES if (cmark_node_check(parser->root, stderr)) { abort(); @@ -1287,3 +1308,26 @@ cmark_parser_get_first_source_extent(cmark_parser *parser) { return parser->source_map->head; } + +cmark_err_type cmark_parser_get_error(cmark_parser *parser) { + return parser->error_code; +} + +const char *cmark_parser_get_error_message(cmark_parser *parser) { + const char *str = NULL; + + switch (parser->error_code) { + case CMARK_ERR_OUT_OF_MEMORY: + str = "Out of memory"; + break; + case CMARK_ERR_INPUT_TOO_LARGE: + str = "Input too large"; + break; + default: + str = "Unknown error"; + break; + } + + return str; +} + diff --git a/src/buffer.c b/src/buffer.c index a6754b6..9a9e9ad 100644 --- a/src/buffer.c +++ b/src/buffer.c @@ -33,6 +33,11 @@ void cmark_strbuf_init(cmark_mem *mem, cmark_strbuf *buf, } static CMARK_INLINE void S_strbuf_grow_by(cmark_strbuf *buf, bufsize_t add) { + // Safety check for overflow. + if (add > BUFSIZE_MAX - buf->size) { + fprintf(stderr, "Internal cmark_strbuf overflow"); + abort(); + } cmark_strbuf_grow(buf, buf->size + add); } @@ -42,18 +47,25 @@ void cmark_strbuf_grow(cmark_strbuf *buf, bufsize_t target_size) { if (target_size < buf->asize) return; - if (target_size > (bufsize_t)(INT32_MAX / 2)) - abort(); - - /* Oversize the buffer by 50% to guarantee amortized linear time - * complexity on append operations. */ - bufsize_t new_size = target_size + target_size / 2; - new_size += 1; - new_size = (new_size + 7) & ~7; + // Oversize the buffer by 50% to guarantee amortized linear time + // complexity on append operations. + bufsize_t add = target_size / 2; + // Account for terminating NUL byte. + add += 1; + // Round up to multiple of eight. + add = (add + 7) & ~7; + + // Check for overflow but allow an additional NUL byte. + if (target_size + add > BUFSIZE_MAX + 1) { + target_size = BUFSIZE_MAX + 1; + } + else { + target_size += add; + } buf->ptr = (unsigned char *)buf->mem->realloc(buf->asize ? buf->ptr : NULL, - new_size); - buf->asize = new_size; + target_size); + buf->asize = target_size; } bufsize_t cmark_strbuf_len(const cmark_strbuf *buf) { return buf->size; } diff --git a/src/buffer.h b/src/buffer.h index e878075..7f31a74 100644 --- a/src/buffer.h +++ b/src/buffer.h @@ -13,8 +13,28 @@ extern "C" { #endif +#ifndef CMARK_HUGE_DOCS + +// Maximum strbuf size without terminating NUL byte. +#define BUFSIZE_MAX (INT32_MAX - 1) + typedef int32_t bufsize_t; +#else // CMARK_HUGE_DOCS + +// This is an untested proof of concept of how to handle multi-gigabyte +// documents on 64-bit platforms at the expense of internal struct sizes. + +#ifdef PTRDIFF_MAX + #define BUFSIZE_MAX (PTRDIFF_MAX - 1) +#else + #define BUFSIZE_MAX (ptrdiff_t)((size_t)-1 / 2) +#endif + +typedef ptrdiff_t bufsize_t; + +#endif // CMARK_HUGE_DOCS + typedef struct { cmark_mem *mem; unsigned char *ptr; diff --git a/src/cmark.c b/src/cmark.c index 2ef6cb4..da93abe 100644 --- a/src/cmark.c +++ b/src/cmark.c @@ -36,6 +36,9 @@ char *cmark_markdown_to_html(const char *text, size_t len, int options) { char *result; doc = cmark_parse_document(text, len, options); + if (doc == NULL) { + return NULL; + } result = cmark_render_html(doc, options); cmark_node_free(doc); diff --git a/src/cmark.h b/src/cmark.h index 034f0e6..5ce6d10 100644 --- a/src/cmark.h +++ b/src/cmark.h @@ -2,7 +2,6 @@ #define CMARK_H #include <stdio.h> -#include <stdint.h> #include <cmark_export.h> #include <cmark_version.h> @@ -23,7 +22,7 @@ extern "C" { /** Convert 'text' (assumed to be a UTF-8 encoded string with length * 'len') from CommonMark Markdown to HTML, returning a null-terminated, * UTF-8-encoded string. It is the caller's responsibility - * to free the returned buffer. + * to free the returned buffer. Returns NULL on error. */ CMARK_EXPORT char *cmark_markdown_to_html(const char *text, size_t len, int options); @@ -99,6 +98,12 @@ typedef enum { CMARK_PAREN_DELIM } cmark_delim_type; +typedef enum { + CMARK_ERR_NONE, + CMARK_ERR_OUT_OF_MEMORY, + CMARK_ERR_INPUT_TOO_LARGE +} cmark_err_type; + typedef struct cmark_node cmark_node; typedef struct cmark_parser cmark_parser; typedef struct cmark_iter cmark_iter; @@ -489,12 +494,22 @@ cmark_parser *cmark_parser_new_with_mem(int options, cmark_mem *mem); CMARK_EXPORT void cmark_parser_free(cmark_parser *parser); +/** Return the error code after a failed operation. + */ +CMARK_EXPORT +cmark_err_type cmark_parser_get_error(cmark_parser *parser); + +/** Return the error code after a failed operation. + */ +CMARK_EXPORT +const char *cmark_parser_get_error_message(cmark_parser *parser); + /** Feeds a string of length 'len' to 'parser'. */ CMARK_EXPORT void cmark_parser_feed(cmark_parser *parser, const char *buffer, size_t len); -/** Finish parsing and return a pointer to a tree of nodes. +/** Finish parsing and return a pointer to a tree of nodes or NULL on error. */ CMARK_EXPORT cmark_node *cmark_parser_finish(cmark_parser *parser); @@ -507,7 +522,7 @@ cmark_source_extent *cmark_parser_get_first_source_extent(cmark_parser *parser); /** Parse a CommonMark document in 'buffer' of length 'len'. * Returns a pointer to a tree of nodes. The memory allocated for * the node tree should be released using 'cmark_node_free' - * when it is no longer needed. + * when it is no longer needed. Returns NULL on error. */ CMARK_EXPORT cmark_node *cmark_parse_document(const char *buffer, size_t len, int options); @@ -515,22 +530,23 @@ cmark_node *cmark_parse_document(const char *buffer, size_t len, int options); /** Parse a CommonMark document in file 'f', returning a pointer to * a tree of nodes. The memory allocated for the node tree should be * released using 'cmark_node_free' when it is no longer needed. + * Returns NULL on error. */ CMARK_EXPORT cmark_node *cmark_parse_file(FILE *f, int options); -/** +/** * ## Source map API */ /* Return the index, in bytes, of the start of this extent */ CMARK_EXPORT -uint64_t cmark_source_extent_get_start(cmark_source_extent *extent); +size_t cmark_source_extent_get_start(cmark_source_extent *extent); -/* Return the index, in bytes, of the stop of this extent. This +/* Return the index, in bytes, of the stop of this extent. This * index is not included in the extent*/ CMARK_EXPORT -uint64_t cmark_source_extent_get_stop(cmark_source_extent *extent); +size_t cmark_source_extent_get_stop(cmark_source_extent *extent); /* Return the extent immediately following 'extent' */ CMARK_EXPORT diff --git a/src/inlines.c b/src/inlines.c index 96da28b..099078e 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -1230,7 +1230,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { // Parse inlines from parent's string_content, adding as children of parent. extern void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent, cmark_reference_map *refmap, int options, - cmark_source_map *source_map, uint64_t total_length) { + cmark_source_map *source_map, bufsize_t total_length) { subject subj; subject_from_buf(mem, &subj, &parent->content, refmap, source_map); bufsize_t initial_len = subj.input.len; diff --git a/src/inlines.h b/src/inlines.h index 8de31b1..8459794 100644 --- a/src/inlines.h +++ b/src/inlines.h @@ -14,7 +14,7 @@ cmark_chunk cmark_clean_title(cmark_mem *mem, cmark_chunk *title); void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent, cmark_reference_map *refmap, int options, - cmark_source_map *source_map, uint64_t total_length); + cmark_source_map *source_map, bufsize_t total_length); bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, cmark_reference_map *refmap, @@ -181,6 +181,11 @@ int main(int argc, char *argv[]) { document = cmark_parser_finish(parser); cmark_parser_free(parser); + if (document == NULL) { + fprintf(stderr, "%s", cmark_parser_get_error_message(parser)); + exit(1); + } + print_document(document, writer, options, width); cmark_node_free(document); diff --git a/src/parser.h b/src/parser.h index b28a8a7..7b4fdbc 100644 --- a/src/parser.h +++ b/src/parser.h @@ -2,6 +2,7 @@ #define CMARK_AST_H #include <stdio.h> +#include "cmark.h" #include "node.h" #include "buffer.h" #include "memory.h" @@ -18,6 +19,8 @@ struct cmark_parser { struct cmark_reference_map *refmap; struct cmark_node *root; struct cmark_node *current; + cmark_err_type error_code; + bufsize_t total_bytes; int line_number; bufsize_t offset; bufsize_t column; diff --git a/src/source_map.c b/src/source_map.c index db01a21..dccbe7c 100644 --- a/src/source_map.c +++ b/src/source_map.c @@ -19,7 +19,7 @@ source_map_free(cmark_source_map *self) } cmark_source_extent * -source_map_append_extent(cmark_source_map *self, uint64_t start, uint64_t stop, cmark_node *node, cmark_extent_type type) +source_map_append_extent(cmark_source_map *self, bufsize_t start, bufsize_t stop, cmark_node *node, cmark_extent_type type) { assert (start <= stop); assert (!self->tail || self->tail->stop <= start); @@ -46,7 +46,7 @@ source_map_append_extent(cmark_source_map *self, uint64_t start, uint64_t stop, cmark_source_extent * source_map_insert_extent(cmark_source_map *self, cmark_source_extent *previous, - uint64_t start, uint64_t stop, cmark_node *node, cmark_extent_type type) + bufsize_t start, bufsize_t stop, cmark_node *node, cmark_extent_type type) { if (start == stop) return previous; @@ -101,7 +101,7 @@ source_map_free_extent(cmark_source_map *self, cmark_source_extent *extent) cmark_source_extent * source_map_stitch_extent(cmark_source_map *self, cmark_source_extent *extent, - cmark_node *node, uint64_t total_length) + cmark_node *node, bufsize_t total_length) { cmark_source_extent *next_extent = extent->next; cmark_source_extent *res; @@ -135,7 +135,7 @@ source_map_stitch_extent(cmark_source_map *self, cmark_source_extent *extent, } cmark_source_extent * -source_map_splice_extent(cmark_source_map *self, uint64_t start, uint64_t stop, +source_map_splice_extent(cmark_source_map *self, bufsize_t start, bufsize_t stop, cmark_node *node, cmark_extent_type type) { if (!self->next_cursor) { @@ -154,7 +154,7 @@ source_map_splice_extent(cmark_source_map *self, uint64_t start, uint64_t stop, return self->cursor; } else if (start + self->cursor_offset < self->next_cursor->start) { - uint64_t new_start = self->next_cursor->start - self->cursor_offset; + bufsize_t new_start = self->next_cursor->start - self->cursor_offset; self->cursor = source_map_insert_extent(self, self->cursor, @@ -196,17 +196,17 @@ source_map_pretty_print(cmark_source_map *self) { cmark_source_extent *tmp; for (tmp = self->head; tmp; tmp = tmp->next) { - printf ("%lu:%lu - %s, %s (%p)\n", tmp->start, tmp->stop, - cmark_node_get_type_string(tmp->node), + printf ("%d:%d - %s, %s (%p)\n", tmp->start, tmp->stop, + cmark_node_get_type_string(tmp->node), cmark_source_extent_get_type_string(tmp), (void *) tmp->node); } } bool -source_map_check(cmark_source_map *self, uint64_t total_length) +source_map_check(cmark_source_map *self, bufsize_t total_length) { - uint64_t last_stop = 0; + bufsize_t last_stop = 0; cmark_source_extent *tmp; for (tmp = self->head; tmp; tmp = tmp->next) { @@ -224,13 +224,13 @@ source_map_check(cmark_source_map *self, uint64_t total_length) } -uint64_t +size_t cmark_source_extent_get_start(cmark_source_extent *extent) { return extent->start; } -uint64_t +size_t cmark_source_extent_get_stop(cmark_source_extent *extent) { return extent->stop; diff --git a/src/source_map.h b/src/source_map.h index 619a073..dca5a9f 100644 --- a/src/source_map.h +++ b/src/source_map.h @@ -3,6 +3,7 @@ #include "cmark.h" #include "config.h" +#include "buffer.h" typedef struct _cmark_source_map { @@ -10,14 +11,14 @@ typedef struct _cmark_source_map cmark_source_extent *tail; cmark_source_extent *cursor; cmark_source_extent *next_cursor; - uint64_t cursor_offset; + bufsize_t cursor_offset; cmark_mem *mem; } cmark_source_map; struct cmark_source_extent { - uint64_t start; - uint64_t stop; + bufsize_t start; + bufsize_t stop; struct cmark_source_extent *next; struct cmark_source_extent *prev; cmark_node *node; @@ -29,20 +30,20 @@ cmark_source_map * source_map_new (cmark_mem *mem); void source_map_free (cmark_source_map *self); bool source_map_check (cmark_source_map *self, - uint64_t total_length); + bufsize_t total_length); void source_map_pretty_print (cmark_source_map *self); cmark_source_extent * source_map_append_extent(cmark_source_map *self, - uint64_t start, - uint64_t stop, + bufsize_t start, + bufsize_t stop, cmark_node *node, cmark_extent_type type); cmark_source_extent * source_map_insert_extent(cmark_source_map *self, cmark_source_extent *previous, - uint64_t start, - uint64_t stop, + bufsize_t start, + bufsize_t stop, cmark_node *node, cmark_extent_type type); @@ -52,11 +53,11 @@ cmark_source_extent * source_map_free_extent (cmark_source_map *self, cmark_source_extent * source_map_stitch_extent(cmark_source_map *self, cmark_source_extent *extent, cmark_node *node, - uint64_t total_length); + bufsize_t total_length); cmark_source_extent * source_map_splice_extent(cmark_source_map *self, - uint64_t start, - uint64_t stop, + bufsize_t start, + bufsize_t stop, cmark_node *node, cmark_extent_type type); diff --git a/test/cmark.py b/test/cmark.py index 4be85a3..f4ff576 100644 --- a/test/cmark.py +++ b/test/cmark.py @@ -30,6 +30,8 @@ def to_commonmark(lib, text): render_commonmark.restype = c_char_p render_commonmark.argtypes = [c_void_p, c_int, c_int] node = parse_document(textbytes, textlen, 0) + if node is None: + raise Exception("parse_document failed") result = render_commonmark(node, 0, 0).decode('utf-8') return [0, result, ''] |