diff options
83 files changed, 916 insertions, 434 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 84a2191..a4ebe92 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,8 +9,8 @@ endif() set(PROJECT_NAME "cmark") set(PROJECT_VERSION_MAJOR 0) -set(PROJECT_VERSION_MINOR 18) -set(PROJECT_VERSION_PATCH 3) +set(PROJECT_VERSION_MINOR 19) +set(PROJECT_VERSION_PATCH 0) set(PROJECT_VERSION ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH} ) add_subdirectory(src) @@ -21,5 +21,5 @@ add_subdirectory(test testdir) if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release" CACHE STRING - "Choose the type of build, options are: Debug Profile Release." FORCE) + "Choose the type of build, options are: Debug Profile Release Asan Ubsan." FORCE) endif(NOT CMAKE_BUILD_TYPE) @@ -19,7 +19,7 @@ VERSION?=$(SPECVERSION) RELEASE?=CommonMark-$(VERSION) INSTALL_PREFIX?=/usr/local -.PHONY: all cmake_build spec leakcheck clean fuzztest dingus upload test update-site upload-site debug asan mingw archive bench astyle update-spec +.PHONY: all cmake_build spec leakcheck clean fuzztest dingus upload test update-site upload-site debug ubsan asan mingw archive bench astyle update-spec afl all: cmake_build man/man3/cmark.3 @@ -47,6 +47,12 @@ debug: cmake .. -DCMAKE_BUILD_TYPE=Debug; \ make +ubsan: + mkdir -p $(BUILDDIR); \ + cd $(BUILDDIR); \ + cmake .. 
-DCMAKE_BUILD_TYPE=Ubsan; \ + make + asan: mkdir -p $(BUILDDIR); \ cd $(BUILDDIR); \ @@ -67,6 +73,8 @@ afl: $(AFL_PATH)/afl-fuzz \ -i test/afl_test_cases \ -o test/afl_results \ + -x test/afl_dictionary \ + -t 100 \ -m none \ $(CMARK) @@ -123,9 +131,14 @@ $(ALLTESTS): $(SPEC) python3 test/spec_tests.py --spec $< --dump-tests | python3 -c 'import json; import sys; tests = json.loads(sys.stdin.read()); print("\n".join([test["markdown"] for test in tests]))' > $@ leakcheck: $(ALLTESTS) + rc=0; \ for format in html man xml commonmark; do \ - cat $< | valgrind --leak-check=full --dsymutil=yes --error-exitcode=1 $(PROG) -t $$format >/dev/null; \ - done + for opts in "" "--smart" "--normalize"; do \ + echo "cmark -t $$format $$opts" ; \ + cat $< | valgrind -q --leak-check=full --dsymutil=yes --error-exitcode=1 $(PROG) -t $$format $$opts >/dev/null || rc=1; \ + done; \ + done; \ + exit $$rc fuzztest: { for i in `seq 1 10`; do \ diff --git a/Makefile.nmake b/Makefile.nmake index 3f3bbce..b0556e2 100644 --- a/Makefile.nmake +++ b/Makefile.nmake @@ -2,26 +2,25 @@ SRCDIR=src DATADIR=data
BUILDDIR=build
INSTALLDIR=windows
-SPEC=spec.txt
+SPEC=test/spec.txt
PROG=$(BUILDDIR)\src\cmark.exe
GENERATOR=NMake Makefiles
all: $(BUILDDIR)
- @pushd $(BUILDDIR) && $(MAKE) /nologo && popd
+ @cd $(BUILDDIR) && $(MAKE) /nologo && cd ..
$(BUILDDIR):
- @cmake --version > nul || (echo "You need cmake to build this program: http://www.cmake.org/download/" && exit 1)
- -mkdir $(BUILDDIR) 2> nul
- pushd $(BUILDDIR) && \
+ @-mkdir $(BUILDDIR) 2> nul
+ cd $(BUILDDIR) && \
cmake \
-G "$(GENERATOR)" \
-D CMAKE_BUILD_TYPE=$(BUILD_TYPE) \
-D CMAKE_INSTALL_PREFIX=$(INSTALLDIR) \
.. && \
- popd
+ cd ..
install: all
- @pushd $(BUILDDIR) && $(MAKE) /nologo install && popd
+ @cd $(BUILDDIR) && $(MAKE) /nologo install && cd ..
clean:
-rmdir /s /q $(BUILDDIR) $(MINGW_INSTALLDIR) 2> nul
@@ -30,22 +29,8 @@ $(SRCDIR)\case_fold_switch.inc: $(DATADIR)\CaseFolding-3.2.0.txt perl mkcasefold.pl < $? > $@
test: $(SPEC) all
- @pushd $(BUILDDIR) && $(MAKE) /nologo test ARGS="-V" && popd
+ @cd $(BUILDDIR) && $(MAKE) /nologo test ARGS="-V" && cd ..
distclean: clean
del /q src\scanners.c 2> nul
del /q spec.md spec.html 2> nul
-
-### Spec ###
-
-spec.html: spec.txt template.html $(PROG)
- python3 makespec.py html > $@
-
-spec.md: spec.txt
- python3 makespec.py markdown > $@
-
-spec.pdf: spec.md template.tex specfilter.hs
- pandoc -s $< --template template.tex \
- --filter ./specfilter.hs -o $@ --latex-engine=xelatex --toc \
- --number-sections -V documentclass=report -V tocdepth=2 \
- -V classoption=twosides
@@ -1,15 +1,14 @@ -CommonMark -========== +cmark +===== -CommonMark is a rationalized version of Markdown syntax, -with a [spec][the spec] and BSD-licensed reference -implementations in C and JavaScript. +[![Build Status]](https://travis-ci.org/jgm/cmark) +[![Windows Build Status]](https://ci.appveyor.com/project/jgm/cmark) -[Try it now!](http://try.commonmark.org/) +`cmark` is the C reference implementation of [CommonMark], a +rationalized version of Markdown syntax with a [spec][the spec]. +(For the JavaScript reference implementation, see +[commonmark.js].) -For more information, see <http://commonmark.org>. - -This repository contains the C reference implementation. It provides a shared library (`libcmark`) with functions for parsing CommonMark documents to an abstract syntax tree (AST), manipulating the AST, and rendering the document to HTML, groff man, @@ -17,16 +16,45 @@ CommonMark, or an XML representation of the AST. It also provides a command-line program (`cmark`) for parsing and rendering CommonMark documents. -The library and program are written in standard C99 and have -no library dependencies. The parser is very fast, on par with -[sundown]: see the [benchmarks]. +Advantages of this library: + +- **Portable.** The library and program are written in standard + C99 and have no external dependencies. It has been tested with + MSVC, gcc, tcc, and clang. + +- **Fast.** Performance is on par with the fastest existing + Markdown parser, [sundown]: see the [benchmarks]. + +- **Accurate.** The library passes all CommonMark conformance tests. + +- **Standardized.** The library can be expected to parse CommonMark + the same way as any other conforming parser. So, for example, + you can use `commonmark.js` on the client to preview content that + will be rendered on the server using `cmark`. + +- **Robust.** The library has been extensively fuzz-tested using + american fuzzy lop. 
The test suite includes pathological cases + that bring many other Markdown parsers to a crawl (for example, + thousands-deep nested bracketed text or block quotes). + +- **Flexible.** CommonMark input is parsed to an AST which can be + manipulated programmatically prior to rendering. + +- **Multiple renderers.** Output in HTML, groff man, CommonMark, + and a custom XML format is supported. And it is easy to write new + renderers to support other formats. + +- **Free.** BSD2-licensed. It is easy to use `libcmark` in python, lua, ruby, and other dynamic languages: see the `wrappers/` subdirectory for some simple examples. -[sundown]: https://github.com/vmg/sundown -[benchmarks]: benchmarks.md -[the spec]: http://spec.commonmark.org +There are also libraries that wrap `libcmark` for +[go](https://github.com/rhinoman/go-commonmark), +[Haskell](http://hackage.haskell.org/package/cmark), +[ruby](https://github.com/gjtorikian/commonmarker), +[Perl](https://metacpan.org/release/CommonMark), and +[R](http://cran.r-project.org/package=commonmark). Installing ---------- @@ -132,5 +160,12 @@ eliminating several worst-case performance issues. Nick Wellnhofer contributed many improvements, including most of the C library's API and its test harness. 
+[sundown]: https://github.com/vmg/sundown +[benchmarks]: benchmarks.md +[the spec]: http://spec.commonmark.org +[CommonMark]: http://commonmark.org [cmake]: http://www.cmake.org/download/ [re2c]: http://re2c.org +[commonmark.js]: https://github.com/jgm/commonmark.js +[Build Status]: https://img.shields.io/travis/jgm/cmark/master.svg?style=flat +[Windows Build Status]: https://ci.appveyor.com/api/projects/status/32r7s2skrgm9ubva?svg=true diff --git a/api_test/main.c b/api_test/main.c index 3390ac6..029a879 100644 --- a/api_test/main.c +++ b/api_test/main.c @@ -666,6 +666,40 @@ test_continuation_byte(test_batch_runner *runner, const char *utf8) } static void +numeric_entities(test_batch_runner *runner) +{ + test_md_to_html(runner, "�", "<p>" UTF8_REPL "</p>\n", + "Invalid numeric entity 0"); + test_md_to_html(runner, "퟿", "<p>\xED\x9F\xBF</p>\n", + "Valid numeric entity 0xD7FF"); + test_md_to_html(runner, "�", "<p>" UTF8_REPL "</p>\n", + "Invalid numeric entity 0xD800"); + test_md_to_html(runner, "�", "<p>" UTF8_REPL "</p>\n", + "Invalid numeric entity 0xDFFF"); + test_md_to_html(runner, "", "<p>\xEE\x80\x80</p>\n", + "Valid numeric entity 0xE000"); + test_md_to_html(runner, "", "<p>\xF4\x8F\xBF\xBF</p>\n", + "Valid numeric entity 0x10FFFF"); + test_md_to_html(runner, "�", "<p>" UTF8_REPL "</p>\n", + "Invalid numeric entity 0x110000"); + test_md_to_html(runner, "�", "<p>" UTF8_REPL "</p>\n", + "Invalid numeric entity 0x80000000"); + test_md_to_html(runner, "�", "<p>" UTF8_REPL "</p>\n", + "Invalid numeric entity 0xFFFFFFFF"); + test_md_to_html(runner, "�", "<p>" UTF8_REPL "</p>\n", + "Invalid numeric entity 99999999"); + + test_md_to_html(runner, "&#;", "<p>&#;</p>\n", + "Min decimal entity length"); + test_md_to_html(runner, "&#x;", "<p>&#x;</p>\n", + "Min hexadecimal entity length"); + test_md_to_html(runner, "�", "<p>&#999999999;</p>\n", + "Max decimal entity length"); + test_md_to_html(runner, "A", "<p>&#x000000041;</p>\n", + "Max hexadecimal entity length"); 
+} + +static void test_md_to_html(test_batch_runner *runner, const char *markdown, const char *expected_html, const char *msg) { @@ -690,6 +724,7 @@ int main() { parser(runner); render_html(runner); utf8(runner); + numeric_entities(runner); test_cplusplus(runner); test_print_summary(runner); diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 0000000..d86785b --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,15 @@ +environment: + PYTHON: "C:\\Python34-x64" + PYTHON_VERSION: "3.4.3" + PYTHON_ARCH: "64" + +# set up for nmake: +install: + - '"C:\Program Files\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x64' + - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" + +build_script: + - 'nmake' + +test_script: + - 'nmake test' diff --git a/benchmarks.md b/benchmarks.md index b515c73..848c54a 100644 --- a/benchmarks.md +++ b/benchmarks.md @@ -14,8 +14,8 @@ Some benchmarks, run on an ancient Thinkpad running Intel Core 2 Duo at 2GHz. | marked | 1.94 | | **commonmark.js** | 1.93 | | discount | 1.86 | -| **cmark** | 0.36 | -| sundown | 0.34 | +| sundown | 0.33 | +| **cmark** | 0.33 | To run these benchmarks, use `make bench PROG=/path/to/program`. @@ -28,6 +28,13 @@ Markdown sources of all the localizations of the first edition of time is the *difference* between the time to run the program with the benchmark input and the time to run it with no input. (This procedure ensures that implementations in dynamic languages are -not penalized by startup time.) Amedian of ten runs is taken. The +not penalized by startup time.) A median of ten runs is taken. The process is reniced to a high priority so that the system doesn't interrupt runs. + +Note that these benchmarks were done on a 32-bit machine. On 64-bit +machines, sundown is significantly faster than cmark (0.146s vs 0.237s +on Intel i5/OSX with Clang, 0.130s vs 0.191s on a 64-bit Debian VPS +with GCC). 
I do not know why the performance difference shows up on +the 64-bit architecture and not the 32-bit, but that is something that +might be investigated. diff --git a/changelog.txt b/changelog.txt index 2ac61d6..e622f0b 100644 --- a/changelog.txt +++ b/changelog.txt @@ -1,3 +1,22 @@ +[0.19.0] + + * Fixed `_` emphasis parsing to conform to spec (jgm/CommonMark#317). + * Updated `spec.txt`. + * Compile static library with `-DCMARK_STATIC_DEFINE` (Nick Wellnhofer). + * Suppress warnings about Windows runtime library files (Nick Wellnhofer). + Visual Studio Express editions do not include the redistributable files. + Set `CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS_NO_WARNINGS` to suppress warnings. + * Added AppVeyor: Windows continuous integration (`appveyor.yml`). + * Use `os.path.join` in `test/cmark.py` for proper cross-platform paths. + * Fixed `Makefile.nmake`. + * Improved `make afl`: added `test/afl_dictionary`, increased timeout + for hangs. + * Improved README with a description of the library's strengths. + * Pass-through Unicode non-characters (Nick Wellnhofer). + Despite their name, Unicode non-characters are valid code points. They + should be passed through by a library like libcmark. + * Check return status of `utf8proc_iterate` (#27). + [0.18.3] * Include patch level in soname (Nick Wellnhofer). 
Minor version is diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 06c13e0..716b97b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -67,8 +67,8 @@ set_target_properties(${PROGRAM} PROPERTIES COMPILE_FLAGS -DCMARK_STATIC_DEFINE) # Check integrity of node structure when compiled as debug: -set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -pg -DCMARK_DEBUG_NODES") -set(CMAKE_LINKER_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG} -pg") +set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -DCMARK_DEBUG_NODES") +set(CMAKE_LINKER_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG}") set(CMAKE_C_FLAGS_PROFILE "${CMAKE_C_FLAGS_RELEASE} -pg") set(CMAKE_LINKER_PROFILE "${CMAKE_LINKER_FLAGS_RELEASE} -pg") @@ -76,7 +76,7 @@ set(CMAKE_LINKER_PROFILE "${CMAKE_LINKER_FLAGS_RELEASE} -pg") if (${CMAKE_MAJOR_VERSION} GREATER 1 AND ${CMAKE_MINOR_VERSION} GREATER 8) set(CMAKE_C_VISIBILITY_PRESET hidden) set(CMAKE_VISIBILITY_INLINES_HIDDEN 1) -elseif(CMAKE_COMPILER_IS_GNUCC OR "${CMAKE_C_COMPILER_ID}" STREQUAL "Clang") +elseif(CMAKE_COMPILER_IS_GNUCC OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden") endif () @@ -87,6 +87,8 @@ set_target_properties(${LIBRARY} PROPERTIES OUTPUT_NAME "cmark" SOVERSION ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH} VERSION ${PROJECT_VERSION}) +set_target_properties(${STATICLIBRARY} PROPERTIES + COMPILE_FLAGS -DCMARK_STATIC_DEFINE) if (MSVC) set_target_properties(${STATICLIBRARY} PROPERTIES @@ -109,6 +111,7 @@ if (MSVC) APPEND PROPERTY LINK_FLAGS /INCREMENTAL:NO) endif(MSVC) +set(CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS_NO_WARNINGS ON) include (InstallRequiredSystemLibraries) install(TARGETS ${PROGRAM} ${LIBRARY} RUNTIME DESTINATION bin @@ -125,6 +128,7 @@ install(FILES # Feature tests include(CheckIncludeFile) include(CheckCSourceCompiles) +include(CheckCSourceRuns) include(CheckSymbolExists) CHECK_INCLUDE_FILE(stdbool.h HAVE_STDBOOL_H) CHECK_C_SOURCE_COMPILES( @@ -134,6 +138,10 @@ 
CHECK_C_SOURCE_COMPILES(" int f(void) __attribute__ (()); int main() { return 0; } " HAVE___ATTRIBUTE__) +CHECK_C_SOURCE_RUNS(" + #include <stdio.h> + int main() { return snprintf(NULL, 0, \"123\") == 3 ? 0 : 1; } +" HAVE_C99_SNPRINTF) CHECK_SYMBOL_EXISTS(va_copy stdarg.h HAVE_VA_COPY) CONFIGURE_FILE( @@ -161,3 +169,7 @@ endif() if($ENV{TIMER}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTIMER=1") endif($ENV{TIMER}) + +if(CMAKE_BUILD_TYPE STREQUAL "Ubsan") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined") +endif() diff --git a/src/blocks.c b/src/blocks.c index 777356a..8ae452e 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -554,6 +554,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, size_t bytes) bool blank = false; int first_nonspace; int indent; + bool indented; cmark_chunk input; bool maybe_lazy; @@ -690,11 +691,11 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, size_t bytes) first_nonspace++; indent = first_nonspace - offset; + indented = indent >= CODE_INDENT; blank = peek_at(&input, first_nonspace) == '\n' || peek_at(&input, first_nonspace) == '\r'; - if (indent >= CODE_INDENT) { - if (!maybe_lazy && !blank) { + if (indented && !maybe_lazy && !blank) { offset += CODE_INDENT; container = add_child(parser, container, NODE_CODE_BLOCK, offset + 1); container->as.code.fenced = false; @@ -702,11 +703,8 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, size_t bytes) container->as.code.fence_length = 0; container->as.code.fence_offset = 0; container->as.code.info = cmark_chunk_literal(""); - } else { // indent > 4 in lazy line - break; - } - } else if (peek_at(&input, first_nonspace) == '>') { + } else if (!indented && peek_at(&input, first_nonspace) == '>') { offset = first_nonspace + 1; // optional following character @@ -714,7 +712,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, size_t bytes) offset++; container = add_child(parser, container, NODE_BLOCK_QUOTE, offset + 1); - 
} else if ((matched = scan_atx_header_start(&input, first_nonspace))) { + } else if (!indented && (matched = scan_atx_header_start(&input, first_nonspace))) { offset = first_nonspace + matched; container = add_child(parser, container, NODE_HEADER, offset + 1); @@ -729,7 +727,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, size_t bytes) container->as.header.level = level; container->as.header.setext = false; - } else if ((matched = scan_open_code_fence(&input, first_nonspace))) { + } else if (!indented && (matched = scan_open_code_fence(&input, first_nonspace))) { container = add_child(parser, container, NODE_CODE_BLOCK, first_nonspace + 1); container->as.code.fenced = true; @@ -739,12 +737,13 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, size_t bytes) container->as.code.info = cmark_chunk_literal(""); offset = first_nonspace + matched; - } else if ((matched = scan_html_block_tag(&input, first_nonspace))) { + } else if (!indented && (matched = scan_html_block_tag(&input, first_nonspace))) { container = add_child(parser, container, NODE_HTML, first_nonspace + 1); // note, we don't adjust offset because the tag is part of the text - } else if (container->type == NODE_PARAGRAPH && + } else if (!indented && + container->type == NODE_PARAGRAPH && (lev = scan_setext_header_line(&input, first_nonspace)) && // check that there is only one line in the paragraph: (cmark_strbuf_strrchr(&container->string_content, '\n', @@ -757,7 +756,9 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, size_t bytes) container->as.header.setext = true; offset = input.len - 1; - } else if (!(container->type == NODE_PARAGRAPH && !all_matched) && + } else if (!indented && + !(container->type == NODE_PARAGRAPH && + !all_matched) && (matched = scan_hrule(&input, first_nonspace))) { // it's only now that we know the line is not part of a setext header: diff --git a/src/buffer.c b/src/buffer.c index 5ec8b49..2b7f062 100644 --- 
a/src/buffer.c +++ b/src/buffer.c @@ -175,6 +175,12 @@ int cmark_strbuf_vprintf(cmark_strbuf *buf, const char *format, va_list ap) buf->asize - buf->size, format, args ); +#ifndef HAVE_C99_SNPRINTF + // Assume we're on Windows. + if (len < 0) { + len = _vscprintf(format, args); + } +#endif va_end(args); diff --git a/src/chunk.h b/src/chunk.h index 54c4b16..4bb4980 100644 --- a/src/chunk.h +++ b/src/chunk.h @@ -7,6 +7,8 @@ #include "cmark_ctype.h" #include "buffer.h" +#define CMARK_CHUNK_EMPTY { NULL, 0, 0 } + typedef struct { unsigned char *data; int len; @@ -64,7 +66,9 @@ static inline const char *cmark_chunk_to_cstr(cmark_chunk *c) } str = (unsigned char *)malloc(c->len + 1); if(str != NULL) { - memcpy(str, c->data, c->len); + if(c->len > 0){ + memcpy(str, c->data, c->len); + } str[c->len] = 0; } c->data = str; @@ -78,10 +82,17 @@ static inline void cmark_chunk_set_cstr(cmark_chunk *c, const char *str) if (c->alloc) { free(c->data); } - c->len = strlen(str); - c->data = (unsigned char *)malloc(c->len + 1); - c->alloc = 1; - memcpy(c->data, str, c->len + 1); + if (str == NULL) { + c->len = 0; + c->data = NULL; + c->alloc = 0; + } + else { + c->len = strlen(str); + c->data = (unsigned char *)malloc(c->len + 1); + c->alloc = 1; + memcpy(c->data, str, c->len + 1); + } } static inline cmark_chunk cmark_chunk_literal(const char *data) diff --git a/src/commonmark.c b/src/commonmark.c index bef92f6..47da191 100644 --- a/src/commonmark.c +++ b/src/commonmark.c @@ -49,35 +49,35 @@ typedef enum { static inline bool needs_escaping(escaping escape, - int32_t c, - unsigned char next_c, - struct render_state *state) + int32_t c, + unsigned char next_c, + struct render_state *state) { if (escape == NORMAL) { return (c == '*' || c == '_' || c == '[' || c == ']' || - c == '<' || c == '>' || c == '\\' || c == '`' || - (c == '&' && isalpha(next_c)) || - (c == '!' 
&& next_c == '[') || - (state->begin_line && - (c == '-' || c == '+' || c == '#' || c == '=')) || - (c == '#' && (isspace(next_c) || next_c == '\0')) || - ((c == '.' || c == ')') && - isdigit(state->buffer->ptr[state->buffer->size - 1]))); + c == '<' || c == '>' || c == '\\' || c == '`' || + (c == '&' && isalpha(next_c)) || + (c == '!' && next_c == '[') || + (state->begin_line && + (c == '-' || c == '+' || c == '#' || c == '=')) || + (c == '#' && (isspace(next_c) || next_c == '\0')) || + ((c == '.' || c == ')') && + isdigit(state->buffer->ptr[state->buffer->size - 1]))); } else if (escape == TITLE) { return (c == '`' || c == '<' || c == '>' || c == '"' || - c == '\\'); + c == '\\'); } else if (escape == URL) { return (c == '`' || c == '<' || c == '>' || isspace(c) || - c == '\\' || c == ')' || c == '('); + c == '\\' || c == ')' || c == '('); } else { return false; } } static inline void out(struct render_state *state, - cmark_chunk str, - bool wrap, - escaping escape) + cmark_chunk str, + bool wrap, + escaping escape) { unsigned char* source = str.data; int length = str.len; @@ -100,7 +100,7 @@ static inline void out(struct render_state *state, cmark_strbuf_putc(state->buffer, '\n'); if (state->need_cr > 1) { cmark_strbuf_put(state->buffer, state->prefix->ptr, - state->prefix->size); + state->prefix->size); } } state->column = 0; @@ -111,12 +111,15 @@ static inline void out(struct render_state *state, while (i < length) { if (state->begin_line) { cmark_strbuf_put(state->buffer, state->prefix->ptr, - state->prefix->size); + state->prefix->size); // note: this assumes prefix is ascii: state->column = state->prefix->size; } len = utf8proc_iterate(source + i, length - i, &c); + if (len == -1) { // error condition + return; // return without rendering rest of string + } nextc = source[i + len]; if (c == 32 && wrap) { if (!state->begin_line) { @@ -124,7 +127,7 @@ static inline void out(struct render_state *state, state->column += 1; state->begin_line = false; 
state->last_breakable = state->buffer->size - - 1; + 1; // skip following spaces while (source[i + 1] == ' ') { i++; @@ -167,7 +170,7 @@ static inline void out(struct render_state *state, // add newline, prefix, and remainder cmark_strbuf_putc(state->buffer, '\n'); cmark_strbuf_put(state->buffer, state->prefix->ptr, - state->prefix->size); + state->prefix->size); cmark_strbuf_put(state->buffer, remainder.data, remainder.len); state->column = state->prefix->size + remainder.len; cmark_chunk_free(&remainder); @@ -236,6 +239,7 @@ is_autolink(cmark_node *node) { const char *title; const char *url; + cmark_node *link_text; if (node->type != CMARK_NODE_LINK) { return false; @@ -252,10 +256,13 @@ is_autolink(cmark_node *node) if (title != NULL && strlen(title) > 0) { return false; } - cmark_consolidate_text_nodes(node); - return (strncmp(url, - (char*)node->as.literal.data, - node->as.literal.len) == 0); + + link_text = node->first_child; + cmark_consolidate_text_nodes(link_text); + return ((int)strlen(url) == link_text->as.literal.len && + strncmp(url, + (char*)link_text->as.literal.data, + link_text->as.literal.len) == 0); } // if node is a block node, returns node. @@ -265,7 +272,7 @@ get_containing_block(cmark_node *node) { while (node && (node->type < CMARK_NODE_FIRST_BLOCK || - node->type > CMARK_NODE_LAST_BLOCK)) { + node->type > CMARK_NODE_LAST_BLOCK)) { node = node->parent; } return node; @@ -293,14 +300,14 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, // a following list. 
if (!(node->type == CMARK_NODE_ITEM && node->prev == NULL && entering)) { - tmp = get_containing_block(node); - state->in_tight_list_item = - (tmp->type == CMARK_NODE_ITEM && - cmark_node_get_list_tight(tmp->parent)) || - (tmp && - tmp->parent && - tmp->parent->type == CMARK_NODE_ITEM && - cmark_node_get_list_tight(tmp->parent->parent)); + tmp = get_containing_block(node); + state->in_tight_list_item = + (tmp->type == CMARK_NODE_ITEM && + cmark_node_get_list_tight(tmp->parent)) || + (tmp && + tmp->parent && + tmp->parent->type == CMARK_NODE_ITEM && + cmark_node_get_list_tight(tmp->parent->parent)); } switch (node->type) { @@ -316,7 +323,7 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, cmark_strbuf_puts(state->prefix, "> "); } else { cmark_strbuf_truncate(state->prefix, - state->prefix->size - 2); + state->prefix->size - 2); blankline(state); } break; @@ -348,10 +355,10 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, // we get nice transition from single digits // to double cmark_strbuf_printf(&listmarker, - "%d%s%s", list_number, - list_delim == CMARK_PAREN_DELIM ? - ")" : ".", - list_number < 10 ? " " : " "); + "%d%s%s", list_number, + list_delim == CMARK_PAREN_DELIM ? + ")" : ".", + list_number < 10 ? 
" " : " "); marker_width = listmarker.size; } if (entering) { @@ -361,14 +368,14 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, cmark_strbuf_puts(state->prefix, " "); } else { lit(state, (char *)listmarker.ptr, false); - for (i=marker_width; i--;) { + for (i = marker_width; i--;) { cmark_strbuf_putc(state->prefix, ' '); } } } else { cmark_strbuf_truncate(state->prefix, - state->prefix->size - - marker_width); + state->prefix->size - + marker_width); cr(state); } cmark_strbuf_free(&listmarker); @@ -405,7 +412,7 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, cmark_strbuf_puts(state->prefix, " "); out(state, node->as.code.literal, false, LITERAL); cmark_strbuf_truncate(state->prefix, - state->prefix->size - 4); + state->prefix->size - 4); } else { numticks = longest_backtick_sequence(code) + 1; if (numticks < 3) { @@ -514,7 +521,7 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, if (entering) { lit(state, "<", false); if (strncmp(cmark_node_get_url(node), - "mailto:", 7) == 0) { + "mailto:", 7) == 0) { lit(state, (char *)cmark_node_get_url(node) + 7, false); @@ -579,9 +586,10 @@ char *cmark_render_commonmark(cmark_node *root, int options, int width) if (CMARK_OPT_HARDBREAKS & options) { width = 0; } - struct render_state state = - { options, &commonmark, &prefix, 0, width, - 0, 0, true, false, false}; + struct render_state state = { + options, &commonmark, &prefix, 0, width, + 0, 0, true, false, false + }; cmark_node *cur; cmark_event_type ev_type; cmark_iter *iter = cmark_iter_new(root); diff --git a/src/config.h.in b/src/config.h.in index c1e9597..5960928 100644 --- a/src/config.h.in +++ b/src/config.h.in @@ -21,3 +21,5 @@ #ifndef HAVE_VA_COPY #define va_copy(dest, src) ((dest) = (src)) #endif + +#cmakedefine HAVE_C99_SNPRINTF diff --git a/src/houdini_html_u.c b/src/houdini_html_u.c index 2cb14b4..eaf295e 100644 --- a/src/houdini_html_u.c +++ b/src/houdini_html_u.c @@ -12,32 +12,45 @@ houdini_unescape_ent(cmark_strbuf *ob, 
const uint8_t *src, size_t size) { size_t i = 0; - if (size > 3 && src[0] == '#') { - int codepoint = 0; + if (size >= 3 && src[0] == '#') { + int codepoint = 0; + int num_digits = 0; if (_isdigit(src[1])) { for (i = 1; i < size && _isdigit(src[i]); ++i) { - int cp = (codepoint * 10) + (src[i] - '0'); + codepoint = (codepoint * 10) + (src[i] - '0'); - if (cp < codepoint) - return 0; - - codepoint = cp; + if (codepoint >= 0x110000) { + // Keep counting digits but + // avoid integer overflow. + codepoint = 0x110000; + } } + + num_digits = i - 1; } else if (src[1] == 'x' || src[1] == 'X') { for (i = 2; i < size && _isxdigit(src[i]); ++i) { - int cp = (codepoint * 16) + ((src[i] | 32) % 39 - 9); + codepoint = (codepoint * 16) + ((src[i] | 32) % 39 - 9); - if (cp < codepoint) - return 0; - - codepoint = cp; + if (codepoint >= 0x110000) { + // Keep counting digits but + // avoid integer overflow. + codepoint = 0x110000; + } } + + num_digits = i - 2; } - if (i < size && src[i] == ';' && codepoint) { + if (num_digits >= 1 && num_digits <= 8 && + i < size && src[i] == ';') { + if (codepoint == 0 || + (codepoint >= 0xD800 && codepoint < 0xE000) || + codepoint >= 0x110000) { + codepoint = 0xFFFD; + } utf8proc_encode_char(codepoint, ob); return i + 1; } @@ -261,12 +261,13 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, case CMARK_NODE_LINK: if (entering) { cmark_strbuf_puts(html, "<a href=\""); - if (node->as.link.url) - escape_href(html, node->as.link.url, -1); + escape_href(html, node->as.link.url.data, + node->as.link.url.len); - if (node->as.link.title) { + if (node->as.link.title.len) { cmark_strbuf_puts(html, "\" title=\""); - escape_html(html, node->as.link.title, -1); + escape_html(html, node->as.link.title.data, + node->as.link.title.len); } cmark_strbuf_puts(html, "\">"); @@ -278,15 +279,16 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, case CMARK_NODE_IMAGE: if (entering) { cmark_strbuf_puts(html, "<img src=\""); - if (node->as.link.url) - 
escape_href(html, node->as.link.url, -1); + escape_href(html, node->as.link.url.data, + node->as.link.url.len); cmark_strbuf_puts(html, "\" alt=\""); state->plain = node; } else { - if (node->as.link.title) { + if (node->as.link.title.len) { cmark_strbuf_puts(html, "\" title=\""); - escape_html(html, node->as.link.title, -1); + escape_html(html, node->as.link.title.data, + node->as.link.title.len); } cmark_strbuf_puts(html, "\" />"); diff --git a/src/inlines.c b/src/inlines.c index 7175327..232fc10 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -59,31 +59,33 @@ static void subject_from_buf(subject *e, cmark_strbuf *buffer, cmark_reference_map *refmap); static int subject_find_special_char(subject *subj, int options); -static unsigned char *cmark_clean_autolink(cmark_chunk *url, int is_email) +static cmark_chunk cmark_clean_autolink(cmark_chunk *url, int is_email) { cmark_strbuf buf = GH_BUF_INIT; cmark_chunk_trim(url); - if (url->len == 0) - return NULL; + if (url->len == 0) { + cmark_chunk result = CMARK_CHUNK_EMPTY; + return result; + } if (is_email) cmark_strbuf_puts(&buf, "mailto:"); houdini_unescape_html_f(&buf, url->data, url->len); - return cmark_strbuf_detach(&buf); + return cmark_chunk_buf_detach(&buf); } -static inline cmark_node *make_link(cmark_node *label, unsigned char *url, unsigned char *title) +static inline cmark_node *make_link(cmark_node *label, cmark_chunk *url, cmark_chunk *title) { cmark_node* e = (cmark_node *)calloc(1, sizeof(*e)); if(e != NULL) { e->type = CMARK_NODE_LINK; e->first_child = label; e->last_child = label; - e->as.link.url = url; - e->as.link.title = title; + e->as.link.url = *url; + e->as.link.title = *title; e->next = NULL; label->parent = e; } @@ -92,7 +94,9 @@ static inline cmark_node *make_link(cmark_node *label, unsigned char *url, unsig static inline cmark_node* make_autolink(cmark_node* label, cmark_chunk url, int is_email) { - return make_link(label, cmark_clean_autolink(&url, is_email), NULL); + cmark_chunk 
clean_url = cmark_clean_autolink(&url, is_email); + cmark_chunk title = CMARK_CHUNK_EMPTY; + return make_link(label, &clean_url, &title); } // Create an inline with a literal string value. @@ -134,19 +138,20 @@ static inline cmark_node* make_simple(cmark_node_type t) return e; } -static unsigned char *bufdup(const unsigned char *buf) +// Duplicate a chunk by creating a copy of the buffer not by reusing the +// buffer like cmark_chunk_dup does. +static cmark_chunk chunk_clone(cmark_chunk *src) { - unsigned char *new_buf = NULL; + cmark_chunk c; + int len = src->len; - if (buf) { - int len = strlen((char *)buf); - new_buf = (unsigned char *)calloc(len + 1, sizeof(*new_buf)); - if(new_buf != NULL) { - memcpy(new_buf, buf, len + 1); - } - } + c.len = len; + c.data = (unsigned char *)malloc(len + 1); + c.alloc = 1; + memcpy(c.data, src->data, len); + c.data[len] = '\0'; - return new_buf; + return c; } static void subject_from_buf(subject *e, cmark_strbuf *buffer, @@ -301,8 +306,10 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close) !utf8proc_is_space(after_char) && !utf8proc_is_punctuation(after_char)); if (c == '_') { - *can_open = left_flanking && !right_flanking; - *can_close = right_flanking && !left_flanking; + *can_open = left_flanking && + (!right_flanking || utf8proc_is_punctuation(before_char)); + *can_close = right_flanking && + (!left_flanking || utf8proc_is_punctuation(after_char)); } else if (c == '\'' || c == '"') { *can_open = left_flanking && !right_flanking; *can_close = right_flanking; @@ -620,14 +627,16 @@ static cmark_node *make_str_with_entities(cmark_chunk *content) // Clean a URL: remove surrounding whitespace and surrounding <>, // and remove \ that escape punctuation. 
-unsigned char *cmark_clean_url(cmark_chunk *url) +cmark_chunk cmark_clean_url(cmark_chunk *url) { cmark_strbuf buf = GH_BUF_INIT; cmark_chunk_trim(url); - if (url->len == 0) - return NULL; + if (url->len == 0) { + cmark_chunk result = CMARK_CHUNK_EMPTY; + return result; + } if (url->data[0] == '<' && url->data[url->len - 1] == '>') { houdini_unescape_html_f(&buf, url->data + 1, url->len - 2); @@ -636,16 +645,18 @@ unsigned char *cmark_clean_url(cmark_chunk *url) } cmark_strbuf_unescape(&buf); - return buf.size == 0 ? NULL : cmark_strbuf_detach(&buf); + return cmark_chunk_buf_detach(&buf); } -unsigned char *cmark_clean_title(cmark_chunk *title) +cmark_chunk cmark_clean_title(cmark_chunk *title) { cmark_strbuf buf = GH_BUF_INIT; unsigned char first, last; - if (title->len == 0) - return NULL; + if (title->len == 0) { + cmark_chunk result = CMARK_CHUNK_EMPTY; + return result; + } first = title->data[0]; last = title->data[title->len - 1]; @@ -660,7 +671,7 @@ unsigned char *cmark_clean_title(cmark_chunk *title) } cmark_strbuf_unescape(&buf); - return buf.size == 0 ? NULL : cmark_strbuf_detach(&buf); + return cmark_chunk_buf_detach(&buf); } // Parse an autolink or HTML tag. 
@@ -764,7 +775,7 @@ static cmark_node* handle_close_bracket(subject* subj, cmark_node *parent) cmark_reference *ref; bool is_image = false; cmark_chunk url_chunk, title_chunk; - unsigned char *url, *title; + cmark_chunk url, title; delimiter *opener; cmark_node *link_text; cmark_node *inl; @@ -852,8 +863,8 @@ static cmark_node* handle_close_bracket(subject* subj, cmark_node *parent) cmark_chunk_free(&raw_label); if (ref != NULL) { // found - url = bufdup(ref->url); - title = bufdup(ref->title); + url = chunk_clone(&ref->url); + title = chunk_clone(&ref->title); goto match; } else { goto noMatch; diff --git a/src/inlines.h b/src/inlines.h index 9e56790..534588e 100644 --- a/src/inlines.h +++ b/src/inlines.h @@ -5,8 +5,8 @@ extern "C" { #endif -unsigned char *cmark_clean_url(cmark_chunk *url); -unsigned char *cmark_clean_title(cmark_chunk *title); +cmark_chunk cmark_clean_url(cmark_chunk *url); +cmark_chunk cmark_clean_title(cmark_chunk *title); void cmark_parse_inlines(cmark_node* parent, cmark_reference_map *refmap, int options); diff --git a/src/iterator.c b/src/iterator.c index c6faf99..f18e3bf 100644 --- a/src/iterator.c +++ b/src/iterator.c @@ -129,18 +129,20 @@ void cmark_consolidate_text_nodes(cmark_node *root) cur->next && cur->next->type == CMARK_NODE_TEXT) { cmark_strbuf_clear(&buf); - cmark_strbuf_puts(&buf, cmark_node_get_literal(cur)); + cmark_strbuf_put(&buf, cur->as.literal.data, cur->as.literal.len); tmp = cur->next; while (tmp && tmp->type == CMARK_NODE_TEXT) { cmark_iter_next(iter); // advance pointer - cmark_strbuf_puts(&buf, cmark_node_get_literal(tmp)); + cmark_strbuf_put(&buf, tmp->as.literal.data, tmp->as.literal.len); next = tmp->next; cmark_node_free(tmp); tmp = next; } - cmark_node_set_literal(cur, (char *)cmark_strbuf_detach(&buf)); + cmark_chunk_free(&cur->as.literal); + cur->as.literal = cmark_chunk_buf_detach(&buf); } } + cmark_strbuf_free(&buf); cmark_iter_free(iter); } @@ -102,13 +102,13 @@ int main(int argc, char *argv[]) width = 
(int)strtol(argv[i], &unparsed, 10); if (unparsed && strlen(unparsed) > 0) { fprintf(stderr, - "failed parsing width '%s' at '%s'\n", - argv[i], unparsed); + "failed parsing width '%s' at '%s'\n", + argv[i], unparsed); exit(1); } } else { fprintf(stderr, - "--width requires an argument\n"); + "--width requires an argument\n"); exit(1); } } else if ((strcmp(argv[i], "-t") == 0) || @@ -20,6 +20,9 @@ static void escape_man(cmark_strbuf *dest, const unsigned char *source, int leng while (i < length) { len = utf8proc_iterate(source + i, length - i, &c); + if (len == -1) { // error condition + return; // return without rendering anything + } switch(c) { case 46: if (beginLine) { @@ -122,12 +122,8 @@ void S_free_nodes(cmark_node *e) break; case NODE_LINK: case NODE_IMAGE: - if (e->as.link.url) { - free(e->as.link.url); - } - if (e->as.link.title) { - free(e->as.link.title); - } + cmark_chunk_free(&e->as.link.url); + cmark_chunk_free(&e->as.link.title); break; default: break; @@ -282,15 +278,6 @@ cmark_node_set_user_data(cmark_node *node, void *user_data) return 1; } -static char* -S_strdup(const char *str) -{ - size_t size = strlen(str) + 1; - char *dup = (char *)malloc(size); - memcpy(dup, str, size); - return dup; -} - const char* cmark_node_get_literal(cmark_node *node) { @@ -541,7 +528,7 @@ cmark_node_get_url(cmark_node *node) switch (node->type) { case NODE_LINK: case NODE_IMAGE: - return (char *)node->as.link.url; + return cmark_chunk_to_cstr(&node->as.link.url); default: break; } @@ -559,8 +546,7 @@ cmark_node_set_url(cmark_node *node, const char *url) switch (node->type) { case NODE_LINK: case NODE_IMAGE: - free(node->as.link.url); - node->as.link.url = (unsigned char *)S_strdup(url); + cmark_chunk_set_cstr(&node->as.link.url, url); return 1; default: break; @@ -579,7 +565,7 @@ cmark_node_get_title(cmark_node *node) switch (node->type) { case NODE_LINK: case NODE_IMAGE: - return (char *)node->as.link.title; + return cmark_chunk_to_cstr(&node->as.link.title); 
default: break; } @@ -597,8 +583,7 @@ cmark_node_set_title(cmark_node *node, const char *title) switch (node->type) { case NODE_LINK: case NODE_IMAGE: - free(node->as.link.title); - node->as.link.title = (unsigned char *)S_strdup(title); + cmark_chunk_set_cstr(&node->as.link.title, title); return 1; default: break; @@ -38,8 +38,8 @@ typedef struct { } cmark_header; typedef struct { - unsigned char *url; - unsigned char *title; + cmark_chunk url; + cmark_chunk title; } cmark_link; struct cmark_node { diff --git a/src/references.c b/src/references.c index 37bf4cb..1d3d56d 100644 --- a/src/references.c +++ b/src/references.c @@ -20,8 +20,8 @@ static void reference_free(cmark_reference *ref) { if(ref != NULL) { free(ref->label); - free(ref->url); - free(ref->title); + cmark_chunk_free(&ref->url); + cmark_chunk_free(&ref->title); free(ref); } } diff --git a/src/references.h b/src/references.h index 69325bb..a360cd5 100644 --- a/src/references.h +++ b/src/references.h @@ -12,8 +12,8 @@ extern "C" { struct cmark_reference { struct cmark_reference *next; unsigned char *label; - unsigned char *url; - unsigned char *title; + cmark_chunk url; + cmark_chunk title; unsigned int hash; }; @@ -172,8 +172,7 @@ int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst) case 3: uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6) + (str[2] & 0x3F); - if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) || - (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1; + if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000)) uc = -1; break; case 4: uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12) @@ -182,7 +181,7 @@ int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst) break; } - if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE)) + if (uc < 0) return -1; *dst = uc; @@ -118,10 +118,12 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, case CMARK_NODE_LINK: case CMARK_NODE_IMAGE: cmark_strbuf_puts(xml, " destination=\""); - escape_xml(xml, node->as.link.url, -1); + escape_xml(xml, 
node->as.link.url.data, + node->as.link.url.len); cmark_strbuf_putc(xml, '"'); cmark_strbuf_puts(xml, " title=\""); - escape_xml(xml, node->as.link.title, -1); + escape_xml(xml, node->as.link.title.data, + node->as.link.title.len); cmark_strbuf_putc(xml, '"'); break; default: diff --git a/test/afl_dictionary/asterisk b/test/afl_dictionary/asterisk new file mode 100644 index 0000000..f59ec20 --- /dev/null +++ b/test/afl_dictionary/asterisk @@ -0,0 +1 @@ +*
\ No newline at end of file diff --git a/test/afl_dictionary/attr_generic b/test/afl_dictionary/attr_generic new file mode 100644 index 0000000..d84e4b2 --- /dev/null +++ b/test/afl_dictionary/attr_generic @@ -0,0 +1 @@ + a="1"
\ No newline at end of file diff --git a/test/afl_dictionary/attr_href b/test/afl_dictionary/attr_href new file mode 100644 index 0000000..cbb9775 --- /dev/null +++ b/test/afl_dictionary/attr_href @@ -0,0 +1 @@ + href="1"
\ No newline at end of file diff --git a/test/afl_dictionary/attr_xml_lang b/test/afl_dictionary/attr_xml_lang new file mode 100644 index 0000000..6dab3e9 --- /dev/null +++ b/test/afl_dictionary/attr_xml_lang @@ -0,0 +1 @@ + xml:lang="1"
\ No newline at end of file diff --git a/test/afl_dictionary/attr_xmlns b/test/afl_dictionary/attr_xmlns new file mode 100644 index 0000000..168863a --- /dev/null +++ b/test/afl_dictionary/attr_xmlns @@ -0,0 +1 @@ + xmlns="1"
\ No newline at end of file diff --git a/test/afl_dictionary/backslash b/test/afl_dictionary/backslash new file mode 100644 index 0000000..b7d5379 --- /dev/null +++ b/test/afl_dictionary/backslash @@ -0,0 +1 @@ +\
\ No newline at end of file diff --git a/test/afl_dictionary/backtick b/test/afl_dictionary/backtick new file mode 100644 index 0000000..64845fb --- /dev/null +++ b/test/afl_dictionary/backtick @@ -0,0 +1 @@ +`
\ No newline at end of file diff --git a/test/afl_dictionary/colon b/test/afl_dictionary/colon new file mode 100644 index 0000000..22ded55 --- /dev/null +++ b/test/afl_dictionary/colon @@ -0,0 +1 @@ +:
\ No newline at end of file diff --git a/test/afl_dictionary/dashes b/test/afl_dictionary/dashes new file mode 100644 index 0000000..73b314f --- /dev/null +++ b/test/afl_dictionary/dashes @@ -0,0 +1 @@ +---
\ No newline at end of file diff --git a/test/afl_dictionary/double_quote b/test/afl_dictionary/double_quote new file mode 100644 index 0000000..9d68933 --- /dev/null +++ b/test/afl_dictionary/double_quote @@ -0,0 +1 @@ +"
\ No newline at end of file diff --git a/test/afl_dictionary/entity_builtin b/test/afl_dictionary/entity_builtin new file mode 100644 index 0000000..1489a83 --- /dev/null +++ b/test/afl_dictionary/entity_builtin @@ -0,0 +1 @@ +<
\ No newline at end of file diff --git a/test/afl_dictionary/entity_decimal b/test/afl_dictionary/entity_decimal new file mode 100644 index 0000000..7b997f6 --- /dev/null +++ b/test/afl_dictionary/entity_decimal @@ -0,0 +1 @@ +
\ No newline at end of file diff --git a/test/afl_dictionary/entity_external b/test/afl_dictionary/entity_external new file mode 100644 index 0000000..f626a66 --- /dev/null +++ b/test/afl_dictionary/entity_external @@ -0,0 +1 @@ +&a;
\ No newline at end of file diff --git a/test/afl_dictionary/entity_hex b/test/afl_dictionary/entity_hex new file mode 100644 index 0000000..8766028 --- /dev/null +++ b/test/afl_dictionary/entity_hex @@ -0,0 +1 @@ +
\ No newline at end of file diff --git a/test/afl_dictionary/equals b/test/afl_dictionary/equals new file mode 100644 index 0000000..7193984 --- /dev/null +++ b/test/afl_dictionary/equals @@ -0,0 +1 @@ +===
\ No newline at end of file diff --git a/test/afl_dictionary/exclamation b/test/afl_dictionary/exclamation new file mode 100644 index 0000000..74e0f12 --- /dev/null +++ b/test/afl_dictionary/exclamation @@ -0,0 +1 @@ +!
\ No newline at end of file diff --git a/test/afl_dictionary/greater_than b/test/afl_dictionary/greater_than new file mode 100644 index 0000000..0817502 --- /dev/null +++ b/test/afl_dictionary/greater_than @@ -0,0 +1 @@ +>
\ No newline at end of file diff --git a/test/afl_dictionary/hash b/test/afl_dictionary/hash new file mode 100644 index 0000000..4287ca8 --- /dev/null +++ b/test/afl_dictionary/hash @@ -0,0 +1 @@ +#
\ No newline at end of file diff --git a/test/afl_dictionary/hyphen b/test/afl_dictionary/hyphen new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/test/afl_dictionary/hyphen diff --git a/test/afl_dictionary/indent b/test/afl_dictionary/indent new file mode 100644 index 0000000..136d063 --- /dev/null +++ b/test/afl_dictionary/indent @@ -0,0 +1 @@ +
\ No newline at end of file diff --git a/test/afl_dictionary/left_bracket b/test/afl_dictionary/left_bracket new file mode 100644 index 0000000..8e2f0be --- /dev/null +++ b/test/afl_dictionary/left_bracket @@ -0,0 +1 @@ +[
\ No newline at end of file diff --git a/test/afl_dictionary/left_paren b/test/afl_dictionary/left_paren new file mode 100644 index 0000000..f46d387 --- /dev/null +++ b/test/afl_dictionary/left_paren @@ -0,0 +1 @@ +(
\ No newline at end of file diff --git a/test/afl_dictionary/less_than b/test/afl_dictionary/less_than new file mode 100644 index 0000000..c5fa784 --- /dev/null +++ b/test/afl_dictionary/less_than @@ -0,0 +1 @@ +<
\ No newline at end of file diff --git a/test/afl_dictionary/plus b/test/afl_dictionary/plus new file mode 100644 index 0000000..9b26e9b --- /dev/null +++ b/test/afl_dictionary/plus @@ -0,0 +1 @@ ++
\ No newline at end of file diff --git a/test/afl_dictionary/right_bracket b/test/afl_dictionary/right_bracket new file mode 100644 index 0000000..54caf60 --- /dev/null +++ b/test/afl_dictionary/right_bracket @@ -0,0 +1 @@ +]
\ No newline at end of file diff --git a/test/afl_dictionary/right_paren b/test/afl_dictionary/right_paren new file mode 100644 index 0000000..e8a0f87 --- /dev/null +++ b/test/afl_dictionary/right_paren @@ -0,0 +1 @@ +)
\ No newline at end of file diff --git a/test/afl_dictionary/single_quote b/test/afl_dictionary/single_quote new file mode 100644 index 0000000..ad2823b --- /dev/null +++ b/test/afl_dictionary/single_quote @@ -0,0 +1 @@ +'
\ No newline at end of file diff --git a/test/afl_dictionary/string_any b/test/afl_dictionary/string_any new file mode 100644 index 0000000..bcd7dd4 --- /dev/null +++ b/test/afl_dictionary/string_any @@ -0,0 +1 @@ +ANY
\ No newline at end of file diff --git a/test/afl_dictionary/string_brackets b/test/afl_dictionary/string_brackets new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/test/afl_dictionary/string_brackets @@ -0,0 +1 @@ +[]
\ No newline at end of file diff --git a/test/afl_dictionary/string_cdata b/test/afl_dictionary/string_cdata new file mode 100644 index 0000000..9d6d94e --- /dev/null +++ b/test/afl_dictionary/string_cdata @@ -0,0 +1 @@ +CDATA
\ No newline at end of file diff --git a/test/afl_dictionary/string_dashes b/test/afl_dictionary/string_dashes new file mode 100644 index 0000000..7489acc --- /dev/null +++ b/test/afl_dictionary/string_dashes @@ -0,0 +1 @@ +--
\ No newline at end of file diff --git a/test/afl_dictionary/string_empty_dblquotes b/test/afl_dictionary/string_empty_dblquotes new file mode 100644 index 0000000..3cc762b --- /dev/null +++ b/test/afl_dictionary/string_empty_dblquotes @@ -0,0 +1 @@ +""
\ No newline at end of file diff --git a/test/afl_dictionary/string_empty_quotes b/test/afl_dictionary/string_empty_quotes new file mode 100644 index 0000000..9423090 --- /dev/null +++ b/test/afl_dictionary/string_empty_quotes @@ -0,0 +1 @@ +''
\ No newline at end of file diff --git a/test/afl_dictionary/string_idrefs b/test/afl_dictionary/string_idrefs new file mode 100644 index 0000000..dd37f9c --- /dev/null +++ b/test/afl_dictionary/string_idrefs @@ -0,0 +1 @@ +IDREFS
\ No newline at end of file diff --git a/test/afl_dictionary/string_parentheses b/test/afl_dictionary/string_parentheses new file mode 100644 index 0000000..dd626a0 --- /dev/null +++ b/test/afl_dictionary/string_parentheses @@ -0,0 +1 @@ +()
\ No newline at end of file diff --git a/test/afl_dictionary/string_pcdata b/test/afl_dictionary/string_pcdata new file mode 100644 index 0000000..d2dd7f7 --- /dev/null +++ b/test/afl_dictionary/string_pcdata @@ -0,0 +1 @@ +#PCDATA
\ No newline at end of file diff --git a/test/afl_dictionary/tag_cdata b/test/afl_dictionary/tag_cdata new file mode 100644 index 0000000..fac6255 --- /dev/null +++ b/test/afl_dictionary/tag_cdata @@ -0,0 +1 @@ +<![CDATA[
\ No newline at end of file diff --git a/test/afl_dictionary/tag_close b/test/afl_dictionary/tag_close new file mode 100644 index 0000000..e8a17f4 --- /dev/null +++ b/test/afl_dictionary/tag_close @@ -0,0 +1 @@ +</a>
\ No newline at end of file diff --git a/test/afl_dictionary/tag_doctype b/test/afl_dictionary/tag_doctype new file mode 100644 index 0000000..b771752 --- /dev/null +++ b/test/afl_dictionary/tag_doctype @@ -0,0 +1 @@ +<!DOCTYPE
\ No newline at end of file diff --git a/test/afl_dictionary/tag_element b/test/afl_dictionary/tag_element new file mode 100644 index 0000000..04ad1f5 --- /dev/null +++ b/test/afl_dictionary/tag_element @@ -0,0 +1 @@ +<!ELEMENT
\ No newline at end of file diff --git a/test/afl_dictionary/tag_entity b/test/afl_dictionary/tag_entity new file mode 100644 index 0000000..ee9f1f3 --- /dev/null +++ b/test/afl_dictionary/tag_entity @@ -0,0 +1 @@ +<!ENTITY
\ No newline at end of file diff --git a/test/afl_dictionary/tag_notation b/test/afl_dictionary/tag_notation new file mode 100644 index 0000000..749f920 --- /dev/null +++ b/test/afl_dictionary/tag_notation @@ -0,0 +1 @@ +<!NOTATION
\ No newline at end of file diff --git a/test/afl_dictionary/tag_open b/test/afl_dictionary/tag_open new file mode 100644 index 0000000..6411313 --- /dev/null +++ b/test/afl_dictionary/tag_open @@ -0,0 +1 @@ +<a>
\ No newline at end of file diff --git a/test/afl_dictionary/tag_open_close b/test/afl_dictionary/tag_open_close new file mode 100644 index 0000000..4a12235 --- /dev/null +++ b/test/afl_dictionary/tag_open_close @@ -0,0 +1 @@ +<a />
\ No newline at end of file diff --git a/test/afl_dictionary/tag_open_exclamation b/test/afl_dictionary/tag_open_exclamation new file mode 100644 index 0000000..58adc03 --- /dev/null +++ b/test/afl_dictionary/tag_open_exclamation @@ -0,0 +1 @@ +<!
\ No newline at end of file diff --git a/test/afl_dictionary/tag_open_q b/test/afl_dictionary/tag_open_q new file mode 100644 index 0000000..2b4439c --- /dev/null +++ b/test/afl_dictionary/tag_open_q @@ -0,0 +1 @@ +<?
\ No newline at end of file diff --git a/test/afl_dictionary/tag_sq2_close b/test/afl_dictionary/tag_sq2_close new file mode 100644 index 0000000..facf683 --- /dev/null +++ b/test/afl_dictionary/tag_sq2_close @@ -0,0 +1 @@ +]]>
\ No newline at end of file diff --git a/test/afl_dictionary/tag_xml_q b/test/afl_dictionary/tag_xml_q new file mode 100644 index 0000000..be32990 --- /dev/null +++ b/test/afl_dictionary/tag_xml_q @@ -0,0 +1 @@ +<?xml?>
\ No newline at end of file diff --git a/test/afl_dictionary/underscore b/test/afl_dictionary/underscore new file mode 100644 index 0000000..c9cdc63 --- /dev/null +++ b/test/afl_dictionary/underscore @@ -0,0 +1 @@ +_
\ No newline at end of file diff --git a/test/cmark.py b/test/cmark.py index 40e8c22..1110860 100644 --- a/test/cmark.py +++ b/test/cmark.py @@ -4,6 +4,7 @@ from ctypes import CDLL, c_char_p, c_long from subprocess import * import platform +import os def pipe_through_prog(prog, text): p1 = Popen(prog.split(), stdout=PIPE, stdin=PIPE, stderr=PIPE) @@ -22,17 +23,16 @@ class CMark: self.to_html = lambda x: pipe_through_prog(prog, x) else: sysname = platform.system() - libname = "libcmark" if sysname == 'Darwin': - libname += ".dylib" + libname = "libcmark.dylib" elif sysname == 'Windows': libname = "cmark.dll" else: - libname += ".so" + libname = "libcmark.so" if library_dir: - libpath = library_dir + "/" + libname + libpath = os.path.join(library_dir, libname) else: - libpath = "build/src/" + libname + libpath = os.path.join("build", "src", libname) cmark = CDLL(libpath) markdown = cmark.cmark_markdown_to_html markdown.restype = c_char_p diff --git a/test/spec.txt b/test/spec.txt index ac47b1a..9b2b977 100644 --- a/test/spec.txt +++ b/test/spec.txt @@ -1,8 +1,8 @@ --- title: CommonMark Spec author: John MacFarlane -version: 0.18 -date: 2015-03-03 +version: 0.19 +date: 2015-04-27 license: '[CC-BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/)' ... @@ -192,8 +192,8 @@ an implementation without writing an abstract syntax tree renderer. This document is generated from a text file, `spec.txt`, written in Markdown with a small extension for the side-by-side tests. -The script `spec2md.pl` can be used to turn `spec.txt` into pandoc -Markdown, which can then be converted into other formats. +The script `tools/makespec.py` can be used to convert `spec.txt` into +HTML or CommonMark (which can then be converted into other formats). In the examples, the `→` character is used to represent tabs. @@ -569,8 +569,11 @@ prevent things like the following from being parsed as headers: . #5 bolt + +#foobar . <p>#5 bolt</p> +<p>#foobar</p> . 
This is not a header, because the first `#` is escaped: @@ -724,13 +727,14 @@ ATX headers can be empty: ## Setext headers A [setext header](@setext-header) -consists of a line of text, containing at least one -[non-space character], +consists of a line of text, containing at least one [non-space character], with no more than 3 spaces indentation, followed by a [setext header underline]. The line of text must be one that, were it not followed by the setext header underline, -would be interpreted as part of a paragraph: it cannot be a code -block, header, blockquote, horizontal rule, or list. +would be interpreted as part of a paragraph: it cannot be +interpretable as a [code fence], [ATX header][ATX headers], +[block quote][block quotes], [horizontal rule][horizontal rules], +[list item][list items], or [HTML block][HTML blocks]. A [setext header underline](@setext-header-underline) is a sequence of `=` characters or a sequence of `-` characters, with no more than 3 @@ -1811,7 +1815,7 @@ title], which if it is present must be separated from the [link destination] by [whitespace]. No further [non-space character]s may occur on the line. -A [link reference-definition] +A [link reference definition] does not correspond to a structural element of a document. Instead, it defines a label which can be used in [reference link]s and reference-style [images] elsewhere in the document. [Link @@ -2587,7 +2591,7 @@ The following rules define [list items]: 1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of blocks *Bs* starting with a [non-space character] and not separated from each other by more than one blank line, and *M* is a list - marker *M* of width *W* followed by 0 < *N* < 5 spaces, then the result + marker of width *W* followed by 0 < *N* < 5 spaces, then the result of prepending *M* and the following spaces to the first line of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a list item with *Bs* as its contents. 
The type of the list item @@ -2726,7 +2730,7 @@ this example: Here `two` occurs in the same column as the list marker `1.`, but is actually contained in the list item, because there is -sufficent indentation after the last containing blockquote marker. +sufficient indentation after the last containing blockquote marker. The converse is also possible. In the following example, the word `two` occurs far to the right of the initial text of the list item, `one`, but @@ -2852,7 +2856,7 @@ A list item may contain any kind of block: 2. **Item starting with indented code.** If a sequence of lines *Ls* constitute a sequence of blocks *Bs* starting with an indented code block and not separated from each other by more than one blank line, - and *M* is a list marker *M* of width *W* followed by + and *M* is a list marker of width *W* followed by one space, then the result of prepending *M* and the following space to the first line of *Ls*, and indenting subsequent lines of *Ls* by *W + 1* spaces, is a list item with *Bs* as its contents. @@ -3001,7 +3005,7 @@ the above case: 3. **Item starting with a blank line.** If a sequence of lines *Ls* starting with a single [blank line] constitute a (possibly empty) sequence of blocks *Bs*, not separated from each other by more than - one blank line, and *M* is a list marker *M* of width *W*, + one blank line, and *M* is a list marker of width *W*, then the result of prepending *M* to the first line of *Ls*, and indenting subsequent lines of *Ls* by *W + 1* spaces, is a list item with *Bs* as its contents. @@ -3090,7 +3094,7 @@ A list may start or end with an empty list item: 4. **Indentation.** If a sequence of lines *Ls* constitutes a list item according to rule #1, #2, or #3, then the result of indenting each line - of *L* by 1-3 spaces (the same for each line) also constitutes a + of *Ls* by 1-3 spaces (the same for each line) also constitutes a list item with the same contents and attributes. 
If a line is empty, then it need not be indented. @@ -3834,9 +3838,11 @@ item: - b - c - d - - e - - f -- g + - e + - f + - g + - h +- i . <ul> <li>a</li> @@ -3846,6 +3852,8 @@ item: <li>e</li> <li>f</li> <li>g</li> +<li>h</li> +<li>i</li> </ul> . @@ -4275,8 +4283,8 @@ corresponding codepoints. [Decimal entities](@decimal-entities) consist of `&#` + a string of 1--8 arabic digits + `;`. Again, these -entities need to be recognised and tranformed into their corresponding -UTF8 codepoints. Invalid Unicode codepoints will be written as the +entities need to be recognised and transformed into their corresponding +unicode codepoints. Invalid unicode codepoints will be written as the "unknown codepoint" character (`0xFFFD`) . @@ -4287,7 +4295,8 @@ UTF8 codepoints. Invalid Unicode codepoints will be written as the [Hexadecimal entities](@hexadecimal-entities) consist of `&#` + either `X` or `x` + a string of 1-8 hexadecimal digits -+ `;`. They will also be parsed and turned into their corresponding UTF8 values in the AST. ++ `;`. They will also be parsed and turned into the corresponding +unicode codepoints in the AST. . " ആ ಫ @@ -4581,14 +4590,16 @@ characters that is not preceded or followed by a `_` character. A [left-flanking delimiter run](@left-flanking-delimiter-run) is a [delimiter run] that is (a) not followed by [unicode whitespace], and (b) either not followed by a [punctuation character], or -preceded by [unicode whitespace] or a [punctuation character] or -the beginning of a line. +preceded by [unicode whitespace] or a [punctuation character]. +For purposes of this definition, the beginning and the end of +the line count as unicode whitespace. A [right-flanking delimiter run](@right-flanking-delimiter-run) is a [delimiter run] that is (a) not preceded by [unicode whitespace], and (b) either not preceded by a [punctuation character], or -followed by [unicode whitespace] or a [punctuation character] or -the end of a line. 
+followed by [unicode whitespace] or a [punctuation character]. +For purposes of this definition, the beginning and the end of +the line count as unicode whitespace. Here are some examples of delimiter runs. @@ -4604,20 +4615,20 @@ Here are some examples of delimiter runs. - right-flanking but not left-flanking: ``` - abc*** - abc_ + abc*** + abc_ "abc"** - _"abc" + "abc"_ ``` - - Both right and right-flanking: + - Both left and right-flanking: ``` - abc***def + abc***def "abc"_"def" ``` - - Neither right nor right-flanking: + - Neither left nor right-flanking: ``` abc *** def @@ -4635,32 +4646,40 @@ are a bit more complex than the ones given here.) The following rules define emphasis and strong emphasis: 1. A single `*` character [can open emphasis](@can-open-emphasis) - iff it is part of a [left-flanking delimiter run]. + iff (if and only if) it is part of a [left-flanking delimiter run]. 2. A single `_` character [can open emphasis] iff it is part of a [left-flanking delimiter run] - and not part of a [right-flanking delimiter run]. + and either (a) not part of a [right-flanking delimiter run] + or (b) part of a [right-flanking delimiter run] + preceded by punctuation. 3. A single `*` character [can close emphasis](@can-close-emphasis) iff it is part of a [right-flanking delimiter run]. -4. A single `_` character [can close emphasis] - iff it is part of a [right-flanking delimiter run] - and not part of a [left-flanking delimiter run]. +4. A single `_` character [can close emphasis] iff + it is part of a [right-flanking delimiter run] + and either (a) not part of a [left-flanking delimiter run] + or (b) part of a [left-flanking delimiter run] + followed by punctuation. 5. A double `**` [can open strong emphasis](@can-open-strong-emphasis) iff it is part of a [left-flanking delimiter run]. -6. A double `__` [can open strong emphasis] - iff it is part of a [left-flanking delimiter run] - and not part of a [right-flanking delimiter run]. +6. 
A double `__` [can open strong emphasis] iff + it is part of a [left-flanking delimiter run] + and either (a) not part of a [right-flanking delimiter run] + or (b) part of a [right-flanking delimiter run] + preceded by punctuation. 7. A double `**` [can close strong emphasis](@can-close-strong-emphasis) iff it is part of a [right-flanking delimiter run]. 8. A double `__` [can close strong emphasis] - iff it is part of a [right-flanking delimiter run] - and not part of a [left-flanking delimiter run]. + iff it is part of a [right-flanking delimiter run] + and either (a) not part of a [left-flanking delimiter run] + or (b) part of a [left-flanking delimiter run] + followed by punctuation. 9. Emphasis begins with a delimiter that [can open emphasis] and ends with a delimiter that [can close emphasis], and that uses the same @@ -4822,13 +4841,14 @@ aa_"bb"_cc <p>aa_"bb"_cc</p> . -Here there is no emphasis, because the delimiter runs are -both left- and right-flanking: +This is emphasis, even though the opening delimiter is +both left- and right-flanking, because it is preceded by +punctuation: . -"aa"_"bb"_"cc" +foo-_(bar)_ . -<p>"aa"_"bb"_"cc"</p> +<p>foo-<em>(bar)</em></p> . Rule 3: @@ -4939,6 +4959,16 @@ _foo_bar_baz_ <p><em>foo_bar_baz</em></p> . +This is emphasis, even though the closing delimiter is +both left- and right-flanking, because it is followed by +punctuation: + +. +_(bar)_. +. +<p><em>(bar)</em>.</p> +. + Rule 5: . @@ -5035,6 +5065,17 @@ __foo, __bar__, baz__ <p><strong>foo, <strong>bar</strong>, baz</strong></p> . +This is strong emphasis, even though the opening delimiter is +both left- and right-flanking, because it is preceded by +punctuation: + +. +foo-__(bar)__ +. +<p>foo-<strong>(bar)</strong></p> +. + + Rule 7: This is not strong emphasis, because the closing delimiter is preceded @@ -5138,6 +5179,16 @@ __foo__bar__baz__ <p><strong>foo__bar__baz</strong></p> . 
+This is strong emphasis, even though the closing delimiter is +both left- and right-flanking, because it is followed by +punctuation: + +. +__(bar)__. +. +<p><strong>(bar)</strong>.</p> +. + Rule 9: Any nonempty sequence of inline elements can be the contents of an @@ -5706,7 +5757,7 @@ A [link destination](@link-destination) consists of either ASCII space or control characters, and includes parentheses only if (a) they are backslash-escaped or (b) they are part of a balanced pair of unescaped parentheses that is not itself - inside a balanced pair of unescaped paretheses. + inside a balanced pair of unescaped parentheses. A [link title](@link-title) consists of either @@ -5839,8 +5890,8 @@ in Markdown: URL-escaping should be left alone inside the destination, as all URL-escaped characters are also valid URL characters. HTML entities in -the destination will be parsed into their UTF-8 codepoints, as usual, and -optionally URL-escaped when written as HTML. +the destination will be parsed into the corresponding unicode +codepoints, as usual, and optionally URL-escaped when written as HTML. . [link](foo%20bä) @@ -7215,10 +7266,10 @@ foo ## Soft line breaks A regular line break (not in a code span or HTML tag) that is not -preceded by two or more spaces is parsed as a softbreak. (A -softbreak may be rendered in HTML either as a -[line ending] or as a space. The result will be the same -in browsers. In the examples here, a [line ending] will be used.) +preceded by two or more spaces or a backslash is parsed as a +softbreak. (A softbreak may be rendered in HTML either as a +[line ending] or as a space. The result will be the same in +browsers. In the examples here, a [line ending] will be used.) . 
foo diff --git a/wrappers/wrapper.lua b/wrappers/wrapper.lua index 11c5183..023e0b3 100755 --- a/wrappers/wrapper.lua +++ b/wrappers/wrapper.lua @@ -5,197 +5,235 @@ local ffi = require("ffi") cmark = ffi.load("libcmark") ffi.cdef[[ +char *cmark_markdown_to_html(const char *text, int len, int options); - char *cmark_markdown_to_html(const char *text, int len); - typedef enum { - /* Block */ - CMARK_NODE_DOCUMENT, - CMARK_NODE_BLOCK_QUOTE, - CMARK_NODE_LIST, - CMARK_NODE_LIST_ITEM, - CMARK_NODE_CODE_BLOCK, - CMARK_NODE_HTML, - CMARK_NODE_PARAGRAPH, - CMARK_NODE_HEADER, - CMARK_NODE_HRULE, - CMARK_NODE_REFERENCE_DEF, +typedef enum { + /* Error status */ + CMARK_NODE_NONE, - CMARK_NODE_FIRST_BLOCK = CMARK_NODE_DOCUMENT, - CMARK_NODE_LAST_BLOCK = CMARK_NODE_REFERENCE_DEF, + /* Block */ + CMARK_NODE_DOCUMENT, + CMARK_NODE_BLOCK_QUOTE, + CMARK_NODE_LIST, + CMARK_NODE_ITEM, + CMARK_NODE_CODE_BLOCK, + CMARK_NODE_HTML, + CMARK_NODE_PARAGRAPH, + CMARK_NODE_HEADER, + CMARK_NODE_HRULE, - /* Inline */ - CMARK_NODE_TEXT, - CMARK_NODE_SOFTBREAK, - CMARK_NODE_LINEBREAK, - CMARK_NODE_INLINE_CODE, - CMARK_NODE_INLINE_HTML, - CMARK_NODE_EMPH, - CMARK_NODE_STRONG, - CMARK_NODE_LINK, - CMARK_NODE_IMAGE, + CMARK_NODE_FIRST_BLOCK = CMARK_NODE_DOCUMENT, + CMARK_NODE_LAST_BLOCK = CMARK_NODE_HRULE, - CMARK_NODE_FIRST_INLINE = CMARK_NODE_TEXT, - CMARK_NODE_LAST_INLINE = CMARK_NODE_IMAGE, - } cmark_node_type; + /* Inline */ + CMARK_NODE_TEXT, + CMARK_NODE_SOFTBREAK, + CMARK_NODE_LINEBREAK, + CMARK_NODE_CODE, + CMARK_NODE_INLINE_HTML, + CMARK_NODE_EMPH, + CMARK_NODE_STRONG, + CMARK_NODE_LINK, + CMARK_NODE_IMAGE, - typedef enum { - CMARK_NO_LIST, - CMARK_BULLET_LIST, - CMARK_ORDERED_LIST - } cmark_list_type; + CMARK_NODE_FIRST_INLINE = CMARK_NODE_TEXT, + CMARK_NODE_LAST_INLINE = CMARK_NODE_IMAGE, +} cmark_node_type; - typedef enum { - CMARK_PERIOD_DELIM, - CMARK_PAREN_DELIM - } cmark_delim_type; - typedef struct cmark_node cmark_node; - typedef struct cmark_parser cmark_parser; +typedef enum { + 
CMARK_NO_LIST, + CMARK_BULLET_LIST, + CMARK_ORDERED_LIST +} cmark_list_type; - cmark_node* cmark_node_new(cmark_node_type type); +typedef enum { + CMARK_NO_DELIM, + CMARK_PERIOD_DELIM, + CMARK_PAREN_DELIM +} cmark_delim_type; - void - cmark_node_free(cmark_node *node); +typedef struct cmark_node cmark_node; +typedef struct cmark_parser cmark_parser; +typedef struct cmark_iter cmark_iter; - cmark_node* cmark_node_next(cmark_node *node); +typedef enum { + CMARK_EVENT_NONE, + CMARK_EVENT_DONE, + CMARK_EVENT_ENTER, + CMARK_EVENT_EXIT +} cmark_event_type; - cmark_node* cmark_node_previous(cmark_node *node); +cmark_node* +cmark_node_new(cmark_node_type type); - cmark_node* cmark_node_parent(cmark_node *node); +void +cmark_node_free(cmark_node *node); - cmark_node* cmark_node_first_child(cmark_node *node); +cmark_node* +cmark_node_next(cmark_node *node); - cmark_node* cmark_node_last_child(cmark_node *node); +cmark_node* +cmark_node_previous(cmark_node *node); - cmark_node_type cmark_node_get_type(cmark_node *node); +cmark_node* +cmark_node_parent(cmark_node *node); - const char* cmark_node_get_string_content(cmark_node *node); +cmark_node* +cmark_node_first_child(cmark_node *node); - int cmark_node_set_string_content(cmark_node *node, const char *content); +cmark_node* +cmark_node_last_child(cmark_node *node); - int cmark_node_get_header_level(cmark_node *node); +cmark_iter* +cmark_iter_new(cmark_node *root); - int cmark_node_set_header_level(cmark_node *node, int level); +void +cmark_iter_free(cmark_iter *iter); - cmark_list_type cmark_node_get_list_type(cmark_node *node); +cmark_event_type +cmark_iter_next(cmark_iter *iter); - int cmark_node_set_list_type(cmark_node *node, cmark_list_type type); +cmark_node* +cmark_iter_get_node(cmark_iter *iter); - int cmark_node_get_list_start(cmark_node *node); +cmark_event_type +cmark_iter_get_event_type(cmark_iter *iter); - int cmark_node_set_list_start(cmark_node *node, int start); +cmark_node* +cmark_iter_get_root(cmark_iter 
*iter); - int cmark_node_get_list_tight(cmark_node *node); +void +cmark_iter_reset(cmark_iter *iter, cmark_node *current, + cmark_event_type event_type); - int cmark_node_set_list_tight(cmark_node *node, int tight); +void* +cmark_node_get_user_data(cmark_node *node); - const char* cmark_node_get_fence_info(cmark_node *node); +int +cmark_node_set_user_data(cmark_node *node, void *user_data); - int cmark_node_set_fence_info(cmark_node *node, const char *info); +cmark_node_type +cmark_node_get_type(cmark_node *node); - const char* cmark_node_get_url(cmark_node *node); +const char* +cmark_node_get_type_string(cmark_node *node); - int cmark_node_set_url(cmark_node *node, const char *url); +const char* +cmark_node_get_literal(cmark_node *node); - const char* cmark_node_get_title(cmark_node *node); +int +cmark_node_set_literal(cmark_node *node, const char *content); - int cmark_node_set_title(cmark_node *node, const char *title); +int +cmark_node_get_header_level(cmark_node *node); - int cmark_node_get_start_line(cmark_node *node); +int +cmark_node_set_header_level(cmark_node *node, int level); - int cmark_node_get_start_column(cmark_node *node); +cmark_list_type +cmark_node_get_list_type(cmark_node *node); - int cmark_node_get_end_line(cmark_node *node); +int +cmark_node_set_list_type(cmark_node *node, cmark_list_type type); - void cmark_node_unlink(cmark_node *node); +cmark_delim_type +cmark_node_get_list_delim(cmark_node *node); - int cmark_node_insert_before(cmark_node *node, cmark_node *sibling); +int +cmark_node_set_list_delim(cmark_node *node, cmark_delim_type delim); - int cmark_node_insert_after(cmark_node *node, cmark_node *sibling); +int +cmark_node_get_list_start(cmark_node *node); - int cmark_node_prepend_child(cmark_node *node, cmark_node *child); +int +cmark_node_set_list_start(cmark_node *node, int start); - int cmark_node_append_child(cmark_node *node, cmark_node *child); +int +cmark_node_get_list_tight(cmark_node *node); - cmark_parser 
*cmark_parser_new(); +int +cmark_node_set_list_tight(cmark_node *node, int tight); - void cmark_parser_free(cmark_parser *parser); +const char* +cmark_node_get_fence_info(cmark_node *node); - cmark_node *cmark_parser_finish(cmark_parser *parser); +int +cmark_node_set_fence_info(cmark_node *node, const char *info); - void cmark_parser_feed(cmark_parser *parser, const char *buffer, size_t len); +const char* +cmark_node_get_url(cmark_node *node); - cmark_node *cmark_parse_document(const char *buffer, size_t len); +int +cmark_node_set_url(cmark_node *node, const char *url); - char *cmark_render_ast(cmark_node *root); +const char* +cmark_node_get_title(cmark_node *node); - char *cmark_render_html(cmark_node *root); +int +cmark_node_set_title(cmark_node *node, const char *title); + +int +cmark_node_get_start_line(cmark_node *node); + +int +cmark_node_get_start_column(cmark_node *node); + +int +cmark_node_get_end_line(cmark_node *node); + +int +cmark_node_get_end_column(cmark_node *node); + +void +cmark_node_unlink(cmark_node *node); + +int +cmark_node_insert_before(cmark_node *node, cmark_node *sibling); + +int +cmark_node_insert_after(cmark_node *node, cmark_node *sibling); + +int +cmark_node_prepend_child(cmark_node *node, cmark_node *child); + +int +cmark_node_append_child(cmark_node *node, cmark_node *child); + +void +cmark_consolidate_text_nodes(cmark_node *root); + +cmark_parser *cmark_parser_new(int options); + +void cmark_parser_free(cmark_parser *parser); + +void cmark_parser_feed(cmark_parser *parser, const char *buffer, size_t len); + +cmark_node *cmark_parser_finish(cmark_parser *parser); + +cmark_node *cmark_parse_document(const char *buffer, size_t len, int options); + +char *cmark_render_xml(cmark_node *root, int options); + +char *cmark_render_html(cmark_node *root, int options); + +char *cmark_render_man(cmark_node *root, int options); + +char *cmark_render_commonmark(cmark_node *root, int options, int width); + +extern const int cmark_version; + +extern 
const char cmark_version_string[]; ]] +CMARK_OPT_DEFAULT = 0 +CMARK_OPT_SOURCEPOS = 1 +CMARK_OPT_HARDBREAKS = 2 +CMARK_OPT_NORMALIZE = 4 +CMARK_OPT_SMART = 8 + local inp = io.read("*all") -local doc = cmark.cmark_parse_document(inp, string.len(inp)) - -local cur = doc -local next -local child - -local walk = function(action) - level = 0 - while cur ~= nil do - action(cur, level) - child = cmark.cmark_node_first_child(cur) - if child == nil then - next = cmark.cmark_node_next(cur) - while next == nil do - cur = cmark.cmark_node_parent(cur) - level = level - 1 - if cur == nil then - break - else - next = cmark.cmark_node_next(cur) - end - end - cur = next - else - level = level + 1 - cur = child - end - end -end - -local type_table = { - 'BLOCK_QUOTE', - 'LIST', - 'LIST_ITEM', - 'CODE_BLOCK', - 'HTML', - 'PARAGRAPH', - 'HEADER', - 'HRULE', - 'REFERENCE_DEF', - 'TEXT', - 'SOFTBREAK', - 'LINEBREAK', - 'INLINE_CODE', - 'INLINE_HTML', - 'EMPH', - 'STRONG', - 'LINK', - 'IMAGE', -} -type_table[0] = 'DOCUMENT' - -local function print_type(node, level) - local t = tonumber(cmark.cmark_node_get_type(node)) - io.write(string.rep(' ', level) .. type_table[t]) - if t == cmark.CMARK_NODE_TEXT then - io.write(' ' .. 
ffi.string(cmark.cmark_node_get_string_content(node))) - end - io.write('\n') -end - -walk(print_type) - --- local html = ffi.string(cmark.cmark_render_html(doc)) --- print(html) +local doc = cmark.cmark_parse_document(inp, string.len(inp), CMARK_OPT_SMART) +local html = ffi.string(cmark.cmark_render_html(doc, CMARK_OPT_DEFAULT)) +print(html) diff --git a/wrappers/wrapper.py b/wrappers/wrapper.py index 52cbfc7..44d982c 100755 --- a/wrappers/wrapper.py +++ b/wrappers/wrapper.py @@ -1,6 +1,8 @@ #!/usr/bin/env python # Example for using the shared library from python +# Will work with either python 2 or python 3 +# Requires cmark library to be installed from ctypes import CDLL, c_char_p, c_long import sys @@ -9,15 +11,27 @@ import platform sysname = platform.system() if sysname == 'Darwin': - cmark = CDLL("build/src/libcmark.dylib") + libname = "libcmark.dylib" +elif sysname == 'Windows': + libname = "cmark.dll" else: - cmark = CDLL("build/src/libcmark.so") + libname = "libcmark.so" +cmark = CDLL(libname) markdown = cmark.cmark_markdown_to_html markdown.restype = c_char_p -markdown.argtypes = [c_char_p, c_long] +markdown.argtypes = [c_char_p, c_long, c_long] + +opts = 8 # CMARK_OPT_PRETTY def md2html(text): - return markdown(text, len(text)) + if sys.version_info >= (3,0): + textbytes = text.encode('utf-8') + textlen = len(textbytes) + return markdown(textbytes, textlen, opts).decode('utf-8') + else: + textbytes = text + textlen = len(text) + return markdown(textbytes, textlen, opts) sys.stdout.write(md2html(sys.stdin.read())) diff --git a/wrappers/wrapper.rb b/wrappers/wrapper.rb index 59a9b87..2359366 100755 --- a/wrappers/wrapper.rb +++ b/wrappers/wrapper.rb @@ -4,12 +4,12 @@ require 'ffi' module CMark extend FFI::Library ffi_lib ['libcmark', 'cmark'] - attach_function :cmark_markdown_to_html, [:string, :int], :string + attach_function :cmark_markdown_to_html, [:string, :int, :int], :string end def markdown_to_html(s) len = s.bytesize - 
CMark::cmark_markdown_to_html(s, len) + CMark::cmark_markdown_to_html(s, len, 0) end STDOUT.write(markdown_to_html(ARGF.read())) diff --git a/wrappers/wrapper.rkt b/wrappers/wrapper.rkt new file mode 100644 index 0000000..d9b34e8 --- /dev/null +++ b/wrappers/wrapper.rkt @@ -0,0 +1,190 @@ +#lang racket/base + +;; requires racket >= 5.3 because of submodules + +;; Lowlevel interface + +(module low-level racket/base + + (require ffi/unsafe ffi/unsafe/define) + + (provide (all-defined-out)) + + (define-ffi-definer defcmark (ffi-lib "libcmark")) + + (define _cmark_node_type + (_enum '(none + ;; Block + document block-quote list item code-block + html paragraph header hrule + ;; Inline + text softbreak linebreak code inline-html + emph strong link image))) + (define _cmark_list_type + (_enum '(no_list bullet_list ordered_list))) + (define _cmark_delim_type + (_enum '(no_delim period_delim paren_delim))) + (define _cmark_opts + (_bitmask '(sourcepos = 1 hardbreaks = 2 normalize = 4 smart = 8))) + + (define-cpointer-type _node) + + (defcmark cmark_markdown_to_html + (_fun [bs : _bytes] [_int = (bytes-length bs)] _cmark_opts + -> [r : _bytes] -> (begin0 (bytes->string/utf-8 r) (free r)))) + + (defcmark cmark_parse_document + (_fun [bs : _bytes] [_int = (bytes-length bs)] _cmark_opts + -> _node)) + + (defcmark cmark_render_html + (_fun _node _cmark_opts + -> [r : _bytes] -> (begin0 (bytes->string/utf-8 r) (free r)))) + + (defcmark cmark_node_new (_fun _cmark_node_type -> _node)) + (defcmark cmark_node_free (_fun _node -> _void)) + + (defcmark cmark_node_next (_fun _node -> _node/null)) + (defcmark cmark_node_previous (_fun _node -> _node/null)) + (defcmark cmark_node_parent (_fun _node -> _node/null)) + (defcmark cmark_node_first_child (_fun _node -> _node/null)) + (defcmark cmark_node_last_child (_fun _node -> _node/null)) + + (defcmark cmark_node_get_user_data (_fun _node -> _racket)) + (defcmark cmark_node_set_user_data (_fun _node _racket -> _bool)) + (defcmark 
cmark_node_get_type (_fun _node -> _cmark_node_type)) + (defcmark cmark_node_get_type_string (_fun _node -> _bytes)) + (defcmark cmark_node_get_literal (_fun _node -> _string)) + (defcmark cmark_node_set_literal (_fun _node _string -> _bool)) + (defcmark cmark_node_get_header_level (_fun _node -> _int)) + (defcmark cmark_node_set_header_level (_fun _node _int -> _bool)) + (defcmark cmark_node_get_list_type (_fun _node -> _cmark_list_type)) + (defcmark cmark_node_set_list_type (_fun _node _cmark_list_type -> _bool)) + (defcmark cmark_node_get_list_delim (_fun _node -> _cmark_delim_type)) + (defcmark cmark_node_set_list_delim (_fun _node _cmark_delim_type -> _bool)) + (defcmark cmark_node_get_list_start (_fun _node -> _int)) + (defcmark cmark_node_set_list_start (_fun _node _int -> _bool)) + (defcmark cmark_node_get_list_tight (_fun _node -> _bool)) + (defcmark cmark_node_set_list_tight (_fun _node _bool -> _bool)) + (defcmark cmark_node_get_fence_info (_fun _node -> _string)) + (defcmark cmark_node_set_fence_info (_fun _node _string -> _bool)) + (defcmark cmark_node_get_url (_fun _node -> _string)) + (defcmark cmark_node_set_url (_fun _node _string -> _bool)) + (defcmark cmark_node_get_title (_fun _node -> _string)) + (defcmark cmark_node_set_title (_fun _node _string -> _bool)) + (defcmark cmark_node_get_start_line (_fun _node -> _int)) + (defcmark cmark_node_get_start_column (_fun _node -> _int)) + (defcmark cmark_node_get_end_line (_fun _node -> _int)) + (defcmark cmark_node_get_end_column (_fun _node -> _int)) + + (defcmark cmark_node_unlink (_fun _node -> _void)) + (defcmark cmark_node_insert_before (_fun _node _node -> _bool)) + (defcmark cmark_node_insert_after (_fun _node _node -> _bool)) + (defcmark cmark_node_prepend_child (_fun _node _node -> _bool)) + (defcmark cmark_node_append_child (_fun _node _node -> _bool)) + (defcmark cmark_consolidate_text_nodes (_fun _node -> _void)) + + ) + +;; Rackety interface + +(module high-level racket/base + + (require 
(submod ".." low-level) ffi/unsafe) + + (provide cmark-markdown-to-html) + (define (cmark-markdown-to-html str [options '(normalize smart)]) + (cmark_markdown_to_html (if (bytes? str) str (string->bytes/utf-8 str)) + options)) + + (require (for-syntax racket/base racket/syntax)) + (define-syntax (make-getter+setter stx) + (syntax-case stx () + [(_ name) (with-syntax ([(getter setter) + (map (λ(op) (format-id #'name "cmark_node_~a_~a" + op #'name)) + '(get set))]) + #'(cons getter setter))])) + (define-syntax-rule (define-getters+setters name [type field ...] ...) + (define name (list (list 'type (make-getter+setter field) ...) ...))) + (define-getters+setters getters+setters + [header header_level] [code-block fence_info] + [link url title] [image url title] + [list list_type list_delim list_start list_tight]) + + (provide cmark->sexpr) + (define (cmark->sexpr node) + (define text (cmark_node_get_literal node)) + (define type (cmark_node_get_type node)) + (define children + (let loop ([node (cmark_node_first_child node)]) + (if (not node) '() + (cons (cmark->sexpr node) (loop (cmark_node_next node)))))) + (define info + (cond [(assq type getters+setters) + => (λ(gss) (map (λ(gs) ((car gs) node)) (cdr gss)))] + [else '()])) + (define (assert-no what-not b) + (when b (error 'cmark->sexpr "unexpected ~a in ~s" what-not type))) + (cond [(memq type '(document paragraph header block-quote list item + emph strong link image)) + (assert-no 'text text) + (list type info children)] + [(memq type '(text code code-block html inline-html + softbreak linebreak hrule)) + (assert-no 'children (pair? 
children)) + (list type info text)] + [else (error 'cmark->sexpr "unknown type: ~s" type)])) + + (provide sexpr->cmark) + (define (sexpr->cmark sexpr) ; assumes valid input, as generated by the above + (define (loop sexpr) + (define type (car sexpr)) + (define info (cadr sexpr)) + (define data (caddr sexpr)) + (define node (cmark_node_new type)) + (let ([gss (assq type getters+setters)]) + (when gss + (unless (= (length (cdr gss)) (length info)) + (error 'sexpr->cmark "bad number of info values in ~s" sexpr)) + (for-each (λ(gs x) ((cdr gs) node x)) (cdr gss) info))) + (cond [(string? data) (cmark_node_set_literal node data)] + [(not data) (void)] + [(list? data) + (for ([child (in-list data)]) + (cmark_node_append_child node (sexpr->cmark child)))] + [else (error 'sexpr->cmark "bad data in ~s" sexpr)]) + node) + (define root (loop sexpr)) + (register-finalizer root cmark_node_free) + root) + + ;; Registers a `cmark_node_free` finalizer + (provide cmark-parse-document) + (define (cmark-parse-document str [options '(normalize smart)]) + (define root (cmark_parse_document + (if (bytes? 
str) str (string->bytes/utf-8 str)) + options)) + (register-finalizer root cmark_node_free) + root) + + (provide cmark-render-html) + (define (cmark-render-html root [options '(normalize smart)]) + (cmark_render_html root options))) + +#; ;; sample use +(begin + (require 'high-level racket/string) + (cmark-render-html + (cmark-parse-document + (string-join '("foo" + "===" + "" + "> blah" + ">" + "> blah *blah* `bar()` blah:" + ">" + "> function foo() {" + "> bar();" + "> }") + "\n")))) diff --git a/wrappers/wrapper3.py b/wrappers/wrapper3.py deleted file mode 100755 index 7a777fa..0000000 --- a/wrappers/wrapper3.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python3 - -# Example for using the shared library from python - -from ctypes import CDLL, c_char_p, c_long -import sys -import platform - -sysname = platform.system() - -if sysname == 'Darwin': - cmark = CDLL("build/src/libcmark.dylib") -else: - cmark = CDLL("build/src/libcmark.so") - -markdown = cmark.cmark_markdown_to_html -markdown.restype = c_char_p -markdown.argtypes = [c_char_p, c_long] - -def md2html(text): - textbytes = text.encode('utf-8') - textlen = len(textbytes) - return markdown(textbytes, textlen).decode('utf-8') - -sys.stdout.write(md2html(sys.stdin.read())) |