From 5860d60cc79332c01da59f39e90fff2bcc8a5e9e Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 14 Feb 2015 21:18:55 -0800 Subject: Added smart.c, smart.h with function abstracting smart punct rendering. Also fixed some bugs in earlier smart handling. Now handles UTF-8. --- src/CMakeLists.txt | 2 + src/html.c | 114 ++-------------------------------------- src/smart.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/smart.h | 28 ++++++++++ test/smart_punct.txt | 4 +- 5 files changed, 182 insertions(+), 112 deletions(-) create mode 100644 src/smart.c create mode 100644 src/smart.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2179c08..2150e7a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,6 +16,7 @@ set(HEADERS html_unescape.h houdini.h cmark_ctype.h + smart.h ) set(LIBRARY_SOURCES cmark.c @@ -36,6 +37,7 @@ set(LIBRARY_SOURCES houdini_html_e.c houdini_html_u.c cmark_ctype.c + smart.c ${HEADERS} ) diff --git a/src/html.c b/src/html.c index b4d351e..1f64196 100644 --- a/src/html.c +++ b/src/html.c @@ -9,6 +9,7 @@ #include "utf8.h" #include "buffer.h" #include "houdini.h" +#include "smart.h" // Functions to convert cmark_nodes to HTML strings. @@ -61,10 +62,6 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, char start_header[] = "as.literal; - while (i < lit.len) { - c = lit.data[i]; - // replace with efficient lookup table: - if (c != '"' && c != '-' && c != '\'' && c != '.') { - i++; - continue; - } - escape_html(html, lit.data + lastout, - i - lastout); - if (c == '\'' || c == '"') { - if (i == 0) { - if (node->prev) { - if (node->prev->type == CMARK_NODE_TEXT) { - before_char = node->prev->as.literal.data[node->prev->as.literal.len - 1]; - } else if (node->prev->type == CMARK_NODE_SOFTBREAK || - node->prev->type == CMARK_NODE_LINEBREAK) { - before_char = '\n'; - } else { - before_char = 'x'; - } - } else { - before_char = '\n'; - } - } else { - before_char = lit.data[i - 1]; - } - if (i >= lit.len - 1) { - if (node->next) { - if (node->next->type == CMARK_NODE_TEXT) { - after_char = node->next->as.literal.data[0]; - } else if (node->next->type == CMARK_NODE_SOFTBREAK || - node->next->type == CMARK_NODE_LINEBREAK) { - before_char = '\n'; - } else { - after_char = 'x'; - } - } else { - after_char = '\n'; - } - } else { - after_char = lit.data[i + 1]; - } - left_flanking = !utf8proc_is_space(after_char) && - !(utf8proc_is_punctuation(after_char) && - !utf8proc_is_space(before_char) && - !utf8proc_is_punctuation(before_char)); - right_flanking = !utf8proc_is_space(before_char) && - !(utf8proc_is_punctuation(before_char) && - !utf8proc_is_space(after_char) && - !utf8proc_is_punctuation(after_char)); - } - switch (lit.data[i]) { - case '"': - if (right_flanking) { - cmark_strbuf_puts(html, "”"); - } else { - cmark_strbuf_puts(html, "“"); - } - i += 1; - break; - case '\'': - if (left_flanking && !right_flanking) { - cmark_strbuf_puts(html, "‘"); - } else { - cmark_strbuf_puts(html, "’"); - } - i += 1; - break; - case '-': - if (i < lit.len - 1 && lit.data[i + 1] == '-') { - if (lit.data[i + 2] == '-') { - cmark_strbuf_puts(html, - "—"); - i += 3; - } else { - cmark_strbuf_puts(html, "–"); - i += 2; - } - } else { - cmark_strbuf_putc(html, c); - i += 1; - } - break; - case '.': - if (i < lit.len - 2 && lit.data[i + 1] == '.' && - lit.data[i + 2] == '.') { - cmark_strbuf_puts(html, - "…"); - i += 3; - } else { - cmark_strbuf_putc(html, c); - i += 1; - } - break; - default: - cmark_strbuf_putc(html, c); - i++; - } - lastout = i; - } - escape_html(html, node->as.literal.data + lastout, - i - lastout); - + escape_with_smart(html, node, escape_html, + "“", "”", "‘", "’", + "—", "–", "…"); } else { escape_html(html, node->as.literal.data, node->as.literal.len); diff --git a/src/smart.c b/src/smart.c new file mode 100644 index 0000000..54c9740 --- /dev/null +++ b/src/smart.c @@ -0,0 +1,146 @@ +#include +#include +#include +#include + +#include "config.h" +#include "cmark.h" +#include "node.h" +#include "utf8.h" +#include "buffer.h" +#include "chunk.h" + +void escape_with_smart(cmark_strbuf *buf, + cmark_node *node, + void (*escape)(cmark_strbuf *, const unsigned char *, int), + const char *left_double_quote, + const char *right_double_quote, + const char *left_single_quote, + const char *right_single_quote, + const char *em_dash, + const char *en_dash, + const char *ellipses) +{ + int32_t c = 0; + int32_t after_char = 0; + int32_t before_char = 0; + int len; + bool left_flanking, right_flanking; + int lastout = 0; + int i = 0; + cmark_chunk lit = node->as.literal; + + // set before_char based on previous text node if there is one: + if (node->prev) { + if (node->prev->type == CMARK_NODE_TEXT) { + + // walk back to the beginning of the UTF_8 sequence: + i = node->prev->as.literal.len - 1; + while (i > 0 && node->prev->as.literal.data[i] >> 6 == 2) { + i -= 1; + } + len = utf8proc_iterate(node->prev->as.literal.data + i, + node->prev->as.literal.len - i, + &before_char); + if (len == -1) { + before_char = 10; + } + + } else if (node->prev->type == CMARK_NODE_SOFTBREAK || + node->prev->type == CMARK_NODE_LINEBREAK) { + before_char = 10; + + } else { + before_char = 65; + } + } else { + before_char = 10; + } + + while (i < lit.len) { + len = utf8proc_iterate(lit.data + i, lit.len - i, &c); + i += len; + + // replace with efficient lookup table: + if (!(c == 34 || c == 39 || c == 45 || c == 46)) { + before_char = c; + continue; + } + (*escape)(buf, lit.data + lastout, i - len - lastout); + + if (c == 34 || c == 39) { + + if (i >= lit.len) { + if (node->next) { + if (node->next->type == CMARK_NODE_TEXT) { + utf8proc_iterate(node->next->as.literal.data, + node->next->as.literal.len, + &after_char); + } else if (node->next->type == CMARK_NODE_SOFTBREAK || + node->next->type == CMARK_NODE_LINEBREAK) { + after_char = 10; + } else { + after_char = 65; + } + } else { + after_char = 10; + } + } else { + utf8proc_iterate(lit.data + i, lit.len - i, &after_char); + } + + left_flanking = !utf8proc_is_space(after_char) && + !(utf8proc_is_punctuation(after_char) && + !utf8proc_is_space(before_char) && + !utf8proc_is_punctuation(before_char)); + right_flanking = !utf8proc_is_space(before_char) && + !(utf8proc_is_punctuation(before_char) && + !utf8proc_is_space(after_char) && + !utf8proc_is_punctuation(after_char)); + } + + switch (c) { + case 34: // " + if (right_flanking) { + cmark_strbuf_puts(buf, right_double_quote); + } else { + cmark_strbuf_puts(buf, left_double_quote); + } + break; + case 39: // ' + if (left_flanking && !right_flanking) { + cmark_strbuf_puts(buf, left_single_quote); + } else { + cmark_strbuf_puts(buf, right_single_quote); + } + break; + case 45: // - + if (i < lit.len && lit.data[i] == '-') { + if (lit.data[i + 1] == '-') { + cmark_strbuf_puts(buf, em_dash); + i += 2; + } else { + cmark_strbuf_puts(buf, en_dash); + i += 1; + } + } else { + cmark_strbuf_putc(buf, c); + } + break; + case 46: // . + if (i < lit.len - 1 && lit.data[i] == '.' && + lit.data[i + 1] == '.') { + cmark_strbuf_puts(buf, ellipses); + i += 2; + } else { + cmark_strbuf_putc(buf, c); + } + break; + default: + cmark_strbuf_putc(buf, c); + } + lastout = i; + } + (*escape)(buf, node->as.literal.data + lastout, lit.len - lastout); + +} diff --git a/src/smart.h b/src/smart.h new file mode 100644 index 0000000..fa614b3 --- /dev/null +++ b/src/smart.h @@ -0,0 +1,28 @@ +#ifndef CMARK_SMART_H +#define CMARK_SMART_H + +#include +#include +#include "config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void escape_with_smart(cmark_strbuf *buf, + cmark_node *node, + void (*escape)(cmark_strbuf *, const unsigned char *, int), + const char *left_double_quote, + const char *right_double_quote, + const char *left_single_quote, + const char *right_single_quote, + const char *em_dash, + const char *en_dash, + const char *ellipses); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/test/smart_punct.txt b/test/smart_punct.txt index c036a6d..c870c9d 100644 --- a/test/smart_punct.txt +++ b/test/smart_punct.txt @@ -35,9 +35,9 @@ Were you alive in the 70's? . . -Here is some quoted '`code`' and a "[quoted link][1]". +Here is some quoted '`code`' and a "[quoted link](url)". . -

Here is some quoted ‘code’ and a “[quoted link][1]”.

+

Here is some quoted ‘code’ and a “quoted link”.

. . -- cgit v1.2.3