From 27373892cb98a2a6a1d35fba28798d9117fff58f Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 15 Feb 2015 18:31:07 -0800 Subject: Moved handling of --smart from renderer to parser. This allows backslash escapes to disable smart quote transformations in particular cases. Closes #8. --- src/CMakeLists.txt | 2 - src/html.c | 10 +-- src/inlines.c | 105 +++++++++++++++++++++++++++---- src/man.c | 17 ++--- src/smart.c | 174 --------------------------------------------------- src/smart.h | 28 --------- test/smart_punct.txt | 24 +++---- 7 files changed, 113 insertions(+), 247 deletions(-) delete mode 100644 src/smart.c delete mode 100644 src/smart.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2150e7a..2179c08 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,7 +16,6 @@ set(HEADERS html_unescape.h houdini.h cmark_ctype.h - smart.h ) set(LIBRARY_SOURCES cmark.c @@ -37,7 +36,6 @@ set(LIBRARY_SOURCES houdini_html_e.c houdini_html_u.c cmark_ctype.c - smart.c ${HEADERS} ) diff --git a/src/html.c b/src/html.c index 1f64196..5a64f9c 100644 --- a/src/html.c +++ b/src/html.c @@ -9,7 +9,6 @@ #include "utf8.h" #include "buffer.h" #include "houdini.h" -#include "smart.h" // Functions to convert cmark_nodes to HTML strings. 
@@ -219,14 +218,7 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, break; case CMARK_NODE_TEXT: - if (options & CMARK_OPT_SMARTPUNCT) { - escape_with_smart(html, node, escape_html, - "“", "”", "‘", "’", - "—", "–", "…"); - } else { - escape_html(html, node->as.literal.data, - node->as.literal.len); - } + escape_html(html, node->as.literal.data, node->as.literal.len); break; case CMARK_NODE_LINEBREAK: diff --git a/src/inlines.c b/src/inlines.c index 014c018..a5af1a5 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -271,6 +271,9 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close) while (peek_char(subj) == c) { numdelims++; advance(subj); + if (c == '\'' || c == '"') { + break; // limit to 1 delim for quotes + } } len = utf8proc_iterate(subj->input.data + subj->pos, @@ -289,6 +292,9 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close) if (c == '_') { *can_open = left_flanking && !right_flanking; *can_close = right_flanking && !left_flanking; + } else if (c == '\'' || c == '"') { + *can_open = left_flanking && !right_flanking; + *can_close = right_flanking; } else { *can_open = left_flanking; *can_close = right_flanking; @@ -349,25 +355,68 @@ static void push_delimiter(subject *subj, unsigned char c, bool can_open, subj->last_delim = delim; } -// Parse strong/emph or a fallback. -// Assumes the subject has '_' or '*' at the current position. -static cmark_node* handle_strong_emph(subject* subj, unsigned char c) +// Assumes the subject has a c at the current position. 
+static cmark_node* handle_delim(subject* subj, unsigned char c, bool smart) { int numdelims; cmark_node * inl_text; bool can_open, can_close; + cmark_chunk contents; numdelims = scan_delims(subj, c, &can_open, &can_close); - inl_text = make_str(cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims)); + if (c == '\'' && smart) { + contents = cmark_chunk_literal("’"); + } else if (c == '"' && smart) { + contents = cmark_chunk_literal("”"); + } else { + contents = cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims); + } + + inl_text = make_str(contents); - if (can_open || can_close) { + if ((can_open || can_close) && + (!(c == '\'' || c == '"') || smart)) { push_delimiter(subj, c, can_open, can_close, inl_text); } return inl_text; } +// Assumes we have a hyphen at the current position. +static cmark_node* handle_hyphen(subject* subj, bool smart) +{ + advance(subj); + if (smart && peek_char(subj) == '-') { + advance(subj); + if (peek_char(subj) == '-') { + advance(subj); + return make_str(cmark_chunk_literal("—")); + } else { + return make_str(cmark_chunk_literal("–")); + } + } else { + return make_str(cmark_chunk_literal("-")); + } +} + +// Assumes we have a period at the current position. 
+static cmark_node* handle_period(subject* subj, bool smart) +{ + advance(subj); + if (smart && peek_char(subj) == '.') { + advance(subj); + if (peek_char(subj) == '.') { + advance(subj); + return make_str(cmark_chunk_literal("…")); + } else { + return make_str(cmark_chunk_literal("..")); + } + } else { + return make_str(cmark_chunk_literal(".")); + } +} + static void process_emphasis(subject *subj, delimiter *start_delim) { delimiter *closer = subj->last_delim; @@ -381,7 +430,8 @@ static void process_emphasis(subject *subj, delimiter *start_delim) // now move forward, looking for closers, and handling each while (closer != NULL) { if (closer->can_close && - (closer->delim_char == '*' || closer->delim_char == '_')) { + (closer->delim_char == '*' || closer->delim_char == '_' || + closer->delim_char == '"' || closer->delim_char == '\'')) { // Now look backwards for first matching opener: opener = closer->previous; while (opener != NULL && opener != start_delim) { @@ -391,9 +441,31 @@ static void process_emphasis(subject *subj, delimiter *start_delim) } opener = opener->previous; } - if (opener != NULL && opener != start_delim) { - closer = S_insert_emph(subj, opener, closer); - } else { + if (closer->delim_char == '*' || closer->delim_char == '_') { + if (opener != NULL && opener != start_delim) { + closer = S_insert_emph(subj, opener, closer); + } else { + closer = closer->next; + } + } else if (closer->delim_char == '\'') { + cmark_chunk_free(&closer->inl_text->as.literal); + closer->inl_text->as.literal = + cmark_chunk_literal("’"); + if (opener != NULL && opener != start_delim) { + cmark_chunk_free(&opener->inl_text->as.literal); + opener->inl_text->as.literal = + cmark_chunk_literal("‘"); + } + closer = closer->next; + } else if (closer->delim_char == '"') { + cmark_chunk_free(&closer->inl_text->as.literal); + closer->inl_text->as.literal = + cmark_chunk_literal("”"); + if (opener != NULL && opener != start_delim) { + 
cmark_chunk_free(&opener->inl_text->as.literal); + opener->inl_text->as.literal = + cmark_chunk_literal("“"); + } closer = closer->next; } } else { @@ -866,7 +938,7 @@ static int subject_find_special_char(subject *subj, long options) }; // " ' . - - static const char SMART_PUNCT_TABLE[] = { + static const char SMART_PUNCT_CHARS[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, @@ -890,6 +962,9 @@ static int subject_find_special_char(subject *subj, long options) while (n < subj->input.len) { if (SPECIAL_CHARS[subj->input.data[n]]) return n; + if (options & CMARK_OPT_SMARTPUNCT && + SMART_PUNCT_CHARS[subj->input.data[n]]) + return n; n++; } @@ -926,7 +1001,15 @@ static int parse_inline(subject* subj, cmark_node * parent, long options) break; case '*': case '_': - new_inl = handle_strong_emph(subj, c); + case '\'': + case '"': + new_inl = handle_delim(subj, c, options & CMARK_OPT_SMARTPUNCT); + break; + case '-': + new_inl = handle_hyphen(subj, options & CMARK_OPT_SMARTPUNCT); + break; + case '.': + new_inl = handle_period(subj, options & CMARK_OPT_SMARTPUNCT); break; case '[': advance(subj); diff --git a/src/man.c b/src/man.c index 970bb6d..b7cb932 100644 --- a/src/man.c +++ b/src/man.c @@ -7,10 +7,11 @@ #include "cmark.h" #include "node.h" #include "buffer.h" -#include "smart.h" // Functions to convert cmark_nodes to groff man strings. +// TODO: properly escape unicode punctuation used in smart mode: +// "\\[lq]", "\\[rq]", "\\[oq]", "\\[cq]", "\\[em]", "\\[en]", "..." 
static void escape_man(cmark_strbuf *dest, const unsigned char *source, int length) { int i; @@ -47,7 +48,7 @@ struct render_state { static int S_render_node(cmark_node *node, cmark_event_type ev_type, - struct render_state *state, long options) + struct render_state *state) { cmark_node *tmp; cmark_strbuf *man = state->man; @@ -166,14 +167,8 @@ S_render_node(cmark_node *node, cmark_event_type ev_type, break; case CMARK_NODE_TEXT: - if (options & CMARK_OPT_SMARTPUNCT) { - escape_with_smart(man, node, escape_man, - "\\[lq]", "\\[rq]", "\\[oq]", "\\[cq]", - "\\[em]", "\\[en]", "..."); - } else { - escape_man(man, node->as.literal.data, - node->as.literal.len); - } + escape_man(man, node->as.literal.data, + node->as.literal.len); break; case CMARK_NODE_LINEBREAK: @@ -248,7 +243,7 @@ char *cmark_render_man(cmark_node *root, long options) while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) { cur = cmark_iter_get_node(iter); - S_render_node(cur, ev_type, &state, options); + S_render_node(cur, ev_type, &state); } result = (char *)cmark_strbuf_detach(&man); diff --git a/src/smart.c b/src/smart.c deleted file mode 100644 index cb67448..0000000 --- a/src/smart.c +++ /dev/null @@ -1,174 +0,0 @@ -#include -#include -#include -#include - -#include "config.h" -#include "cmark.h" -#include "node.h" -#include "utf8.h" -#include "buffer.h" -#include "chunk.h" - -static const char SMART_PUNCT_TABLE[] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; - -void escape_with_smart(cmark_strbuf *buf, - cmark_node *node, - void (*escape)(cmark_strbuf *, const unsigned char *, int), - const char *left_double_quote, - const char *right_double_quote, - const char *left_single_quote, - const char *right_single_quote, - const char *em_dash, - const char *en_dash, - const char *ellipses) -{ - char c; - int32_t after_char = 0; - int32_t before_char = 0; - bool left_flanking, right_flanking; - int lastout = 0; - int i = 0, j = 0; - cmark_chunk lit = node->as.literal; - int len; - - while (i < lit.len) { - c = lit.data[i]; - i++; - if (SMART_PUNCT_TABLE[(int)c] == 0) { - continue; - } - - if (i - 1 - lastout > 0) { - (*escape)(buf, lit.data + lastout, i - 1 - lastout); - } - - if (c == 34 || c == 39) { - if (i == 1) { - // set before_char based on previous text node if there is one: - if (node->prev) { - if (node->prev->type == CMARK_NODE_TEXT) { - - // walk to the beginning of the UTF_8 sequence: - j = node->prev->as.literal.len - 1; - while (j > 0 && - node->prev->as.literal.data[j] >> 6 == 2) { - j--; - } - len = utf8proc_iterate(node->prev->as.literal.data + i, - node->prev->as.literal.len - i, - &before_char); - if (len == -1) { - before_char = 10; - } - - } else if (node->prev->type == CMARK_NODE_SOFTBREAK || - node->prev->type == CMARK_NODE_LINEBREAK) { - before_char = 10; - - } else { - before_char = 65; - } - } else { - before_char = 10; - } - } else { - j = i - 2; - // walk back to the beginning of the UTF_8 sequence: - while (j > 0 && lit.data[j] >> 6 == 2) { - j--; - } - utf8proc_iterate(lit.data + j, lit.len - j, &before_char); - } - - if (i >= lit.len) { - if (node->next) { - if (node->next->type == CMARK_NODE_TEXT) { - utf8proc_iterate(node->next->as.literal.data, - 
node->next->as.literal.len, - &after_char); - } else if (node->next->type == CMARK_NODE_SOFTBREAK || - node->next->type == CMARK_NODE_LINEBREAK) { - after_char = 10; - } else { - after_char = 65; - } - } else { - after_char = 10; - } - } else { - utf8proc_iterate(lit.data + i, lit.len - i, &after_char); - } - - left_flanking = !utf8proc_is_space(after_char) && - !(utf8proc_is_punctuation(after_char) && - !utf8proc_is_space(before_char) && - !utf8proc_is_punctuation(before_char)); - right_flanking = !utf8proc_is_space(before_char) && - !(utf8proc_is_punctuation(before_char) && - !utf8proc_is_space(after_char) && - !utf8proc_is_punctuation(after_char)); - } - - switch (c) { - case '"': - if (right_flanking) { - cmark_strbuf_puts(buf, right_double_quote); - } else { - cmark_strbuf_puts(buf, left_double_quote); - } - break; - case '\'': - if (left_flanking && !right_flanking) { - cmark_strbuf_puts(buf, left_single_quote); - } else { - cmark_strbuf_puts(buf, right_single_quote); - } - break; - case '-': - if (i < lit.len && lit.data[i] == '-') { - if (lit.data[i + 1] == '-') { - cmark_strbuf_puts(buf, em_dash); - i += 2; - } else { - cmark_strbuf_puts(buf, en_dash); - i += 1; - } - } else { - cmark_strbuf_putc(buf, c); - } - break; - case '.': - if (i < lit.len - 1 && lit.data[i] == '.' 
&& - lit.data[i + 1] == '.') { - cmark_strbuf_puts(buf, ellipses); - i += 2; - } else { - cmark_strbuf_putc(buf, c); - } - break; - default: - cmark_strbuf_putc(buf, c); - } - lastout = i; - } - (*escape)(buf, node->as.literal.data + lastout, lit.len - lastout); - -} diff --git a/src/smart.h b/src/smart.h deleted file mode 100644 index fa614b3..0000000 --- a/src/smart.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef CMARK_SMART_H -#define CMARK_SMART_H - -#include -#include -#include "config.h" - -#ifdef __cplusplus -extern "C" { -#endif - -void escape_with_smart(cmark_strbuf *buf, - cmark_node *node, - void (*escape)(cmark_strbuf *, const unsigned char *, int), - const char *left_double_quote, - const char *right_double_quote, - const char *left_single_quote, - const char *right_single_quote, - const char *em_dash, - const char *en_dash, - const char *ellipses); - -#ifdef __cplusplus -} -#endif - -#endif - diff --git a/test/smart_punct.txt b/test/smart_punct.txt index c870c9d..5deccde 100644 --- a/test/smart_punct.txt +++ b/test/smart_punct.txt @@ -4,58 +4,58 @@ "Hello," said the spider. "'Shelob' is my name." . -

<p>&ldquo;Hello,&rdquo; said the spider.
-&ldquo;&lsquo;Shelob&rsquo; is my name.&rdquo;</p>
+<p>“Hello,” said the spider.
+“‘Shelob’ is my name.”</p>

. . 'A', 'B', and 'C' are letters. . -

<p>&lsquo;A&rsquo;, &lsquo;B&rsquo;, and &lsquo;C&rsquo; are letters.</p>
+<p>‘A’, ‘B’, and ‘C’ are letters.</p>

. . 'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.' . -

<p>&lsquo;Oak,&rsquo; &lsquo;elm,&rsquo; and &lsquo;beech&rsquo; are names of trees.
-So is &lsquo;pine.&rsquo;</p>
+<p>‘Oak,’ ‘elm,’ and ‘beech’ are names of trees.
+So is ‘pine.’</p>

. . 'He said, "I want to go."' . -

<p>&lsquo;He said, &ldquo;I want to go.&rdquo;&rsquo;</p>
+<p>‘He said, “I want to go.”’</p>

. . Were you alive in the 70's? . -

<p>Were you alive in the 70&rsquo;s?</p>
+<p>Were you alive in the 70’s?</p>

. . Here is some quoted '`code`' and a "[quoted link](url)". . -

<p>Here is some quoted &lsquo;<code>code</code>&rsquo; and a &ldquo;<a href="url">quoted link</a>&rdquo;.</p>
+<p>Here is some quoted ‘<code>code</code>’ and a “<a href="url">quoted link</a>”.</p>

. . Some dashes: one---two --- three---four --- five. . -

<p>Some dashes: one&mdash;two &mdash;
-three&mdash;four &mdash; five.</p>
+<p>Some dashes: one—two —
+three—four — five.</p>

. . Dashes between numbers: 5--7, 255--66, 1987--1999. . -

<p>Dashes between numbers: 5&ndash;7, 255&ndash;66, 1987&ndash;1999.</p>
+<p>Dashes between numbers: 5–7, 255–66, 1987–1999.</p>

. . Ellipses...and...and.... . -

<p>Ellipses&hellip;and&hellip;and&hellip;.</p>
+<p>Ellipses…and…and….</p>

. -- cgit v1.2.3