diff options
| -rw-r--r-- | src/CMakeLists.txt | 2 | ||||
| -rw-r--r-- | src/html.c | 10 | ||||
| -rw-r--r-- | src/inlines.c | 105 | ||||
| -rw-r--r-- | src/man.c | 17 | ||||
| -rw-r--r-- | src/smart.c | 174 | ||||
| -rw-r--r-- | src/smart.h | 28 | ||||
| -rw-r--r-- | test/smart_punct.txt | 24 | 
7 files changed, 113 insertions, 247 deletions
| diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2150e7a..2179c08 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,7 +16,6 @@ set(HEADERS    html_unescape.h    houdini.h    cmark_ctype.h -  smart.h    )  set(LIBRARY_SOURCES    cmark.c @@ -37,7 +36,6 @@ set(LIBRARY_SOURCES    houdini_html_e.c    houdini_html_u.c    cmark_ctype.c -  smart.c    ${HEADERS}    ) @@ -9,7 +9,6 @@  #include "utf8.h"  #include "buffer.h"  #include "houdini.h" -#include "smart.h"  // Functions to convert cmark_nodes to HTML strings. @@ -219,14 +218,7 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,  		break;  	case CMARK_NODE_TEXT: -		if (options & CMARK_OPT_SMARTPUNCT) { -			escape_with_smart(html, node, escape_html, -					  "“", "”", "‘", "’", -					  "—", "–", "…"); -		} else { -			escape_html(html, node->as.literal.data, -			            node->as.literal.len); -		} +		escape_html(html, node->as.literal.data, node->as.literal.len);  		break;  	case CMARK_NODE_LINEBREAK: diff --git a/src/inlines.c b/src/inlines.c index 014c018..a5af1a5 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -271,6 +271,9 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close)  	while (peek_char(subj) == c) {  		numdelims++;  		advance(subj); +		if (c == '\'' || c == '"') { +			break;  // limit to 1 delim for quotes +		}  	}  	len = utf8proc_iterate(subj->input.data + subj->pos, @@ -289,6 +292,9 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close)  	if (c == '_') {  		*can_open = left_flanking && !right_flanking;  		*can_close = right_flanking && !left_flanking; +	} else if (c == '\'' || c == '"') { +		*can_open = left_flanking && !right_flanking; +		*can_close = right_flanking;  	} else {  		*can_open = left_flanking;  		*can_close = right_flanking; @@ -349,25 +355,68 @@ static void push_delimiter(subject *subj, unsigned char c, bool can_open,  	subj->last_delim = delim;  } -// Parse strong/emph or a fallback. -// Assumes the subject has '_' or '*' at the current position. -static cmark_node* handle_strong_emph(subject* subj, unsigned char c) +// Assumes the subject has a c at the current position. +static cmark_node* handle_delim(subject* subj, unsigned char c, bool smart)  {  	int numdelims;  	cmark_node * inl_text;  	bool can_open, can_close; +	cmark_chunk contents;  	numdelims = scan_delims(subj, c, &can_open, &can_close); -	inl_text = make_str(cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims)); +	if (c == '\'' && smart) { +		contents = cmark_chunk_literal("’"); +	} else if (c == '"' && smart) { +		contents = cmark_chunk_literal("”"); +	} else { +		contents = cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims); +	} + +	inl_text = make_str(contents); -	if (can_open || can_close) { +	if ((can_open || can_close) && +	    (!(c == '\'' || c == '"') || smart)) {  		push_delimiter(subj, c, can_open, can_close, inl_text);  	}  	return inl_text;  } +// Assumes we have a hyphen at the current position. +static cmark_node* handle_hyphen(subject* subj, bool smart) +{ +	advance(subj); +	if (smart && peek_char(subj) == '-') { +		advance(subj); +		if (peek_char(subj) == '-') { +			advance(subj); +			return make_str(cmark_chunk_literal("—")); +		} else { +			return make_str(cmark_chunk_literal("–")); +		} +	} else { +		return make_str(cmark_chunk_literal("-")); +	} +} + +// Assumes we have a period at the current position. +static cmark_node* handle_period(subject* subj, bool smart) +{ +	advance(subj); +	if (smart && peek_char(subj) == '.') { +		advance(subj); +		if (peek_char(subj) == '.') { +			advance(subj); +			return make_str(cmark_chunk_literal("…")); +		} else { +			return make_str(cmark_chunk_literal("..")); +		} +	} else { +		return make_str(cmark_chunk_literal(".")); +	} +} +  static void process_emphasis(subject *subj, delimiter *start_delim)  {  	delimiter *closer = subj->last_delim; @@ -381,7 +430,8 @@ static void process_emphasis(subject *subj, delimiter *start_delim)  	// now move forward, looking for closers, and handling each  	while (closer != NULL) {  		if (closer->can_close && -		    (closer->delim_char == '*' || closer->delim_char == '_')) { +		    (closer->delim_char == '*' || closer->delim_char == '_' || +		     closer->delim_char == '"' || closer->delim_char == '\'')) {  			// Now look backwards for first matching opener:  			opener = closer->previous;  			while (opener != NULL && opener != start_delim) { @@ -391,9 +441,31 @@ static void process_emphasis(subject *subj, delimiter *start_delim)  				}  				opener = opener->previous;  			} -			if (opener != NULL && opener != start_delim) { -				closer = S_insert_emph(subj, opener, closer); -			} else { +			if (closer->delim_char == '*' || closer->delim_char == '_') { +				if (opener != NULL && opener != start_delim) { +					closer = S_insert_emph(subj, opener, closer); +				} else { +					closer = closer->next; +				} +			} else if (closer->delim_char == '\'') { +				cmark_chunk_free(&closer->inl_text->as.literal); +				closer->inl_text->as.literal = +					cmark_chunk_literal("’"); +				if (opener != NULL && opener != start_delim) { +					cmark_chunk_free(&opener->inl_text->as.literal); +					opener->inl_text->as.literal = +						cmark_chunk_literal("‘"); +				} +				closer = closer->next; +			} else if (closer->delim_char == '"') { +				cmark_chunk_free(&closer->inl_text->as.literal); +				closer->inl_text->as.literal = +					cmark_chunk_literal("”"); +				if (opener != NULL && opener != start_delim) { +					cmark_chunk_free(&opener->inl_text->as.literal); +					opener->inl_text->as.literal = +						cmark_chunk_literal("“"); +				}  				closer = closer->next;  			}  		} else { @@ -866,7 +938,7 @@ static int subject_find_special_char(subject *subj, long options)  	};  	// " ' . - -	static const char SMART_PUNCT_TABLE[] = { +	static const char SMART_PUNCT_CHARS[] = {  		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  		0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, @@ -890,6 +962,9 @@ static int subject_find_special_char(subject *subj, long options)  	while (n < subj->input.len) {  		if (SPECIAL_CHARS[subj->input.data[n]])  			return n; +		if (options & CMARK_OPT_SMARTPUNCT && +		    SMART_PUNCT_CHARS[subj->input.data[n]]) +			return n;  		n++;  	} @@ -926,7 +1001,15 @@ static int parse_inline(subject* subj, cmark_node * parent, long options)  		break;  	case '*':  	case '_': -		new_inl = handle_strong_emph(subj, c); +	case '\'': +	case '"': +		new_inl = handle_delim(subj, c, options & CMARK_OPT_SMARTPUNCT); +		break; +	case '-': +		new_inl = handle_hyphen(subj, options & CMARK_OPT_SMARTPUNCT); +		break; +	case '.': +		new_inl = handle_period(subj, options & CMARK_OPT_SMARTPUNCT);  		break;  	case '[':  		advance(subj); @@ -7,10 +7,11 @@  #include "cmark.h"  #include "node.h"  #include "buffer.h" -#include "smart.h"  // Functions to convert cmark_nodes to groff man strings. +// TODO:  properly escape unicode punctuation used in smart mode: +// "\\[lq]", "\\[rq]", "\\[oq]", "\\[cq]", "\\[em]", "\\[en]", "..."  static void escape_man(cmark_strbuf *dest, const unsigned char *source, int length)  {  	int i; @@ -47,7 +48,7 @@ struct render_state {  static int  S_render_node(cmark_node *node, cmark_event_type ev_type, -              struct render_state *state, long options) +              struct render_state *state)  {  	cmark_node *tmp;  	cmark_strbuf *man = state->man; @@ -166,14 +167,8 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,  		break;  	case CMARK_NODE_TEXT: -		if (options & CMARK_OPT_SMARTPUNCT) { -			escape_with_smart(man, node, escape_man, -					  "\\[lq]", "\\[rq]", "\\[oq]", "\\[cq]", -					  "\\[em]", "\\[en]", "..."); -		} else { -			escape_man(man, node->as.literal.data, -				   node->as.literal.len); -		} +		escape_man(man, node->as.literal.data, +			   node->as.literal.len);  		break;  	case CMARK_NODE_LINEBREAK: @@ -248,7 +243,7 @@ char *cmark_render_man(cmark_node *root, long options)  	while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {  		cur = cmark_iter_get_node(iter); -		S_render_node(cur, ev_type, &state, options); +		S_render_node(cur, ev_type, &state);  	}  	result = (char *)cmark_strbuf_detach(&man); diff --git a/src/smart.c b/src/smart.c deleted file mode 100644 index cb67448..0000000 --- a/src/smart.c +++ /dev/null @@ -1,174 +0,0 @@ -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <assert.h> - -#include "config.h" -#include "cmark.h" -#include "node.h" -#include "utf8.h" -#include "buffer.h" -#include "chunk.h" - -static const char SMART_PUNCT_TABLE[] = { -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -	0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; - -void escape_with_smart(cmark_strbuf *buf, -		       cmark_node *node, -		       void (*escape)(cmark_strbuf *, const unsigned char *, int), -		       const char *left_double_quote, -		       const char *right_double_quote, -		       const char *left_single_quote, -		       const char *right_single_quote, -		       const char *em_dash, -		       const char *en_dash, -		       const char *ellipses) -{ -	char c; -	int32_t after_char = 0; -	int32_t before_char = 0; -	bool left_flanking, right_flanking; -	int lastout = 0; -	int i = 0, j = 0; -	cmark_chunk lit = node->as.literal; -	int len; - -	while (i < lit.len) { -		c = lit.data[i]; -		i++; -		if (SMART_PUNCT_TABLE[(int)c] == 0) { -			continue; -		} - -		if (i - 1 - lastout > 0) { -			(*escape)(buf, lit.data + lastout, i - 1 - lastout); -		} - -		if (c == 34 || c == 39) { -			if (i == 1) { -                                // set before_char based on previous text node if there is one: -				if (node->prev) { -					if (node->prev->type == CMARK_NODE_TEXT) { - -						// walk to the beginning of the UTF_8 sequence: -						j = node->prev->as.literal.len - 1; -						while (j > 0 && -						       node->prev->as.literal.data[j] >> 6 == 2) { -							j--; -						} -						len = utf8proc_iterate(node->prev->as.literal.data + i, -								       node->prev->as.literal.len - i, -								       &before_char); -						if (len == -1) { -							before_char = 10; -						} - -					} else if (node->prev->type == CMARK_NODE_SOFTBREAK || -						   node->prev->type == CMARK_NODE_LINEBREAK) { -						before_char = 10; - -					} else { -						before_char = 65; -					} -				} else { -					before_char = 10; -				} -			} else { -				j = i - 2; -				// walk back to the beginning of the UTF_8 sequence: -				while (j > 0 && lit.data[j] >> 6 == 2) { -					j--; -				} -				utf8proc_iterate(lit.data + j, lit.len - j, &before_char); -			} - -			if (i >= lit.len) { -				if (node->next) { -					if (node->next->type == CMARK_NODE_TEXT) { -						utf8proc_iterate(node->next->as.literal.data, -								 node->next->as.literal.len, -								 &after_char); -					} else if (node->next->type == CMARK_NODE_SOFTBREAK || -						   node->next->type == CMARK_NODE_LINEBREAK) { -						after_char = 10; -					} else { -						after_char = 65; -					} -				} else { -					after_char = 10; -				} -			} else { -				utf8proc_iterate(lit.data + i, lit.len - i, &after_char); -			} - -			left_flanking = !utf8proc_is_space(after_char) && -				!(utf8proc_is_punctuation(after_char) && -				  !utf8proc_is_space(before_char) && -				  !utf8proc_is_punctuation(before_char)); -			right_flanking = !utf8proc_is_space(before_char) && -				!(utf8proc_is_punctuation(before_char) && -				  !utf8proc_is_space(after_char) && -				  !utf8proc_is_punctuation(after_char)); -		} - -		switch (c) { -		case '"': -			if (right_flanking) { -				cmark_strbuf_puts(buf, right_double_quote); -			} else { -				cmark_strbuf_puts(buf, left_double_quote); -			} -			break; -		case '\'': -			if (left_flanking && !right_flanking) { -				cmark_strbuf_puts(buf, left_single_quote); -			} else { -				cmark_strbuf_puts(buf, right_single_quote); -			} -			break; -		case '-': -			if (i < lit.len && lit.data[i] == '-') { -				if (lit.data[i + 1] == '-') { -					cmark_strbuf_puts(buf, em_dash); -					i += 2; -				} else { -					cmark_strbuf_puts(buf, en_dash); -					i += 1; -				} -			} else { -				cmark_strbuf_putc(buf, c); -			} -			break; -		case '.': -			if (i < lit.len - 1 && lit.data[i] == '.' && -			    lit.data[i + 1] == '.') { -				cmark_strbuf_puts(buf, ellipses); -				i += 2; -			} else { -				cmark_strbuf_putc(buf, c); -			} -			break; -		default: -			cmark_strbuf_putc(buf, c); -		} -		lastout = i; -	} -	(*escape)(buf, node->as.literal.data + lastout, lit.len - lastout); - -} diff --git a/src/smart.h b/src/smart.h deleted file mode 100644 index fa614b3..0000000 --- a/src/smart.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef CMARK_SMART_H -#define CMARK_SMART_H - -#include <stddef.h> -#include <stdarg.h> -#include "config.h" - -#ifdef __cplusplus -extern "C" { -#endif - -void escape_with_smart(cmark_strbuf *buf, -		       cmark_node *node, -		       void (*escape)(cmark_strbuf *, const unsigned char *, int), -		       const char *left_double_quote, -		       const char *right_double_quote, -		       const char *left_single_quote, -		       const char *right_single_quote, -		       const char *em_dash, -		       const char *en_dash, -		       const char *ellipses); - -#ifdef __cplusplus -} -#endif - -#endif - diff --git a/test/smart_punct.txt b/test/smart_punct.txt index c870c9d..5deccde 100644 --- a/test/smart_punct.txt +++ b/test/smart_punct.txt @@ -4,58 +4,58 @@  "Hello," said the spider.  "'Shelob' is my name."  . -<p>“Hello,” said the spider. -“‘Shelob’ is my name.”</p> +<p>“Hello,” said the spider. +“‘Shelob’ is my name.”</p>  .  .  'A', 'B', and 'C' are letters.  . -<p>‘A’, ‘B’, and ‘C’ are letters.</p> +<p>‘A’, ‘B’, and ‘C’ are letters.</p>  .  .  'Oak,' 'elm,' and 'beech' are names of trees.  So is 'pine.'  . -<p>‘Oak,’ ‘elm,’ and ‘beech’ are names of trees. -So is ‘pine.’</p> +<p>‘Oak,’ ‘elm,’ and ‘beech’ are names of trees. +So is ‘pine.’</p>  .  .  'He said, "I want to go."'  . -<p>‘He said, “I want to go.”’</p> +<p>‘He said, “I want to go.”’</p>  .  .  Were you alive in the 70's?  . -<p>Were you alive in the 70’s?</p> +<p>Were you alive in the 70’s?</p>  .  .  Here is some quoted '`code`' and a "[quoted link](url)".  . -<p>Here is some quoted ‘<code>code</code>’ and a “<a href="url">quoted link</a>”.</p> +<p>Here is some quoted ‘<code>code</code>’ and a “<a href="url">quoted link</a>”.</p>  .  .  Some dashes:  one---two ---  three---four --- five.  . -<p>Some dashes:  one—two — -three—four — five.</p> +<p>Some dashes:  one—two — +three—four — five.</p>  .  .  Dashes between numbers: 5--7, 255--66, 1987--1999.  . -<p>Dashes between numbers: 5–7, 255–66, 1987–1999.</p> +<p>Dashes between numbers: 5–7, 255–66, 1987–1999.</p>  .  .  Ellipses...and...and....  . -<p>Ellipses…and…and….</p> +<p>Ellipses…and…and….</p>  . | 
