Added smart.c, smart.h with function abstracting smart punct rendering.

Also fixed some bugs in earlier smart handling. Now handles UTF-8.
author: John MacFarlane <jgm@berkeley.edu> 2015-02-14 21:18:55 -0800
committer: John MacFarlane <jgm@berkeley.edu> 2015-02-14 22:36:34 -0800
commit: 5860d60cc79332c01da59f39e90fff2bcc8a5e9e (patch)
tree: 5e3d033c11cdbf50ac8c34f93389492c793579fc
parent: 7c92577bbb670ddfbf6df5ee4b931c27548230cc (diff)
5 files changed, 182 insertions, 112 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 2179c08..2150e7a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -16,6 +16,7 @@ set(HEADERS
   html_unescape.h
   houdini.h
   cmark_ctype.h
+  smart.h
   )
 set(LIBRARY_SOURCES
   cmark.c
@@ -36,6 +37,7 @@ set(LIBRARY_SOURCES
   houdini_html_e.c
   houdini_html_u.c
   cmark_ctype.c
+  smart.c
   ${HEADERS}
   )
 
diff --git a/src/html.c b/src/html.c
index b4d351e..1f64196 100644
--- a/src/html.c
+++ b/src/html.c
@@ -9,6 +9,7 @@
 #include "utf8.h"
 #include "buffer.h"
 #include "houdini.h"
+#include "smart.h"
 
 // Functions to convert cmark_nodes to HTML strings.
 
@@ -61,10 +62,6 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,
 	char start_header[] = "<h0";
 	char end_header[] = "</h0";
 	bool tight;
-	int lastout, i;
-	cmark_chunk lit;
-	char before_char, after_char, c;
-	bool left_flanking, right_flanking;
 
 	bool entering = (ev_type == CMARK_EVENT_ENTER);
 
@@ -223,112 +220,9 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,
 
 	case CMARK_NODE_TEXT:
 		if (options & CMARK_OPT_SMARTPUNCT) {
-			lastout = 0;
-			i = 0;
-			lit = node->as.literal;
-			while (i < lit.len) {
-				c = lit.data[i];
-				// replace with efficient lookup table:
-				if (c != '"' && c != '-' && c != '\'' && c != '.') {
-					i++;
-					continue;
-				}
-				escape_html(html, lit.data + lastout,
-				            i - lastout);
-				if (c == '\'' || c == '"') {
-					if (i == 0) {
-						if (node->prev) {
-							if (node->prev->type == CMARK_NODE_TEXT) {
-								before_char = node->prev->as.literal.data[node->prev->as.literal.len - 1];
-							} else if (node->prev->type == CMARK_NODE_SOFTBREAK ||
-							           node->prev->type == CMARK_NODE_LINEBREAK) {
-								before_char = '\n';
-							} else {
-								before_char = 'x';
-							}
-						} else {
-							before_char = '\n';
-						}
-					} else {
-						before_char = lit.data[i - 1];
-					}
-					if (i >= lit.len - 1) {
-						if (node->next) {
-							if (node->next->type == CMARK_NODE_TEXT) {
-								after_char = node->next->as.literal.data[0];
-							} else if (node->next->type == CMARK_NODE_SOFTBREAK ||
-							           node->next->type == CMARK_NODE_LINEBREAK) {
-								before_char = '\n';
-							} else {
-								after_char = 'x';
-							}
-						} else {
-							after_char = '\n';
-						}
-					} else {
-						after_char = lit.data[i + 1];
-					}
-					left_flanking = !utf8proc_is_space(after_char) &&
-					                !(utf8proc_is_punctuation(after_char) &&
-					                  !utf8proc_is_space(before_char) &&
-					                  !utf8proc_is_punctuation(before_char));
-					right_flanking = !utf8proc_is_space(before_char) &&
-					                 !(utf8proc_is_punctuation(before_char) &&
-					                   !utf8proc_is_space(after_char) &&
-					                   !utf8proc_is_punctuation(after_char));
-				}
-				switch (lit.data[i]) {
-				case '"':
-					if (right_flanking) {
-						cmark_strbuf_puts(html, "&rdquo;");
-					} else {
-						cmark_strbuf_puts(html, "&ldquo;");
-					}
-					i += 1;
-					break;
-				case '\'':
-					if (left_flanking && !right_flanking) {
-						cmark_strbuf_puts(html, "&lsquo;");
-					} else {
-						cmark_strbuf_puts(html, "&rsquo;");
-					}
-					i += 1;
-					break;
-				case '-':
-					if (i < lit.len - 1 && lit.data[i + 1] == '-') {
-						if (lit.data[i + 2] == '-') {
-							cmark_strbuf_puts(html,
-							                  "&mdash;");
-							i += 3;
-						} else {
-							cmark_strbuf_puts(html, "&ndash;");
-							i += 2;
-						}
-					} else {
-						cmark_strbuf_putc(html, c);
-						i += 1;
-					}
-					break;
-				case '.':
-					if (i < lit.len - 2 && lit.data[i + 1] == '.' &&
-					    lit.data[i + 2] == '.') {
-						cmark_strbuf_puts(html,
-						                  "&hellip;");
-						i += 3;
-					} else {
-						cmark_strbuf_putc(html, c);
-						i += 1;
-					}
-					break;
-				default:
-					cmark_strbuf_putc(html, c);
-					i++;
-				}
-				lastout = i;
-			}
-			escape_html(html, node->as.literal.data + lastout,
-			            i - lastout);
-
+			escape_with_smart(html, node, escape_html,
+					  "&ldquo;", "&rdquo;", "&lsquo;", "&rsquo;",
+					  "&mdash;", "&ndash;", "&hellip;");
 		} else {
 			escape_html(html, node->as.literal.data,
 			            node->as.literal.len);
diff --git a/src/smart.c b/src/smart.c
new file mode 100644
index 0000000..54c9740
--- /dev/null
+++ b/src/smart.c
@@ -0,0 +1,146 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "config.h"
+#include "cmark.h"
+#include "node.h"
+#include "utf8.h"
+#include "buffer.h"
+#include "chunk.h"
+
+// Write the literal text of `node` to `buf`, replacing straight quotes, "--",
+// "---" and "..." with the caller's strings; the text in between goes through
+// `escape`. A quote opens or closes depending on the code points on either
+// side of it; at the edges of the literal those come from the sibling nodes.
+void escape_with_smart(cmark_strbuf *buf,
+		       cmark_node *node,
+		       void (*escape)(cmark_strbuf *, const unsigned char *, int),
+		       const char *left_double_quote,
+		       const char *right_double_quote,
+		       const char *left_single_quote,
+		       const char *right_single_quote,
+		       const char *em_dash,
+		       const char *en_dash,
+		       const char *ellipses)
+{
+	int32_t c = 0;
+	int32_t after_char = 0;
+	int32_t before_char = 10; // newline unless the previous node says otherwise
+	int len, after_len;
+	bool left_flanking = false, right_flanking = false;
+	int lastout = 0; // start of the text not yet written to buf
+	int i = 0; // read position in lit; the main loop must start at 0
+	int j; // index into the previous node's literal, kept apart from i
+	cmark_chunk lit = node->as.literal;
+	cmark_node *prev = node->prev;
+	cmark_node *next = node->next;
+
+	// set before_char based on the previous node if there is one:
+	if (prev && prev->type == CMARK_NODE_TEXT) {
+		if (prev->as.literal.len > 0) {
+			// walk back to the beginning of the UTF-8 sequence:
+			j = prev->as.literal.len - 1;
+			while (j > 0 && prev->as.literal.data[j] >> 6 == 2) {
+				j -= 1;
+			}
+			len = utf8proc_iterate(prev->as.literal.data + j,
+					       prev->as.literal.len - j,
+					       &before_char);
+			if (len < 1) {
+				before_char = 10;
+			}
+		}
+	} else if (prev && prev->type != CMARK_NODE_SOFTBREAK &&
+		   prev->type != CMARK_NODE_LINEBREAK) {
+		before_char = 65; // 'A': any other inline is treated like a letter
+	}
+
+	while (i < lit.len) {
+		len = utf8proc_iterate(lit.data + i, lit.len - i, &c);
+		if (len < 1) {
+			// undecodable byte: step over it so the loop always advances
+			len = 1;
+			c = 0xFFFD;
+		}
+		i += len;
+
+		// replace with efficient lookup table:
+		if (!(c == 34 || c == 39 || c == 45 || c == 46)) {
+			before_char = c;
+			continue;
+		}
+		(*escape)(buf, lit.data + lastout, i - len - lastout);
+
+		if (c == 34 || c == 39) {
+			// find the code point that follows the quote:
+			after_len = 1;
+			after_char = 10;
+			if (i < lit.len) {
+				after_len = utf8proc_iterate(lit.data + i, lit.len - i,
+							     &after_char);
+			} else if (next && next->type == CMARK_NODE_TEXT) {
+				after_len = utf8proc_iterate(next->as.literal.data,
+							     next->as.literal.len, &after_char);
+			} else if (next && next->type != CMARK_NODE_SOFTBREAK &&
+				   next->type != CMARK_NODE_LINEBREAK) {
+				after_char = 65;
+			}
+			if (after_len < 1) {
+				after_char = 10; // nothing decodable follows
+			}
+			left_flanking = !utf8proc_is_space(after_char) &&
+				!(utf8proc_is_punctuation(after_char) &&
+				  !utf8proc_is_space(before_char) &&
+				  !utf8proc_is_punctuation(before_char));
+			right_flanking = !utf8proc_is_space(before_char) &&
+				!(utf8proc_is_punctuation(before_char) &&
+				  !utf8proc_is_space(after_char) &&
+				  !utf8proc_is_punctuation(after_char));
+		}
+
+		switch (c) {
+		case 34: // "
+			if (right_flanking) {
+				cmark_strbuf_puts(buf, right_double_quote);
+			} else {
+				cmark_strbuf_puts(buf, left_double_quote);
+			}
+			break;
+		case 39: // '
+			if (left_flanking && !right_flanking) {
+				cmark_strbuf_puts(buf, left_single_quote);
+			} else {
+				cmark_strbuf_puts(buf, right_single_quote);
+			}
+			break;
+		case 45: // -
+			if (i + 1 < lit.len && lit.data[i] == '-' &&
+			    lit.data[i + 1] == '-') {
+				cmark_strbuf_puts(buf, em_dash);
+				i += 2;
+			} else if (i < lit.len && lit.data[i] == '-') {
+				cmark_strbuf_puts(buf, en_dash);
+				i += 1;
+			} else {
+				cmark_strbuf_putc(buf, c);
+			}
+			break;
+		case 46: // .
+			if (i + 1 < lit.len && lit.data[i] == '.' &&
+			    lit.data[i + 1] == '.') {
+				cmark_strbuf_puts(buf, ellipses);
+				i += 2;
+			} else {
+				cmark_strbuf_putc(buf, c);
+			}
+			break;
+		default:
+			cmark_strbuf_putc(buf, c);
+		}
+		before_char = c; // the next quote is preceded by this character
+		lastout = i;
+	}
+	(*escape)(buf, lit.data + lastout, lit.len - lastout);
+}
diff --git a/src/smart.h b/src/smart.h
new file mode 100644
index 0000000..fa614b3
--- /dev/null
+++ b/src/smart.h
@@ -0,0 +1,28 @@
+#ifndef CMARK_SMART_H
+#define CMARK_SMART_H
+
+#include <stddef.h>
+#include <stdarg.h>
+#include "config.h"
+#include "buffer.h"
+#include "node.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Writes the text of `node` to `buf` with smart punctuation: quotes, dashes
+// and ellipses become the given strings, other text goes through `escape`.
+void escape_with_smart(cmark_strbuf *buf,
+		       cmark_node *node,
+		       void (*escape)(cmark_strbuf *, const unsigned char *, int),
+		       const char *left_double_quote,
+		       const char *right_double_quote,
+		       const char *left_single_quote,
+		       const char *right_single_quote,
+		       const char *em_dash,
+		       const char *en_dash,
+		       const char *ellipses);
+#ifdef __cplusplus
+}
+#endif
+#endif
+
diff --git a/test/smart_punct.txt b/test/smart_punct.txt
index c036a6d..c870c9d 100644
--- a/test/smart_punct.txt
+++ b/test/smart_punct.txt
@@ -35,9 +35,9 @@ Were you alive in the 70's?
 .
 
 .
-Here is some quoted '`code`' and a "[quoted link][1]".
+Here is some quoted '`code`' and a "[quoted link](url)".
 .
-<p>Here is some quoted &lsquo;<code>code</code>&rsquo; and a &ldquo;[quoted link][1]&rdquo;.</p>
+<p>Here is some quoted &lsquo;<code>code</code>&rsquo; and a &ldquo;<a href="url">quoted link</a>&rdquo;.</p>
 .
 
 .
author	John MacFarlane <jgm@berkeley.edu>	2015-02-14 21:18:55 -0800
committer	John MacFarlane <jgm@berkeley.edu>	2015-02-14 22:36:34 -0800
commit	5860d60cc79332c01da59f39e90fff2bcc8a5e9e (patch)
tree	5e3d033c11cdbf50ac8c34f93389492c793579fc
parent	7c92577bbb670ddfbf6df5ee4b931c27548230cc (diff)