summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2015-02-14 21:18:55 -0800
committerJohn MacFarlane <jgm@berkeley.edu>2015-02-14 22:36:34 -0800
commit5860d60cc79332c01da59f39e90fff2bcc8a5e9e (patch)
tree5e3d033c11cdbf50ac8c34f93389492c793579fc
parent7c92577bbb670ddfbf6df5ee4b931c27548230cc (diff)
Added smart.c, smart.h with function abstracting smart punct rendering.
Also fixed some bugs in earlier smart handling. Now handles UTF-8.
-rw-r--r--src/CMakeLists.txt2
-rw-r--r--src/html.c114
-rw-r--r--src/smart.c146
-rw-r--r--src/smart.h28
-rw-r--r--test/smart_punct.txt4
5 files changed, 182 insertions, 112 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 2179c08..2150e7a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -16,6 +16,7 @@ set(HEADERS
html_unescape.h
houdini.h
cmark_ctype.h
+ smart.h
)
set(LIBRARY_SOURCES
cmark.c
@@ -36,6 +37,7 @@ set(LIBRARY_SOURCES
houdini_html_e.c
houdini_html_u.c
cmark_ctype.c
+ smart.c
${HEADERS}
)
diff --git a/src/html.c b/src/html.c
index b4d351e..1f64196 100644
--- a/src/html.c
+++ b/src/html.c
@@ -9,6 +9,7 @@
#include "utf8.h"
#include "buffer.h"
#include "houdini.h"
+#include "smart.h"
// Functions to convert cmark_nodes to HTML strings.
@@ -61,10 +62,6 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,
char start_header[] = "<h0";
char end_header[] = "</h0";
bool tight;
- int lastout, i;
- cmark_chunk lit;
- char before_char, after_char, c;
- bool left_flanking, right_flanking;
bool entering = (ev_type == CMARK_EVENT_ENTER);
@@ -223,112 +220,9 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,
case CMARK_NODE_TEXT:
if (options & CMARK_OPT_SMARTPUNCT) {
- lastout = 0;
- i = 0;
- lit = node->as.literal;
- while (i < lit.len) {
- c = lit.data[i];
- // replace with efficient lookup table:
- if (c != '"' && c != '-' && c != '\'' && c != '.') {
- i++;
- continue;
- }
- escape_html(html, lit.data + lastout,
- i - lastout);
- if (c == '\'' || c == '"') {
- if (i == 0) {
- if (node->prev) {
- if (node->prev->type == CMARK_NODE_TEXT) {
- before_char = node->prev->as.literal.data[node->prev->as.literal.len - 1];
- } else if (node->prev->type == CMARK_NODE_SOFTBREAK ||
- node->prev->type == CMARK_NODE_LINEBREAK) {
- before_char = '\n';
- } else {
- before_char = 'x';
- }
- } else {
- before_char = '\n';
- }
- } else {
- before_char = lit.data[i - 1];
- }
- if (i >= lit.len - 1) {
- if (node->next) {
- if (node->next->type == CMARK_NODE_TEXT) {
- after_char = node->next->as.literal.data[0];
- } else if (node->next->type == CMARK_NODE_SOFTBREAK ||
- node->next->type == CMARK_NODE_LINEBREAK) {
- before_char = '\n';
- } else {
- after_char = 'x';
- }
- } else {
- after_char = '\n';
- }
- } else {
- after_char = lit.data[i + 1];
- }
- left_flanking = !utf8proc_is_space(after_char) &&
- !(utf8proc_is_punctuation(after_char) &&
- !utf8proc_is_space(before_char) &&
- !utf8proc_is_punctuation(before_char));
- right_flanking = !utf8proc_is_space(before_char) &&
- !(utf8proc_is_punctuation(before_char) &&
- !utf8proc_is_space(after_char) &&
- !utf8proc_is_punctuation(after_char));
- }
- switch (lit.data[i]) {
- case '"':
- if (right_flanking) {
- cmark_strbuf_puts(html, "&rdquo;");
- } else {
- cmark_strbuf_puts(html, "&ldquo;");
- }
- i += 1;
- break;
- case '\'':
- if (left_flanking && !right_flanking) {
- cmark_strbuf_puts(html, "&lsquo;");
- } else {
- cmark_strbuf_puts(html, "&rsquo;");
- }
- i += 1;
- break;
- case '-':
- if (i < lit.len - 1 && lit.data[i + 1] == '-') {
- if (lit.data[i + 2] == '-') {
- cmark_strbuf_puts(html,
- "&mdash;");
- i += 3;
- } else {
- cmark_strbuf_puts(html, "&ndash;");
- i += 2;
- }
- } else {
- cmark_strbuf_putc(html, c);
- i += 1;
- }
- break;
- case '.':
- if (i < lit.len - 2 && lit.data[i + 1] == '.' &&
- lit.data[i + 2] == '.') {
- cmark_strbuf_puts(html,
- "&hellip;");
- i += 3;
- } else {
- cmark_strbuf_putc(html, c);
- i += 1;
- }
- break;
- default:
- cmark_strbuf_putc(html, c);
- i++;
- }
- lastout = i;
- }
- escape_html(html, node->as.literal.data + lastout,
- i - lastout);
-
+ escape_with_smart(html, node, escape_html,
+ "&ldquo;", "&rdquo;", "&lsquo;", "&rsquo;",
+ "&mdash;", "&ndash;", "&hellip;");
} else {
escape_html(html, node->as.literal.data,
node->as.literal.len);
diff --git a/src/smart.c b/src/smart.c
new file mode 100644
index 0000000..54c9740
--- /dev/null
+++ b/src/smart.c
@@ -0,0 +1,146 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "config.h"
+#include "cmark.h"
+#include "node.h"
+#include "utf8.h"
+#include "buffer.h"
+#include "chunk.h"
+
+// Returns the code point that immediately precedes the text of `node`,
+// for use when classifying a quote at the very start of the literal.
+// A missing sibling, a line break, an empty text sibling or undecodable
+// bytes all count as a newline; any other inline sibling counts as a
+// letter ('A').
+static int32_t S_char_before(cmark_node *node)
+{
+	cmark_node *prev = node->prev;
+	int32_t ch = '\n';
+	int start;
+
+	if (prev == NULL) {
+		return '\n';
+	}
+	if (prev->type == CMARK_NODE_TEXT) {
+		cmark_chunk prev_lit = prev->as.literal;
+
+		if (prev_lit.len <= 0) {
+			return '\n';
+		}
+		// walk back over continuation bytes (10xxxxxx) to the
+		// first byte of the final UTF-8 sequence:
+		start = prev_lit.len - 1;
+		while (start > 0 && prev_lit.data[start] >> 6 == 2) {
+			start -= 1;
+		}
+		if (utf8proc_iterate(prev_lit.data + start,
+				     prev_lit.len - start, &ch) < 1) {
+			return '\n';
+		}
+		return ch;
+	}
+	if (prev->type == CMARK_NODE_SOFTBREAK ||
+	    prev->type == CMARK_NODE_LINEBREAK) {
+		return '\n';
+	}
+	return 'A';
+}
+
+// Returns the code point that immediately follows the text of `node`,
+// with the same fallbacks as S_char_before.
+static int32_t S_char_after(cmark_node *node)
+{
+	cmark_node *next = node->next;
+	int32_t ch = '\n';
+
+	if (next == NULL) {
+		return '\n';
+	}
+	if (next->type == CMARK_NODE_TEXT) {
+		if (utf8proc_iterate(next->as.literal.data,
+				     next->as.literal.len, &ch) < 1) {
+			return '\n';
+		}
+		return ch;
+	}
+	if (next->type == CMARK_NODE_SOFTBREAK ||
+	    next->type == CMARK_NODE_LINEBREAK) {
+		return '\n';
+	}
+	return 'A';
+}
+
+// Writes the literal text of `node` to `buf` with "smart" punctuation.
+//
+// Runs of ordinary text are passed to `escape`.  Straight double and
+// single quotes are replaced by the supplied left/right quote strings
+// according to the characters on either side of them, "---" by
+// `em_dash`, "--" by `en_dash` and "..." by `ellipses`.  A lone '-' or
+// '.' is written unchanged.  The replacement strings are written
+// verbatim (they are not passed through `escape`).
+//
+// The text is scanned as UTF-8; a byte sequence that cannot be decoded
+// is stepped over one byte at a time and left for `escape` to handle.
+void escape_with_smart(cmark_strbuf *buf,
+		       cmark_node *node,
+		       void (*escape)(cmark_strbuf *, const unsigned char *, int),
+		       const char *left_double_quote,
+		       const char *right_double_quote,
+		       const char *left_single_quote,
+		       const char *right_single_quote,
+		       const char *em_dash,
+		       const char *en_dash,
+		       const char *ellipses)
+{
+	cmark_chunk lit = node->as.literal;
+	int32_t c = 0;
+	int32_t after_char = 0;
+	// uses its own index, so the scan below always starts at offset 0:
+	int32_t before_char = S_char_before(node);
+	bool left_flanking = false;
+	bool right_flanking = false;
+	int lastout = 0;
+	int len;
+	int i = 0;
+
+	while (i < lit.len) {
+		len = utf8proc_iterate(lit.data + i, lit.len - i, &c);
+		if (len < 1) {
+			// undecodable byte: always make forward progress and
+			// treat it like the other "unknown" neighbours.
+			len = 1;
+			c = '\n';
+		}
+		i += len;
+
+		// replace with efficient lookup table:
+		if (!(c == '"' || c == '\'' || c == '-' || c == '.')) {
+			before_char = c;
+			continue;
+		}
+		// flush the ordinary text preceding this character:
+		(*escape)(buf, lit.data + lastout, i - len - lastout);
+
+		if (c == '"' || c == '\'') {
+			if (i >= lit.len) {
+				after_char = S_char_after(node);
+			} else if (utf8proc_iterate(lit.data + i, lit.len - i,
+						    &after_char) < 1) {
+				after_char = '\n';
+			}
+
+			left_flanking = !utf8proc_is_space(after_char) &&
+				!(utf8proc_is_punctuation(after_char) &&
+				  !utf8proc_is_space(before_char) &&
+				  !utf8proc_is_punctuation(before_char));
+			right_flanking = !utf8proc_is_space(before_char) &&
+				!(utf8proc_is_punctuation(before_char) &&
+				  !utf8proc_is_space(after_char) &&
+				  !utf8proc_is_punctuation(after_char));
+		}
+
+		switch (c) {
+		case '"':
+			if (right_flanking) {
+				cmark_strbuf_puts(buf, right_double_quote);
+			} else {
+				cmark_strbuf_puts(buf, left_double_quote);
+			}
+			break;
+		case '\'':
+			if (left_flanking && !right_flanking) {
+				cmark_strbuf_puts(buf, left_single_quote);
+			} else {
+				cmark_strbuf_puts(buf, right_single_quote);
+			}
+			break;
+		case '-':
+			if (i < lit.len && lit.data[i] == '-') {
+				// only look at a third hyphen if it is in bounds:
+				if (i + 1 < lit.len && lit.data[i + 1] == '-') {
+					cmark_strbuf_puts(buf, em_dash);
+					i += 2;
+				} else {
+					cmark_strbuf_puts(buf, en_dash);
+					i += 1;
+				}
+			} else {
+				cmark_strbuf_putc(buf, c);
+			}
+			break;
+		case '.':
+			if (i + 1 < lit.len && lit.data[i] == '.' &&
+			    lit.data[i + 1] == '.') {
+				cmark_strbuf_puts(buf, ellipses);
+				i += 2;
+			} else {
+				cmark_strbuf_putc(buf, c);
+			}
+			break;
+		default:
+			cmark_strbuf_putc(buf, c);
+		}
+		// the character just consumed (or the last one of a "--",
+		// "---" or "..." run) is what precedes whatever comes next:
+		before_char = c;
+		lastout = i;
+	}
+	(*escape)(buf, lit.data + lastout, lit.len - lastout);
+}
diff --git a/src/smart.h b/src/smart.h
new file mode 100644
index 0000000..fa614b3
--- /dev/null
+++ b/src/smart.h
@@ -0,0 +1,28 @@
+#ifndef CMARK_SMART_H
+#define CMARK_SMART_H
+
+#include <stddef.h>
+#include <stdarg.h>
+#include "config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Writes the literal text of `node` to `buf`, converting plain ASCII
+// punctuation to "smart" punctuation.
+//
+// buf     - output buffer that receives the rendered text.
+// node    - node whose literal is rendered; its previous and next
+//           siblings are consulted when a quote sits at either end of
+//           the text.
+// escape  - called as escape(buf, data, len) for each run of text that
+//           contains none of the characters " ' - .
+// left_double_quote, right_double_quote,
+// left_single_quote, right_single_quote
+//         - strings written in place of " and ', chosen from the
+//           characters on either side of the quote.
+// em_dash - string written in place of "---".
+// en_dash - string written in place of "--".
+// ellipses - string written in place of "...".
+//
+// The replacement strings are written to `buf` as given; they are not
+// passed through `escape`.
+void escape_with_smart(cmark_strbuf *buf,
+ cmark_node *node,
+ void (*escape)(cmark_strbuf *, const unsigned char *, int),
+ const char *left_double_quote,
+ const char *right_double_quote,
+ const char *left_single_quote,
+ const char *right_single_quote,
+ const char *em_dash,
+ const char *en_dash,
+ const char *ellipses);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/test/smart_punct.txt b/test/smart_punct.txt
index c036a6d..c870c9d 100644
--- a/test/smart_punct.txt
+++ b/test/smart_punct.txt
@@ -35,9 +35,9 @@ Were you alive in the 70's?
.
.
-Here is some quoted '`code`' and a "[quoted link][1]".
+Here is some quoted '`code`' and a "[quoted link](url)".
.
-<p>Here is some quoted &lsquo;<code>code</code>&rsquo; and a &ldquo;[quoted link][1]&rdquo;.</p>
+<p>Here is some quoted &lsquo;<code>code</code>&rsquo; and a &ldquo;<a href="url">quoted link</a>&rdquo;.</p>
.
.