summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
author John MacFarlane <jgm@berkeley.edu> 2015-02-15 18:31:07 -0800
committer John MacFarlane <jgm@berkeley.edu> 2015-02-15 21:45:27 -0800
commit 27373892cb98a2a6a1d35fba28798d9117fff58f (patch)
tree b4ae1c9d3b1df15d4aae9ddcc6a1c5a3fa5ffccf /src
parent 376f81ab8aa017ab01040e10d393d7682674562d (diff)
Moved handling of --smart from renderer to parser.
This allows backslash escapes to disable smart quote transformations in particular cases. Closes #8.
Diffstat (limited to 'src')
-rw-r--r-- src/CMakeLists.txt 2
-rw-r--r-- src/html.c 10
-rw-r--r-- src/inlines.c 105
-rw-r--r-- src/man.c 17
-rw-r--r-- src/smart.c 174
-rw-r--r-- src/smart.h 28
6 files changed, 101 insertions, 235 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 2150e7a..2179c08 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -16,7 +16,6 @@ set(HEADERS
html_unescape.h
houdini.h
cmark_ctype.h
- smart.h
)
set(LIBRARY_SOURCES
cmark.c
@@ -37,7 +36,6 @@ set(LIBRARY_SOURCES
houdini_html_e.c
houdini_html_u.c
cmark_ctype.c
- smart.c
${HEADERS}
)
diff --git a/src/html.c b/src/html.c
index 1f64196..5a64f9c 100644
--- a/src/html.c
+++ b/src/html.c
@@ -9,7 +9,6 @@
#include "utf8.h"
#include "buffer.h"
#include "houdini.h"
-#include "smart.h"
// Functions to convert cmark_nodes to HTML strings.
@@ -219,14 +218,7 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,
break;
case CMARK_NODE_TEXT:
- if (options & CMARK_OPT_SMARTPUNCT) {
- escape_with_smart(html, node, escape_html,
- "&ldquo;", "&rdquo;", "&lsquo;", "&rsquo;",
- "&mdash;", "&ndash;", "&hellip;");
- } else {
- escape_html(html, node->as.literal.data,
- node->as.literal.len);
- }
+ escape_html(html, node->as.literal.data, node->as.literal.len);
break;
case CMARK_NODE_LINEBREAK:
diff --git a/src/inlines.c b/src/inlines.c
index 014c018..a5af1a5 100644
--- a/src/inlines.c
+++ b/src/inlines.c
@@ -271,6 +271,9 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close)
while (peek_char(subj) == c) {
numdelims++;
advance(subj);
+ if (c == '\'' || c == '"') {
+ break; // limit to 1 delim for quotes
+ }
}
len = utf8proc_iterate(subj->input.data + subj->pos,
@@ -289,6 +292,9 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close)
if (c == '_') {
*can_open = left_flanking && !right_flanking;
*can_close = right_flanking && !left_flanking;
+ } else if (c == '\'' || c == '"') {
+ *can_open = left_flanking && !right_flanking;
+ *can_close = right_flanking;
} else {
*can_open = left_flanking;
*can_close = right_flanking;
@@ -349,25 +355,68 @@ static void push_delimiter(subject *subj, unsigned char c, bool can_open,
subj->last_delim = delim;
}
-// Parse strong/emph or a fallback.
-// Assumes the subject has '_' or '*' at the current position.
-static cmark_node* handle_strong_emph(subject* subj, unsigned char c)
+// Assumes the subject has a c at the current position.
+static cmark_node* handle_delim(subject* subj, unsigned char c, bool smart)
{
int numdelims;
cmark_node * inl_text;
bool can_open, can_close;
+ cmark_chunk contents;
numdelims = scan_delims(subj, c, &can_open, &can_close);
- inl_text = make_str(cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims));
+ if (c == '\'' && smart) {
+ contents = cmark_chunk_literal("’");
+ } else if (c == '"' && smart) {
+ contents = cmark_chunk_literal("”");
+ } else {
+ contents = cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims);
+ }
+
+ inl_text = make_str(contents);
- if (can_open || can_close) {
+ if ((can_open || can_close) &&
+ (!(c == '\'' || c == '"') || smart)) {
push_delimiter(subj, c, can_open, can_close, inl_text);
}
return inl_text;
}
+// Assumes we have a hyphen at the current position.
+static cmark_node* handle_hyphen(subject* subj, bool smart)
+{
+ advance(subj);
+ if (smart && peek_char(subj) == '-') {
+ advance(subj);
+ if (peek_char(subj) == '-') {
+ advance(subj);
+ return make_str(cmark_chunk_literal("—"));
+ } else {
+ return make_str(cmark_chunk_literal("–"));
+ }
+ } else {
+ return make_str(cmark_chunk_literal("-"));
+ }
+}
+
+// Assumes we have a period at the current position.
+static cmark_node* handle_period(subject* subj, bool smart)
+{
+ advance(subj);
+ if (smart && peek_char(subj) == '.') {
+ advance(subj);
+ if (peek_char(subj) == '.') {
+ advance(subj);
+ return make_str(cmark_chunk_literal("…"));
+ } else {
+ return make_str(cmark_chunk_literal(".."));
+ }
+ } else {
+ return make_str(cmark_chunk_literal("."));
+ }
+}
+
static void process_emphasis(subject *subj, delimiter *start_delim)
{
delimiter *closer = subj->last_delim;
@@ -381,7 +430,8 @@ static void process_emphasis(subject *subj, delimiter *start_delim)
// now move forward, looking for closers, and handling each
while (closer != NULL) {
if (closer->can_close &&
- (closer->delim_char == '*' || closer->delim_char == '_')) {
+ (closer->delim_char == '*' || closer->delim_char == '_' ||
+ closer->delim_char == '"' || closer->delim_char == '\'')) {
// Now look backwards for first matching opener:
opener = closer->previous;
while (opener != NULL && opener != start_delim) {
@@ -391,9 +441,31 @@ static void process_emphasis(subject *subj, delimiter *start_delim)
}
opener = opener->previous;
}
- if (opener != NULL && opener != start_delim) {
- closer = S_insert_emph(subj, opener, closer);
- } else {
+ if (closer->delim_char == '*' || closer->delim_char == '_') {
+ if (opener != NULL && opener != start_delim) {
+ closer = S_insert_emph(subj, opener, closer);
+ } else {
+ closer = closer->next;
+ }
+ } else if (closer->delim_char == '\'') {
+ cmark_chunk_free(&closer->inl_text->as.literal);
+ closer->inl_text->as.literal =
+ cmark_chunk_literal("’");
+ if (opener != NULL && opener != start_delim) {
+ cmark_chunk_free(&opener->inl_text->as.literal);
+ opener->inl_text->as.literal =
+ cmark_chunk_literal("‘");
+ }
+ closer = closer->next;
+ } else if (closer->delim_char == '"') {
+ cmark_chunk_free(&closer->inl_text->as.literal);
+ closer->inl_text->as.literal =
+ cmark_chunk_literal("”");
+ if (opener != NULL && opener != start_delim) {
+ cmark_chunk_free(&opener->inl_text->as.literal);
+ opener->inl_text->as.literal =
+ cmark_chunk_literal("“");
+ }
closer = closer->next;
}
} else {
@@ -866,7 +938,7 @@ static int subject_find_special_char(subject *subj, long options)
};
// " ' . -
- static const char SMART_PUNCT_TABLE[] = {
+ static const char SMART_PUNCT_CHARS[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
@@ -890,6 +962,9 @@ static int subject_find_special_char(subject *subj, long options)
while (n < subj->input.len) {
if (SPECIAL_CHARS[subj->input.data[n]])
return n;
+ if (options & CMARK_OPT_SMARTPUNCT &&
+ SMART_PUNCT_CHARS[subj->input.data[n]])
+ return n;
n++;
}
@@ -926,7 +1001,15 @@ static int parse_inline(subject* subj, cmark_node * parent, long options)
break;
case '*':
case '_':
- new_inl = handle_strong_emph(subj, c);
+ case '\'':
+ case '"':
+ new_inl = handle_delim(subj, c, options & CMARK_OPT_SMARTPUNCT);
+ break;
+ case '-':
+ new_inl = handle_hyphen(subj, options & CMARK_OPT_SMARTPUNCT);
+ break;
+ case '.':
+ new_inl = handle_period(subj, options & CMARK_OPT_SMARTPUNCT);
break;
case '[':
advance(subj);
diff --git a/src/man.c b/src/man.c
index 970bb6d..b7cb932 100644
--- a/src/man.c
+++ b/src/man.c
@@ -7,10 +7,11 @@
#include "cmark.h"
#include "node.h"
#include "buffer.h"
-#include "smart.h"
// Functions to convert cmark_nodes to groff man strings.
+// TODO: properly escape unicode punctuation used in smart mode:
+// "\\[lq]", "\\[rq]", "\\[oq]", "\\[cq]", "\\[em]", "\\[en]", "..."
static void escape_man(cmark_strbuf *dest, const unsigned char *source, int length)
{
int i;
@@ -47,7 +48,7 @@ struct render_state {
static int
S_render_node(cmark_node *node, cmark_event_type ev_type,
- struct render_state *state, long options)
+ struct render_state *state)
{
cmark_node *tmp;
cmark_strbuf *man = state->man;
@@ -166,14 +167,8 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,
break;
case CMARK_NODE_TEXT:
- if (options & CMARK_OPT_SMARTPUNCT) {
- escape_with_smart(man, node, escape_man,
- "\\[lq]", "\\[rq]", "\\[oq]", "\\[cq]",
- "\\[em]", "\\[en]", "...");
- } else {
- escape_man(man, node->as.literal.data,
- node->as.literal.len);
- }
+ escape_man(man, node->as.literal.data,
+ node->as.literal.len);
break;
case CMARK_NODE_LINEBREAK:
@@ -248,7 +243,7 @@ char *cmark_render_man(cmark_node *root, long options)
while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
cur = cmark_iter_get_node(iter);
- S_render_node(cur, ev_type, &state, options);
+ S_render_node(cur, ev_type, &state);
}
result = (char *)cmark_strbuf_detach(&man);
diff --git a/src/smart.c b/src/smart.c
deleted file mode 100644
index cb67448..0000000
--- a/src/smart.c
+++ /dev/null
@@ -1,174 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-
-#include "config.h"
-#include "cmark.h"
-#include "node.h"
-#include "utf8.h"
-#include "buffer.h"
-#include "chunk.h"
-
-static const char SMART_PUNCT_TABLE[] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-};
-
-void escape_with_smart(cmark_strbuf *buf,
- cmark_node *node,
- void (*escape)(cmark_strbuf *, const unsigned char *, int),
- const char *left_double_quote,
- const char *right_double_quote,
- const char *left_single_quote,
- const char *right_single_quote,
- const char *em_dash,
- const char *en_dash,
- const char *ellipses)
-{
- char c;
- int32_t after_char = 0;
- int32_t before_char = 0;
- bool left_flanking, right_flanking;
- int lastout = 0;
- int i = 0, j = 0;
- cmark_chunk lit = node->as.literal;
- int len;
-
- while (i < lit.len) {
- c = lit.data[i];
- i++;
- if (SMART_PUNCT_TABLE[(int)c] == 0) {
- continue;
- }
-
- if (i - 1 - lastout > 0) {
- (*escape)(buf, lit.data + lastout, i - 1 - lastout);
- }
-
- if (c == 34 || c == 39) {
- if (i == 1) {
- // set before_char based on previous text node if there is one:
- if (node->prev) {
- if (node->prev->type == CMARK_NODE_TEXT) {
-
- // walk to the beginning of the UTF_8 sequence:
- j = node->prev->as.literal.len - 1;
- while (j > 0 &&
- node->prev->as.literal.data[j] >> 6 == 2) {
- j--;
- }
- len = utf8proc_iterate(node->prev->as.literal.data + i,
- node->prev->as.literal.len - i,
- &before_char);
- if (len == -1) {
- before_char = 10;
- }
-
- } else if (node->prev->type == CMARK_NODE_SOFTBREAK ||
- node->prev->type == CMARK_NODE_LINEBREAK) {
- before_char = 10;
-
- } else {
- before_char = 65;
- }
- } else {
- before_char = 10;
- }
- } else {
- j = i - 2;
- // walk back to the beginning of the UTF_8 sequence:
- while (j > 0 && lit.data[j] >> 6 == 2) {
- j--;
- }
- utf8proc_iterate(lit.data + j, lit.len - j, &before_char);
- }
-
- if (i >= lit.len) {
- if (node->next) {
- if (node->next->type == CMARK_NODE_TEXT) {
- utf8proc_iterate(node->next->as.literal.data,
- node->next->as.literal.len,
- &after_char);
- } else if (node->next->type == CMARK_NODE_SOFTBREAK ||
- node->next->type == CMARK_NODE_LINEBREAK) {
- after_char = 10;
- } else {
- after_char = 65;
- }
- } else {
- after_char = 10;
- }
- } else {
- utf8proc_iterate(lit.data + i, lit.len - i, &after_char);
- }
-
- left_flanking = !utf8proc_is_space(after_char) &&
- !(utf8proc_is_punctuation(after_char) &&
- !utf8proc_is_space(before_char) &&
- !utf8proc_is_punctuation(before_char));
- right_flanking = !utf8proc_is_space(before_char) &&
- !(utf8proc_is_punctuation(before_char) &&
- !utf8proc_is_space(after_char) &&
- !utf8proc_is_punctuation(after_char));
- }
-
- switch (c) {
- case '"':
- if (right_flanking) {
- cmark_strbuf_puts(buf, right_double_quote);
- } else {
- cmark_strbuf_puts(buf, left_double_quote);
- }
- break;
- case '\'':
- if (left_flanking && !right_flanking) {
- cmark_strbuf_puts(buf, left_single_quote);
- } else {
- cmark_strbuf_puts(buf, right_single_quote);
- }
- break;
- case '-':
- if (i < lit.len && lit.data[i] == '-') {
- if (lit.data[i + 1] == '-') {
- cmark_strbuf_puts(buf, em_dash);
- i += 2;
- } else {
- cmark_strbuf_puts(buf, en_dash);
- i += 1;
- }
- } else {
- cmark_strbuf_putc(buf, c);
- }
- break;
- case '.':
- if (i < lit.len - 1 && lit.data[i] == '.' &&
- lit.data[i + 1] == '.') {
- cmark_strbuf_puts(buf, ellipses);
- i += 2;
- } else {
- cmark_strbuf_putc(buf, c);
- }
- break;
- default:
- cmark_strbuf_putc(buf, c);
- }
- lastout = i;
- }
- (*escape)(buf, node->as.literal.data + lastout, lit.len - lastout);
-
-}
diff --git a/src/smart.h b/src/smart.h
deleted file mode 100644
index fa614b3..0000000
--- a/src/smart.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef CMARK_SMART_H
-#define CMARK_SMART_H
-
-#include <stddef.h>
-#include <stdarg.h>
-#include "config.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void escape_with_smart(cmark_strbuf *buf,
- cmark_node *node,
- void (*escape)(cmark_strbuf *, const unsigned char *, int),
- const char *left_double_quote,
- const char *right_double_quote,
- const char *left_single_quote,
- const char *right_single_quote,
- const char *em_dash,
- const char *en_dash,
- const char *ellipses);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
-