summaryrefslogtreecommitdiff
path: root/src/inlines.c
diff options
context:
space:
mode:
authorVicent Marti <tanoku@gmail.com>2014-09-06 20:48:05 +0200
committerVicent Marti <tanoku@gmail.com>2014-09-09 03:39:16 +0200
commit61e3e606e64221eaa5cf3d83dc598d5a42818d10 (patch)
tree1dfb6309c0e0fd7de8094c9da6497992b156350c /src/inlines.c
parent278b89d092cae8fe9cdd6346c69512886d36abbd (diff)
UTF8-aware detabbing and entity handling
Diffstat (limited to 'src/inlines.c')
-rw-r--r--src/inlines.c63
1 files changed, 26 insertions, 37 deletions
diff --git a/src/inlines.c b/src/inlines.c
index 6b17027..7b27150 100644
--- a/src/inlines.c
+++ b/src/inlines.c
@@ -5,6 +5,8 @@
#include <ctype.h>
#include "stmd.h"
+#include "html/houdini.h"
+#include "utf8.h"
#include "uthash.h"
#include "scanners.h"
@@ -176,7 +178,6 @@ inline static node_inl* make_simple(int t)
#define make_str(s) make_literal(INL_STRING, s)
#define make_code(s) make_literal(INL_CODE, s)
#define make_raw_html(s) make_literal(INL_RAW_HTML, s)
-#define make_entity(s) make_literal(INL_ENTITY, s)
#define make_linebreak() make_simple(INL_LINEBREAK)
#define make_softbreak() make_simple(INL_SOFTBREAK)
#define make_emph(contents) make_inlines(INL_EMPH, contents)
@@ -191,7 +192,6 @@ extern void free_inlines(node_inl* e)
case INL_STRING:
case INL_RAW_HTML:
case INL_CODE:
- case INL_ENTITY:
chunk_free(&e->content.literal);
break;
case INL_LINEBREAK:
@@ -540,45 +540,34 @@ static node_inl* handle_backslash(subject *subj)
// Assumes the subject has an '&' character at the current position.
static node_inl* handle_entity(subject* subj)
{
- int match;
- node_inl *result;
- match = scan_entity(&subj->input, subj->pos);
- if (match) {
- result = make_entity(chunk_dup(&subj->input, subj->pos, match));
- subj->pos += match;
- } else {
- advance(subj);
- result = make_str(chunk_literal("&"));
- }
- return result;
+ strbuf ent = GH_BUF_INIT;
+ size_t len;
+
+ advance(subj);
+
+ len = houdini_unescape_ent(&ent,
+ subj->input.data + subj->pos,
+ subj->input.len - subj->pos
+ );
+
+ if (len == 0)
+ return make_str(chunk_literal("&"));
+
+ subj->pos += len;
+ return make_str(chunk_buf_detach(&ent));
}
// Like make_str, but parses entities.
// Returns an inline sequence consisting of str and entity elements.
static node_inl *make_str_with_entities(chunk *content)
{
- node_inl *result = NULL;
- node_inl *new;
- int searchpos;
- char c;
- subject subj;
-
- subject_from_chunk(&subj, content, NULL);
+ strbuf unescaped = GH_BUF_INIT;
- while ((c = peek_char(&subj))) {
- switch (c) {
- case '&':
- new = handle_entity(&subj);
- break;
- default:
- searchpos = chunk_strchr(&subj.input, '&', subj.pos);
- new = make_str(chunk_dup(&subj.input, subj.pos, searchpos - subj.pos));
- subj.pos = searchpos;
- }
- result = append_inlines(result, new);
+ if (houdini_unescape_html(&unescaped, content->data, (size_t)content->len)) {
+ return make_str(chunk_buf_detach(&unescaped));
+ } else {
+ return make_str(*content);
}
-
- return result;
}
// Destructively unescape a string: remove backslashes before punctuation chars.
@@ -611,9 +600,9 @@ static unsigned char *clean_url(chunk *url, int is_email)
strbuf_puts(&buf, "mailto:");
if (url->data[0] == '<' && url->data[url->len - 1] == '>') {
- strbuf_put(&buf, url->data + 1, url->len - 2);
+ houdini_unescape_html_f(&buf, url->data + 1, url->len - 2);
} else {
- strbuf_put(&buf, url->data, url->len);
+ houdini_unescape_html_f(&buf, url->data, url->len);
}
unescape_buffer(&buf);
@@ -636,9 +625,9 @@ static unsigned char *clean_title(chunk *title)
if ((first == '\'' && last == '\'') ||
(first == '(' && last == ')') ||
(first == '"' && last == '"')) {
- strbuf_set(&buf, title->data + 1, title->len - 2);
+ houdini_unescape_html_f(&buf, title->data + 1, title->len - 2);
} else {
- strbuf_set(&buf, title->data, title->len);
+ houdini_unescape_html_f(&buf, title->data, title->len);
}
unescape_buffer(&buf);