summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/blocks.c11
-rw-r--r--src/buffer.c6
-rw-r--r--src/html/houdini.h44
-rw-r--r--src/html/houdini_href_e.c115
-rw-r--r--src/html/houdini_html_e.c89
-rw-r--r--src/html/html.c212
-rw-r--r--src/stmd.h1
-rw-r--r--src/utf8.c7
8 files changed, 473 insertions, 12 deletions
diff --git a/src/blocks.c b/src/blocks.c
index 71dc830..42f20db 100644
--- a/src/blocks.c
+++ b/src/blocks.c
@@ -8,6 +8,7 @@
#include "scanners.h"
#include "uthash.h"
+static void incorporate_line(gh_buf *ln, int line_number, block** curptr);
static void finalize(block* b, int line_number);
static block* make_block(int tag, int start_line, int start_column)
@@ -390,7 +391,7 @@ static void expand_tabs(gh_buf *ob, const unsigned char *line, size_t size)
}
}
-static block *finalize_parsing(block *document, int linenum)
+static block *finalize_document(block *document, int linenum)
{
while (document != document->top) {
finalize(document, linenum);
@@ -411,7 +412,7 @@ extern block *stmd_parse_file(FILE *f)
block *document = make_document();
while (fgets((char *)buffer, sizeof(buffer), f)) {
- expand_tabs(&line, buffer, strlen(buffer));
+ expand_tabs(&line, buffer, strlen((char *)buffer));
incorporate_line(&line, linenum, &document);
gh_buf_clear(&line);
linenum++;
@@ -429,7 +430,7 @@ extern block *stmd_parse_document(const unsigned char *buffer, size_t len)
block *document = make_document();
while (buffer < end) {
- const char *eol = memchr(buffer, '\n', end - buffer);
+ const unsigned char *eol = memchr(buffer, '\n', end - buffer);
if (!eol) {
expand_tabs(&line, buffer, end - buffer);
@@ -449,9 +450,7 @@ extern block *stmd_parse_document(const unsigned char *buffer, size_t len)
}
// Process one line at a time, modifying a block.
-// Returns 0 if successful. curptr is changed to point to
-// the currently open block.
-extern void incorporate_line(gh_buf *ln, int line_number, block** curptr)
+static void incorporate_line(gh_buf *ln, int line_number, block** curptr)
{
block* last_matched_container;
int offset = 0;
diff --git a/src/buffer.c b/src/buffer.c
index 17dc864..cfc6a7e 100644
--- a/src/buffer.c
+++ b/src/buffer.c
@@ -245,11 +245,11 @@ int gh_buf_cmp(const gh_buf *a, const gh_buf *b)
int gh_buf_strchr(const gh_buf *buf, int c, int pos)
{
- const char *p = memchr(buf->ptr + pos, c, buf->size - pos);
+ const unsigned char *p = memchr(buf->ptr + pos, c, buf->size - pos);
if (!p)
return -1;
- return (int)(p - buf->ptr);
+ return (int)(p - (const unsigned char *)buf->ptr);
}
int gh_buf_strrchr(const gh_buf *buf, int c, int pos)
@@ -264,7 +264,7 @@ int gh_buf_strrchr(const gh_buf *buf, int c, int pos)
return -1;
}
-void gh_buf_truncate(gh_buf *buf, size_t len)
+void gh_buf_truncate(gh_buf *buf, int len)
{
if (len < buf->size) {
buf->size = len;
diff --git a/src/html/houdini.h b/src/html/houdini.h
new file mode 100644
index 0000000..31fe917
--- /dev/null
+++ b/src/html/houdini.h
@@ -0,0 +1,44 @@
+#ifndef __HOUDINI_H__
+#define __HOUDINI_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include "buffer.h"
+
+#define likely(x) __builtin_expect((x),1)
+#define unlikely(x) __builtin_expect((x),0)
+
+#ifdef HOUDINI_USE_LOCALE
+# define _isxdigit(c) isxdigit(c)
+# define _isdigit(c) isdigit(c)
+#else
+/*
+ * Helper _isdigit methods -- do not trust the current locale
+ * */
+# define _isxdigit(c) (strchr("0123456789ABCDEFabcdef", (c)) != NULL)
+# define _isdigit(c) ((c) >= '0' && (c) <= '9')
+#endif
+
+#define HOUDINI_ESCAPED_SIZE(x) (((x) * 12) / 10)
+#define HOUDINI_UNESCAPED_SIZE(x) (x)
+
+extern int houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure);
+extern int houdini_unescape_html(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_escape_xml(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_escape_uri(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_escape_url(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_unescape_uri(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_unescape_url(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_escape_js(gh_buf *ob, const uint8_t *src, size_t size);
+extern int houdini_unescape_js(gh_buf *ob, const uint8_t *src, size_t size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/html/houdini_href_e.c b/src/html/houdini_href_e.c
new file mode 100644
index 0000000..59fe850
--- /dev/null
+++ b/src/html/houdini_href_e.c
@@ -0,0 +1,115 @@
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "html/houdini.h"
+
+/*
+ * The following characters will not be escaped:
+ *
+ * -_.+!*'(),%#@?=;:/,+&$ alphanum
+ *
+ * Note that this character set is the addition of:
+ *
+ * - The characters which are safe to be in an URL
+ * - The characters which are *not* safe to be in
+ * an URL because they are RESERVED characters.
+ *
+ * We asume (lazily) that any RESERVED char that
+ * appears inside an URL is actually meant to
+ * have its native function (i.e. as an URL
+ * component/separator) and hence needs no escaping.
+ *
+ * There are two exceptions: the chacters & (amp)
+ * and ' (single quote) do not appear in the table.
+ * They are meant to appear in the URL as components,
+ * yet they require special HTML-entity escaping
+ * to generate valid HTML markup.
+ *
+ * All other characters will be escaped to %XX.
+ *
+ */
+static const char HREF_SAFE[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+int
+houdini_escape_href(gh_buf *ob, const uint8_t *src, size_t size)
+{
+ static const uint8_t hex_chars[] = "0123456789ABCDEF";
+ size_t i = 0, org;
+ uint8_t hex_str[3];
+
+ hex_str[0] = '%';
+
+ while (i < size) {
+ org = i;
+ while (i < size && HREF_SAFE[src[i]] != 0)
+ i++;
+
+ if (likely(i > org)) {
+ if (unlikely(org == 0)) {
+ if (i >= size)
+ return 0;
+
+ gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size));
+ }
+
+ gh_buf_put(ob, src + org, i - org);
+ }
+
+ /* escaping */
+ if (i >= size)
+ break;
+
+ switch (src[i]) {
+ /* amp appears all the time in URLs, but needs
+ * HTML-entity escaping to be inside an href */
+ case '&':
+ gh_buf_puts(ob, "&amp;");
+ break;
+
+ /* the single quote is a valid URL character
+ * according to the standard; it needs HTML
+ * entity escaping too */
+ case '\'':
+ gh_buf_puts(ob, "&#x27;");
+ break;
+
+ /* the space can be escaped to %20 or a plus
+ * sign. we're going with the generic escape
+ * for now. the plus thing is more commonly seen
+ * when building GET strings */
+#if 0
+ case ' ':
+ gh_buf_putc(ob, '+');
+ break;
+#endif
+
+ /* every other character goes with a %XX escaping */
+ default:
+ hex_str[1] = hex_chars[(src[i] >> 4) & 0xF];
+ hex_str[2] = hex_chars[src[i] & 0xF];
+ gh_buf_put(ob, hex_str, 3);
+ }
+
+ i++;
+ }
+
+ return 1;
+}
diff --git a/src/html/houdini_html_e.c b/src/html/houdini_html_e.c
new file mode 100644
index 0000000..316c5ce
--- /dev/null
+++ b/src/html/houdini_html_e.c
@@ -0,0 +1,89 @@
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "html/houdini.h"
+
+/**
+ * According to the OWASP rules:
+ *
+ * & --> &amp;
+ * < --> &lt;
+ * > --> &gt;
+ * " --> &quot;
+ * ' --> &#x27; &apos; is not recommended
+ * / --> &#x2F; forward slash is included as it helps end an HTML entity
+ *
+ */
+static const char HTML_ESCAPE_TABLE[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 1, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 4,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 6, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const char *HTML_ESCAPES[] = {
+ "",
+ "&quot;",
+ "&amp;",
+ "&#39;",
+ "&#47;",
+ "&lt;",
+ "&gt;"
+};
+
+int
+houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure)
+{
+ size_t i = 0, org, esc = 0;
+
+ while (i < size) {
+ org = i;
+ while (i < size && (esc = HTML_ESCAPE_TABLE[src[i]]) == 0)
+ i++;
+
+ if (i > org) {
+ if (unlikely(org == 0)) {
+ if (i >= size)
+ return 0;
+
+ gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size));
+ }
+
+ gh_buf_put(ob, src + org, i - org);
+ }
+
+ /* escaping */
+ if (unlikely(i >= size))
+ break;
+
+ /* The forward slash is only escaped in secure mode */
+ if (src[i] == '/' && !secure) {
+ gh_buf_putc(ob, '/');
+ } else {
+ gh_buf_puts(ob, HTML_ESCAPES[esc]);
+ }
+
+ i++;
+ }
+
+ return 1;
+}
+
+int
+houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size)
+{
+ return houdini_escape_html0(ob, src, size, 1);
+}
diff --git a/src/html/html.c b/src/html/html.c
new file mode 100644
index 0000000..2f160ca
--- /dev/null
+++ b/src/html/html.c
@@ -0,0 +1,212 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+
+#include "stmd.h"
+#include "debug.h"
+#include "scanners.h"
+#include "html/houdini.h"
+
+// Functions to convert block and inline lists to HTML strings.
+
+static void escape_html(gh_buf *dest, const unsigned char *source, int length)
+{
+ if (length < 0)
+ length = strlen((char *)source);
+
+ houdini_escape_html0(dest, source, (size_t)length, 0);
+}
+
+static void escape_href(gh_buf *dest, const unsigned char *source, int length)
+{
+ if (length < 0)
+ length = strlen((char *)source);
+
+ houdini_escape_href(dest, source, (size_t)length);
+}
+
+static inline void cr(gh_buf *html)
+{
+ if (html->size && html->ptr[html->size - 1] != '\n')
+ gh_buf_putc(html, '\n');
+}
+
+// Convert a block list to HTML. Returns 0 on success, and sets result.
+void blocks_to_html(gh_buf *html, block *b, bool tight)
+{
+ struct ListData *data;
+
+ while(b != NULL) {
+ switch(b->tag) {
+ case document:
+ blocks_to_html(html, b->children, false);
+ break;
+
+ case paragraph:
+ if (tight) {
+ inlines_to_html(html, b->inline_content);
+ } else {
+ cr(html);
+ gh_buf_puts(html, "<p>");
+ inlines_to_html(html, b->inline_content);
+ gh_buf_puts(html, "</p>");
+ cr(html);
+ }
+ break;
+
+ case block_quote:
+ cr(html);
+ gh_buf_puts(html, "<blockquote>");
+ blocks_to_html(html, b->children, false);
+ gh_buf_puts(html, "</blockquote>");
+ cr(html);
+ break;
+
+ case list_item:
+ cr(html);
+ gh_buf_puts(html, "<li>");
+ blocks_to_html(html, b->children, tight);
+ gh_buf_trim(html);
+ gh_buf_puts(html, "</li>");
+ cr(html);
+ break;
+
+ case list:
+ // make sure a list starts at the beginning of the line:
+ cr(html);
+ data = &(b->attributes.list_data);
+
+ if (data->start > 1) {
+ gh_buf_printf(html, "<%s start=\"%d\">\n",
+ data->list_type == bullet ? "ul" : "ol",
+ data->start);
+ } else {
+ gh_buf_puts(html, data->list_type == bullet ? "<ul>\n" : "<ol>\n");
+ }
+
+ blocks_to_html(html, b->children, data->tight);
+ gh_buf_puts(html, data->list_type == bullet ? "</ul>" : "</ol>");
+ cr(html);
+ break;
+
+ case atx_header:
+ case setext_header:
+ cr(html);
+ gh_buf_printf(html, "<h%d>", b->attributes.header_level);
+ inlines_to_html(html, b->inline_content);
+ gh_buf_printf(html, "</h%d>", b->attributes.header_level);
+ cr(html);
+ break;
+
+ case indented_code:
+ case fenced_code:
+ /* TODO: fenced code lang attributes */
+ cr(html);
+ gh_buf_puts(html, "<pre><code>");
+ escape_html(html, b->string_content.ptr, b->string_content.size);
+ gh_buf_puts(html, "</pre></code>");
+ cr(html);
+ break;
+
+ case html_block:
+ gh_buf_put(html, b->string_content.ptr, b->string_content.size);
+ break;
+
+ case hrule:
+ gh_buf_puts(html, "<hr />");
+ cr(html);
+ break;
+
+ case reference_def:
+ break;
+
+ default:
+ assert(false);
+ }
+
+ b = b->next;
+ }
+}
+
+// Convert an inline list to HTML. Returns 0 on success, and sets result.
+void inlines_to_html(gh_buf *html, inl* ils)
+{
+ gh_buf scrap = GH_BUF_INIT;
+
+ while(ils != NULL) {
+ switch(ils->tag) {
+ case INL_STRING:
+ escape_html(html, ils->content.literal.data, ils->content.literal.len);
+ break;
+
+ case INL_LINEBREAK:
+ gh_buf_puts(html, "<br />\n");
+ break;
+
+ case INL_SOFTBREAK:
+ gh_buf_putc(html, '\n');
+ break;
+
+ case INL_CODE:
+ gh_buf_puts(html, "<code>");
+ escape_html(html, ils->content.literal.data, ils->content.literal.len);
+ gh_buf_puts(html, "</code>");
+ break;
+
+ case INL_RAW_HTML:
+ case INL_ENTITY:
+ gh_buf_put(html,
+ ils->content.literal.data,
+ ils->content.literal.len);
+ break;
+
+ case INL_LINK:
+ gh_buf_puts(html, "<a href=\"");
+ escape_href(html, ils->content.linkable.url, -1);
+
+ if (ils->content.linkable.title) {
+ gh_buf_puts(html, "\" title=\"");
+ escape_html(html, ils->content.linkable.title, -1);
+ }
+
+ gh_buf_puts(html, "\">");
+ inlines_to_html(html, ils->content.inlines);
+ gh_buf_puts(html, "</a>");
+ break;
+
+ case INL_IMAGE:
+ gh_buf_puts(html, "<img src=\"");
+ escape_href(html, ils->content.linkable.url, -1);
+
+ inlines_to_html(&scrap, ils->content.inlines);
+ if (scrap.size) {
+ gh_buf_puts(html, "\" alt=\"");
+ escape_html(html, scrap.ptr, scrap.size);
+ }
+ gh_buf_clear(&scrap);
+
+ if (ils->content.linkable.title) {
+ gh_buf_puts(html, "\" title=\"");
+ escape_html(html, ils->content.linkable.title, -1);
+ }
+
+ gh_buf_puts(html, "\"/>");
+ break;
+
+ case INL_STRONG:
+ gh_buf_puts(html, "<strong>");
+ inlines_to_html(html, ils->content.inlines);
+ gh_buf_puts(html, "</strong>");
+ break;
+
+ case INL_EMPH:
+ gh_buf_puts(html, "<em>");
+ inlines_to_html(html, ils->content.inlines);
+ gh_buf_puts(html, "</em>");
+ break;
+ }
+ ils = ils->next;
+ }
+}
diff --git a/src/stmd.h b/src/stmd.h
index 1e490d6..3e284bd 100644
--- a/src/stmd.h
+++ b/src/stmd.h
@@ -1,4 +1,5 @@
#include <stdbool.h>
+#include <stdio.h>
#include "buffer.h"
#include "uthash.h"
diff --git a/src/utf8.c b/src/utf8.c
index e3f8dd3..32c78a4 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -1,6 +1,7 @@
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
+#include <assert.h>
#include "stmd.h"
@@ -83,9 +84,9 @@ ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst)
return length;
}
-void utf8_encode_char(int32_t uc, gh_buf *buf)
+void utf8proc_encode_char(int32_t uc, gh_buf *buf)
{
- char dst[4];
+ unsigned char dst[4];
int len = 0;
if (uc < 0x00) {
@@ -99,7 +100,7 @@ void utf8_encode_char(int32_t uc, gh_buf *buf)
len = 2;
} else if (uc == 0xFFFF) {
dst[0] = 0xFF;
- return 1;
+ len = 1;
} else if (uc == 0xFFFE) {
dst[0] = 0xFE;
len = 1;