summaryrefslogtreecommitdiff
path: root/src/houdini_href_e.c
blob: bfa970485ef0abaca2512a1c69f9179426c71024 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "houdini.h"

/*
 * The following characters will not be escaped:
 *
 *		-_.+!*'(),%#@?=;:/,+&$ alphanum
 *
 * Note that this character set is the addition of:
 *
 *	- The characters which are safe to be in an URL
 *	- The characters which are *not* safe to be in
 *	an URL because they are RESERVED characters.
 *
 * We assume (lazily) that any RESERVED char that
 * appears inside an URL is actually meant to
 * have its native function (i.e. as an URL
 * component/separator) and hence needs no escaping.
 *
 * There are two exceptions: the chacters & (amp)
 * and ' (single quote) do not appear in the table.
 * They are meant to appear in the URL as components,
 * yet they require special HTML-entity escaping
 * to generate valid HTML markup.
 *
 * All other characters will be escaped to %XX.
 *
 */
static const char HREF_SAFE[] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};

int houdini_escape_href(cmark_strbuf *ob, const uint8_t *src, bufsize_t size) {
  static const uint8_t hex_chars[] = "0123456789ABCDEF";
  bufsize_t i = 0, org;
  uint8_t hex_str[3];

  hex_str[0] = '%';

  while (i < size) {
    org = i;
    while (i < size && HREF_SAFE[src[i]] != 0)
      i++;

    if (likely(i > org))
      cmark_strbuf_put(ob, src + org, i - org);

    /* escaping */
    if (i >= size)
      break;

    switch (src[i]) {
    /* amp appears all the time in URLs, but needs
     * HTML-entity escaping to be inside an href */
    case '&':
      cmark_strbuf_puts(ob, "&amp;");
      break;

    /* the single quote is a valid URL character
     * according to the standard; it needs HTML
     * entity escaping too */
    case '\'':
      cmark_strbuf_puts(ob, "&#x27;");
      break;

/* the space can be escaped to %20 or a plus
 * sign. we're going with the generic escape
 * for now. the plus thing is more commonly seen
 * when building GET strings */
#if 0
		case ' ':
			cmark_strbuf_putc(ob, '+');
			break;
#endif

    /* every other character goes with a %XX escaping */
    default:
      hex_str[1] = hex_chars[(src[i] >> 4) & 0xF];
      hex_str[2] = hex_chars[src[i] & 0xF];
      cmark_strbuf_put(ob, hex_str, 3);
    }

    i++;
  }

  return 1;
}