summaryrefslogtreecommitdiff
path: root/src/houdini_html_u.c
diff options
context:
space:
mode:
author: John MacFarlane <jgm@berkeley.edu> 2015-06-16 09:54:31 -0700
committer: John MacFarlane <jgm@berkeley.edu> 2015-06-16 12:59:47 -0700
commit: 208c794def61eb819ed6eebe1d51867613addce0 (patch)
tree: 0d0f81dab960befc5efa7124ae900ddd64e43be3 /src/houdini_html_u.c
parent: f904f701cf4390b4d5531c5626c5cf08d85a913f (diff)
Replace gperf-based entity lookup with binary tree lookup.
The primary advantage is a big reduction in the size of the compiled library and executable (> 100K). There should be no measurable performance difference in normal documents. I detected a slight performance hit (around 5%) in a file containing 1,000,000 entities.
* Removed `src/html_unescape.gperf` and `src/html_unescape.h`.
* Added `src/entities.h` (generated by `tools/make_entities_h.py`).
* Added binary tree lookup functions to `houdini_html_u.c`, and use the data in `src/entities.h`.
Diffstat (limited to 'src/houdini_html_u.c')
-rw-r--r--src/houdini_html_u.c53
1 files changed, 43 insertions, 10 deletions
diff --git a/src/houdini_html_u.c b/src/houdini_html_u.c
index 2362b04..6f94c23 100644
--- a/src/houdini_html_u.c
+++ b/src/houdini_html_u.c
@@ -5,7 +5,44 @@
#include "buffer.h"
#include "houdini.h"
#include "utf8.h"
-#include "html_unescape.h"
+#include "entities.h"
+
+/* Binary tree lookup code for entities added by JGM */
+
+/*
+ * Hash the first `len` bytes of `str`: start from 5381 and, for each byte,
+ * multiply the running value by 33 and add the byte.  The result is masked
+ * to its low 32 bits after every step, so it does not depend on how wide
+ * unsigned long is.  A `len` of zero (or less) yields 5381.
+ */
+static unsigned long
+S_hash(const unsigned char *str, int len)
+{
+	unsigned long acc = 5381;
+	int pos = 0;
+
+	while (pos < len) {
+		acc = (acc * 33 + str[pos]) & 0xFFFFFFFF;
+		pos++;
+	}
+
+	return acc;
+}
+
+/*
+ * Search the cmark_entities tree for a node whose `value` equals `key`,
+ * starting at index `i`.  At each node the search follows `less` when the
+ * key is smaller than the node's value and `greater` otherwise.  Returns
+ * the matching node's `bytes`, or NULL once the child index to follow is 0,
+ * which this code treats as "no such branch".
+ *
+ * NOTE(review): nodes are matched on `key` alone.  The only caller visible
+ * here passes an S_hash() result, so two different names that hash to the
+ * same value could not be told apart at this level -- confirm that the
+ * generated table makes this acceptable.
+ */
+static unsigned char *
+S_lookup(int i, unsigned long key)
+{
+	int node = i;
+
+	for (;;) {
+		if (cmark_entities[node].value == key) {
+			return cmark_entities[node].bytes;
+		}
+
+		if (key < cmark_entities[node].value) {
+			node = cmark_entities[node].less;
+		} else {
+			node = cmark_entities[node].greater;
+		}
+
+		if (node == 0) {
+			return NULL;
+		}
+	}
+}
+
+/*
+ * Look up the `len`-byte name at `s`: hash it with S_hash() and search the
+ * entity tree from cmark_entities_root.  Returns whatever S_lookup()
+ * returns for that key (the matching node's bytes, or NULL).
+ */
+static unsigned char *
+S_lookup_entity(const unsigned char *s, int len)
+{
+	unsigned long key = S_hash(s, len);
+
+	return S_lookup(cmark_entities_root, key);
+}
bufsize_t
houdini_unescape_ent(cmark_strbuf *ob, const uint8_t *src, bufsize_t size)
@@ -57,22 +94,18 @@ houdini_unescape_ent(cmark_strbuf *ob, const uint8_t *src, bufsize_t size)
}
else {
- if (size > MAX_WORD_LENGTH)
- size = MAX_WORD_LENGTH;
+ if (size > CMARK_ENTITY_MAX_LENGTH)
+ size = CMARK_ENTITY_MAX_LENGTH;
- for (i = MIN_WORD_LENGTH; i < size; ++i) {
+ for (i = CMARK_ENTITY_MIN_LENGTH; i < size; ++i) {
if (src[i] == ' ')
break;
if (src[i] == ';') {
- const struct html_ent *entity = find_entity((char *)src, i);
+ const unsigned char *entity = S_lookup_entity(src, i);
if (entity != NULL) {
- bufsize_t len = 0;
- while (len < 8 && entity->utf8[len] != '\0') {
- ++len;
- }
- cmark_strbuf_put(ob, entity->utf8, len);
+ cmark_strbuf_puts(ob, (const char *)entity);
return i + 1;
}