From 208c794def61eb819ed6eebe1d51867613addce0 Mon Sep 17 00:00:00 2001
From: John MacFarlane
Date: Tue, 16 Jun 2015 09:54:31 -0700
Subject: Replace gperf-based entity lookup with binary tree lookup.

The primary advantage is a big reduction in the size of the compiled
library and executable (> 100K). There should be no measurable
performance difference in normal documents. I detected a slight
performance hit (around 5%) in a file containing 1,000,000 entities.

* Removed `src/html_unescape.gperf` and `src/html_unescape.h`.
* Added `src/entities.h` (generated by `tools/make_entities_h.py`).
* Added binary tree lookup functions to `houdini_html_u.c`, and
  use the data in `src/entities.h`.
---
 src/houdini_html_u.c | 53 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 10 deletions(-)

(limited to 'src/houdini_html_u.c')

diff --git a/src/houdini_html_u.c b/src/houdini_html_u.c
index 2362b04..6f94c23 100644
--- a/src/houdini_html_u.c
+++ b/src/houdini_html_u.c
@@ -5,7 +5,44 @@
 #include "buffer.h"
 #include "houdini.h"
 #include "utf8.h"
-#include "html_unescape.h"
+#include "entities.h"
+
+/* Binary tree lookup code for entities added by JGM */
+
+static unsigned long
+S_hash(const unsigned char *str, int len)
+{
+	unsigned long hash = 5381;
+	int i;
+
+	for (i = 0; i < len; i++) {
+		hash = (((hash << 5) + hash) + str[i]) & 0xFFFFFFFF; /* hash * 33 + c */
+	}
+
+	return hash;
+}
+
+static unsigned char *
+S_lookup(int i, unsigned long key)
+{
+	if (cmark_entities[i].value == key) {
+		return cmark_entities[i].bytes;
+	} else {
+		int next = key < cmark_entities[i].value ?
+			cmark_entities[i].less : cmark_entities[i].greater;
+		if (next == 0) {
+			return NULL;
+		} else {
+			return S_lookup(next, key);
+		}
+	}
+}
+
+static unsigned char *
+S_lookup_entity(const unsigned char *s, int len)
+{
+	return S_lookup(cmark_entities_root, S_hash(s, len));
+}
 
 bufsize_t
 houdini_unescape_ent(cmark_strbuf *ob, const uint8_t *src, bufsize_t size)
@@ -57,22 +94,18 @@ houdini_unescape_ent(cmark_strbuf *ob, const uint8_t *src, bufsize_t size)
 	}
 
 	else {
-		if (size > MAX_WORD_LENGTH)
-			size = MAX_WORD_LENGTH;
+		if (size > CMARK_ENTITY_MAX_LENGTH)
+			size = CMARK_ENTITY_MAX_LENGTH;
 
-		for (i = MIN_WORD_LENGTH; i < size; ++i) {
+		for (i = CMARK_ENTITY_MIN_LENGTH; i < size; ++i) {
 			if (src[i] == ' ')
 				break;
 
 			if (src[i] == ';') {
-				const struct html_ent *entity = find_entity((char *)src, i);
+				const unsigned char *entity = S_lookup_entity(src, i);
 
 				if (entity != NULL) {
-					bufsize_t len = 0;
-					while (len < 8 && entity->utf8[len] != '\0') {
-						++len;
-					}
-					cmark_strbuf_put(ob, entity->utf8, len);
+					cmark_strbuf_puts(ob, (const char *)entity);
 					return i + 1;
 				}
 
-- 
cgit v1.2.3