From 208c794def61eb819ed6eebe1d51867613addce0 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Tue, 16 Jun 2015 09:54:31 -0700 Subject: Replace gperf-based entity lookup with binary tree lookup. The primary advantage is a big reduction in the size of the compiled library and executable (> 100K). There should be no measurable performance difference in normal documents. I detected a slight performance hit (around 5%) in a file containing 1,000,000 entities. * Removed `src/html_unescape.gperf` and `src/html_unescape.h`. * Added `src/entities.h` (generated by `tools/make_entities_h.py`). * Added binary tree lookup functions to `houdini_html_u.c`, and use the data in `src/entities.h`. --- tools/make_entities_h.py | 107 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 tools/make_entities_h.py (limited to 'tools') diff --git a/tools/make_entities_h.py b/tools/make_entities_h.py new file mode 100644 index 0000000..5c24825 --- /dev/null +++ b/tools/make_entities_h.py @@ -0,0 +1,107 @@ +# Creates C data structures for binary lookup table of entities, +# using python's html5 entity data. +# Usage: python3 tools/make_entities_h.py > src/entities.h + +import html + +# We use this simple hashing algorithm to convert a string +# to an integer: +def djb2(s): + bs = list(s.encode('utf-8')) + hash = 5381 + for b in bs: + hash = (((hash << 5) + hash) + b) & 0xFFFFFFFF + return hash + +entities5 = html.entities.html5 + +# Note that most entries in the entity table end with ';', but in a few +# cases we have both a version with ';' and one without, so we strip out +# the latter to avoid duplicates: +hashed_data = sorted([[int(djb2(s[:-1])), entities5[s].encode('utf-8'), s] + for s in entities5.keys() if s[-1] == ';']) + +# indices is a dictionary - given a hash it spits out the ordering +# of this entity in the list (the array index) +indices = {} +i = 0 + +for x in hashed_data: + indices[x[0]] = i + i = i + 1 + +# Formats integer as C octal escape. +def toesc(x): + return '\\' + oct(x)[2:] + +# Lines is the list of lines in the array. +# We don't fill them in order, so we initialize the whole array first. +lines = [""] * len(hashed_data) + +# Takes hashed_data or some sublist of it, and a midpoint (array index) +# in this list. Adds to lines a line for the midpoint, then calls +# itself recursively for the earlier and later elements. Each node +# contains indices for elements with a lesser hash and elements with +# a greater hash. An index of 0 means we're at a leaf node. +def to_binary_array(xs, mid): + # divide in half, and form binary array from each half + x = xs[mid] + lesses = xs[0:mid] + greaters = xs[mid+1:] + midlesses = len(lesses) // 2 + midgreaters = len(greaters) // 2 + if len(lesses) == 0: + ml = 0 + else: + ml = indices[lesses[midlesses][0]] + if len(greaters) == 0: + mg = 0 + else: + mg = indices[greaters[midgreaters][0]] + lines[indices[x[0]]] = ("{" + str(x[0]) + ", (unsigned char*)\"" + + ''.join(map(toesc, x[1])) + "\", " + str(ml) + + ", " + str(mg) + "}, /* &" + x[2] + " */") + if len(lesses) > 0: + to_binary_array(lesses, midlesses) + if len(greaters) > 0: + to_binary_array(greaters, midgreaters) + +# Now call this to fill up the array lines: +mid = len(hashed_data) // 2 +to_binary_array(hashed_data, mid) + +# Print out the header: +print("""#ifndef CMARK_ENTITIES_H +#define CMARK_ENTITIES_H + +#ifdef __cplusplus +extern "C" { +#endif + +struct cmark_entity_node { + unsigned long value; + unsigned char *bytes; + int less; + int greater; +}; + +#define CMARK_ENTITY_MIN_LENGTH 2 +#define CMARK_ENTITY_MAX_LENGTH 31 +""") + +print("static struct cmark_entity_node cmark_entities[] = {"); + +for line in lines: + print(line); + +print("};\n"); + +print("static int cmark_entities_root = " + str(mid) + ";"); + +print(""" +#ifdef __cplusplus +} +#endif + +#endif +""") -- cgit v1.2.3