diff options
author | John MacFarlane <jgm@berkeley.edu> | 2015-06-16 17:28:53 -0700 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2015-06-16 17:33:48 -0700 |
commit | 7e7819e05ec91bc7ca7859d119f1274cf3a83913 (patch) | |
tree | e10c78ca2bbf0033b381cdc27076ada30f38b39c /tools | |
parent | 54c087d1272b4ce756e56de68e8e6dfac6d159fc (diff) |
Simpler approach for entity lookup.
We dispense with the hashes and just do string comparsions.
Since the array is in order, we can search intelligently
and should never need to do more than 8 or so comparisons.
This reduces binary size even further, at a small cost
in performance. (This shouldn't matter too much, as
it's only detectable in really entity-heavy sources.)
Diffstat (limited to 'tools')
-rw-r--r-- | tools/make_entities_h.py | 88 |
1 files changed, 9 insertions, 79 deletions
diff --git a/tools/make_entities_h.py b/tools/make_entities_h.py index 9342286..48492c7 100644 --- a/tools/make_entities_h.py +++ b/tools/make_entities_h.py @@ -4,79 +4,12 @@ import html -# We use this simple hashing algorithm to convert a string -# to an integer: -def djb2(s): - bs = list(s.encode('utf-8')) - hash = 5381 - for b in bs: - hash = (((hash << 5) + hash) + b) & 0xFFFFFFFF - return hash - entities5 = html.entities.html5 # remove keys without semicolons. For some reason the list # has duplicates of a few things, like auml, one with and one # without a semicolon. -entities = [(k[:-1], entities5[k]) for k in entities5.keys() if k[-1] == ';'] - -# Note that most entries in the entity table end with ';', but in a few -# cases we have both a version with ';' and one without, so we strip out -# the latter to avoid duplicates: -hashed_data = sorted([[int(djb2(k)), v.encode('utf-8'), k] for (k,v) in entities]) - -# Confirm no hash collisions -hashes = [x for [x,_,_] in hashed_data] -assert(len(hashes) == len(set(hashes))) - -# indices is a dictionary - given a hash it spits out the ordering -# of this entity in the list (the array index) -indices = {} -i = 0 - -for x in hashed_data: - indices[x[0]] = i - i = i + 1 - -# Formats integer as C octal escape. -def toesc(x): - return '\\' + oct(x)[2:] - -# Lines is the list of lines in the array. -# We don't fill them in order, so we initialize the whole array first. -lines = [""] * len(hashed_data) - -# Takes hashed_data or some sublist of it, and a midpoint (array index) -# in this list. Adds to lines a line for the midpoint, then calls -# itself recursively for the earlier and later elements. Each node -# contains indices for elements with a lesser hash and elements with -# a greater hash. An index of -1 means we're at a leaf node. -def to_binary_array(xs, mid): - # divide in half, and form binary array from each half - x = xs[mid] - lesses = xs[0:mid] - greaters = xs[mid+1:] - midlesses = len(lesses) // 2 - midgreaters = len(greaters) // 2 - if len(lesses) == 0: - ml = -1 - else: - ml = indices[lesses[midlesses][0]] - if len(greaters) == 0: - mg = -1 - else: - mg = indices[greaters[midgreaters][0]] - lines[indices[x[0]]] = ("{" + str(x[0]) + ", (unsigned char*)\"" + - ''.join(map(toesc, x[1])) + "\", " + str(ml) + - ", " + str(mg) + "}, /* &" + x[2] + "; */") - if len(lesses) > 0: - to_binary_array(lesses, midlesses) - if len(greaters) > 0: - to_binary_array(greaters, midgreaters) - -# Now call this to fill up the array lines: -mid = len(hashed_data) // 2 -to_binary_array(hashed_data, mid) +entities = sorted([(k[:-1], entities5[k].encode('utf-8')) for k in entities5.keys() if k[-1] == ';']) # Print out the header: print("""#ifndef CMARK_ENTITIES_H @@ -87,24 +20,21 @@ extern "C" { #endif struct cmark_entity_node { - unsigned long value; - unsigned char *bytes; - int less; - int greater; + unsigned char *entity; + unsigned char bytes[8]; }; #define CMARK_ENTITY_MIN_LENGTH 2 -#define CMARK_ENTITY_MAX_LENGTH 31 -""") +#define CMARK_ENTITY_MAX_LENGTH 31""") -print("static const struct cmark_entity_node cmark_entities[] = {"); +print("#define CMARK_NUM_ENTITIES " + str(len(entities))); -for line in lines: - print(line); +print("\nstatic const struct cmark_entity_node cmark_entities[] = {"); -print("};\n"); +for (ent, bs) in entities: + print('{(unsigned char*)"' + ent + '", {' + ', '.join(map(str, bs)) + ', 0}},') -print("static const int cmark_entities_root = " + str(mid) + ";"); +print("};\n"); print(""" #ifdef __cplusplus |