summaryrefslogtreecommitdiff
path: root/tools
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2015-06-16 17:28:53 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2015-06-16 17:33:48 -0700
commit7e7819e05ec91bc7ca7859d119f1274cf3a83913 (patch)
treee10c78ca2bbf0033b381cdc27076ada30f38b39c /tools
parent54c087d1272b4ce756e56de68e8e6dfac6d159fc (diff)
Simpler approach for entity lookup.
We dispense with the hashes and just do string comparsions. Since the array is in order, we can search intelligently and should never need to do more than 8 or so comparisons. This reduces binary size even further, at a small cost in performance. (This shouldn't matter too much, as it's only detectable in really entity-heavy sources.)
Diffstat (limited to 'tools')
-rw-r--r--tools/make_entities_h.py88
1 files changed, 9 insertions, 79 deletions
diff --git a/tools/make_entities_h.py b/tools/make_entities_h.py
index 9342286..48492c7 100644
--- a/tools/make_entities_h.py
+++ b/tools/make_entities_h.py
@@ -4,79 +4,12 @@
import html
-# We use this simple hashing algorithm to convert a string
-# to an integer:
-def djb2(s):
- bs = list(s.encode('utf-8'))
- hash = 5381
- for b in bs:
- hash = (((hash << 5) + hash) + b) & 0xFFFFFFFF
- return hash
-
entities5 = html.entities.html5
# remove keys without semicolons. For some reason the list
# has duplicates of a few things, like auml, one with and one
# without a semicolon.
-entities = [(k[:-1], entities5[k]) for k in entities5.keys() if k[-1] == ';']
-
-# Note that most entries in the entity table end with ';', but in a few
-# cases we have both a version with ';' and one without, so we strip out
-# the latter to avoid duplicates:
-hashed_data = sorted([[int(djb2(k)), v.encode('utf-8'), k] for (k,v) in entities])
-
-# Confirm no hash collisions
-hashes = [x for [x,_,_] in hashed_data]
-assert(len(hashes) == len(set(hashes)))
-
-# indices is a dictionary - given a hash it spits out the ordering
-# of this entity in the list (the array index)
-indices = {}
-i = 0
-
-for x in hashed_data:
- indices[x[0]] = i
- i = i + 1
-
-# Formats integer as C octal escape.
-def toesc(x):
- return '\\' + oct(x)[2:]
-
-# Lines is the list of lines in the array.
-# We don't fill them in order, so we initialize the whole array first.
-lines = [""] * len(hashed_data)
-
-# Takes hashed_data or some sublist of it, and a midpoint (array index)
-# in this list. Adds to lines a line for the midpoint, then calls
-# itself recursively for the earlier and later elements. Each node
-# contains indices for elements with a lesser hash and elements with
-# a greater hash. An index of -1 means we're at a leaf node.
-def to_binary_array(xs, mid):
- # divide in half, and form binary array from each half
- x = xs[mid]
- lesses = xs[0:mid]
- greaters = xs[mid+1:]
- midlesses = len(lesses) // 2
- midgreaters = len(greaters) // 2
- if len(lesses) == 0:
- ml = -1
- else:
- ml = indices[lesses[midlesses][0]]
- if len(greaters) == 0:
- mg = -1
- else:
- mg = indices[greaters[midgreaters][0]]
- lines[indices[x[0]]] = ("{" + str(x[0]) + ", (unsigned char*)\"" +
- ''.join(map(toesc, x[1])) + "\", " + str(ml) +
- ", " + str(mg) + "}, /* &" + x[2] + "; */")
- if len(lesses) > 0:
- to_binary_array(lesses, midlesses)
- if len(greaters) > 0:
- to_binary_array(greaters, midgreaters)
-
-# Now call this to fill up the array lines:
-mid = len(hashed_data) // 2
-to_binary_array(hashed_data, mid)
+entities = sorted([(k[:-1], entities5[k].encode('utf-8')) for k in entities5.keys() if k[-1] == ';'])
# Print out the header:
print("""#ifndef CMARK_ENTITIES_H
@@ -87,24 +20,21 @@ extern "C" {
#endif
struct cmark_entity_node {
- unsigned long value;
- unsigned char *bytes;
- int less;
- int greater;
+ unsigned char *entity;
+ unsigned char bytes[8];
};
#define CMARK_ENTITY_MIN_LENGTH 2
-#define CMARK_ENTITY_MAX_LENGTH 31
-""")
+#define CMARK_ENTITY_MAX_LENGTH 31""")
-print("static const struct cmark_entity_node cmark_entities[] = {");
+print("#define CMARK_NUM_ENTITIES " + str(len(entities)));
-for line in lines:
- print(line);
+print("\nstatic const struct cmark_entity_node cmark_entities[] = {");
-print("};\n");
+for (ent, bs) in entities:
+ print('{(unsigned char*)"' + ent + '", {' + ', '.join(map(str, bs)) + ', 0}},')
-print("static const int cmark_entities_root = " + str(mid) + ";");
+print("};\n");
print("""
#ifdef __cplusplus