diff options
author | John MacFarlane <jgm@berkeley.edu> | 2015-06-16 16:31:52 -0700 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2015-06-16 16:31:52 -0700 |
commit | 54c087d1272b4ce756e56de68e8e6dfac6d159fc (patch) | |
tree | 1218f0c71531fe7d2ceccc115c62da4452ac8d2a | |
parent | 8418191a26b970c32c0396bf3580c0c7a374fb35 (diff) |
make_entities_h.py: confirm there are no hash collisions.
At least with valid data.
-rw-r--r-- | tools/make_entities_h.py | 14 |
1 files changed, 11 insertions, 3 deletions
```diff
diff --git a/tools/make_entities_h.py b/tools/make_entities_h.py
index a993246..9342286 100644
--- a/tools/make_entities_h.py
+++ b/tools/make_entities_h.py
@@ -15,11 +15,19 @@ def djb2(s):
 
 entities5 = html.entities.html5
 
+# remove keys without semicolons. For some reason the list
+# has duplicates of a few things, like auml, one with and one
+# without a semicolon.
+entities = [(k[:-1], entities5[k]) for k in entities5.keys() if k[-1] == ';']
+
 # Note that most entries in the entity table end with ';', but in a few
 # cases we have both a version with ';' and one without, so we strip out
 # the latter to avoid duplicates:
-hashed_data = sorted([[int(djb2(s[:-1])), entities5[s].encode('utf-8'), s]
-                      for s in entities5.keys() if s[-1] == ';'])
+hashed_data = sorted([[int(djb2(k)), v.encode('utf-8'), k] for (k,v) in entities])
+
+# Confirm no hash collisions
+hashes = [x for [x,_,_] in hashed_data]
+assert(len(hashes) == len(set(hashes)))
 
 # indices is a dictionary - given a hash it spits out the ordering
 # of this entity in the list (the array index)
@@ -60,7 +68,7 @@ def to_binary_array(xs, mid):
         mg = indices[greaters[midgreaters][0]]
     lines[indices[x[0]]] = ("{" + str(x[0]) + ", (unsigned char*)\"" +
                             ''.join(map(toesc, x[1])) + "\", " + str(ml) +
-                            ", " + str(mg) + "}, /* &" + x[2] + " */")
+                            ", " + str(mg) + "}, /* &" + x[2] + "; */")
     if len(lesses) > 0:
         to_binary_array(lesses, midlesses)
     if len(greaters) > 0:
```