summaryrefslogtreecommitdiff
path: root/tools
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2015-06-16 09:54:31 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2015-06-16 12:59:47 -0700
commit208c794def61eb819ed6eebe1d51867613addce0 (patch)
tree0d0f81dab960befc5efa7124ae900ddd64e43be3 /tools
parentf904f701cf4390b4d5531c5626c5cf08d85a913f (diff)
Replace gperf-based entity lookup with binary tree lookup.
The primary advantage is a big reduction in the size of the compiled library and executable (> 100K). There should be no measurable performance difference in normal documents. I detected a slight performance hit (around 5%) in a file containing 1,000,000 entities. * Removed `src/html_unescape.gperf` and `src/html_unescape.h`. * Added `src/entities.h` (generated by `tools/make_entities_h.py`). * Added binary tree lookup functions to `houdini_html_u.c`, and use the data in `src/entities.h`.
Diffstat (limited to 'tools')
-rw-r--r--tools/make_entities_h.py107
1 files changed, 107 insertions, 0 deletions
diff --git a/tools/make_entities_h.py b/tools/make_entities_h.py
new file mode 100644
index 0000000..5c24825
--- /dev/null
+++ b/tools/make_entities_h.py
@@ -0,0 +1,107 @@
+# Creates C data structures for binary lookup table of entities,
+# using python's html5 entity data.
+# Usage: python3 tools/make_entities_h.py > src/entities.h
+
+import html
+
+# We use this simple hashing algorithm to convert a string
+# to an integer:
+def djb2(s):
+ bs = list(s.encode('utf-8'))
+ hash = 5381
+ for b in bs:
+ hash = (((hash << 5) + hash) + b) & 0xFFFFFFFF
+ return hash
+
+entities5 = html.entities.html5
+
+# Note that most entries in the entity table end with ';', but in a few
+# cases we have both a version with ';' and one without, so we strip out
+# the latter to avoid duplicates:
+hashed_data = sorted([[int(djb2(s[:-1])), entities5[s].encode('utf-8'), s]
+ for s in entities5.keys() if s[-1] == ';'])
+
+# indices is a dictionary - given a hash it spits out the ordering
+# of this entity in the list (the array index)
+indices = {}
+i = 0
+
+for x in hashed_data:
+ indices[x[0]] = i
+ i = i + 1
+
+# Formats integer as C octal escape.
+def toesc(x):
+ return '\\' + oct(x)[2:]
+
+# Lines is the list of lines in the array.
+# We don't fill them in order, so we initialize the whole array first.
+lines = [""] * len(hashed_data)
+
+# Takes hashed_data or some sublist of it, and a midpoint (array index)
+# in this list. Adds to lines a line for the midpoint, then calls
+# itself recursively for the earlier and later elements. Each node
+# contains indices for elements with a lesser hash and elements with
+# a greater hash. An index of 0 means we're at a leaf node.
+def to_binary_array(xs, mid):
+ # divide in half, and form binary array from each half
+ x = xs[mid]
+ lesses = xs[0:mid]
+ greaters = xs[mid+1:]
+ midlesses = len(lesses) // 2
+ midgreaters = len(greaters) // 2
+ if len(lesses) == 0:
+ ml = 0
+ else:
+ ml = indices[lesses[midlesses][0]]
+ if len(greaters) == 0:
+ mg = 0
+ else:
+ mg = indices[greaters[midgreaters][0]]
+ lines[indices[x[0]]] = ("{" + str(x[0]) + ", (unsigned char*)\"" +
+ ''.join(map(toesc, x[1])) + "\", " + str(ml) +
+ ", " + str(mg) + "}, /* &" + x[2] + " */")
+ if len(lesses) > 0:
+ to_binary_array(lesses, midlesses)
+ if len(greaters) > 0:
+ to_binary_array(greaters, midgreaters)
+
+# Now call this to fill up the array lines:
+mid = len(hashed_data) // 2
+to_binary_array(hashed_data, mid)
+
+# Print out the header:
+print("""#ifndef CMARK_ENTITIES_H
+#define CMARK_ENTITIES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct cmark_entity_node {
+ unsigned long value;
+ unsigned char *bytes;
+ int less;
+ int greater;
+};
+
+#define CMARK_ENTITY_MIN_LENGTH 2
+#define CMARK_ENTITY_MAX_LENGTH 31
+""")
+
+print("static struct cmark_entity_node cmark_entities[] = {");
+
+for line in lines:
+ print(line);
+
+print("};\n");
+
+print("static int cmark_entities_root = " + str(mid) + ";");
+
+print("""
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+""")