Replace gperf-based entity lookup with binary tree lookup.

The primary advantage is a big reduction in the size of the compiled library and executable (> 100K). There should be no measurable performance difference in normal documents. I detected a slight performance hit (around 5%) in a file containing 1,000,000 entities. * Removed `src/html_unescape.gperf` and `src/html_unescape.h`. * Added `src/entities.h` (generated by `tools/make_entities_h.py`). * Added binary tree lookup functions to `houdini_html_u.c`, and use the data in `src/entities.h`.
author: John MacFarlane <jgm@berkeley.edu> 2015-06-16 09:54:31 -0700
committer: John MacFarlane <jgm@berkeley.edu> 2015-06-16 12:59:47 -0700
commit: 208c794def61eb819ed6eebe1d51867613addce0 (patch)
tree: 0d0f81dab960befc5efa7124ae900ddd64e43be3 /tools
parent: f904f701cf4390b4d5531c5626c5cf08d85a913f (diff)
1 files changed, 107 insertions, 0 deletions
diff --git a/tools/make_entities_h.py b/tools/make_entities_h.py
new file mode 100644
index 0000000..5c24825
--- /dev/null
+++ b/tools/make_entities_h.py
@@ -0,0 +1,107 @@
+# Creates C data structures for binary lookup table of entities,
+# using python's html5 entity data.
+# Usage: python3 tools/make_entities_h.py > src/entities.h
+
+import html
+
+# We use this simple hashing algorithm to convert a string
+# to an integer:
+def djb2(s):
+  bs = list(s.encode('utf-8'))
+  hash = 5381
+  for b in bs:
+    hash = (((hash << 5) + hash) + b) & 0xFFFFFFFF
+  return hash
+
+entities5 = html.entities.html5
+
+# Note that most entries in the entity table end with ';', but in a few
+# cases we have both a version with ';' and one without, so we strip out
+# the latter to avoid duplicates:
+hashed_data = sorted([[int(djb2(s[:-1])), entities5[s].encode('utf-8'), s]
+                      for s in entities5.keys() if s[-1] == ';'])
+
+# indices is a dictionary - given a hash it spits out the ordering
+# of this entity in the list (the array index)
+indices = {}
+i = 0
+
+for x in hashed_data:
+  indices[x[0]] = i
+  i = i + 1
+
+# Formats integer as C octal escape.
+def toesc(x):
+  return '\\' + oct(x)[2:]
+
+# Lines is the list of lines in the array.
+# We don't fill them in order, so we initialize the whole array first.
+lines = [""] * len(hashed_data)
+
+# Takes hashed_data or some sublist of it, and a midpoint (array index)
+# in this list.  Adds to lines a line for the midpoint, then calls
+# itself recursively for the earlier and later elements.  Each node
+# contains indices for elements with a lesser hash and elements with
+# a greater hash.  An index of 0 means we're at a leaf node.
+def to_binary_array(xs, mid):
+  # divide in half, and form binary array from each half
+  x = xs[mid]
+  lesses = xs[0:mid]
+  greaters = xs[mid+1:]
+  midlesses = len(lesses) // 2
+  midgreaters = len(greaters) // 2
+  if len(lesses) == 0:
+    ml = 0
+  else:
+    ml = indices[lesses[midlesses][0]]
+  if len(greaters) == 0:
+    mg = 0
+  else:
+    mg = indices[greaters[midgreaters][0]]
+  lines[indices[x[0]]] = ("{" + str(x[0]) + ", (unsigned char*)\"" +
+                          ''.join(map(toesc, x[1])) + "\", " + str(ml) +
+                          ", " + str(mg) + "}, /* &" + x[2] + " */")
+  if len(lesses) > 0:
+    to_binary_array(lesses, midlesses)
+  if len(greaters) > 0:
+    to_binary_array(greaters, midgreaters)
+
+# Now call this to fill up the array lines:
+mid = len(hashed_data) // 2
+to_binary_array(hashed_data, mid)
+
+# Print out the header:
+print("""#ifndef CMARK_ENTITIES_H
+#define CMARK_ENTITIES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct cmark_entity_node {
+	unsigned long value;
+	unsigned char *bytes;
+	int less;
+	int greater;
+};
+
+#define CMARK_ENTITY_MIN_LENGTH 2
+#define CMARK_ENTITY_MAX_LENGTH 31
+""")
+
+print("static struct cmark_entity_node cmark_entities[] = {");
+
+for line in lines:
+  print(line);
+
+print("};\n");
+
+print("static int cmark_entities_root = " + str(mid) + ";");
+
+print("""
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+""")
author	John MacFarlane <jgm@berkeley.edu>	2015-06-16 09:54:31 -0700
committer	John MacFarlane <jgm@berkeley.edu>	2015-06-16 12:59:47 -0700
commit	208c794def61eb819ed6eebe1d51867613addce0 (patch)
tree	0d0f81dab960befc5efa7124ae900ddd64e43be3 /tools
parent	f904f701cf4390b4d5531c5626c5cf08d85a913f (diff)