#!/usr/bin/env python # -*- coding: utf-8 -*- from ctypes import CDLL, c_char_p, c_long import sys import platform from difflib import unified_diff from subprocess import * import argparse from HTMLParser import HTMLParser from htmlentitydefs import name2codepoint import re if __name__ == "__main__": parser = argparse.ArgumentParser(description='Run cmark tests.') parser.add_argument('--program', dest='program', nargs='?', default=None, help='program to test') parser.add_argument('--spec', dest='spec', nargs='?', default='spec.txt', help='path to spec') parser.add_argument('--pattern', dest='pattern', nargs='?', default=None, help='limit to sections matching regex pattern') parser.add_argument('--library_dir', dest='library_dir', nargs='?', default=None, help='directory containing dynamic library') args = parser.parse_args(sys.argv[1:]) if not args.program: sysname = platform.system() libname = "libcmark" if sysname == 'Darwin': libname += ".dylib" elif sysname == 'Windows': libname += ".dll" else: libname += ".so" if args and args.library_dir: libpath = args.library_dir + "/" + libname else: libpath = "build/src/" + libname cmark = CDLL(libpath) markdown = cmark.cmark_markdown_to_html markdown.restype = c_char_p markdown.argtypes = [c_char_p, c_long] def md2html(text, prog): if prog: p1 = Popen([prog], stdout=PIPE, stdin=PIPE, stderr=PIPE) [result, err] = p1.communicate(input=text) return [p1.returncode, result, err] else: return [0, markdown(text, len(text)), ''] # Normalization code, adapted from # https://github.com/karlcow/markdown-testsuite/ significant_attrs = ["alt", "href", "src", "title"] normalize_whitespace_re = re.compile('\s+') class MyHTMLParser(HTMLParser): def __init__(self): HTMLParser.__init__(self) self.last = "starttag" self.in_pre = False self.output = u"" def handle_data(self, data): if self.in_pre: self.output += data else: data = normalize_whitespace_re.sub(' ', data) data_strip = data.strip() if (self.last == "ref") and data_strip and data[0] == " ": self.output += " " self.data_end_in_space_not_empty = (data[-1] == ' ' and data_strip) self.output += data_strip self.last = "data" def handle_endtag(self, tag): if tag == "pre": self.in_pre = False self.output += "" + tag + ">" self.last = "endtag" def handle_starttag(self, tag, attrs): if tag == "pre": self.in_pre = True self.output += "<" + tag attrs = filter(lambda attr: attr[0] in significant_attrs, attrs) if attrs: attrs.sort() for attr in attrs: self.output += " " + attr[0] + "=" + '"' + attr[1] + '"' self.output += ">" self.last = "starttag" def handle_startendtag(self, tag, attrs): """Ignore closing tag for self-closing void elements.""" self.handle_starttag(tag, attrs) def handle_entityref(self, name): self.add_space_from_last_data() try: self.output += unichr(name2codepoint[name]) except KeyError: self.output += name self.last = "ref" def handle_charref(self, name): self.add_space_from_last_data() try: if name.startswith("x"): c = unichr(int(name[1:], 16)) else: c = unichr(int(name)) self.output += c except ValueError: self.output += name self.last = "ref" # Helpers. def add_space_from_last_data(self): """Maintain the space at: `a b`""" if self.last == 'data' and self.data_end_in_space_not_empty: self.output += ' ' def normalize(html): r""" Return normalized form of HTML which igores insignificant output differences. Multiple inner whitespaces to a single space >>> normalize("
a \t\nb
") u'a b
' Surrounding whitespaces are removed: >>> normalize("a
") u'a
' >>> normalize("a
") u'a
' TODO: how to deal with the following cases without a full list of the void tags? >>> normalize("a b
") u'ab
' >>> normalize("b c
") u'bc
' >>> normalize("a
a
a \t\nb") u'
a \t\nb' Self-closing tags: >>> normalize("
<
") u'<
' >>> normalize("<
") u'<
' >>> normalize("<
") u'<
' >>> normalize("中
") u'\u4e2d
' Spaces around entities are kept: >>> normalize("a < b
") u'a < b
' >>> normalize("a<b
") u'a' Most attributes are ignored: >>> normalize('
') u'' Critical attributes are considered and sorted alphabetically: >>> normalize('') u'' >>> normalize('