diff options
| -rw-r--r-- | Makefile | 2 | ||||
| -rw-r--r-- | src/blocks.c | 16 | ||||
| -rw-r--r-- | src/buffer.c | 43 | ||||
| -rw-r--r-- | src/buffer.h | 2 | ||||
| -rw-r--r-- | src/inlines.c | 176 | ||||
| -rw-r--r-- | src/references.c | 109 | ||||
| -rw-r--r-- | src/references.h | 27 | ||||
| -rw-r--r-- | src/stmd.h | 26 | ||||
| -rw-r--r-- | src/utf8.c | 10 | ||||
| -rw-r--r-- | src/utf8.h | 5 | 
10 files changed, 225 insertions, 191 deletions
@@ -42,7 +42,7 @@ benchjs:  	node js/bench.js ${BENCHINP}  HTML_OBJ=$(SRCDIR)/html/html.o $(SRCDIR)/html/houdini_href_e.o $(SRCDIR)/html/houdini_html_e.o $(SRCDIR)/html/houdini_html_u.o -STMD_OBJ=$(SRCDIR)/inlines.o $(SRCDIR)/buffer.o $(SRCDIR)/blocks.o $(SRCDIR)/scanners.c $(SRCDIR)/print.o $(SRCDIR)/utf8.o +STMD_OBJ=$(SRCDIR)/inlines.o $(SRCDIR)/buffer.o $(SRCDIR)/blocks.o $(SRCDIR)/scanners.c $(SRCDIR)/print.o $(SRCDIR)/utf8.o $(SRCDIR)/references.c  $(PROG): $(SRCDIR)/html/html_unescape.h $(SRCDIR)/case_fold_switch.inc $(HTML_OBJ) $(STMD_OBJ) $(SRCDIR)/main.c  	$(CC) $(LDFLAGS) -o $@ $(HTML_OBJ) $(STMD_OBJ) $(SRCDIR)/main.c diff --git a/src/blocks.c b/src/blocks.c index 72b2dc2..30a8284 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -8,7 +8,6 @@  #include "utf8.h"  #include "html/houdini.h"  #include "scanners.h" -#include "uthash.h"  #define peek_at(i, n) (i)->data[n] @@ -36,12 +35,7 @@ static node_block* make_block(int tag, int start_line, int start_column)  extern node_block* make_document()  {  	node_block *e = make_block(BLOCK_DOCUMENT, 1, 1); -	reference *map = NULL; -	reference ** refmap; - -	refmap = (reference**) malloc(sizeof(reference*)); -	*refmap = map; -	e->as.document.refmap = refmap; +	e->as.document.refmap = reference_map_new();  	e->top = e;  	return e; @@ -164,7 +158,7 @@ static void finalize(node_block* b, int line_number)  		case BLOCK_PARAGRAPH:  			pos = 0;  			while (strbuf_at(&b->string_content, 0) == '[' && -					(pos = parse_reference(&b->string_content, b->top->as.document.refmap))) { +					(pos = parse_reference_inline(&b->string_content, b->top->as.document.refmap))) {  				strbuf_drop(&b->string_content, pos);  			} @@ -192,7 +186,7 @@ static void finalize(node_block* b, int line_number)  			strbuf_drop(&b->string_content, firstlinelen + 1);  			strbuf_trim(&b->as.code.info); -			unescape_buffer(&b->as.code.info); +			strbuf_unescape(&b->as.code.info);  			break;  		case BLOCK_LIST: // determine tight/loose status @@ -268,7 +262,7 @@ extern void free_blocks(node_block* e)  		if (e->tag == BLOCK_FENCED_CODE) {  			strbuf_free(&e->as.code.info);  		} else if (e->tag == BLOCK_DOCUMENT) { -			free_reference_map(e->as.document.refmap); +			reference_map_free(e->as.document.refmap);  		}  		free_blocks(e->children);  		free(e); @@ -278,7 +272,7 @@ extern void free_blocks(node_block* e)  // Walk through node_block and all children, recursively, parsing  // string content into inline content where appropriate. -void process_inlines(node_block* cur, reference** refmap) +void process_inlines(node_block* cur, reference_map *refmap)  {  	switch (cur->tag) {  		case BLOCK_PARAGRAPH: diff --git a/src/buffer.c b/src/buffer.c index 90c2186..cdf8ca0 100644 --- a/src/buffer.c +++ b/src/buffer.c @@ -308,3 +308,46 @@ void strbuf_trim(strbuf *buf)  	buf->ptr[buf->size] = '\0';  } + +// Destructively modify string, collapsing consecutive +// space and newline characters into a single space. +void strbuf_normalize_whitespace(strbuf *s) +{ +	bool last_char_was_space = false; +	int r, w; + +	for (r = 0, w = 0; r < s->size; ++r) { +		switch (s->ptr[r]) { +		case ' ': +		case '\n': +			if (last_char_was_space) +				break; + +			s->ptr[w++] = ' '; +			last_char_was_space = true; +			break; + +		default: +			s->ptr[w++] = s->ptr[r]; +			last_char_was_space = false; +		} +	} + +	strbuf_truncate(s, w); +} + +// Destructively unescape a string: remove backslashes before punctuation chars. +extern void strbuf_unescape(strbuf *buf) +{ +	int r, w; + +	for (r = 0, w = 0; r < buf->size; ++r) { +		if (buf->ptr[r] == '\\' && ispunct(buf->ptr[r + 1])) +			continue; + +		buf->ptr[w++] = buf->ptr[r]; +	} + +	strbuf_truncate(buf, w); +} + diff --git a/src/buffer.h b/src/buffer.h index 6f45cbb..1bc1eee 100644 --- a/src/buffer.h +++ b/src/buffer.h @@ -108,5 +108,7 @@ int strbuf_strrchr(const strbuf *buf, int c, int pos);  void strbuf_drop(strbuf *buf, int n);  void strbuf_truncate(strbuf *buf, int len);  void strbuf_trim(strbuf *buf); +void strbuf_normalize_whitespace(strbuf *s); +void strbuf_unescape(strbuf *s);  #endif diff --git a/src/inlines.c b/src/inlines.c index aa0e13e..3040f09 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -7,110 +7,23 @@  #include "stmd.h"  #include "html/houdini.h"  #include "utf8.h" -#include "uthash.h"  #include "scanners.h"  typedef struct Subject {  	chunk input;  	int pos; -	int            label_nestlevel; -	reference**    reference_map; +	int label_nestlevel; +	reference_map *refmap;  } subject; -reference* lookup_reference(reference** refmap, chunk *label); -reference* make_reference(chunk *label, chunk *url, chunk *title); - -static unsigned char *clean_url(chunk *url); -static unsigned char *clean_title(chunk *title); -static unsigned char *clean_autolink(chunk *url, int is_email); - -inline static void chunk_free(chunk *c); -inline static void chunk_trim(chunk *c); - -inline static chunk chunk_literal(const char *data); -inline static chunk chunk_buf_detach(strbuf *buf); -inline static chunk chunk_dup(const chunk *ch, int pos, int len); - -static node_inl *parse_chunk_inlines(chunk *chunk, reference** refmap); +static node_inl *parse_chunk_inlines(chunk *chunk, reference_map *refmap);  static node_inl *parse_inlines_while(subject* subj, int (*f)(subject*));  static int parse_inline(subject* subj, node_inl ** last); -static void subject_from_chunk(subject *e, chunk *chunk, reference** refmap); -static void subject_from_buf(subject *e, strbuf *buffer, reference** refmap); +static void subject_from_chunk(subject *e, chunk *chunk, reference_map *refmap); +static void subject_from_buf(subject *e, strbuf *buffer, reference_map *refmap);  static int subject_find_special_char(subject *subj); -static void normalize_whitespace(strbuf *s); - -extern void free_reference(reference *ref) { -	free(ref->label); -	free(ref->url); -	free(ref->title); -	free(ref); -} - -extern void free_reference_map(reference **refmap) { -	/* free the hash table contents */ -	reference *s; -	reference *tmp; -	if (refmap != NULL) { -		HASH_ITER(hh, *refmap, s, tmp) { -			HASH_DEL(*refmap, s); -			free_reference(s); -		} -		free(refmap); -	} -} - -// normalize reference:  collapse internal whitespace to single space, -// remove leading/trailing whitespace, case fold -static unsigned char *normalize_reference(chunk *ref) -{ -	strbuf normalized = GH_BUF_INIT; - -	utf8proc_case_fold(&normalized, ref->data, ref->len); -	strbuf_trim(&normalized); -	normalize_whitespace(&normalized); - -	return strbuf_detach(&normalized); -} - -// Returns reference if refmap contains a reference with matching -// label, otherwise NULL. -extern reference* lookup_reference(reference** refmap, chunk *label) -{ -	reference *ref = NULL; -	unsigned char *norm = normalize_reference(label); -	if (refmap != NULL) { -		HASH_FIND_STR(*refmap, (char*)norm, ref); -	} -	free(norm); -	return ref; -} - -extern reference* make_reference(chunk *label, chunk *url, chunk *title) -{ -	reference *ref; -	ref = malloc(sizeof(reference)); -	ref->label = normalize_reference(label); -	ref->url = clean_url(url); -	ref->title = clean_title(title); -	return ref; -} - -extern void add_reference(reference** refmap, reference* ref) -{ -	reference * t = NULL; -	const char *label = (const char *)ref->label; - -	HASH_FIND(hh, *refmap, label, strlen(label), t); - -	if (t == NULL) { -		HASH_ADD_KEYPTR(hh, *refmap, label, strlen(label), ref); -	} else { -		free_reference(ref);  // we free this now since it won't be in the refmap -	} -} -  static unsigned char *bufdup(const unsigned char *buf)  {  	unsigned char *new = NULL; @@ -236,26 +149,26 @@ inline static node_inl* append_inlines(node_inl* a, node_inl* b)  	return a;  } -static void subject_from_buf(subject *e, strbuf *buffer, reference** refmap) +static void subject_from_buf(subject *e, strbuf *buffer, reference_map *refmap)  {  	e->input.data = buffer->ptr;  	e->input.len = buffer->size;  	e->input.alloc = 0;  	e->pos = 0;  	e->label_nestlevel = 0; -	e->reference_map = refmap; +	e->refmap = refmap;  	chunk_rtrim(&e->input);  } -static void subject_from_chunk(subject *e, chunk *chunk, reference** refmap) +static void subject_from_chunk(subject *e, chunk *chunk, reference_map *refmap)  {  	e->input.data = chunk->data;  	e->input.len = chunk->len;  	e->input.alloc = 0;  	e->pos = 0;  	e->label_nestlevel = 0; -	e->reference_map = refmap; +	e->refmap = refmap;  	chunk_rtrim(&e->input);  } @@ -325,33 +238,6 @@ static int scan_to_closing_backticks(subject* subj, int openticklength)  	return (subj->pos);  } -// Destructively modify string, collapsing consecutive -// space and newline characters into a single space. -static void normalize_whitespace(strbuf *s) -{ -	bool last_char_was_space = false; -	int r, w; - -	for (r = 0, w = 0; r < s->size; ++r) { -		switch (s->ptr[r]) { -		case ' ': -		case '\n': -			if (last_char_was_space) -				break; - -			s->ptr[w++] = ' '; -			last_char_was_space = true; -			break; - -		default: -			s->ptr[w++] = s->ptr[r]; -			last_char_was_space = false; -		} -	} - -	strbuf_truncate(s, w); -} -  // Parse backtick code section or raw backticks, return an inline.  // Assumes that the subject has a backtick at the current position.  static node_inl* handle_backticks(subject *subj) @@ -368,7 +254,7 @@ static node_inl* handle_backticks(subject *subj)  		strbuf_set(&buf, subj->input.data + startpos, endpos - startpos - openticks.len);  		strbuf_trim(&buf); -		normalize_whitespace(&buf); +		strbuf_normalize_whitespace(&buf);  		return make_code(chunk_buf_detach(&buf));  	} @@ -575,24 +461,9 @@ static node_inl *make_str_with_entities(chunk *content)  	}  } -// Destructively unescape a string: remove backslashes before punctuation chars. -extern void unescape_buffer(strbuf *buf) -{ -	int r, w; - -	for (r = 0, w = 0; r < buf->size; ++r) { -		if (buf->ptr[r] == '\\' && ispunct(buf->ptr[r + 1])) -			continue; - -		buf->ptr[w++] = buf->ptr[r]; -	} - -	strbuf_truncate(buf, w); -} -  // Clean a URL: remove surrounding whitespace and surrounding <>,  // and remove \ that escape punctuation. -static unsigned char *clean_url(chunk *url) +unsigned char *clean_url(chunk *url)  {  	strbuf buf = GH_BUF_INIT; @@ -607,11 +478,11 @@ static unsigned char *clean_url(chunk *url)  		houdini_unescape_html_f(&buf, url->data, url->len);  	} -	unescape_buffer(&buf); +	strbuf_unescape(&buf);  	return strbuf_detach(&buf);  } -static unsigned char *clean_autolink(chunk *url, int is_email) +unsigned char *clean_autolink(chunk *url, int is_email)  {  	strbuf buf = GH_BUF_INIT; @@ -628,7 +499,7 @@ static unsigned char *clean_autolink(chunk *url, int is_email)  }  // Clean a title: remove surrounding quotes and remove \ that escape punctuation. -static unsigned char *clean_title(chunk *title) +unsigned char *clean_title(chunk *title)  {  	strbuf buf = GH_BUF_INIT;  	unsigned char first, last; @@ -648,7 +519,7 @@ static unsigned char *clean_title(chunk *title)  		houdini_unescape_html_f(&buf, title->data, title->len);  	} -	unescape_buffer(&buf); +	strbuf_unescape(&buf);  	return strbuf_detach(&buf);  } @@ -810,7 +681,7 @@ static node_inl* handle_left_bracket(subject* subj)  			} else {  				// if we get here, we matched a label but didn't get further:  				subj->pos = endlabel; -				lab = parse_chunk_inlines(&rawlabel, subj->reference_map); +				lab = parse_chunk_inlines(&rawlabel, subj->refmap);  				result = append_inlines(make_str(chunk_literal("[")),  						append_inlines(lab,  							make_str(chunk_literal("]")))); @@ -834,13 +705,13 @@ static node_inl* handle_left_bracket(subject* subj)  			}  			// lookup rawlabel in subject->reference_map: -			ref = lookup_reference(subj->reference_map, &reflabel); +			ref = reference_lookup(subj->refmap, &reflabel);  			if (ref != NULL) { // found  				lab = parse_chunk_inlines(&rawlabel, NULL);  				result = make_ref_link(lab, ref);  			} else {  				subj->pos = endlabel; -				lab = parse_chunk_inlines(&rawlabel, subj->reference_map); +				lab = parse_chunk_inlines(&rawlabel, subj->refmap);  				result = append_inlines(make_str(chunk_literal("[")),  						append_inlines(lab, make_str(chunk_literal("]"))));  			} @@ -887,7 +758,7 @@ extern node_inl* parse_inlines_while(subject* subj, int (*f)(subject*))  	return result;  } -node_inl *parse_chunk_inlines(chunk *chunk, reference** refmap) +node_inl *parse_chunk_inlines(chunk *chunk, reference_map *refmap)  {  	subject subj;  	subject_from_chunk(&subj, chunk, refmap); @@ -987,7 +858,7 @@ static int parse_inline(subject* subj, node_inl ** last)  	return 1;  } -extern node_inl* parse_inlines(strbuf *input, reference** refmap) +extern node_inl* parse_inlines(strbuf *input, reference_map *refmap)  {  	subject subj;  	subject_from_buf(&subj, input, refmap); @@ -1009,7 +880,7 @@ void spnl(subject* subj)  // Modify refmap if a reference is encountered.  // Return 0 if no reference found, otherwise position of subject  // after reference is parsed. -extern int parse_reference(strbuf *input, reference** refmap) +int parse_reference_inline(strbuf *input, reference_map *refmap)  {  	subject subj; @@ -1019,7 +890,6 @@ extern int parse_reference(strbuf *input, reference** refmap)  	int matchlen = 0;  	int beforetitle; -	reference *new = NULL;  	subject_from_buf(&subj, input, NULL); @@ -1065,9 +935,7 @@ extern int parse_reference(strbuf *input, reference** refmap)  		return 0;  	}  	// insert reference into refmap -	new = make_reference(&lab, &url, &title); -	add_reference(refmap, new); - +	reference_create(refmap, &lab, &url, &title);  	return subj.pos;  } diff --git a/src/references.c b/src/references.c new file mode 100644 index 0000000..ff64b00 --- /dev/null +++ b/src/references.c @@ -0,0 +1,109 @@ +#include "stmd.h" +#include "utf8.h" +#include "references.h" + +static unsigned int +refhash(const unsigned char *link_ref) +{ +	unsigned int hash = 0; + +	while (*link_ref) +		hash = (*link_ref++) + (hash << 6) + (hash << 16) - hash; + +	return hash; +} + +// normalize reference:  collapse internal whitespace to single space, +// remove leading/trailing whitespace, case fold +static unsigned char *normalize_reference(chunk *ref) +{ +	strbuf normalized = GH_BUF_INIT; + +	utf8proc_case_fold(&normalized, ref->data, ref->len); +	strbuf_trim(&normalized); +	strbuf_normalize_whitespace(&normalized); + +	return strbuf_detach(&normalized); +} + +static void add_reference(reference_map *map, reference* ref) +{ +	ref->next = map->table[ref->hash % REFMAP_SIZE]; +	map->table[ref->hash % REFMAP_SIZE] = ref; +} + +extern reference *reference_create(reference_map *map, chunk *label, chunk *url, chunk *title) +{ +	reference *ref; +	ref = malloc(sizeof(reference)); +	ref->label = normalize_reference(label); +	ref->hash = refhash(ref->label); +	ref->url = clean_url(url); +	ref->title = clean_title(title); +	ref->next = NULL; + +	add_reference(map, ref); + +	return ref; +} + +// Returns reference if refmap contains a reference with matching +// label, otherwise NULL. +reference* reference_lookup(reference_map *map, chunk *label) +{ +	reference *ref = NULL; +	unsigned char *norm; +	unsigned int hash; +	 +	if (map == NULL) +		return NULL; +	 +	norm = normalize_reference(label); +	hash = refhash(norm); +	ref = map->table[hash % REFMAP_SIZE]; + +	while (ref) { +		if (ref->label[0] == norm[0] && +			!strcmp((char *)ref->label, (char *)norm)) +			break; +		ref = ref->next; +	} + +	free(norm); +	return ref; +} + +static void reference_free(reference *ref) +{ +	free(ref->label); +	free(ref->url); +	free(ref->title); +	free(ref); +} + +void reference_map_free(reference_map *map) +{ +	unsigned int i; + +	for (i = 0; i < REFMAP_SIZE; ++i) { +		reference *ref = map->table[i]; +		reference *next; + +		while (ref) { +			next = ref->next; +			reference_free(ref); +			ref = next; +		} +	} + +	free(map->table); +	free(map); +} + +reference_map *reference_map_new(void) +{ +	reference_map *map = malloc(sizeof(reference_map)); +	memset(map, 0x0, sizeof(reference_map)); +	return map; +} + diff --git a/src/references.h b/src/references.h new file mode 100644 index 0000000..78fffe7 --- /dev/null +++ b/src/references.h @@ -0,0 +1,27 @@ +#ifndef _REFERENCES_H_ +#define _REFERENCES_H_ + +#define REFMAP_SIZE 16 + +struct reference { +	struct reference *next; +	unsigned char *label; +	unsigned char *url; +	unsigned char *title; +	unsigned int hash; +}; + +typedef struct reference reference; + +struct reference_map { +	reference *table[REFMAP_SIZE]; +}; + +typedef struct reference_map reference_map; + +reference_map *reference_map_new(void); +void reference_map_free(reference_map *map); +reference* reference_lookup(reference_map *map, chunk *label); +extern reference *reference_create(reference_map *map, chunk *label, chunk *url, chunk *title); + +#endif @@ -5,7 +5,7 @@  #include <stdio.h>  #include "buffer.h"  #include "chunk.h" -#include "uthash.h" +#include "references.h"  #define VERSION "0.1"  #define CODE_INDENT 4 @@ -36,17 +36,7 @@ struct node_inl {  typedef struct node_inl node_inl; -struct reference { -  unsigned char *label; -  unsigned char *url; -  unsigned char *title; -  UT_hash_handle  hh; // used by uthash -}; - -typedef struct reference reference; -  // Types for blocks -  struct ListData {  	enum {  		bullet, @@ -104,7 +94,7 @@ struct node_block {  			int level;  		} header;  		struct { -			reference** refmap; +			reference_map *refmap;  		} document;  	} as; @@ -114,14 +104,10 @@ struct node_block {  typedef struct node_block node_block; -node_inl* parse_inlines(strbuf *input, reference** refmap); +node_inl* parse_inlines(strbuf *input, reference_map *refmap);  void free_inlines(node_inl* e); -int parse_reference(strbuf *input, reference** refmap); -void free_reference(reference *ref); -void free_reference_map(reference **refmap); - -void add_reference(reference** refmap, reference* ref); +int parse_reference_inline(strbuf *input, reference_map *refmap);  void unescape_buffer(strbuf *buf);  extern node_block* make_document(); @@ -138,4 +124,8 @@ void print_blocks(node_block* blk, int indent);  void blocks_to_html(strbuf *html, node_block *b, bool tight);  void inlines_to_html(strbuf *html, node_inl *b); +unsigned char *clean_url(chunk *url); +unsigned char *clean_autolink(chunk *url, int is_email); +unsigned char *clean_title(chunk *title); +  #endif @@ -25,7 +25,7 @@ static const int8_t utf8proc_utf8class[256] = {  static void encode_unknown(strbuf *buf)  { -	static const unsigned char repl[] = {239, 191, 189}; +	static const uint8_t repl[] = {239, 191, 189};  	strbuf_put(buf, repl, 3);  } @@ -52,9 +52,9 @@ ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len)  	return length;  } -void utf8proc_detab(strbuf *ob, const unsigned char *line, size_t size) +void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)  { -	static const unsigned char whitespace[] = "    "; +	static const uint8_t whitespace[] = "    ";  	size_t i = 0, tab = 0; @@ -132,7 +132,7 @@ ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst)  void utf8proc_encode_char(int32_t uc, strbuf *buf)  { -	unsigned char dst[4]; +	uint8_t dst[4];  	int len = 0;  	assert(uc >= 0); @@ -169,7 +169,7 @@ void utf8proc_encode_char(int32_t uc, strbuf *buf)  	strbuf_put(buf, dst, len);  } -void utf8proc_case_fold(strbuf *dest, const unsigned char *str, int len) +void utf8proc_case_fold(strbuf *dest, const uint8_t *str, int len)  {  	int32_t c; @@ -1,12 +1,13 @@  #ifndef _H_STMD_UTF8_  #define _H_STMD_UTF8_ +#include <stdint.h>  #include "buffer.h" -void utf8proc_case_fold(strbuf *dest, const unsigned char *str, int len); +void utf8proc_case_fold(strbuf *dest, const uint8_t *str, int len);  void utf8proc_encode_char(int32_t uc, strbuf *buf);  ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst);  ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len); -void utf8proc_detab(strbuf *dest, const unsigned char *line, size_t size); +void utf8proc_detab(strbuf *dest, const uint8_t *line, size_t size);  #endif  | 
