From 94a79a605f3e76a43f1f87a5044f6761b99e5ca5 Mon Sep 17 00:00:00 2001 From: Vicent Marti Date: Wed, 10 Sep 2014 18:33:27 +0200 Subject: Cleanup reference implementation --- Makefile | 2 +- src/blocks.c | 16 ++--- src/buffer.c | 43 ++++++++++++++ src/buffer.h | 2 + src/inlines.c | 176 +++++++------------------------------------------------ src/references.c | 109 ++++++++++++++++++++++++++++++++++ src/references.h | 27 +++++++++ src/stmd.h | 26 +++----- src/utf8.c | 10 ++-- src/utf8.h | 5 +- 10 files changed, 225 insertions(+), 191 deletions(-) create mode 100644 src/references.c create mode 100644 src/references.h diff --git a/Makefile b/Makefile index 5d13272..11e2141 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ benchjs: node js/bench.js ${BENCHINP} HTML_OBJ=$(SRCDIR)/html/html.o $(SRCDIR)/html/houdini_href_e.o $(SRCDIR)/html/houdini_html_e.o $(SRCDIR)/html/houdini_html_u.o -STMD_OBJ=$(SRCDIR)/inlines.o $(SRCDIR)/buffer.o $(SRCDIR)/blocks.o $(SRCDIR)/scanners.c $(SRCDIR)/print.o $(SRCDIR)/utf8.o +STMD_OBJ=$(SRCDIR)/inlines.o $(SRCDIR)/buffer.o $(SRCDIR)/blocks.o $(SRCDIR)/scanners.c $(SRCDIR)/print.o $(SRCDIR)/utf8.o $(SRCDIR)/references.c $(PROG): $(SRCDIR)/html/html_unescape.h $(SRCDIR)/case_fold_switch.inc $(HTML_OBJ) $(STMD_OBJ) $(SRCDIR)/main.c $(CC) $(LDFLAGS) -o $@ $(HTML_OBJ) $(STMD_OBJ) $(SRCDIR)/main.c diff --git a/src/blocks.c b/src/blocks.c index 72b2dc2..30a8284 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -8,7 +8,6 @@ #include "utf8.h" #include "html/houdini.h" #include "scanners.h" -#include "uthash.h" #define peek_at(i, n) (i)->data[n] @@ -36,12 +35,7 @@ static node_block* make_block(int tag, int start_line, int start_column) extern node_block* make_document() { node_block *e = make_block(BLOCK_DOCUMENT, 1, 1); - reference *map = NULL; - reference ** refmap; - - refmap = (reference**) malloc(sizeof(reference*)); - *refmap = map; - e->as.document.refmap = refmap; + e->as.document.refmap = reference_map_new(); e->top = e; return e; @@ 
-164,7 +158,7 @@ static void finalize(node_block* b, int line_number) case BLOCK_PARAGRAPH: pos = 0; while (strbuf_at(&b->string_content, 0) == '[' && - (pos = parse_reference(&b->string_content, b->top->as.document.refmap))) { + (pos = parse_reference_inline(&b->string_content, b->top->as.document.refmap))) { strbuf_drop(&b->string_content, pos); } @@ -192,7 +186,7 @@ static void finalize(node_block* b, int line_number) strbuf_drop(&b->string_content, firstlinelen + 1); strbuf_trim(&b->as.code.info); - unescape_buffer(&b->as.code.info); + strbuf_unescape(&b->as.code.info); break; case BLOCK_LIST: // determine tight/loose status @@ -268,7 +262,7 @@ extern void free_blocks(node_block* e) if (e->tag == BLOCK_FENCED_CODE) { strbuf_free(&e->as.code.info); } else if (e->tag == BLOCK_DOCUMENT) { - free_reference_map(e->as.document.refmap); + reference_map_free(e->as.document.refmap); } free_blocks(e->children); free(e); @@ -278,7 +272,7 @@ extern void free_blocks(node_block* e) // Walk through node_block and all children, recursively, parsing // string content into inline content where appropriate. -void process_inlines(node_block* cur, reference** refmap) +void process_inlines(node_block* cur, reference_map *refmap) { switch (cur->tag) { case BLOCK_PARAGRAPH: diff --git a/src/buffer.c b/src/buffer.c index 90c2186..cdf8ca0 100644 --- a/src/buffer.c +++ b/src/buffer.c @@ -308,3 +308,46 @@ void strbuf_trim(strbuf *buf) buf->ptr[buf->size] = '\0'; } + +// Destructively modify string, collapsing consecutive +// space and newline characters into a single space. 
+void strbuf_normalize_whitespace(strbuf *s) +{ + bool last_char_was_space = false; + int r, w; + + for (r = 0, w = 0; r < s->size; ++r) { + switch (s->ptr[r]) { + case ' ': + case '\n': + if (last_char_was_space) + break; + + s->ptr[w++] = ' '; + last_char_was_space = true; + break; + + default: + s->ptr[w++] = s->ptr[r]; + last_char_was_space = false; + } + } + + strbuf_truncate(s, w); +} + +// Destructively unescape a string: remove backslashes before punctuation chars. +extern void strbuf_unescape(strbuf *buf) +{ + int r, w; + + for (r = 0, w = 0; r < buf->size; ++r) { + if (buf->ptr[r] == '\\' && ispunct(buf->ptr[r + 1])) + r++; /* skip only the backslash; the escaped char is copied below and is never re-read as the start of another escape */ + + buf->ptr[w++] = buf->ptr[r]; + } + + strbuf_truncate(buf, w); +} + diff --git a/src/buffer.h b/src/buffer.h index 6f45cbb..1bc1eee 100644 --- a/src/buffer.h +++ b/src/buffer.h @@ -108,5 +108,7 @@ int strbuf_strrchr(const strbuf *buf, int c, int pos); void strbuf_drop(strbuf *buf, int n); void strbuf_truncate(strbuf *buf, int len); void strbuf_trim(strbuf *buf); +void strbuf_normalize_whitespace(strbuf *s); +void strbuf_unescape(strbuf *s); #endif diff --git a/src/inlines.c b/src/inlines.c index aa0e13e..3040f09 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -7,110 +7,23 @@ #include "stmd.h" #include "html/houdini.h" #include "utf8.h" -#include "uthash.h" #include "scanners.h" typedef struct Subject { chunk input; int pos; - int label_nestlevel; - reference** reference_map; + int label_nestlevel; + reference_map *refmap; } subject; -reference* lookup_reference(reference** refmap, chunk *label); -reference* make_reference(chunk *label, chunk *url, chunk *title); - -static unsigned char *clean_url(chunk *url); -static unsigned char *clean_title(chunk *title); -static unsigned char *clean_autolink(chunk *url, int is_email); - -inline static void chunk_free(chunk *c); -inline static void chunk_trim(chunk *c); - -inline static chunk chunk_literal(const char *data); -inline static chunk chunk_buf_detach(strbuf *buf); -inline 
static chunk chunk_dup(const chunk *ch, int pos, int len); - -static node_inl *parse_chunk_inlines(chunk *chunk, reference** refmap); +static node_inl *parse_chunk_inlines(chunk *chunk, reference_map *refmap); static node_inl *parse_inlines_while(subject* subj, int (*f)(subject*)); static int parse_inline(subject* subj, node_inl ** last); -static void subject_from_chunk(subject *e, chunk *chunk, reference** refmap); -static void subject_from_buf(subject *e, strbuf *buffer, reference** refmap); +static void subject_from_chunk(subject *e, chunk *chunk, reference_map *refmap); +static void subject_from_buf(subject *e, strbuf *buffer, reference_map *refmap); static int subject_find_special_char(subject *subj); -static void normalize_whitespace(strbuf *s); - -extern void free_reference(reference *ref) { - free(ref->label); - free(ref->url); - free(ref->title); - free(ref); -} - -extern void free_reference_map(reference **refmap) { - /* free the hash table contents */ - reference *s; - reference *tmp; - if (refmap != NULL) { - HASH_ITER(hh, *refmap, s, tmp) { - HASH_DEL(*refmap, s); - free_reference(s); - } - free(refmap); - } -} - -// normalize reference: collapse internal whitespace to single space, -// remove leading/trailing whitespace, case fold -static unsigned char *normalize_reference(chunk *ref) -{ - strbuf normalized = GH_BUF_INIT; - - utf8proc_case_fold(&normalized, ref->data, ref->len); - strbuf_trim(&normalized); - normalize_whitespace(&normalized); - - return strbuf_detach(&normalized); -} - -// Returns reference if refmap contains a reference with matching -// label, otherwise NULL. 
-extern reference* lookup_reference(reference** refmap, chunk *label) -{ - reference *ref = NULL; - unsigned char *norm = normalize_reference(label); - if (refmap != NULL) { - HASH_FIND_STR(*refmap, (char*)norm, ref); - } - free(norm); - return ref; -} - -extern reference* make_reference(chunk *label, chunk *url, chunk *title) -{ - reference *ref; - ref = malloc(sizeof(reference)); - ref->label = normalize_reference(label); - ref->url = clean_url(url); - ref->title = clean_title(title); - return ref; -} - -extern void add_reference(reference** refmap, reference* ref) -{ - reference * t = NULL; - const char *label = (const char *)ref->label; - - HASH_FIND(hh, *refmap, label, strlen(label), t); - - if (t == NULL) { - HASH_ADD_KEYPTR(hh, *refmap, label, strlen(label), ref); - } else { - free_reference(ref); // we free this now since it won't be in the refmap - } -} - static unsigned char *bufdup(const unsigned char *buf) { unsigned char *new = NULL; @@ -236,26 +149,26 @@ inline static node_inl* append_inlines(node_inl* a, node_inl* b) return a; } -static void subject_from_buf(subject *e, strbuf *buffer, reference** refmap) +static void subject_from_buf(subject *e, strbuf *buffer, reference_map *refmap) { e->input.data = buffer->ptr; e->input.len = buffer->size; e->input.alloc = 0; e->pos = 0; e->label_nestlevel = 0; - e->reference_map = refmap; + e->refmap = refmap; chunk_rtrim(&e->input); } -static void subject_from_chunk(subject *e, chunk *chunk, reference** refmap) +static void subject_from_chunk(subject *e, chunk *chunk, reference_map *refmap) { e->input.data = chunk->data; e->input.len = chunk->len; e->input.alloc = 0; e->pos = 0; e->label_nestlevel = 0; - e->reference_map = refmap; + e->refmap = refmap; chunk_rtrim(&e->input); } @@ -325,33 +238,6 @@ static int scan_to_closing_backticks(subject* subj, int openticklength) return (subj->pos); } -// Destructively modify string, collapsing consecutive -// space and newline characters into a single space. 
-static void normalize_whitespace(strbuf *s) -{ - bool last_char_was_space = false; - int r, w; - - for (r = 0, w = 0; r < s->size; ++r) { - switch (s->ptr[r]) { - case ' ': - case '\n': - if (last_char_was_space) - break; - - s->ptr[w++] = ' '; - last_char_was_space = true; - break; - - default: - s->ptr[w++] = s->ptr[r]; - last_char_was_space = false; - } - } - - strbuf_truncate(s, w); -} - // Parse backtick code section or raw backticks, return an inline. // Assumes that the subject has a backtick at the current position. static node_inl* handle_backticks(subject *subj) @@ -368,7 +254,7 @@ static node_inl* handle_backticks(subject *subj) strbuf_set(&buf, subj->input.data + startpos, endpos - startpos - openticks.len); strbuf_trim(&buf); - normalize_whitespace(&buf); + strbuf_normalize_whitespace(&buf); return make_code(chunk_buf_detach(&buf)); } @@ -575,24 +461,9 @@ static node_inl *make_str_with_entities(chunk *content) } } -// Destructively unescape a string: remove backslashes before punctuation chars. -extern void unescape_buffer(strbuf *buf) -{ - int r, w; - - for (r = 0, w = 0; r < buf->size; ++r) { - if (buf->ptr[r] == '\\' && ispunct(buf->ptr[r + 1])) - continue; - - buf->ptr[w++] = buf->ptr[r]; - } - - strbuf_truncate(buf, w); -} - // Clean a URL: remove surrounding whitespace and surrounding <>, // and remove \ that escape punctuation. -static unsigned char *clean_url(chunk *url) +unsigned char *clean_url(chunk *url) { strbuf buf = GH_BUF_INIT; @@ -607,11 +478,11 @@ static unsigned char *clean_url(chunk *url) houdini_unescape_html_f(&buf, url->data, url->len); } - unescape_buffer(&buf); + strbuf_unescape(&buf); return strbuf_detach(&buf); } -static unsigned char *clean_autolink(chunk *url, int is_email) +unsigned char *clean_autolink(chunk *url, int is_email) { strbuf buf = GH_BUF_INIT; @@ -628,7 +499,7 @@ static unsigned char *clean_autolink(chunk *url, int is_email) } // Clean a title: remove surrounding quotes and remove \ that escape punctuation. 
-static unsigned char *clean_title(chunk *title) +unsigned char *clean_title(chunk *title) { strbuf buf = GH_BUF_INIT; unsigned char first, last; @@ -648,7 +519,7 @@ static unsigned char *clean_title(chunk *title) houdini_unescape_html_f(&buf, title->data, title->len); } - unescape_buffer(&buf); + strbuf_unescape(&buf); return strbuf_detach(&buf); } @@ -810,7 +681,7 @@ static node_inl* handle_left_bracket(subject* subj) } else { // if we get here, we matched a label but didn't get further: subj->pos = endlabel; - lab = parse_chunk_inlines(&rawlabel, subj->reference_map); + lab = parse_chunk_inlines(&rawlabel, subj->refmap); result = append_inlines(make_str(chunk_literal("[")), append_inlines(lab, make_str(chunk_literal("]")))); @@ -834,13 +705,13 @@ static node_inl* handle_left_bracket(subject* subj) } // lookup rawlabel in subject->reference_map: - ref = lookup_reference(subj->reference_map, &reflabel); + ref = reference_lookup(subj->refmap, &reflabel); if (ref != NULL) { // found lab = parse_chunk_inlines(&rawlabel, NULL); result = make_ref_link(lab, ref); } else { subj->pos = endlabel; - lab = parse_chunk_inlines(&rawlabel, subj->reference_map); + lab = parse_chunk_inlines(&rawlabel, subj->refmap); result = append_inlines(make_str(chunk_literal("[")), append_inlines(lab, make_str(chunk_literal("]")))); } @@ -887,7 +758,7 @@ extern node_inl* parse_inlines_while(subject* subj, int (*f)(subject*)) return result; } -node_inl *parse_chunk_inlines(chunk *chunk, reference** refmap) +node_inl *parse_chunk_inlines(chunk *chunk, reference_map *refmap) { subject subj; subject_from_chunk(&subj, chunk, refmap); @@ -987,7 +858,7 @@ static int parse_inline(subject* subj, node_inl ** last) return 1; } -extern node_inl* parse_inlines(strbuf *input, reference** refmap) +extern node_inl* parse_inlines(strbuf *input, reference_map *refmap) { subject subj; subject_from_buf(&subj, input, refmap); @@ -1009,7 +880,7 @@ void spnl(subject* subj) // Modify refmap if a reference is 
encountered. // Return 0 if no reference found, otherwise position of subject // after reference is parsed. -extern int parse_reference(strbuf *input, reference** refmap) +int parse_reference_inline(strbuf *input, reference_map *refmap) { subject subj; @@ -1019,7 +890,6 @@ extern int parse_reference(strbuf *input, reference** refmap) int matchlen = 0; int beforetitle; - reference *new = NULL; subject_from_buf(&subj, input, NULL); @@ -1065,9 +935,7 @@ extern int parse_reference(strbuf *input, reference** refmap) return 0; } // insert reference into refmap - new = make_reference(&lab, &url, &title); - add_reference(refmap, new); - + reference_create(refmap, &lab, &url, &title); return subj.pos; } diff --git a/src/references.c b/src/references.c new file mode 100644 index 0000000..ff64b00 --- /dev/null +++ b/src/references.c @@ -0,0 +1,116 @@ +#include "stmd.h" +#include "utf8.h" +#include "references.h" + +static unsigned int +refhash(const unsigned char *link_ref) +{ + unsigned int hash = 0; + + while (*link_ref) + hash = (*link_ref++) + (hash << 6) + (hash << 16) - hash; + + return hash; +} + +// normalize reference: collapse internal whitespace to single space, +// remove leading/trailing whitespace, case fold +static unsigned char *normalize_reference(chunk *ref) +{ + strbuf normalized = GH_BUF_INIT; + + utf8proc_case_fold(&normalized, ref->data, ref->len); + strbuf_trim(&normalized); + strbuf_normalize_whitespace(&normalized); + + return strbuf_detach(&normalized); +} + +static void add_reference(reference_map *map, reference* ref) +{ + /* Append at the tail of the bucket chain: reference_lookup() scans from + * the head, so the first definition of a label keeps precedence. */ + reference **slot = &map->table[ref->hash % REFMAP_SIZE]; + + while (*slot) + slot = &(*slot)->next; + + ref->next = NULL; + *slot = ref; +} + +extern reference *reference_create(reference_map *map, chunk *label, chunk *url, chunk *title) +{ + reference *ref; + ref = malloc(sizeof(reference)); + ref->label = normalize_reference(label); + ref->hash = refhash(ref->label); + ref->url = clean_url(url); + ref->title = clean_title(title); + ref->next = NULL; + + 
add_reference(map, ref); + + return ref; +} + +// Returns reference if refmap contains a reference with matching +// label, otherwise NULL. +reference* reference_lookup(reference_map *map, chunk *label) +{ + reference *ref = NULL; + unsigned char *norm; + unsigned int hash; + + if (map == NULL) + return NULL; + + norm = normalize_reference(label); + hash = refhash(norm); + ref = map->table[hash % REFMAP_SIZE]; + + while (ref) { + if (ref->label[0] == norm[0] && + !strcmp((char *)ref->label, (char *)norm)) + break; + ref = ref->next; + } + + free(norm); + return ref; +} + +static void reference_free(reference *ref) +{ + free(ref->label); + free(ref->url); + free(ref->title); + free(ref); +} + +void reference_map_free(reference_map *map) +{ + unsigned int i; + + for (i = 0; i < REFMAP_SIZE; ++i) { + reference *ref = map->table[i]; + reference *next; + + while (ref) { + next = ref->next; + reference_free(ref); + ref = next; + } + } + + /* table[] is an array embedded in *map, not a separate allocation: it is released by the free(map) below */ + free(map); +} + +reference_map *reference_map_new(void) +{ + reference_map *map = calloc(1, sizeof(reference_map)); + /* calloc zero-fills the map so every bucket starts out NULL; the result is NULL if allocation fails */ + return map; +} + diff --git a/src/references.h b/src/references.h new file mode 100644 index 0000000..78fffe7 --- /dev/null +++ b/src/references.h @@ -0,0 +1,27 @@ +#ifndef _REFERENCES_H_ +#define _REFERENCES_H_ + +#define REFMAP_SIZE 16 + +struct reference { + struct reference *next; + unsigned char *label; + unsigned char *url; + unsigned char *title; + unsigned int hash; +}; + +typedef struct reference reference; + +struct reference_map { + reference *table[REFMAP_SIZE]; +}; + +typedef struct reference_map reference_map; + +reference_map *reference_map_new(void); +void reference_map_free(reference_map *map); +reference* reference_lookup(reference_map *map, chunk *label); +extern reference *reference_create(reference_map *map, chunk *label, chunk *url, chunk *title); + +#endif diff --git a/src/stmd.h b/src/stmd.h index 21a86b0..4e21e6c 100644 --- a/src/stmd.h 
+++ b/src/stmd.h @@ -5,7 +5,7 @@ #include #include "buffer.h" #include "chunk.h" -#include "uthash.h" +#include "references.h" #define VERSION "0.1" #define CODE_INDENT 4 @@ -36,17 +36,7 @@ struct node_inl { typedef struct node_inl node_inl; -struct reference { - unsigned char *label; - unsigned char *url; - unsigned char *title; - UT_hash_handle hh; // used by uthash -}; - -typedef struct reference reference; - // Types for blocks - struct ListData { enum { bullet, @@ -104,7 +94,7 @@ struct node_block { int level; } header; struct { - reference** refmap; + reference_map *refmap; } document; } as; @@ -114,14 +104,10 @@ struct node_block { typedef struct node_block node_block; -node_inl* parse_inlines(strbuf *input, reference** refmap); +node_inl* parse_inlines(strbuf *input, reference_map *refmap); void free_inlines(node_inl* e); -int parse_reference(strbuf *input, reference** refmap); -void free_reference(reference *ref); -void free_reference_map(reference **refmap); - -void add_reference(reference** refmap, reference* ref); +int parse_reference_inline(strbuf *input, reference_map *refmap); void unescape_buffer(strbuf *buf); extern node_block* make_document(); @@ -138,4 +124,8 @@ void print_blocks(node_block* blk, int indent); void blocks_to_html(strbuf *html, node_block *b, bool tight); void inlines_to_html(strbuf *html, node_inl *b); +unsigned char *clean_url(chunk *url); +unsigned char *clean_autolink(chunk *url, int is_email); +unsigned char *clean_title(chunk *title); + #endif diff --git a/src/utf8.c b/src/utf8.c index 12d7ba5..c65aec6 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -25,7 +25,7 @@ static const int8_t utf8proc_utf8class[256] = { static void encode_unknown(strbuf *buf) { - static const unsigned char repl[] = {239, 191, 189}; + static const uint8_t repl[] = {239, 191, 189}; strbuf_put(buf, repl, 3); } @@ -52,9 +52,9 @@ ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len) return length; } -void utf8proc_detab(strbuf *ob, const unsigned char 
*line, size_t size) +void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size) { - static const unsigned char whitespace[] = " "; + static const uint8_t whitespace[] = " "; size_t i = 0, tab = 0; @@ -132,7 +132,7 @@ ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst) void utf8proc_encode_char(int32_t uc, strbuf *buf) { - unsigned char dst[4]; + uint8_t dst[4]; int len = 0; assert(uc >= 0); @@ -169,7 +169,7 @@ void utf8proc_encode_char(int32_t uc, strbuf *buf) strbuf_put(buf, dst, len); } -void utf8proc_case_fold(strbuf *dest, const unsigned char *str, int len) +void utf8proc_case_fold(strbuf *dest, const uint8_t *str, int len) { int32_t c; diff --git a/src/utf8.h b/src/utf8.h index 1e4e556..9506b75 100644 --- a/src/utf8.h +++ b/src/utf8.h @@ -1,12 +1,13 @@ #ifndef _H_STMD_UTF8_ #define _H_STMD_UTF8_ +#include <stdint.h> #include "buffer.h" -void utf8proc_case_fold(strbuf *dest, const unsigned char *str, int len); +void utf8proc_case_fold(strbuf *dest, const uint8_t *str, int len); void utf8proc_encode_char(int32_t uc, strbuf *buf); ssize_t utf8proc_iterate(const uint8_t *str, ssize_t str_len, int32_t *dst); ssize_t utf8proc_charlen(const uint8_t *str, ssize_t str_len); -void utf8proc_detab(strbuf *dest, const unsigned char *line, size_t size); +void utf8proc_detab(strbuf *dest, const uint8_t *line, size_t size); #endif -- cgit v1.2.3