From 60c249ec24ab865c4a55759c7ffde2da99530b1d Mon Sep 17 00:00:00 2001 From: Hiltjo Posthuma Date: Wed, 30 Sep 2020 11:42:07 +0100 Subject: several improvements and more efficient xml parser --- xml.c | 36 +++---------------------- xml.h | 12 ++++----- xml2tsv.c | 93 +++++++++++++++++++++------------------------------------------ 3 files changed, 38 insertions(+), 103 deletions(-) diff --git a/xml.c b/xml.c index e6c27d7..67ad5a8 100644 --- a/xml.c +++ b/xml.c @@ -116,49 +116,19 @@ startvalue: static void xml_parsecomment(XMLParser *x) { - size_t datalen = 0, i = 0; + size_t i = 0; int c; - if (x->xmlcommentstart) - x->xmlcommentstart(x); while ((c = GETNEXT()) != EOF) { - if (c == '-' || c == '>') { - if (x->xmlcomment && datalen) { - x->data[datalen] = '\0'; - x->xmlcomment(x, x->data, datalen); - datalen = 0; - } - } - if (c == '-') { - if (++i > 2) { - if (x->xmlcomment) - for (; i > 2; i--) - x->xmlcomment(x, "-", 1); + if (++i > 2) i = 2; - } continue; } else if (c == '>' && i == 2) { - if (x->xmlcommentend) - x->xmlcommentend(x); return; } else if (i) { - if (x->xmlcomment) { - for (; i > 0; i--) - x->xmlcomment(x, "-", 1); - } i = 0; } - - if (datalen < sizeof(x->data) - 1) { - x->data[datalen++] = c; - } else { - x->data[datalen] = '\0'; - if (x->xmlcomment) - x->xmlcomment(x, x->data, datalen); - x->data[0] = c; - datalen = 1; - } } } @@ -286,7 +256,7 @@ numericentitytostr(const char *e, char *buf, size_t bufsiz) l = strtol(++e, &end, 16); else l = strtol(e, &end, 10); - /* invalid value or not a well-formed entity or invalid codepoint */ + /* invalid value or not a well-formed entity or invalid code point */ if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff) return -1; len = codepointtoutf8(l, buf); diff --git a/xml.h b/xml.h index 6ee18b4..a2742db 100644 --- a/xml.h +++ b/xml.h @@ -1,5 +1,5 @@ -#ifndef _XML_H -#define _XML_H +#ifndef _XML_H_ +#define _XML_H_ #include @@ -16,9 +16,6 @@ typedef struct xmlparser { void (*xmlcdatastart)(struct xmlparser *); void (*xmlcdata)(struct xmlparser *, const char *, size_t); void (*xmlcdataend)(struct xmlparser *); - void (*xmlcommentstart)(struct xmlparser *); - void (*xmlcomment)(struct xmlparser *, const char *, size_t); - void (*xmlcommentend)(struct xmlparser *); void (*xmldata)(struct xmlparser *, const char *, size_t); void (*xmldataend)(struct xmlparser *); void (*xmldataentity)(struct xmlparser *, const char *, size_t); @@ -29,8 +26,9 @@ typedef struct xmlparser { size_t, int); #ifndef GETNEXT - #define GETNEXT (x)->getnext - int (*getnext)(void); + /* GETNEXT overridden to reduce function call overhead and + further context optimizations. */ + #define GETNEXT getchar #endif /* current tag */ diff --git a/xml2tsv.c b/xml2tsv.c index 36aef3a..342d900 100644 --- a/xml2tsv.c +++ b/xml2tsv.c @@ -64,7 +64,7 @@ void stack_init(tstack_t *t){ /* utility functions */ /* quote_print: quote \\, \n, \t, and strip other ctrl chars */ -void quote_print(FILE *f, const char *s){ +void quote_print(const char *s){ const char *tmp = s; size_t len; int i; @@ -72,36 +72,45 @@ void quote_print(FILE *f, const char *s){ len = strcspn(tmp, "\\\n\t"); for(i=0; i 0){ - fprintf(f, "\\n"); + fputs("\\n", stdout); } tmp ++; break; case '\t': - fprintf(f, "\\t"); + fputs("\\t", stdout); tmp ++; break; case '\r': - fprintf(f, "\\r"); + fputs("\\r", stdout); tmp ++; break; case '\\': - fprintf(f, "\\\\"); + fputs("\\\\", stdout); tmp ++; break; } } } -void print_cur_str(FILE *f, tstack_t *t){ +void print_cur_str(tstack_t *t){ int i; for (i=0; i<=t->top; i++){ - fprintf(f, "/%s", t->st[i]); + putchar('/'); + fputs(t->st[i], stdout); + } +} + +void print_cur_str_fp(FILE *f, tstack_t *t){ + int i; + for (i=0; i<=t->top; i++){ + fputc('/', f); + fputs(t->st[i], f); } } @@ -110,13 +119,13 @@ void print_cur_str(FILE *f, tstack_t *t){ tstack_t st; char emitsep; -/* xml callbacks */ +/* XML callbacks */ void xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al, const char *v, size_t vl) { - printf("%s", v); + fputs(v, stdout); } void @@ -132,57 +141,34 @@ xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al, xmlattr(x, t, tl, a, al, v, vl); } -void -xmlattrend(XMLParser *x, const char *t, size_t tl, const char *a, size_t al) -{ -} - void xmlattrstart(XMLParser *x, const char *t, size_t tl, const char *a, size_t al) { - printf("%c%s%c", SEP, a, SATTR); + putchar(SEP); + fputs(a, stdout); + putchar(SATTR); } void xmlcdatastart(XMLParser *x) { - printf("%c", SEP); + putchar(SEP); } void xmlcdata(XMLParser *x, const char *d, size_t dl) { - quote_print(stdout, d); -} - -void -xmlcdataend(XMLParser *x) -{ -} - -void -xmlcommentstart(XMLParser *x) -{ -} - -void -xmlcomment(XMLParser *x, const char *c, size_t cl) -{ -} - -void -xmlcommentend(XMLParser *x) -{ + quote_print(d); } void xmldata(XMLParser *x, const char *d, size_t dl) { if (strcspn(d, " \t\n") && emitsep){ - printf("%c", SEP); + putchar(SEP); emitsep = FALSE; } - quote_print(stdout, d); + quote_print(d); } void @@ -220,12 +206,6 @@ xmltagend(XMLParser *x, const char *t, size_t tl, int isshort) if (strcmp(t, tag)){ fprintf(stderr, "Error: tag-end '%s' closes tag '%s'\n", t, tag); } - -/* if (isshort) { - printf("\n"); - print_cur_str(stdout, &st); - } -*/ } void @@ -235,13 +215,8 @@ xmltagstart(XMLParser *x, const char *t, size_t tl) fprintf(stderr, "Error: stack full. Ignoring tag '%s' (parent tag: '%s')\n", t, stack_peek(&st)); return; } - printf("\n"); - print_cur_str(stdout, &st); -} - -void -xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort) -{ + putchar('\n'); + print_cur_str(&st); } int @@ -252,30 +227,22 @@ main(void) XMLParser x = { 0 }; x.xmlattr = xmlattr; - x.xmlattrend = xmlattrend; x.xmlattrstart = xmlattrstart; x.xmlattrentity = xmlattrentity; x.xmlcdatastart = xmlcdatastart; x.xmlcdata = xmlcdata; - x.xmlcdataend = xmlcdataend; - x.xmlcommentstart = xmlcommentstart; - x.xmlcomment = xmlcomment; - x.xmlcommentend = xmlcommentend; x.xmldata = xmldata; x.xmldataend = xmldataend; x.xmldataentity = xmldataentity; x.xmldatastart = xmldatastart; x.xmltagend = xmltagend; x.xmltagstart = xmltagstart; - x.xmltagstartparsed = xmltagstartparsed; - - x.getnext = getchar; xml_parse(&x); - printf("\n"); + putchar('\n'); if (! stack_empty(&st)) { fprintf(stderr, "Error: tags still open at EOF: "); - print_cur_str(stderr, &st); + print_cur_str_fp(stderr, &st); fprintf(stderr, "\n"); } return 0; -- cgit v1.2.3