From 86fda06897ccd4d610410f920923c6e1f3e2bf3d Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 29 Dec 2014 22:15:09 -0800 Subject: Added cmark_ctype.h with locale-independent isspace, ispunct, etc. Otherwise cmark's behavior varies unpredictably with the locale. `is_punctuation` in utf8.h has also been adjusted so that all ASCII symbol characters count as punctuation, even though some are not in P* character classes. --- src/CMakeLists.txt | 2 ++ src/blocks.c | 2 +- src/buffer.c | 2 +- src/chunk.h | 2 +- src/cmark_ctype.c | 33 +++++++++++++++++++++++++++++++++ src/cmark_ctype.h | 11 +++++++++++ src/inlines.c | 2 +- src/utf8.c | 13 ++----------- 8 files changed, 52 insertions(+), 15 deletions(-) create mode 100644 src/cmark_ctype.c create mode 100644 src/cmark_ctype.h (limited to 'src') diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4272234..87651bc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -15,6 +15,7 @@ set(HEADERS inlines.h html_unescape.h houdini.h + cmark_ctype.h ) set(LIBRARY_SOURCES cmark.c @@ -34,6 +35,7 @@ set(LIBRARY_SOURCES houdini_href_e.c houdini_html_e.c houdini_html_u.c + cmark_ctype.c ${HEADERS} ) diff --git a/src/blocks.c b/src/blocks.c index 319706d..b3ea362 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -1,8 +1,8 @@ #include #include #include -#include +#include "cmark_ctype.h" #include "config.h" #include "parser.h" #include "cmark.h" diff --git a/src/buffer.c b/src/buffer.c index 73a9728..40e8674 100644 --- a/src/buffer.c +++ b/src/buffer.c @@ -1,11 +1,11 @@ #include -#include #include #include #include #include #include +#include "cmark_ctype.h" #include "buffer.h" /* Used as default value for cmark_strbuf->ptr so that people can always diff --git a/src/chunk.h b/src/chunk.h index 0f48791..ba6c89e 100644 --- a/src/chunk.h +++ b/src/chunk.h @@ -2,9 +2,9 @@ #define CMARK_CHUNK_H #include -#include #include #include +#include "cmark_ctype.h" #include "buffer.h" typedef struct { diff 
--git a/src/cmark_ctype.c b/src/cmark_ctype.c new file mode 100644 index 0000000..9ed4b5c --- /dev/null +++ b/src/cmark_ctype.c @@ -0,0 +1,33 @@ +/** + * Returns 1 if c is a "whitespace" character as defined by the spec. + */ +int isspace(char c) +{ + return (c == 0x09 || + c == 0x20 || + c == 0x0a || + c == 0x0d); +} + +/** + * Returns 1 if c is an ascii punctuation character. + */ +int ispunct(char c) +{ + return ((c >= 33 && c <= 47) || + (c >= 58 && c <= 64) || + (c >= 91 && c <= 96) || + (c >= 123 && c <= 126)); +} + +int isalnum(char c) +{ + return ((c >= 48 && c <= 57) || + (c >= 65 && c <= 90) || + (c >= 97 && c <= 122)); +} + +int isdigit(char c) +{ + return (c >= 48 && c <= 57); +} diff --git a/src/cmark_ctype.h b/src/cmark_ctype.h new file mode 100644 index 0000000..afc605e --- /dev/null +++ b/src/cmark_ctype.h @@ -0,0 +1,11 @@ +/** Locale-independent versions of functions from ctype.h. + * We want cmark to behave the same no matter what the system locale. + */ + +int isspace(char c); + +int ispunct(char c); + +int isalnum(char c); + +int isdigit(char c); diff --git a/src/inlines.c b/src/inlines.c index 8235f59..9d2d7f8 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -1,8 +1,8 @@ #include #include #include -#include +#include "cmark_ctype.h" #include "config.h" #include "node.h" #include "parser.h" diff --git a/src/utf8.c b/src/utf8.c index 8e3c4bb..50d8834 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -2,6 +2,7 @@ #include #include +#include "cmark_ctype.h" #include "utf8.h" static const int8_t utf8proc_utf8class[256] = { @@ -268,17 +269,7 @@ int utf8proc_is_space(int32_t uc) // matches anything in the P[cdefios] classes. 
int utf8proc_is_punctuation(int32_t uc) { - return ((uc >= 33 && uc <= 35) || - (uc >= 37 && uc <= 42) || - (uc >= 44 && uc <= 47) || - uc == 58 || - uc == 59 || - uc == 63 || - uc == 64 || - (uc >= 91 && uc <= 93) || - uc == 95 || - uc == 123 || - uc == 125 || + return ((uc < 128 && ispunct((char)uc)) || uc == 161 || uc == 167 || uc == 171 || -- cgit v1.2.3