summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: John MacFarlane <jgm@berkeley.edu> 2015-06-11 14:32:22 -0700
committer: John MacFarlane <jgm@berkeley.edu> 2015-06-16 17:47:19 -0700
commit: 7f491b0bdf8e206458d284938efa8a0890c9d352 (patch)
tree: d102cc7abad2628771a26dc35a26a577f0f524cd
parent: ef77d908553bfdd37b83ae4832d7e6ff36874f24 (diff)
Preliminary changes for new tab handling.
We no longer preprocess tabs to spaces before parsing. Instead, we keep track of both the byte offset and the (virtual) column as we parse block starts. This allows us to handle tabs without converting to spaces first. Tabs are left as tabs in the output. Added `column` and `first_nonspace_column` fields to `parser`. Added utility function to advance the offset, computing the virtual column too. Note that we don't need to deal with UTF-8 here at all. Only ASCII occurs in block starts. Significant performance improvement due to the fact that we're not doing UTF-8 validation -- though we might want to add that back in.
-rw-r--r-- src/blocks.c 92
-rw-r--r-- src/parser.h 2
2 files changed, 71 insertions, 23 deletions
diff --git a/src/blocks.c b/src/blocks.c
index 8c3e18c..06f6dcb 100644
--- a/src/blocks.c
+++ b/src/blocks.c
@@ -16,6 +16,8 @@
#include "debug.h"
#define CODE_INDENT 4
+#define TAB_STOP 4
+
#define peek_at(i, n) (i)->data[n]
static inline bool
@@ -70,7 +72,9 @@ cmark_parser *cmark_parser_new(int options)
parser->current = document;
parser->line_number = 0;
parser->offset = 0;
+ parser->column = 0;
parser->first_nonspace = 0;
+ parser->first_nonspace_column = 0;
parser->indent = 0;
parser->blank = false;
parser->curline = line;
@@ -555,16 +559,53 @@ static void chop_trailing_hashtags(cmark_chunk *ch)
static void
S_find_first_nonspace(cmark_parser *parser, cmark_chunk *input)
{
+ char c;
+ int chars_to_tab = TAB_STOP - (parser->column % TAB_STOP);
+
parser->first_nonspace = parser->offset;
- while (peek_at(input, parser->first_nonspace) == ' ') {
- parser->first_nonspace++;
+ parser->first_nonspace_column = parser->column;
+ while ((c = peek_at(input, parser->first_nonspace))) {
+ if (c == ' ') {
+ parser->first_nonspace += 1;
+ parser->first_nonspace_column += 1;
+ chars_to_tab = chars_to_tab - 1;
+ if (chars_to_tab == 0) {
+ chars_to_tab = TAB_STOP;
+ }
+ } else if (c == '\t') {
+ parser->first_nonspace += 1;
+ parser->first_nonspace_column += chars_to_tab;
+ chars_to_tab = TAB_STOP;
+ } else {
+ break;
+ }
}
- parser->indent = parser->first_nonspace - parser->offset;
+ parser->indent = parser->first_nonspace_column - parser->column;
parser->blank = S_is_line_end_char(peek_at(input, parser->first_nonspace));
}
static void
+S_advance_offset(cmark_parser *parser, cmark_chunk *input, bufsize_t count, bool columns)
+{
+ char c;
+ int chars_to_tab;
+ while (count > 0 && (c = peek_at(input, parser->offset))) {
+ if (c == '\t') {
+ chars_to_tab = 4 - (parser->column % TAB_STOP);
+ parser->column += chars_to_tab;
+ parser->offset += 1;
+ count -= (columns ? chars_to_tab : 1);
+ } else {
+ parser->offset += 1;
+ parser->column += 1; // assume ascii; block starts are ascii
+ count -= 1;
+ }
+ }
+}
+
+
+static void
S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t bytes)
{
cmark_node* last_matched_container;
@@ -578,8 +619,9 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
cmark_chunk input;
bool maybe_lazy;
- utf8proc_detab(parser->curline, buffer, bytes);
+ cmark_strbuf_put(parser->curline, buffer, bytes);
parser->offset = 0;
+ parser->column = 0;
parser->blank = false;
input.data = parser->curline->ptr;
@@ -601,7 +643,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
if (container->type == NODE_BLOCK_QUOTE) {
matched = parser->indent <= 3 && peek_at(&input, parser->first_nonspace) == '>';
if (matched) {
- parser->offset = parser->first_nonspace + 1;
+ S_advance_offset(parser, &input, parser->indent + 1, true);
if (peek_at(&input, parser->offset) == ' ')
parser->offset++;
} else {
@@ -609,13 +651,14 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
}
} else if (container->type == NODE_ITEM) {
-
if (parser->indent >= container->as.list.marker_offset +
container->as.list.padding) {
- parser->offset += container->as.list.marker_offset +
- container->as.list.padding;
+ S_advance_offset(parser, &input,
+ container->as.list.marker_offset +
+ container->as.list.padding, true);
} else if (parser->blank) {
- parser->offset = parser->first_nonspace;
+ S_advance_offset(parser, &input,
+ parser->first_nonspace - parser->offset, false);
} else {
all_matched = false;
}
@@ -624,9 +667,11 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
if (!container->as.code.fenced) { // indented
if (parser->indent >= CODE_INDENT) {
- parser->offset += CODE_INDENT;
+ S_advance_offset(parser, &input, CODE_INDENT, true);
} else if (parser->blank) {
- parser->offset = parser->first_nonspace;
+ S_advance_offset(parser, &input,
+ parser->first_nonspace - parser->offset,
+ false);
} else {
all_matched = false;
}
@@ -642,7 +687,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
// closing fence - and since we're at
// the end of a line, we can return:
all_matched = false;
- parser->offset += matched;
+ S_advance_offset(parser, &input, matched, false);
parser->current = finalize(parser, container);
goto finished;
} else {
@@ -650,7 +695,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
i = container->as.code.fence_offset;
while (i > 0 &&
peek_at(&input, parser->offset) == ' ') {
- parser->offset++;
+ S_advance_offset(parser, &input, 1, false);
i--;
}
}
@@ -697,15 +742,16 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
if (!indented && peek_at(&input, parser->first_nonspace) == '>') {
- parser->offset = parser->first_nonspace + 1;
+ S_advance_offset(parser, &input, parser->first_nonspace + 1 - parser->offset, false);
// optional following character
if (peek_at(&input, parser->offset) == ' ')
- parser->offset++;
+ S_advance_offset(parser, &input, 1, false);
container = add_child(parser, container, NODE_BLOCK_QUOTE, parser->offset + 1);
} else if (!indented && (matched = scan_atx_header_start(&input, parser->first_nonspace))) {
- parser->offset = parser->first_nonspace + matched;
+ S_advance_offset(parser, &input,
+ parser->first_nonspace + matched - parser->offset, false);
container = add_child(parser, container, NODE_HEADER, parser->offset + 1);
bufsize_t hashpos = cmark_chunk_strchr(&input, '#', parser->first_nonspace);
@@ -726,7 +772,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
container->as.code.fence_length = matched;
container->as.code.fence_offset = parser->first_nonspace - parser->offset;
container->as.code.info = cmark_chunk_literal("");
- parser->offset = parser->first_nonspace + matched;
+ S_advance_offset(parser, &input, parser->first_nonspace + matched - parser->offset, false);
} else if (!indented && (matched = scan_html_block_tag(&input, parser->first_nonspace))) {
@@ -743,7 +789,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
container->type = NODE_HEADER;
container->as.header.level = lev;
container->as.header.setext = true;
- parser->offset = input.len - 1;
+ S_advance_offset(parser, &input, input.len - 1 - parser->offset, false);
} else if (!indented &&
!(container->type == NODE_PARAGRAPH &&
@@ -753,7 +799,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
// it's only now that we know the line is not part of a setext header:
container = add_child(parser, container, NODE_HRULE, parser->first_nonspace + 1);
container = finalize(parser, container);
- parser->offset = input.len - 1;
+ S_advance_offset(parser, &input, input.len - 1 - parser->offset, false);
} else if ((matched = parse_list_marker(&input, parser->first_nonspace, &data)) &&
(!indented || container->type == NODE_LIST)) {
@@ -761,7 +807,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
// spaces indent, as long as the list container is still open.
// compute padding:
- parser->offset = parser->first_nonspace + matched;
+ S_advance_offset(parser, &input, parser->first_nonspace + matched - parser->offset, false);
i = 0;
while (i <= 5 && peek_at(&input, parser->offset + i) == ' ') {
i++;
@@ -771,11 +817,11 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
S_is_line_end_char(peek_at(&input, parser->offset))) {
data->padding = matched + 1;
if (i > 0) {
- parser->offset += 1;
+ S_advance_offset(parser, &input, 1, false);
}
} else {
data->padding = matched + i;
- parser->offset += i;
+ S_advance_offset(parser, &input, i, true);
}
// check container; if it's a list, see if this list item
@@ -799,7 +845,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t byte
free(data);
} else if (indented && !maybe_lazy && !parser->blank) {
- parser->offset += CODE_INDENT;
+ S_advance_offset(parser, &input, CODE_INDENT, true);
container = add_child(parser, container, NODE_CODE_BLOCK, parser->offset + 1);
container->as.code.fenced = false;
container->as.code.fence_char = 0;
diff --git a/src/parser.h b/src/parser.h
index 6e18c67..01a7aeb 100644
--- a/src/parser.h
+++ b/src/parser.h
@@ -17,7 +17,9 @@ struct cmark_parser {
struct cmark_node* current;
int line_number;
bufsize_t offset;
+ bufsize_t column;
bufsize_t first_nonspace;
+ bufsize_t first_nonspace_column;
int indent;
bool blank;
cmark_strbuf *curline;