# HG changeset patch # User Matti Hamalainen # Date 1579921900 -7200 # Node ID b897995101b7d510e03854bf1120269f1ec93e48 # Parent 562de49f8b4b93ac10d8492c1c72348cc91e3e66 More fiddling and twiddling. Add parsing to string nodes instead of separate character nodes. diff -r 562de49f8b4b -r b897995101b7 tests.c --- a/tests.c Fri Jan 24 15:10:19 2020 +0200 +++ b/tests.c Sat Jan 25 05:11:40 2020 +0200 @@ -571,26 +571,44 @@ } test_regex_def; +void test_regex_print_matches(const th_regex_char_t *str, const th_regex_match_t *matches) +{ + for (const th_regex_match_t *mt = matches; + mt != NULL; mt = (th_regex_match_t *) mt->node.next) + { + char *tmp = th_strndup(str + mt->start, mt->len); + + printf(" match [%3" PRIu_SIZE_T " ++ %3" PRIu_SIZE_T "]: '%s'\n", + mt->start, mt->len, tmp); + + th_free(tmp); + } +} + + void test_regex_list(const test_regex_def *list, const th_regex_char_t *pattern) { - th_regex_t *reg = NULL; + th_regex_t *expr = NULL; int res; printf("========================================\n"); - if ((res = th_regex_compile(®, pattern)) != THERR_OK) + printf("Compiling pattern \"%s\"\n", pattern); + if ((res = th_regex_compile(&expr, pattern)) != THERR_OK) { THERR("Regex compilation failed: %s\n", th_error_str(res)); goto out; } + th_regex_dump(stdout, 1, expr); + for (const test_regex_def *def = list; def->str != NULL; def++) { th_regex_match_t *matches = NULL; size_t nmatches; printf("----------------------------------------\n"); - if ((res = th_regex_match(reg, def->str, + if ((res = th_regex_match(expr, def->str, &nmatches, &matches, -1, def->flags)) != THERR_OK) { THERR("Regex match returned error: %s\n", @@ -598,26 +616,17 @@ goto out; } - printf("\npattern '%s'\n", pattern); - printf(" '%s': matched %" PRIu_SIZE_T " time(s), testresult=%s\n", + printf("'%s': matched %" PRIu_SIZE_T " time(s), testresult=%s\n", def->str, nmatches, def->nmatches == nmatches ? "YES" : "NO"); - for (th_regex_match_t *m = matches; - m != NULL; m = (th_regex_match_t *) m->node.next) - { - char *tmp = th_strndup(def->str + m->start, m->len); - printf(" match [%" PRIu_SIZE_T " ++ %" PRIu_SIZE_T "]: '%s'\n", - m->start, m->len, tmp); - th_free(tmp); - } - + test_regex_print_matches(def->str, matches); th_regex_free_matches(matches); } out: - th_regex_free(reg); + th_regex_free(expr); } #endif @@ -880,18 +889,23 @@ if (test_set_start("Regular expressions")) { #ifdef TH_EXPERIMENTAL_REGEX_DEBUG - th_dbg_re_flags = th_verbosity > 0 ? TH_DBG_RE_MATCH : 0; + th_dbg_re_flags = th_verbosity > 0; #endif + { + const char *str = "z*k+abba fabboa? [a-zA-Z_-] \\{\\} k{4} ([0-9]+ yay){1,2} foo(bar|zoo)?"; + th_regex_t *expr = NULL; + int res = th_regex_compile(&expr, str); + printf("REGEX: \"%s\"\n", str); + if (res == THERR_OK) + th_regex_dump(stdout, 1, expr); + else + printf("ERROR: %s\n", th_error_str(res)); + th_regex_free(expr); + } + #if 0 { - th_regex_t *reg = NULL; - int res = th_regex_compile(®, "z*k+abba fabboa? k{4} [gz]{1,2} foo(bar|zoo)?"); - if (res != THERR_OK) - printf("result: %s\n", th_error_str(res)); - th_regex_free(reg); - } - { static const test_regex_def tlist[] = { { "abcfoabccg" , 1, 0 }, diff -r 562de49f8b4b -r b897995101b7 th_regex.c --- a/th_regex.c Fri Jan 24 15:10:19 2020 +0200 +++ b/th_regex.c Sat Jan 25 05:11:40 2020 +0200 @@ -10,11 +10,34 @@ #ifdef TH_EXPERIMENTAL_REGEX_DEBUG -# define DBG_RE_COMPILE(...) do { if (th_dbg_re_flags & TH_DBG_RE_COMPILE) fprintf(stdout, __VA_ARGS__); } while (0) -# define DBG_RE_FREE(...) do { if (th_dbg_re_flags & TH_DBG_RE_FREE) fprintf(stdout, __VA_ARGS__); } while (0) -# define DBG_RE_MATCH(...) do { if (th_dbg_re_flags & TH_DBG_RE_MATCH) fprintf(stdout, __VA_ARGS__); } while (0) +# define DBG_RE_MATCH(...) do { if (th_dbg_re_flags) fprintf(stdout, __VA_ARGS__); } while (0) +BOOL th_dbg_re_flags = FALSE; +#else +# define DBG_RE_MATCH(...) +#endif + + +#ifdef TH_EXPERIMENTAL_REGEX -int th_dbg_re_flags = 0; +enum +{ + TH_RE_MATCH_ONCE, + TH_RE_MATCH_COUNT, + TH_RE_MATCH_ANCHOR_START, + TH_RE_MATCH_ANCHOR_END, +}; + + +enum +{ + TH_RE_TYPE_CHAR, + TH_RE_TYPE_STR, + TH_RE_TYPE_ANY_CHAR, + TH_RE_TYPE_LIST, + TH_RE_TYPE_LIST_REVERSE, + TH_RE_TYPE_SUBEXPR, +}; + static const char *re_match_modes[] = { @@ -24,23 +47,17 @@ "ANCHOR END", }; + static const char *re_match_types[] = { "CHAR", + "STR", "ANY", "LIST", "LIST REVERSE", "SUBEXPR", }; -#else -# define DBG_RE_COMPILE(...) -# define DBG_RE_FREE(...) -# define DBG_RE_MATCH(...) -#endif - - -#ifdef TH_EXPERIMENTAL_REGEX typedef struct { @@ -66,8 +83,10 @@ struct { th_regex_char_t chr; + th_regex_char_t *str; th_regex_list_t list; - struct th_regex_t *expr; + + th_regex_t *expr; } match; } th_regex_node_t; @@ -79,26 +98,6 @@ }; -enum -{ - TH_RE_MATCH_ONCE, - TH_RE_MATCH_COUNT, - - TH_RE_MATCH_ANCHOR_START, - TH_RE_MATCH_ANCHOR_END, -}; - - -enum -{ - TH_RE_TYPE_CHAR, - TH_RE_TYPE_ANY_CHAR, - TH_RE_TYPE_LIST, - TH_RE_TYPE_LIST_REVERSE, - TH_RE_TYPE_SUBEXPR, -}; - - typedef struct { const th_regex_char_t *pattern; @@ -108,6 +107,10 @@ size_t nstack, stacksize; th_regex_t **stack; + + th_regex_char_t *buf; + size_t bufSize, bufPos; + } th_regex_parse_ctx_t; @@ -205,13 +208,6 @@ memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node_t)); data->nnodes++; - DBG_RE_COMPILE( - "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " - "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", - data->nnodes, data->nodessize, - node->mode, node->type, - node->repeatMin, node->repeatMax); - return THERR_OK; } @@ -327,7 +323,7 @@ item.start = *prev; item.end = *next; - if (item.start <= item.end) + if (item.start >= item.end) { res = THERR_INVALID_DATA; goto out; @@ -390,6 +386,53 @@ } +static int th_regex_parse_ctx_node_commit_strchr_do(th_regex_parse_ctx_t *ctx, + const th_regex_char_t *buf, const size_t bufLen) +{ + th_regex_node_t node; + th_regex_node_init(&node); + + if (bufLen > 1) + { + int res; + node.type = TH_RE_TYPE_STR; + if ((res = th_regex_strndup(&node.match.str, buf, bufLen)) != THERR_OK) + return res; + } + else + { + node.type = TH_RE_TYPE_CHAR; + node.match.chr = buf[0]; + } + + return th_regex_parse_ctx_node_commit(ctx, &node); +} + + +static int th_regex_parse_ctx_node_commit_strchr(th_regex_parse_ctx_t *ctx, const BOOL split) +{ + int res = THERR_OK;; + + if (ctx->bufPos > 0) + { + if (ctx->bufPos > 1 && split) + { + if ((res = th_regex_parse_ctx_node_commit_strchr_do(ctx, ctx->buf, ctx->bufPos - 1)) != THERR_OK) + return res; + + res = th_regex_parse_ctx_node_commit_strchr_do(ctx, ctx->buf + ctx->bufPos - 1, 1); + } + else + { + res = th_regex_parse_ctx_node_commit_strchr_do(ctx, ctx->buf, ctx->bufPos); + } + ctx->bufPos = 0; + } + + return res; +} + + int th_regex_compile(th_regex_t **pexpr, const th_regex_char_t *pattern) { int res = THERR_OK; @@ -398,26 +441,39 @@ th_regex_char_t *tmp = NULL; size_t start; + // Check pointers if (pexpr == NULL || pattern == NULL) { res = THERR_NULLPTR; - goto exit; + goto out; } + // Initialize parsing context memset(&ctx, 0, sizeof(ctx)); ctx.pattern = pattern; + ctx.bufSize = 256; + if ((ctx.buf = th_malloc(ctx.bufSize * sizeof(th_regex_char_t))) == NULL) + { + res = THERR_MALLOC; + goto out; + } + + // Start parsing the pattern for (; ctx.pattern[ctx.offs] != 0; ctx.offs++) { th_regex_char_t cch = ctx.pattern[ctx.offs]; - DBG_RE_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch); + switch (cch) { case '?': case '*': case '+': + if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, TRUE)) != THERR_OK) + goto out; + if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) - goto exit; + goto out; if (cch == '?') { @@ -432,7 +488,7 @@ if (pnode->mode == TH_RE_MATCH_COUNT) { res = THERR_INVALID_DATA; - goto exit; + goto out; } pnode->mode = TH_RE_MATCH_COUNT; @@ -453,13 +509,16 @@ break; case '{': + if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, TRUE)) != THERR_OK) + goto out; + // {n} | {min,max} start = ctx.offs + 1; if (!th_regex_find_next(ctx.pattern, start, &ctx.offs, '}')) { // End not found res = THERR_INVALID_DATA; - goto exit; + goto out; } th_free(tmp); @@ -467,7 +526,7 @@ if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK || (res = th_regex_strndup(&tmp, ctx.pattern + start, ctx.offs - start)) != THERR_OK) - goto exit; + goto out; pnode->mode = TH_RE_MATCH_COUNT; @@ -478,7 +537,7 @@ !th_regex_parse_ssize_t(tmp + start + 1, &pnode->repeatMax)) { res = THERR_INVALID_DATA; - goto exit; + goto out; } } else @@ -486,7 +545,7 @@ if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin)) { res = THERR_INVALID_DATA; - goto exit; + goto out; } pnode->repeatMax = pnode->repeatMin; } @@ -496,52 +555,67 @@ { // Invalid repeat counts res = THERR_INVALID_DATA; - goto exit; + goto out; } break; case '(': + if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK) + goto out; + // Start of subpattern if ((res = th_regex_parse_ctx_push(&ctx)) != THERR_OK) - goto exit; + goto out; break; case ')': + if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK) + goto out; + // End of subpattern th_regex_node_init(&node); node.type = TH_RE_TYPE_SUBEXPR; if ((res = th_regex_parse_ctx_pop(&ctx, &node.match.expr)) != THERR_OK || (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) - goto exit; + goto out; break; case '^': + if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK) + goto out; + // Start of line anchor th_regex_node_init(&node); node.mode = TH_RE_MATCH_ANCHOR_START; if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) - goto exit; + goto out; break; case '$': + if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK) + goto out; + // End of line anchor th_regex_node_init(&node); node.mode = TH_RE_MATCH_ANCHOR_END; if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) - goto exit; + goto out; break; case '[': + if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK) + goto out; + // Start of char list start = ctx.offs + 1; if (!th_regex_find_next(ctx.pattern, start, &ctx.offs, ']') || ctx.offs == start) { res = THERR_INVALID_DATA; - goto exit; + goto out; } th_regex_node_init(&node); @@ -554,18 +628,21 @@ node.type = TH_RE_TYPE_LIST; if ((res = th_regex_parse_list(ctx.pattern + start, - ctx.offs - start, &node.match.list) != THERR_OK) || + ctx.offs - start, &node.match.list)) != THERR_OK || (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) - goto exit; + goto out; break; case '.': + if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK) + goto out; + // Any single character matches th_regex_node_init(&node); node.type = TH_RE_TYPE_ANY_CHAR; if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) - goto exit; + goto out; break; case '\\': @@ -575,25 +652,40 @@ { // End of pattern, error res = THERR_INVALID_DATA; - goto exit; + goto out; } // fall-through default: // Given character must match - th_regex_node_init(&node); - node.type = TH_RE_TYPE_CHAR; - node.match.chr = ctx.pattern[ctx.offs]; - - if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) - goto exit; + if (ctx.bufPos < ctx.bufSize) + ctx.buf[ctx.bufPos++] = ctx.pattern[ctx.offs]; + else + if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK) + goto out; break; } } -exit: + // Commit last string/char if any + if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK) + goto out; + + // Create root node + th_regex_node_init(&node); + node.type = TH_RE_TYPE_SUBEXPR; + node.match.expr = ctx.data; + ctx.data = NULL; + + if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) + goto out; + +out: *pexpr = ctx.data; + + // Free temporary buffers th_free(tmp); + th_free(ctx.buf); return res; } @@ -606,14 +698,6 @@ { th_regex_node_t *node = &expr->nodes[nnode]; - DBG_RE_FREE( - "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " - "mode=%s, type=%s, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", - nnode, expr->nnodes, - re_match_modes[node->mode], - re_match_types[node->type], - node->repeatMin, node->repeatMax); - th_regex_free(node->match.expr); th_regex_list_free(&node->match.list); } @@ -623,6 +707,71 @@ } +void th_regex_dump(FILE *fh, const int level, const th_regex_t *expr) +{ + if (expr != NULL) + { + for (size_t nnode = 0; nnode < expr->nnodes; nnode++) + { + th_regex_node_t *node = &expr->nodes[nnode]; + + for (int indent = 0; indent < level; indent++) + fprintf(fh, " "); + + fprintf(fh, "[%" PRIu_SIZE_T " / %" PRIu_SIZE_T "] %s %s ", + nnode + 1, expr->nnodes, + re_match_modes[node->mode], + re_match_types[node->type]); + + if (node->mode == TH_RE_MATCH_COUNT) + { + fprintf(fh, "min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T " : ", + node->repeatMin, node->repeatMax); + } + + switch (node->type) + { + case TH_RE_TYPE_CHAR: + fprintf(fh, "'%c'\n", node->match.chr); + break; + + case TH_RE_TYPE_STR: + fprintf(fh, "\"%s\"\n", node->match.str); + break; + + case TH_RE_TYPE_ANY_CHAR: + fprintf(fh, ".\n"); + break; + + case TH_RE_TYPE_LIST: + case TH_RE_TYPE_LIST_REVERSE: + fprintf(fh, "[ "); + for (size_t n = 0; n < node->match.list.nitems; n++) + { + const th_regex_list_item_t *li = &node->match.list.items[n]; + if (li->type) + { + fprintf(fh, "'%c-%c' ", li->start, li->end); + } + else + { + for (size_t i = 0; i < li->nchars; i++) + fprintf(fh, "'%c' ", li->chars[i]); + } + } + fprintf(fh, "]\n"); + break; + + case TH_RE_TYPE_SUBEXPR: + fprintf(fh, "\n"); + th_regex_dump(fh, level + 1, node->match.expr); + break; + } + } + } +} + + static BOOL th_regex_match_list(const th_regex_list_t *list, const th_regex_char_t cch) { // Could be optimized, perhaps .. sort match.chars, binary search etc? @@ -669,8 +818,6 @@ switch (node->type) { case TH_RE_TYPE_SUBEXPR: - // push current context in stack - // new context for subexpr res = th_regex_match_expr(haystack, poffs, node->match.expr, 0, flags); break; @@ -708,6 +855,20 @@ (*poffs)++; } break; + + case TH_RE_TYPE_STR: + res = TRUE; + for (th_regex_char_t *str = node->match.str; *str != 0; ) + { + if (haystack[*poffs] != *str) + { + res = FALSE; + break; + } + str++; + (*poffs)++; + } + break; } return res; @@ -744,7 +905,7 @@ { ssize_t count = 0; - DBG_RE_MATCH(" COUNT min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", + DBG_RE_MATCH(" min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax); do @@ -752,21 +913,29 @@ BOOL match; size_t toffs = soffs, tnode; - DBG_RE_MATCH(" START '%s'\n", haystack + toffs); - do { + DBG_RE_MATCH(" ROUND #%" PRIu_SIZE_T ": '%s'\n", + count, haystack + toffs); + + do + { match = TRUE; + size_t noffs = toffs; for (tnode = nnode; match && tnode < expr->nnodes && haystack[toffs] != 0; ) { match = th_regex_match_one(haystack, &toffs, &expr->nodes[tnode], flags); if (match) tnode++; } - DBG_RE_MATCH(" '%s': %d (tnode=%" PRIu_SIZE_T ")\n", haystack + toffs, match, tnode); + + DBG_RE_MATCH(" '%s': %d (tnode=%" PRIu_SIZE_T ")\n", + haystack + noffs, match, tnode); + if (node->repeatMin >= 0 && match) break; } while (!match && haystack[toffs] != 0); - DBG_RE_MATCH(" END '%s': %d (tnode=%" PRIu_SIZE_T ")\n", haystack + toffs, match, tnode); + DBG_RE_MATCH(" ROUND #%" PRIu_SIZE_T " END: match=%s, tnode=%" PRIu_SIZE_T "\n", + count, match ? "YES" : "NO", tnode); if (match) { diff -r 562de49f8b4b -r b897995101b7 th_regex.h --- a/th_regex.h Fri Jan 24 15:10:19 2020 +0200 +++ b/th_regex.h Sat Jan 25 05:11:40 2020 +0200 @@ -50,14 +50,7 @@ #ifdef TH_EXPERIMENTAL_REGEX_DEBUG -enum -{ - TH_DBG_RE_COMPILE = 0x0001, - TH_DBG_RE_FREE = 0x0002, - TH_DBG_RE_MATCH = 0x0004, -}; - -extern int th_dbg_re_flags; +extern BOOL th_dbg_re_flags; #endif @@ -66,6 +59,7 @@ // int th_regex_compile(th_regex_t **pexpr, const th_regex_char_t *pattern); void th_regex_free(th_regex_t *expr); +void th_regex_dump(FILE *fh, const int level, const th_regex_t *expr); int th_regex_match(const th_regex_t *expr, const th_regex_char_t *haystack, size_t *pnmatches, th_regex_match_t **pmatches, const size_t maxmatches,