# HG changeset patch # User Matti Hamalainen # Date 1579850004 -7200 # Node ID 9e1f9e1d14875dc6dac4f3fbabb6d2803563ca02 # Parent 8c957ad9d4c3668753b07c3127876ee82abd5d61 Aaand some more work. Still just a broken concept. diff -r 8c957ad9d4c3 -r 9e1f9e1d1487 tests.c --- a/tests.c Thu Jan 23 11:38:28 2020 +0200 +++ b/tests.c Fri Jan 24 09:13:24 2020 +0200 @@ -565,15 +565,15 @@ typedef struct { - th_regex_char *str; + th_regex_char_t *str; size_t nmatches; int flags; } test_regex_def; -void test_regex_list(const test_regex_def *list, const th_regex_char *pattern) +void test_regex_list(const test_regex_def *list, const th_regex_char_t *pattern) { - th_regex_ctx *reg = NULL; + th_regex_t *reg = NULL; int res; printf("========================================\n"); @@ -587,7 +587,7 @@ for (const test_regex_def *def = list; def->str != NULL; def++) { - th_regex_match_node *matches = NULL; + th_regex_match_t *matches = NULL; size_t nmatches; if ((res = th_regex_match(reg, def->str, @@ -603,8 +603,8 @@ nmatches, def->nmatches == nmatches ? "YES" : "NO"); - for (th_regex_match_node *m = matches; - m != NULL; m = (th_regex_match_node *) m->node.next) + for (th_regex_match_t *m = matches; + m != NULL; m = (th_regex_match_t *) m->node.next) { char *tmp = th_strndup(def->str + m->start, m->len); printf(" match [%" PRIu_SIZE_T " ++ %" PRIu_SIZE_T "]: '%s'\n", @@ -878,18 +878,18 @@ #ifdef TH_EXPERIMENTAL_REGEX if (test_set_start("Regular expressions")) { - th_regex_ctx *reg = NULL; - int res; - #ifdef TH_EXPERIMENTAL_REGEX_DEBUG th_dbg_re_flags = TH_DBG_RE_MATCH; #endif #if 0 - res = th_regex_compile(®, "z*k+abba fabboa? k{4} [gz]{1,2} foo(bar|zoo)?"); - if (res != THERR_OK) - printf("result: %s\n", th_error_str(res)); - th_regex_free(reg); + { + th_regex_t *reg = NULL; + int res = th_regex_compile(®, "z*k+abba fabboa? k{4} [gz]{1,2} foo(bar|zoo)?"); + if (res != THERR_OK) + printf("result: %s\n", th_error_str(res)); + th_regex_free(reg); + } // { diff -r 8c957ad9d4c3 -r 9e1f9e1d1487 th_regex.c --- a/th_regex.c Thu Jan 23 11:38:28 2020 +0200 +++ b/th_regex.c Fri Jan 24 09:13:24 2020 +0200 @@ -10,10 +10,30 @@ #ifdef TH_EXPERIMENTAL_REGEX_DEBUG +# define DBG_RE_COMPILE(...) do { if (th_dbg_re_flags & TH_DBG_RE_COMPILE) fprintf(stdout, __VA_ARGS__); } while (0) +# define DBG_RE_FREE(...) do { if (th_dbg_re_flags & TH_DBG_RE_FREE) fprintf(stdout, __VA_ARGS__); } while (0) +# define DBG_RE_MATCH(...) do { if (th_dbg_re_flags & TH_DBG_RE_MATCH) fprintf(stdout, __VA_ARGS__); } while (0) + int th_dbg_re_flags = 0; -# define DBG_RE_COMPILE(...) do { if (th_dbg_re_flags & TH_DBG_RE_COMPILE) fprintf(stderr, __VA_ARGS__); } while (0) -# define DBG_RE_FREE(...) do { if (th_dbg_re_flags & TH_DBG_RE_FREE) fprintf(stderr, __VA_ARGS__); } while (0) -# define DBG_RE_MATCH(...) do { if (th_dbg_re_flags & TH_DBG_RE_MATCH) fprintf(stderr, __VA_ARGS__); } while (0) + +static const char *re_match_modes[] = +{ + "ONCE", + "COUNT GREEDY", + "COUNT NONGREEDY", + "ANCHOR START", + "ANCHOR END", +}; + +static const char *re_match_types[] = +{ + "CHAR", + "ANY", + "LIST", + "LIST REVERSE", + "SUBEXPR", +}; + #else # define DBG_RE_COMPILE(...) # define DBG_RE_FREE(...) @@ -23,11 +43,49 @@ #ifdef TH_EXPERIMENTAL_REGEX +typedef struct +{ + int type; + th_regex_char_t start, end; + + size_t nchars; + th_regex_char_t *chars; +} th_regex_list_item_t; + + +typedef struct +{ + size_t nitems, itemssize; + th_regex_list_item_t *items; +} th_regex_list_t; + + +typedef struct +{ + int mode, type; + ssize_t repeatMin, repeatMax; + + struct { + th_regex_char_t chr; + th_regex_list_t list; + struct th_regex_t *expr; + } match; +} th_regex_node_t; + + +struct th_regex_t +{ + size_t nnodes, nodessize; + th_regex_node_t *nodes; +}; + + enum { TH_RE_MATCH_ONCE, TH_RE_MATCH_COUNT_GREEDY, TH_RE_MATCH_COUNT_NONGREEDY, + TH_RE_MATCH_ANCHOR_START, TH_RE_MATCH_ANCHOR_END, }; @@ -43,68 +101,47 @@ }; -#ifdef TH_EXPERIMENTAL_REGEX_DEBUG -static const char *re_match_modes[] = +typedef struct { - "ONCE", - "COUNT GREEDY", - "COUNT NONGREEDY", - "ANCHOR_START", - "ANCHOR_END", -}; + const th_regex_char_t *pattern; + size_t offs; -static const char *re_match_types[] = -{ - "CHAR", - "ANY", - "LIST", - "LIST_REVERSE", - "SUBEXPR", -}; -#endif + th_regex_t *data; + + size_t nstack, stacksize; + th_regex_t **stack; +} th_regex_parse_ctx_t; -typedef struct +static void th_regex_node_init(th_regex_node_t *node) { - const th_regex_char *pattern; - size_t offs; - - th_regex_ctx *data; - - size_t nstack, stacksize; - th_regex_ctx **stack; -} th_regex_parse_ctx; - - -static void th_regex_node_init(th_regex_node *node) -{ - memset(node, 0, sizeof(th_regex_node)); + memset(node, 0, sizeof(th_regex_node_t)); node->mode = TH_RE_MATCH_ONCE; } -static int th_regex_strndup(th_regex_char **pdst, - const th_regex_char *src, const size_t len) +static int th_regex_strndup(th_regex_char_t **pdst, + const th_regex_char_t *src, const size_t len) { if (pdst == NULL) return THERR_NULLPTR; - if (UINTPTR_MAX / sizeof(th_regex_char) < len + 1) + if (UINTPTR_MAX / sizeof(th_regex_char_t) < len + 1) return THERR_BOUNDS; - if ((*pdst = (th_regex_char *) - th_malloc((len + 1) * sizeof(th_regex_char))) == NULL) + if ((*pdst = (th_regex_char_t *) + th_malloc((len + 1) * sizeof(th_regex_char_t))) == NULL) return THERR_MALLOC; - memcpy(*pdst, src, len * sizeof(th_regex_char)); + memcpy(*pdst, src, len * sizeof(th_regex_char_t)); (*pdst)[len] = 0; return THERR_OK; } -static int th_regex_ctx_get_prev_node( - th_regex_parse_ctx *ctx, th_regex_node **pnode) +static int th_regex_parse_ctx_get_prev_node( + th_regex_parse_ctx_t *ctx, th_regex_node_t **pnode) { if (ctx->data != NULL && ctx->data->nnodes > 0) { @@ -116,14 +153,14 @@ } -static int th_regex_ctx_push(th_regex_parse_ctx *ctx) +static int th_regex_parse_ctx_push(th_regex_parse_ctx_t *ctx) { if (ctx->stack == NULL || ctx->nstack + 1 >= ctx->stacksize) { ctx->stacksize += 16; if ((ctx->stack = th_realloc(ctx->stack, - ctx->stacksize * sizeof(th_regex_node *))) == NULL) + ctx->stacksize * sizeof(th_regex_node_t *))) == NULL) return THERR_MALLOC; } @@ -135,7 +172,7 @@ } -static int th_regex_ctx_pop(th_regex_parse_ctx *ctx, th_regex_ctx **data) +static int th_regex_parse_ctx_pop(th_regex_parse_ctx_t *ctx, th_regex_t **data) { if (ctx->nstack > 0) { @@ -149,13 +186,13 @@ } -static int th_regex_ctx_node_commit(th_regex_parse_ctx *ctx, th_regex_node *node) +static int th_regex_parse_ctx_node_commit(th_regex_parse_ctx_t *ctx, th_regex_node_t *node) { - th_regex_ctx *data = ctx->data; + th_regex_t *data = ctx->data; if (data == NULL) { - if ((data = ctx->data = th_malloc0(sizeof(th_regex_ctx))) == NULL) + if ((data = ctx->data = th_malloc0(sizeof(th_regex_t))) == NULL) return THERR_MALLOC; } @@ -163,11 +200,11 @@ { data->nodessize += 16; if ((data->nodes = th_realloc(data->nodes, - data->nodessize * sizeof(th_regex_node))) == NULL) + data->nodessize * sizeof(th_regex_node_t))) == NULL) return THERR_MALLOC; } - memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node)); + memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node_t)); data->nnodes++; DBG_RE_COMPILE( @@ -181,9 +218,9 @@ } -static BOOL th_regex_find_next(const th_regex_char *str, +static BOOL th_regex_find_next(const th_regex_char_t *str, const size_t start, size_t *offs, - const th_regex_char delim) + const th_regex_char_t delim) { for (*offs = start; str[*offs] != 0; (*offs)++) { @@ -194,10 +231,10 @@ } -static BOOL th_regex_parse_ssize_t(const th_regex_char *str, +static BOOL th_regex_parse_ssize_t(const th_regex_char_t *str, ssize_t *value) { - th_regex_char ch; + th_regex_char_t ch; BOOL neg; if (*str == '-') @@ -227,31 +264,31 @@ } -static void th_regex_list_item_init(th_regex_list_item *item) +static void th_regex_list_item_init(th_regex_list_item_t *item) { - memset(item, 0, sizeof(th_regex_list_item)); + memset(item, 0, sizeof(th_regex_list_item_t)); } -static int th_regex_list_add_item(th_regex_list *list, th_regex_list_item *item) +static int th_regex_list_add_item(th_regex_list_t *list, th_regex_list_item_t *item) { if (list->items == NULL || list->nitems + 1 >= list->itemssize) { list->itemssize += 16; if ((list->items = th_realloc(list->items, - list->itemssize * sizeof(th_regex_list_item))) == NULL) + list->itemssize * sizeof(th_regex_list_item_t))) == NULL) return THERR_MALLOC; } - memcpy(list->items + list->nitems, item, sizeof(th_regex_list_item)); + memcpy(&list->items[list->nitems], item, sizeof(th_regex_list_item_t)); list->nitems++; return THERR_OK; } -static void th_regex_list_free(th_regex_list *list) +static void th_regex_list_free(th_regex_list_t *list) { if (list != NULL) { @@ -264,11 +301,11 @@ } -static int th_regex_parse_list(const th_regex_char *str, - const size_t slen, th_regex_list *list) +static int th_regex_parse_list(const th_regex_char_t *str, + const size_t slen, th_regex_list_t *list) { - th_regex_char *tmp = NULL; - th_regex_list_item item; + th_regex_char_t *tmp = NULL; + th_regex_list_item_t item; int res; if ((res = th_regex_strndup(&tmp, str, slen)) != THERR_OK) @@ -277,7 +314,7 @@ // Handle ranges like [A-Z] for (size_t offs = 0; offs < slen; offs++) { - th_regex_char + th_regex_char_t *prev = (offs > 0) ? tmp + offs - 1 : NULL, *curr = tmp + offs, *next = (offs + 1 < slen) ? tmp + offs + 1 : NULL; @@ -319,14 +356,14 @@ for (size_t offs = 0; offs < slen; offs++) { - th_regex_char curr = tmp[offs]; + th_regex_char_t curr = tmp[offs]; if (curr != 0) item.nchars++; } if (item.nchars > 0) { - if ((item.chars = th_malloc(sizeof(th_regex_char) * item.nchars)) == NULL) + if ((item.chars = th_malloc(sizeof(th_regex_char_t) * item.nchars)) == NULL) { res = THERR_MALLOC; goto out; @@ -334,7 +371,7 @@ for (size_t offs = 0, n = 0; offs < slen; offs++) { - th_regex_char curr = tmp[offs]; + th_regex_char_t curr = tmp[offs]; if (curr != 0) { item.chars[n] = curr; @@ -355,12 +392,12 @@ } -int th_regex_compile(th_regex_ctx **pexpr, const th_regex_char *pattern) +int th_regex_compile(th_regex_t **pexpr, const th_regex_char_t *pattern) { int res = THERR_OK; - th_regex_parse_ctx ctx; - th_regex_node node, *pnode; - th_regex_char *tmp = NULL; + th_regex_parse_ctx_t ctx; + th_regex_node_t node, *pnode; + th_regex_char_t *tmp = NULL; size_t start; if (pexpr == NULL || pattern == NULL) @@ -374,14 +411,14 @@ for (; ctx.pattern[ctx.offs] != 0; ctx.offs++) { - th_regex_char cch = ctx.pattern[ctx.offs]; + th_regex_char_t cch = ctx.pattern[ctx.offs]; DBG_RE_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch); switch (cch) { case '?': case '*': case '+': - if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) + if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) goto exit; if (cch == '?') @@ -433,7 +470,7 @@ th_free(tmp); - if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK || + if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK || (res = th_regex_strndup(&tmp, ctx.pattern + start, ctx.offs - start)) != THERR_OK) goto exit; @@ -471,7 +508,7 @@ case '(': // Start of subpattern - if ((res = th_regex_ctx_push(&ctx)) != THERR_OK) + if ((res = th_regex_parse_ctx_push(&ctx)) != THERR_OK) goto exit; break; @@ -480,8 +517,8 @@ th_regex_node_init(&node); node.type = TH_RE_TYPE_SUBEXPR; - if ((res = th_regex_ctx_pop(&ctx, &node.match.expr)) != THERR_OK || - (res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) + if ((res = th_regex_parse_ctx_pop(&ctx, &node.match.expr)) != THERR_OK || + (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) goto exit; break; @@ -490,7 +527,7 @@ th_regex_node_init(&node); node.mode = TH_RE_MATCH_ANCHOR_START; - if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) + if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) goto exit; break; @@ -499,7 +536,7 @@ th_regex_node_init(&node); node.mode = TH_RE_MATCH_ANCHOR_END; - if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) + if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) goto exit; break; @@ -513,17 +550,18 @@ goto exit; } - // XXX TODO Parse/support ranges [0-9a-zA-Z_-] + th_regex_node_init(&node); + if (ctx.pattern[start] == '^') + { + node.type = TH_RE_TYPE_LIST_REVERSE; + start++; + } + else + node.type = TH_RE_TYPE_LIST; - th_regex_node_init(&node); - node.type = (ctx.pattern[start] == '^') ? - TH_RE_TYPE_LIST_REVERSE : TH_RE_TYPE_LIST; - node.match.list.nchars = ctx.offs - start; - if ((res = th_regex_strndup(&node.match.list.chars, - ctx.pattern + start, node.match.list.nchars)) != THERR_OK) - goto exit; - - if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) + if ((res = th_regex_parse_list(ctx.pattern + start, + ctx.offs - start, &node.match.list) != THERR_OK) || + (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) goto exit; break; @@ -532,7 +570,7 @@ th_regex_node_init(&node); node.type = TH_RE_TYPE_ANY_CHAR; - if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) + if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) goto exit; break; @@ -545,14 +583,7 @@ res = THERR_INVALID_DATA; goto exit; } - - th_regex_node_init(&node); - node.type = TH_RE_TYPE_CHAR; - node.match.chr = ctx.pattern[ctx.offs]; - - if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) - goto exit; - break; + // fall-through default: // Given character must match @@ -560,7 +591,7 @@ node.type = TH_RE_TYPE_CHAR; node.match.chr = ctx.pattern[ctx.offs]; - if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) + if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) goto exit; break; } @@ -573,45 +604,24 @@ } -void th_regex_free(th_regex_ctx *expr) +void th_regex_free(th_regex_t *expr) { if (expr != NULL) { - for (size_t n = 0; n < expr->nnodes; n++) + for (size_t nnode = 0; nnode < expr->nnodes; nnode++) { - th_regex_node *node = &expr->nodes[n]; + th_regex_node_t *node = &expr->nodes[nnode]; DBG_RE_FREE( "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " - "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", - n, expr->nnodes, - node->mode, node->type, + "mode=%s, type=%s, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", + nnode, expr->nnodes, + re_match_modes[node->mode], + re_match_types[node->type], node->repeatMin, node->repeatMax); - switch (node->type) - { - case TH_RE_TYPE_SUBEXPR: - DBG_RE_FREE(" SUBEXPR: %p vs %p\n", - (void *) expr, (void *) node->match.expr); - th_regex_free(node->match.expr); - break; - - case TH_RE_TYPE_LIST: - case TH_RE_TYPE_LIST_REVERSE: - DBG_RE_FREE(" list='%s'\n", node->match.list.chars); - th_free(node->match.list.chars); - break; - -#ifdef TH_EXPERIMENTAL_REGEX_DEBUG - case TH_RE_TYPE_ANY_CHAR: - DBG_RE_FREE(" any char\n"); - break; - - case TH_RE_TYPE_CHAR: - DBG_RE_FREE(" char='%c'\n", node->match.chr); - break; -#endif - } + th_regex_free(node->match.expr); + th_regex_list_free(&node->match.list); } th_free(expr->nodes); @@ -619,12 +629,13 @@ } -static BOOL th_regex_do_match_list(const th_regex_list *list, const th_regex_char cch) +static BOOL th_regex_match_list(const th_regex_list_t *list, const th_regex_char_t cch) { // Could be optimized, perhaps .. sort match.chars, binary search etc? for (size_t nitem = 0; nitem < list->nitems; nitem++) { - const th_regex_list_item *item = &list->items[nitem]; + const th_regex_list_item_t *item = &list->items[nitem]; + if (item->type == 0) { for (size_t n = 0; n < item->nchars; n++) @@ -642,167 +653,170 @@ } -static BOOL th_regex_do_match_expr(const th_regex_ctx *expr, - const th_regex_char *haystack, size_t *offs, const int flags); +static BOOL th_regex_match_expr( + const th_regex_char_t *haystack, + size_t *poffs, + const th_regex_t *expr, + const size_t startnode, + const int flags + ); -static BOOL th_regex_do_match_node(const th_regex_char *haystack, - size_t *offs, const th_regex_node *node, const int flags) +static BOOL th_regex_match_one( + const th_regex_char_t *haystack, + size_t *poffs, + const th_regex_node_t *node, + const int flags + ) { - th_regex_char cch; + th_regex_char_t cch; BOOL res = FALSE; - DBG_RE_MATCH(" node_START [%s]: '%s': ", - re_match_types[node->type], haystack + *offs); - switch (node->type) { case TH_RE_TYPE_SUBEXPR: - DBG_RE_MATCH("subexpr ..\n"); - res = th_regex_do_match_expr(node->match.expr, haystack, offs, flags); - return res; + // push current context in stack + // new context for subexpr + res = th_regex_match_expr(haystack, poffs, node->match.expr, 0, flags); break; case TH_RE_TYPE_LIST: case TH_RE_TYPE_LIST_REVERSE: - DBG_RE_MATCH("[%s]", node->match.list.chars); - if ((cch = haystack[*offs]) == 0) - goto out; - - // Could be optimized, perhaps .. sort match.chars, binary search etc? - // XXX TODO Ranges are not supported yet - for (size_t n = 0; n < node->match.list.nchars; n++) - if (node->match.list.chars[n] == cch) + if ((cch = haystack[*poffs]) == 0) + res = FALSE; + else { - res = TRUE; - break; - } + res = th_regex_match_list(&node->match.list, cch); - if (node->type == TH_RE_TYPE_LIST_REVERSE) - res = !res; + if (node->type == TH_RE_TYPE_LIST_REVERSE) + res = !res; - (*offs)++; + (*poffs)++; + } break; case TH_RE_TYPE_ANY_CHAR: - if ((cch = haystack[*offs]) == 0) - goto out; - - res = TRUE; - (*offs)++; + if ((cch = haystack[*poffs]) == 0) + res = FALSE; + else + { + res = TRUE; + (*poffs)++; + } break; case TH_RE_TYPE_CHAR: - DBG_RE_MATCH("'%c'", node->match.chr); - if ((cch = haystack[*offs]) == 0) - goto out; - - res = (cch == node->match.chr); - (*offs)++; + if ((cch = haystack[*poffs]) == 0) + res = FALSE; + else + { + res = (cch == node->match.chr); + (*poffs)++; + } break; } -out: - DBG_RE_MATCH(", match=%s\n", - res ? "YES" : "NO"); - return res; } -static BOOL th_regex_do_match_expr(const th_regex_ctx *expr, - const th_regex_char *haystack, size_t *offs, const int flags) +static BOOL th_regex_match_expr( + const th_regex_char_t *haystack, + size_t *poffs, + const th_regex_t *expr, + const size_t startnode, + const int flags + ) { - for (size_t n = 0; n < expr->nnodes; n++) + BOOL res = TRUE; + size_t soffs = *poffs; + + for (size_t nnode = startnode; res && nnode < expr->nnodes; nnode++) { - const th_regex_node *node = &expr->nodes[n]; - size_t soffs; + const th_regex_node_t *node = &expr->nodes[nnode]; - DBG_RE_MATCH(" expr [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ", - n, expr->nnodes, re_match_modes[node->mode]); + DBG_RE_MATCH(" expr [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s %s '%s'\n", + nnode, expr->nnodes, + re_match_modes[node->mode], re_match_types[node->type], + haystack + *poffs); switch (node->mode) { case TH_RE_MATCH_ONCE: - { - DBG_RE_MATCH("\n"); - soffs = *offs; - if (!th_regex_do_match_node(haystack, &soffs, node, flags)) - return FALSE; - *offs = soffs; - } + res = th_regex_match_one(haystack, poffs, node, flags); break; case TH_RE_MATCH_COUNT_GREEDY: case TH_RE_MATCH_COUNT_NONGREEDY: { - BOOL done = FALSE; ssize_t count = 0; - DBG_RE_MATCH("min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax); + DBG_RE_MATCH(" COUNT min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", + node->repeatMin, node->repeatMax); do { BOOL match; - soffs = *offs; + size_t toffs = *poffs, tnode = nnode; - match = th_regex_do_match_node(haystack, &soffs, node, flags); - for (size_t qn = n + 1; qn < expr->nnodes && haystack[soffs] != 0; qn++) - { - const th_regex_node *next = &expr->nodes[qn]; - do { - match = th_regex_do_match_node(haystack, &soffs, next, flags); - } while (haystack[soffs] != 0 && !match); - } + do { + match = th_regex_match_one(haystack, &toffs, node, flags); + if (match && haystack[toffs] != 0) + { + for (tnode = nnode + 1; match && tnode < expr->nnodes && haystack[toffs] != 0; tnode++) + { + size_t noffs = toffs; + match = th_regex_match_one(haystack, &toffs, &expr->nodes[tnode], flags); + DBG_RE_MATCH(" '%s': %d\n", haystack + noffs, match); + } + } + } while (!match && haystack[toffs] != 0); if (match) { // Node matched count++; - done = (node->repeatMax > 0 && count >= node->repeatMax); - *offs = soffs; + *poffs = soffs; + nnode = tnode; + res = (node->repeatMax > 0 && count >= node->repeatMax); } else { // Node did not match, check if we got the minimum if set - done = (node->repeatMin >= 0 && count >= node->repeatMin); - soffs = *offs; - break; + res = (node->repeatMin >= 0 && count >= node->repeatMin); } - } while (!done); - DBG_RE_MATCH("RESULT: %" PRId_SSIZE_T " = %s\n", count, done ? "YES" : "NO"); + } while (!res); - if (!done) - return FALSE; - - *offs = soffs; + DBG_RE_MATCH(" RESULT: count=%" PRId_SSIZE_T ", done=%s\n", + count, res ? "YES" : "NO"); } break; + case TH_RE_MATCH_ANCHOR_START: - DBG_RE_MATCH("offs=%" PRIu_SIZE_T "\n", *offs); - if (*offs != 0) - return FALSE; + res = (*poffs == 0); break; case TH_RE_MATCH_ANCHOR_END: - DBG_RE_MATCH("is end=%d\n", haystack[*offs]); - if (haystack[*offs] != 0) - return FALSE; + res = (haystack[*poffs] == 0); break; } } - return TRUE; + if (!res) + *poffs = soffs; + + return res; } -int th_regex_match(const th_regex_ctx *expr, const th_regex_char *haystack, - size_t *pnmatches, th_regex_match_node **pmatches, const size_t maxmatches, +int th_regex_match(const th_regex_t *expr, const th_regex_char_t *haystack, + size_t *pnmatches, th_regex_match_t **pmatches, const size_t maxmatches, const int flags) { size_t nmatches = 0; + (void) flags; if (pnmatches != NULL) *pnmatches = 0; @@ -818,13 +832,12 @@ // .. for (size_t soffs = 0; haystack[soffs] != 0; ) { - BOOL matched; size_t coffs = soffs; DBG_RE_MATCH("\nTRY_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n", soffs, haystack + soffs); - if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags))) + if (th_regex_match_expr(haystack, &coffs, expr, 0, flags)) { // A match was found, increase count nmatches++; @@ -835,7 +848,7 @@ if (pmatches != NULL) { - th_regex_match_node *match = th_malloc0(sizeof(th_regex_match_node)); + th_regex_match_t *match = th_malloc0(sizeof(th_regex_match_t)); if (match == NULL) return THERR_MALLOC; @@ -866,14 +879,14 @@ } -static void th_regex_free_match(th_regex_match_node *node) +static void th_regex_free_match(th_regex_match_t *node) { (void) node; // Nothing to do here at the moment } -void th_regex_free_matches(th_regex_match_node *matches) +void th_regex_free_matches(th_regex_match_t *matches) { th_llist_free_func_node((th_llist_t *) matches, (void (*)(th_llist_t *)) th_regex_free_match); diff -r 8c957ad9d4c3 -r 9e1f9e1d1487 th_regex.h --- a/th_regex.h Thu Jan 23 11:38:28 2020 +0200 +++ b/th_regex.h Fri Jan 24 09:13:24 2020 +0200 @@ -22,10 +22,26 @@ // // Definitions // + // Character type -typedef char th_regex_char; +typedef char th_regex_char_t; + + +// Structure containing the compiled regular expression +struct th_regex_t; +typedef struct th_regex_t th_regex_t; +// Linked list structure for returned match sequences +typedef struct +{ + th_llist_t node; + size_t start, len; +} th_regex_match_t; + + +// Flags for th_regex_match() +// (not actually used or supported yet :D) enum { TH_REF_CASEFOLD = 0x0001, @@ -33,59 +49,6 @@ }; -typedef struct -{ - int type; - th_regex_char start, end; - - size_t nchars; - th_regex_char *chars; -} th_regex_list_item; - - -typedef struct -{ - size_t nitems, itemssize; - th_regex_list_item *items; -} th_regex_list; - - -struct th_regex_ctx; - -typedef struct -{ - int mode, type; - ssize_t repeatMin, repeatMax; - - union - { - th_regex_char chr; - - struct - { - size_t nchars; - th_regex_char *chars; - } list; - - struct th_regex_ctx *expr; - } match; -} th_regex_node; - - -typedef struct th_regex_ctx -{ - size_t nnodes, nodessize; - th_regex_node *nodes; -} th_regex_ctx; - - -typedef struct -{ - th_llist_t node; - size_t start, len; -} th_regex_match_node; - - #ifdef TH_EXPERIMENTAL_REGEX_DEBUG enum { @@ -101,13 +64,14 @@ // // Functions // -int th_regex_compile(th_regex_ctx **pexpr, const th_regex_char *pattern); -void th_regex_free(th_regex_ctx *expr); +int th_regex_compile(th_regex_t **pexpr, const th_regex_char_t *pattern); +void th_regex_free(th_regex_t *expr); -int th_regex_match(const th_regex_ctx *expr, const th_regex_char *haystack, - size_t *pnmatches, th_regex_match_node **pmatches, const size_t maxmatches, +int th_regex_match(const th_regex_t *expr, const th_regex_char_t *haystack, + size_t *pnmatches, th_regex_match_t **pmatches, const size_t maxmatches, const int flags); -void th_regex_free_matches(th_regex_match_node *matches); + +void th_regex_free_matches(th_regex_match_t *matches); #ifdef __cplusplus