Mercurial > hg > th-libs
diff th_regex.c @ 613:2e3b81ae8c8a
More work on regexes.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Thu, 16 Jan 2020 16:01:27 +0200 |
parents | cc9ec51b4875 |
children | afcaf5e38f56 |
line wrap: on
line diff
--- a/th_regex.c Thu Jan 16 12:50:28 2020 +0200 +++ b/th_regex.c Thu Jan 16 16:01:27 2020 +0200 @@ -13,7 +13,7 @@ //#define DBG_RE_COMPILE 1 //#define DBG_RE_FREE 1 -//#define DBG_RE_MATCH 1 +#define DBG_RE_MATCH 1 #if defined(DBG_RE_COMPILE) # define DBG_RE_PRINT_COMPILE(...) do { fprintf(stderr, __VA_ARGS__); } while (0) @@ -35,7 +35,8 @@ enum { TH_RE_MATCH_ONCE, - TH_RE_MATCH_COUNT, + TH_RE_MATCH_COUNT_GREEDY, + TH_RE_MATCH_COUNT_NONGREEDY, TH_RE_MATCH_ANCHOR_START, TH_RE_MATCH_ANCHOR_END, }; @@ -55,7 +56,8 @@ static const char *re_match_modes[] = { "ONCE", - "COUNT", + "COUNT GREEDY", + "COUNT NONGREEDY", "ANCHOR_START", "ANCHOR_END", }; @@ -253,37 +255,51 @@ for (; ctx.pattern[ctx.offs] != 0; ctx.offs++) { - DBG_RE_PRINT_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, ctx.pattern[ctx.offs]); - switch (ctx.pattern[ctx.offs]) + th_regex_char cch = ctx.pattern[ctx.offs]; + DBG_RE_PRINT_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch); + switch (cch) { case '?': - // Previous token is optional (repeat 0-1 times) + case '*': + case '+': if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) goto exit; - pnode->mode = TH_RE_MATCH_COUNT; - pnode->repeatMin = 0; - pnode->repeatMax = 1; - break; - - case '*': - // Previous token can repeat 0 or more times - if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) - goto exit; + if (cch == '?') + { + // Check if previous was a count + pnode->mode = (pnode->mode == TH_RE_MATCH_COUNT_GREEDY) ? + TH_RE_MATCH_COUNT_NONGREEDY : TH_RE_MATCH_COUNT_GREEDY; - pnode->mode = TH_RE_MATCH_COUNT; - pnode->repeatMin = 0; - pnode->repeatMax = -1; - break; + // Previous token is optional (repeat 0-1 times) + pnode->repeatMin = 0; + pnode->repeatMax = 1; + } + else + { + // Check if previous was a count + if (pnode->mode == TH_RE_MATCH_COUNT_GREEDY || + pnode->mode == TH_RE_MATCH_COUNT_NONGREEDY) + { + res = THERR_INVALID_DATA; + goto exit; + } - case '+': - // Previous token must repeat 1 or more times - if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) - goto exit; + pnode->mode = TH_RE_MATCH_COUNT_GREEDY; - pnode->mode = TH_RE_MATCH_COUNT; - pnode->repeatMin = 1; - pnode->repeatMax = 1; + if (cch == '*') + { + // Previous token can repeat 0 or more times + pnode->repeatMin = 0; + pnode->repeatMax = -1; + } + else + { + // Previous token must repeat 1 or more times + pnode->repeatMin = 1; + pnode->repeatMax = -1; + } + } break; case '{': @@ -303,7 +319,7 @@ ctx.offs - start)) != THERR_OK) goto exit; - pnode->mode = TH_RE_MATCH_COUNT; + pnode->mode = TH_RE_MATCH_COUNT_GREEDY; if (th_regex_find_next(tmp, 0, &start, ',')) { @@ -493,7 +509,7 @@ th_regex_char cch; BOOL ret = FALSE; - DBG_RE_PRINT_MATCH("node_START [%s]: '%s': ", re_match_types[node->type], haystack + *offs); + DBG_RE_PRINT_MATCH(" node_START [%s]: '%s': ", re_match_types[node->type], haystack + *offs); switch (node->type) { @@ -550,7 +566,7 @@ } out: - DBG_RE_PRINT_MATCH("node_DONE [%s]: match %s\n", re_match_types[node->type], ret ? "YES" : "NO"); + DBG_RE_PRINT_MATCH(" node_DONE [%s]: match %s\n", re_match_types[node->type], ret ? "YES" : "NO"); return ret; } @@ -563,8 +579,8 @@ const th_regex_node *node = &expr->nodes[n]; size_t soffs; - DBG_RE_PRINT_MATCH(" expr %p [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ", - (void *) expr, n, expr->nnodes, re_match_modes[node->mode]); + DBG_RE_PRINT_MATCH(" expr [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ", + n, expr->nnodes, re_match_modes[node->mode]); switch (node->mode) { @@ -578,17 +594,29 @@ } break; - case TH_RE_MATCH_COUNT: + case TH_RE_MATCH_COUNT_GREEDY: + case TH_RE_MATCH_COUNT_NONGREEDY: { - BOOL done = FALSE, match = FALSE; + BOOL done = FALSE; ssize_t count = 0; DBG_RE_PRINT_MATCH("min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax); do { + BOOL match; soffs = *offs; - if ((match = th_regex_do_match_node(haystack, &soffs, node, flags))) + + match = th_regex_do_match_node(haystack, &soffs, node, flags); + for (size_t qn = n + 1; qn < expr->nnodes && haystack[soffs] != 0; qn++) + { + const th_regex_node *next = &expr->nodes[qn]; + do { + match = th_regex_do_match_node(haystack, &soffs, next, flags); + } while (haystack[soffs] != 0 && !match); + } + + if (match) { // Node matched count++; @@ -654,7 +682,7 @@ BOOL matched; size_t coffs = soffs; - DBG_RE_PRINT_MATCH("\nDO_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n", + DBG_RE_PRINT_MATCH("\nTRY_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n", soffs, haystack + soffs); if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags)))