# HG changeset patch # User Matti Hamalainen # Date 1579183287 -7200 # Node ID 2e3b81ae8c8a9a68a2a73e0ce4c79f3fabb75813 # Parent cc9ec51b487502d4561e72a9b983b73996ed8dd0 More work on regexes. diff -r cc9ec51b4875 -r 2e3b81ae8c8a tests.c --- a/tests.c Thu Jan 16 12:50:28 2020 +0200 +++ b/tests.c Thu Jan 16 16:01:27 2020 +0200 @@ -524,7 +524,9 @@ // Test ptr test_start(&ctx, "Test configuration string list ptr"); - test_result(&ctx, (item = th_cfg_find(cfg, NULL, "string_list", -1)) != NULL && item->v.list == &v_str_list); + test_result(&ctx, + (item = th_cfg_find(cfg, NULL, "string_list", -1)) != NULL && + item->v.list == &v_str_list); test_end(&ctx); // Test value finding @@ -567,12 +569,12 @@ } test_regex_def; -void test_regex_list(const th_regex_char *pattern, const test_regex_def *list) +void test_regex_list(const test_regex_def *list, const th_regex_char *pattern) { th_regex_ctx *reg = NULL; int res; - + printf("========================================\n"); printf("pattern '%s'\n", pattern); if ((res = th_regex_compile(&reg, pattern)) != THERR_OK) { @@ -586,7 +588,8 @@ th_regex_match_node *matches = NULL; size_t nmatches; - if ((res = th_regex_match(reg, def->str, &nmatches, &matches, -1, def->flags)) != THERR_OK) + if ((res = th_regex_match(reg, def->str, + &nmatches, &matches, -1, def->flags)) != THERR_OK) { THERR("Regex match returned error: %s\n", th_error_str(res)); @@ -602,7 +605,7 @@ m != NULL; m = (th_regex_match_node *) m->node.next) { char *tmp = th_strndup(def->str + m->start, m->len); - printf(" match [%" PRIu_SIZE_T " ++ %" PRIu_SIZE_T "]: '%s'\n", + printf(" match [%" PRIu_SIZE_T " ++ %" PRIu_SIZE_T "]: '%s'\n", m->start, m->len, tmp); th_free(tmp); } @@ -827,7 +830,7 @@ #elif TH_ARCH == 64 0xaabbccdd11223344; #else -#error Unsupported TH_ARCH value. +# error Unsupported TH_ARCH value. 
#endif snprintf(tmp, sizeof(tmp), "%16" PRIx_SIZE_T "h", usiz); @@ -873,20 +876,64 @@ th_regex_ctx *reg = NULL; int res; +#if 0 res = th_regex_compile(&reg, "z*k+abba fabboa? k{4} [gz]{1,2} foo(bar|zoo)?"); if (res != THERR_OK) printf("result: %s\n", th_error_str(res)); th_regex_free(reg); - test_regex_def tst1[] = + // + { + static const test_regex_def tlist[] = + { + { "abcfoabccg" , 1, 0 }, + { "abcbcfoabccg" , 1, 0 }, + { "abcbcfoabccgabcbcfoabccg" , 2, 0 }, + { "ffdsafS abcbcfoabccg zasdf" , 1, 0 }, + { NULL , 0, 0 } + }; + + test_regex_list(tlist, "a(bc){1,2}fo[oab]*cc?g"); + } + { - { "abcfoabccg" , 1, 0 }, - { "abcbcfoabccg" , 1, 0 }, - { "abcbcfoabccgabcbcfoabccg" , 2, 0 }, - { NULL, 0, 0 } - }; + static const test_regex_def tlist[] = + { + { "abcfoabccg" , 1, 0 }, + { "abcbcfoabccg" , 1, 0 }, + { "abcbcfoabccgabcbcfoabccg" , 2, 0 }, + { "ffdsafS abcbcfoabccg zasdf" , 0, 0 }, + { NULL , 0, 0 } + }; + + test_regex_list(tlist, "^a(bc){1,2}fo[oab]*cc?g"); + } - test_regex_list("a(bc){1,2}fo[oab]*cc?g", tst1); + { + static const test_regex_def tlist[] = + { + { "cg" , 1, 0 }, + { "g" , 1, 0 }, + { "" , 0, 0 }, + { "c" , 0, 0 }, + { NULL , 0, 0 } + }; + + test_regex_list(tlist, "g$"); + } +#endif + + { + static const test_regex_def tlist[] = + { +// { "zoobar" , 1, 0 }, + { "zoo lol bar" , 1, 0 }, +// { "hoho zoo lol lol bar bar" , 1, 0 }, + { NULL , 0, 0 } + }; + + test_regex_list(tlist, "zoo.*?bar"); + } } // diff -r cc9ec51b4875 -r 2e3b81ae8c8a th_regex.c --- a/th_regex.c Thu Jan 16 12:50:28 2020 +0200 +++ b/th_regex.c Thu Jan 16 16:01:27 2020 +0200 @@ -13,7 +13,7 @@ //#define DBG_RE_COMPILE 1 //#define DBG_RE_FREE 1 -//#define DBG_RE_MATCH 1 +#define DBG_RE_MATCH 1 #if defined(DBG_RE_COMPILE) # define DBG_RE_PRINT_COMPILE(...) 
do { fprintf(stderr, __VA_ARGS__); } while (0) @@ -35,7 +35,8 @@ enum { TH_RE_MATCH_ONCE, - TH_RE_MATCH_COUNT, + TH_RE_MATCH_COUNT_GREEDY, + TH_RE_MATCH_COUNT_NONGREEDY, TH_RE_MATCH_ANCHOR_START, TH_RE_MATCH_ANCHOR_END, }; @@ -55,7 +56,8 @@ static const char *re_match_modes[] = { "ONCE", - "COUNT", + "COUNT GREEDY", + "COUNT NONGREEDY", "ANCHOR_START", "ANCHOR_END", }; @@ -253,37 +255,51 @@ for (; ctx.pattern[ctx.offs] != 0; ctx.offs++) { - DBG_RE_PRINT_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, ctx.pattern[ctx.offs]); - switch (ctx.pattern[ctx.offs]) + th_regex_char cch = ctx.pattern[ctx.offs]; + DBG_RE_PRINT_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch); + switch (cch) { case '?': - // Previous token is optional (repeat 0-1 times) + case '*': + case '+': if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) goto exit; - pnode->mode = TH_RE_MATCH_COUNT; - pnode->repeatMin = 0; - pnode->repeatMax = 1; - break; - - case '*': - // Previous token can repeat 0 or more times - if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) - goto exit; + if (cch == '?') + { + // Check if previous was a count + pnode->mode = (pnode->mode == TH_RE_MATCH_COUNT_GREEDY) ? 
+ TH_RE_MATCH_COUNT_NONGREEDY : TH_RE_MATCH_COUNT_GREEDY; - pnode->mode = TH_RE_MATCH_COUNT; - pnode->repeatMin = 0; - pnode->repeatMax = -1; - break; + // Previous token is optional (repeat 0-1 times) + pnode->repeatMin = 0; + pnode->repeatMax = 1; + } + else + { + // Check if previous was a count + if (pnode->mode == TH_RE_MATCH_COUNT_GREEDY || + pnode->mode == TH_RE_MATCH_COUNT_NONGREEDY) + { + res = THERR_INVALID_DATA; + goto exit; + } - case '+': - // Previous token must repeat 1 or more times - if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) - goto exit; + pnode->mode = TH_RE_MATCH_COUNT_GREEDY; - pnode->mode = TH_RE_MATCH_COUNT; - pnode->repeatMin = 1; - pnode->repeatMax = 1; + if (cch == '*') + { + // Previous token can repeat 0 or more times + pnode->repeatMin = 0; + pnode->repeatMax = -1; + } + else + { + // Previous token must repeat 1 or more times + pnode->repeatMin = 1; + pnode->repeatMax = -1; + } + } break; case '{': @@ -303,7 +319,7 @@ ctx.offs - start)) != THERR_OK) goto exit; - pnode->mode = TH_RE_MATCH_COUNT; + pnode->mode = TH_RE_MATCH_COUNT_GREEDY; if (th_regex_find_next(tmp, 0, &start, ',')) { @@ -493,7 +509,7 @@ th_regex_char cch; BOOL ret = FALSE; - DBG_RE_PRINT_MATCH("node_START [%s]: '%s': ", re_match_types[node->type], haystack + *offs); + DBG_RE_PRINT_MATCH(" node_START [%s]: '%s': ", re_match_types[node->type], haystack + *offs); switch (node->type) { @@ -550,7 +566,7 @@ } out: - DBG_RE_PRINT_MATCH("node_DONE [%s]: match %s\n", re_match_types[node->type], ret ? "YES" : "NO"); + DBG_RE_PRINT_MATCH(" node_DONE [%s]: match %s\n", re_match_types[node->type], ret ? 
"YES" : "NO"); return ret; } @@ -563,8 +579,8 @@ const th_regex_node *node = &expr->nodes[n]; size_t soffs; - DBG_RE_PRINT_MATCH(" expr %p [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ", - (void *) expr, n, expr->nnodes, re_match_modes[node->mode]); + DBG_RE_PRINT_MATCH(" expr [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ", + n, expr->nnodes, re_match_modes[node->mode]); switch (node->mode) { @@ -578,17 +594,29 @@ } break; - case TH_RE_MATCH_COUNT: + case TH_RE_MATCH_COUNT_GREEDY: + case TH_RE_MATCH_COUNT_NONGREEDY: { - BOOL done = FALSE, match = FALSE; + BOOL done = FALSE; ssize_t count = 0; DBG_RE_PRINT_MATCH("min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax); do { + BOOL match; soffs = *offs; - if ((match = th_regex_do_match_node(haystack, &soffs, node, flags))) + + match = th_regex_do_match_node(haystack, &soffs, node, flags); + for (size_t qn = n + 1; qn < expr->nnodes && haystack[soffs] != 0; qn++) + { + const th_regex_node *next = &expr->nodes[qn]; + do { + match = th_regex_do_match_node(haystack, &soffs, next, flags); + } while (haystack[soffs] != 0 && !match); + } + + if (match) { // Node matched count++; @@ -654,7 +682,7 @@ BOOL matched; size_t coffs = soffs; - DBG_RE_PRINT_MATCH("\nDO_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n", + DBG_RE_PRINT_MATCH("\nTRY_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n", soffs, haystack + soffs); if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags)))