# HG changeset patch # User Matti Hamalainen # Date 1580121819 -7200 # Node ID e1d27caf0dbd4fc585f16a7ba762dd0fd761b61b # Parent 4932188c91019a540d8c3da7d352f2f756a45198 More work on regex stuff. diff -r 4932188c9101 -r e1d27caf0dbd tests.c --- a/tests.c Mon Jan 27 07:51:07 2020 +0200 +++ b/tests.c Mon Jan 27 12:43:39 2020 +0200 @@ -603,7 +603,7 @@ th_regex_t *expr = NULL; int res; - printf("========================================\n"); + printf("\n========================================\n\n"); printf("Compiling pattern \"%s\"\n", pattern); if ((res = th_regex_compile(&expr, pattern)) != THERR_OK) { @@ -620,7 +620,7 @@ th_regex_match_t *matches = NULL; size_t nmatches; - printf("----------------------------------------\n"); + printf("\n----------------------------------------\n"); if ((res = th_regex_match(expr, def->str, &nmatches, &matches, -1, def->flags)) != THERR_OK) { @@ -645,7 +645,7 @@ void test_regex_list2(const test_regex_def2 *list) { - printf("========================================\n"); + printf("\n========================================\n\n"); for (const test_regex_def2 *def = list; def->str != NULL; def++) { @@ -959,21 +959,23 @@ const char *str = "z*k+abba fabboa? [a-zA-Z_-] \\{\\} k{4} ([0-9]+ yay){1,2} foo(bar|zoo)?"; th_regex_t *expr = NULL; int res = th_regex_compile(&expr, str); + printf("REGEX: \"%s\"\n", str); + if (res == THERR_OK) th_regex_dump(&testio, 1, expr); else printf("ERROR: %s\n", th_error_str(res)); + th_regex_free(expr); } -#if 0 { static const test_regex_def1 tlist[] = { - { "abcfoabccg" , 1, 0 }, - { "abcbcfoabccg" , 1, 0 }, - { "abcbcfoabccgabcbcfoabccg" , 2, 0 }, + { "abcfoabcccg" , 1, 0 }, + { "abcbcfoabcccg" , 1, 0 }, + { "abcbcfoabccg abcbcfoabccccg" , 2, 0 }, { "ffdsafS abcbcfoabccg zasdf" , 1, 0 }, { NULL , 0, 0 } }; @@ -986,7 +988,7 @@ { { "abcfoabccg" , 1, 0 }, { "abcbcfoabccg" , 1, 0 }, - { "abcbcfoabccgabcbcfoabccg" , 2, 0 }, + { "abcbcfoabccgabcbcfoabccg" , 1, 0 }, { "ffdsafS abcbcfoabccg zasdf" , 0, 0 }, { NULL , 0, 0 } }; @@ -1006,7 +1008,6 @@ test_regex_list1(tlist, "g$"); } -#endif { static const test_regex_def1 tlist[] = @@ -1019,7 +1020,7 @@ }; test_regex_list1(tlist, "zoo.*?bar"); -// test_regex_list(tlist, "zoo.*?bar"); + test_regex_list1(tlist, "zoo.*bar"); } } #endif diff -r 4932188c9101 -r e1d27caf0dbd th_regex.c --- a/th_regex.c Mon Jan 27 07:51:07 2020 +0200 +++ b/th_regex.c Mon Jan 27 12:43:39 2020 +0200 @@ -942,55 +942,87 @@ const int level ) { - size_t toffs = *offs, noffs; - BOOL res, match = FALSE; + size_t toffs = *offs; + BOOL res = FALSE, rest = FALSE; ssize_t count = 0; - if (node->repeatMin > 0) do { - noffs = toffs; - match = th_regex_match_one(haystack, &toffs, node, flags, level); - if (match) + // Attempt to match the repeated node once + size_t poffs; + BOOL matched; + + poffs = toffs; + if ((matched = th_regex_match_one(haystack, &poffs, node, flags, level))) { + // Matched, increase count count++; +// DBG_RE_PRINT("#%" PRId_SSIZE_T "\n", count); + } else - toffs = noffs; + { + // No match, backtrack + poffs = toffs; +// DBG_RE_PRINT("nope\n"); + if (rest) + break; + } - if (node->repeatMin >= 0 && - count >= node->repeatMin && - node->repeatMax > 0 && - count >= node->repeatMax) - break; + // Attempt to match rest of the expression if matched + // or if required repeats are 0 + if (matched || node->repeatMin == 0) + { + size_t qoffs = poffs; + DBG_RE_PRINT("try rest '%s'\n", haystack + qoffs); + if (th_regex_match_expr(haystack, &qoffs, expr, *nnode + 1, flags, level + 1)) + { + // Matched - } while (match && toffs > noffs); + // Check min repeats and if we are "not greedy". + if (count >= node->repeatMin && node->repeatMax == 1) + res = TRUE; + + // Check max repeats + if (node->repeatMax > 0 && count >= node->repeatMax) + res = TRUE; - if (count > 0 || node->repeatMin == 0) - { - DBG_RE_PRINT("count=%" PRId_SSIZE_T " \"%s\"\n", - count, haystack + toffs); + DBG_RE_PRINT("yes: res=%s count=%" PRId_SSIZE_T " [%" PRId_SSIZE_T " .. %" PRId_SSIZE_T "]\n", res ? "yes" : "no", count, node->repeatMin, node->repeatMax); + toffs = qoffs; + } + else + { + // Rest of expression did not match + DBG_RE_PRINT("no\n"); + toffs = poffs; + } - match = th_regex_match_expr(haystack, &toffs, expr, *nnode + 1, flags, level + 1); + rest = TRUE; + } + else + { + DBG_RE_PRINT("no match and repeatmin>0\n"); + break; + } - DBG_RE_PRINT("rest expr match=%s \"%s\"\n", - match ? "YES" : "NO", haystack + toffs); - } +// DBG_RE_PRINT("res=%d [%" PRIu_SIZE_T "='%c']\n", res, toffs, haystack[toffs]); + + } while (!res && haystack[toffs] != 0); - if (match) + // Check min repeats and if we are "not greedy". + if (count >= node->repeatMin || + (node->repeatMax > 0 && count >= node->repeatMax)) + res = TRUE; + + if (res) { *offs = toffs; *nnode = expr->nnodes; } - res = match && - ( - (node->repeatMax > 0 && count >= node->repeatMax) || - (node->repeatMin >= 0 && count >= node->repeatMin) - ); - - DBG_RE_PRINT("RESULT: match=%s, res=%s\n", - match ? "YES" : "NO", res ? "YES" : "NO"); + DBG_RE_PRINT("RESULT: %s : offs=%" PRIu_SIZE_T "='%s'\n", + res ? "YES" : "NO", + *offs, haystack + *offs); return res; }