changeset 613:2e3b81ae8c8a

More work on regexes.
author Matti Hamalainen <ccr@tnsp.org>
date Thu, 16 Jan 2020 16:01:27 +0200
parents cc9ec51b4875
children afcaf5e38f56
files tests.c th_regex.c
diffstat 2 files changed, 123 insertions(+), 48 deletions(-) [+]
line wrap: on
line diff
--- a/tests.c	Thu Jan 16 12:50:28 2020 +0200
+++ b/tests.c	Thu Jan 16 16:01:27 2020 +0200
@@ -524,7 +524,9 @@
 
     // Test ptr
     test_start(&ctx, "Test configuration string list ptr");
-    test_result(&ctx, (item = th_cfg_find(cfg, NULL, "string_list", -1)) != NULL && item->v.list == &v_str_list);
+    test_result(&ctx,
+        (item = th_cfg_find(cfg, NULL, "string_list", -1)) != NULL &&
+        item->v.list == &v_str_list);
     test_end(&ctx);
 
     // Test value finding
@@ -567,12 +569,12 @@
 } test_regex_def;
 
 
-void test_regex_list(const th_regex_char *pattern, const test_regex_def *list)
+void test_regex_list(const test_regex_def *list, const th_regex_char *pattern)
 {
     th_regex_ctx *reg = NULL;
     int res;
 
-
+    printf("========================================\n");
     printf("pattern '%s'\n", pattern);
     if ((res = th_regex_compile(&reg, pattern)) != THERR_OK)
     {
@@ -586,7 +588,8 @@
         th_regex_match_node *matches = NULL;
         size_t nmatches;
 
-        if ((res = th_regex_match(reg, def->str, &nmatches, &matches, -1, def->flags)) != THERR_OK)
+        if ((res = th_regex_match(reg, def->str,
+            &nmatches, &matches, -1, def->flags)) != THERR_OK)
         {
             THERR("Regex match returned error: %s\n",
                 th_error_str(res));
@@ -602,7 +605,7 @@
             m != NULL; m = (th_regex_match_node *) m->node.next)
         {
             char *tmp = th_strndup(def->str + m->start, m->len);
-            printf("  match [%" PRIu_SIZE_T " ++ %" PRIu_SIZE_T "]: '%s'\n",
+            printf("      match [%" PRIu_SIZE_T " ++ %" PRIu_SIZE_T "]: '%s'\n",
                 m->start, m->len, tmp);
             th_free(tmp);
         }
@@ -827,7 +830,7 @@
 #elif TH_ARCH == 64
             0xaabbccdd11223344;
 #else
-#error Unsupported TH_ARCH value.
+#    error Unsupported TH_ARCH value.
 #endif
 
         snprintf(tmp, sizeof(tmp), "%16" PRIx_SIZE_T "h", usiz);
@@ -873,20 +876,64 @@
         th_regex_ctx *reg = NULL;
         int res;
 
+#if 0
         res = th_regex_compile(&reg, "z*k+abba fabboa? k{4} [gz]{1,2} foo(bar|zoo)?");
         if (res != THERR_OK)
             printf("result: %s\n", th_error_str(res));
         th_regex_free(reg);
 
-        test_regex_def tst1[] =
+        //
+        {
+            static const test_regex_def tlist[] =
+            {
+                { "abcfoabccg"                   , 1, 0 },
+                { "abcbcfoabccg"                 , 1, 0 },
+                { "abcbcfoabccgabcbcfoabccg"     , 2, 0 },
+                { "ffdsafS abcbcfoabccg zasdf"   , 1, 0 },
+                { NULL                           , 0, 0 }
+            };
+
+            test_regex_list(tlist, "a(bc){1,2}fo[oab]*cc?g");
+        }
+
         {
-            { "abcfoabccg"        , 1, 0 },
-            { "abcbcfoabccg"      , 1, 0 },
-            { "abcbcfoabccgabcbcfoabccg"   , 2, 0 },
-            { NULL, 0, 0 }
-        };
+            static const test_regex_def tlist[] =
+            {
+                { "abcfoabccg"                   , 1, 0 },
+                { "abcbcfoabccg"                 , 1, 0 },
+                { "abcbcfoabccgabcbcfoabccg"     , 2, 0 },
+                { "ffdsafS abcbcfoabccg zasdf"   , 0, 0 },
+                { NULL                           , 0, 0 }
+            };
+
+            test_regex_list(tlist, "^a(bc){1,2}fo[oab]*cc?g");
+        }
 
-        test_regex_list("a(bc){1,2}fo[oab]*cc?g", tst1);
+        {
+            static const test_regex_def tlist[] =
+            {
+                { "cg"                           , 1, 0 },
+                { "g"                            , 1, 0 },
+                { ""                             , 0, 0 },
+                { "c"                            , 0, 0 },
+                { NULL                           , 0, 0 }
+            };
+
+            test_regex_list(tlist, "g$");
+        }
+#endif
+
+        {
+            static const test_regex_def tlist[] =
+            {
+//                { "zoobar"                       , 1, 0 },
+                { "zoo lol bar"                  , 1, 0 },
+//                { "hoho zoo lol lol bar bar"     , 1, 0 },
+                { NULL                           , 0, 0 }
+            };
+
+            test_regex_list(tlist, "zoo.*?bar");
+        }
     }
 
     //
--- a/th_regex.c	Thu Jan 16 12:50:28 2020 +0200
+++ b/th_regex.c	Thu Jan 16 16:01:27 2020 +0200
@@ -13,7 +13,7 @@
 
 //#define DBG_RE_COMPILE 1
 //#define DBG_RE_FREE 1
-//#define DBG_RE_MATCH 1
+#define DBG_RE_MATCH 1
 
 #if defined(DBG_RE_COMPILE)
 #    define DBG_RE_PRINT_COMPILE(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
@@ -35,7 +35,8 @@
 enum
 {
     TH_RE_MATCH_ONCE,
-    TH_RE_MATCH_COUNT,
+    TH_RE_MATCH_COUNT_GREEDY,
+    TH_RE_MATCH_COUNT_NONGREEDY,
     TH_RE_MATCH_ANCHOR_START,
     TH_RE_MATCH_ANCHOR_END,
 };
@@ -55,7 +56,8 @@
 static const char *re_match_modes[] =
 {
     "ONCE",
-    "COUNT",
+    "COUNT GREEDY",
+    "COUNT NONGREEDY",
     "ANCHOR_START",
     "ANCHOR_END",
 };
@@ -253,37 +255,51 @@
 
     for (; ctx.pattern[ctx.offs] != 0; ctx.offs++)
     {
-        DBG_RE_PRINT_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, ctx.pattern[ctx.offs]);
-        switch (ctx.pattern[ctx.offs])
+        th_regex_char cch = ctx.pattern[ctx.offs];
+        DBG_RE_PRINT_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch);
+        switch (cch)
         {
             case '?':
-                // Previous token is optional (repeat 0-1 times)
+            case '*':
+            case '+':
                 if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK)
                     goto exit;
 
-                pnode->mode = TH_RE_MATCH_COUNT;
-                pnode->repeatMin = 0;
-                pnode->repeatMax = 1;
-                break;
-
-            case '*':
-                // Previous token can repeat 0 or more times
-                if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK)
-                    goto exit;
+                if (cch == '?')
+                {
+                    // Check if previous was a count
+                    pnode->mode = (pnode->mode == TH_RE_MATCH_COUNT_GREEDY) ?
+                        TH_RE_MATCH_COUNT_NONGREEDY : TH_RE_MATCH_COUNT_GREEDY;
 
-                pnode->mode = TH_RE_MATCH_COUNT;
-                pnode->repeatMin = 0;
-                pnode->repeatMax = -1;
-                break;
+                    // Previous token is optional (repeat 0-1 times)
+                    pnode->repeatMin = 0;
+                    pnode->repeatMax = 1;
+                }
+                else
+                {
+                    // Check if previous was a count
+                    if (pnode->mode == TH_RE_MATCH_COUNT_GREEDY ||
+                        pnode->mode == TH_RE_MATCH_COUNT_NONGREEDY)
+                    {
+                        res = THERR_INVALID_DATA;
+                        goto exit;
+                    }
 
-            case '+':
-                // Previous token must repeat 1 or more times
-                if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK)
-                    goto exit;
+                    pnode->mode = TH_RE_MATCH_COUNT_GREEDY;
 
-                pnode->mode = TH_RE_MATCH_COUNT;
-                pnode->repeatMin = 1;
-                pnode->repeatMax = 1;
+                    if (cch == '*')
+                    {
+                        // Previous token can repeat 0 or more times
+                        pnode->repeatMin = 0;
+                        pnode->repeatMax = -1;
+                    }
+                    else
+                    {
+                        // Previous token must repeat 1 or more times
+                        pnode->repeatMin = 1;
+                        pnode->repeatMax = -1;
+                    }
+                }
                 break;
 
             case '{':
@@ -303,7 +319,7 @@
                     ctx.offs - start)) != THERR_OK)
                     goto exit;
 
-                pnode->mode = TH_RE_MATCH_COUNT;
+                pnode->mode = TH_RE_MATCH_COUNT_GREEDY;
 
                 if (th_regex_find_next(tmp, 0, &start, ','))
                 {
@@ -493,7 +509,7 @@
     th_regex_char cch;
     BOOL ret = FALSE;
 
-    DBG_RE_PRINT_MATCH("node_START [%s]: '%s': ", re_match_types[node->type], haystack + *offs);
+    DBG_RE_PRINT_MATCH("    node_START [%s]: '%s': ", re_match_types[node->type], haystack + *offs);
 
     switch (node->type)
     {
@@ -550,7 +566,7 @@
     }
 
 out:
-    DBG_RE_PRINT_MATCH("node_DONE  [%s]: match %s\n", re_match_types[node->type], ret ? "YES" : "NO");
+    DBG_RE_PRINT_MATCH("    node_DONE  [%s]: match %s\n", re_match_types[node->type], ret ? "YES" : "NO");
     return ret;
 }
 
@@ -563,8 +579,8 @@
         const th_regex_node *node = &expr->nodes[n];
         size_t soffs;
 
-        DBG_RE_PRINT_MATCH("  expr %p [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ",
-            (void *) expr, n, expr->nnodes, re_match_modes[node->mode]);
+        DBG_RE_PRINT_MATCH("  expr [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ",
+            n, expr->nnodes, re_match_modes[node->mode]);
 
         switch (node->mode)
         {
@@ -578,17 +594,29 @@
                 }
                 break;
 
-            case TH_RE_MATCH_COUNT:
+            case TH_RE_MATCH_COUNT_GREEDY:
+            case TH_RE_MATCH_COUNT_NONGREEDY:
                 {
-                    BOOL done = FALSE, match = FALSE;
+                    BOOL done = FALSE;
                     ssize_t count = 0;
 
                     DBG_RE_PRINT_MATCH("min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax);
 
                     do
                     {
+                        BOOL match;
                         soffs = *offs;
-                        if ((match = th_regex_do_match_node(haystack, &soffs, node, flags)))
+
+                        match = th_regex_do_match_node(haystack, &soffs, node, flags);
+                        for (size_t qn = n + 1; qn < expr->nnodes && haystack[soffs] != 0; qn++)
+                        {
+                            const th_regex_node *next = &expr->nodes[qn];
+                            do {
+                              match = th_regex_do_match_node(haystack, &soffs, next, flags);
+                            } while (haystack[soffs] != 0 && !match);
+                        }
+
+                        if (match)
                         {
                             // Node matched
                             count++;
@@ -654,7 +682,7 @@
         BOOL matched;
         size_t coffs = soffs;
 
-        DBG_RE_PRINT_MATCH("\nDO_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n",
+        DBG_RE_PRINT_MATCH("\nTRY_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n",
             soffs, haystack + soffs);
 
         if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags)))