diff th_regex.c @ 613:2e3b81ae8c8a

More work on regexes.
author Matti Hamalainen <ccr@tnsp.org>
date Thu, 16 Jan 2020 16:01:27 +0200
parents cc9ec51b4875
children afcaf5e38f56
line wrap: on
line diff
--- a/th_regex.c	Thu Jan 16 12:50:28 2020 +0200
+++ b/th_regex.c	Thu Jan 16 16:01:27 2020 +0200
@@ -13,7 +13,7 @@
 
 //#define DBG_RE_COMPILE 1
 //#define DBG_RE_FREE 1
-//#define DBG_RE_MATCH 1
+#define DBG_RE_MATCH 1
 
 #if defined(DBG_RE_COMPILE)
 #    define DBG_RE_PRINT_COMPILE(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
@@ -35,7 +35,8 @@
 enum
 {
     TH_RE_MATCH_ONCE,
-    TH_RE_MATCH_COUNT,
+    TH_RE_MATCH_COUNT_GREEDY,
+    TH_RE_MATCH_COUNT_NONGREEDY,
     TH_RE_MATCH_ANCHOR_START,
     TH_RE_MATCH_ANCHOR_END,
 };
@@ -55,7 +56,8 @@
 static const char *re_match_modes[] =
 {
     "ONCE",
-    "COUNT",
+    "COUNT GREEDY",
+    "COUNT NONGREEDY",
     "ANCHOR_START",
     "ANCHOR_END",
 };
@@ -253,37 +255,51 @@
 
     for (; ctx.pattern[ctx.offs] != 0; ctx.offs++)
     {
-        DBG_RE_PRINT_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, ctx.pattern[ctx.offs]);
-        switch (ctx.pattern[ctx.offs])
+        th_regex_char cch = ctx.pattern[ctx.offs];
+        DBG_RE_PRINT_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch);
+        switch (cch)
         {
             case '?':
-                // Previous token is optional (repeat 0-1 times)
+            case '*':
+            case '+':
                 if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK)
                     goto exit;
 
-                pnode->mode = TH_RE_MATCH_COUNT;
-                pnode->repeatMin = 0;
-                pnode->repeatMax = 1;
-                break;
-
-            case '*':
-                // Previous token can repeat 0 or more times
-                if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK)
-                    goto exit;
+                // '?' after a greedy count only makes it non-greedy; its repeat bounds are kept
+                if (cch == '?' && pnode->mode == TH_RE_MATCH_COUNT_GREEDY)
+                {
+                    pnode->mode = TH_RE_MATCH_COUNT_NONGREEDY;
+                    break;
+                }
 
-                pnode->mode = TH_RE_MATCH_COUNT;
-                pnode->repeatMin = 0;
-                pnode->repeatMax = -1;
-                break;
+                // Any other quantifier stacked on an existing count is invalid
+                if (pnode->mode == TH_RE_MATCH_COUNT_GREEDY ||
+                    pnode->mode == TH_RE_MATCH_COUNT_NONGREEDY)
+                {
+                    res = THERR_INVALID_DATA;
+                    goto exit;
+                }
 
-            case '+':
-                // Previous token must repeat 1 or more times
-                if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK)
-                    goto exit;
+                pnode->mode = TH_RE_MATCH_COUNT_GREEDY;
 
-                pnode->mode = TH_RE_MATCH_COUNT;
-                pnode->repeatMin = 1;
-                pnode->repeatMax = 1;
+                if (cch == '?')
+                {
+                    // Previous token is optional (repeat 0-1 times)
+                    pnode->repeatMin = 0;
+                    pnode->repeatMax = 1;
+                }
+                else if (cch == '*')
+                {
+                    // Previous token can repeat 0 or more times
+                    pnode->repeatMin = 0;
+                    pnode->repeatMax = -1;
+                }
+                else
+                {
+                    // Previous token must repeat 1 or more times
+                    pnode->repeatMin = 1;
+                    pnode->repeatMax = -1;
+                }
                 break;
 
             case '{':
@@ -303,7 +319,7 @@
                     ctx.offs - start)) != THERR_OK)
                     goto exit;
 
-                pnode->mode = TH_RE_MATCH_COUNT;
+                pnode->mode = TH_RE_MATCH_COUNT_GREEDY;
 
                 if (th_regex_find_next(tmp, 0, &start, ','))
                 {
@@ -493,7 +509,7 @@
     th_regex_char cch;
     BOOL ret = FALSE;
 
-    DBG_RE_PRINT_MATCH("node_START [%s]: '%s': ", re_match_types[node->type], haystack + *offs);
+    DBG_RE_PRINT_MATCH("    node_START [%s]: '%s': ", re_match_types[node->type], haystack + *offs);
 
     switch (node->type)
     {
@@ -550,7 +566,7 @@
     }
 
 out:
-    DBG_RE_PRINT_MATCH("node_DONE  [%s]: match %s\n", re_match_types[node->type], ret ? "YES" : "NO");
+    DBG_RE_PRINT_MATCH("    node_DONE  [%s]: match %s\n", re_match_types[node->type], ret ? "YES" : "NO");
     return ret;
 }
 
@@ -563,8 +579,8 @@
         const th_regex_node *node = &expr->nodes[n];
         size_t soffs;
 
-        DBG_RE_PRINT_MATCH("  expr %p [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ",
-            (void *) expr, n, expr->nnodes, re_match_modes[node->mode]);
+        DBG_RE_PRINT_MATCH("  expr [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ",
+            n, expr->nnodes, re_match_modes[node->mode]);
 
         switch (node->mode)
         {
@@ -578,17 +594,29 @@
                 }
                 break;
 
-            case TH_RE_MATCH_COUNT:
+            case TH_RE_MATCH_COUNT_GREEDY:
+            case TH_RE_MATCH_COUNT_NONGREEDY:
                 {
-                    BOOL done = FALSE, match = FALSE;
+                    BOOL done = FALSE;
                     ssize_t count = 0;
 
                     DBG_RE_PRINT_MATCH("min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax);
 
                     do
                     {
+                        BOOL match;
                         soffs = *offs;
-                        if ((match = th_regex_do_match_node(haystack, &soffs, node, flags)))
+
+                        match = th_regex_do_match_node(haystack, &soffs, node, flags);
+                        for (size_t qn = n + 1; qn < expr->nnodes && haystack[soffs] != 0; qn++)
+                        {
+                            const th_regex_node *next = &expr->nodes[qn];
+                            do {
+                              match = th_regex_do_match_node(haystack, &soffs, next, flags);
+                            } while (haystack[soffs] != 0 && !match);
+                        }
+
+                        if (match)
                         {
                             // Node matched
                             count++;
@@ -654,7 +682,7 @@
         BOOL matched;
         size_t coffs = soffs;
 
-        DBG_RE_PRINT_MATCH("\nDO_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n",
+        DBG_RE_PRINT_MATCH("\nTRY_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n",
             soffs, haystack + soffs);
 
         if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags)))