changeset 666:e1d27caf0dbd

More work on regex stuff.
author Matti Hamalainen <ccr@tnsp.org>
date Mon, 27 Jan 2020 12:43:39 +0200
parents 4932188c9101
children 039aa00cbfbf
files tests.c th_regex.c
diffstat 2 files changed, 73 insertions(+), 40 deletions(-) [+]
line wrap: on
line diff
--- a/tests.c	Mon Jan 27 07:51:07 2020 +0200
+++ b/tests.c	Mon Jan 27 12:43:39 2020 +0200
@@ -603,7 +603,7 @@
     th_regex_t *expr = NULL;
     int res;
 
-    printf("========================================\n");
+    printf("\n========================================\n\n");
     printf("Compiling pattern \"%s\"\n", pattern);
     if ((res = th_regex_compile(&expr, pattern)) != THERR_OK)
     {
@@ -620,7 +620,7 @@
         th_regex_match_t *matches = NULL;
         size_t nmatches;
 
-        printf("----------------------------------------\n");
+        printf("\n----------------------------------------\n");
         if ((res = th_regex_match(expr, def->str,
             &nmatches, &matches, -1, def->flags)) != THERR_OK)
         {
@@ -645,7 +645,7 @@
 
 void test_regex_list2(const test_regex_def2 *list)
 {
-    printf("========================================\n");
+    printf("\n========================================\n\n");
 
     for (const test_regex_def2 *def = list; def->str != NULL; def++)
     {
@@ -959,21 +959,23 @@
             const char *str = "z*k+abba fabboa? [a-zA-Z_-] \\{\\} k{4} ([0-9]+ yay){1,2} foo(bar|zoo)?";
             th_regex_t *expr = NULL;
             int res = th_regex_compile(&expr, str);
+
             printf("REGEX: \"%s\"\n", str);
+
             if (res == THERR_OK)
                 th_regex_dump(&testio, 1, expr);
             else
                 printf("ERROR: %s\n", th_error_str(res));
+
             th_regex_free(expr);
         }
 
-#if 0
         {
             static const test_regex_def1 tlist[] =
             {
-                { "abcfoabccg"                   , 1, 0 },
-                { "abcbcfoabccg"                 , 1, 0 },
-                { "abcbcfoabccgabcbcfoabccg"     , 2, 0 },
+                { "abcfoabcccg"                  , 1, 0 },
+                { "abcbcfoabcccg"                , 1, 0 },
+                { "abcbcfoabccg abcbcfoabccccg"  , 2, 0 },
                 { "ffdsafS abcbcfoabccg zasdf"   , 1, 0 },
                 { NULL                           , 0, 0 }
             };
@@ -986,7 +988,7 @@
             {
                 { "abcfoabccg"                   , 1, 0 },
                 { "abcbcfoabccg"                 , 1, 0 },
-                { "abcbcfoabccgabcbcfoabccg"     , 2, 0 },
+                { "abcbcfoabccgabcbcfoabccg"     , 1, 0 },
                 { "ffdsafS abcbcfoabccg zasdf"   , 0, 0 },
                 { NULL                           , 0, 0 }
             };
@@ -1006,7 +1008,6 @@
 
             test_regex_list1(tlist, "g$");
         }
-#endif
 
         {
             static const test_regex_def1 tlist[] =
@@ -1019,7 +1020,7 @@
             };
 
             test_regex_list1(tlist, "zoo.*?bar");
-//            test_regex_list(tlist, "zoo.*?bar");
+            test_regex_list1(tlist, "zoo.*bar");
         }
     }
 #endif
--- a/th_regex.c	Mon Jan 27 07:51:07 2020 +0200
+++ b/th_regex.c	Mon Jan 27 12:43:39 2020 +0200
@@ -942,55 +942,87 @@
     const int level
     )
 {
-    size_t toffs = *offs, noffs;
-    BOOL res, match = FALSE;
+    size_t toffs = *offs;
+    BOOL res = FALSE, rest = FALSE;
     ssize_t count = 0;
 
-    if (node->repeatMin > 0)
     do
     {
-        noffs = toffs;
-        match = th_regex_match_one(haystack, &toffs, node, flags, level);
-        if (match)
+        // Attempt to match the repeated node once
+        size_t poffs;
+        BOOL matched;
+
+        poffs = toffs;
+        if ((matched = th_regex_match_one(haystack, &poffs, node, flags, level)))
         {
+            // Matched, increase count
             count++;
+//            DBG_RE_PRINT("#%" PRId_SSIZE_T "\n", count);
+
         }
         else
-            toffs = noffs;
+        {
+            // No match, backtrack
+            poffs = toffs;
+//            DBG_RE_PRINT("nope\n");
+            if (rest)
+                break;
+        }
 
-        if (node->repeatMin >= 0 &&
-            count >= node->repeatMin &&
-            node->repeatMax > 0 &&
-            count >= node->repeatMax)
-            break;
+        // Attempt to match rest of the expression if matched
+        // or if required repeats are 0
+        if (matched || node->repeatMin == 0)
+        {
+            size_t qoffs = poffs;
+            DBG_RE_PRINT("try rest '%s'\n", haystack + qoffs);
+            if (th_regex_match_expr(haystack, &qoffs, expr, *nnode + 1, flags, level + 1))
+            {
+                // Matched
 
-    } while (match && toffs > noffs);
+                // Check min repeats and if we are "not greedy".
+                if (count >= node->repeatMin && node->repeatMax == 1)
+                    res = TRUE;
+
+                // Check max repeats
+                if (node->repeatMax > 0 && count >= node->repeatMax)
+                    res = TRUE;
 
-    if (count > 0 || node->repeatMin == 0)
-    {
-        DBG_RE_PRINT("count=%" PRId_SSIZE_T " \"%s\"\n",
-            count, haystack + toffs);
+                DBG_RE_PRINT("yes: res=%s count=%" PRId_SSIZE_T " [%" PRId_SSIZE_T " .. %" PRId_SSIZE_T "]\n", res ? "yes" : "no", count, node->repeatMin, node->repeatMax);
+                toffs = qoffs;
+            }
+            else
+            {
+                // Rest of expression did not match
+                DBG_RE_PRINT("no\n");
+                toffs = poffs;
+            }
 
-        match = th_regex_match_expr(haystack, &toffs, expr, *nnode + 1, flags, level + 1);
+            rest = TRUE;
+        }
+        else
+        {
+            DBG_RE_PRINT("no match and repeatmin>0\n");
+            break;
+        }
 
-        DBG_RE_PRINT("rest expr match=%s \"%s\"\n",
-            match ? "YES" : "NO", haystack + toffs);
-    }
+//        DBG_RE_PRINT("res=%d [%" PRIu_SIZE_T "='%c']\n", res, toffs, haystack[toffs]);
+
+    } while (!res && haystack[toffs] != 0);
 
-    if (match)
+    // Accept if the minimum repeat count is met, or a positive maximum repeat count was reached.
+    if (count >= node->repeatMin ||
+        (node->repeatMax > 0 && count >= node->repeatMax))
+        res = TRUE;
+
+    if (res)
     {
         *offs = toffs;
         *nnode = expr->nnodes;
     }
 
-    res = match &&
-        (
-        (node->repeatMax > 0 && count >= node->repeatMax) ||
-        (node->repeatMin >= 0 && count >= node->repeatMin)
-        );
-
-    DBG_RE_PRINT("RESULT: match=%s, res=%s\n",
-        match ? "YES" : "NO", res ? "YES" : "NO");
+    DBG_RE_PRINT("RESULT: %s : offs=%" PRIu_SIZE_T "='%s'\n",
+        res ? "YES" : "NO",
+        *offs, haystack + *offs);
 
     return res;
 }