changeset 610:a0e8d9c6300b

A bit more work on the regex stuff.
author Matti Hamalainen <ccr@tnsp.org>
date Thu, 16 Jan 2020 03:33:11 +0200
parents 69f1cb7f9b38
children d895b0fd6ad6
files tests.c th_regex.c th_regex.h
diffstat 3 files changed, 93 insertions(+), 31 deletions(-) [+]
line wrap: on
line diff
--- a/tests.c	Thu Jan 16 01:46:19 2020 +0200
+++ b/tests.c	Thu Jan 16 03:33:11 2020 +0200
@@ -562,8 +562,8 @@
 typedef struct
 {
     th_regex_char *str;
+    size_t nmatches;   // Match count th_regex_match() is expected to report for str
     int flags;
-    BOOL result;
 } test_regex_def;
 
 
@@ -583,16 +583,31 @@
 
     for (const test_regex_def *def = list; def->str != NULL; def++)
     {
-        BOOL matched = FALSE;
+        th_regex_match_node *matches = NULL;
+        size_t nmatches;
 
-        if ((res = th_regex_match(reg, def->str, &matched, NULL, -1, def->flags)) != THERR_OK)
+        if ((res = th_regex_match(reg, def->str, &nmatches, &matches, -1, def->flags)) != THERR_OK)
         {
             THERR("Regex match returned error: %s\n",
                 th_error_str(res));
             goto out;
         }
 
-        printf("  '%s': %s\n", def->str, matched ? "YES" : "NO");
+        printf("  '%s': matched %" PRIu_SIZE_T " time(s), testresult=%s\n",
+            def->str,
+            nmatches,
+            def->nmatches == nmatches ? "YES" : "NO");
+
+        for (th_regex_match_node *m = matches;
+            m != NULL; m = (th_regex_match_node *) m->node.next)
+        {
+            char *tmp = th_strndup(def->str + m->start, m->len);
+            printf("  match [%" PRIu_SIZE_T " ++ %" PRIu_SIZE_T "]: '%s'\n",
+                m->start, m->len, tmp);
+            th_free(tmp);
+        }
+
+        th_regex_free_matches(matches);
     }
 
 out:
@@ -865,12 +880,13 @@
 
         test_regex_def tst1[] =
         {
-            { "abcfoabccg"        , 0, TRUE },
-            { "abcbcfoabccg"      , 0, TRUE },
-            { NULL, 0, FALSE }
+            { "abcfoabccg"        , 1, 0 },
+            { "abcbcfoabccg"      , 1, 0 },
+            { "abcbcfoabccgabcbcfoabccg"   , 2, 0 },
+            { NULL, 0, 0 }
         };
 
-        test_regex_list("^a(bc){1,2}fo[oab]*cc?g", tst1);
+        test_regex_list("a(bc){1,2}fo[oab]*cc?g", tst1);
     }
 
     //
--- a/th_regex.c	Thu Jan 16 01:46:19 2020 +0200
+++ b/th_regex.c	Thu Jan 16 03:33:11 2020 +0200
@@ -636,40 +636,76 @@
 
 
 int th_regex_match(const th_regex_ctx *expr, const th_regex_char *haystack,
-    BOOL *pmatched, th_regex_match_node **pmatches, const ssize_t max,
+    size_t *pnmatches, th_regex_match_node **pmatches, const size_t maxmatches,
     const int flags)
 {
-//    th_regex_match_node *matches = NULL;
-    BOOL matched;
-    (void) pmatches;
-    (void) max;
+    size_t nmatches = 0;
+    // Report zero matches by default, also when an error is returned below
+    if (pnmatches != NULL)
+        *pnmatches = 0;
 
     // Check given pattern and string
     if (expr == NULL || haystack == NULL)
         return THERR_NULLPTR;
 
     // Start matching
-#if 0
-    size_t soffs, coffs;
-    soffs = coffs = 0;
-    while (haystack[soffs] != 0)
+    // Scan left to right; each search resumes where the previous match ended.
+    // XXX NOTE .. still to be thought through:
+    // - anchored and unanchored expressions
+    // - how to check if the expression has consumed all possibilities?
+    for (size_t soffs = 0; haystack[soffs] != 0; )
     {
+        BOOL matched;
+        size_t coffs = soffs;
+
         if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags)))
         {
+            // Store the node before counting, so count and list agree on failure
+            if (pmatches != NULL)
+            {
+                th_regex_match_node *match = th_malloc0(sizeof(th_regex_match_node));
+                if (match == NULL)
+                    return THERR_MALLOC;
+
+                match->start = soffs;
+                match->len   = coffs - soffs;
+                th_llist_append_node((th_llist_t **) pmatches, (th_llist_t *) match);
+            }
+
+            nmatches++;
+            if (pnmatches != NULL)
+                *pnmatches = nmatches;
+            // maxmatches == 0 means no limit on the number of matches
+            if (maxmatches > 0 && nmatches >= maxmatches)
+                break;
+
+            // An empty match must still advance, or the scan would never end
+            if (soffs == coffs)
+                soffs++;
+            else
+                soffs = coffs;
         }
         else
         {
+            soffs++;
         }
     }
-#else
-    size_t offs = 0;
-    matched = th_regex_do_match_expr(expr, haystack, &offs, flags);
-#endif
-
-    if (pmatched != NULL)
-        *pmatched = matched;
 
     return THERR_OK;
 }
 
+
+static void th_regex_free_match(th_llist_t *node)
+{
+    (void) node;
+    // Per-node hook for th_regex_free_matches(); nothing extra to release at the moment
+}
+
+// Release a match list from th_regex_match() by handing it to th_llist_free_func_node()
+void th_regex_free_matches(th_regex_match_node *matches)
+{
+    th_llist_free_func_node((th_llist_t *) matches,
+        th_regex_free_match);
+}
+
 #endif // TH_EXPERIMENTAL_REGEX
--- a/th_regex.h	Thu Jan 16 01:46:19 2020 +0200
+++ b/th_regex.h	Thu Jan 16 03:33:11 2020 +0200
@@ -10,7 +10,9 @@
 #ifndef TH_REGEX_H
 #define TH_REGEX_H
 
-#include "th_util.h"
+#include "th_types.h"
+#include "th_datastruct.h"
+
 
 #ifdef __cplusplus
 extern "C" {
@@ -23,6 +25,13 @@
 typedef char th_regex_char;
 
 
+enum
+{
+    TH_REF_CASEFOLD          = 0x0001, // NOTE(review): presumably case-insensitive matching -- confirm
+    TH_REF_ANCHORED          = 0x0002, // NOTE(review): presumably anchored matching -- confirm
+};
+
+
 struct th_regex_ctx;
 
 typedef struct
@@ -54,7 +63,7 @@
 
 typedef struct
 {
-//    th_llist_t node;
+    th_llist_t node;    // List linkage; first member, nodes are cast to/from th_llist_t * by users
     size_t start, len;
 } th_regex_match_node;
 
@@ -62,12 +71,13 @@
 //
 // Functions
 //
-int th_regex_compile(th_regex_ctx **pexpr, const th_regex_char *pattern);
-void th_regex_free(th_regex_ctx *expr);
+int      th_regex_compile(th_regex_ctx **pexpr, const th_regex_char *pattern);
+void     th_regex_free(th_regex_ctx *expr);
 
-int th_regex_match(const th_regex_ctx *expr, const th_regex_char *haystack,
-    BOOL *pmatched, th_regex_match_node **pmatches, const ssize_t max,
-    const int flags);
+int      th_regex_match(const th_regex_ctx *expr, const th_regex_char *haystack,
+         size_t *pnmatches, th_regex_match_node **pmatches, const size_t maxmatches,
+         const int flags);
+void     th_regex_free_matches(th_regex_match_node *matches);
 
 
 #ifdef __cplusplus