# HG changeset patch # User Matti Hamalainen # Date 1579129740 -7200 # Node ID 566e6ef41f9d654f3fddc3fccbf34a51858f858a # Parent e789accfca8764603d45cb52ff8fdf6a0ef15f80 Initial commit of the highly experimental and unfinished regular expression parsing and matching code + unfinished test harness stuff. Not to be used. diff -r e789accfca87 -r 566e6ef41f9d Makefile.gen --- a/Makefile.gen Thu Jan 16 00:49:42 2020 +0200 +++ b/Makefile.gen Thu Jan 16 01:09:00 2020 +0200 @@ -8,6 +8,7 @@ CFLAGS += -DTH_USE_INTERNAL_SPRINTF=1 CFLAGS += -DTH_PRINTF_DEBUG=1 CFLAGS += -DTH_USE_OPT_ARG=1 +CFLAGS += -DTH_EXPERIMENTAL_REGEX=1 THLIBS = ./ @@ -17,7 +18,7 @@ THLIBS_A=$(OBJPATH)thlibs.a THLIBS_OBJ=th_util.o th_config.o th_string.o th_ioctx.o \ th_args.o th_crypto.o th_datastruct.o th_network.o \ - th_file.o + th_file.o th_regex.o TESTS_OBJ=tests.o diff -r e789accfca87 -r 566e6ef41f9d tests.c --- a/tests.c Thu Jan 16 00:49:42 2020 +0200 +++ b/tests.c Thu Jan 16 01:09:00 2020 +0200 @@ -4,6 +4,7 @@ #include "th_string.h" #include "th_crypto.h" #include "th_config.h" +#include "th_regex.h" #define SET_BUF_SIZE 128 @@ -558,6 +559,47 @@ } +typedef struct +{ + th_regex_char *str; + int flags; + BOOL result; +} test_regex_def; + + +void test_regex_list(const th_regex_char *pattern, const test_regex_def *list) +{ + th_regex_ctx *reg = NULL; + int res; + + + printf("pattern '%s'\n", pattern); + if ((res = th_regex_compile(®, pattern)) != THERR_OK) + { + THERR("Regex compilation failed: %s\n", + th_error_str(res)); + goto out; + } + + for (const test_regex_def *def = list; def->str != NULL; def++) + { + BOOL matched = FALSE; + + if ((res = th_regex_match(reg, def->str, &matched, NULL, -1, def->flags)) != THERR_OK) + { + THERR("Regex match returned error: %s\n", + th_error_str(res)); + goto out; + } + + printf(" '%s': %s\n", def->str, matched ? "YES" : "NO"); + } + +out: + th_regex_free(reg); +} + + int main(int argc, char *argv[]) { size_t i1, i2, i3, i4; @@ -809,6 +851,29 @@ } // + // Regular expressions + // + if (test_set_start("Regular expressions")) + { + th_regex_ctx *reg = NULL; + int res; + + res = th_regex_compile(®, "z*k+abba fabboa? k{4} [gz]{1,2} foo(bar|zoo)?"); + if (res != THERR_OK) + printf("result: %s\n", th_error_str(res)); + th_regex_free(reg); + + test_regex_def tst1[] = + { + { "abcfoabccg" , 0, TRUE }, + { "abcbcfoabccg" , 0, TRUE }, + { NULL, 0, FALSE } + }; + + test_regex_list("^a(bc){1,2}fo[oab]*cc?g", tst1); + } + + // // Print summary and exit // tprint(1, diff -r e789accfca87 -r 566e6ef41f9d th_regex.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/th_regex.c Thu Jan 16 01:09:00 2020 +0200 @@ -0,0 +1,673 @@ +/* + * Simple regular expression matching functionality + * Programmed and designed by Matti 'ccr' Hamalainen + * (C) Copyright 2020 Tecnic Software productions (TNSP) + * + * Please read file 'COPYING' for information on license and distribution. + */ +#ifdef TH_EXPERIMENTAL_REGEX +#include "th_util.h" +#include "th_regex.h" +//#include "th_datastruct.h" + + +//#define DBG_RE_COMPILE 1 +//#define DBG_RE_FREE 1 +//#define DBG_RE_MATCH 1 + +#if defined(DBG_RE_COMPILE) +# define DBG_RE_PRINT_COMPILE(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#else +# define DBG_RE_PRINT_COMPILE(...) do { } while (0) +#endif +#if defined(DBG_RE_FREE) +# define DBG_RE_PRINT_FREE(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#else +# define DBG_RE_PRINT_FREE(...) do { } while (0) +#endif +#if defined(DBG_RE_MATCH) +# define DBG_RE_PRINT_MATCH(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#else +# define DBG_RE_PRINT_MATCH(...) do { } while (0) +#endif + + +enum +{ + TH_RE_MATCH_ONCE, + TH_RE_MATCH_COUNT, + TH_RE_MATCH_ANCHOR_START, + TH_RE_MATCH_ANCHOR_END, +}; + + +enum +{ + TH_RE_TYPE_CHAR, + TH_RE_TYPE_ANY_CHAR, + TH_RE_TYPE_LIST, + TH_RE_TYPE_LIST_REVERSE, + TH_RE_TYPE_SUBEXPR, +}; + + +#if defined(DBG_RE_COMPILE) || defined(DBG_RE_FREE) || defined(DBG_RE_MATCH) +static const char *re_match_modes[] = +{ + "ONCE", + "COUNT", + "ANCHOR_START", + "ANCHOR_END", +}; + +static const char *re_match_types[] = +{ + "CHAR", + "ANY", + "LIST", + "LIST_REVERSE", + "SUBEXPR", +}; +#endif + + +typedef struct +{ + const th_regex_char *pattern; + size_t offs; + + th_regex_ctx *data; + + size_t nstack, stacksize; + th_regex_ctx **stack; +} th_regex_parse_ctx; + + +static void th_regex_node_init(th_regex_node *node) +{ + memset(node, 0, sizeof(th_regex_node)); + node->mode = TH_RE_MATCH_ONCE; +} + + +static int th_regex_strndup(th_regex_char **pdst, + const th_regex_char *src, const size_t len) +{ + if (pdst == NULL) + return THERR_NULLPTR; + + if (UINTPTR_MAX / sizeof(th_regex_char) < len + 1) + return THERR_BOUNDS; + + if ((*pdst = (th_regex_char *) + th_malloc((len + 1) * sizeof(th_regex_char))) == NULL) + return THERR_MALLOC; + + memcpy(*pdst, src, len * sizeof(th_regex_char)); + (*pdst)[len] = 0; + + return THERR_OK; +} + + +static int th_regex_ctx_get_prev_node( + th_regex_parse_ctx *ctx, th_regex_node **pnode) +{ + if (ctx->data != NULL && ctx->data->nnodes > 0) + { + *pnode = &ctx->data->nodes[ctx->data->nnodes - 1]; + return THERR_OK; + } + else + return THERR_INVALID_DATA; +} + + +static int th_regex_ctx_push(th_regex_parse_ctx *ctx) +{ + if (ctx->stack == NULL || ctx->nstack + 1 >= ctx->stacksize) + { + ctx->stacksize += 16; + + if ((ctx->stack = th_realloc(ctx->stack, + ctx->stacksize * sizeof(th_regex_node *))) == NULL) + return THERR_MALLOC; + } + + ctx->stack[ctx->nstack] = ctx->data; + ctx->nstack++; + ctx->data = NULL; + + return THERR_OK; +} + + +static int th_regex_ctx_pop(th_regex_parse_ctx *ctx, th_regex_ctx **data) +{ + if (ctx->nstack > 0) + { + *data = ctx->data; + ctx->nstack--; + ctx->data = ctx->stack[ctx->nstack]; + return THERR_OK; + } + else + return THERR_INVALID_DATA; +} + + +static int th_regex_ctx_node_commit(th_regex_parse_ctx *ctx, th_regex_node *node) +{ + th_regex_ctx *data = ctx->data; + + if (data == NULL) + { + if ((data = ctx->data = th_malloc0(sizeof(th_regex_ctx))) == NULL) + return THERR_MALLOC; + } + + if (data->nodes == NULL || data->nnodes + 1 >= data->nodessize) + { + data->nodessize += 16; + if ((data->nodes = th_realloc(data->nodes, + data->nodessize * sizeof(th_regex_node))) == NULL) + return THERR_MALLOC; + } + + memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node)); + data->nnodes++; + + DBG_RE_PRINT_COMPILE( + "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " + "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", + data->nnodes, data->nodessize, + node->mode, node->type, + node->repeatMin, node->repeatMax); + + return THERR_OK; +} + + +static BOOL th_regex_find_next(const th_regex_char *str, + const size_t start, size_t *offs, + const th_regex_char delim) +{ + for (*offs = start; str[*offs] != 0; (*offs)++) + { + if (str[*offs] == delim) + return TRUE; + } + return FALSE; +} + + +static BOOL th_regex_parse_ssize_t(const th_regex_char *str, + ssize_t *value) +{ + th_regex_char ch; + BOOL neg; + + if (*str == '-') + { + str++; + neg = TRUE; + } + else + neg = FALSE; + + // Is the value negative? + while ((ch = *str++)) + { + if (ch >= '0' && ch <= '9') + { + *value *= 10; + *value += ch - '0'; + } + else + return FALSE; + } + + if (neg) + *value = -(*value); + + return TRUE; +} + + +static int th_regex_compile_do(th_regex_parse_ctx *ctx) +{ + int res = THERR_OK; + th_regex_node node, *pnode; + th_regex_char *tmp = NULL; + size_t start; + + for (; ctx->pattern[ctx->offs] != 0; ctx->offs++) + { + DBG_RE_PRINT_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx->offs, ctx->pattern[ctx->offs]); + switch (ctx->pattern[ctx->offs]) + { + case '?': + // Previous token is optional (repeat 0-1 times) + if ((res = th_regex_ctx_get_prev_node(ctx, &pnode)) != THERR_OK) + goto exit; + + pnode->mode = TH_RE_MATCH_COUNT; + pnode->repeatMin = 0; + pnode->repeatMax = 1; + break; + + case '*': + // Previous token can repeat 0 or more times + if ((res = th_regex_ctx_get_prev_node(ctx, &pnode)) != THERR_OK) + goto exit; + + pnode->mode = TH_RE_MATCH_COUNT; + pnode->repeatMin = 0; + pnode->repeatMax = -1; + break; + + case '+': + // Previous token must repeat 1 or more times + if ((res = th_regex_ctx_get_prev_node(ctx, &pnode)) != THERR_OK) + goto exit; + + pnode->mode = TH_RE_MATCH_COUNT; + pnode->repeatMin = 1; + pnode->repeatMax = 1; + break; + + case '{': + // {n} | {min,max} + start = ctx->offs + 1; + if (!th_regex_find_next(ctx->pattern, start, &ctx->offs, '}')) + { + // End not found + res = THERR_INVALID_DATA; + goto exit; + } + + th_free(tmp); + + if ((res = th_regex_ctx_get_prev_node(ctx, &pnode)) != THERR_OK || + (res = th_regex_strndup(&tmp, ctx->pattern + start, + ctx->offs - start)) != THERR_OK) + goto exit; + + pnode->mode = TH_RE_MATCH_COUNT; + + if (th_regex_find_next(tmp, 0, &start, ',')) + { + tmp[start] = 0; + if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin) || + !th_regex_parse_ssize_t(tmp + start + 1, &pnode->repeatMax)) + { + res = THERR_INVALID_DATA; + goto exit; + } + } + else + { + if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin)) + { + res = THERR_INVALID_DATA; + goto exit; + } + pnode->repeatMax = pnode->repeatMin; + } + + if (pnode->repeatMin < 0 || pnode->repeatMax < 1 || + pnode->repeatMax < pnode->repeatMin) + { + // Invalid repeat counts + res = THERR_INVALID_DATA; + goto exit; + } + break; + + case '(': + // Start of subpattern + if ((res = th_regex_ctx_push(ctx)) != THERR_OK) + goto exit; + break; + + case ')': + // End of subpattern + th_regex_node_init(&node); + node.type = TH_RE_TYPE_SUBEXPR; + + if ((res = th_regex_ctx_pop(ctx, &node.match.expr)) != THERR_OK || + (res = th_regex_ctx_node_commit(ctx, &node)) != THERR_OK) + goto exit; + break; + + case '^': + // Start of line anchor + th_regex_node_init(&node); + node.mode = TH_RE_MATCH_ANCHOR_START; + + if ((res = th_regex_ctx_node_commit(ctx, &node)) != THERR_OK) + goto exit; + break; + + case '$': + // End of line anchor + th_regex_node_init(&node); + node.mode = TH_RE_MATCH_ANCHOR_END; + + if ((res = th_regex_ctx_node_commit(ctx, &node)) != THERR_OK) + goto exit; + break; + + case '[': + // Start of char list + start = ctx->offs + 1; + if (!th_regex_find_next(ctx->pattern, start, &ctx->offs, ']') || + ctx->offs == start) + { + res = THERR_INVALID_DATA; + goto exit; + } + + th_regex_node_init(&node); + node.type = (ctx->pattern[start] == '^') ? + TH_RE_TYPE_LIST_REVERSE : TH_RE_TYPE_LIST; + node.match.list.nchars = ctx->offs - start; + if ((res = th_regex_strndup(&node.match.list.chars, + ctx->pattern + start, node.match.list.nchars)) != THERR_OK) + goto exit; + + if ((res = th_regex_ctx_node_commit(ctx, &node)) != THERR_OK) + goto exit; + break; + + case '.': + // Any single character matches + th_regex_node_init(&node); + node.type = TH_RE_TYPE_ANY_CHAR; + + if ((res = th_regex_ctx_node_commit(ctx, &node)) != THERR_OK) + goto exit; + break; + + case '\\': + // Literal escape + ctx->offs++; + if (ctx->pattern[ctx->offs] == 0) + { + // End of pattern, error + res = THERR_INVALID_DATA; + goto exit; + } + + th_regex_node_init(&node); + node.type = TH_RE_TYPE_CHAR; + node.match.chr = ctx->pattern[ctx->offs]; + + if ((res = th_regex_ctx_node_commit(ctx, &node)) != THERR_OK) + goto exit; + break; + + default: + // Given character must match + th_regex_node_init(&node); + node.type = TH_RE_TYPE_CHAR; + node.match.chr = ctx->pattern[ctx->offs]; + + if ((res = th_regex_ctx_node_commit(ctx, &node)) != THERR_OK) + goto exit; + break; + } + } + +exit: + th_free(tmp); + return res; +} + + +int th_regex_compile(th_regex_ctx **pexpr, const th_regex_char *pattern) +{ + th_regex_parse_ctx ctx; + int res = THERR_OK; + + if (pexpr == NULL || pattern == NULL) + return THERR_NULLPTR; + + memset(&ctx, 0, sizeof(ctx)); + ctx.pattern = pattern; + res = th_regex_compile_do(&ctx); + *pexpr = ctx.data; + + return res; +} + + +void th_regex_free(th_regex_ctx *expr) +{ + if (expr != NULL) + { + for (size_t n = 0; n < expr->nnodes; n++) + { + th_regex_node *node = &expr->nodes[n]; + + DBG_RE_PRINT_FREE( + "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " + "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", + n, expr->nnodes, + node->mode, node->type, + node->repeatMin, node->repeatMax); + + switch (node->type) + { + case TH_RE_TYPE_SUBEXPR: + DBG_RE_PRINT_FREE(" SUBEXPR: %p vs %p\n", expr, node->match.expr); + th_regex_free(node->match.expr); + break; + + case TH_RE_TYPE_LIST: + case TH_RE_TYPE_LIST_REVERSE: + DBG_RE_PRINT_FREE(" list='%s'\n", node->match.list.chars); + th_free(node->match.list.chars); + break; + +#ifdef DBG_RE_FREE + case TH_RE_TYPE_ANY_CHAR: + DBG_RE_PRINT_FREE(" any char\n"); + break; + + case TH_RE_TYPE_CHAR: + DBG_RE_PRINT_FREE(" char='%c'\n", node->match.chr); + break; +#endif + } + } + + th_free(expr->nodes); + } +} + + +static BOOL th_regex_do_match_expr(const th_regex_ctx *expr, + const th_regex_char *haystack, size_t *offs, const int flags); + + +static BOOL th_regex_do_match_node(const th_regex_char *haystack, + size_t *offs, const th_regex_node *node, const int flags) +{ + th_regex_char cch; + BOOL ret = FALSE; + + DBG_RE_PRINT_MATCH("node_START [%s]: '%s': ", re_match_types[node->type], haystack + *offs); + + switch (node->type) + { + case TH_RE_TYPE_SUBEXPR: + DBG_RE_PRINT_MATCH("subexpr ..\n"); + ret = th_regex_do_match_expr(node->match.expr, haystack, offs, flags); + break; + + case TH_RE_TYPE_LIST: + case TH_RE_TYPE_LIST_REVERSE: + DBG_RE_PRINT_MATCH("[%s]\n", node->match.list.chars); + ret = FALSE; + if ((cch = haystack[*offs]) == 0) + goto out; + + // Could be optimized, perhaps .. sort match.chars, binary search etc? + // XXX TODO Ranges and escapes are not supported yet + for (size_t n = 0; n < node->match.list.nchars; n++) + if (node->match.list.chars[n] == cch) + { + ret = TRUE; + break; + } + + if (node->type == TH_RE_TYPE_LIST_REVERSE) + ret = !ret; + + (*offs)++; + break; + + case TH_RE_TYPE_ANY_CHAR: + DBG_RE_PRINT_MATCH("\n"); + if ((cch = haystack[*offs]) == 0) + { + ret = FALSE; + goto out; + } + + ret = TRUE; + (*offs)++; + break; + + case TH_RE_TYPE_CHAR: + DBG_RE_PRINT_MATCH("'%c'\n", node->match.chr); + if ((cch = haystack[*offs]) == 0) + { + ret = FALSE; + goto out; + } + + ret = (cch == node->match.chr); + (*offs)++; + break; + } + +out: + DBG_RE_PRINT_MATCH("node_DONE [%s]: match %s\n", re_match_types[node->type], ret ? "YES" : "NO"); + return ret; +} + + +static BOOL th_regex_do_match_expr(const th_regex_ctx *expr, + const th_regex_char *haystack, size_t *offs, const int flags) +{ + for (size_t n = 0; n < expr->nnodes; n++) + { + const th_regex_node *node = &expr->nodes[n]; + size_t soffs; + + DBG_RE_PRINT_MATCH("expr %p [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ", + (void *) expr, n, expr->nnodes, re_match_modes[node->mode]); + + switch (node->mode) + { + case TH_RE_MATCH_ONCE: + { + DBG_RE_PRINT_MATCH("\n"); + soffs = *offs; + if (!th_regex_do_match_node(haystack, &soffs, node, flags)) + return FALSE; + *offs = soffs; + } + break; + + case TH_RE_MATCH_COUNT: + { + BOOL done = FALSE, match = FALSE; + ssize_t count = 0; + + DBG_RE_PRINT_MATCH("min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax); + + do + { + soffs = *offs; + if ((match = th_regex_do_match_node(haystack, &soffs, node, flags))) + { + // Node matched + count++; + done = (node->repeatMax > 0 && count >= node->repeatMax); + *offs = soffs; + } + else + { + // Node did not match, check if we got the minimum if set + done = (node->repeatMin >= 0 && count >= node->repeatMin); + soffs = *offs; + break; + } + } while (!done); + + DBG_RE_PRINT_MATCH("RESULT: %" PRId_SSIZE_T " = %s\n", count, done ? "YES" : "NO"); + + if (!done) + return FALSE; + + *offs = soffs; + } + break; + + case TH_RE_MATCH_ANCHOR_START: + DBG_RE_PRINT_MATCH("offs=%" PRIu_SIZE_T "\n", *offs); + if (*offs != 0) + return FALSE; + break; + + case TH_RE_MATCH_ANCHOR_END: + DBG_RE_PRINT_MATCH("is end=%d\n", haystack[*offs]); + if (haystack[*offs] != 0) + return FALSE; + break; + } + } + + return TRUE; +} + + +int th_regex_match(const th_regex_ctx *expr, const th_regex_char *haystack, + BOOL *pmatched, th_regex_match_node **pmatches, const ssize_t max, + const int flags) +{ + th_regex_match_node *matches = NULL; + size_t soffs, coffs; + BOOL matched; + + // Check given pattern and string + if (expr == NULL || haystack == NULL) + return THERR_NULLPTR; + + // Start matching + soffs = coffs = 0; + +#if 0 + while (haystack[soffs] != 0) + { + if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags))) + { + } + else + { + } + } +#else + matched = th_regex_do_match_expr(expr, haystack, &coffs, flags); +#endif + + if (pmatched != NULL) + *pmatched = matched; + + return THERR_OK; +} + +#endif // TH_EXPERIMENTAL_REGEX diff -r e789accfca87 -r 566e6ef41f9d th_regex.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/th_regex.h Thu Jan 16 01:09:00 2020 +0200 @@ -0,0 +1,78 @@ +/* + * Simple regular expression matching functionality + * Programmed and designed by Matti 'ccr' Hamalainen + * (C) Copyright 2020 Tecnic Software productions (TNSP) + * + * Please read file 'COPYING' for information on license and distribution. + */ +#ifdef TH_EXPERIMENTAL_REGEX + +#ifndef TH_REGEX_H +#define TH_REGEX_H + +#include "th_util.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// +// Definitions +// +// Character type +typedef char th_regex_char; + + +struct th_regex_ctx; + +typedef struct +{ + int mode, type; + ssize_t repeatMin, repeatMax; + + union + { + th_regex_char chr; + + struct + { + size_t nchars; + th_regex_char *chars; + } list; + + struct th_regex_ctx *expr; + } match; +} th_regex_node; + + +typedef struct th_regex_ctx +{ + size_t nnodes, nodessize; + th_regex_node *nodes; +} th_regex_ctx; + + +typedef struct +{ +// th_llist_t node; + size_t start, len; +} th_regex_match_node; + + +// +// Functions +// +int th_regex_compile(th_regex_ctx **pexpr, const th_regex_char *pattern); +void th_regex_free(th_regex_ctx *expr); + +int th_regex_match(const th_regex_ctx *expr, const th_regex_char *haystack, + BOOL *pmatched, th_regex_match_node **pmatches, const ssize_t max, + const int flags); + + +#ifdef __cplusplus +} +#endif +#endif // TH_REGEX_H + +#endif // TH_EXPERIMENTAL_REGEX