Mercurial > hg > th-libs
view th_regex.c @ 635:d191ded8a790
Improve the experimental regex matching debugging macros.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Tue, 21 Jan 2020 07:16:18 +0200 |
parents | afcaf5e38f56 |
children | c4bca120bfb0 |
line wrap: on
line source
/* * Simple regular expression matching functionality * Programmed and designed by Matti 'ccr' Hamalainen * (C) Copyright 2020 Tecnic Software productions (TNSP) * * Please read file 'COPYING' for information on license and distribution. */ #include "th_util.h" #include "th_regex.h" #ifdef TH_EXPERIMENTAL_REGEX_DEBUG int th_dbg_re_flags = 0; # define DBG_RE_COMPILE(...) do { if (th_dbg_re_flags & TH_DBG_RE_COMPILE) fprintf(stderr, __VA_ARGS__); } while (0) # define DBG_RE_FREE(...) do { if (th_dbg_re_flags & TH_DBG_RE_FREE) fprintf(stderr, __VA_ARGS__); } while (0) # define DBG_RE_MATCH(...) do { if (th_dbg_re_flags & TH_DBG_RE_MATCH) fprintf(stderr, __VA_ARGS__); } while (0) #else # define DBG_RE_COMPILE(...) # define DBG_RE_FREE(...) # define DBG_RE_MATCH(...) #endif #ifdef TH_EXPERIMENTAL_REGEX enum { TH_RE_MATCH_ONCE, TH_RE_MATCH_COUNT_GREEDY, TH_RE_MATCH_COUNT_NONGREEDY, TH_RE_MATCH_ANCHOR_START, TH_RE_MATCH_ANCHOR_END, }; enum { TH_RE_TYPE_CHAR, TH_RE_TYPE_ANY_CHAR, TH_RE_TYPE_LIST, TH_RE_TYPE_LIST_REVERSE, TH_RE_TYPE_SUBEXPR, }; #ifdef TH_EXPERIMENTAL_REGEX_DEBUG static const char *re_match_modes[] = { "ONCE", "COUNT GREEDY", "COUNT NONGREEDY", "ANCHOR_START", "ANCHOR_END", }; static const char *re_match_types[] = { "CHAR", "ANY", "LIST", "LIST_REVERSE", "SUBEXPR", }; #endif typedef struct { const th_regex_char *pattern; size_t offs; th_regex_ctx *data; size_t nstack, stacksize; th_regex_ctx **stack; } th_regex_parse_ctx; static void th_regex_node_init(th_regex_node *node) { memset(node, 0, sizeof(th_regex_node)); node->mode = TH_RE_MATCH_ONCE; } static int th_regex_strndup(th_regex_char **pdst, const th_regex_char *src, const size_t len) { if (pdst == NULL) return THERR_NULLPTR; if (UINTPTR_MAX / sizeof(th_regex_char) < len + 1) return THERR_BOUNDS; if ((*pdst = (th_regex_char *) th_malloc((len + 1) * sizeof(th_regex_char))) == NULL) return THERR_MALLOC; memcpy(*pdst, src, len * sizeof(th_regex_char)); (*pdst)[len] = 0; return THERR_OK; } static int th_regex_ctx_get_prev_node( th_regex_parse_ctx *ctx, th_regex_node **pnode) { if (ctx->data != NULL && ctx->data->nnodes > 0) { *pnode = &ctx->data->nodes[ctx->data->nnodes - 1]; return THERR_OK; } else return THERR_INVALID_DATA; } static int th_regex_ctx_push(th_regex_parse_ctx *ctx) { if (ctx->stack == NULL || ctx->nstack + 1 >= ctx->stacksize) { ctx->stacksize += 16; if ((ctx->stack = th_realloc(ctx->stack, ctx->stacksize * sizeof(th_regex_node *))) == NULL) return THERR_MALLOC; } ctx->stack[ctx->nstack] = ctx->data; ctx->nstack++; ctx->data = NULL; return THERR_OK; } static int th_regex_ctx_pop(th_regex_parse_ctx *ctx, th_regex_ctx **data) { if (ctx->nstack > 0) { *data = ctx->data; ctx->nstack--; ctx->data = ctx->stack[ctx->nstack]; return THERR_OK; } else return THERR_INVALID_DATA; } static int th_regex_ctx_node_commit(th_regex_parse_ctx *ctx, th_regex_node *node) { th_regex_ctx *data = ctx->data; if (data == NULL) { if ((data = ctx->data = th_malloc0(sizeof(th_regex_ctx))) == NULL) return THERR_MALLOC; } if (data->nodes == NULL || data->nnodes + 1 >= data->nodessize) { data->nodessize += 16; if ((data->nodes = th_realloc(data->nodes, data->nodessize * sizeof(th_regex_node))) == NULL) return THERR_MALLOC; } memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node)); data->nnodes++; DBG_RE_COMPILE( "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", data->nnodes, data->nodessize, node->mode, node->type, node->repeatMin, node->repeatMax); return THERR_OK; } static BOOL th_regex_find_next(const th_regex_char *str, const size_t start, size_t *offs, const th_regex_char delim) { for (*offs = start; str[*offs] != 0; (*offs)++) { if (str[*offs] == delim) return TRUE; } return FALSE; } static BOOL th_regex_parse_ssize_t(const th_regex_char *str, ssize_t *value) { th_regex_char ch; BOOL neg; if (*str == '-') { str++; neg = TRUE; } else neg = FALSE; // Is the value negative? while ((ch = *str++)) { if (ch >= '0' && ch <= '9') { *value *= 10; *value += ch - '0'; } else return FALSE; } if (neg) *value = -(*value); return TRUE; } int th_regex_compile(th_regex_ctx **pexpr, const th_regex_char *pattern) { int res = THERR_OK; th_regex_parse_ctx ctx; th_regex_node node, *pnode; th_regex_char *tmp = NULL; size_t start; if (pexpr == NULL || pattern == NULL) { res = THERR_NULLPTR; goto exit; } memset(&ctx, 0, sizeof(ctx)); ctx.pattern = pattern; for (; ctx.pattern[ctx.offs] != 0; ctx.offs++) { th_regex_char cch = ctx.pattern[ctx.offs]; DBG_RE_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch); switch (cch) { case '?': case '*': case '+': if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) goto exit; if (cch == '?') { // Check if previous was a count pnode->mode = (pnode->mode == TH_RE_MATCH_COUNT_GREEDY) ? TH_RE_MATCH_COUNT_NONGREEDY : TH_RE_MATCH_COUNT_GREEDY; // Previous token is optional (repeat 0-1 times) pnode->repeatMin = 0; pnode->repeatMax = 1; } else { // Check if previous was a count if (pnode->mode == TH_RE_MATCH_COUNT_GREEDY || pnode->mode == TH_RE_MATCH_COUNT_NONGREEDY) { res = THERR_INVALID_DATA; goto exit; } pnode->mode = TH_RE_MATCH_COUNT_GREEDY; if (cch == '*') { // Previous token can repeat 0 or more times pnode->repeatMin = 0; pnode->repeatMax = -1; } else { // Previous token must repeat 1 or more times pnode->repeatMin = 1; pnode->repeatMax = -1; } } break; case '{': // {n} | {min,max} start = ctx.offs + 1; if (!th_regex_find_next(ctx.pattern, start, &ctx.offs, '}')) { // End not found res = THERR_INVALID_DATA; goto exit; } th_free(tmp); if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK || (res = th_regex_strndup(&tmp, ctx.pattern + start, ctx.offs - start)) != THERR_OK) goto exit; pnode->mode = TH_RE_MATCH_COUNT_GREEDY; if (th_regex_find_next(tmp, 0, &start, ',')) { tmp[start] = 0; if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin) || !th_regex_parse_ssize_t(tmp + start + 1, &pnode->repeatMax)) { res = THERR_INVALID_DATA; goto exit; } } else { if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin)) { res = THERR_INVALID_DATA; goto exit; } pnode->repeatMax = pnode->repeatMin; } if (pnode->repeatMin < 0 || pnode->repeatMax < 1 || pnode->repeatMax < pnode->repeatMin) { // Invalid repeat counts res = THERR_INVALID_DATA; goto exit; } break; case '(': // Start of subpattern if ((res = th_regex_ctx_push(&ctx)) != THERR_OK) goto exit; break; case ')': // End of subpattern th_regex_node_init(&node); node.type = TH_RE_TYPE_SUBEXPR; if ((res = th_regex_ctx_pop(&ctx, &node.match.expr)) != THERR_OK || (res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) goto exit; break; case '^': // Start of line anchor th_regex_node_init(&node); node.mode = TH_RE_MATCH_ANCHOR_START; if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) goto exit; break; case '$': // End of line anchor th_regex_node_init(&node); node.mode = TH_RE_MATCH_ANCHOR_END; if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) goto exit; break; case '[': // Start of char list start = ctx.offs + 1; if (!th_regex_find_next(ctx.pattern, start, &ctx.offs, ']') || ctx.offs == start) { res = THERR_INVALID_DATA; goto exit; } th_regex_node_init(&node); node.type = (ctx.pattern[start] == '^') ? TH_RE_TYPE_LIST_REVERSE : TH_RE_TYPE_LIST; node.match.list.nchars = ctx.offs - start; if ((res = th_regex_strndup(&node.match.list.chars, ctx.pattern + start, node.match.list.nchars)) != THERR_OK) goto exit; if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) goto exit; break; case '.': // Any single character matches th_regex_node_init(&node); node.type = TH_RE_TYPE_ANY_CHAR; if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) goto exit; break; case '\\': // Literal escape ctx.offs++; if (ctx.pattern[ctx.offs] == 0) { // End of pattern, error res = THERR_INVALID_DATA; goto exit; } th_regex_node_init(&node); node.type = TH_RE_TYPE_CHAR; node.match.chr = ctx.pattern[ctx.offs]; if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) goto exit; break; default: // Given character must match th_regex_node_init(&node); node.type = TH_RE_TYPE_CHAR; node.match.chr = ctx.pattern[ctx.offs]; if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) goto exit; break; } } exit: *pexpr = ctx.data; th_free(tmp); return res; } void th_regex_free(th_regex_ctx *expr) { if (expr != NULL) { for (size_t n = 0; n < expr->nnodes; n++) { th_regex_node *node = &expr->nodes[n]; DBG_RE_FREE( "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", n, expr->nnodes, node->mode, node->type, node->repeatMin, node->repeatMax); switch (node->type) { case TH_RE_TYPE_SUBEXPR: DBG_RE_FREE(" SUBEXPR: %p vs %p\n", (void *) expr, (void *) node->match.expr); th_regex_free(node->match.expr); break; case TH_RE_TYPE_LIST: case TH_RE_TYPE_LIST_REVERSE: DBG_RE_FREE(" list='%s'\n", node->match.list.chars); th_free(node->match.list.chars); break; #ifdef TH_EXPERIMENTAL_REGEX_DEBUG case TH_RE_TYPE_ANY_CHAR: DBG_RE_FREE(" any char\n"); break; case TH_RE_TYPE_CHAR: DBG_RE_FREE(" char='%c'\n", node->match.chr); break; #endif } } th_free(expr->nodes); } } static BOOL th_regex_do_match_expr(const th_regex_ctx *expr, const th_regex_char *haystack, size_t *offs, const int flags); static BOOL th_regex_do_match_node(const th_regex_char *haystack, size_t *offs, const th_regex_node *node, const int flags) { th_regex_char cch; BOOL ret = FALSE; DBG_RE_MATCH(" node_START [%s]: '%s': ", re_match_types[node->type], haystack + *offs); switch (node->type) { case TH_RE_TYPE_SUBEXPR: DBG_RE_MATCH("subexpr ..\n"); ret = th_regex_do_match_expr(node->match.expr, haystack, offs, flags); break; case TH_RE_TYPE_LIST: case TH_RE_TYPE_LIST_REVERSE: DBG_RE_MATCH("[%s]\n", node->match.list.chars); ret = FALSE; if ((cch = haystack[*offs]) == 0) goto out; // Could be optimized, perhaps .. sort match.chars, binary search etc? // XXX TODO Ranges and escapes are not supported yet for (size_t n = 0; n < node->match.list.nchars; n++) if (node->match.list.chars[n] == cch) { ret = TRUE; break; } if (node->type == TH_RE_TYPE_LIST_REVERSE) ret = !ret; (*offs)++; break; case TH_RE_TYPE_ANY_CHAR: DBG_RE_MATCH("\n"); if ((cch = haystack[*offs]) == 0) { ret = FALSE; goto out; } ret = TRUE; (*offs)++; break; case TH_RE_TYPE_CHAR: DBG_RE_MATCH("'%c'\n", node->match.chr); if ((cch = haystack[*offs]) == 0) { ret = FALSE; goto out; } ret = (cch == node->match.chr); (*offs)++; break; } out: DBG_RE_MATCH(" node_DONE [%s]: match %s\n", re_match_types[node->type], ret ? "YES" : "NO"); return ret; } static BOOL th_regex_do_match_expr(const th_regex_ctx *expr, const th_regex_char *haystack, size_t *offs, const int flags) { for (size_t n = 0; n < expr->nnodes; n++) { const th_regex_node *node = &expr->nodes[n]; size_t soffs; DBG_RE_MATCH(" expr [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ", n, expr->nnodes, re_match_modes[node->mode]); switch (node->mode) { case TH_RE_MATCH_ONCE: { DBG_RE_MATCH("\n"); soffs = *offs; if (!th_regex_do_match_node(haystack, &soffs, node, flags)) return FALSE; *offs = soffs; } break; case TH_RE_MATCH_COUNT_GREEDY: case TH_RE_MATCH_COUNT_NONGREEDY: { BOOL done = FALSE; ssize_t count = 0; DBG_RE_MATCH("min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax); do { BOOL match; soffs = *offs; match = th_regex_do_match_node(haystack, &soffs, node, flags); for (size_t qn = n + 1; qn < expr->nnodes && haystack[soffs] != 0; qn++) { const th_regex_node *next = &expr->nodes[qn]; do { match = th_regex_do_match_node(haystack, &soffs, next, flags); } while (haystack[soffs] != 0 && !match); } if (match) { // Node matched count++; done = (node->repeatMax > 0 && count >= node->repeatMax); *offs = soffs; } else { // Node did not match, check if we got the minimum if set done = (node->repeatMin >= 0 && count >= node->repeatMin); soffs = *offs; break; } } while (!done); DBG_RE_MATCH("RESULT: %" PRId_SSIZE_T " = %s\n", count, done ? "YES" : "NO"); if (!done) return FALSE; *offs = soffs; } break; case TH_RE_MATCH_ANCHOR_START: DBG_RE_MATCH("offs=%" PRIu_SIZE_T "\n", *offs); if (*offs != 0) return FALSE; break; case TH_RE_MATCH_ANCHOR_END: DBG_RE_MATCH("is end=%d\n", haystack[*offs]); if (haystack[*offs] != 0) return FALSE; break; } } return TRUE; } int th_regex_match(const th_regex_ctx *expr, const th_regex_char *haystack, size_t *pnmatches, th_regex_match_node **pmatches, const size_t maxmatches, const int flags) { size_t nmatches = 0; if (pnmatches != NULL) *pnmatches = 0; // Check given pattern and string if (expr == NULL || haystack == NULL) return THERR_NULLPTR; // Start matching // XXX NOTE .. lots to think about and to take into account: // - anchored and unanchored expressions // - how to check if the expression has consumed all possibilities? // .. for (size_t soffs = 0; haystack[soffs] != 0; ) { BOOL matched; size_t coffs = soffs; DBG_RE_MATCH("\nTRY_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n", soffs, haystack + soffs); if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags))) { // A match was found, increase count nmatches++; // Deliver to caller if required if (pnmatches != NULL) *pnmatches = nmatches; if (pmatches != NULL) { th_regex_match_node *match = th_malloc0(sizeof(th_regex_match_node)); if (match == NULL) return THERR_MALLOC; match->start = soffs; match->len = coffs - soffs; th_llist_append_node((th_llist_t **) pmatches, (th_llist_t *) match); } // Check match count limit, if set if (maxmatches > 0 && nmatches >= maxmatches) break; // If offset was not advanced, increase by one // otherwise use end of match offset as new start if (soffs == coffs) soffs++; else soffs = coffs; } else { soffs++; } } return THERR_OK; } static void th_regex_free_match(th_regex_match_node *node) { (void) node; // Nothing to do here at the moment } void th_regex_free_matches(th_regex_match_node *matches) { th_llist_free_func_node((th_llist_t *) matches, (void (*)(th_llist_t *)) th_regex_free_match); } #endif // TH_EXPERIMENTAL_REGEX