view th_regex.c @ 639:8c957ad9d4c3

Some more work on regex stuff.
author Matti Hamalainen <ccr@tnsp.org>
date Thu, 23 Jan 2020 11:38:28 +0200
parents c4bca120bfb0
children 9e1f9e1d1487
line wrap: on
line source

/*
 * Simple regular expression matching functionality
 * Programmed and designed by Matti 'ccr' Hamalainen
 * (C) Copyright 2020 Tecnic Software productions (TNSP)
 *
 * Please read file 'COPYING' for information on license and distribution.
 */
#include "th_util.h"
#include "th_regex.h"


#ifdef TH_EXPERIMENTAL_REGEX_DEBUG
int th_dbg_re_flags = 0;
#    define DBG_RE_COMPILE(...) do { if (th_dbg_re_flags & TH_DBG_RE_COMPILE) fprintf(stderr, __VA_ARGS__); } while (0)
#    define DBG_RE_FREE(...)    do { if (th_dbg_re_flags & TH_DBG_RE_FREE) fprintf(stderr, __VA_ARGS__); } while (0)
#    define DBG_RE_MATCH(...)   do { if (th_dbg_re_flags & TH_DBG_RE_MATCH) fprintf(stderr, __VA_ARGS__); } while (0)
#else
#    define DBG_RE_COMPILE(...)
#    define DBG_RE_FREE(...)
#    define DBG_RE_MATCH(...)
#endif


#ifdef TH_EXPERIMENTAL_REGEX

enum
{
    TH_RE_MATCH_ONCE,
    TH_RE_MATCH_COUNT_GREEDY,
    TH_RE_MATCH_COUNT_NONGREEDY,
    TH_RE_MATCH_ANCHOR_START,
    TH_RE_MATCH_ANCHOR_END,
};


enum
{
    TH_RE_TYPE_CHAR,
    TH_RE_TYPE_ANY_CHAR,
    TH_RE_TYPE_LIST,
    TH_RE_TYPE_LIST_REVERSE,
    TH_RE_TYPE_SUBEXPR,
};


#ifdef TH_EXPERIMENTAL_REGEX_DEBUG
static const char *re_match_modes[] =
{
    "ONCE",
    "COUNT GREEDY",
    "COUNT NONGREEDY",
    "ANCHOR_START",
    "ANCHOR_END",
};

static const char *re_match_types[] =
{
    "CHAR",
    "ANY",
    "LIST",
    "LIST_REVERSE",
    "SUBEXPR",
};
#endif


typedef struct
{
    const th_regex_char *pattern;
    size_t offs;

    th_regex_ctx *data;

    size_t nstack, stacksize;
    th_regex_ctx **stack;
} th_regex_parse_ctx;


static void th_regex_node_init(th_regex_node *node)
{
    memset(node, 0, sizeof(th_regex_node));
    node->mode = TH_RE_MATCH_ONCE;
}


static int th_regex_strndup(th_regex_char **pdst,
    const th_regex_char *src, const size_t len)
{
    if (pdst == NULL)
        return THERR_NULLPTR;

    if (UINTPTR_MAX / sizeof(th_regex_char) < len + 1)
        return THERR_BOUNDS;

    if ((*pdst = (th_regex_char *)
        th_malloc((len + 1) * sizeof(th_regex_char))) == NULL)
        return THERR_MALLOC;

    memcpy(*pdst, src, len * sizeof(th_regex_char));
    (*pdst)[len] = 0;

    return THERR_OK;
}


static int th_regex_ctx_get_prev_node(
    th_regex_parse_ctx *ctx, th_regex_node **pnode)
{
    if (ctx->data != NULL && ctx->data->nnodes > 0)
    {
        *pnode = &ctx->data->nodes[ctx->data->nnodes - 1];
        return THERR_OK;
    }
    else
        return THERR_INVALID_DATA;
}


static int th_regex_ctx_push(th_regex_parse_ctx *ctx)
{
    if (ctx->stack == NULL || ctx->nstack + 1 >= ctx->stacksize)
    {
        ctx->stacksize += 16;

        if ((ctx->stack = th_realloc(ctx->stack,
            ctx->stacksize * sizeof(th_regex_node *))) == NULL)
            return THERR_MALLOC;
    }

    ctx->stack[ctx->nstack] = ctx->data;
    ctx->nstack++;
    ctx->data = NULL;

    return THERR_OK;
}


static int th_regex_ctx_pop(th_regex_parse_ctx *ctx, th_regex_ctx **data)
{
    if (ctx->nstack > 0)
    {
        *data = ctx->data;
        ctx->nstack--;
        ctx->data = ctx->stack[ctx->nstack];
        return THERR_OK;
    }
    else
        return THERR_INVALID_DATA;
}


static int th_regex_ctx_node_commit(th_regex_parse_ctx *ctx, th_regex_node *node)
{
    th_regex_ctx *data = ctx->data;

    if (data == NULL)
    {
        if ((data = ctx->data = th_malloc0(sizeof(th_regex_ctx))) == NULL)
            return THERR_MALLOC;
    }

    if (data->nodes == NULL || data->nnodes + 1 >= data->nodessize)
    {
        data->nodessize += 16;
        if ((data->nodes = th_realloc(data->nodes,
            data->nodessize * sizeof(th_regex_node))) == NULL)
            return THERR_MALLOC;
    }

    memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node));
    data->nnodes++;

    DBG_RE_COMPILE(
        "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: "
        "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n",
        data->nnodes, data->nodessize,
        node->mode, node->type,
        node->repeatMin, node->repeatMax);

    return THERR_OK;
}


static BOOL th_regex_find_next(const th_regex_char *str,
    const size_t start, size_t *offs,
    const th_regex_char delim)
{
    for (*offs = start; str[*offs] != 0; (*offs)++)
    {
        if (str[*offs] == delim)
            return TRUE;
    }
    return FALSE;
}


static BOOL th_regex_parse_ssize_t(const th_regex_char *str,
    ssize_t *value)
{
    th_regex_char ch;
    BOOL neg;

    if (*str == '-')
    {
        str++;
        neg = TRUE;
    }
    else
        neg = FALSE;

    // Is the value negative?
    while ((ch = *str++))
    {
        if (ch >= '0' && ch <= '9')
        {
            *value *= 10;
            *value += ch - '0';
        }
        else
            return FALSE;
    }

    if (neg)
        *value = -(*value);

    return TRUE;
}


static void th_regex_list_item_init(th_regex_list_item *item)
{
    memset(item, 0, sizeof(th_regex_list_item));
}


static int th_regex_list_add_item(th_regex_list *list, th_regex_list_item *item)
{
    if (list->items == NULL || list->nitems + 1 >= list->itemssize)
    {
        list->itemssize += 16;

        if ((list->items = th_realloc(list->items,
            list->itemssize * sizeof(th_regex_list_item))) == NULL)
            return THERR_MALLOC;
    }

    memcpy(list->items + list->nitems, item, sizeof(th_regex_list_item));
    list->nitems++;

    return THERR_OK;
}


static void th_regex_list_free(th_regex_list *list)
{
    if (list != NULL)
    {
        for (size_t n = 0; n < list->nitems; n++)
        {
            th_free(list->items[n].chars);
        }
        th_free(list->items);
    }
}


static int th_regex_parse_list(const th_regex_char *str,
    const size_t slen, th_regex_list *list)
{
    th_regex_char *tmp = NULL;
    th_regex_list_item item;
    int res;

    if ((res = th_regex_strndup(&tmp, str, slen)) != THERR_OK)
        goto out;

    // Handle ranges like [A-Z]
    for (size_t offs = 0; offs < slen; offs++)
    {
        th_regex_char
            *prev = (offs > 0) ? tmp + offs - 1 : NULL,
            *curr = tmp + offs,
            *next = (offs + 1 < slen) ? tmp + offs + 1 : NULL;

        if (*curr == '-')
        {
            if (prev != NULL && next != NULL)
            {
                // Range
                th_regex_list_item_init(&item);
                item.type = 1;
                item.start = *prev;
                item.end = *next;

                if (item.start <= item.end)
                {
                    res = THERR_INVALID_DATA;
                    goto out;
                }

                *curr = *prev = *next = 0;

                if ((res = th_regex_list_add_item(list, &item)) != THERR_OK)
                    goto out;
            }
            else
            if (next != NULL)
            {
                res = THERR_INVALID_DATA;
                goto out;
            }
        }
    }

    // Count number of remaining characters
    th_regex_list_item_init(&item);
    item.type = 0;
    item.nchars = 0;

    for (size_t offs = 0; offs < slen; offs++)
    {
        th_regex_char curr = tmp[offs];
        if (curr != 0)
            item.nchars++;
    }

    if (item.nchars > 0)
    {
        if ((item.chars = th_malloc(sizeof(th_regex_char) * item.nchars)) == NULL)
        {
            res = THERR_MALLOC;
            goto out;
        }

        for (size_t offs = 0, n = 0; offs < slen; offs++)
        {
            th_regex_char curr = tmp[offs];
            if (curr != 0)
            {
                item.chars[n] = curr;
                n++;
            }
        }

        if ((res = th_regex_list_add_item(list, &item)) != THERR_OK)
        {
            th_free(item.chars);
            goto out;
        }
    }

out:
    th_free(tmp);
    return res;
}


int th_regex_compile(th_regex_ctx **pexpr, const th_regex_char *pattern)
{
    int res = THERR_OK;
    th_regex_parse_ctx ctx;
    th_regex_node node, *pnode;
    th_regex_char *tmp = NULL;
    size_t start;

    if (pexpr == NULL || pattern == NULL)
    {
        res = THERR_NULLPTR;
        goto exit;
    }

    memset(&ctx, 0, sizeof(ctx));
    ctx.pattern = pattern;

    for (; ctx.pattern[ctx.offs] != 0; ctx.offs++)
    {
        th_regex_char cch = ctx.pattern[ctx.offs];
        DBG_RE_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch);
        switch (cch)
        {
            case '?':
            case '*':
            case '+':
                if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK)
                    goto exit;

                if (cch == '?')
                {
                    // Check if previous was a count
                    pnode->mode = (pnode->mode == TH_RE_MATCH_COUNT_GREEDY) ?
                        TH_RE_MATCH_COUNT_NONGREEDY : TH_RE_MATCH_COUNT_GREEDY;

                    // Previous token is optional (repeat 0-1 times)
                    pnode->repeatMin = 0;
                    pnode->repeatMax = 1;
                }
                else
                {
                    // Check if previous was a count
                    if (pnode->mode == TH_RE_MATCH_COUNT_GREEDY ||
                        pnode->mode == TH_RE_MATCH_COUNT_NONGREEDY)
                    {
                        res = THERR_INVALID_DATA;
                        goto exit;
                    }

                    pnode->mode = TH_RE_MATCH_COUNT_GREEDY;

                    if (cch == '*')
                    {
                        // Previous token can repeat 0 or more times
                        pnode->repeatMin = 0;
                        pnode->repeatMax = -1;
                    }
                    else
                    {
                        // Previous token must repeat 1 or more times
                        pnode->repeatMin = 1;
                        pnode->repeatMax = -1;
                    }
                }
                break;

            case '{':
                // {n} | {min,max}
                start = ctx.offs + 1;
                if (!th_regex_find_next(ctx.pattern, start, &ctx.offs, '}'))
                {
                    // End not found
                    res = THERR_INVALID_DATA;
                    goto exit;
                }

                th_free(tmp);

                if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK ||
                    (res = th_regex_strndup(&tmp, ctx.pattern + start,
                    ctx.offs - start)) != THERR_OK)
                    goto exit;

                pnode->mode = TH_RE_MATCH_COUNT_GREEDY;

                if (th_regex_find_next(tmp, 0, &start, ','))
                {
                    tmp[start] = 0;
                    if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin) ||
                        !th_regex_parse_ssize_t(tmp + start + 1, &pnode->repeatMax))
                    {
                        res = THERR_INVALID_DATA;
                        goto exit;
                    }
                }
                else
                {
                    if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin))
                    {
                        res = THERR_INVALID_DATA;
                        goto exit;
                    }
                    pnode->repeatMax = pnode->repeatMin;
                }

                if (pnode->repeatMin < 0 || pnode->repeatMax < 1 ||
                    pnode->repeatMax < pnode->repeatMin)
                {
                    // Invalid repeat counts
                    res = THERR_INVALID_DATA;
                    goto exit;
                }
                break;

            case '(':
                // Start of subpattern
                if ((res = th_regex_ctx_push(&ctx)) != THERR_OK)
                    goto exit;
                break;

            case ')':
                // End of subpattern
                th_regex_node_init(&node);
                node.type = TH_RE_TYPE_SUBEXPR;

                if ((res = th_regex_ctx_pop(&ctx, &node.match.expr)) != THERR_OK ||
                    (res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK)
                    goto exit;
                break;

            case '^':
                // Start of line anchor
                th_regex_node_init(&node);
                node.mode = TH_RE_MATCH_ANCHOR_START;

                if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK)
                    goto exit;
                break;

            case '$':
                // End of line anchor
                th_regex_node_init(&node);
                node.mode = TH_RE_MATCH_ANCHOR_END;

                if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK)
                    goto exit;
                break;

            case '[':
                // Start of char list
                start = ctx.offs + 1;
                if (!th_regex_find_next(ctx.pattern, start, &ctx.offs, ']') ||
                    ctx.offs == start)
                {
                    res = THERR_INVALID_DATA;
                    goto exit;
                }

                // XXX TODO Parse/support ranges [0-9a-zA-Z_-]

                th_regex_node_init(&node);
                node.type = (ctx.pattern[start] == '^') ?
                    TH_RE_TYPE_LIST_REVERSE : TH_RE_TYPE_LIST;
                node.match.list.nchars = ctx.offs - start;
                if ((res = th_regex_strndup(&node.match.list.chars,
                    ctx.pattern + start, node.match.list.nchars)) != THERR_OK)
                    goto exit;

                if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK)
                    goto exit;
                break;

            case '.':
                // Any single character matches
                th_regex_node_init(&node);
                node.type = TH_RE_TYPE_ANY_CHAR;

                if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK)
                    goto exit;
                break;

            case '\\':
                // Literal escape
                ctx.offs++;
                if (ctx.pattern[ctx.offs] == 0)
                {
                    // End of pattern, error
                    res = THERR_INVALID_DATA;
                    goto exit;
                }

                th_regex_node_init(&node);
                node.type = TH_RE_TYPE_CHAR;
                node.match.chr = ctx.pattern[ctx.offs];

                if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK)
                    goto exit;
                break;

            default:
                // Given character must match
                th_regex_node_init(&node);
                node.type = TH_RE_TYPE_CHAR;
                node.match.chr = ctx.pattern[ctx.offs];

                if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK)
                    goto exit;
                break;
        }
    }

exit:
    *pexpr = ctx.data;
    th_free(tmp);
    return res;
}


void th_regex_free(th_regex_ctx *expr)
{
    if (expr != NULL)
    {
        for (size_t n = 0; n < expr->nnodes; n++)
        {
            th_regex_node *node = &expr->nodes[n];

            DBG_RE_FREE(
                "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: "
                "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n",
                n, expr->nnodes,
                node->mode, node->type,
                node->repeatMin, node->repeatMax);

            switch (node->type)
            {
                case TH_RE_TYPE_SUBEXPR:
                    DBG_RE_FREE("  SUBEXPR: %p vs %p\n",
                        (void *) expr, (void *) node->match.expr);
                    th_regex_free(node->match.expr);
                    break;

                case TH_RE_TYPE_LIST:
                case TH_RE_TYPE_LIST_REVERSE:
                    DBG_RE_FREE("  list='%s'\n", node->match.list.chars);
                    th_free(node->match.list.chars);
                    break;

#ifdef TH_EXPERIMENTAL_REGEX_DEBUG
                case TH_RE_TYPE_ANY_CHAR:
                    DBG_RE_FREE("  any char\n");
                    break;

                case TH_RE_TYPE_CHAR:
                    DBG_RE_FREE("  char='%c'\n", node->match.chr);
                    break;
#endif
            }
        }

        th_free(expr->nodes);
    }
}


static BOOL th_regex_do_match_list(const th_regex_list *list, const th_regex_char cch)
{
    // Could be optimized, perhaps .. sort match.chars, binary search etc?
    for (size_t nitem = 0; nitem < list->nitems; nitem++)
    {
        const th_regex_list_item *item = &list->items[nitem];
        if (item->type == 0)
        {
            for (size_t n = 0; n < item->nchars; n++)
            if (item->chars[n] == cch)
                return TRUE;
        }
        else
        {
            if (cch >= item->start && cch <= item->end)
                return TRUE;
        }
    }

    return FALSE;
}


static BOOL th_regex_do_match_expr(const th_regex_ctx *expr,
    const th_regex_char *haystack, size_t *offs, const int flags);


static BOOL th_regex_do_match_node(const th_regex_char *haystack,
    size_t *offs, const th_regex_node *node, const int flags)
{
    th_regex_char cch;
    BOOL res = FALSE;

    DBG_RE_MATCH("    node_START [%s]: '%s': ",
        re_match_types[node->type], haystack + *offs);

    switch (node->type)
    {
        case TH_RE_TYPE_SUBEXPR:
            DBG_RE_MATCH("subexpr ..\n");
            res = th_regex_do_match_expr(node->match.expr, haystack, offs, flags);
            return res;
            break;

        case TH_RE_TYPE_LIST:
        case TH_RE_TYPE_LIST_REVERSE:
            DBG_RE_MATCH("[%s]", node->match.list.chars);
            if ((cch = haystack[*offs]) == 0)
                goto out;

            // Could be optimized, perhaps .. sort match.chars, binary search etc?
            // XXX TODO Ranges are not supported yet
            for (size_t n = 0; n < node->match.list.nchars; n++)
            if (node->match.list.chars[n] == cch)
            {
                res = TRUE;
                break;
            }

            if (node->type == TH_RE_TYPE_LIST_REVERSE)
                res = !res;

            (*offs)++;
            break;

        case TH_RE_TYPE_ANY_CHAR:
            if ((cch = haystack[*offs]) == 0)
                goto out;

            res = TRUE;
            (*offs)++;
            break;

        case TH_RE_TYPE_CHAR:
            DBG_RE_MATCH("'%c'", node->match.chr);
            if ((cch = haystack[*offs]) == 0)
                goto out;

            res = (cch == node->match.chr);
            (*offs)++;
            break;
    }

out:
    DBG_RE_MATCH(", match=%s\n",
        res ? "YES" : "NO");

    return res;
}


static BOOL th_regex_do_match_expr(const th_regex_ctx *expr,
    const th_regex_char *haystack, size_t *offs, const int flags)
{
    for (size_t n = 0; n < expr->nnodes; n++)
    {
        const th_regex_node *node = &expr->nodes[n];
        size_t soffs;

        DBG_RE_MATCH("  expr [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ",
            n, expr->nnodes, re_match_modes[node->mode]);

        switch (node->mode)
        {
            case TH_RE_MATCH_ONCE:
                {
                    DBG_RE_MATCH("\n");
                    soffs = *offs;
                    if (!th_regex_do_match_node(haystack, &soffs, node, flags))
                        return FALSE;
                    *offs = soffs;
                }
                break;

            case TH_RE_MATCH_COUNT_GREEDY:
            case TH_RE_MATCH_COUNT_NONGREEDY:
                {
                    BOOL done = FALSE;
                    ssize_t count = 0;

                    DBG_RE_MATCH("min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax);

                    do
                    {
                        BOOL match;
                        soffs = *offs;

                        match = th_regex_do_match_node(haystack, &soffs, node, flags);
                        for (size_t qn = n + 1; qn < expr->nnodes && haystack[soffs] != 0; qn++)
                        {
                            const th_regex_node *next = &expr->nodes[qn];
                            do {
                              match = th_regex_do_match_node(haystack, &soffs, next, flags);
                            } while (haystack[soffs] != 0 && !match);
                        }

                        if (match)
                        {
                            // Node matched
                            count++;
                            done = (node->repeatMax > 0 && count >= node->repeatMax);
                            *offs = soffs;
                        }
                        else
                        {
                            // Node did not match, check if we got the minimum if set
                            done = (node->repeatMin >= 0 && count >= node->repeatMin);
                            soffs = *offs;
                            break;
                        }
                    } while (!done);

                    DBG_RE_MATCH("RESULT: %" PRId_SSIZE_T " = %s\n", count, done ? "YES" : "NO");

                    if (!done)
                        return FALSE;

                    *offs = soffs;
                }
                break;

            case TH_RE_MATCH_ANCHOR_START:
                DBG_RE_MATCH("offs=%" PRIu_SIZE_T "\n", *offs);
                if (*offs != 0)
                    return FALSE;
                break;

            case TH_RE_MATCH_ANCHOR_END:
                DBG_RE_MATCH("is end=%d\n", haystack[*offs]);
                if (haystack[*offs] != 0)
                    return FALSE;
                break;
        }
    }

    return TRUE;
}


int th_regex_match(const th_regex_ctx *expr, const th_regex_char *haystack,
    size_t *pnmatches, th_regex_match_node **pmatches, const size_t maxmatches,
    const int flags)
{
    size_t nmatches = 0;

    if (pnmatches != NULL)
        *pnmatches = 0;

    // Check given pattern and string
    if (expr == NULL || haystack == NULL)
        return THERR_NULLPTR;

    // Start matching
    // XXX NOTE .. lots to think about and to take into account:
    // - anchored and unanchored expressions
    // - how to check if the expression has consumed all possibilities?
    // ..
    for (size_t soffs = 0; haystack[soffs] != 0; )
    {
        BOOL matched;
        size_t coffs = soffs;

        DBG_RE_MATCH("\nTRY_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n",
            soffs, haystack + soffs);

        if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags)))
        {
            // A match was found, increase count
            nmatches++;

            // Deliver to caller if required
            if (pnmatches != NULL)
                *pnmatches = nmatches;

            if (pmatches != NULL)
            {
                th_regex_match_node *match = th_malloc0(sizeof(th_regex_match_node));
                if (match == NULL)
                    return THERR_MALLOC;

                match->start = soffs;
                match->len   = coffs - soffs;

                th_llist_append_node((th_llist_t **) pmatches, (th_llist_t *) match);
            }

            // Check match count limit, if set
            if (maxmatches > 0 && nmatches >= maxmatches)
                break;

            // If offset was not advanced, increase by one
            // otherwise use end of match offset as new start
            if (soffs == coffs)
                soffs++;
            else
                soffs = coffs;
        }
        else
        {
            soffs++;
        }
    }

    return THERR_OK;
}


static void th_regex_free_match(th_regex_match_node *node)
{
    (void) node;
    // Nothing to do here at the moment
}


void th_regex_free_matches(th_regex_match_node *matches)
{
    th_llist_free_func_node((th_llist_t *) matches,
        (void (*)(th_llist_t *)) th_regex_free_match);
}

#endif // TH_EXPERIMENTAL_REGEX