Mercurial > hg > th-libs

/*
 * Simple regular expression matching functionality
 * Programmed and designed by Matti 'ccr' Hamalainen
 * (C) Copyright 2020-2022 Tecnic Software productions (TNSP)
 *
 * Please read file 'COPYING' for information on license and distribution.
 */
#include "th_regex.h"


#ifdef TH_EXPERIMENTAL_REGEX_DEBUG
th_ioctx_t *th_dbg_fh = NULL;

#    define DBG_RE_PRINT(...) do { \
        if (th_dbg_fh != NULL) \
        { \
            th_regex_dump_indent(th_dbg_fh, level); \
            thfprintf(th_dbg_fh, __VA_ARGS__); \
        } \
    } while (0)
#else
#    define DBG_RE_PRINT(...)
#endif


/// @cond
enum
{
    TH_RE_MATCH_ONCE,
    TH_RE_MATCH_COUNT,
    TH_RE_MATCH_ANCHOR_START,
    TH_RE_MATCH_ANCHOR_END,
};


enum
{
    TH_RE_TYPE_CHAR,
    TH_RE_TYPE_STR,
    TH_RE_TYPE_ANY_CHAR,
    TH_RE_TYPE_LIST,
    TH_RE_TYPE_LIST_REVERSE,
    TH_RE_TYPE_SUBEXPR,
};


static const char *re_match_modes[] =
{
    "ONCE",
    "COUNT",
    "ANCHOR START",
    "ANCHOR END",
};


static const char *re_match_types[] =
{
    "CHAR",
    "STR",
    "ANY",
    "LIST",
    "LIST REVERSE",
    "SUBEXPR",
};


typedef struct
{
    int type;
    th_char_t start, end;

    size_t nchars;
    th_char_t *chars;
} th_regex_list_item_t;


typedef struct
{
    size_t nitems, itemssize;
    th_regex_list_item_t *items;
} th_regex_list_t;


typedef struct
{
    int mode, type;
    ssize_t repeatMin, repeatMax;

    struct {
        th_char_t chr;
        th_char_t *str;
        th_regex_list_t list;

        th_regex_t *expr;
    } match;
} th_regex_node_t;


typedef struct
{
    const th_char_t *pattern;
    size_t offs;

    th_regex_t *data;

    size_t nstack, stacksize;
    th_regex_t **stack;

    th_char_t *buf;
    size_t bufSize, bufPos;

} th_regex_parse_ctx_t;


struct th_regex_t
{
    size_t nnodes, nodessize;
    th_regex_node_t *nodes;
};

/// @endcond


static void th_regex_node_init(th_regex_node_t *node)
{
    memset(node, 0, sizeof(th_regex_node_t));
    node->mode = TH_RE_MATCH_ONCE;
}


static int th_regex_strndup(th_char_t **pdst,
    const th_char_t *src, const size_t len)
{
    if (pdst == NULL)
        return THERR_NULLPTR;

    if (UINTPTR_MAX / sizeof(th_char_t) < len + 1)
        return THERR_BOUNDS;

    if ((*pdst = (th_char_t *)
        th_malloc((len + 1) * sizeof(th_char_t))) == NULL)
        return THERR_MALLOC;

    memcpy(*pdst, src, len * sizeof(th_char_t));
    (*pdst)[len] = 0;

    return THERR_OK;
}


static int th_regex_parse_ctx_get_prev_node(
    th_regex_parse_ctx_t *ctx, th_regex_node_t **pnode)
{
    if (ctx->data != NULL && ctx->data->nnodes > 0)
    {
        *pnode = &ctx->data->nodes[ctx->data->nnodes - 1];
        return THERR_OK;
    }
    else
        return THERR_INVALID_DATA;
}


static int th_regex_parse_ctx_push(th_regex_parse_ctx_t *ctx)
{
    if (ctx->stack == NULL || ctx->nstack + 1 >= ctx->stacksize)
    {
        ctx->stacksize += 16;

        if ((ctx->stack = th_realloc(ctx->stack,
            ctx->stacksize * sizeof(th_regex_node_t *))) == NULL)
            return THERR_MALLOC;
    }

    ctx->stack[ctx->nstack] = ctx->data;
    ctx->nstack++;
    ctx->data = NULL;

    return THERR_OK;
}


static int th_regex_parse_ctx_pop(th_regex_parse_ctx_t *ctx, th_regex_t **data)
{
    if (ctx->nstack > 0)
    {
        *data = ctx->data;
        ctx->nstack--;
        ctx->data = ctx->stack[ctx->nstack];
        return THERR_OK;
    }
    else
        return THERR_INVALID_DATA;
}


static int th_regex_parse_ctx_node_commit(th_regex_parse_ctx_t *ctx, th_regex_node_t *node)
{
    th_regex_t *data;

    if (ctx->data == NULL &&
        (data = ctx->data = th_malloc0(sizeof(th_regex_t))) == NULL)
        return THERR_MALLOC;
    else
        data = ctx->data;

    if (data->nodes == NULL || data->nnodes + 1 >= data->nodessize)
    {
        data->nodessize += 16;
        if ((data->nodes = th_realloc(data->nodes,
            data->nodessize * sizeof(th_regex_node_t))) == NULL)
            return THERR_MALLOC;
    }

    memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node_t));
    data->nnodes++;

    return THERR_OK;
}


static bool th_regex_find_next(const th_char_t *str,
    const size_t start, size_t *offs,
    const th_char_t delim)
{
    for (*offs = start; str[*offs] != 0; (*offs)++)
    {
        if (str[*offs] == delim)
            return true;
    }
    return false;
}


static bool th_regex_parse_ssize_t(const th_char_t *str,
    ssize_t *value)
{
    th_char_t ch;
    bool neg;

    if (*str == '-')
    {
        str++;
        neg = true;
    }
    else
        neg = false;

    // Is the value negative?
    while ((ch = *str++))
    {
        if (ch >= '0' && ch <= '9')
        {
            *value *= 10;
            *value += ch - '0';
        }
        else
            return false;
    }

    if (neg)
        *value = -(*value);

    return true;
}


static void th_regex_list_item_init(th_regex_list_item_t *item)
{
    memset(item, 0, sizeof(th_regex_list_item_t));
}


static int th_regex_list_add_item(th_regex_list_t *list, th_regex_list_item_t *item)
{
    if (list->items == NULL || list->nitems + 1 >= list->itemssize)
    {
        list->itemssize += 16;

        if ((list->items = th_realloc(list->items,
            list->itemssize * sizeof(th_regex_list_item_t))) == NULL)
            return THERR_MALLOC;
    }

    memcpy(&list->items[list->nitems], item, sizeof(th_regex_list_item_t));
    list->nitems++;

    return THERR_OK;
}


static void th_regex_list_free(th_regex_list_t *list)
{
    if (list != NULL)
    {
        for (size_t n = 0; n < list->nitems; n++)
        {
            th_free(list->items[n].chars);
        }
        th_free(list->items);
    }
}


static int th_regex_parse_list(const th_char_t *str,
    const size_t slen, th_regex_list_t *list)
{
    th_char_t *tmp = NULL;
    th_regex_list_item_t item;
    int res;

    if ((res = th_regex_strndup(&tmp, str, slen)) != THERR_OK)
        goto out;

    // Handle ranges like [A-Z]
    for (size_t offs = 0; offs < slen; offs++)
    {
        th_char_t
            *prev = (offs > 0) ? tmp + offs - 1 : NULL,
            *curr = tmp + offs,
            *next = (offs + 1 < slen) ? tmp + offs + 1 : NULL;

        if (*curr == '-')
        {
            if (prev != NULL && next != NULL)
            {
                // Range
                th_regex_list_item_init(&item);
                item.type = 1;
                item.start = *prev;
                item.end = *next;

                if (item.start >= item.end)
                {
                    res = THERR_INVALID_DATA;
                    goto out;
                }

                *curr = *prev = *next = 0;

                if ((res = th_regex_list_add_item(list, &item)) != THERR_OK)
                    goto out;
            }
            else
            if (next != NULL)
            {
                res = THERR_INVALID_DATA;
                goto out;
            }
        }
    }

    // Count number of remaining characters
    th_regex_list_item_init(&item);
    item.type = 0;
    item.nchars = 0;

    for (size_t offs = 0; offs < slen; offs++)
    {
        th_char_t curr = tmp[offs];
        if (curr != 0)
            item.nchars++;
    }

    if (item.nchars > 0)
    {
        if ((item.chars = th_malloc(sizeof(th_char_t) * item.nchars)) == NULL)
        {
            res = THERR_MALLOC;
            goto out;
        }

        for (size_t offs = 0, n = 0; offs < slen; offs++)
        {
            th_char_t curr = tmp[offs];
            if (curr != 0)
            {
                item.chars[n] = curr;
                n++;
            }
        }

        if ((res = th_regex_list_add_item(list, &item)) != THERR_OK)
        {
            th_free(item.chars);
            goto out;
        }
    }

out:
    th_free(tmp);
    return res;
}


static int th_regex_parse_ctx_node_commit_strchr_do(th_regex_parse_ctx_t *ctx,
    const th_char_t *buf, const size_t bufLen)
{
    th_regex_node_t node;
    th_regex_node_init(&node);

    if (bufLen > 1)
    {
        int res;
        node.type = TH_RE_TYPE_STR;
        if ((res = th_regex_strndup(&node.match.str, buf, bufLen)) != THERR_OK)
            return res;
    }
    else
    {
        node.type = TH_RE_TYPE_CHAR;
        node.match.chr = buf[0];
    }

    return th_regex_parse_ctx_node_commit(ctx, &node);
}


static int th_regex_parse_ctx_node_commit_strchr(th_regex_parse_ctx_t *ctx, const bool split)
{
    int res = THERR_OK;;

    if (ctx->bufPos > 0)
    {
        if (ctx->bufPos > 1 && split)
        {
            if ((res = th_regex_parse_ctx_node_commit_strchr_do(ctx, ctx->buf, ctx->bufPos - 1)) != THERR_OK)
                return res;

            res = th_regex_parse_ctx_node_commit_strchr_do(ctx, ctx->buf + ctx->bufPos - 1, 1);
        }
        else
        {
            res = th_regex_parse_ctx_node_commit_strchr_do(ctx, ctx->buf, ctx->bufPos);
        }
        ctx->bufPos = 0;
    }

    return res;
}


/**
 * Parse given regular expression @p pattern string into compiled/tokenized
 * form as @c th_regex_t structures. Returns @c THERR_OK if successful,
 * or other @c THERR_* return value if not. In either case, the @p pexpr
 * may have been allocated and must be freed via th_regex_free().
 * @param[in,out] pexpr pointer to a pointer of @c th_regex_t structures to be
 * @param[in] pattern regular expression pattern string
 * @returns @c THERR_* return value indicating success or failure
 */
int th_regex_compile(th_regex_t **pexpr, const th_char_t *pattern)
{
    int res = THERR_OK;
    th_regex_parse_ctx_t ctx;
    th_regex_node_t node, *pnode;
    th_char_t *tmp = NULL;
    size_t start;

    memset(&ctx, 0, sizeof(ctx));

    // Check pointers
    if (pexpr == NULL || pattern == NULL)
    {
        res = THERR_NULLPTR;
        goto out;
    }

    // Initialize parsing context
    ctx.pattern = pattern;
    ctx.bufSize = 256;

    if ((ctx.buf = th_malloc(ctx.bufSize * sizeof(th_char_t))) == NULL)
    {
        res = THERR_MALLOC;
        goto out;
    }

    // Start parsing the pattern
    for (; ctx.pattern[ctx.offs] != 0; ctx.offs++)
    {
        th_char_t cch = ctx.pattern[ctx.offs];

        switch (cch)
        {
            case '?':
            case '*':
            case '+':
                if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, true)) != THERR_OK)
                    goto out;

                if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK)
                    goto out;

                if (cch == '?')
                {
                    // Previous token is optional (repeat 0-1 times) (non-greedy matching)
                    pnode->mode = TH_RE_MATCH_COUNT;
                    pnode->repeatMin = 0;
                    pnode->repeatMax = 1;
                }
                else
                {
                    // Check if previous was a count ("**", "*+", etc.)
                    if (pnode->mode == TH_RE_MATCH_COUNT)
                    {
                        res = THERR_INVALID_DATA;
                        goto out;
                    }

                    pnode->mode = TH_RE_MATCH_COUNT;

                    if (cch == '*')
                    {
                        // Previous token can repeat 0 or more times
                        pnode->repeatMin = 0;
                        pnode->repeatMax = -1;
                    }
                    else
                    {
                        // Previous token must repeat 1 or more times
                        pnode->repeatMin = 1;
                        pnode->repeatMax = -1;
                    }
                }
                break;

            case '{':
                if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, true)) != THERR_OK)
                    goto out;

                // {n} | {min,max}
                start = ctx.offs + 1;
                if (!th_regex_find_next(ctx.pattern, start, &ctx.offs, '}'))
                {
                    // End not found
                    res = THERR_INVALID_DATA;
                    goto out;
                }

                th_free(tmp);

                if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK ||
                    (res = th_regex_strndup(&tmp, ctx.pattern + start,
                    ctx.offs - start)) != THERR_OK)
                    goto out;

                pnode->mode = TH_RE_MATCH_COUNT;

                if (th_regex_find_next(tmp, 0, &start, ','))
                {
                    tmp[start] = 0;
                    if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin) ||
                        !th_regex_parse_ssize_t(tmp + start + 1, &pnode->repeatMax))
                    {
                        res = THERR_INVALID_DATA;
                        goto out;
                    }
                }
                else
                {
                    if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin))
                    {
                        res = THERR_INVALID_DATA;
                        goto out;
                    }
                    pnode->repeatMax = pnode->repeatMin;
                }

                if (pnode->repeatMin < 0 || pnode->repeatMax < 1 ||
                    pnode->repeatMax < pnode->repeatMin)
                {
                    // Invalid repeat counts
                    res = THERR_INVALID_DATA;
                    goto out;
                }
                break;

/*
            case '|':
                if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, false)) != THERR_OK)
                    goto out;

                // Alt pattern .. how to handle these?
                break;
*/

            case '(':
                if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, false)) != THERR_OK)
                    goto out;

                // Start of subpattern
                if ((res = th_regex_parse_ctx_push(&ctx)) != THERR_OK)
                    goto out;
                break;

            case ')':
                if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, false)) != THERR_OK)
                    goto out;

                // End of subpattern
                th_regex_node_init(&node);
                node.type = TH_RE_TYPE_SUBEXPR;

                if ((res = th_regex_parse_ctx_pop(&ctx, &node.match.expr)) != THERR_OK ||
                    (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK)
                    goto out;
                break;

            case '^':
                if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, false)) != THERR_OK)
                    goto out;

                // Start of line anchor
                th_regex_node_init(&node);
                node.mode = TH_RE_MATCH_ANCHOR_START;

                if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK)
                    goto out;
                break;

            case '$':
                if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, false)) != THERR_OK)
                    goto out;

                // End of line anchor
                th_regex_node_init(&node);
                node.mode = TH_RE_MATCH_ANCHOR_END;

                if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK)
                    goto out;
                break;

            case '[':
                if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, false)) != THERR_OK)
                    goto out;

                // Start of char list
                start = ctx.offs + 1;
                if (!th_regex_find_next(ctx.pattern, start, &ctx.offs, ']') ||
                    ctx.offs == start)
                {
                    res = THERR_INVALID_DATA;
                    goto out;
                }

                th_regex_node_init(&node);
                if (ctx.pattern[start] == '^')
                {
                    node.type = TH_RE_TYPE_LIST_REVERSE;
                    start++;
                }
                else
                    node.type = TH_RE_TYPE_LIST;

                if ((res = th_regex_parse_list(ctx.pattern + start,
                    ctx.offs - start, &node.match.list)) != THERR_OK ||
                    (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK)
                    goto out;
                break;

            case '.':
                if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, false)) != THERR_OK)
                    goto out;

                // Any single character matches
                th_regex_node_init(&node);
                node.type = TH_RE_TYPE_ANY_CHAR;

                if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK)
                    goto out;
                break;

            case '\\':
                // Literal escape
                ctx.offs++;
                if (ctx.pattern[ctx.offs] == 0)
                {
                    // End of pattern, error
                    res = THERR_INVALID_DATA;
                    goto out;
                }
                // fall-through

            default:
                // Given character must match
                if (ctx.bufPos < ctx.bufSize)
                    ctx.buf[ctx.bufPos++] = ctx.pattern[ctx.offs];
                else
                if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, false)) != THERR_OK)
                    goto out;
                break;
        }
    }

    // Commit last string/char if any
    if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, false)) != THERR_OK)
        goto out;

    // Create root node
    th_regex_node_init(&node);
    node.type = TH_RE_TYPE_SUBEXPR;
    node.match.expr = ctx.data;
    ctx.data = NULL;

    if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK)
        goto out;

out:
    *pexpr = ctx.data;

    // Free parse context
    for (size_t n = 0; n < ctx.nstack; n++)
    {
        if (ctx.stack[n] != ctx.data)
        {
            th_regex_free(ctx.stack[n]);
            th_free(ctx.stack[n]);
        }
    }
    th_free(ctx.stack);

    th_free(tmp);
    th_free(ctx.buf);
    return res;
}


/**
 * Deallocate the given regular expression structure @p expr.
 * All associated data will be freed, though pointers may not
 * be NULLed.
 *
 * @param[in] expr structure to be deallocated
 */
void th_regex_free(th_regex_t *expr)
{
    if (expr != NULL)
    {
        for (size_t nnode = 0; nnode < expr->nnodes; nnode++)
        {
            th_regex_node_t *node = &expr->nodes[nnode];

            th_free(node->match.str);
            th_regex_free(node->match.expr);
            th_regex_list_free(&node->match.list);
        }

        th_free(expr->nodes);
        th_free(expr);
    }
}


static void th_regex_dump_indent(th_ioctx_t *fh, const int level)
{
    for (int indent = 0; indent < level; indent++)
        thfputs("    ", fh);
}


static void th_regex_dump_node(th_ioctx_t *fh, const th_regex_node_t *node)
{
    thfprintf(fh,
        "%s %s ",
        re_match_modes[node->mode],
        re_match_types[node->type]);

    if (node->mode == TH_RE_MATCH_COUNT)
    {
        thfprintf(fh, "min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T " : ",
            node->repeatMin, node->repeatMax);
    }

    switch (node->type)
    {
        case TH_RE_TYPE_CHAR:
            thfprintf(fh, "'%c'", node->match.chr);
            break;

        case TH_RE_TYPE_STR:
            thfprintf(fh, "\"%s\"", node->match.str);
            break;

        case TH_RE_TYPE_ANY_CHAR:
            thfprintf(fh, ".");
            break;

        case TH_RE_TYPE_LIST:
        case TH_RE_TYPE_LIST_REVERSE:
            thfputs("[ ", fh);
            for (size_t n = 0; n < node->match.list.nitems; n++)
            {
                const th_regex_list_item_t *li = &node->match.list.items[n];
                if (li->type)
                {
                    thfprintf(fh, "'%c-%c' ", li->start, li->end);
                }
                else
                {
                    for (size_t i = 0; i < li->nchars; i++)
                        thfprintf(fh, "'%c' ", li->chars[i]);
                }
            }
            thfputs("]", fh);
            break;
    }
}


/**
 * Print out the contents of given regular expression structure @p expr
 * in "human-readable" format to specified @c th_ioctx_t context. Typically
 * useful for debugging purposes only.
 *
 * @param[in,out] fh th_ioctx.handle to be used for output, must be writable.
 * @param[in] level starting whitespace indentation level
 * @param[in] expr regular expression structure to be "dumped"
 */
void th_regex_dump(th_ioctx_t *fh, const int level, const th_regex_t *expr)
{
    if (expr != NULL)
    {
        for (size_t nnode = 0; nnode < expr->nnodes; nnode++)
        {
            th_regex_node_t *node = &expr->nodes[nnode];

            th_regex_dump_indent(fh, level);

            thfprintf(fh,
                "[%" PRIu_SIZE_T "/%" PRIu_SIZE_T "] ",
                nnode + 1, expr->nnodes);

            th_regex_dump_node(fh, node);
            thfputs("\n", fh);

            if (node->type == TH_RE_TYPE_SUBEXPR)
                th_regex_dump(fh, level + 1, node->match.expr);
        }
    }
}


static bool th_regex_match_list(const th_regex_list_t *list, const th_char_t cch)
{
    // Could be optimized, perhaps .. sort match.chars, binary search etc?
    for (size_t nitem = 0; nitem < list->nitems; nitem++)
    {
        const th_regex_list_item_t *item = &list->items[nitem];

        if (item->type == 0)
        {
            for (size_t n = 0; n < item->nchars; n++)
            {
                if (item->chars[n] == cch)
                    return true;
            }
        }
        else
        {
            if (cch >= item->start && cch <= item->end)
                return true;
        }
    }

    return false;
}


static bool th_regex_match_expr(
    const th_char_t *haystack,
    size_t *offs,
    const th_regex_t *expr,
    const size_t startnode,
    const int flags,
    const int level
    );


static bool th_regex_match_one(
    const th_char_t *haystack,
    size_t *offs,
    const th_regex_node_t *node,
    const int flags,
    const int level
    )
{
    th_char_t cch;
    bool res = false;

    switch (node->type)
    {
        case TH_RE_TYPE_SUBEXPR:
            res = th_regex_match_expr(haystack, offs, node->match.expr, 0, flags, level + 1);
            break;

        case TH_RE_TYPE_LIST:
        case TH_RE_TYPE_LIST_REVERSE:
            if ((cch = haystack[*offs]) == 0)
                res = false;
            else
            {
                res = th_regex_match_list(&node->match.list, cch);

                if (node->type == TH_RE_TYPE_LIST_REVERSE)
                    res = !res;

                (*offs)++;
            }
            break;

        case TH_RE_TYPE_ANY_CHAR:
            if ((cch = haystack[*offs]) == 0)
                res = false;
            else
            {
                res = true;
                (*offs)++;
            }
            break;

        case TH_RE_TYPE_CHAR:
            if ((cch = haystack[*offs]) == 0)
                res = false;
            else
            {
                res = (cch == node->match.chr);
                (*offs)++;
            }
            break;

        case TH_RE_TYPE_STR:
            res = true;
            for (th_char_t *str = node->match.str;
                res && *str != 0;
                str++, (*offs)++)
            {
                if (haystack[*offs] != *str)
                    res = false;
            }
            break;
    }

    return res;
}


static bool th_regex_match_count(
    const th_char_t *haystack,
    size_t *offs,
    const th_regex_t *expr,
    const th_regex_node_t *node,
    size_t *nnode,
    const int flags,
    const int level
    )
{
    size_t toffs = *offs, last_offs = *offs;
    ssize_t count = 0;

    do
    {
        // Attempt to match the repeated node once
        size_t poffs = toffs;

        if (th_regex_match_one(haystack, &poffs, node, flags, level))
        {
            // Matched, increase count of repeats
            count++;
            //DBG_RE_PRINT("#%" PRId_SSIZE_T "\n", count);

            // poffs should now be at position + 1 from match
        }
        else
        {
            // Did not match, get out if repeatMin > 0
            if (node->repeatMin > 0)
                break;
        }

        // Attempt to match rest of the expression
        size_t qoffs1 = poffs, qoffs2 = toffs;
        DBG_RE_PRINT("try rest '%s' :: '%s'\n", haystack + qoffs1, haystack + qoffs2);
        if (th_regex_match_expr(haystack, &qoffs1, expr, *nnode + 1, flags, level + 1))
        {
            // Matched
            toffs = last_offs = qoffs1;

            DBG_RE_PRINT("  yes1: count=%" PRId_SSIZE_T " [%" PRId_SSIZE_T " .. %" PRId_SSIZE_T "]\n", count, node->repeatMin, node->repeatMax);

            // Check min repeats and if we are "not greedy".
            if (count >= node->repeatMin && node->repeatMax == 1)
                break;

            // Check max repeats
            if (node->repeatMax > 0 && count >= node->repeatMax)
                break;
        }
        else
        if (node->repeatMin == 0 &&
            th_regex_match_expr(haystack, &qoffs2, expr, *nnode + 1, flags, level + 1))
        {
            // Matched
            toffs = last_offs = qoffs2;

            DBG_RE_PRINT("  yes2: count=%" PRId_SSIZE_T " [%" PRId_SSIZE_T " .. %" PRId_SSIZE_T "]\n", count, node->repeatMin, node->repeatMax);

            // Check min repeats and if we are "not greedy".
            if (count >= node->repeatMin && node->repeatMax == 1)
                break;

            // Check max repeats
            if (node->repeatMax > 0 && count >= node->repeatMax)
                break;

        }
        else
        {
            // Rest of expression did not match, try again
            DBG_RE_PRINT("  no\n");
            toffs = poffs;
        }


    } while (haystack[toffs] != 0);

    // Check results
    bool res = count >= node->repeatMin ||
        (node->repeatMax > 0 && count >= node->repeatMax);

    if (res)
    {
        *offs = last_offs;
        *nnode = expr->nnodes;
    }

    DBG_RE_PRINT("RESULT: %s : offs=%" PRIu_SIZE_T "='%s'\n",
        res ? "YES" : "NO",
        *offs, haystack + *offs);

    return res;
}


static bool th_regex_match_expr(
    const th_char_t *haystack,
    size_t *offs,
    const th_regex_t *expr,
    const size_t startnode,
    const int flags,
    const int level
    )
{
    bool res = true;
    size_t soffs = *offs;

    for (size_t nnode = startnode; res && nnode < expr->nnodes; nnode++)
    {
        const th_regex_node_t *node = &expr->nodes[nnode];

#ifdef TH_EXPERIMENTAL_REGEX_DEBUG
        if (th_dbg_fh != NULL)
        {
            th_regex_dump_indent(th_dbg_fh, level);

            thfprintf(th_dbg_fh,
                "[%" PRIu_SIZE_T "/%" PRIu_SIZE_T "] ",
                nnode + 1, expr->nnodes);

            th_regex_dump_node(th_dbg_fh, node);

            thfprintf(th_dbg_fh, " <-> \"%s\"\n",
                haystack + soffs);
        }
#endif

        switch (node->mode)
        {
            case TH_RE_MATCH_ONCE:
                res = th_regex_match_one(haystack, &soffs, node, flags, level);
                break;

            case TH_RE_MATCH_COUNT:
                res = th_regex_match_count(haystack, &soffs, expr, node, &nnode, flags, level);
                break;

            case TH_RE_MATCH_ANCHOR_START:
                res = (soffs == 0);
                break;

            case TH_RE_MATCH_ANCHOR_END:
                res = (haystack[soffs] == 0);
                break;
        }
    }

    if (res)
        *offs = soffs;

    return res;
}


/**
 * Match the specified string @p haystack against specified compiled
 * regular expression @p expr and return results in optional variables
 * @p pnmatches for number of matches and/or @p pmatches @c th_regex_match_t
 * structures for matching sequences information. If @p pmatches is used,
 * the resulting linked list should be eventually freed via th_regex_free_matches().
 *
 * @param[in] expr regular expression structure to be matched
 * @param[in] haystack string to be matched against
 * @param[out] pnmatches pointer to variable to be set to number of found matches, or @c NULL if the information is not desired
 * @param[out] pmatches pointer to a pointer of @c th_regex_match_t structures, or @c NULL if the information is not desired
 * @param[in] maxmatches maximum number of matches until bailing out, or @c 0 if no limit
 * @param[in] flags additional flags, see @c TH_REF_*
 */
int th_regex_match(const th_regex_t *expr, const th_char_t *haystack,
    size_t *pnmatches, th_regex_match_t **pmatches, const size_t maxmatches,
    const int flags)
{
    size_t nmatches = 0;
    int level = 0;
    (void) flags;

    if (pnmatches != NULL)
        *pnmatches = 0;

    // Check given pattern and string
    if (expr == NULL || haystack == NULL)
        return THERR_NULLPTR;

    // Start matching
    // XXX NOTE .. lots to think about and to take into account:
    // - anchored and unanchored expressions
    // - how to check if the expression has consumed all possibilities?
    // ..
    for (size_t soffs = 0; haystack[soffs] != 0; )
    {
        size_t coffs = soffs;

        if (th_regex_match_expr(haystack, &coffs, expr, 0, flags, level))
        {
            // A match was found, increase count
            nmatches++;

            // Deliver to caller if required
            if (pnmatches != NULL)
                *pnmatches = nmatches;

            if (pmatches != NULL)
            {
                // Add the match region to the list
                th_regex_match_t *match = th_malloc0(sizeof(th_regex_match_t));
                if (match == NULL)
                    return THERR_MALLOC;

                match->type  = TH_RE_MATCH_EXPR;
                match->start = soffs;
                match->len   = coffs - soffs;

                th_llist_append_node((th_llist_t **) pmatches, (th_llist_t *) match);
            }

            // Check match count limit, if set
            if (maxmatches > 0 && nmatches >= maxmatches)
                break;

            // If offset was not advanced, increase by one
            // otherwise use end of match offset as new start
            if (soffs == coffs)
                soffs++;
            else
                soffs = coffs;
        }
        else
        {
            soffs++;
        }
    }

    return THERR_OK;
}


static void th_regex_free_match(th_regex_match_t *node)
{
    (void) node;
    // Nothing to do here at the moment
}


/**
 * Deallocate the given set of @c th_regex_match_t
 * linked list structures pointed by @p matches.
 * All associated data will be freed.
 *
 * @param[in] matches structure to be deallocated
 */
void th_regex_free_matches(th_regex_match_t *matches)
{
    th_llist_free_func_data((th_llist_t *) matches,
        (void (*)(void *)) th_regex_free_match);
}
author	Matti Hamalainen <ccr@tnsp.org>
date	Fri, 08 Mar 2024 15:26:24 +0200
parents	c17eadc60c3d
children