Mercurial > hg > th-libs
view th_regex.c @ 789:d61d3eb29053 default tip
Bump copyright.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Fri, 08 Mar 2024 15:26:24 +0200 |
parents | c17eadc60c3d |
children |
line wrap: on
line source
/* * Simple regular expression matching functionality * Programmed and designed by Matti 'ccr' Hamalainen * (C) Copyright 2020-2022 Tecnic Software productions (TNSP) * * Please read file 'COPYING' for information on license and distribution. */ #include "th_regex.h" #ifdef TH_EXPERIMENTAL_REGEX_DEBUG th_ioctx_t *th_dbg_fh = NULL; # define DBG_RE_PRINT(...) do { \ if (th_dbg_fh != NULL) \ { \ th_regex_dump_indent(th_dbg_fh, level); \ thfprintf(th_dbg_fh, __VA_ARGS__); \ } \ } while (0) #else # define DBG_RE_PRINT(...) #endif /// @cond enum { TH_RE_MATCH_ONCE, TH_RE_MATCH_COUNT, TH_RE_MATCH_ANCHOR_START, TH_RE_MATCH_ANCHOR_END, }; enum { TH_RE_TYPE_CHAR, TH_RE_TYPE_STR, TH_RE_TYPE_ANY_CHAR, TH_RE_TYPE_LIST, TH_RE_TYPE_LIST_REVERSE, TH_RE_TYPE_SUBEXPR, }; static const char *re_match_modes[] = { "ONCE", "COUNT", "ANCHOR START", "ANCHOR END", }; static const char *re_match_types[] = { "CHAR", "STR", "ANY", "LIST", "LIST REVERSE", "SUBEXPR", }; typedef struct { int type; th_char_t start, end; size_t nchars; th_char_t *chars; } th_regex_list_item_t; typedef struct { size_t nitems, itemssize; th_regex_list_item_t *items; } th_regex_list_t; typedef struct { int mode, type; ssize_t repeatMin, repeatMax; struct { th_char_t chr; th_char_t *str; th_regex_list_t list; th_regex_t *expr; } match; } th_regex_node_t; typedef struct { const th_char_t *pattern; size_t offs; th_regex_t *data; size_t nstack, stacksize; th_regex_t **stack; th_char_t *buf; size_t bufSize, bufPos; } th_regex_parse_ctx_t; struct th_regex_t { size_t nnodes, nodessize; th_regex_node_t *nodes; }; /// @endcond static void th_regex_node_init(th_regex_node_t *node) { memset(node, 0, sizeof(th_regex_node_t)); node->mode = TH_RE_MATCH_ONCE; } static int th_regex_strndup(th_char_t **pdst, const th_char_t *src, const size_t len) { if (pdst == NULL) return THERR_NULLPTR; if (UINTPTR_MAX / sizeof(th_char_t) < len + 1) return THERR_BOUNDS; if ((*pdst = (th_char_t *) th_malloc((len + 1) * sizeof(th_char_t))) == NULL) return THERR_MALLOC; memcpy(*pdst, src, len * sizeof(th_char_t)); (*pdst)[len] = 0; return THERR_OK; } static int th_regex_parse_ctx_get_prev_node( th_regex_parse_ctx_t *ctx, th_regex_node_t **pnode) { if (ctx->data != NULL && ctx->data->nnodes > 0) { *pnode = &ctx->data->nodes[ctx->data->nnodes - 1]; return THERR_OK; } else return THERR_INVALID_DATA; } static int th_regex_parse_ctx_push(th_regex_parse_ctx_t *ctx) { if (ctx->stack == NULL || ctx->nstack + 1 >= ctx->stacksize) { ctx->stacksize += 16; if ((ctx->stack = th_realloc(ctx->stack, ctx->stacksize * sizeof(th_regex_node_t *))) == NULL) return THERR_MALLOC; } ctx->stack[ctx->nstack] = ctx->data; ctx->nstack++; ctx->data = NULL; return THERR_OK; } static int th_regex_parse_ctx_pop(th_regex_parse_ctx_t *ctx, th_regex_t **data) { if (ctx->nstack > 0) { *data = ctx->data; ctx->nstack--; ctx->data = ctx->stack[ctx->nstack]; return THERR_OK; } else return THERR_INVALID_DATA; } static int th_regex_parse_ctx_node_commit(th_regex_parse_ctx_t *ctx, th_regex_node_t *node) { th_regex_t *data; if (ctx->data == NULL && (data = ctx->data = th_malloc0(sizeof(th_regex_t))) == NULL) return THERR_MALLOC; else data = ctx->data; if (data->nodes == NULL || data->nnodes + 1 >= data->nodessize) { data->nodessize += 16; if ((data->nodes = th_realloc(data->nodes, data->nodessize * sizeof(th_regex_node_t))) == NULL) return THERR_MALLOC; } memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node_t)); data->nnodes++; return THERR_OK; } static bool th_regex_find_next(const th_char_t *str, const size_t start, size_t *offs, const th_char_t delim) { for (*offs = start; str[*offs] != 0; (*offs)++) { if (str[*offs] == delim) return true; } return false; } static bool th_regex_parse_ssize_t(const th_char_t *str, ssize_t *value) { th_char_t ch; bool neg; if (*str == '-') { str++; neg = true; } else neg = false; // Is the value negative? while ((ch = *str++)) { if (ch >= '0' && ch <= '9') { *value *= 10; *value += ch - '0'; } else return false; } if (neg) *value = -(*value); return true; } static void th_regex_list_item_init(th_regex_list_item_t *item) { memset(item, 0, sizeof(th_regex_list_item_t)); } static int th_regex_list_add_item(th_regex_list_t *list, th_regex_list_item_t *item) { if (list->items == NULL || list->nitems + 1 >= list->itemssize) { list->itemssize += 16; if ((list->items = th_realloc(list->items, list->itemssize * sizeof(th_regex_list_item_t))) == NULL) return THERR_MALLOC; } memcpy(&list->items[list->nitems], item, sizeof(th_regex_list_item_t)); list->nitems++; return THERR_OK; } static void th_regex_list_free(th_regex_list_t *list) { if (list != NULL) { for (size_t n = 0; n < list->nitems; n++) { th_free(list->items[n].chars); } th_free(list->items); } } static int th_regex_parse_list(const th_char_t *str, const size_t slen, th_regex_list_t *list) { th_char_t *tmp = NULL; th_regex_list_item_t item; int res; if ((res = th_regex_strndup(&tmp, str, slen)) != THERR_OK) goto out; // Handle ranges like [A-Z] for (size_t offs = 0; offs < slen; offs++) { th_char_t *prev = (offs > 0) ? tmp + offs - 1 : NULL, *curr = tmp + offs, *next = (offs + 1 < slen) ? tmp + offs + 1 : NULL; if (*curr == '-') { if (prev != NULL && next != NULL) { // Range th_regex_list_item_init(&item); item.type = 1; item.start = *prev; item.end = *next; if (item.start >= item.end) { res = THERR_INVALID_DATA; goto out; } *curr = *prev = *next = 0; if ((res = th_regex_list_add_item(list, &item)) != THERR_OK) goto out; } else if (next != NULL) { res = THERR_INVALID_DATA; goto out; } } } // Count number of remaining characters th_regex_list_item_init(&item); item.type = 0; item.nchars = 0; for (size_t offs = 0; offs < slen; offs++) { th_char_t curr = tmp[offs]; if (curr != 0) item.nchars++; } if (item.nchars > 0) { if ((item.chars = th_malloc(sizeof(th_char_t) * item.nchars)) == NULL) { res = THERR_MALLOC; goto out; } for (size_t offs = 0, n = 0; offs < slen; offs++) { th_char_t curr = tmp[offs]; if (curr != 0) { item.chars[n] = curr; n++; } } if ((res = th_regex_list_add_item(list, &item)) != THERR_OK) { th_free(item.chars); goto out; } } out: th_free(tmp); return res; } static int th_regex_parse_ctx_node_commit_strchr_do(th_regex_parse_ctx_t *ctx, const th_char_t *buf, const size_t bufLen) { th_regex_node_t node; th_regex_node_init(&node); if (bufLen > 1) { int res; node.type = TH_RE_TYPE_STR; if ((res = th_regex_strndup(&node.match.str, buf, bufLen)) != THERR_OK) return res; } else { node.type = TH_RE_TYPE_CHAR; node.match.chr = buf[0]; } return th_regex_parse_ctx_node_commit(ctx, &node); } static int th_regex_parse_ctx_node_commit_strchr(th_regex_parse_ctx_t *ctx, const bool split) { int res = THERR_OK;; if (ctx->bufPos > 0) { if (ctx->bufPos > 1 && split) { if ((res = th_regex_parse_ctx_node_commit_strchr_do(ctx, ctx->buf, ctx->bufPos - 1)) != THERR_OK) return res; res = th_regex_parse_ctx_node_commit_strchr_do(ctx, ctx->buf + ctx->bufPos - 1, 1); } else { res = th_regex_parse_ctx_node_commit_strchr_do(ctx, ctx->buf, ctx->bufPos); } ctx->bufPos = 0; } return res; } /** * Parse given regular expression @p pattern string into compiled/tokenized * form as @c th_regex_t structures. Returns @c THERR_OK if successful, * or other @c THERR_* return value if not. In either case, the @p pexpr * may have been allocated and must be freed via th_regex_free(). * @param[in,out] pexpr pointer to a pointer of @c th_regex_t structures to be * @param[in] pattern regular expression pattern string * @returns @c THERR_* return value indicating success or failure */ int th_regex_compile(th_regex_t **pexpr, const th_char_t *pattern) { int res = THERR_OK; th_regex_parse_ctx_t ctx; th_regex_node_t node, *pnode; th_char_t *tmp = NULL; size_t start; memset(&ctx, 0, sizeof(ctx)); // Check pointers if (pexpr == NULL || pattern == NULL) { res = THERR_NULLPTR; goto out; } // Initialize parsing context ctx.pattern = pattern; ctx.bufSize = 256; if ((ctx.buf = th_malloc(ctx.bufSize * sizeof(th_char_t))) == NULL) { res = THERR_MALLOC; goto out; } // Start parsing the pattern for (; ctx.pattern[ctx.offs] != 0; ctx.offs++) { th_char_t cch = ctx.pattern[ctx.offs]; switch (cch) { case '?': case '*': case '+': if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, true)) != THERR_OK) goto out; if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) goto out; if (cch == '?') { // Previous token is optional (repeat 0-1 times) (non-greedy matching) pnode->mode = TH_RE_MATCH_COUNT; pnode->repeatMin = 0; pnode->repeatMax = 1; } else { // Check if previous was a count ("**", "*+", etc.) if (pnode->mode == TH_RE_MATCH_COUNT) { res = THERR_INVALID_DATA; goto out; } pnode->mode = TH_RE_MATCH_COUNT; if (cch == '*') { // Previous token can repeat 0 or more times pnode->repeatMin = 0; pnode->repeatMax = -1; } else { // Previous token must repeat 1 or more times pnode->repeatMin = 1; pnode->repeatMax = -1; } } break; case '{': if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, true)) != THERR_OK) goto out; // {n} | {min,max} start = ctx.offs + 1; if (!th_regex_find_next(ctx.pattern, start, &ctx.offs, '}')) { // End not found res = THERR_INVALID_DATA; goto out; } th_free(tmp); if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK || (res = th_regex_strndup(&tmp, ctx.pattern + start, ctx.offs - start)) != THERR_OK) goto out; pnode->mode = TH_RE_MATCH_COUNT; if (th_regex_find_next(tmp, 0, &start, ',')) { tmp[start] = 0; if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin) || !th_regex_parse_ssize_t(tmp + start + 1, &pnode->repeatMax)) { res = THERR_INVALID_DATA; goto out; } } else { if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin)) { res = THERR_INVALID_DATA; goto out; } pnode->repeatMax = pnode->repeatMin; } if (pnode->repeatMin < 0 || pnode->repeatMax < 1 || pnode->repeatMax < pnode->repeatMin) { // Invalid repeat counts res = THERR_INVALID_DATA; goto out; } break; /* case '|': if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, false)) != THERR_OK) goto out; // Alt pattern .. how to handle these? break; */ case '(': if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, false)) != THERR_OK) goto out; // Start of subpattern if ((res = th_regex_parse_ctx_push(&ctx)) != THERR_OK) goto out; break; case ')': if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, false)) != THERR_OK) goto out; // End of subpattern th_regex_node_init(&node); node.type = TH_RE_TYPE_SUBEXPR; if ((res = th_regex_parse_ctx_pop(&ctx, &node.match.expr)) != THERR_OK || (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) goto out; break; case '^': if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, false)) != THERR_OK) goto out; // Start of line anchor th_regex_node_init(&node); node.mode = TH_RE_MATCH_ANCHOR_START; if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) goto out; break; case '$': if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, false)) != THERR_OK) goto out; // End of line anchor th_regex_node_init(&node); node.mode = TH_RE_MATCH_ANCHOR_END; if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) goto out; break; case '[': if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, false)) != THERR_OK) goto out; // Start of char list start = ctx.offs + 1; if (!th_regex_find_next(ctx.pattern, start, &ctx.offs, ']') || ctx.offs == start) { res = THERR_INVALID_DATA; goto out; } th_regex_node_init(&node); if (ctx.pattern[start] == '^') { node.type = TH_RE_TYPE_LIST_REVERSE; start++; } else node.type = TH_RE_TYPE_LIST; if ((res = th_regex_parse_list(ctx.pattern + start, ctx.offs - start, &node.match.list)) != THERR_OK || (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) goto out; break; case '.': if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, false)) != THERR_OK) goto out; // Any single character matches th_regex_node_init(&node); node.type = TH_RE_TYPE_ANY_CHAR; if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) goto out; break; case '\\': // Literal escape ctx.offs++; if (ctx.pattern[ctx.offs] == 0) { // End of pattern, error res = THERR_INVALID_DATA; goto out; } // fall-through default: // Given character must match if (ctx.bufPos < ctx.bufSize) ctx.buf[ctx.bufPos++] = ctx.pattern[ctx.offs]; else if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, false)) != THERR_OK) goto out; break; } } // Commit last string/char if any if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, false)) != THERR_OK) goto out; // Create root node th_regex_node_init(&node); node.type = TH_RE_TYPE_SUBEXPR; node.match.expr = ctx.data; ctx.data = NULL; if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) goto out; out: *pexpr = ctx.data; // Free parse context for (size_t n = 0; n < ctx.nstack; n++) { if (ctx.stack[n] != ctx.data) { th_regex_free(ctx.stack[n]); th_free(ctx.stack[n]); } } th_free(ctx.stack); th_free(tmp); th_free(ctx.buf); return res; } /** * Deallocate the given regular expression structure @p expr. * All associated data will be freed, though pointers may not * be NULLed. * * @param[in] expr structure to be deallocated */ void th_regex_free(th_regex_t *expr) { if (expr != NULL) { for (size_t nnode = 0; nnode < expr->nnodes; nnode++) { th_regex_node_t *node = &expr->nodes[nnode]; th_free(node->match.str); th_regex_free(node->match.expr); th_regex_list_free(&node->match.list); } th_free(expr->nodes); th_free(expr); } } static void th_regex_dump_indent(th_ioctx_t *fh, const int level) { for (int indent = 0; indent < level; indent++) thfputs(" ", fh); } static void th_regex_dump_node(th_ioctx_t *fh, const th_regex_node_t *node) { thfprintf(fh, "%s %s ", re_match_modes[node->mode], re_match_types[node->type]); if (node->mode == TH_RE_MATCH_COUNT) { thfprintf(fh, "min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T " : ", node->repeatMin, node->repeatMax); } switch (node->type) { case TH_RE_TYPE_CHAR: thfprintf(fh, "'%c'", node->match.chr); break; case TH_RE_TYPE_STR: thfprintf(fh, "\"%s\"", node->match.str); break; case TH_RE_TYPE_ANY_CHAR: thfprintf(fh, "."); break; case TH_RE_TYPE_LIST: case TH_RE_TYPE_LIST_REVERSE: thfputs("[ ", fh); for (size_t n = 0; n < node->match.list.nitems; n++) { const th_regex_list_item_t *li = &node->match.list.items[n]; if (li->type) { thfprintf(fh, "'%c-%c' ", li->start, li->end); } else { for (size_t i = 0; i < li->nchars; i++) thfprintf(fh, "'%c' ", li->chars[i]); } } thfputs("]", fh); break; } } /** * Print out the contents of given regular expression structure @p expr * in "human-readable" format to specified @c th_ioctx_t context. Typically * useful for debugging purposes only. * * @param[in,out] fh th_ioctx.handle to be used for output, must be writable. * @param[in] level starting whitespace indentation level * @param[in] expr regular expression structure to be "dumped" */ void th_regex_dump(th_ioctx_t *fh, const int level, const th_regex_t *expr) { if (expr != NULL) { for (size_t nnode = 0; nnode < expr->nnodes; nnode++) { th_regex_node_t *node = &expr->nodes[nnode]; th_regex_dump_indent(fh, level); thfprintf(fh, "[%" PRIu_SIZE_T "/%" PRIu_SIZE_T "] ", nnode + 1, expr->nnodes); th_regex_dump_node(fh, node); thfputs("\n", fh); if (node->type == TH_RE_TYPE_SUBEXPR) th_regex_dump(fh, level + 1, node->match.expr); } } } static bool th_regex_match_list(const th_regex_list_t *list, const th_char_t cch) { // Could be optimized, perhaps .. sort match.chars, binary search etc? for (size_t nitem = 0; nitem < list->nitems; nitem++) { const th_regex_list_item_t *item = &list->items[nitem]; if (item->type == 0) { for (size_t n = 0; n < item->nchars; n++) { if (item->chars[n] == cch) return true; } } else { if (cch >= item->start && cch <= item->end) return true; } } return false; } static bool th_regex_match_expr( const th_char_t *haystack, size_t *offs, const th_regex_t *expr, const size_t startnode, const int flags, const int level ); static bool th_regex_match_one( const th_char_t *haystack, size_t *offs, const th_regex_node_t *node, const int flags, const int level ) { th_char_t cch; bool res = false; switch (node->type) { case TH_RE_TYPE_SUBEXPR: res = th_regex_match_expr(haystack, offs, node->match.expr, 0, flags, level + 1); break; case TH_RE_TYPE_LIST: case TH_RE_TYPE_LIST_REVERSE: if ((cch = haystack[*offs]) == 0) res = false; else { res = th_regex_match_list(&node->match.list, cch); if (node->type == TH_RE_TYPE_LIST_REVERSE) res = !res; (*offs)++; } break; case TH_RE_TYPE_ANY_CHAR: if ((cch = haystack[*offs]) == 0) res = false; else { res = true; (*offs)++; } break; case TH_RE_TYPE_CHAR: if ((cch = haystack[*offs]) == 0) res = false; else { res = (cch == node->match.chr); (*offs)++; } break; case TH_RE_TYPE_STR: res = true; for (th_char_t *str = node->match.str; res && *str != 0; str++, (*offs)++) { if (haystack[*offs] != *str) res = false; } break; } return res; } static bool th_regex_match_count( const th_char_t *haystack, size_t *offs, const th_regex_t *expr, const th_regex_node_t *node, size_t *nnode, const int flags, const int level ) { size_t toffs = *offs, last_offs = *offs; ssize_t count = 0; do { // Attempt to match the repeated node once size_t poffs = toffs; if (th_regex_match_one(haystack, &poffs, node, flags, level)) { // Matched, increase count of repeats count++; //DBG_RE_PRINT("#%" PRId_SSIZE_T "\n", count); // poffs should now be at position + 1 from match } else { // Did not match, get out if repeatMin > 0 if (node->repeatMin > 0) break; } // Attempt to match rest of the expression size_t qoffs1 = poffs, qoffs2 = toffs; DBG_RE_PRINT("try rest '%s' :: '%s'\n", haystack + qoffs1, haystack + qoffs2); if (th_regex_match_expr(haystack, &qoffs1, expr, *nnode + 1, flags, level + 1)) { // Matched toffs = last_offs = qoffs1; DBG_RE_PRINT(" yes1: count=%" PRId_SSIZE_T " [%" PRId_SSIZE_T " .. %" PRId_SSIZE_T "]\n", count, node->repeatMin, node->repeatMax); // Check min repeats and if we are "not greedy". if (count >= node->repeatMin && node->repeatMax == 1) break; // Check max repeats if (node->repeatMax > 0 && count >= node->repeatMax) break; } else if (node->repeatMin == 0 && th_regex_match_expr(haystack, &qoffs2, expr, *nnode + 1, flags, level + 1)) { // Matched toffs = last_offs = qoffs2; DBG_RE_PRINT(" yes2: count=%" PRId_SSIZE_T " [%" PRId_SSIZE_T " .. %" PRId_SSIZE_T "]\n", count, node->repeatMin, node->repeatMax); // Check min repeats and if we are "not greedy". if (count >= node->repeatMin && node->repeatMax == 1) break; // Check max repeats if (node->repeatMax > 0 && count >= node->repeatMax) break; } else { // Rest of expression did not match, try again DBG_RE_PRINT(" no\n"); toffs = poffs; } } while (haystack[toffs] != 0); // Check results bool res = count >= node->repeatMin || (node->repeatMax > 0 && count >= node->repeatMax); if (res) { *offs = last_offs; *nnode = expr->nnodes; } DBG_RE_PRINT("RESULT: %s : offs=%" PRIu_SIZE_T "='%s'\n", res ? "YES" : "NO", *offs, haystack + *offs); return res; } static bool th_regex_match_expr( const th_char_t *haystack, size_t *offs, const th_regex_t *expr, const size_t startnode, const int flags, const int level ) { bool res = true; size_t soffs = *offs; for (size_t nnode = startnode; res && nnode < expr->nnodes; nnode++) { const th_regex_node_t *node = &expr->nodes[nnode]; #ifdef TH_EXPERIMENTAL_REGEX_DEBUG if (th_dbg_fh != NULL) { th_regex_dump_indent(th_dbg_fh, level); thfprintf(th_dbg_fh, "[%" PRIu_SIZE_T "/%" PRIu_SIZE_T "] ", nnode + 1, expr->nnodes); th_regex_dump_node(th_dbg_fh, node); thfprintf(th_dbg_fh, " <-> \"%s\"\n", haystack + soffs); } #endif switch (node->mode) { case TH_RE_MATCH_ONCE: res = th_regex_match_one(haystack, &soffs, node, flags, level); break; case TH_RE_MATCH_COUNT: res = th_regex_match_count(haystack, &soffs, expr, node, &nnode, flags, level); break; case TH_RE_MATCH_ANCHOR_START: res = (soffs == 0); break; case TH_RE_MATCH_ANCHOR_END: res = (haystack[soffs] == 0); break; } } if (res) *offs = soffs; return res; } /** * Match the specified string @p haystack against specified compiled * regular expression @p expr and return results in optional variables * @p pnmatches for number of matches and/or @p pmatches @c th_regex_match_t * structures for matching sequences information. If @p pmatches is used, * the resulting linked list should be eventually freed via th_regex_free_matches(). * * @param[in] expr regular expression structure to be matched * @param[in] haystack string to be matched against * @param[out] pnmatches pointer to variable to be set to number of found matches, or @c NULL if the information is not desired * @param[out] pmatches pointer to a pointer of @c th_regex_match_t structures, or @c NULL if the information is not desired * @param[in] maxmatches maximum number of matches until bailing out, or @c 0 if no limit * @param[in] flags additional flags, see @c TH_REF_* */ int th_regex_match(const th_regex_t *expr, const th_char_t *haystack, size_t *pnmatches, th_regex_match_t **pmatches, const size_t maxmatches, const int flags) { size_t nmatches = 0; int level = 0; (void) flags; if (pnmatches != NULL) *pnmatches = 0; // Check given pattern and string if (expr == NULL || haystack == NULL) return THERR_NULLPTR; // Start matching // XXX NOTE .. lots to think about and to take into account: // - anchored and unanchored expressions // - how to check if the expression has consumed all possibilities? // .. for (size_t soffs = 0; haystack[soffs] != 0; ) { size_t coffs = soffs; if (th_regex_match_expr(haystack, &coffs, expr, 0, flags, level)) { // A match was found, increase count nmatches++; // Deliver to caller if required if (pnmatches != NULL) *pnmatches = nmatches; if (pmatches != NULL) { // Add the match region to the list th_regex_match_t *match = th_malloc0(sizeof(th_regex_match_t)); if (match == NULL) return THERR_MALLOC; match->type = TH_RE_MATCH_EXPR; match->start = soffs; match->len = coffs - soffs; th_llist_append_node((th_llist_t **) pmatches, (th_llist_t *) match); } // Check match count limit, if set if (maxmatches > 0 && nmatches >= maxmatches) break; // If offset was not advanced, increase by one // otherwise use end of match offset as new start if (soffs == coffs) soffs++; else soffs = coffs; } else { soffs++; } } return THERR_OK; } static void th_regex_free_match(th_regex_match_t *node) { (void) node; // Nothing to do here at the moment } /** * Deallocate the given set of @c th_regex_match_t * linked list structures pointed by @p matches. * All associated data will be freed. * * @param[in] matches structure to be deallocated */ void th_regex_free_matches(th_regex_match_t *matches) { th_llist_free_func_data((th_llist_t *) matches, (void (*)(void *)) th_regex_free_match); }