comparison th_regex.c @ 640:9e1f9e1d1487

Aaand some more work. Still just a broken concept.
author Matti Hamalainen <ccr@tnsp.org>
date Fri, 24 Jan 2020 09:13:24 +0200
parents 8c957ad9d4c3
children 9a1ed82abefd
comparison
equal deleted inserted replaced
639:8c957ad9d4c3 640:9e1f9e1d1487
8 #include "th_util.h" 8 #include "th_util.h"
9 #include "th_regex.h" 9 #include "th_regex.h"
10 10
11 11
12 #ifdef TH_EXPERIMENTAL_REGEX_DEBUG 12 #ifdef TH_EXPERIMENTAL_REGEX_DEBUG
13 # define DBG_RE_COMPILE(...) do { if (th_dbg_re_flags & TH_DBG_RE_COMPILE) fprintf(stdout, __VA_ARGS__); } while (0)
14 # define DBG_RE_FREE(...) do { if (th_dbg_re_flags & TH_DBG_RE_FREE) fprintf(stdout, __VA_ARGS__); } while (0)
15 # define DBG_RE_MATCH(...) do { if (th_dbg_re_flags & TH_DBG_RE_MATCH) fprintf(stdout, __VA_ARGS__); } while (0)
16
13 int th_dbg_re_flags = 0; 17 int th_dbg_re_flags = 0;
14 # define DBG_RE_COMPILE(...) do { if (th_dbg_re_flags & TH_DBG_RE_COMPILE) fprintf(stderr, __VA_ARGS__); } while (0) 18
15 # define DBG_RE_FREE(...) do { if (th_dbg_re_flags & TH_DBG_RE_FREE) fprintf(stderr, __VA_ARGS__); } while (0) 19 static const char *re_match_modes[] =
16 # define DBG_RE_MATCH(...) do { if (th_dbg_re_flags & TH_DBG_RE_MATCH) fprintf(stderr, __VA_ARGS__); } while (0) 20 {
21 "ONCE",
22 "COUNT GREEDY",
23 "COUNT NONGREEDY",
24 "ANCHOR START",
25 "ANCHOR END",
26 };
27
28 static const char *re_match_types[] =
29 {
30 "CHAR",
31 "ANY",
32 "LIST",
33 "LIST REVERSE",
34 "SUBEXPR",
35 };
36
17 #else 37 #else
18 # define DBG_RE_COMPILE(...) 38 # define DBG_RE_COMPILE(...)
19 # define DBG_RE_FREE(...) 39 # define DBG_RE_FREE(...)
20 # define DBG_RE_MATCH(...) 40 # define DBG_RE_MATCH(...)
21 #endif 41 #endif
22 42
23 43
24 #ifdef TH_EXPERIMENTAL_REGEX 44 #ifdef TH_EXPERIMENTAL_REGEX
25 45
46 typedef struct
47 {
48 int type;
49 th_regex_char_t start, end;
50
51 size_t nchars;
52 th_regex_char_t *chars;
53 } th_regex_list_item_t;
54
55
56 typedef struct
57 {
58 size_t nitems, itemssize;
59 th_regex_list_item_t *items;
60 } th_regex_list_t;
61
62
63 typedef struct
64 {
65 int mode, type;
66 ssize_t repeatMin, repeatMax;
67
68 struct {
69 th_regex_char_t chr;
70 th_regex_list_t list;
71 struct th_regex_t *expr;
72 } match;
73 } th_regex_node_t;
74
75
76 struct th_regex_t
77 {
78 size_t nnodes, nodessize;
79 th_regex_node_t *nodes;
80 };
81
82
26 enum 83 enum
27 { 84 {
28 TH_RE_MATCH_ONCE, 85 TH_RE_MATCH_ONCE,
29 TH_RE_MATCH_COUNT_GREEDY, 86 TH_RE_MATCH_COUNT_GREEDY,
30 TH_RE_MATCH_COUNT_NONGREEDY, 87 TH_RE_MATCH_COUNT_NONGREEDY,
88
31 TH_RE_MATCH_ANCHOR_START, 89 TH_RE_MATCH_ANCHOR_START,
32 TH_RE_MATCH_ANCHOR_END, 90 TH_RE_MATCH_ANCHOR_END,
33 }; 91 };
34 92
35 93
41 TH_RE_TYPE_LIST_REVERSE, 99 TH_RE_TYPE_LIST_REVERSE,
42 TH_RE_TYPE_SUBEXPR, 100 TH_RE_TYPE_SUBEXPR,
43 }; 101 };
44 102
45 103
46 #ifdef TH_EXPERIMENTAL_REGEX_DEBUG
47 static const char *re_match_modes[] =
48 {
49 "ONCE",
50 "COUNT GREEDY",
51 "COUNT NONGREEDY",
52 "ANCHOR_START",
53 "ANCHOR_END",
54 };
55
56 static const char *re_match_types[] =
57 {
58 "CHAR",
59 "ANY",
60 "LIST",
61 "LIST_REVERSE",
62 "SUBEXPR",
63 };
64 #endif
65
66
67 typedef struct 104 typedef struct
68 { 105 {
69 const th_regex_char *pattern; 106 const th_regex_char_t *pattern;
70 size_t offs; 107 size_t offs;
71 108
72 th_regex_ctx *data; 109 th_regex_t *data;
73 110
74 size_t nstack, stacksize; 111 size_t nstack, stacksize;
75 th_regex_ctx **stack; 112 th_regex_t **stack;
76 } th_regex_parse_ctx; 113 } th_regex_parse_ctx_t;
77 114
78 115
79 static void th_regex_node_init(th_regex_node *node) 116 static void th_regex_node_init(th_regex_node_t *node)
80 { 117 {
81 memset(node, 0, sizeof(th_regex_node)); 118 memset(node, 0, sizeof(th_regex_node_t));
82 node->mode = TH_RE_MATCH_ONCE; 119 node->mode = TH_RE_MATCH_ONCE;
83 } 120 }
84 121
85 122
86 static int th_regex_strndup(th_regex_char **pdst, 123 static int th_regex_strndup(th_regex_char_t **pdst,
87 const th_regex_char *src, const size_t len) 124 const th_regex_char_t *src, const size_t len)
88 { 125 {
89 if (pdst == NULL) 126 if (pdst == NULL)
90 return THERR_NULLPTR; 127 return THERR_NULLPTR;
91 128
92 if (UINTPTR_MAX / sizeof(th_regex_char) < len + 1) 129 if (UINTPTR_MAX / sizeof(th_regex_char_t) < len + 1)
93 return THERR_BOUNDS; 130 return THERR_BOUNDS;
94 131
95 if ((*pdst = (th_regex_char *) 132 if ((*pdst = (th_regex_char_t *)
96 th_malloc((len + 1) * sizeof(th_regex_char))) == NULL) 133 th_malloc((len + 1) * sizeof(th_regex_char_t))) == NULL)
97 return THERR_MALLOC; 134 return THERR_MALLOC;
98 135
99 memcpy(*pdst, src, len * sizeof(th_regex_char)); 136 memcpy(*pdst, src, len * sizeof(th_regex_char_t));
100 (*pdst)[len] = 0; 137 (*pdst)[len] = 0;
101 138
102 return THERR_OK; 139 return THERR_OK;
103 } 140 }
104 141
105 142
106 static int th_regex_ctx_get_prev_node( 143 static int th_regex_parse_ctx_get_prev_node(
107 th_regex_parse_ctx *ctx, th_regex_node **pnode) 144 th_regex_parse_ctx_t *ctx, th_regex_node_t **pnode)
108 { 145 {
109 if (ctx->data != NULL && ctx->data->nnodes > 0) 146 if (ctx->data != NULL && ctx->data->nnodes > 0)
110 { 147 {
111 *pnode = &ctx->data->nodes[ctx->data->nnodes - 1]; 148 *pnode = &ctx->data->nodes[ctx->data->nnodes - 1];
112 return THERR_OK; 149 return THERR_OK;
114 else 151 else
115 return THERR_INVALID_DATA; 152 return THERR_INVALID_DATA;
116 } 153 }
117 154
118 155
119 static int th_regex_ctx_push(th_regex_parse_ctx *ctx) 156 static int th_regex_parse_ctx_push(th_regex_parse_ctx_t *ctx)
120 { 157 {
121 if (ctx->stack == NULL || ctx->nstack + 1 >= ctx->stacksize) 158 if (ctx->stack == NULL || ctx->nstack + 1 >= ctx->stacksize)
122 { 159 {
123 ctx->stacksize += 16; 160 ctx->stacksize += 16;
124 161
125 if ((ctx->stack = th_realloc(ctx->stack, 162 if ((ctx->stack = th_realloc(ctx->stack,
126 ctx->stacksize * sizeof(th_regex_node *))) == NULL) 163 ctx->stacksize * sizeof(th_regex_node_t *))) == NULL)
127 return THERR_MALLOC; 164 return THERR_MALLOC;
128 } 165 }
129 166
130 ctx->stack[ctx->nstack] = ctx->data; 167 ctx->stack[ctx->nstack] = ctx->data;
131 ctx->nstack++; 168 ctx->nstack++;
133 170
134 return THERR_OK; 171 return THERR_OK;
135 } 172 }
136 173
137 174
138 static int th_regex_ctx_pop(th_regex_parse_ctx *ctx, th_regex_ctx **data) 175 static int th_regex_parse_ctx_pop(th_regex_parse_ctx_t *ctx, th_regex_t **data)
139 { 176 {
140 if (ctx->nstack > 0) 177 if (ctx->nstack > 0)
141 { 178 {
142 *data = ctx->data; 179 *data = ctx->data;
143 ctx->nstack--; 180 ctx->nstack--;
147 else 184 else
148 return THERR_INVALID_DATA; 185 return THERR_INVALID_DATA;
149 } 186 }
150 187
151 188
152 static int th_regex_ctx_node_commit(th_regex_parse_ctx *ctx, th_regex_node *node) 189 static int th_regex_parse_ctx_node_commit(th_regex_parse_ctx_t *ctx, th_regex_node_t *node)
153 { 190 {
154 th_regex_ctx *data = ctx->data; 191 th_regex_t *data = ctx->data;
155 192
156 if (data == NULL) 193 if (data == NULL)
157 { 194 {
158 if ((data = ctx->data = th_malloc0(sizeof(th_regex_ctx))) == NULL) 195 if ((data = ctx->data = th_malloc0(sizeof(th_regex_t))) == NULL)
159 return THERR_MALLOC; 196 return THERR_MALLOC;
160 } 197 }
161 198
162 if (data->nodes == NULL || data->nnodes + 1 >= data->nodessize) 199 if (data->nodes == NULL || data->nnodes + 1 >= data->nodessize)
163 { 200 {
164 data->nodessize += 16; 201 data->nodessize += 16;
165 if ((data->nodes = th_realloc(data->nodes, 202 if ((data->nodes = th_realloc(data->nodes,
166 data->nodessize * sizeof(th_regex_node))) == NULL) 203 data->nodessize * sizeof(th_regex_node_t))) == NULL)
167 return THERR_MALLOC; 204 return THERR_MALLOC;
168 } 205 }
169 206
170 memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node)); 207 memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node_t));
171 data->nnodes++; 208 data->nnodes++;
172 209
173 DBG_RE_COMPILE( 210 DBG_RE_COMPILE(
174 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " 211 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: "
175 "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", 212 "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n",
179 216
180 return THERR_OK; 217 return THERR_OK;
181 } 218 }
182 219
183 220
184 static BOOL th_regex_find_next(const th_regex_char *str, 221 static BOOL th_regex_find_next(const th_regex_char_t *str,
185 const size_t start, size_t *offs, 222 const size_t start, size_t *offs,
186 const th_regex_char delim) 223 const th_regex_char_t delim)
187 { 224 {
188 for (*offs = start; str[*offs] != 0; (*offs)++) 225 for (*offs = start; str[*offs] != 0; (*offs)++)
189 { 226 {
190 if (str[*offs] == delim) 227 if (str[*offs] == delim)
191 return TRUE; 228 return TRUE;
192 } 229 }
193 return FALSE; 230 return FALSE;
194 } 231 }
195 232
196 233
197 static BOOL th_regex_parse_ssize_t(const th_regex_char *str, 234 static BOOL th_regex_parse_ssize_t(const th_regex_char_t *str,
198 ssize_t *value) 235 ssize_t *value)
199 { 236 {
200 th_regex_char ch; 237 th_regex_char_t ch;
201 BOOL neg; 238 BOOL neg;
202 239
203 if (*str == '-') 240 if (*str == '-')
204 { 241 {
205 str++; 242 str++;
225 262
226 return TRUE; 263 return TRUE;
227 } 264 }
228 265
229 266
230 static void th_regex_list_item_init(th_regex_list_item *item) 267 static void th_regex_list_item_init(th_regex_list_item_t *item)
231 { 268 {
232 memset(item, 0, sizeof(th_regex_list_item)); 269 memset(item, 0, sizeof(th_regex_list_item_t));
233 } 270 }
234 271
235 272
236 static int th_regex_list_add_item(th_regex_list *list, th_regex_list_item *item) 273 static int th_regex_list_add_item(th_regex_list_t *list, th_regex_list_item_t *item)
237 { 274 {
238 if (list->items == NULL || list->nitems + 1 >= list->itemssize) 275 if (list->items == NULL || list->nitems + 1 >= list->itemssize)
239 { 276 {
240 list->itemssize += 16; 277 list->itemssize += 16;
241 278
242 if ((list->items = th_realloc(list->items, 279 if ((list->items = th_realloc(list->items,
243 list->itemssize * sizeof(th_regex_list_item))) == NULL) 280 list->itemssize * sizeof(th_regex_list_item_t))) == NULL)
244 return THERR_MALLOC; 281 return THERR_MALLOC;
245 } 282 }
246 283
247 memcpy(list->items + list->nitems, item, sizeof(th_regex_list_item)); 284 memcpy(&list->items[list->nitems], item, sizeof(th_regex_list_item_t));
248 list->nitems++; 285 list->nitems++;
249 286
250 return THERR_OK; 287 return THERR_OK;
251 } 288 }
252 289
253 290
254 static void th_regex_list_free(th_regex_list *list) 291 static void th_regex_list_free(th_regex_list_t *list)
255 { 292 {
256 if (list != NULL) 293 if (list != NULL)
257 { 294 {
258 for (size_t n = 0; n < list->nitems; n++) 295 for (size_t n = 0; n < list->nitems; n++)
259 { 296 {
262 th_free(list->items); 299 th_free(list->items);
263 } 300 }
264 } 301 }
265 302
266 303
267 static int th_regex_parse_list(const th_regex_char *str, 304 static int th_regex_parse_list(const th_regex_char_t *str,
268 const size_t slen, th_regex_list *list) 305 const size_t slen, th_regex_list_t *list)
269 { 306 {
270 th_regex_char *tmp = NULL; 307 th_regex_char_t *tmp = NULL;
271 th_regex_list_item item; 308 th_regex_list_item_t item;
272 int res; 309 int res;
273 310
274 if ((res = th_regex_strndup(&tmp, str, slen)) != THERR_OK) 311 if ((res = th_regex_strndup(&tmp, str, slen)) != THERR_OK)
275 goto out; 312 goto out;
276 313
277 // Handle ranges like [A-Z] 314 // Handle ranges like [A-Z]
278 for (size_t offs = 0; offs < slen; offs++) 315 for (size_t offs = 0; offs < slen; offs++)
279 { 316 {
280 th_regex_char 317 th_regex_char_t
281 *prev = (offs > 0) ? tmp + offs - 1 : NULL, 318 *prev = (offs > 0) ? tmp + offs - 1 : NULL,
282 *curr = tmp + offs, 319 *curr = tmp + offs,
283 *next = (offs + 1 < slen) ? tmp + offs + 1 : NULL; 320 *next = (offs + 1 < slen) ? tmp + offs + 1 : NULL;
284 321
285 if (*curr == '-') 322 if (*curr == '-')
317 item.type = 0; 354 item.type = 0;
318 item.nchars = 0; 355 item.nchars = 0;
319 356
320 for (size_t offs = 0; offs < slen; offs++) 357 for (size_t offs = 0; offs < slen; offs++)
321 { 358 {
322 th_regex_char curr = tmp[offs]; 359 th_regex_char_t curr = tmp[offs];
323 if (curr != 0) 360 if (curr != 0)
324 item.nchars++; 361 item.nchars++;
325 } 362 }
326 363
327 if (item.nchars > 0) 364 if (item.nchars > 0)
328 { 365 {
329 if ((item.chars = th_malloc(sizeof(th_regex_char) * item.nchars)) == NULL) 366 if ((item.chars = th_malloc(sizeof(th_regex_char_t) * item.nchars)) == NULL)
330 { 367 {
331 res = THERR_MALLOC; 368 res = THERR_MALLOC;
332 goto out; 369 goto out;
333 } 370 }
334 371
335 for (size_t offs = 0, n = 0; offs < slen; offs++) 372 for (size_t offs = 0, n = 0; offs < slen; offs++)
336 { 373 {
337 th_regex_char curr = tmp[offs]; 374 th_regex_char_t curr = tmp[offs];
338 if (curr != 0) 375 if (curr != 0)
339 { 376 {
340 item.chars[n] = curr; 377 item.chars[n] = curr;
341 n++; 378 n++;
342 } 379 }
353 th_free(tmp); 390 th_free(tmp);
354 return res; 391 return res;
355 } 392 }
356 393
357 394
358 int th_regex_compile(th_regex_ctx **pexpr, const th_regex_char *pattern) 395 int th_regex_compile(th_regex_t **pexpr, const th_regex_char_t *pattern)
359 { 396 {
360 int res = THERR_OK; 397 int res = THERR_OK;
361 th_regex_parse_ctx ctx; 398 th_regex_parse_ctx_t ctx;
362 th_regex_node node, *pnode; 399 th_regex_node_t node, *pnode;
363 th_regex_char *tmp = NULL; 400 th_regex_char_t *tmp = NULL;
364 size_t start; 401 size_t start;
365 402
366 if (pexpr == NULL || pattern == NULL) 403 if (pexpr == NULL || pattern == NULL)
367 { 404 {
368 res = THERR_NULLPTR; 405 res = THERR_NULLPTR;
372 memset(&ctx, 0, sizeof(ctx)); 409 memset(&ctx, 0, sizeof(ctx));
373 ctx.pattern = pattern; 410 ctx.pattern = pattern;
374 411
375 for (; ctx.pattern[ctx.offs] != 0; ctx.offs++) 412 for (; ctx.pattern[ctx.offs] != 0; ctx.offs++)
376 { 413 {
377 th_regex_char cch = ctx.pattern[ctx.offs]; 414 th_regex_char_t cch = ctx.pattern[ctx.offs];
378 DBG_RE_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch); 415 DBG_RE_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch);
379 switch (cch) 416 switch (cch)
380 { 417 {
381 case '?': 418 case '?':
382 case '*': 419 case '*':
383 case '+': 420 case '+':
384 if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) 421 if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK)
385 goto exit; 422 goto exit;
386 423
387 if (cch == '?') 424 if (cch == '?')
388 { 425 {
389 // Check if previous was a count 426 // Check if previous was a count
431 goto exit; 468 goto exit;
432 } 469 }
433 470
434 th_free(tmp); 471 th_free(tmp);
435 472
436 if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK || 473 if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK ||
437 (res = th_regex_strndup(&tmp, ctx.pattern + start, 474 (res = th_regex_strndup(&tmp, ctx.pattern + start,
438 ctx.offs - start)) != THERR_OK) 475 ctx.offs - start)) != THERR_OK)
439 goto exit; 476 goto exit;
440 477
441 pnode->mode = TH_RE_MATCH_COUNT_GREEDY; 478 pnode->mode = TH_RE_MATCH_COUNT_GREEDY;
469 } 506 }
470 break; 507 break;
471 508
472 case '(': 509 case '(':
473 // Start of subpattern 510 // Start of subpattern
474 if ((res = th_regex_ctx_push(&ctx)) != THERR_OK) 511 if ((res = th_regex_parse_ctx_push(&ctx)) != THERR_OK)
475 goto exit; 512 goto exit;
476 break; 513 break;
477 514
478 case ')': 515 case ')':
479 // End of subpattern 516 // End of subpattern
480 th_regex_node_init(&node); 517 th_regex_node_init(&node);
481 node.type = TH_RE_TYPE_SUBEXPR; 518 node.type = TH_RE_TYPE_SUBEXPR;
482 519
483 if ((res = th_regex_ctx_pop(&ctx, &node.match.expr)) != THERR_OK || 520 if ((res = th_regex_parse_ctx_pop(&ctx, &node.match.expr)) != THERR_OK ||
484 (res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) 521 (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK)
485 goto exit; 522 goto exit;
486 break; 523 break;
487 524
488 case '^': 525 case '^':
489 // Start of line anchor 526 // Start of line anchor
490 th_regex_node_init(&node); 527 th_regex_node_init(&node);
491 node.mode = TH_RE_MATCH_ANCHOR_START; 528 node.mode = TH_RE_MATCH_ANCHOR_START;
492 529
493 if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) 530 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK)
494 goto exit; 531 goto exit;
495 break; 532 break;
496 533
497 case '$': 534 case '$':
498 // End of line anchor 535 // End of line anchor
499 th_regex_node_init(&node); 536 th_regex_node_init(&node);
500 node.mode = TH_RE_MATCH_ANCHOR_END; 537 node.mode = TH_RE_MATCH_ANCHOR_END;
501 538
502 if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) 539 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK)
503 goto exit; 540 goto exit;
504 break; 541 break;
505 542
506 case '[': 543 case '[':
507 // Start of char list 544 // Start of char list
511 { 548 {
512 res = THERR_INVALID_DATA; 549 res = THERR_INVALID_DATA;
513 goto exit; 550 goto exit;
514 } 551 }
515 552
516 // XXX TODO Parse/support ranges [0-9a-zA-Z_-]
517
518 th_regex_node_init(&node); 553 th_regex_node_init(&node);
519 node.type = (ctx.pattern[start] == '^') ? 554 if (ctx.pattern[start] == '^')
520 TH_RE_TYPE_LIST_REVERSE : TH_RE_TYPE_LIST; 555 {
521 node.match.list.nchars = ctx.offs - start; 556 node.type = TH_RE_TYPE_LIST_REVERSE;
522 if ((res = th_regex_strndup(&node.match.list.chars, 557 start++;
523 ctx.pattern + start, node.match.list.nchars)) != THERR_OK) 558 }
524 goto exit; 559 else
525 560 node.type = TH_RE_TYPE_LIST;
526 if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) 561
562 if ((res = th_regex_parse_list(ctx.pattern + start,
563 ctx.offs - start, &node.match.list) != THERR_OK) ||
564 (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK)
527 goto exit; 565 goto exit;
528 break; 566 break;
529 567
530 case '.': 568 case '.':
531 // Any single character matches 569 // Any single character matches
532 th_regex_node_init(&node); 570 th_regex_node_init(&node);
533 node.type = TH_RE_TYPE_ANY_CHAR; 571 node.type = TH_RE_TYPE_ANY_CHAR;
534 572
535 if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) 573 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK)
536 goto exit; 574 goto exit;
537 break; 575 break;
538 576
539 case '\\': 577 case '\\':
540 // Literal escape 578 // Literal escape
543 { 581 {
544 // End of pattern, error 582 // End of pattern, error
545 res = THERR_INVALID_DATA; 583 res = THERR_INVALID_DATA;
546 goto exit; 584 goto exit;
547 } 585 }
548 586 // fall-through
549 th_regex_node_init(&node);
550 node.type = TH_RE_TYPE_CHAR;
551 node.match.chr = ctx.pattern[ctx.offs];
552
553 if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK)
554 goto exit;
555 break;
556 587
557 default: 588 default:
558 // Given character must match 589 // Given character must match
559 th_regex_node_init(&node); 590 th_regex_node_init(&node);
560 node.type = TH_RE_TYPE_CHAR; 591 node.type = TH_RE_TYPE_CHAR;
561 node.match.chr = ctx.pattern[ctx.offs]; 592 node.match.chr = ctx.pattern[ctx.offs];
562 593
563 if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) 594 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK)
564 goto exit; 595 goto exit;
565 break; 596 break;
566 } 597 }
567 } 598 }
568 599
571 th_free(tmp); 602 th_free(tmp);
572 return res; 603 return res;
573 } 604 }
574 605
575 606
576 void th_regex_free(th_regex_ctx *expr) 607 void th_regex_free(th_regex_t *expr)
577 { 608 {
578 if (expr != NULL) 609 if (expr != NULL)
579 { 610 {
580 for (size_t n = 0; n < expr->nnodes; n++) 611 for (size_t nnode = 0; nnode < expr->nnodes; nnode++)
581 { 612 {
582 th_regex_node *node = &expr->nodes[n]; 613 th_regex_node_t *node = &expr->nodes[nnode];
583 614
584 DBG_RE_FREE( 615 DBG_RE_FREE(
585 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " 616 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: "
586 "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", 617 "mode=%s, type=%s, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n",
587 n, expr->nnodes, 618 nnode, expr->nnodes,
588 node->mode, node->type, 619 re_match_modes[node->mode],
620 re_match_types[node->type],
589 node->repeatMin, node->repeatMax); 621 node->repeatMin, node->repeatMax);
590 622
591 switch (node->type) 623 th_regex_free(node->match.expr);
592 { 624 th_regex_list_free(&node->match.list);
593 case TH_RE_TYPE_SUBEXPR:
594 DBG_RE_FREE(" SUBEXPR: %p vs %p\n",
595 (void *) expr, (void *) node->match.expr);
596 th_regex_free(node->match.expr);
597 break;
598
599 case TH_RE_TYPE_LIST:
600 case TH_RE_TYPE_LIST_REVERSE:
601 DBG_RE_FREE(" list='%s'\n", node->match.list.chars);
602 th_free(node->match.list.chars);
603 break;
604
605 #ifdef TH_EXPERIMENTAL_REGEX_DEBUG
606 case TH_RE_TYPE_ANY_CHAR:
607 DBG_RE_FREE(" any char\n");
608 break;
609
610 case TH_RE_TYPE_CHAR:
611 DBG_RE_FREE(" char='%c'\n", node->match.chr);
612 break;
613 #endif
614 }
615 } 625 }
616 626
617 th_free(expr->nodes); 627 th_free(expr->nodes);
618 } 628 }
619 } 629 }
620 630
621 631
622 static BOOL th_regex_do_match_list(const th_regex_list *list, const th_regex_char cch) 632 static BOOL th_regex_match_list(const th_regex_list_t *list, const th_regex_char_t cch)
623 { 633 {
624 // Could be optimized, perhaps .. sort match.chars, binary search etc? 634 // Could be optimized, perhaps .. sort match.chars, binary search etc?
625 for (size_t nitem = 0; nitem < list->nitems; nitem++) 635 for (size_t nitem = 0; nitem < list->nitems; nitem++)
626 { 636 {
627 const th_regex_list_item *item = &list->items[nitem]; 637 const th_regex_list_item_t *item = &list->items[nitem];
638
628 if (item->type == 0) 639 if (item->type == 0)
629 { 640 {
630 for (size_t n = 0; n < item->nchars; n++) 641 for (size_t n = 0; n < item->nchars; n++)
631 if (item->chars[n] == cch) 642 if (item->chars[n] == cch)
632 return TRUE; 643 return TRUE;
640 651
641 return FALSE; 652 return FALSE;
642 } 653 }
643 654
644 655
645 static BOOL th_regex_do_match_expr(const th_regex_ctx *expr, 656 static BOOL th_regex_match_expr(
646 const th_regex_char *haystack, size_t *offs, const int flags); 657 const th_regex_char_t *haystack,
647 658 size_t *poffs,
648 659 const th_regex_t *expr,
649 static BOOL th_regex_do_match_node(const th_regex_char *haystack, 660 const size_t startnode,
650 size_t *offs, const th_regex_node *node, const int flags) 661 const int flags
651 { 662 );
652 th_regex_char cch; 663
664
665 static BOOL th_regex_match_one(
666 const th_regex_char_t *haystack,
667 size_t *poffs,
668 const th_regex_node_t *node,
669 const int flags
670 )
671 {
672 th_regex_char_t cch;
653 BOOL res = FALSE; 673 BOOL res = FALSE;
654 674
655 DBG_RE_MATCH(" node_START [%s]: '%s': ",
656 re_match_types[node->type], haystack + *offs);
657
658 switch (node->type) 675 switch (node->type)
659 { 676 {
660 case TH_RE_TYPE_SUBEXPR: 677 case TH_RE_TYPE_SUBEXPR:
661 DBG_RE_MATCH("subexpr ..\n"); 678 // push current context in stack
662 res = th_regex_do_match_expr(node->match.expr, haystack, offs, flags); 679 // new context for subexpr
663 return res; 680 res = th_regex_match_expr(haystack, poffs, node->match.expr, 0, flags);
664 break; 681 break;
665 682
666 case TH_RE_TYPE_LIST: 683 case TH_RE_TYPE_LIST:
667 case TH_RE_TYPE_LIST_REVERSE: 684 case TH_RE_TYPE_LIST_REVERSE:
668 DBG_RE_MATCH("[%s]", node->match.list.chars); 685 if ((cch = haystack[*poffs]) == 0)
669 if ((cch = haystack[*offs]) == 0) 686 res = FALSE;
670 goto out; 687 else
671 688 {
672 // Could be optimized, perhaps .. sort match.chars, binary search etc? 689 res = th_regex_match_list(&node->match.list, cch);
673 // XXX TODO Ranges are not supported yet 690
674 for (size_t n = 0; n < node->match.list.nchars; n++) 691 if (node->type == TH_RE_TYPE_LIST_REVERSE)
675 if (node->match.list.chars[n] == cch) 692 res = !res;
693
694 (*poffs)++;
695 }
696 break;
697
698 case TH_RE_TYPE_ANY_CHAR:
699 if ((cch = haystack[*poffs]) == 0)
700 res = FALSE;
701 else
676 { 702 {
677 res = TRUE; 703 res = TRUE;
678 break; 704 (*poffs)++;
679 } 705 }
680
681 if (node->type == TH_RE_TYPE_LIST_REVERSE)
682 res = !res;
683
684 (*offs)++;
685 break; 706 break;
686 707
687 case TH_RE_TYPE_ANY_CHAR: 708 case TH_RE_TYPE_CHAR:
688 if ((cch = haystack[*offs]) == 0) 709 if ((cch = haystack[*poffs]) == 0)
689 goto out; 710 res = FALSE;
690 711 else
691 res = TRUE; 712 {
692 (*offs)++; 713 res = (cch == node->match.chr);
714 (*poffs)++;
715 }
693 break; 716 break;
694 717 }
695 case TH_RE_TYPE_CHAR:
696 DBG_RE_MATCH("'%c'", node->match.chr);
697 if ((cch = haystack[*offs]) == 0)
698 goto out;
699
700 res = (cch == node->match.chr);
701 (*offs)++;
702 break;
703 }
704
705 out:
706 DBG_RE_MATCH(", match=%s\n",
707 res ? "YES" : "NO");
708 718
709 return res; 719 return res;
710 } 720 }
711 721
712 722
713 static BOOL th_regex_do_match_expr(const th_regex_ctx *expr, 723 static BOOL th_regex_match_expr(
714 const th_regex_char *haystack, size_t *offs, const int flags) 724 const th_regex_char_t *haystack,
715 { 725 size_t *poffs,
716 for (size_t n = 0; n < expr->nnodes; n++) 726 const th_regex_t *expr,
717 { 727 const size_t startnode,
718 const th_regex_node *node = &expr->nodes[n]; 728 const int flags
719 size_t soffs; 729 )
720 730 {
721 DBG_RE_MATCH(" expr [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ", 731 BOOL res = TRUE;
722 n, expr->nnodes, re_match_modes[node->mode]); 732 size_t soffs = *poffs;
733
734 for (size_t nnode = startnode; res && nnode < expr->nnodes; nnode++)
735 {
736 const th_regex_node_t *node = &expr->nodes[nnode];
737
738 DBG_RE_MATCH(" expr [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s %s '%s'\n",
739 nnode, expr->nnodes,
740 re_match_modes[node->mode], re_match_types[node->type],
741 haystack + *poffs);
723 742
724 switch (node->mode) 743 switch (node->mode)
725 { 744 {
726 case TH_RE_MATCH_ONCE: 745 case TH_RE_MATCH_ONCE:
727 { 746 res = th_regex_match_one(haystack, poffs, node, flags);
728 DBG_RE_MATCH("\n");
729 soffs = *offs;
730 if (!th_regex_do_match_node(haystack, &soffs, node, flags))
731 return FALSE;
732 *offs = soffs;
733 }
734 break; 747 break;
735 748
736 case TH_RE_MATCH_COUNT_GREEDY: 749 case TH_RE_MATCH_COUNT_GREEDY:
737 case TH_RE_MATCH_COUNT_NONGREEDY: 750 case TH_RE_MATCH_COUNT_NONGREEDY:
738 { 751 {
739 BOOL done = FALSE;
740 ssize_t count = 0; 752 ssize_t count = 0;
741 753
742 DBG_RE_MATCH("min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax); 754 DBG_RE_MATCH(" COUNT min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n",
755 node->repeatMin, node->repeatMax);
743 756
744 do 757 do
745 { 758 {
746 BOOL match; 759 BOOL match;
747 soffs = *offs; 760 size_t toffs = *poffs, tnode = nnode;
748 761
749 match = th_regex_do_match_node(haystack, &soffs, node, flags); 762 do {
750 for (size_t qn = n + 1; qn < expr->nnodes && haystack[soffs] != 0; qn++) 763 match = th_regex_match_one(haystack, &toffs, node, flags);
751 { 764 if (match && haystack[toffs] != 0)
752 const th_regex_node *next = &expr->nodes[qn]; 765 {
753 do { 766 for (tnode = nnode + 1; match && tnode < expr->nnodes && haystack[toffs] != 0; tnode++)
754 match = th_regex_do_match_node(haystack, &soffs, next, flags); 767 {
755 } while (haystack[soffs] != 0 && !match); 768 size_t noffs = toffs;
756 } 769 match = th_regex_match_one(haystack, &toffs, &expr->nodes[tnode], flags);
770 DBG_RE_MATCH(" '%s': %d\n", haystack + noffs, match);
771 }
772 }
773 } while (!match && haystack[toffs] != 0);
757 774
758 if (match) 775 if (match)
759 { 776 {
760 // Node matched 777 // Node matched
761 count++; 778 count++;
762 done = (node->repeatMax > 0 && count >= node->repeatMax); 779 *poffs = soffs;
763 *offs = soffs; 780 nnode = tnode;
781 res = (node->repeatMax > 0 && count >= node->repeatMax);
764 } 782 }
765 else 783 else
766 { 784 {
767 // Node did not match, check if we got the minimum if set 785 // Node did not match, check if we got the minimum if set
768 done = (node->repeatMin >= 0 && count >= node->repeatMin); 786 res = (node->repeatMin >= 0 && count >= node->repeatMin);
769 soffs = *offs;
770 break;
771 } 787 }
772 } while (!done); 788
773 789 } while (!res);
774 DBG_RE_MATCH("RESULT: %" PRId_SSIZE_T " = %s\n", count, done ? "YES" : "NO"); 790
775 791 DBG_RE_MATCH(" RESULT: count=%" PRId_SSIZE_T ", done=%s\n",
776 if (!done) 792 count, res ? "YES" : "NO");
777 return FALSE; 793 }
778 794 break;
779 *offs = soffs; 795
780 }
781 break;
782 796
783 case TH_RE_MATCH_ANCHOR_START: 797 case TH_RE_MATCH_ANCHOR_START:
784 DBG_RE_MATCH("offs=%" PRIu_SIZE_T "\n", *offs); 798 res = (*poffs == 0);
785 if (*offs != 0)
786 return FALSE;
787 break; 799 break;
788 800
789 case TH_RE_MATCH_ANCHOR_END: 801 case TH_RE_MATCH_ANCHOR_END:
790 DBG_RE_MATCH("is end=%d\n", haystack[*offs]); 802 res = (haystack[*poffs] == 0);
791 if (haystack[*offs] != 0) 803 break;
792 return FALSE; 804 }
793 break; 805 }
794 } 806
795 } 807 if (!res)
796 808 *poffs = soffs;
797 return TRUE; 809
798 } 810 return res;
799 811 }
800 812
801 int th_regex_match(const th_regex_ctx *expr, const th_regex_char *haystack, 813
802 size_t *pnmatches, th_regex_match_node **pmatches, const size_t maxmatches, 814 int th_regex_match(const th_regex_t *expr, const th_regex_char_t *haystack,
815 size_t *pnmatches, th_regex_match_t **pmatches, const size_t maxmatches,
803 const int flags) 816 const int flags)
804 { 817 {
805 size_t nmatches = 0; 818 size_t nmatches = 0;
819 (void) flags;
806 820
807 if (pnmatches != NULL) 821 if (pnmatches != NULL)
808 *pnmatches = 0; 822 *pnmatches = 0;
809 823
810 // Check given pattern and string 824 // Check given pattern and string
816 // - anchored and unanchored expressions 830 // - anchored and unanchored expressions
817 // - how to check if the expression has consumed all possibilities? 831 // - how to check if the expression has consumed all possibilities?
818 // .. 832 // ..
819 for (size_t soffs = 0; haystack[soffs] != 0; ) 833 for (size_t soffs = 0; haystack[soffs] != 0; )
820 { 834 {
821 BOOL matched;
822 size_t coffs = soffs; 835 size_t coffs = soffs;
823 836
824 DBG_RE_MATCH("\nTRY_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n", 837 DBG_RE_MATCH("\nTRY_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n",
825 soffs, haystack + soffs); 838 soffs, haystack + soffs);
826 839
827 if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags))) 840 if (th_regex_match_expr(haystack, &coffs, expr, 0, flags))
828 { 841 {
829 // A match was found, increase count 842 // A match was found, increase count
830 nmatches++; 843 nmatches++;
831 844
832 // Deliver to caller if required 845 // Deliver to caller if required
833 if (pnmatches != NULL) 846 if (pnmatches != NULL)
834 *pnmatches = nmatches; 847 *pnmatches = nmatches;
835 848
836 if (pmatches != NULL) 849 if (pmatches != NULL)
837 { 850 {
838 th_regex_match_node *match = th_malloc0(sizeof(th_regex_match_node)); 851 th_regex_match_t *match = th_malloc0(sizeof(th_regex_match_t));
839 if (match == NULL) 852 if (match == NULL)
840 return THERR_MALLOC; 853 return THERR_MALLOC;
841 854
842 match->start = soffs; 855 match->start = soffs;
843 match->len = coffs - soffs; 856 match->len = coffs - soffs;
864 877
865 return THERR_OK; 878 return THERR_OK;
866 } 879 }
867 880
868 881
869 static void th_regex_free_match(th_regex_match_node *node) 882 static void th_regex_free_match(th_regex_match_t *node)
870 { 883 {
871 (void) node; 884 (void) node;
872 // Nothing to do here at the moment 885 // Nothing to do here at the moment
873 } 886 }
874 887
875 888
876 void th_regex_free_matches(th_regex_match_node *matches) 889 void th_regex_free_matches(th_regex_match_t *matches)
877 { 890 {
878 th_llist_free_func_node((th_llist_t *) matches, 891 th_llist_free_func_node((th_llist_t *) matches,
879 (void (*)(th_llist_t *)) th_regex_free_match); 892 (void (*)(th_llist_t *)) th_regex_free_match);
880 } 893 }
881 894