comparison th_regex.c @ 613:2e3b81ae8c8a

More work on regexes.
author Matti Hamalainen <ccr@tnsp.org>
date Thu, 16 Jan 2020 16:01:27 +0200
parents cc9ec51b4875
children afcaf5e38f56
comparison
equal deleted inserted replaced
612:cc9ec51b4875 613:2e3b81ae8c8a
11 //#include "th_datastruct.h" 11 //#include "th_datastruct.h"
12 12
13 13
14 //#define DBG_RE_COMPILE 1 14 //#define DBG_RE_COMPILE 1
15 //#define DBG_RE_FREE 1 15 //#define DBG_RE_FREE 1
16 //#define DBG_RE_MATCH 1 16 #define DBG_RE_MATCH 1
17 17
18 #if defined(DBG_RE_COMPILE) 18 #if defined(DBG_RE_COMPILE)
19 # define DBG_RE_PRINT_COMPILE(...) do { fprintf(stderr, __VA_ARGS__); } while (0) 19 # define DBG_RE_PRINT_COMPILE(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
20 #else 20 #else
21 # define DBG_RE_PRINT_COMPILE(...) do { } while (0) 21 # define DBG_RE_PRINT_COMPILE(...) do { } while (0)
33 33
34 34
35 enum 35 enum
36 { 36 {
37 TH_RE_MATCH_ONCE, 37 TH_RE_MATCH_ONCE,
38 TH_RE_MATCH_COUNT, 38 TH_RE_MATCH_COUNT_GREEDY,
39 TH_RE_MATCH_COUNT_NONGREEDY,
39 TH_RE_MATCH_ANCHOR_START, 40 TH_RE_MATCH_ANCHOR_START,
40 TH_RE_MATCH_ANCHOR_END, 41 TH_RE_MATCH_ANCHOR_END,
41 }; 42 };
42 43
43 44
53 54
54 #if defined(DBG_RE_COMPILE) || defined(DBG_RE_FREE) || defined(DBG_RE_MATCH) 55 #if defined(DBG_RE_COMPILE) || defined(DBG_RE_FREE) || defined(DBG_RE_MATCH)
55 static const char *re_match_modes[] = 56 static const char *re_match_modes[] =
56 { 57 {
57 "ONCE", 58 "ONCE",
58 "COUNT", 59 "COUNT GREEDY",
60 "COUNT NONGREEDY",
59 "ANCHOR_START", 61 "ANCHOR_START",
60 "ANCHOR_END", 62 "ANCHOR_END",
61 }; 63 };
62 64
63 static const char *re_match_types[] = 65 static const char *re_match_types[] =
251 memset(&ctx, 0, sizeof(ctx)); 253 memset(&ctx, 0, sizeof(ctx));
252 ctx.pattern = pattern; 254 ctx.pattern = pattern;
253 255
254 for (; ctx.pattern[ctx.offs] != 0; ctx.offs++) 256 for (; ctx.pattern[ctx.offs] != 0; ctx.offs++)
255 { 257 {
256 DBG_RE_PRINT_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, ctx.pattern[ctx.offs]); 258 th_regex_char cch = ctx.pattern[ctx.offs];
257 switch (ctx.pattern[ctx.offs]) 259 DBG_RE_PRINT_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch);
260 switch (cch)
258 { 261 {
259 case '?': 262 case '?':
260 // Previous token is optional (repeat 0-1 times) 263 case '*':
264 case '+':
261 if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) 265 if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK)
262 goto exit; 266 goto exit;
263 267
264 pnode->mode = TH_RE_MATCH_COUNT; 268 if (cch == '?')
265 pnode->repeatMin = 0; 269 {
266 pnode->repeatMax = 1; 270 // Check if previous was a count
267 break; 271 pnode->mode = (pnode->mode == TH_RE_MATCH_COUNT_GREEDY) ?
268 272 TH_RE_MATCH_COUNT_NONGREEDY : TH_RE_MATCH_COUNT_GREEDY;
269 case '*': 273
270 // Previous token can repeat 0 or more times 274 // Previous token is optional (repeat 0-1 times)
271 if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) 275 pnode->repeatMin = 0;
272 goto exit; 276 pnode->repeatMax = 1;
273 277 }
274 pnode->mode = TH_RE_MATCH_COUNT; 278 else
275 pnode->repeatMin = 0; 279 {
276 pnode->repeatMax = -1; 280 // Check if previous was a count
277 break; 281 if (pnode->mode == TH_RE_MATCH_COUNT_GREEDY ||
278 282 pnode->mode == TH_RE_MATCH_COUNT_NONGREEDY)
279 case '+': 283 {
280 // Previous token must repeat 1 or more times 284 res = THERR_INVALID_DATA;
281 if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) 285 goto exit;
282 goto exit; 286 }
283 287
284 pnode->mode = TH_RE_MATCH_COUNT; 288 pnode->mode = TH_RE_MATCH_COUNT_GREEDY;
285 pnode->repeatMin = 1; 289
286 pnode->repeatMax = 1; 290 if (cch == '*')
291 {
292 // Previous token can repeat 0 or more times
293 pnode->repeatMin = 0;
294 pnode->repeatMax = -1;
295 }
296 else
297 {
298 // Previous token must repeat 1 or more times
299 pnode->repeatMin = 1;
300 pnode->repeatMax = -1;
301 }
302 }
287 break; 303 break;
288 304
289 case '{': 305 case '{':
290 // {n} | {min,max} 306 // {n} | {min,max}
291 start = ctx.offs + 1; 307 start = ctx.offs + 1;
301 if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK || 317 if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK ||
302 (res = th_regex_strndup(&tmp, ctx.pattern + start, 318 (res = th_regex_strndup(&tmp, ctx.pattern + start,
303 ctx.offs - start)) != THERR_OK) 319 ctx.offs - start)) != THERR_OK)
304 goto exit; 320 goto exit;
305 321
306 pnode->mode = TH_RE_MATCH_COUNT; 322 pnode->mode = TH_RE_MATCH_COUNT_GREEDY;
307 323
308 if (th_regex_find_next(tmp, 0, &start, ',')) 324 if (th_regex_find_next(tmp, 0, &start, ','))
309 { 325 {
310 tmp[start] = 0; 326 tmp[start] = 0;
311 if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin) || 327 if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin) ||
491 size_t *offs, const th_regex_node *node, const int flags) 507 size_t *offs, const th_regex_node *node, const int flags)
492 { 508 {
493 th_regex_char cch; 509 th_regex_char cch;
494 BOOL ret = FALSE; 510 BOOL ret = FALSE;
495 511
496 DBG_RE_PRINT_MATCH("node_START [%s]: '%s': ", re_match_types[node->type], haystack + *offs); 512 DBG_RE_PRINT_MATCH(" node_START [%s]: '%s': ", re_match_types[node->type], haystack + *offs);
497 513
498 switch (node->type) 514 switch (node->type)
499 { 515 {
500 case TH_RE_TYPE_SUBEXPR: 516 case TH_RE_TYPE_SUBEXPR:
501 DBG_RE_PRINT_MATCH("subexpr ..\n"); 517 DBG_RE_PRINT_MATCH("subexpr ..\n");
548 (*offs)++; 564 (*offs)++;
549 break; 565 break;
550 } 566 }
551 567
552 out: 568 out:
553 DBG_RE_PRINT_MATCH("node_DONE [%s]: match %s\n", re_match_types[node->type], ret ? "YES" : "NO"); 569 DBG_RE_PRINT_MATCH(" node_DONE [%s]: match %s\n", re_match_types[node->type], ret ? "YES" : "NO");
554 return ret; 570 return ret;
555 } 571 }
556 572
557 573
558 static BOOL th_regex_do_match_expr(const th_regex_ctx *expr, 574 static BOOL th_regex_do_match_expr(const th_regex_ctx *expr,
561 for (size_t n = 0; n < expr->nnodes; n++) 577 for (size_t n = 0; n < expr->nnodes; n++)
562 { 578 {
563 const th_regex_node *node = &expr->nodes[n]; 579 const th_regex_node *node = &expr->nodes[n];
564 size_t soffs; 580 size_t soffs;
565 581
566 DBG_RE_PRINT_MATCH(" expr %p [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ", 582 DBG_RE_PRINT_MATCH(" expr [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ",
567 (void *) expr, n, expr->nnodes, re_match_modes[node->mode]); 583 n, expr->nnodes, re_match_modes[node->mode]);
568 584
569 switch (node->mode) 585 switch (node->mode)
570 { 586 {
571 case TH_RE_MATCH_ONCE: 587 case TH_RE_MATCH_ONCE:
572 { 588 {
576 return FALSE; 592 return FALSE;
577 *offs = soffs; 593 *offs = soffs;
578 } 594 }
579 break; 595 break;
580 596
581 case TH_RE_MATCH_COUNT: 597 case TH_RE_MATCH_COUNT_GREEDY:
582 { 598 case TH_RE_MATCH_COUNT_NONGREEDY:
583 BOOL done = FALSE, match = FALSE; 599 {
600 BOOL done = FALSE;
584 ssize_t count = 0; 601 ssize_t count = 0;
585 602
586 DBG_RE_PRINT_MATCH("min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax); 603 DBG_RE_PRINT_MATCH("min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax);
587 604
588 do 605 do
589 { 606 {
607 BOOL match;
590 soffs = *offs; 608 soffs = *offs;
591 if ((match = th_regex_do_match_node(haystack, &soffs, node, flags))) 609
610 match = th_regex_do_match_node(haystack, &soffs, node, flags);
611 for (size_t qn = n + 1; qn < expr->nnodes && haystack[soffs] != 0; qn++)
612 {
613 const th_regex_node *next = &expr->nodes[qn];
614 do {
615 match = th_regex_do_match_node(haystack, &soffs, next, flags);
616 } while (haystack[soffs] != 0 && !match);
617 }
618
619 if (match)
592 { 620 {
593 // Node matched 621 // Node matched
594 count++; 622 count++;
595 done = (node->repeatMax > 0 && count >= node->repeatMax); 623 done = (node->repeatMax > 0 && count >= node->repeatMax);
596 *offs = soffs; 624 *offs = soffs;
652 for (size_t soffs = 0; haystack[soffs] != 0; ) 680 for (size_t soffs = 0; haystack[soffs] != 0; )
653 { 681 {
654 BOOL matched; 682 BOOL matched;
655 size_t coffs = soffs; 683 size_t coffs = soffs;
656 684
657 DBG_RE_PRINT_MATCH("\nDO_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n", 685 DBG_RE_PRINT_MATCH("\nTRY_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n",
658 soffs, haystack + soffs); 686 soffs, haystack + soffs);
659 687
660 if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags))) 688 if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags)))
661 { 689 {
662 // A match was found, increase count 690 // A match was found, increase count