Mercurial > hg > th-libs
comparison th_regex.c @ 635:d191ded8a790
Improve the experimental regex matching debugging macros.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Tue, 21 Jan 2020 07:16:18 +0200 |
parents | afcaf5e38f56 |
children | c4bca120bfb0 |
comparison
equal
deleted
inserted
replaced
634:c09b068ecb32 | 635:d191ded8a790 |
---|---|
7 */ | 7 */ |
8 #include "th_util.h" | 8 #include "th_util.h" |
9 #include "th_regex.h" | 9 #include "th_regex.h" |
10 | 10 |
11 | 11 |
12 //#define DBG_RE_COMPILE 1 | 12 #ifdef TH_EXPERIMENTAL_REGEX_DEBUG |
13 //#define DBG_RE_FREE 1 | 13 int th_dbg_re_flags = 0; |
14 #define DBG_RE_MATCH 1 | 14 # define DBG_RE_COMPILE(...) do { if (th_dbg_re_flags & TH_DBG_RE_COMPILE) fprintf(stderr, __VA_ARGS__); } while (0) |
15 | 15 # define DBG_RE_FREE(...) do { if (th_dbg_re_flags & TH_DBG_RE_FREE) fprintf(stderr, __VA_ARGS__); } while (0) |
16 #if defined(DBG_RE_COMPILE) | 16 # define DBG_RE_MATCH(...) do { if (th_dbg_re_flags & TH_DBG_RE_MATCH) fprintf(stderr, __VA_ARGS__); } while (0) |
17 # define DBG_RE_PRINT_COMPILE(...) do { fprintf(stderr, __VA_ARGS__); } while (0) | |
18 #else | 17 #else |
19 # define DBG_RE_PRINT_COMPILE(...) do { } while (0) | 18 # define DBG_RE_COMPILE(...) |
19 # define DBG_RE_FREE(...) | |
20 # define DBG_RE_MATCH(...) | |
20 #endif | 21 #endif |
21 #if defined(DBG_RE_FREE) | 22 |
22 # define DBG_RE_PRINT_FREE(...) do { fprintf(stderr, __VA_ARGS__); } while (0) | |
23 #else | |
24 # define DBG_RE_PRINT_FREE(...) do { } while (0) | |
25 #endif | |
26 #if defined(DBG_RE_MATCH) | |
27 # define DBG_RE_PRINT_MATCH(...) do { fprintf(stderr, __VA_ARGS__); } while (0) | |
28 #else | |
29 # define DBG_RE_PRINT_MATCH(...) do { } while (0) | |
30 #endif | |
31 | 23 |
32 #ifdef TH_EXPERIMENTAL_REGEX | 24 #ifdef TH_EXPERIMENTAL_REGEX |
33 | 25 |
34 enum | 26 enum |
35 { | 27 { |
49 TH_RE_TYPE_LIST_REVERSE, | 41 TH_RE_TYPE_LIST_REVERSE, |
50 TH_RE_TYPE_SUBEXPR, | 42 TH_RE_TYPE_SUBEXPR, |
51 }; | 43 }; |
52 | 44 |
53 | 45 |
54 #if defined(DBG_RE_COMPILE) || defined(DBG_RE_FREE) || defined(DBG_RE_MATCH) | 46 #ifdef TH_EXPERIMENTAL_REGEX_DEBUG |
55 static const char *re_match_modes[] = | 47 static const char *re_match_modes[] = |
56 { | 48 { |
57 "ONCE", | 49 "ONCE", |
58 "COUNT GREEDY", | 50 "COUNT GREEDY", |
59 "COUNT NONGREEDY", | 51 "COUNT NONGREEDY", |
176 } | 168 } |
177 | 169 |
178 memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node)); | 170 memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node)); |
179 data->nnodes++; | 171 data->nnodes++; |
180 | 172 |
181 DBG_RE_PRINT_COMPILE( | 173 DBG_RE_COMPILE( |
182 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " | 174 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " |
183 "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", | 175 "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", |
184 data->nnodes, data->nodessize, | 176 data->nnodes, data->nodessize, |
185 node->mode, node->type, | 177 node->mode, node->type, |
186 node->repeatMin, node->repeatMax); | 178 node->repeatMin, node->repeatMax); |
253 ctx.pattern = pattern; | 245 ctx.pattern = pattern; |
254 | 246 |
255 for (; ctx.pattern[ctx.offs] != 0; ctx.offs++) | 247 for (; ctx.pattern[ctx.offs] != 0; ctx.offs++) |
256 { | 248 { |
257 th_regex_char cch = ctx.pattern[ctx.offs]; | 249 th_regex_char cch = ctx.pattern[ctx.offs]; |
258 DBG_RE_PRINT_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch); | 250 DBG_RE_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch); |
259 switch (cch) | 251 switch (cch) |
260 { | 252 { |
261 case '?': | 253 case '?': |
262 case '*': | 254 case '*': |
263 case '+': | 255 case '+': |
459 { | 451 { |
460 for (size_t n = 0; n < expr->nnodes; n++) | 452 for (size_t n = 0; n < expr->nnodes; n++) |
461 { | 453 { |
462 th_regex_node *node = &expr->nodes[n]; | 454 th_regex_node *node = &expr->nodes[n]; |
463 | 455 |
464 DBG_RE_PRINT_FREE( | 456 DBG_RE_FREE( |
465 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " | 457 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " |
466 "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", | 458 "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", |
467 n, expr->nnodes, | 459 n, expr->nnodes, |
468 node->mode, node->type, | 460 node->mode, node->type, |
469 node->repeatMin, node->repeatMax); | 461 node->repeatMin, node->repeatMax); |
470 | 462 |
471 switch (node->type) | 463 switch (node->type) |
472 { | 464 { |
473 case TH_RE_TYPE_SUBEXPR: | 465 case TH_RE_TYPE_SUBEXPR: |
474 DBG_RE_PRINT_FREE(" SUBEXPR: %p vs %p\n", expr, node->match.expr); | 466 DBG_RE_FREE(" SUBEXPR: %p vs %p\n", (void *) expr, (void *) node->match.expr); |
475 th_regex_free(node->match.expr); | 467 th_regex_free(node->match.expr); |
476 break; | 468 break; |
477 | 469 |
478 case TH_RE_TYPE_LIST: | 470 case TH_RE_TYPE_LIST: |
479 case TH_RE_TYPE_LIST_REVERSE: | 471 case TH_RE_TYPE_LIST_REVERSE: |
480 DBG_RE_PRINT_FREE(" list='%s'\n", node->match.list.chars); | 472 DBG_RE_FREE(" list='%s'\n", node->match.list.chars); |
481 th_free(node->match.list.chars); | 473 th_free(node->match.list.chars); |
482 break; | 474 break; |
483 | 475 |
484 #ifdef DBG_RE_FREE | 476 #ifdef TH_EXPERIMENTAL_REGEX_DEBUG |
485 case TH_RE_TYPE_ANY_CHAR: | 477 case TH_RE_TYPE_ANY_CHAR: |
486 DBG_RE_PRINT_FREE(" any char\n"); | 478 DBG_RE_FREE(" any char\n"); |
487 break; | 479 break; |
488 | 480 |
489 case TH_RE_TYPE_CHAR: | 481 case TH_RE_TYPE_CHAR: |
490 DBG_RE_PRINT_FREE(" char='%c'\n", node->match.chr); | 482 DBG_RE_FREE(" char='%c'\n", node->match.chr); |
491 break; | 483 break; |
492 #endif | 484 #endif |
493 } | 485 } |
494 } | 486 } |
495 | 487 |
506 size_t *offs, const th_regex_node *node, const int flags) | 498 size_t *offs, const th_regex_node *node, const int flags) |
507 { | 499 { |
508 th_regex_char cch; | 500 th_regex_char cch; |
509 BOOL ret = FALSE; | 501 BOOL ret = FALSE; |
510 | 502 |
511 DBG_RE_PRINT_MATCH(" node_START [%s]: '%s': ", re_match_types[node->type], haystack + *offs); | 503 DBG_RE_MATCH(" node_START [%s]: '%s': ", re_match_types[node->type], haystack + *offs); |
512 | 504 |
513 switch (node->type) | 505 switch (node->type) |
514 { | 506 { |
515 case TH_RE_TYPE_SUBEXPR: | 507 case TH_RE_TYPE_SUBEXPR: |
516 DBG_RE_PRINT_MATCH("subexpr ..\n"); | 508 DBG_RE_MATCH("subexpr ..\n"); |
517 ret = th_regex_do_match_expr(node->match.expr, haystack, offs, flags); | 509 ret = th_regex_do_match_expr(node->match.expr, haystack, offs, flags); |
518 break; | 510 break; |
519 | 511 |
520 case TH_RE_TYPE_LIST: | 512 case TH_RE_TYPE_LIST: |
521 case TH_RE_TYPE_LIST_REVERSE: | 513 case TH_RE_TYPE_LIST_REVERSE: |
522 DBG_RE_PRINT_MATCH("[%s]\n", node->match.list.chars); | 514 DBG_RE_MATCH("[%s]\n", node->match.list.chars); |
523 ret = FALSE; | 515 ret = FALSE; |
524 if ((cch = haystack[*offs]) == 0) | 516 if ((cch = haystack[*offs]) == 0) |
525 goto out; | 517 goto out; |
526 | 518 |
527 // Could be optimized, perhaps .. sort match.chars, binary search etc? | 519 // Could be optimized, perhaps .. sort match.chars, binary search etc? |
538 | 530 |
539 (*offs)++; | 531 (*offs)++; |
540 break; | 532 break; |
541 | 533 |
542 case TH_RE_TYPE_ANY_CHAR: | 534 case TH_RE_TYPE_ANY_CHAR: |
543 DBG_RE_PRINT_MATCH("\n"); | 535 DBG_RE_MATCH("\n"); |
544 if ((cch = haystack[*offs]) == 0) | 536 if ((cch = haystack[*offs]) == 0) |
545 { | 537 { |
546 ret = FALSE; | 538 ret = FALSE; |
547 goto out; | 539 goto out; |
548 } | 540 } |
550 ret = TRUE; | 542 ret = TRUE; |
551 (*offs)++; | 543 (*offs)++; |
552 break; | 544 break; |
553 | 545 |
554 case TH_RE_TYPE_CHAR: | 546 case TH_RE_TYPE_CHAR: |
555 DBG_RE_PRINT_MATCH("'%c'\n", node->match.chr); | 547 DBG_RE_MATCH("'%c'\n", node->match.chr); |
556 if ((cch = haystack[*offs]) == 0) | 548 if ((cch = haystack[*offs]) == 0) |
557 { | 549 { |
558 ret = FALSE; | 550 ret = FALSE; |
559 goto out; | 551 goto out; |
560 } | 552 } |
563 (*offs)++; | 555 (*offs)++; |
564 break; | 556 break; |
565 } | 557 } |
566 | 558 |
567 out: | 559 out: |
568 DBG_RE_PRINT_MATCH(" node_DONE [%s]: match %s\n", re_match_types[node->type], ret ? "YES" : "NO"); | 560 DBG_RE_MATCH(" node_DONE [%s]: match %s\n", re_match_types[node->type], ret ? "YES" : "NO"); |
569 return ret; | 561 return ret; |
570 } | 562 } |
571 | 563 |
572 | 564 |
573 static BOOL th_regex_do_match_expr(const th_regex_ctx *expr, | 565 static BOOL th_regex_do_match_expr(const th_regex_ctx *expr, |
576 for (size_t n = 0; n < expr->nnodes; n++) | 568 for (size_t n = 0; n < expr->nnodes; n++) |
577 { | 569 { |
578 const th_regex_node *node = &expr->nodes[n]; | 570 const th_regex_node *node = &expr->nodes[n]; |
579 size_t soffs; | 571 size_t soffs; |
580 | 572 |
581 DBG_RE_PRINT_MATCH(" expr [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ", | 573 DBG_RE_MATCH(" expr [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ", |
582 n, expr->nnodes, re_match_modes[node->mode]); | 574 n, expr->nnodes, re_match_modes[node->mode]); |
583 | 575 |
584 switch (node->mode) | 576 switch (node->mode) |
585 { | 577 { |
586 case TH_RE_MATCH_ONCE: | 578 case TH_RE_MATCH_ONCE: |
587 { | 579 { |
588 DBG_RE_PRINT_MATCH("\n"); | 580 DBG_RE_MATCH("\n"); |
589 soffs = *offs; | 581 soffs = *offs; |
590 if (!th_regex_do_match_node(haystack, &soffs, node, flags)) | 582 if (!th_regex_do_match_node(haystack, &soffs, node, flags)) |
591 return FALSE; | 583 return FALSE; |
592 *offs = soffs; | 584 *offs = soffs; |
593 } | 585 } |
597 case TH_RE_MATCH_COUNT_NONGREEDY: | 589 case TH_RE_MATCH_COUNT_NONGREEDY: |
598 { | 590 { |
599 BOOL done = FALSE; | 591 BOOL done = FALSE; |
600 ssize_t count = 0; | 592 ssize_t count = 0; |
601 | 593 |
602 DBG_RE_PRINT_MATCH("min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax); | 594 DBG_RE_MATCH("min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax); |
603 | 595 |
604 do | 596 do |
605 { | 597 { |
606 BOOL match; | 598 BOOL match; |
607 soffs = *offs; | 599 soffs = *offs; |
629 soffs = *offs; | 621 soffs = *offs; |
630 break; | 622 break; |
631 } | 623 } |
632 } while (!done); | 624 } while (!done); |
633 | 625 |
634 DBG_RE_PRINT_MATCH("RESULT: %" PRId_SSIZE_T " = %s\n", count, done ? "YES" : "NO"); | 626 DBG_RE_MATCH("RESULT: %" PRId_SSIZE_T " = %s\n", count, done ? "YES" : "NO"); |
635 | 627 |
636 if (!done) | 628 if (!done) |
637 return FALSE; | 629 return FALSE; |
638 | 630 |
639 *offs = soffs; | 631 *offs = soffs; |
640 } | 632 } |
641 break; | 633 break; |
642 | 634 |
643 case TH_RE_MATCH_ANCHOR_START: | 635 case TH_RE_MATCH_ANCHOR_START: |
644 DBG_RE_PRINT_MATCH("offs=%" PRIu_SIZE_T "\n", *offs); | 636 DBG_RE_MATCH("offs=%" PRIu_SIZE_T "\n", *offs); |
645 if (*offs != 0) | 637 if (*offs != 0) |
646 return FALSE; | 638 return FALSE; |
647 break; | 639 break; |
648 | 640 |
649 case TH_RE_MATCH_ANCHOR_END: | 641 case TH_RE_MATCH_ANCHOR_END: |
650 DBG_RE_PRINT_MATCH("is end=%d\n", haystack[*offs]); | 642 DBG_RE_MATCH("is end=%d\n", haystack[*offs]); |
651 if (haystack[*offs] != 0) | 643 if (haystack[*offs] != 0) |
652 return FALSE; | 644 return FALSE; |
653 break; | 645 break; |
654 } | 646 } |
655 } | 647 } |
679 for (size_t soffs = 0; haystack[soffs] != 0; ) | 671 for (size_t soffs = 0; haystack[soffs] != 0; ) |
680 { | 672 { |
681 BOOL matched; | 673 BOOL matched; |
682 size_t coffs = soffs; | 674 size_t coffs = soffs; |
683 | 675 |
684 DBG_RE_PRINT_MATCH("\nTRY_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n", | 676 DBG_RE_MATCH("\nTRY_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n", |
685 soffs, haystack + soffs); | 677 soffs, haystack + soffs); |
686 | 678 |
687 if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags))) | 679 if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags))) |
688 { | 680 { |
689 // A match was found, increase count | 681 // A match was found, increase count |