comparison th_regex.c @ 635:d191ded8a790

Improve the experimental regex matching debugging macros.
author Matti Hamalainen <ccr@tnsp.org>
date Tue, 21 Jan 2020 07:16:18 +0200
parents afcaf5e38f56
children c4bca120bfb0
comparison
equal deleted inserted replaced
634:c09b068ecb32 635:d191ded8a790
7 */ 7 */
8 #include "th_util.h" 8 #include "th_util.h"
9 #include "th_regex.h" 9 #include "th_regex.h"
10 10
11 11
12 //#define DBG_RE_COMPILE 1 12 #ifdef TH_EXPERIMENTAL_REGEX_DEBUG
13 //#define DBG_RE_FREE 1 13 int th_dbg_re_flags = 0;
14 #define DBG_RE_MATCH 1 14 # define DBG_RE_COMPILE(...) do { if (th_dbg_re_flags & TH_DBG_RE_COMPILE) fprintf(stderr, __VA_ARGS__); } while (0)
15 15 # define DBG_RE_FREE(...) do { if (th_dbg_re_flags & TH_DBG_RE_FREE) fprintf(stderr, __VA_ARGS__); } while (0)
16 #if defined(DBG_RE_COMPILE) 16 # define DBG_RE_MATCH(...) do { if (th_dbg_re_flags & TH_DBG_RE_MATCH) fprintf(stderr, __VA_ARGS__); } while (0)
17 # define DBG_RE_PRINT_COMPILE(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
18 #else 17 #else
19 # define DBG_RE_PRINT_COMPILE(...) do { } while (0) 18 # define DBG_RE_COMPILE(...)
19 # define DBG_RE_FREE(...)
20 # define DBG_RE_MATCH(...)
20 #endif 21 #endif
21 #if defined(DBG_RE_FREE) 22
22 # define DBG_RE_PRINT_FREE(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
23 #else
24 # define DBG_RE_PRINT_FREE(...) do { } while (0)
25 #endif
26 #if defined(DBG_RE_MATCH)
27 # define DBG_RE_PRINT_MATCH(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
28 #else
29 # define DBG_RE_PRINT_MATCH(...) do { } while (0)
30 #endif
31 23
32 #ifdef TH_EXPERIMENTAL_REGEX 24 #ifdef TH_EXPERIMENTAL_REGEX
33 25
34 enum 26 enum
35 { 27 {
49 TH_RE_TYPE_LIST_REVERSE, 41 TH_RE_TYPE_LIST_REVERSE,
50 TH_RE_TYPE_SUBEXPR, 42 TH_RE_TYPE_SUBEXPR,
51 }; 43 };
52 44
53 45
54 #if defined(DBG_RE_COMPILE) || defined(DBG_RE_FREE) || defined(DBG_RE_MATCH) 46 #ifdef TH_EXPERIMENTAL_REGEX_DEBUG
55 static const char *re_match_modes[] = 47 static const char *re_match_modes[] =
56 { 48 {
57 "ONCE", 49 "ONCE",
58 "COUNT GREEDY", 50 "COUNT GREEDY",
59 "COUNT NONGREEDY", 51 "COUNT NONGREEDY",
176 } 168 }
177 169
178 memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node)); 170 memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node));
179 data->nnodes++; 171 data->nnodes++;
180 172
181 DBG_RE_PRINT_COMPILE( 173 DBG_RE_COMPILE(
182 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " 174 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: "
183 "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", 175 "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n",
184 data->nnodes, data->nodessize, 176 data->nnodes, data->nodessize,
185 node->mode, node->type, 177 node->mode, node->type,
186 node->repeatMin, node->repeatMax); 178 node->repeatMin, node->repeatMax);
253 ctx.pattern = pattern; 245 ctx.pattern = pattern;
254 246
255 for (; ctx.pattern[ctx.offs] != 0; ctx.offs++) 247 for (; ctx.pattern[ctx.offs] != 0; ctx.offs++)
256 { 248 {
257 th_regex_char cch = ctx.pattern[ctx.offs]; 249 th_regex_char cch = ctx.pattern[ctx.offs];
258 DBG_RE_PRINT_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch); 250 DBG_RE_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch);
259 switch (cch) 251 switch (cch)
260 { 252 {
261 case '?': 253 case '?':
262 case '*': 254 case '*':
263 case '+': 255 case '+':
459 { 451 {
460 for (size_t n = 0; n < expr->nnodes; n++) 452 for (size_t n = 0; n < expr->nnodes; n++)
461 { 453 {
462 th_regex_node *node = &expr->nodes[n]; 454 th_regex_node *node = &expr->nodes[n];
463 455
464 DBG_RE_PRINT_FREE( 456 DBG_RE_FREE(
465 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " 457 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: "
466 "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", 458 "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n",
467 n, expr->nnodes, 459 n, expr->nnodes,
468 node->mode, node->type, 460 node->mode, node->type,
469 node->repeatMin, node->repeatMax); 461 node->repeatMin, node->repeatMax);
470 462
471 switch (node->type) 463 switch (node->type)
472 { 464 {
473 case TH_RE_TYPE_SUBEXPR: 465 case TH_RE_TYPE_SUBEXPR:
474 DBG_RE_PRINT_FREE(" SUBEXPR: %p vs %p\n", expr, node->match.expr); 466 DBG_RE_FREE(" SUBEXPR: %p vs %p\n", (void *) expr, (void *) node->match.expr);
475 th_regex_free(node->match.expr); 467 th_regex_free(node->match.expr);
476 break; 468 break;
477 469
478 case TH_RE_TYPE_LIST: 470 case TH_RE_TYPE_LIST:
479 case TH_RE_TYPE_LIST_REVERSE: 471 case TH_RE_TYPE_LIST_REVERSE:
480 DBG_RE_PRINT_FREE(" list='%s'\n", node->match.list.chars); 472 DBG_RE_FREE(" list='%s'\n", node->match.list.chars);
481 th_free(node->match.list.chars); 473 th_free(node->match.list.chars);
482 break; 474 break;
483 475
484 #ifdef DBG_RE_FREE 476 #ifdef TH_EXPERIMENTAL_REGEX_DEBUG
485 case TH_RE_TYPE_ANY_CHAR: 477 case TH_RE_TYPE_ANY_CHAR:
486 DBG_RE_PRINT_FREE(" any char\n"); 478 DBG_RE_FREE(" any char\n");
487 break; 479 break;
488 480
489 case TH_RE_TYPE_CHAR: 481 case TH_RE_TYPE_CHAR:
490 DBG_RE_PRINT_FREE(" char='%c'\n", node->match.chr); 482 DBG_RE_FREE(" char='%c'\n", node->match.chr);
491 break; 483 break;
492 #endif 484 #endif
493 } 485 }
494 } 486 }
495 487
506 size_t *offs, const th_regex_node *node, const int flags) 498 size_t *offs, const th_regex_node *node, const int flags)
507 { 499 {
508 th_regex_char cch; 500 th_regex_char cch;
509 BOOL ret = FALSE; 501 BOOL ret = FALSE;
510 502
511 DBG_RE_PRINT_MATCH(" node_START [%s]: '%s': ", re_match_types[node->type], haystack + *offs); 503 DBG_RE_MATCH(" node_START [%s]: '%s': ", re_match_types[node->type], haystack + *offs);
512 504
513 switch (node->type) 505 switch (node->type)
514 { 506 {
515 case TH_RE_TYPE_SUBEXPR: 507 case TH_RE_TYPE_SUBEXPR:
516 DBG_RE_PRINT_MATCH("subexpr ..\n"); 508 DBG_RE_MATCH("subexpr ..\n");
517 ret = th_regex_do_match_expr(node->match.expr, haystack, offs, flags); 509 ret = th_regex_do_match_expr(node->match.expr, haystack, offs, flags);
518 break; 510 break;
519 511
520 case TH_RE_TYPE_LIST: 512 case TH_RE_TYPE_LIST:
521 case TH_RE_TYPE_LIST_REVERSE: 513 case TH_RE_TYPE_LIST_REVERSE:
522 DBG_RE_PRINT_MATCH("[%s]\n", node->match.list.chars); 514 DBG_RE_MATCH("[%s]\n", node->match.list.chars);
523 ret = FALSE; 515 ret = FALSE;
524 if ((cch = haystack[*offs]) == 0) 516 if ((cch = haystack[*offs]) == 0)
525 goto out; 517 goto out;
526 518
527 // Could be optimized, perhaps .. sort match.chars, binary search etc? 519 // Could be optimized, perhaps .. sort match.chars, binary search etc?
538 530
539 (*offs)++; 531 (*offs)++;
540 break; 532 break;
541 533
542 case TH_RE_TYPE_ANY_CHAR: 534 case TH_RE_TYPE_ANY_CHAR:
543 DBG_RE_PRINT_MATCH("\n"); 535 DBG_RE_MATCH("\n");
544 if ((cch = haystack[*offs]) == 0) 536 if ((cch = haystack[*offs]) == 0)
545 { 537 {
546 ret = FALSE; 538 ret = FALSE;
547 goto out; 539 goto out;
548 } 540 }
550 ret = TRUE; 542 ret = TRUE;
551 (*offs)++; 543 (*offs)++;
552 break; 544 break;
553 545
554 case TH_RE_TYPE_CHAR: 546 case TH_RE_TYPE_CHAR:
555 DBG_RE_PRINT_MATCH("'%c'\n", node->match.chr); 547 DBG_RE_MATCH("'%c'\n", node->match.chr);
556 if ((cch = haystack[*offs]) == 0) 548 if ((cch = haystack[*offs]) == 0)
557 { 549 {
558 ret = FALSE; 550 ret = FALSE;
559 goto out; 551 goto out;
560 } 552 }
563 (*offs)++; 555 (*offs)++;
564 break; 556 break;
565 } 557 }
566 558
567 out: 559 out:
568 DBG_RE_PRINT_MATCH(" node_DONE [%s]: match %s\n", re_match_types[node->type], ret ? "YES" : "NO"); 560 DBG_RE_MATCH(" node_DONE [%s]: match %s\n", re_match_types[node->type], ret ? "YES" : "NO");
569 return ret; 561 return ret;
570 } 562 }
571 563
572 564
573 static BOOL th_regex_do_match_expr(const th_regex_ctx *expr, 565 static BOOL th_regex_do_match_expr(const th_regex_ctx *expr,
576 for (size_t n = 0; n < expr->nnodes; n++) 568 for (size_t n = 0; n < expr->nnodes; n++)
577 { 569 {
578 const th_regex_node *node = &expr->nodes[n]; 570 const th_regex_node *node = &expr->nodes[n];
579 size_t soffs; 571 size_t soffs;
580 572
581 DBG_RE_PRINT_MATCH(" expr [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ", 573 DBG_RE_MATCH(" expr [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ",
582 n, expr->nnodes, re_match_modes[node->mode]); 574 n, expr->nnodes, re_match_modes[node->mode]);
583 575
584 switch (node->mode) 576 switch (node->mode)
585 { 577 {
586 case TH_RE_MATCH_ONCE: 578 case TH_RE_MATCH_ONCE:
587 { 579 {
588 DBG_RE_PRINT_MATCH("\n"); 580 DBG_RE_MATCH("\n");
589 soffs = *offs; 581 soffs = *offs;
590 if (!th_regex_do_match_node(haystack, &soffs, node, flags)) 582 if (!th_regex_do_match_node(haystack, &soffs, node, flags))
591 return FALSE; 583 return FALSE;
592 *offs = soffs; 584 *offs = soffs;
593 } 585 }
597 case TH_RE_MATCH_COUNT_NONGREEDY: 589 case TH_RE_MATCH_COUNT_NONGREEDY:
598 { 590 {
599 BOOL done = FALSE; 591 BOOL done = FALSE;
600 ssize_t count = 0; 592 ssize_t count = 0;
601 593
602 DBG_RE_PRINT_MATCH("min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax); 594 DBG_RE_MATCH("min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax);
603 595
604 do 596 do
605 { 597 {
606 BOOL match; 598 BOOL match;
607 soffs = *offs; 599 soffs = *offs;
629 soffs = *offs; 621 soffs = *offs;
630 break; 622 break;
631 } 623 }
632 } while (!done); 624 } while (!done);
633 625
634 DBG_RE_PRINT_MATCH("RESULT: %" PRId_SSIZE_T " = %s\n", count, done ? "YES" : "NO"); 626 DBG_RE_MATCH("RESULT: %" PRId_SSIZE_T " = %s\n", count, done ? "YES" : "NO");
635 627
636 if (!done) 628 if (!done)
637 return FALSE; 629 return FALSE;
638 630
639 *offs = soffs; 631 *offs = soffs;
640 } 632 }
641 break; 633 break;
642 634
643 case TH_RE_MATCH_ANCHOR_START: 635 case TH_RE_MATCH_ANCHOR_START:
644 DBG_RE_PRINT_MATCH("offs=%" PRIu_SIZE_T "\n", *offs); 636 DBG_RE_MATCH("offs=%" PRIu_SIZE_T "\n", *offs);
645 if (*offs != 0) 637 if (*offs != 0)
646 return FALSE; 638 return FALSE;
647 break; 639 break;
648 640
649 case TH_RE_MATCH_ANCHOR_END: 641 case TH_RE_MATCH_ANCHOR_END:
650 DBG_RE_PRINT_MATCH("is end=%d\n", haystack[*offs]); 642 DBG_RE_MATCH("is end=%d\n", haystack[*offs]);
651 if (haystack[*offs] != 0) 643 if (haystack[*offs] != 0)
652 return FALSE; 644 return FALSE;
653 break; 645 break;
654 } 646 }
655 } 647 }
679 for (size_t soffs = 0; haystack[soffs] != 0; ) 671 for (size_t soffs = 0; haystack[soffs] != 0; )
680 { 672 {
681 BOOL matched; 673 BOOL matched;
682 size_t coffs = soffs; 674 size_t coffs = soffs;
683 675
684 DBG_RE_PRINT_MATCH("\nTRY_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n", 676 DBG_RE_MATCH("\nTRY_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n",
685 soffs, haystack + soffs); 677 soffs, haystack + soffs);
686 678
687 if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags))) 679 if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags)))
688 { 680 {
689 // A match was found, increase count 681 // A match was found, increase count