comparison th_regex.c @ 645:b897995101b7

More fiddling and twiddling. Add parsing to string nodes instead of separate character nodes.
author Matti Hamalainen <ccr@tnsp.org>
date Sat, 25 Jan 2020 05:11:40 +0200
parents a2bf1ea05b05
children 1e7e3f96632e
comparison
equal deleted inserted replaced
644:562de49f8b4b 645:b897995101b7
8 #include "th_util.h" 8 #include "th_util.h"
9 #include "th_regex.h" 9 #include "th_regex.h"
10 10
11 11
12 #ifdef TH_EXPERIMENTAL_REGEX_DEBUG 12 #ifdef TH_EXPERIMENTAL_REGEX_DEBUG
13 # define DBG_RE_COMPILE(...) do { if (th_dbg_re_flags & TH_DBG_RE_COMPILE) fprintf(stdout, __VA_ARGS__); } while (0) 13 # define DBG_RE_MATCH(...) do { if (th_dbg_re_flags) fprintf(stdout, __VA_ARGS__); } while (0)
14 # define DBG_RE_FREE(...) do { if (th_dbg_re_flags & TH_DBG_RE_FREE) fprintf(stdout, __VA_ARGS__); } while (0) 14 BOOL th_dbg_re_flags = FALSE;
15 # define DBG_RE_MATCH(...) do { if (th_dbg_re_flags & TH_DBG_RE_MATCH) fprintf(stdout, __VA_ARGS__); } while (0) 15 #else
16 16 # define DBG_RE_MATCH(...)
17 int th_dbg_re_flags = 0; 17 #endif
18
19
20 #ifdef TH_EXPERIMENTAL_REGEX
21
22 enum
23 {
24 TH_RE_MATCH_ONCE,
25 TH_RE_MATCH_COUNT,
26 TH_RE_MATCH_ANCHOR_START,
27 TH_RE_MATCH_ANCHOR_END,
28 };
29
30
31 enum
32 {
33 TH_RE_TYPE_CHAR,
34 TH_RE_TYPE_STR,
35 TH_RE_TYPE_ANY_CHAR,
36 TH_RE_TYPE_LIST,
37 TH_RE_TYPE_LIST_REVERSE,
38 TH_RE_TYPE_SUBEXPR,
39 };
40
18 41
19 static const char *re_match_modes[] = 42 static const char *re_match_modes[] =
20 { 43 {
21 "ONCE", 44 "ONCE",
22 "COUNT", 45 "COUNT",
23 "ANCHOR START", 46 "ANCHOR START",
24 "ANCHOR END", 47 "ANCHOR END",
25 }; 48 };
26 49
50
27 static const char *re_match_types[] = 51 static const char *re_match_types[] =
28 { 52 {
29 "CHAR", 53 "CHAR",
54 "STR",
30 "ANY", 55 "ANY",
31 "LIST", 56 "LIST",
32 "LIST REVERSE", 57 "LIST REVERSE",
33 "SUBEXPR", 58 "SUBEXPR",
34 }; 59 };
35 60
36 #else
37 # define DBG_RE_COMPILE(...)
38 # define DBG_RE_FREE(...)
39 # define DBG_RE_MATCH(...)
40 #endif
41
42
43 #ifdef TH_EXPERIMENTAL_REGEX
44 61
45 typedef struct 62 typedef struct
46 { 63 {
47 int type; 64 int type;
48 th_regex_char_t start, end; 65 th_regex_char_t start, end;
64 int mode, type; 81 int mode, type;
65 ssize_t repeatMin, repeatMax; 82 ssize_t repeatMin, repeatMax;
66 83
67 struct { 84 struct {
68 th_regex_char_t chr; 85 th_regex_char_t chr;
86 th_regex_char_t *str;
69 th_regex_list_t list; 87 th_regex_list_t list;
70 struct th_regex_t *expr; 88
89 th_regex_t *expr;
71 } match; 90 } match;
72 } th_regex_node_t; 91 } th_regex_node_t;
73 92
74 93
75 struct th_regex_t 94 struct th_regex_t
77 size_t nnodes, nodessize; 96 size_t nnodes, nodessize;
78 th_regex_node_t *nodes; 97 th_regex_node_t *nodes;
79 }; 98 };
80 99
81 100
82 enum
83 {
84 TH_RE_MATCH_ONCE,
85 TH_RE_MATCH_COUNT,
86
87 TH_RE_MATCH_ANCHOR_START,
88 TH_RE_MATCH_ANCHOR_END,
89 };
90
91
92 enum
93 {
94 TH_RE_TYPE_CHAR,
95 TH_RE_TYPE_ANY_CHAR,
96 TH_RE_TYPE_LIST,
97 TH_RE_TYPE_LIST_REVERSE,
98 TH_RE_TYPE_SUBEXPR,
99 };
100
101
102 typedef struct 101 typedef struct
103 { 102 {
104 const th_regex_char_t *pattern; 103 const th_regex_char_t *pattern;
105 size_t offs; 104 size_t offs;
106 105
107 th_regex_t *data; 106 th_regex_t *data;
108 107
109 size_t nstack, stacksize; 108 size_t nstack, stacksize;
110 th_regex_t **stack; 109 th_regex_t **stack;
110
111 th_regex_char_t *buf;
112 size_t bufSize, bufPos;
113
111 } th_regex_parse_ctx_t; 114 } th_regex_parse_ctx_t;
112 115
113 116
114 static void th_regex_node_init(th_regex_node_t *node) 117 static void th_regex_node_init(th_regex_node_t *node)
115 { 118 {
202 return THERR_MALLOC; 205 return THERR_MALLOC;
203 } 206 }
204 207
205 memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node_t)); 208 memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node_t));
206 data->nnodes++; 209 data->nnodes++;
207
208 DBG_RE_COMPILE(
209 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: "
210 "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n",
211 data->nnodes, data->nodessize,
212 node->mode, node->type,
213 node->repeatMin, node->repeatMax);
214 210
215 return THERR_OK; 211 return THERR_OK;
216 } 212 }
217 213
218 214
325 th_regex_list_item_init(&item); 321 th_regex_list_item_init(&item);
326 item.type = 1; 322 item.type = 1;
327 item.start = *prev; 323 item.start = *prev;
328 item.end = *next; 324 item.end = *next;
329 325
330 if (item.start <= item.end) 326 if (item.start >= item.end)
331 { 327 {
332 res = THERR_INVALID_DATA; 328 res = THERR_INVALID_DATA;
333 goto out; 329 goto out;
334 } 330 }
335 331
388 th_free(tmp); 384 th_free(tmp);
389 return res; 385 return res;
390 } 386 }
391 387
392 388
389 static int th_regex_parse_ctx_node_commit_strchr_do(th_regex_parse_ctx_t *ctx,
390 const th_regex_char_t *buf, const size_t bufLen)
391 {
392 th_regex_node_t node;
393 th_regex_node_init(&node);
394
395 if (bufLen > 1)
396 {
397 int res;
398 node.type = TH_RE_TYPE_STR;
399 if ((res = th_regex_strndup(&node.match.str, buf, bufLen)) != THERR_OK)
400 return res;
401 }
402 else
403 {
404 node.type = TH_RE_TYPE_CHAR;
405 node.match.chr = buf[0];
406 }
407
408 return th_regex_parse_ctx_node_commit(ctx, &node);
409 }
410
411
412 static int th_regex_parse_ctx_node_commit_strchr(th_regex_parse_ctx_t *ctx, const BOOL split)
413 {
414 int res = THERR_OK;;
415
416 if (ctx->bufPos > 0)
417 {
418 if (ctx->bufPos > 1 && split)
419 {
420 if ((res = th_regex_parse_ctx_node_commit_strchr_do(ctx, ctx->buf, ctx->bufPos - 1)) != THERR_OK)
421 return res;
422
423 res = th_regex_parse_ctx_node_commit_strchr_do(ctx, ctx->buf + ctx->bufPos - 1, 1);
424 }
425 else
426 {
427 res = th_regex_parse_ctx_node_commit_strchr_do(ctx, ctx->buf, ctx->bufPos);
428 }
429 ctx->bufPos = 0;
430 }
431
432 return res;
433 }
434
435
393 int th_regex_compile(th_regex_t **pexpr, const th_regex_char_t *pattern) 436 int th_regex_compile(th_regex_t **pexpr, const th_regex_char_t *pattern)
394 { 437 {
395 int res = THERR_OK; 438 int res = THERR_OK;
396 th_regex_parse_ctx_t ctx; 439 th_regex_parse_ctx_t ctx;
397 th_regex_node_t node, *pnode; 440 th_regex_node_t node, *pnode;
398 th_regex_char_t *tmp = NULL; 441 th_regex_char_t *tmp = NULL;
399 size_t start; 442 size_t start;
400 443
444 // Check pointers
401 if (pexpr == NULL || pattern == NULL) 445 if (pexpr == NULL || pattern == NULL)
402 { 446 {
403 res = THERR_NULLPTR; 447 res = THERR_NULLPTR;
404 goto exit; 448 goto out;
405 } 449 }
406 450
451 // Initialize parsing context
407 memset(&ctx, 0, sizeof(ctx)); 452 memset(&ctx, 0, sizeof(ctx));
408 ctx.pattern = pattern; 453 ctx.pattern = pattern;
409 454 ctx.bufSize = 256;
455
456 if ((ctx.buf = th_malloc(ctx.bufSize * sizeof(th_regex_char_t))) == NULL)
457 {
458 res = THERR_MALLOC;
459 goto out;
460 }
461
462 // Start parsing the pattern
410 for (; ctx.pattern[ctx.offs] != 0; ctx.offs++) 463 for (; ctx.pattern[ctx.offs] != 0; ctx.offs++)
411 { 464 {
412 th_regex_char_t cch = ctx.pattern[ctx.offs]; 465 th_regex_char_t cch = ctx.pattern[ctx.offs];
413 DBG_RE_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch); 466
414 switch (cch) 467 switch (cch)
415 { 468 {
416 case '?': 469 case '?':
417 case '*': 470 case '*':
418 case '+': 471 case '+':
472 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, TRUE)) != THERR_OK)
473 goto out;
474
419 if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) 475 if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK)
420 goto exit; 476 goto out;
421 477
422 if (cch == '?') 478 if (cch == '?')
423 { 479 {
424 // Previous token is optional (repeat 0-1 times) (non-greedy matching) 480 // Previous token is optional (repeat 0-1 times) (non-greedy matching)
425 pnode->mode = TH_RE_MATCH_COUNT; 481 pnode->mode = TH_RE_MATCH_COUNT;
430 { 486 {
431 // Check if previous was a count ("**", "*+", etc.) 487 // Check if previous was a count ("**", "*+", etc.)
432 if (pnode->mode == TH_RE_MATCH_COUNT) 488 if (pnode->mode == TH_RE_MATCH_COUNT)
433 { 489 {
434 res = THERR_INVALID_DATA; 490 res = THERR_INVALID_DATA;
435 goto exit; 491 goto out;
436 } 492 }
437 493
438 pnode->mode = TH_RE_MATCH_COUNT; 494 pnode->mode = TH_RE_MATCH_COUNT;
439 495
440 if (cch == '*') 496 if (cch == '*')
451 } 507 }
452 } 508 }
453 break; 509 break;
454 510
455 case '{': 511 case '{':
512 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, TRUE)) != THERR_OK)
513 goto out;
514
456 // {n} | {min,max} 515 // {n} | {min,max}
457 start = ctx.offs + 1; 516 start = ctx.offs + 1;
458 if (!th_regex_find_next(ctx.pattern, start, &ctx.offs, '}')) 517 if (!th_regex_find_next(ctx.pattern, start, &ctx.offs, '}'))
459 { 518 {
460 // End not found 519 // End not found
461 res = THERR_INVALID_DATA; 520 res = THERR_INVALID_DATA;
462 goto exit; 521 goto out;
463 } 522 }
464 523
465 th_free(tmp); 524 th_free(tmp);
466 525
467 if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK || 526 if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK ||
468 (res = th_regex_strndup(&tmp, ctx.pattern + start, 527 (res = th_regex_strndup(&tmp, ctx.pattern + start,
469 ctx.offs - start)) != THERR_OK) 528 ctx.offs - start)) != THERR_OK)
470 goto exit; 529 goto out;
471 530
472 pnode->mode = TH_RE_MATCH_COUNT; 531 pnode->mode = TH_RE_MATCH_COUNT;
473 532
474 if (th_regex_find_next(tmp, 0, &start, ',')) 533 if (th_regex_find_next(tmp, 0, &start, ','))
475 { 534 {
476 tmp[start] = 0; 535 tmp[start] = 0;
477 if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin) || 536 if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin) ||
478 !th_regex_parse_ssize_t(tmp + start + 1, &pnode->repeatMax)) 537 !th_regex_parse_ssize_t(tmp + start + 1, &pnode->repeatMax))
479 { 538 {
480 res = THERR_INVALID_DATA; 539 res = THERR_INVALID_DATA;
481 goto exit; 540 goto out;
482 } 541 }
483 } 542 }
484 else 543 else
485 { 544 {
486 if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin)) 545 if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin))
487 { 546 {
488 res = THERR_INVALID_DATA; 547 res = THERR_INVALID_DATA;
489 goto exit; 548 goto out;
490 } 549 }
491 pnode->repeatMax = pnode->repeatMin; 550 pnode->repeatMax = pnode->repeatMin;
492 } 551 }
493 552
494 if (pnode->repeatMin < 0 || pnode->repeatMax < 1 || 553 if (pnode->repeatMin < 0 || pnode->repeatMax < 1 ||
495 pnode->repeatMax < pnode->repeatMin) 554 pnode->repeatMax < pnode->repeatMin)
496 { 555 {
497 // Invalid repeat counts 556 // Invalid repeat counts
498 res = THERR_INVALID_DATA; 557 res = THERR_INVALID_DATA;
499 goto exit; 558 goto out;
500 } 559 }
501 break; 560 break;
502 561
503 case '(': 562 case '(':
563 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK)
564 goto out;
565
504 // Start of subpattern 566 // Start of subpattern
505 if ((res = th_regex_parse_ctx_push(&ctx)) != THERR_OK) 567 if ((res = th_regex_parse_ctx_push(&ctx)) != THERR_OK)
506 goto exit; 568 goto out;
507 break; 569 break;
508 570
509 case ')': 571 case ')':
572 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK)
573 goto out;
574
510 // End of subpattern 575 // End of subpattern
511 th_regex_node_init(&node); 576 th_regex_node_init(&node);
512 node.type = TH_RE_TYPE_SUBEXPR; 577 node.type = TH_RE_TYPE_SUBEXPR;
513 578
514 if ((res = th_regex_parse_ctx_pop(&ctx, &node.match.expr)) != THERR_OK || 579 if ((res = th_regex_parse_ctx_pop(&ctx, &node.match.expr)) != THERR_OK ||
515 (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) 580 (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK)
516 goto exit; 581 goto out;
517 break; 582 break;
518 583
519 case '^': 584 case '^':
585 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK)
586 goto out;
587
520 // Start of line anchor 588 // Start of line anchor
521 th_regex_node_init(&node); 589 th_regex_node_init(&node);
522 node.mode = TH_RE_MATCH_ANCHOR_START; 590 node.mode = TH_RE_MATCH_ANCHOR_START;
523 591
524 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) 592 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK)
525 goto exit; 593 goto out;
526 break; 594 break;
527 595
528 case '$': 596 case '$':
597 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK)
598 goto out;
599
529 // End of line anchor 600 // End of line anchor
530 th_regex_node_init(&node); 601 th_regex_node_init(&node);
531 node.mode = TH_RE_MATCH_ANCHOR_END; 602 node.mode = TH_RE_MATCH_ANCHOR_END;
532 603
533 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) 604 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK)
534 goto exit; 605 goto out;
535 break; 606 break;
536 607
537 case '[': 608 case '[':
609 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK)
610 goto out;
611
538 // Start of char list 612 // Start of char list
539 start = ctx.offs + 1; 613 start = ctx.offs + 1;
540 if (!th_regex_find_next(ctx.pattern, start, &ctx.offs, ']') || 614 if (!th_regex_find_next(ctx.pattern, start, &ctx.offs, ']') ||
541 ctx.offs == start) 615 ctx.offs == start)
542 { 616 {
543 res = THERR_INVALID_DATA; 617 res = THERR_INVALID_DATA;
544 goto exit; 618 goto out;
545 } 619 }
546 620
547 th_regex_node_init(&node); 621 th_regex_node_init(&node);
548 if (ctx.pattern[start] == '^') 622 if (ctx.pattern[start] == '^')
549 { 623 {
552 } 626 }
553 else 627 else
554 node.type = TH_RE_TYPE_LIST; 628 node.type = TH_RE_TYPE_LIST;
555 629
556 if ((res = th_regex_parse_list(ctx.pattern + start, 630 if ((res = th_regex_parse_list(ctx.pattern + start,
557 ctx.offs - start, &node.match.list) != THERR_OK) || 631 ctx.offs - start, &node.match.list)) != THERR_OK ||
558 (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) 632 (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK)
559 goto exit; 633 goto out;
560 break; 634 break;
561 635
562 case '.': 636 case '.':
637 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK)
638 goto out;
639
563 // Any single character matches 640 // Any single character matches
564 th_regex_node_init(&node); 641 th_regex_node_init(&node);
565 node.type = TH_RE_TYPE_ANY_CHAR; 642 node.type = TH_RE_TYPE_ANY_CHAR;
566 643
567 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) 644 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK)
568 goto exit; 645 goto out;
569 break; 646 break;
570 647
571 case '\\': 648 case '\\':
572 // Literal escape 649 // Literal escape
573 ctx.offs++; 650 ctx.offs++;
574 if (ctx.pattern[ctx.offs] == 0) 651 if (ctx.pattern[ctx.offs] == 0)
575 { 652 {
576 // End of pattern, error 653 // End of pattern, error
577 res = THERR_INVALID_DATA; 654 res = THERR_INVALID_DATA;
578 goto exit; 655 goto out;
579 } 656 }
580 // fall-through 657 // fall-through
581 658
582 default: 659 default:
583 // Given character must match 660 // Given character must match
584 th_regex_node_init(&node); 661 if (ctx.bufPos < ctx.bufSize)
585 node.type = TH_RE_TYPE_CHAR; 662 ctx.buf[ctx.bufPos++] = ctx.pattern[ctx.offs];
586 node.match.chr = ctx.pattern[ctx.offs]; 663 else
587 664 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK)
588 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) 665 goto out;
589 goto exit; 666 break;
590 break; 667 }
591 } 668 }
592 } 669
593 670 // Commit last string/char if any
594 exit: 671 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK)
672 goto out;
673
674 // Create root node
675 th_regex_node_init(&node);
676 node.type = TH_RE_TYPE_SUBEXPR;
677 node.match.expr = ctx.data;
678 ctx.data = NULL;
679
680 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK)
681 goto out;
682
683 out:
595 *pexpr = ctx.data; 684 *pexpr = ctx.data;
685
686 // Free temporary buffers
596 th_free(tmp); 687 th_free(tmp);
688 th_free(ctx.buf);
597 return res; 689 return res;
598 } 690 }
599 691
600 692
601 void th_regex_free(th_regex_t *expr) 693 void th_regex_free(th_regex_t *expr)
603 if (expr != NULL) 695 if (expr != NULL)
604 { 696 {
605 for (size_t nnode = 0; nnode < expr->nnodes; nnode++) 697 for (size_t nnode = 0; nnode < expr->nnodes; nnode++)
606 { 698 {
607 th_regex_node_t *node = &expr->nodes[nnode]; 699 th_regex_node_t *node = &expr->nodes[nnode];
608
609 DBG_RE_FREE(
610 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: "
611 "mode=%s, type=%s, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n",
612 nnode, expr->nnodes,
613 re_match_modes[node->mode],
614 re_match_types[node->type],
615 node->repeatMin, node->repeatMax);
616 700
617 th_regex_free(node->match.expr); 701 th_regex_free(node->match.expr);
618 th_regex_list_free(&node->match.list); 702 th_regex_list_free(&node->match.list);
619 } 703 }
620 704
621 th_free(expr->nodes); 705 th_free(expr->nodes);
706 }
707 }
708
709
710 void th_regex_dump(FILE *fh, const int level, const th_regex_t *expr)
711 {
712 if (expr != NULL)
713 {
714 for (size_t nnode = 0; nnode < expr->nnodes; nnode++)
715 {
716 th_regex_node_t *node = &expr->nodes[nnode];
717
718 for (int indent = 0; indent < level; indent++)
719 fprintf(fh, " ");
720
721 fprintf(fh, "[%" PRIu_SIZE_T " / %" PRIu_SIZE_T "] %s %s ",
722 nnode + 1, expr->nnodes,
723 re_match_modes[node->mode],
724 re_match_types[node->type]);
725
726 if (node->mode == TH_RE_MATCH_COUNT)
727 {
728 fprintf(fh, "min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T " : ",
729 node->repeatMin, node->repeatMax);
730 }
731
732 switch (node->type)
733 {
734 case TH_RE_TYPE_CHAR:
735 fprintf(fh, "'%c'\n", node->match.chr);
736 break;
737
738 case TH_RE_TYPE_STR:
739 fprintf(fh, "\"%s\"\n", node->match.str);
740 break;
741
742 case TH_RE_TYPE_ANY_CHAR:
743 fprintf(fh, ".\n");
744 break;
745
746 case TH_RE_TYPE_LIST:
747 case TH_RE_TYPE_LIST_REVERSE:
748 fprintf(fh, "[ ");
749 for (size_t n = 0; n < node->match.list.nitems; n++)
750 {
751 const th_regex_list_item_t *li = &node->match.list.items[n];
752 if (li->type)
753 {
754 fprintf(fh, "'%c-%c' ", li->start, li->end);
755 }
756 else
757 {
758 for (size_t i = 0; i < li->nchars; i++)
759 fprintf(fh, "'%c' ", li->chars[i]);
760 }
761 }
762 fprintf(fh, "]\n");
763 break;
764
765 case TH_RE_TYPE_SUBEXPR:
766 fprintf(fh, "\n");
767 th_regex_dump(fh, level + 1, node->match.expr);
768 break;
769 }
770 }
622 } 771 }
623 } 772 }
624 773
625 774
626 static BOOL th_regex_match_list(const th_regex_list_t *list, const th_regex_char_t cch) 775 static BOOL th_regex_match_list(const th_regex_list_t *list, const th_regex_char_t cch)
667 BOOL res = FALSE; 816 BOOL res = FALSE;
668 817
669 switch (node->type) 818 switch (node->type)
670 { 819 {
671 case TH_RE_TYPE_SUBEXPR: 820 case TH_RE_TYPE_SUBEXPR:
672 // push current context in stack
673 // new context for subexpr
674 res = th_regex_match_expr(haystack, poffs, node->match.expr, 0, flags); 821 res = th_regex_match_expr(haystack, poffs, node->match.expr, 0, flags);
675 break; 822 break;
676 823
677 case TH_RE_TYPE_LIST: 824 case TH_RE_TYPE_LIST:
678 case TH_RE_TYPE_LIST_REVERSE: 825 case TH_RE_TYPE_LIST_REVERSE:
706 { 853 {
707 res = (cch == node->match.chr); 854 res = (cch == node->match.chr);
708 (*poffs)++; 855 (*poffs)++;
709 } 856 }
710 break; 857 break;
858
859 case TH_RE_TYPE_STR:
860 res = TRUE;
861 for (th_regex_char_t *str = node->match.str; *str != 0; )
862 {
863 if (haystack[*poffs] != *str)
864 {
865 res = FALSE;
866 break;
867 }
868 str++;
869 (*poffs)++;
870 }
871 break;
711 } 872 }
712 873
713 return res; 874 return res;
714 } 875 }
715 876
742 903
743 case TH_RE_MATCH_COUNT: 904 case TH_RE_MATCH_COUNT:
744 { 905 {
745 ssize_t count = 0; 906 ssize_t count = 0;
746 907
747 DBG_RE_MATCH(" COUNT min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", 908 DBG_RE_MATCH(" min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n",
748 node->repeatMin, node->repeatMax); 909 node->repeatMin, node->repeatMax);
749 910
750 do 911 do
751 { 912 {
752 BOOL match; 913 BOOL match;
753 size_t toffs = soffs, tnode; 914 size_t toffs = soffs, tnode;
754 915
755 DBG_RE_MATCH(" START '%s'\n", haystack + toffs); 916 DBG_RE_MATCH(" ROUND #%" PRIu_SIZE_T ": '%s'\n",
756 do { 917 count, haystack + toffs);
918
919 do
920 {
757 match = TRUE; 921 match = TRUE;
922 size_t noffs = toffs;
758 for (tnode = nnode; match && tnode < expr->nnodes && haystack[toffs] != 0; ) 923 for (tnode = nnode; match && tnode < expr->nnodes && haystack[toffs] != 0; )
759 { 924 {
760 match = th_regex_match_one(haystack, &toffs, &expr->nodes[tnode], flags); 925 match = th_regex_match_one(haystack, &toffs, &expr->nodes[tnode], flags);
761 if (match) 926 if (match)
762 tnode++; 927 tnode++;
763 } 928 }
764 DBG_RE_MATCH(" '%s': %d (tnode=%" PRIu_SIZE_T ")\n", haystack + toffs, match, tnode); 929
930 DBG_RE_MATCH(" '%s': %d (tnode=%" PRIu_SIZE_T ")\n",
931 haystack + noffs, match, tnode);
932
765 if (node->repeatMin >= 0 && match) 933 if (node->repeatMin >= 0 && match)
766 break; 934 break;
767 } while (!match && haystack[toffs] != 0); 935 } while (!match && haystack[toffs] != 0);
768 936
769 DBG_RE_MATCH(" END '%s': %d (tnode=%" PRIu_SIZE_T ")\n", haystack + toffs, match, tnode); 937 DBG_RE_MATCH(" ROUND #%" PRIu_SIZE_T " END: match=%s, tnode=%" PRIu_SIZE_T "\n",
938 count, match ? "YES" : "NO", tnode);
770 939
771 if (match) 940 if (match)
772 { 941 {
773 // Node matched 942 // Node matched
774 count++; 943 count++;