Mercurial > hg > th-libs
comparison th_regex.c @ 645:b897995101b7
More fiddling and twiddling. Add parsing to string nodes instead of separate character nodes.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Sat, 25 Jan 2020 05:11:40 +0200 |
parents | a2bf1ea05b05 |
children | 1e7e3f96632e |
comparison
equal
deleted
inserted
replaced
644:562de49f8b4b | 645:b897995101b7 |
---|---|
8 #include "th_util.h" | 8 #include "th_util.h" |
9 #include "th_regex.h" | 9 #include "th_regex.h" |
10 | 10 |
11 | 11 |
12 #ifdef TH_EXPERIMENTAL_REGEX_DEBUG | 12 #ifdef TH_EXPERIMENTAL_REGEX_DEBUG |
13 # define DBG_RE_COMPILE(...) do { if (th_dbg_re_flags & TH_DBG_RE_COMPILE) fprintf(stdout, __VA_ARGS__); } while (0) | 13 # define DBG_RE_MATCH(...) do { if (th_dbg_re_flags) fprintf(stdout, __VA_ARGS__); } while (0) |
14 # define DBG_RE_FREE(...) do { if (th_dbg_re_flags & TH_DBG_RE_FREE) fprintf(stdout, __VA_ARGS__); } while (0) | 14 BOOL th_dbg_re_flags = FALSE; |
15 # define DBG_RE_MATCH(...) do { if (th_dbg_re_flags & TH_DBG_RE_MATCH) fprintf(stdout, __VA_ARGS__); } while (0) | 15 #else |
16 | 16 # define DBG_RE_MATCH(...) |
17 int th_dbg_re_flags = 0; | 17 #endif |
18 | |
19 | |
20 #ifdef TH_EXPERIMENTAL_REGEX | |
21 | |
22 enum | |
23 { | |
24 TH_RE_MATCH_ONCE, | |
25 TH_RE_MATCH_COUNT, | |
26 TH_RE_MATCH_ANCHOR_START, | |
27 TH_RE_MATCH_ANCHOR_END, | |
28 }; | |
29 | |
30 | |
31 enum | |
32 { | |
33 TH_RE_TYPE_CHAR, | |
34 TH_RE_TYPE_STR, | |
35 TH_RE_TYPE_ANY_CHAR, | |
36 TH_RE_TYPE_LIST, | |
37 TH_RE_TYPE_LIST_REVERSE, | |
38 TH_RE_TYPE_SUBEXPR, | |
39 }; | |
40 | |
18 | 41 |
19 static const char *re_match_modes[] = | 42 static const char *re_match_modes[] = |
20 { | 43 { |
21 "ONCE", | 44 "ONCE", |
22 "COUNT", | 45 "COUNT", |
23 "ANCHOR START", | 46 "ANCHOR START", |
24 "ANCHOR END", | 47 "ANCHOR END", |
25 }; | 48 }; |
26 | 49 |
50 | |
27 static const char *re_match_types[] = | 51 static const char *re_match_types[] = |
28 { | 52 { |
29 "CHAR", | 53 "CHAR", |
54 "STR", | |
30 "ANY", | 55 "ANY", |
31 "LIST", | 56 "LIST", |
32 "LIST REVERSE", | 57 "LIST REVERSE", |
33 "SUBEXPR", | 58 "SUBEXPR", |
34 }; | 59 }; |
35 | 60 |
36 #else | |
37 # define DBG_RE_COMPILE(...) | |
38 # define DBG_RE_FREE(...) | |
39 # define DBG_RE_MATCH(...) | |
40 #endif | |
41 | |
42 | |
43 #ifdef TH_EXPERIMENTAL_REGEX | |
44 | 61 |
45 typedef struct | 62 typedef struct |
46 { | 63 { |
47 int type; | 64 int type; |
48 th_regex_char_t start, end; | 65 th_regex_char_t start, end; |
64 int mode, type; | 81 int mode, type; |
65 ssize_t repeatMin, repeatMax; | 82 ssize_t repeatMin, repeatMax; |
66 | 83 |
67 struct { | 84 struct { |
68 th_regex_char_t chr; | 85 th_regex_char_t chr; |
86 th_regex_char_t *str; | |
69 th_regex_list_t list; | 87 th_regex_list_t list; |
70 struct th_regex_t *expr; | 88 |
89 th_regex_t *expr; | |
71 } match; | 90 } match; |
72 } th_regex_node_t; | 91 } th_regex_node_t; |
73 | 92 |
74 | 93 |
75 struct th_regex_t | 94 struct th_regex_t |
77 size_t nnodes, nodessize; | 96 size_t nnodes, nodessize; |
78 th_regex_node_t *nodes; | 97 th_regex_node_t *nodes; |
79 }; | 98 }; |
80 | 99 |
81 | 100 |
82 enum | |
83 { | |
84 TH_RE_MATCH_ONCE, | |
85 TH_RE_MATCH_COUNT, | |
86 | |
87 TH_RE_MATCH_ANCHOR_START, | |
88 TH_RE_MATCH_ANCHOR_END, | |
89 }; | |
90 | |
91 | |
92 enum | |
93 { | |
94 TH_RE_TYPE_CHAR, | |
95 TH_RE_TYPE_ANY_CHAR, | |
96 TH_RE_TYPE_LIST, | |
97 TH_RE_TYPE_LIST_REVERSE, | |
98 TH_RE_TYPE_SUBEXPR, | |
99 }; | |
100 | |
101 | |
102 typedef struct | 101 typedef struct |
103 { | 102 { |
104 const th_regex_char_t *pattern; | 103 const th_regex_char_t *pattern; |
105 size_t offs; | 104 size_t offs; |
106 | 105 |
107 th_regex_t *data; | 106 th_regex_t *data; |
108 | 107 |
109 size_t nstack, stacksize; | 108 size_t nstack, stacksize; |
110 th_regex_t **stack; | 109 th_regex_t **stack; |
110 | |
111 th_regex_char_t *buf; | |
112 size_t bufSize, bufPos; | |
113 | |
111 } th_regex_parse_ctx_t; | 114 } th_regex_parse_ctx_t; |
112 | 115 |
113 | 116 |
114 static void th_regex_node_init(th_regex_node_t *node) | 117 static void th_regex_node_init(th_regex_node_t *node) |
115 { | 118 { |
202 return THERR_MALLOC; | 205 return THERR_MALLOC; |
203 } | 206 } |
204 | 207 |
205 memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node_t)); | 208 memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node_t)); |
206 data->nnodes++; | 209 data->nnodes++; |
207 | |
208 DBG_RE_COMPILE( | |
209 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " | |
210 "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", | |
211 data->nnodes, data->nodessize, | |
212 node->mode, node->type, | |
213 node->repeatMin, node->repeatMax); | |
214 | 210 |
215 return THERR_OK; | 211 return THERR_OK; |
216 } | 212 } |
217 | 213 |
218 | 214 |
325 th_regex_list_item_init(&item); | 321 th_regex_list_item_init(&item); |
326 item.type = 1; | 322 item.type = 1; |
327 item.start = *prev; | 323 item.start = *prev; |
328 item.end = *next; | 324 item.end = *next; |
329 | 325 |
330 if (item.start <= item.end) | 326 if (item.start >= item.end) |
331 { | 327 { |
332 res = THERR_INVALID_DATA; | 328 res = THERR_INVALID_DATA; |
333 goto out; | 329 goto out; |
334 } | 330 } |
335 | 331 |
388 th_free(tmp); | 384 th_free(tmp); |
389 return res; | 385 return res; |
390 } | 386 } |
391 | 387 |
392 | 388 |
389 static int th_regex_parse_ctx_node_commit_strchr_do(th_regex_parse_ctx_t *ctx, | |
390 const th_regex_char_t *buf, const size_t bufLen) | |
391 { | |
392 th_regex_node_t node; | |
393 th_regex_node_init(&node); | |
394 | |
395 if (bufLen > 1) | |
396 { | |
397 int res; | |
398 node.type = TH_RE_TYPE_STR; | |
399 if ((res = th_regex_strndup(&node.match.str, buf, bufLen)) != THERR_OK) | |
400 return res; | |
401 } | |
402 else | |
403 { | |
404 node.type = TH_RE_TYPE_CHAR; | |
405 node.match.chr = buf[0]; | |
406 } | |
407 | |
408 return th_regex_parse_ctx_node_commit(ctx, &node); | |
409 } | |
410 | |
411 | |
412 static int th_regex_parse_ctx_node_commit_strchr(th_regex_parse_ctx_t *ctx, const BOOL split) | |
413 { | |
414 int res = THERR_OK;; | |
415 | |
416 if (ctx->bufPos > 0) | |
417 { | |
418 if (ctx->bufPos > 1 && split) | |
419 { | |
420 if ((res = th_regex_parse_ctx_node_commit_strchr_do(ctx, ctx->buf, ctx->bufPos - 1)) != THERR_OK) | |
421 return res; | |
422 | |
423 res = th_regex_parse_ctx_node_commit_strchr_do(ctx, ctx->buf + ctx->bufPos - 1, 1); | |
424 } | |
425 else | |
426 { | |
427 res = th_regex_parse_ctx_node_commit_strchr_do(ctx, ctx->buf, ctx->bufPos); | |
428 } | |
429 ctx->bufPos = 0; | |
430 } | |
431 | |
432 return res; | |
433 } | |
434 | |
435 | |
393 int th_regex_compile(th_regex_t **pexpr, const th_regex_char_t *pattern) | 436 int th_regex_compile(th_regex_t **pexpr, const th_regex_char_t *pattern) |
394 { | 437 { |
395 int res = THERR_OK; | 438 int res = THERR_OK; |
396 th_regex_parse_ctx_t ctx; | 439 th_regex_parse_ctx_t ctx; |
397 th_regex_node_t node, *pnode; | 440 th_regex_node_t node, *pnode; |
398 th_regex_char_t *tmp = NULL; | 441 th_regex_char_t *tmp = NULL; |
399 size_t start; | 442 size_t start; |
400 | 443 |
444 // Check pointers | |
401 if (pexpr == NULL || pattern == NULL) | 445 if (pexpr == NULL || pattern == NULL) |
402 { | 446 { |
403 res = THERR_NULLPTR; | 447 res = THERR_NULLPTR; |
404 goto exit; | 448 goto out; |
405 } | 449 } |
406 | 450 |
451 // Initialize parsing context | |
407 memset(&ctx, 0, sizeof(ctx)); | 452 memset(&ctx, 0, sizeof(ctx)); |
408 ctx.pattern = pattern; | 453 ctx.pattern = pattern; |
409 | 454 ctx.bufSize = 256; |
455 | |
456 if ((ctx.buf = th_malloc(ctx.bufSize * sizeof(th_regex_char_t))) == NULL) | |
457 { | |
458 res = THERR_MALLOC; | |
459 goto out; | |
460 } | |
461 | |
462 // Start parsing the pattern | |
410 for (; ctx.pattern[ctx.offs] != 0; ctx.offs++) | 463 for (; ctx.pattern[ctx.offs] != 0; ctx.offs++) |
411 { | 464 { |
412 th_regex_char_t cch = ctx.pattern[ctx.offs]; | 465 th_regex_char_t cch = ctx.pattern[ctx.offs]; |
413 DBG_RE_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch); | 466 |
414 switch (cch) | 467 switch (cch) |
415 { | 468 { |
416 case '?': | 469 case '?': |
417 case '*': | 470 case '*': |
418 case '+': | 471 case '+': |
472 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, TRUE)) != THERR_OK) | |
473 goto out; | |
474 | |
419 if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) | 475 if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) |
420 goto exit; | 476 goto out; |
421 | 477 |
422 if (cch == '?') | 478 if (cch == '?') |
423 { | 479 { |
424 // Previous token is optional (repeat 0-1 times) (non-greedy matching) | 480 // Previous token is optional (repeat 0-1 times) (non-greedy matching) |
425 pnode->mode = TH_RE_MATCH_COUNT; | 481 pnode->mode = TH_RE_MATCH_COUNT; |
430 { | 486 { |
431 // Check if previous was a count ("**", "*+", etc.) | 487 // Check if previous was a count ("**", "*+", etc.) |
432 if (pnode->mode == TH_RE_MATCH_COUNT) | 488 if (pnode->mode == TH_RE_MATCH_COUNT) |
433 { | 489 { |
434 res = THERR_INVALID_DATA; | 490 res = THERR_INVALID_DATA; |
435 goto exit; | 491 goto out; |
436 } | 492 } |
437 | 493 |
438 pnode->mode = TH_RE_MATCH_COUNT; | 494 pnode->mode = TH_RE_MATCH_COUNT; |
439 | 495 |
440 if (cch == '*') | 496 if (cch == '*') |
451 } | 507 } |
452 } | 508 } |
453 break; | 509 break; |
454 | 510 |
455 case '{': | 511 case '{': |
512 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, TRUE)) != THERR_OK) | |
513 goto out; | |
514 | |
456 // {n} | {min,max} | 515 // {n} | {min,max} |
457 start = ctx.offs + 1; | 516 start = ctx.offs + 1; |
458 if (!th_regex_find_next(ctx.pattern, start, &ctx.offs, '}')) | 517 if (!th_regex_find_next(ctx.pattern, start, &ctx.offs, '}')) |
459 { | 518 { |
460 // End not found | 519 // End not found |
461 res = THERR_INVALID_DATA; | 520 res = THERR_INVALID_DATA; |
462 goto exit; | 521 goto out; |
463 } | 522 } |
464 | 523 |
465 th_free(tmp); | 524 th_free(tmp); |
466 | 525 |
467 if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK || | 526 if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK || |
468 (res = th_regex_strndup(&tmp, ctx.pattern + start, | 527 (res = th_regex_strndup(&tmp, ctx.pattern + start, |
469 ctx.offs - start)) != THERR_OK) | 528 ctx.offs - start)) != THERR_OK) |
470 goto exit; | 529 goto out; |
471 | 530 |
472 pnode->mode = TH_RE_MATCH_COUNT; | 531 pnode->mode = TH_RE_MATCH_COUNT; |
473 | 532 |
474 if (th_regex_find_next(tmp, 0, &start, ',')) | 533 if (th_regex_find_next(tmp, 0, &start, ',')) |
475 { | 534 { |
476 tmp[start] = 0; | 535 tmp[start] = 0; |
477 if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin) || | 536 if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin) || |
478 !th_regex_parse_ssize_t(tmp + start + 1, &pnode->repeatMax)) | 537 !th_regex_parse_ssize_t(tmp + start + 1, &pnode->repeatMax)) |
479 { | 538 { |
480 res = THERR_INVALID_DATA; | 539 res = THERR_INVALID_DATA; |
481 goto exit; | 540 goto out; |
482 } | 541 } |
483 } | 542 } |
484 else | 543 else |
485 { | 544 { |
486 if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin)) | 545 if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin)) |
487 { | 546 { |
488 res = THERR_INVALID_DATA; | 547 res = THERR_INVALID_DATA; |
489 goto exit; | 548 goto out; |
490 } | 549 } |
491 pnode->repeatMax = pnode->repeatMin; | 550 pnode->repeatMax = pnode->repeatMin; |
492 } | 551 } |
493 | 552 |
494 if (pnode->repeatMin < 0 || pnode->repeatMax < 1 || | 553 if (pnode->repeatMin < 0 || pnode->repeatMax < 1 || |
495 pnode->repeatMax < pnode->repeatMin) | 554 pnode->repeatMax < pnode->repeatMin) |
496 { | 555 { |
497 // Invalid repeat counts | 556 // Invalid repeat counts |
498 res = THERR_INVALID_DATA; | 557 res = THERR_INVALID_DATA; |
499 goto exit; | 558 goto out; |
500 } | 559 } |
501 break; | 560 break; |
502 | 561 |
503 case '(': | 562 case '(': |
563 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK) | |
564 goto out; | |
565 | |
504 // Start of subpattern | 566 // Start of subpattern |
505 if ((res = th_regex_parse_ctx_push(&ctx)) != THERR_OK) | 567 if ((res = th_regex_parse_ctx_push(&ctx)) != THERR_OK) |
506 goto exit; | 568 goto out; |
507 break; | 569 break; |
508 | 570 |
509 case ')': | 571 case ')': |
572 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK) | |
573 goto out; | |
574 | |
510 // End of subpattern | 575 // End of subpattern |
511 th_regex_node_init(&node); | 576 th_regex_node_init(&node); |
512 node.type = TH_RE_TYPE_SUBEXPR; | 577 node.type = TH_RE_TYPE_SUBEXPR; |
513 | 578 |
514 if ((res = th_regex_parse_ctx_pop(&ctx, &node.match.expr)) != THERR_OK || | 579 if ((res = th_regex_parse_ctx_pop(&ctx, &node.match.expr)) != THERR_OK || |
515 (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) | 580 (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) |
516 goto exit; | 581 goto out; |
517 break; | 582 break; |
518 | 583 |
519 case '^': | 584 case '^': |
585 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK) | |
586 goto out; | |
587 | |
520 // Start of line anchor | 588 // Start of line anchor |
521 th_regex_node_init(&node); | 589 th_regex_node_init(&node); |
522 node.mode = TH_RE_MATCH_ANCHOR_START; | 590 node.mode = TH_RE_MATCH_ANCHOR_START; |
523 | 591 |
524 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) | 592 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) |
525 goto exit; | 593 goto out; |
526 break; | 594 break; |
527 | 595 |
528 case '$': | 596 case '$': |
597 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK) | |
598 goto out; | |
599 | |
529 // End of line anchor | 600 // End of line anchor |
530 th_regex_node_init(&node); | 601 th_regex_node_init(&node); |
531 node.mode = TH_RE_MATCH_ANCHOR_END; | 602 node.mode = TH_RE_MATCH_ANCHOR_END; |
532 | 603 |
533 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) | 604 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) |
534 goto exit; | 605 goto out; |
535 break; | 606 break; |
536 | 607 |
537 case '[': | 608 case '[': |
609 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK) | |
610 goto out; | |
611 | |
538 // Start of char list | 612 // Start of char list |
539 start = ctx.offs + 1; | 613 start = ctx.offs + 1; |
540 if (!th_regex_find_next(ctx.pattern, start, &ctx.offs, ']') || | 614 if (!th_regex_find_next(ctx.pattern, start, &ctx.offs, ']') || |
541 ctx.offs == start) | 615 ctx.offs == start) |
542 { | 616 { |
543 res = THERR_INVALID_DATA; | 617 res = THERR_INVALID_DATA; |
544 goto exit; | 618 goto out; |
545 } | 619 } |
546 | 620 |
547 th_regex_node_init(&node); | 621 th_regex_node_init(&node); |
548 if (ctx.pattern[start] == '^') | 622 if (ctx.pattern[start] == '^') |
549 { | 623 { |
552 } | 626 } |
553 else | 627 else |
554 node.type = TH_RE_TYPE_LIST; | 628 node.type = TH_RE_TYPE_LIST; |
555 | 629 |
556 if ((res = th_regex_parse_list(ctx.pattern + start, | 630 if ((res = th_regex_parse_list(ctx.pattern + start, |
557 ctx.offs - start, &node.match.list) != THERR_OK) || | 631 ctx.offs - start, &node.match.list)) != THERR_OK || |
558 (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) | 632 (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) |
559 goto exit; | 633 goto out; |
560 break; | 634 break; |
561 | 635 |
562 case '.': | 636 case '.': |
637 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK) | |
638 goto out; | |
639 | |
563 // Any single character matches | 640 // Any single character matches |
564 th_regex_node_init(&node); | 641 th_regex_node_init(&node); |
565 node.type = TH_RE_TYPE_ANY_CHAR; | 642 node.type = TH_RE_TYPE_ANY_CHAR; |
566 | 643 |
567 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) | 644 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) |
568 goto exit; | 645 goto out; |
569 break; | 646 break; |
570 | 647 |
571 case '\\': | 648 case '\\': |
572 // Literal escape | 649 // Literal escape |
573 ctx.offs++; | 650 ctx.offs++; |
574 if (ctx.pattern[ctx.offs] == 0) | 651 if (ctx.pattern[ctx.offs] == 0) |
575 { | 652 { |
576 // End of pattern, error | 653 // End of pattern, error |
577 res = THERR_INVALID_DATA; | 654 res = THERR_INVALID_DATA; |
578 goto exit; | 655 goto out; |
579 } | 656 } |
580 // fall-through | 657 // fall-through |
581 | 658 |
582 default: | 659 default: |
583 // Given character must match | 660 // Given character must match |
584 th_regex_node_init(&node); | 661 if (ctx.bufPos < ctx.bufSize) |
585 node.type = TH_RE_TYPE_CHAR; | 662 ctx.buf[ctx.bufPos++] = ctx.pattern[ctx.offs]; |
586 node.match.chr = ctx.pattern[ctx.offs]; | 663 else |
587 | 664 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK) |
588 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) | 665 goto out; |
589 goto exit; | 666 break; |
590 break; | 667 } |
591 } | 668 } |
592 } | 669 |
593 | 670 // Commit last string/char if any |
594 exit: | 671 if ((res = th_regex_parse_ctx_node_commit_strchr(&ctx, FALSE)) != THERR_OK) |
672 goto out; | |
673 | |
674 // Create root node | |
675 th_regex_node_init(&node); | |
676 node.type = TH_RE_TYPE_SUBEXPR; | |
677 node.match.expr = ctx.data; | |
678 ctx.data = NULL; | |
679 | |
680 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) | |
681 goto out; | |
682 | |
683 out: | |
595 *pexpr = ctx.data; | 684 *pexpr = ctx.data; |
685 | |
686 // Free temporary buffers | |
596 th_free(tmp); | 687 th_free(tmp); |
688 th_free(ctx.buf); | |
597 return res; | 689 return res; |
598 } | 690 } |
599 | 691 |
600 | 692 |
601 void th_regex_free(th_regex_t *expr) | 693 void th_regex_free(th_regex_t *expr) |
603 if (expr != NULL) | 695 if (expr != NULL) |
604 { | 696 { |
605 for (size_t nnode = 0; nnode < expr->nnodes; nnode++) | 697 for (size_t nnode = 0; nnode < expr->nnodes; nnode++) |
606 { | 698 { |
607 th_regex_node_t *node = &expr->nodes[nnode]; | 699 th_regex_node_t *node = &expr->nodes[nnode]; |
608 | |
609 DBG_RE_FREE( | |
610 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " | |
611 "mode=%s, type=%s, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", | |
612 nnode, expr->nnodes, | |
613 re_match_modes[node->mode], | |
614 re_match_types[node->type], | |
615 node->repeatMin, node->repeatMax); | |
616 | 700 |
617 th_regex_free(node->match.expr); | 701 th_regex_free(node->match.expr); |
618 th_regex_list_free(&node->match.list); | 702 th_regex_list_free(&node->match.list); |
619 } | 703 } |
620 | 704 |
621 th_free(expr->nodes); | 705 th_free(expr->nodes); |
706 } | |
707 } | |
708 | |
709 | |
710 void th_regex_dump(FILE *fh, const int level, const th_regex_t *expr) | |
711 { | |
712 if (expr != NULL) | |
713 { | |
714 for (size_t nnode = 0; nnode < expr->nnodes; nnode++) | |
715 { | |
716 th_regex_node_t *node = &expr->nodes[nnode]; | |
717 | |
718 for (int indent = 0; indent < level; indent++) | |
719 fprintf(fh, " "); | |
720 | |
721 fprintf(fh, "[%" PRIu_SIZE_T " / %" PRIu_SIZE_T "] %s %s ", | |
722 nnode + 1, expr->nnodes, | |
723 re_match_modes[node->mode], | |
724 re_match_types[node->type]); | |
725 | |
726 if (node->mode == TH_RE_MATCH_COUNT) | |
727 { | |
728 fprintf(fh, "min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T " : ", | |
729 node->repeatMin, node->repeatMax); | |
730 } | |
731 | |
732 switch (node->type) | |
733 { | |
734 case TH_RE_TYPE_CHAR: | |
735 fprintf(fh, "'%c'\n", node->match.chr); | |
736 break; | |
737 | |
738 case TH_RE_TYPE_STR: | |
739 fprintf(fh, "\"%s\"\n", node->match.str); | |
740 break; | |
741 | |
742 case TH_RE_TYPE_ANY_CHAR: | |
743 fprintf(fh, ".\n"); | |
744 break; | |
745 | |
746 case TH_RE_TYPE_LIST: | |
747 case TH_RE_TYPE_LIST_REVERSE: | |
748 fprintf(fh, "[ "); | |
749 for (size_t n = 0; n < node->match.list.nitems; n++) | |
750 { | |
751 const th_regex_list_item_t *li = &node->match.list.items[n]; | |
752 if (li->type) | |
753 { | |
754 fprintf(fh, "'%c-%c' ", li->start, li->end); | |
755 } | |
756 else | |
757 { | |
758 for (size_t i = 0; i < li->nchars; i++) | |
759 fprintf(fh, "'%c' ", li->chars[i]); | |
760 } | |
761 } | |
762 fprintf(fh, "]\n"); | |
763 break; | |
764 | |
765 case TH_RE_TYPE_SUBEXPR: | |
766 fprintf(fh, "\n"); | |
767 th_regex_dump(fh, level + 1, node->match.expr); | |
768 break; | |
769 } | |
770 } | |
622 } | 771 } |
623 } | 772 } |
624 | 773 |
625 | 774 |
626 static BOOL th_regex_match_list(const th_regex_list_t *list, const th_regex_char_t cch) | 775 static BOOL th_regex_match_list(const th_regex_list_t *list, const th_regex_char_t cch) |
667 BOOL res = FALSE; | 816 BOOL res = FALSE; |
668 | 817 |
669 switch (node->type) | 818 switch (node->type) |
670 { | 819 { |
671 case TH_RE_TYPE_SUBEXPR: | 820 case TH_RE_TYPE_SUBEXPR: |
672 // push current context in stack | |
673 // new context for subexpr | |
674 res = th_regex_match_expr(haystack, poffs, node->match.expr, 0, flags); | 821 res = th_regex_match_expr(haystack, poffs, node->match.expr, 0, flags); |
675 break; | 822 break; |
676 | 823 |
677 case TH_RE_TYPE_LIST: | 824 case TH_RE_TYPE_LIST: |
678 case TH_RE_TYPE_LIST_REVERSE: | 825 case TH_RE_TYPE_LIST_REVERSE: |
706 { | 853 { |
707 res = (cch == node->match.chr); | 854 res = (cch == node->match.chr); |
708 (*poffs)++; | 855 (*poffs)++; |
709 } | 856 } |
710 break; | 857 break; |
858 | |
859 case TH_RE_TYPE_STR: | |
860 res = TRUE; | |
861 for (th_regex_char_t *str = node->match.str; *str != 0; ) | |
862 { | |
863 if (haystack[*poffs] != *str) | |
864 { | |
865 res = FALSE; | |
866 break; | |
867 } | |
868 str++; | |
869 (*poffs)++; | |
870 } | |
871 break; | |
711 } | 872 } |
712 | 873 |
713 return res; | 874 return res; |
714 } | 875 } |
715 | 876 |
742 | 903 |
743 case TH_RE_MATCH_COUNT: | 904 case TH_RE_MATCH_COUNT: |
744 { | 905 { |
745 ssize_t count = 0; | 906 ssize_t count = 0; |
746 | 907 |
747 DBG_RE_MATCH(" COUNT min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", | 908 DBG_RE_MATCH(" min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", |
748 node->repeatMin, node->repeatMax); | 909 node->repeatMin, node->repeatMax); |
749 | 910 |
750 do | 911 do |
751 { | 912 { |
752 BOOL match; | 913 BOOL match; |
753 size_t toffs = soffs, tnode; | 914 size_t toffs = soffs, tnode; |
754 | 915 |
755 DBG_RE_MATCH(" START '%s'\n", haystack + toffs); | 916 DBG_RE_MATCH(" ROUND #%" PRIu_SIZE_T ": '%s'\n", |
756 do { | 917 count, haystack + toffs); |
918 | |
919 do | |
920 { | |
757 match = TRUE; | 921 match = TRUE; |
922 size_t noffs = toffs; | |
758 for (tnode = nnode; match && tnode < expr->nnodes && haystack[toffs] != 0; ) | 923 for (tnode = nnode; match && tnode < expr->nnodes && haystack[toffs] != 0; ) |
759 { | 924 { |
760 match = th_regex_match_one(haystack, &toffs, &expr->nodes[tnode], flags); | 925 match = th_regex_match_one(haystack, &toffs, &expr->nodes[tnode], flags); |
761 if (match) | 926 if (match) |
762 tnode++; | 927 tnode++; |
763 } | 928 } |
764 DBG_RE_MATCH(" '%s': %d (tnode=%" PRIu_SIZE_T ")\n", haystack + toffs, match, tnode); | 929 |
930 DBG_RE_MATCH(" '%s': %d (tnode=%" PRIu_SIZE_T ")\n", | |
931 haystack + noffs, match, tnode); | |
932 | |
765 if (node->repeatMin >= 0 && match) | 933 if (node->repeatMin >= 0 && match) |
766 break; | 934 break; |
767 } while (!match && haystack[toffs] != 0); | 935 } while (!match && haystack[toffs] != 0); |
768 | 936 |
769 DBG_RE_MATCH(" END '%s': %d (tnode=%" PRIu_SIZE_T ")\n", haystack + toffs, match, tnode); | 937 DBG_RE_MATCH(" ROUND #%" PRIu_SIZE_T " END: match=%s, tnode=%" PRIu_SIZE_T "\n", |
938 count, match ? "YES" : "NO", tnode); | |
770 | 939 |
771 if (match) | 940 if (match) |
772 { | 941 { |
773 // Node matched | 942 // Node matched |
774 count++; | 943 count++; |