Mercurial > hg > th-libs
comparison th_regex.c @ 613:2e3b81ae8c8a
More work on regexes.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Thu, 16 Jan 2020 16:01:27 +0200 |
parents | cc9ec51b4875 |
children | afcaf5e38f56 |
comparison
equal
deleted
inserted
replaced
612:cc9ec51b4875 | 613:2e3b81ae8c8a |
---|---|
11 //#include "th_datastruct.h" | 11 //#include "th_datastruct.h" |
12 | 12 |
13 | 13 |
14 //#define DBG_RE_COMPILE 1 | 14 //#define DBG_RE_COMPILE 1 |
15 //#define DBG_RE_FREE 1 | 15 //#define DBG_RE_FREE 1 |
16 //#define DBG_RE_MATCH 1 | 16 #define DBG_RE_MATCH 1 |
17 | 17 |
18 #if defined(DBG_RE_COMPILE) | 18 #if defined(DBG_RE_COMPILE) |
19 # define DBG_RE_PRINT_COMPILE(...) do { fprintf(stderr, __VA_ARGS__); } while (0) | 19 # define DBG_RE_PRINT_COMPILE(...) do { fprintf(stderr, __VA_ARGS__); } while (0) |
20 #else | 20 #else |
21 # define DBG_RE_PRINT_COMPILE(...) do { } while (0) | 21 # define DBG_RE_PRINT_COMPILE(...) do { } while (0) |
33 | 33 |
34 | 34 |
35 enum | 35 enum |
36 { | 36 { |
37 TH_RE_MATCH_ONCE, | 37 TH_RE_MATCH_ONCE, |
38 TH_RE_MATCH_COUNT, | 38 TH_RE_MATCH_COUNT_GREEDY, |
39 TH_RE_MATCH_COUNT_NONGREEDY, | |
39 TH_RE_MATCH_ANCHOR_START, | 40 TH_RE_MATCH_ANCHOR_START, |
40 TH_RE_MATCH_ANCHOR_END, | 41 TH_RE_MATCH_ANCHOR_END, |
41 }; | 42 }; |
42 | 43 |
43 | 44 |
53 | 54 |
54 #if defined(DBG_RE_COMPILE) || defined(DBG_RE_FREE) || defined(DBG_RE_MATCH) | 55 #if defined(DBG_RE_COMPILE) || defined(DBG_RE_FREE) || defined(DBG_RE_MATCH) |
55 static const char *re_match_modes[] = | 56 static const char *re_match_modes[] = |
56 { | 57 { |
57 "ONCE", | 58 "ONCE", |
58 "COUNT", | 59 "COUNT GREEDY", |
60 "COUNT NONGREEDY", | |
59 "ANCHOR_START", | 61 "ANCHOR_START", |
60 "ANCHOR_END", | 62 "ANCHOR_END", |
61 }; | 63 }; |
62 | 64 |
63 static const char *re_match_types[] = | 65 static const char *re_match_types[] = |
251 memset(&ctx, 0, sizeof(ctx)); | 253 memset(&ctx, 0, sizeof(ctx)); |
252 ctx.pattern = pattern; | 254 ctx.pattern = pattern; |
253 | 255 |
254 for (; ctx.pattern[ctx.offs] != 0; ctx.offs++) | 256 for (; ctx.pattern[ctx.offs] != 0; ctx.offs++) |
255 { | 257 { |
256 DBG_RE_PRINT_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, ctx.pattern[ctx.offs]); | 258 th_regex_char cch = ctx.pattern[ctx.offs]; |
257 switch (ctx.pattern[ctx.offs]) | 259 DBG_RE_PRINT_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch); |
260 switch (cch) | |
258 { | 261 { |
259 case '?': | 262 case '?': |
260 // Previous token is optional (repeat 0-1 times) | 263 case '*': |
264 case '+': | |
261 if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) | 265 if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) |
262 goto exit; | 266 goto exit; |
263 | 267 |
264 pnode->mode = TH_RE_MATCH_COUNT; | 268 if (cch == '?') |
265 pnode->repeatMin = 0; | 269 { |
266 pnode->repeatMax = 1; | 270 // Check if previous was a count |
267 break; | 271 pnode->mode = (pnode->mode == TH_RE_MATCH_COUNT_GREEDY) ? |
268 | 272 TH_RE_MATCH_COUNT_NONGREEDY : TH_RE_MATCH_COUNT_GREEDY; |
269 case '*': | 273 |
270 // Previous token can repeat 0 or more times | 274 // Previous token is optional (repeat 0-1 times) |
271 if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) | 275 pnode->repeatMin = 0; |
272 goto exit; | 276 pnode->repeatMax = 1; |
273 | 277 } |
274 pnode->mode = TH_RE_MATCH_COUNT; | 278 else |
275 pnode->repeatMin = 0; | 279 { |
276 pnode->repeatMax = -1; | 280 // Check if previous was a count |
277 break; | 281 if (pnode->mode == TH_RE_MATCH_COUNT_GREEDY || |
278 | 282 pnode->mode == TH_RE_MATCH_COUNT_NONGREEDY) |
279 case '+': | 283 { |
280 // Previous token must repeat 1 or more times | 284 res = THERR_INVALID_DATA; |
281 if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) | 285 goto exit; |
282 goto exit; | 286 } |
283 | 287 |
284 pnode->mode = TH_RE_MATCH_COUNT; | 288 pnode->mode = TH_RE_MATCH_COUNT_GREEDY; |
285 pnode->repeatMin = 1; | 289 |
286 pnode->repeatMax = 1; | 290 if (cch == '*') |
291 { | |
292 // Previous token can repeat 0 or more times | |
293 pnode->repeatMin = 0; | |
294 pnode->repeatMax = -1; | |
295 } | |
296 else | |
297 { | |
298 // Previous token must repeat 1 or more times | |
299 pnode->repeatMin = 1; | |
300 pnode->repeatMax = -1; | |
301 } | |
302 } | |
287 break; | 303 break; |
288 | 304 |
289 case '{': | 305 case '{': |
290 // {n} | {min,max} | 306 // {n} | {min,max} |
291 start = ctx.offs + 1; | 307 start = ctx.offs + 1; |
301 if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK || | 317 if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK || |
302 (res = th_regex_strndup(&tmp, ctx.pattern + start, | 318 (res = th_regex_strndup(&tmp, ctx.pattern + start, |
303 ctx.offs - start)) != THERR_OK) | 319 ctx.offs - start)) != THERR_OK) |
304 goto exit; | 320 goto exit; |
305 | 321 |
306 pnode->mode = TH_RE_MATCH_COUNT; | 322 pnode->mode = TH_RE_MATCH_COUNT_GREEDY; |
307 | 323 |
308 if (th_regex_find_next(tmp, 0, &start, ',')) | 324 if (th_regex_find_next(tmp, 0, &start, ',')) |
309 { | 325 { |
310 tmp[start] = 0; | 326 tmp[start] = 0; |
311 if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin) || | 327 if (!th_regex_parse_ssize_t(tmp, &pnode->repeatMin) || |
491 size_t *offs, const th_regex_node *node, const int flags) | 507 size_t *offs, const th_regex_node *node, const int flags) |
492 { | 508 { |
493 th_regex_char cch; | 509 th_regex_char cch; |
494 BOOL ret = FALSE; | 510 BOOL ret = FALSE; |
495 | 511 |
496 DBG_RE_PRINT_MATCH("node_START [%s]: '%s': ", re_match_types[node->type], haystack + *offs); | 512 DBG_RE_PRINT_MATCH(" node_START [%s]: '%s': ", re_match_types[node->type], haystack + *offs); |
497 | 513 |
498 switch (node->type) | 514 switch (node->type) |
499 { | 515 { |
500 case TH_RE_TYPE_SUBEXPR: | 516 case TH_RE_TYPE_SUBEXPR: |
501 DBG_RE_PRINT_MATCH("subexpr ..\n"); | 517 DBG_RE_PRINT_MATCH("subexpr ..\n"); |
548 (*offs)++; | 564 (*offs)++; |
549 break; | 565 break; |
550 } | 566 } |
551 | 567 |
552 out: | 568 out: |
553 DBG_RE_PRINT_MATCH("node_DONE [%s]: match %s\n", re_match_types[node->type], ret ? "YES" : "NO"); | 569 DBG_RE_PRINT_MATCH(" node_DONE [%s]: match %s\n", re_match_types[node->type], ret ? "YES" : "NO"); |
554 return ret; | 570 return ret; |
555 } | 571 } |
556 | 572 |
557 | 573 |
558 static BOOL th_regex_do_match_expr(const th_regex_ctx *expr, | 574 static BOOL th_regex_do_match_expr(const th_regex_ctx *expr, |
561 for (size_t n = 0; n < expr->nnodes; n++) | 577 for (size_t n = 0; n < expr->nnodes; n++) |
562 { | 578 { |
563 const th_regex_node *node = &expr->nodes[n]; | 579 const th_regex_node *node = &expr->nodes[n]; |
564 size_t soffs; | 580 size_t soffs; |
565 | 581 |
566 DBG_RE_PRINT_MATCH(" expr %p [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ", | 582 DBG_RE_PRINT_MATCH(" expr [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ", |
567 (void *) expr, n, expr->nnodes, re_match_modes[node->mode]); | 583 n, expr->nnodes, re_match_modes[node->mode]); |
568 | 584 |
569 switch (node->mode) | 585 switch (node->mode) |
570 { | 586 { |
571 case TH_RE_MATCH_ONCE: | 587 case TH_RE_MATCH_ONCE: |
572 { | 588 { |
576 return FALSE; | 592 return FALSE; |
577 *offs = soffs; | 593 *offs = soffs; |
578 } | 594 } |
579 break; | 595 break; |
580 | 596 |
581 case TH_RE_MATCH_COUNT: | 597 case TH_RE_MATCH_COUNT_GREEDY: |
582 { | 598 case TH_RE_MATCH_COUNT_NONGREEDY: |
583 BOOL done = FALSE, match = FALSE; | 599 { |
600 BOOL done = FALSE; | |
584 ssize_t count = 0; | 601 ssize_t count = 0; |
585 | 602 |
586 DBG_RE_PRINT_MATCH("min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax); | 603 DBG_RE_PRINT_MATCH("min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax); |
587 | 604 |
588 do | 605 do |
589 { | 606 { |
607 BOOL match; | |
590 soffs = *offs; | 608 soffs = *offs; |
591 if ((match = th_regex_do_match_node(haystack, &soffs, node, flags))) | 609 |
610 match = th_regex_do_match_node(haystack, &soffs, node, flags); | |
611 for (size_t qn = n + 1; qn < expr->nnodes && haystack[soffs] != 0; qn++) | |
612 { | |
613 const th_regex_node *next = &expr->nodes[qn]; | |
614 do { | |
615 match = th_regex_do_match_node(haystack, &soffs, next, flags); | |
616 } while (haystack[soffs] != 0 && !match); | |
617 } | |
618 | |
619 if (match) | |
592 { | 620 { |
593 // Node matched | 621 // Node matched |
594 count++; | 622 count++; |
595 done = (node->repeatMax > 0 && count >= node->repeatMax); | 623 done = (node->repeatMax > 0 && count >= node->repeatMax); |
596 *offs = soffs; | 624 *offs = soffs; |
652 for (size_t soffs = 0; haystack[soffs] != 0; ) | 680 for (size_t soffs = 0; haystack[soffs] != 0; ) |
653 { | 681 { |
654 BOOL matched; | 682 BOOL matched; |
655 size_t coffs = soffs; | 683 size_t coffs = soffs; |
656 | 684 |
657 DBG_RE_PRINT_MATCH("\nDO_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n", | 685 DBG_RE_PRINT_MATCH("\nTRY_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n", |
658 soffs, haystack + soffs); | 686 soffs, haystack + soffs); |
659 | 687 |
660 if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags))) | 688 if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags))) |
661 { | 689 { |
662 // A match was found, increase count | 690 // A match was found, increase count |