Mercurial > hg > th-libs
comparison th_regex.c @ 640:9e1f9e1d1487
Aaand some more work. Still just a broken concept.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Fri, 24 Jan 2020 09:13:24 +0200 |
parents | 8c957ad9d4c3 |
children | 9a1ed82abefd |
comparison
equal
deleted
inserted
replaced
639:8c957ad9d4c3 | 640:9e1f9e1d1487 |
---|---|
8 #include "th_util.h" | 8 #include "th_util.h" |
9 #include "th_regex.h" | 9 #include "th_regex.h" |
10 | 10 |
11 | 11 |
12 #ifdef TH_EXPERIMENTAL_REGEX_DEBUG | 12 #ifdef TH_EXPERIMENTAL_REGEX_DEBUG |
13 # define DBG_RE_COMPILE(...) do { if (th_dbg_re_flags & TH_DBG_RE_COMPILE) fprintf(stdout, __VA_ARGS__); } while (0) | |
14 # define DBG_RE_FREE(...) do { if (th_dbg_re_flags & TH_DBG_RE_FREE) fprintf(stdout, __VA_ARGS__); } while (0) | |
15 # define DBG_RE_MATCH(...) do { if (th_dbg_re_flags & TH_DBG_RE_MATCH) fprintf(stdout, __VA_ARGS__); } while (0) | |
16 | |
13 int th_dbg_re_flags = 0; | 17 int th_dbg_re_flags = 0; |
14 # define DBG_RE_COMPILE(...) do { if (th_dbg_re_flags & TH_DBG_RE_COMPILE) fprintf(stderr, __VA_ARGS__); } while (0) | 18 |
15 # define DBG_RE_FREE(...) do { if (th_dbg_re_flags & TH_DBG_RE_FREE) fprintf(stderr, __VA_ARGS__); } while (0) | 19 static const char *re_match_modes[] = |
16 # define DBG_RE_MATCH(...) do { if (th_dbg_re_flags & TH_DBG_RE_MATCH) fprintf(stderr, __VA_ARGS__); } while (0) | 20 { |
21 "ONCE", | |
22 "COUNT GREEDY", | |
23 "COUNT NONGREEDY", | |
24 "ANCHOR START", | |
25 "ANCHOR END", | |
26 }; | |
27 | |
28 static const char *re_match_types[] = | |
29 { | |
30 "CHAR", | |
31 "ANY", | |
32 "LIST", | |
33 "LIST REVERSE", | |
34 "SUBEXPR", | |
35 }; | |
36 | |
17 #else | 37 #else |
18 # define DBG_RE_COMPILE(...) | 38 # define DBG_RE_COMPILE(...) |
19 # define DBG_RE_FREE(...) | 39 # define DBG_RE_FREE(...) |
20 # define DBG_RE_MATCH(...) | 40 # define DBG_RE_MATCH(...) |
21 #endif | 41 #endif |
22 | 42 |
23 | 43 |
24 #ifdef TH_EXPERIMENTAL_REGEX | 44 #ifdef TH_EXPERIMENTAL_REGEX |
25 | 45 |
46 typedef struct | |
47 { | |
48 int type; | |
49 th_regex_char_t start, end; | |
50 | |
51 size_t nchars; | |
52 th_regex_char_t *chars; | |
53 } th_regex_list_item_t; | |
54 | |
55 | |
56 typedef struct | |
57 { | |
58 size_t nitems, itemssize; | |
59 th_regex_list_item_t *items; | |
60 } th_regex_list_t; | |
61 | |
62 | |
63 typedef struct | |
64 { | |
65 int mode, type; | |
66 ssize_t repeatMin, repeatMax; | |
67 | |
68 struct { | |
69 th_regex_char_t chr; | |
70 th_regex_list_t list; | |
71 struct th_regex_t *expr; | |
72 } match; | |
73 } th_regex_node_t; | |
74 | |
75 | |
76 struct th_regex_t | |
77 { | |
78 size_t nnodes, nodessize; | |
79 th_regex_node_t *nodes; | |
80 }; | |
81 | |
82 | |
26 enum | 83 enum |
27 { | 84 { |
28 TH_RE_MATCH_ONCE, | 85 TH_RE_MATCH_ONCE, |
29 TH_RE_MATCH_COUNT_GREEDY, | 86 TH_RE_MATCH_COUNT_GREEDY, |
30 TH_RE_MATCH_COUNT_NONGREEDY, | 87 TH_RE_MATCH_COUNT_NONGREEDY, |
88 | |
31 TH_RE_MATCH_ANCHOR_START, | 89 TH_RE_MATCH_ANCHOR_START, |
32 TH_RE_MATCH_ANCHOR_END, | 90 TH_RE_MATCH_ANCHOR_END, |
33 }; | 91 }; |
34 | 92 |
35 | 93 |
41 TH_RE_TYPE_LIST_REVERSE, | 99 TH_RE_TYPE_LIST_REVERSE, |
42 TH_RE_TYPE_SUBEXPR, | 100 TH_RE_TYPE_SUBEXPR, |
43 }; | 101 }; |
44 | 102 |
45 | 103 |
46 #ifdef TH_EXPERIMENTAL_REGEX_DEBUG | |
47 static const char *re_match_modes[] = | |
48 { | |
49 "ONCE", | |
50 "COUNT GREEDY", | |
51 "COUNT NONGREEDY", | |
52 "ANCHOR_START", | |
53 "ANCHOR_END", | |
54 }; | |
55 | |
56 static const char *re_match_types[] = | |
57 { | |
58 "CHAR", | |
59 "ANY", | |
60 "LIST", | |
61 "LIST_REVERSE", | |
62 "SUBEXPR", | |
63 }; | |
64 #endif | |
65 | |
66 | |
67 typedef struct | 104 typedef struct |
68 { | 105 { |
69 const th_regex_char *pattern; | 106 const th_regex_char_t *pattern; |
70 size_t offs; | 107 size_t offs; |
71 | 108 |
72 th_regex_ctx *data; | 109 th_regex_t *data; |
73 | 110 |
74 size_t nstack, stacksize; | 111 size_t nstack, stacksize; |
75 th_regex_ctx **stack; | 112 th_regex_t **stack; |
76 } th_regex_parse_ctx; | 113 } th_regex_parse_ctx_t; |
77 | 114 |
78 | 115 |
79 static void th_regex_node_init(th_regex_node *node) | 116 static void th_regex_node_init(th_regex_node_t *node) |
80 { | 117 { |
81 memset(node, 0, sizeof(th_regex_node)); | 118 memset(node, 0, sizeof(th_regex_node_t)); |
82 node->mode = TH_RE_MATCH_ONCE; | 119 node->mode = TH_RE_MATCH_ONCE; |
83 } | 120 } |
84 | 121 |
85 | 122 |
86 static int th_regex_strndup(th_regex_char **pdst, | 123 static int th_regex_strndup(th_regex_char_t **pdst, |
87 const th_regex_char *src, const size_t len) | 124 const th_regex_char_t *src, const size_t len) |
88 { | 125 { |
89 if (pdst == NULL) | 126 if (pdst == NULL) |
90 return THERR_NULLPTR; | 127 return THERR_NULLPTR; |
91 | 128 |
92 if (UINTPTR_MAX / sizeof(th_regex_char) < len + 1) | 129 if (UINTPTR_MAX / sizeof(th_regex_char_t) < len + 1) |
93 return THERR_BOUNDS; | 130 return THERR_BOUNDS; |
94 | 131 |
95 if ((*pdst = (th_regex_char *) | 132 if ((*pdst = (th_regex_char_t *) |
96 th_malloc((len + 1) * sizeof(th_regex_char))) == NULL) | 133 th_malloc((len + 1) * sizeof(th_regex_char_t))) == NULL) |
97 return THERR_MALLOC; | 134 return THERR_MALLOC; |
98 | 135 |
99 memcpy(*pdst, src, len * sizeof(th_regex_char)); | 136 memcpy(*pdst, src, len * sizeof(th_regex_char_t)); |
100 (*pdst)[len] = 0; | 137 (*pdst)[len] = 0; |
101 | 138 |
102 return THERR_OK; | 139 return THERR_OK; |
103 } | 140 } |
104 | 141 |
105 | 142 |
106 static int th_regex_ctx_get_prev_node( | 143 static int th_regex_parse_ctx_get_prev_node( |
107 th_regex_parse_ctx *ctx, th_regex_node **pnode) | 144 th_regex_parse_ctx_t *ctx, th_regex_node_t **pnode) |
108 { | 145 { |
109 if (ctx->data != NULL && ctx->data->nnodes > 0) | 146 if (ctx->data != NULL && ctx->data->nnodes > 0) |
110 { | 147 { |
111 *pnode = &ctx->data->nodes[ctx->data->nnodes - 1]; | 148 *pnode = &ctx->data->nodes[ctx->data->nnodes - 1]; |
112 return THERR_OK; | 149 return THERR_OK; |
114 else | 151 else |
115 return THERR_INVALID_DATA; | 152 return THERR_INVALID_DATA; |
116 } | 153 } |
117 | 154 |
118 | 155 |
119 static int th_regex_ctx_push(th_regex_parse_ctx *ctx) | 156 static int th_regex_parse_ctx_push(th_regex_parse_ctx_t *ctx) |
120 { | 157 { |
121 if (ctx->stack == NULL || ctx->nstack + 1 >= ctx->stacksize) | 158 if (ctx->stack == NULL || ctx->nstack + 1 >= ctx->stacksize) |
122 { | 159 { |
123 ctx->stacksize += 16; | 160 ctx->stacksize += 16; |
124 | 161 |
125 if ((ctx->stack = th_realloc(ctx->stack, | 162 if ((ctx->stack = th_realloc(ctx->stack, |
126 ctx->stacksize * sizeof(th_regex_node *))) == NULL) | 163 ctx->stacksize * sizeof(th_regex_node_t *))) == NULL) |
127 return THERR_MALLOC; | 164 return THERR_MALLOC; |
128 } | 165 } |
129 | 166 |
130 ctx->stack[ctx->nstack] = ctx->data; | 167 ctx->stack[ctx->nstack] = ctx->data; |
131 ctx->nstack++; | 168 ctx->nstack++; |
133 | 170 |
134 return THERR_OK; | 171 return THERR_OK; |
135 } | 172 } |
136 | 173 |
137 | 174 |
138 static int th_regex_ctx_pop(th_regex_parse_ctx *ctx, th_regex_ctx **data) | 175 static int th_regex_parse_ctx_pop(th_regex_parse_ctx_t *ctx, th_regex_t **data) |
139 { | 176 { |
140 if (ctx->nstack > 0) | 177 if (ctx->nstack > 0) |
141 { | 178 { |
142 *data = ctx->data; | 179 *data = ctx->data; |
143 ctx->nstack--; | 180 ctx->nstack--; |
147 else | 184 else |
148 return THERR_INVALID_DATA; | 185 return THERR_INVALID_DATA; |
149 } | 186 } |
150 | 187 |
151 | 188 |
152 static int th_regex_ctx_node_commit(th_regex_parse_ctx *ctx, th_regex_node *node) | 189 static int th_regex_parse_ctx_node_commit(th_regex_parse_ctx_t *ctx, th_regex_node_t *node) |
153 { | 190 { |
154 th_regex_ctx *data = ctx->data; | 191 th_regex_t *data = ctx->data; |
155 | 192 |
156 if (data == NULL) | 193 if (data == NULL) |
157 { | 194 { |
158 if ((data = ctx->data = th_malloc0(sizeof(th_regex_ctx))) == NULL) | 195 if ((data = ctx->data = th_malloc0(sizeof(th_regex_t))) == NULL) |
159 return THERR_MALLOC; | 196 return THERR_MALLOC; |
160 } | 197 } |
161 | 198 |
162 if (data->nodes == NULL || data->nnodes + 1 >= data->nodessize) | 199 if (data->nodes == NULL || data->nnodes + 1 >= data->nodessize) |
163 { | 200 { |
164 data->nodessize += 16; | 201 data->nodessize += 16; |
165 if ((data->nodes = th_realloc(data->nodes, | 202 if ((data->nodes = th_realloc(data->nodes, |
166 data->nodessize * sizeof(th_regex_node))) == NULL) | 203 data->nodessize * sizeof(th_regex_node_t))) == NULL) |
167 return THERR_MALLOC; | 204 return THERR_MALLOC; |
168 } | 205 } |
169 | 206 |
170 memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node)); | 207 memcpy(&data->nodes[data->nnodes], node, sizeof(th_regex_node_t)); |
171 data->nnodes++; | 208 data->nnodes++; |
172 | 209 |
173 DBG_RE_COMPILE( | 210 DBG_RE_COMPILE( |
174 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " | 211 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " |
175 "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", | 212 "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", |
179 | 216 |
180 return THERR_OK; | 217 return THERR_OK; |
181 } | 218 } |
182 | 219 |
183 | 220 |
184 static BOOL th_regex_find_next(const th_regex_char *str, | 221 static BOOL th_regex_find_next(const th_regex_char_t *str, |
185 const size_t start, size_t *offs, | 222 const size_t start, size_t *offs, |
186 const th_regex_char delim) | 223 const th_regex_char_t delim) |
187 { | 224 { |
188 for (*offs = start; str[*offs] != 0; (*offs)++) | 225 for (*offs = start; str[*offs] != 0; (*offs)++) |
189 { | 226 { |
190 if (str[*offs] == delim) | 227 if (str[*offs] == delim) |
191 return TRUE; | 228 return TRUE; |
192 } | 229 } |
193 return FALSE; | 230 return FALSE; |
194 } | 231 } |
195 | 232 |
196 | 233 |
197 static BOOL th_regex_parse_ssize_t(const th_regex_char *str, | 234 static BOOL th_regex_parse_ssize_t(const th_regex_char_t *str, |
198 ssize_t *value) | 235 ssize_t *value) |
199 { | 236 { |
200 th_regex_char ch; | 237 th_regex_char_t ch; |
201 BOOL neg; | 238 BOOL neg; |
202 | 239 |
203 if (*str == '-') | 240 if (*str == '-') |
204 { | 241 { |
205 str++; | 242 str++; |
225 | 262 |
226 return TRUE; | 263 return TRUE; |
227 } | 264 } |
228 | 265 |
229 | 266 |
230 static void th_regex_list_item_init(th_regex_list_item *item) | 267 static void th_regex_list_item_init(th_regex_list_item_t *item) |
231 { | 268 { |
232 memset(item, 0, sizeof(th_regex_list_item)); | 269 memset(item, 0, sizeof(th_regex_list_item_t)); |
233 } | 270 } |
234 | 271 |
235 | 272 |
236 static int th_regex_list_add_item(th_regex_list *list, th_regex_list_item *item) | 273 static int th_regex_list_add_item(th_regex_list_t *list, th_regex_list_item_t *item) |
237 { | 274 { |
238 if (list->items == NULL || list->nitems + 1 >= list->itemssize) | 275 if (list->items == NULL || list->nitems + 1 >= list->itemssize) |
239 { | 276 { |
240 list->itemssize += 16; | 277 list->itemssize += 16; |
241 | 278 |
242 if ((list->items = th_realloc(list->items, | 279 if ((list->items = th_realloc(list->items, |
243 list->itemssize * sizeof(th_regex_list_item))) == NULL) | 280 list->itemssize * sizeof(th_regex_list_item_t))) == NULL) |
244 return THERR_MALLOC; | 281 return THERR_MALLOC; |
245 } | 282 } |
246 | 283 |
247 memcpy(list->items + list->nitems, item, sizeof(th_regex_list_item)); | 284 memcpy(&list->items[list->nitems], item, sizeof(th_regex_list_item_t)); |
248 list->nitems++; | 285 list->nitems++; |
249 | 286 |
250 return THERR_OK; | 287 return THERR_OK; |
251 } | 288 } |
252 | 289 |
253 | 290 |
254 static void th_regex_list_free(th_regex_list *list) | 291 static void th_regex_list_free(th_regex_list_t *list) |
255 { | 292 { |
256 if (list != NULL) | 293 if (list != NULL) |
257 { | 294 { |
258 for (size_t n = 0; n < list->nitems; n++) | 295 for (size_t n = 0; n < list->nitems; n++) |
259 { | 296 { |
262 th_free(list->items); | 299 th_free(list->items); |
263 } | 300 } |
264 } | 301 } |
265 | 302 |
266 | 303 |
267 static int th_regex_parse_list(const th_regex_char *str, | 304 static int th_regex_parse_list(const th_regex_char_t *str, |
268 const size_t slen, th_regex_list *list) | 305 const size_t slen, th_regex_list_t *list) |
269 { | 306 { |
270 th_regex_char *tmp = NULL; | 307 th_regex_char_t *tmp = NULL; |
271 th_regex_list_item item; | 308 th_regex_list_item_t item; |
272 int res; | 309 int res; |
273 | 310 |
274 if ((res = th_regex_strndup(&tmp, str, slen)) != THERR_OK) | 311 if ((res = th_regex_strndup(&tmp, str, slen)) != THERR_OK) |
275 goto out; | 312 goto out; |
276 | 313 |
277 // Handle ranges like [A-Z] | 314 // Handle ranges like [A-Z] |
278 for (size_t offs = 0; offs < slen; offs++) | 315 for (size_t offs = 0; offs < slen; offs++) |
279 { | 316 { |
280 th_regex_char | 317 th_regex_char_t |
281 *prev = (offs > 0) ? tmp + offs - 1 : NULL, | 318 *prev = (offs > 0) ? tmp + offs - 1 : NULL, |
282 *curr = tmp + offs, | 319 *curr = tmp + offs, |
283 *next = (offs + 1 < slen) ? tmp + offs + 1 : NULL; | 320 *next = (offs + 1 < slen) ? tmp + offs + 1 : NULL; |
284 | 321 |
285 if (*curr == '-') | 322 if (*curr == '-') |
317 item.type = 0; | 354 item.type = 0; |
318 item.nchars = 0; | 355 item.nchars = 0; |
319 | 356 |
320 for (size_t offs = 0; offs < slen; offs++) | 357 for (size_t offs = 0; offs < slen; offs++) |
321 { | 358 { |
322 th_regex_char curr = tmp[offs]; | 359 th_regex_char_t curr = tmp[offs]; |
323 if (curr != 0) | 360 if (curr != 0) |
324 item.nchars++; | 361 item.nchars++; |
325 } | 362 } |
326 | 363 |
327 if (item.nchars > 0) | 364 if (item.nchars > 0) |
328 { | 365 { |
329 if ((item.chars = th_malloc(sizeof(th_regex_char) * item.nchars)) == NULL) | 366 if ((item.chars = th_malloc(sizeof(th_regex_char_t) * item.nchars)) == NULL) |
330 { | 367 { |
331 res = THERR_MALLOC; | 368 res = THERR_MALLOC; |
332 goto out; | 369 goto out; |
333 } | 370 } |
334 | 371 |
335 for (size_t offs = 0, n = 0; offs < slen; offs++) | 372 for (size_t offs = 0, n = 0; offs < slen; offs++) |
336 { | 373 { |
337 th_regex_char curr = tmp[offs]; | 374 th_regex_char_t curr = tmp[offs]; |
338 if (curr != 0) | 375 if (curr != 0) |
339 { | 376 { |
340 item.chars[n] = curr; | 377 item.chars[n] = curr; |
341 n++; | 378 n++; |
342 } | 379 } |
353 th_free(tmp); | 390 th_free(tmp); |
354 return res; | 391 return res; |
355 } | 392 } |
356 | 393 |
357 | 394 |
358 int th_regex_compile(th_regex_ctx **pexpr, const th_regex_char *pattern) | 395 int th_regex_compile(th_regex_t **pexpr, const th_regex_char_t *pattern) |
359 { | 396 { |
360 int res = THERR_OK; | 397 int res = THERR_OK; |
361 th_regex_parse_ctx ctx; | 398 th_regex_parse_ctx_t ctx; |
362 th_regex_node node, *pnode; | 399 th_regex_node_t node, *pnode; |
363 th_regex_char *tmp = NULL; | 400 th_regex_char_t *tmp = NULL; |
364 size_t start; | 401 size_t start; |
365 | 402 |
366 if (pexpr == NULL || pattern == NULL) | 403 if (pexpr == NULL || pattern == NULL) |
367 { | 404 { |
368 res = THERR_NULLPTR; | 405 res = THERR_NULLPTR; |
372 memset(&ctx, 0, sizeof(ctx)); | 409 memset(&ctx, 0, sizeof(ctx)); |
373 ctx.pattern = pattern; | 410 ctx.pattern = pattern; |
374 | 411 |
375 for (; ctx.pattern[ctx.offs] != 0; ctx.offs++) | 412 for (; ctx.pattern[ctx.offs] != 0; ctx.offs++) |
376 { | 413 { |
377 th_regex_char cch = ctx.pattern[ctx.offs]; | 414 th_regex_char_t cch = ctx.pattern[ctx.offs]; |
378 DBG_RE_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch); | 415 DBG_RE_COMPILE("[%" PRIu_SIZE_T "] '%c'\n", ctx.offs, cch); |
379 switch (cch) | 416 switch (cch) |
380 { | 417 { |
381 case '?': | 418 case '?': |
382 case '*': | 419 case '*': |
383 case '+': | 420 case '+': |
384 if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) | 421 if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK) |
385 goto exit; | 422 goto exit; |
386 | 423 |
387 if (cch == '?') | 424 if (cch == '?') |
388 { | 425 { |
389 // Check if previous was a count | 426 // Check if previous was a count |
431 goto exit; | 468 goto exit; |
432 } | 469 } |
433 | 470 |
434 th_free(tmp); | 471 th_free(tmp); |
435 | 472 |
436 if ((res = th_regex_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK || | 473 if ((res = th_regex_parse_ctx_get_prev_node(&ctx, &pnode)) != THERR_OK || |
437 (res = th_regex_strndup(&tmp, ctx.pattern + start, | 474 (res = th_regex_strndup(&tmp, ctx.pattern + start, |
438 ctx.offs - start)) != THERR_OK) | 475 ctx.offs - start)) != THERR_OK) |
439 goto exit; | 476 goto exit; |
440 | 477 |
441 pnode->mode = TH_RE_MATCH_COUNT_GREEDY; | 478 pnode->mode = TH_RE_MATCH_COUNT_GREEDY; |
469 } | 506 } |
470 break; | 507 break; |
471 | 508 |
472 case '(': | 509 case '(': |
473 // Start of subpattern | 510 // Start of subpattern |
474 if ((res = th_regex_ctx_push(&ctx)) != THERR_OK) | 511 if ((res = th_regex_parse_ctx_push(&ctx)) != THERR_OK) |
475 goto exit; | 512 goto exit; |
476 break; | 513 break; |
477 | 514 |
478 case ')': | 515 case ')': |
479 // End of subpattern | 516 // End of subpattern |
480 th_regex_node_init(&node); | 517 th_regex_node_init(&node); |
481 node.type = TH_RE_TYPE_SUBEXPR; | 518 node.type = TH_RE_TYPE_SUBEXPR; |
482 | 519 |
483 if ((res = th_regex_ctx_pop(&ctx, &node.match.expr)) != THERR_OK || | 520 if ((res = th_regex_parse_ctx_pop(&ctx, &node.match.expr)) != THERR_OK || |
484 (res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) | 521 (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) |
485 goto exit; | 522 goto exit; |
486 break; | 523 break; |
487 | 524 |
488 case '^': | 525 case '^': |
489 // Start of line anchor | 526 // Start of line anchor |
490 th_regex_node_init(&node); | 527 th_regex_node_init(&node); |
491 node.mode = TH_RE_MATCH_ANCHOR_START; | 528 node.mode = TH_RE_MATCH_ANCHOR_START; |
492 | 529 |
493 if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) | 530 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) |
494 goto exit; | 531 goto exit; |
495 break; | 532 break; |
496 | 533 |
497 case '$': | 534 case '$': |
498 // End of line anchor | 535 // End of line anchor |
499 th_regex_node_init(&node); | 536 th_regex_node_init(&node); |
500 node.mode = TH_RE_MATCH_ANCHOR_END; | 537 node.mode = TH_RE_MATCH_ANCHOR_END; |
501 | 538 |
502 if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) | 539 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) |
503 goto exit; | 540 goto exit; |
504 break; | 541 break; |
505 | 542 |
506 case '[': | 543 case '[': |
507 // Start of char list | 544 // Start of char list |
511 { | 548 { |
512 res = THERR_INVALID_DATA; | 549 res = THERR_INVALID_DATA; |
513 goto exit; | 550 goto exit; |
514 } | 551 } |
515 | 552 |
516 // XXX TODO Parse/support ranges [0-9a-zA-Z_-] | |
517 | |
518 th_regex_node_init(&node); | 553 th_regex_node_init(&node); |
519 node.type = (ctx.pattern[start] == '^') ? | 554 if (ctx.pattern[start] == '^') |
520 TH_RE_TYPE_LIST_REVERSE : TH_RE_TYPE_LIST; | 555 { |
521 node.match.list.nchars = ctx.offs - start; | 556 node.type = TH_RE_TYPE_LIST_REVERSE; |
522 if ((res = th_regex_strndup(&node.match.list.chars, | 557 start++; |
523 ctx.pattern + start, node.match.list.nchars)) != THERR_OK) | 558 } |
524 goto exit; | 559 else |
525 | 560 node.type = TH_RE_TYPE_LIST; |
526 if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) | 561 |
562 if ((res = th_regex_parse_list(ctx.pattern + start, | |
563 ctx.offs - start, &node.match.list) != THERR_OK) || | |
564 (res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) | |
527 goto exit; | 565 goto exit; |
528 break; | 566 break; |
529 | 567 |
530 case '.': | 568 case '.': |
531 // Any single character matches | 569 // Any single character matches |
532 th_regex_node_init(&node); | 570 th_regex_node_init(&node); |
533 node.type = TH_RE_TYPE_ANY_CHAR; | 571 node.type = TH_RE_TYPE_ANY_CHAR; |
534 | 572 |
535 if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) | 573 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) |
536 goto exit; | 574 goto exit; |
537 break; | 575 break; |
538 | 576 |
539 case '\\': | 577 case '\\': |
540 // Literal escape | 578 // Literal escape |
543 { | 581 { |
544 // End of pattern, error | 582 // End of pattern, error |
545 res = THERR_INVALID_DATA; | 583 res = THERR_INVALID_DATA; |
546 goto exit; | 584 goto exit; |
547 } | 585 } |
548 | 586 // fall-through |
549 th_regex_node_init(&node); | |
550 node.type = TH_RE_TYPE_CHAR; | |
551 node.match.chr = ctx.pattern[ctx.offs]; | |
552 | |
553 if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) | |
554 goto exit; | |
555 break; | |
556 | 587 |
557 default: | 588 default: |
558 // Given character must match | 589 // Given character must match |
559 th_regex_node_init(&node); | 590 th_regex_node_init(&node); |
560 node.type = TH_RE_TYPE_CHAR; | 591 node.type = TH_RE_TYPE_CHAR; |
561 node.match.chr = ctx.pattern[ctx.offs]; | 592 node.match.chr = ctx.pattern[ctx.offs]; |
562 | 593 |
563 if ((res = th_regex_ctx_node_commit(&ctx, &node)) != THERR_OK) | 594 if ((res = th_regex_parse_ctx_node_commit(&ctx, &node)) != THERR_OK) |
564 goto exit; | 595 goto exit; |
565 break; | 596 break; |
566 } | 597 } |
567 } | 598 } |
568 | 599 |
571 th_free(tmp); | 602 th_free(tmp); |
572 return res; | 603 return res; |
573 } | 604 } |
574 | 605 |
575 | 606 |
576 void th_regex_free(th_regex_ctx *expr) | 607 void th_regex_free(th_regex_t *expr) |
577 { | 608 { |
578 if (expr != NULL) | 609 if (expr != NULL) |
579 { | 610 { |
580 for (size_t n = 0; n < expr->nnodes; n++) | 611 for (size_t nnode = 0; nnode < expr->nnodes; nnode++) |
581 { | 612 { |
582 th_regex_node *node = &expr->nodes[n]; | 613 th_regex_node_t *node = &expr->nodes[nnode]; |
583 | 614 |
584 DBG_RE_FREE( | 615 DBG_RE_FREE( |
585 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " | 616 "node [%" PRIu_SIZE_T " / %" PRIu_SIZE_T "]: " |
586 "mode=%d, type=%d, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", | 617 "mode=%s, type=%s, rmin=%" PRId_SSIZE_T ", rmax=%" PRId_SSIZE_T "\n", |
587 n, expr->nnodes, | 618 nnode, expr->nnodes, |
588 node->mode, node->type, | 619 re_match_modes[node->mode], |
620 re_match_types[node->type], | |
589 node->repeatMin, node->repeatMax); | 621 node->repeatMin, node->repeatMax); |
590 | 622 |
591 switch (node->type) | 623 th_regex_free(node->match.expr); |
592 { | 624 th_regex_list_free(&node->match.list); |
593 case TH_RE_TYPE_SUBEXPR: | |
594 DBG_RE_FREE(" SUBEXPR: %p vs %p\n", | |
595 (void *) expr, (void *) node->match.expr); | |
596 th_regex_free(node->match.expr); | |
597 break; | |
598 | |
599 case TH_RE_TYPE_LIST: | |
600 case TH_RE_TYPE_LIST_REVERSE: | |
601 DBG_RE_FREE(" list='%s'\n", node->match.list.chars); | |
602 th_free(node->match.list.chars); | |
603 break; | |
604 | |
605 #ifdef TH_EXPERIMENTAL_REGEX_DEBUG | |
606 case TH_RE_TYPE_ANY_CHAR: | |
607 DBG_RE_FREE(" any char\n"); | |
608 break; | |
609 | |
610 case TH_RE_TYPE_CHAR: | |
611 DBG_RE_FREE(" char='%c'\n", node->match.chr); | |
612 break; | |
613 #endif | |
614 } | |
615 } | 625 } |
616 | 626 |
617 th_free(expr->nodes); | 627 th_free(expr->nodes); |
618 } | 628 } |
619 } | 629 } |
620 | 630 |
621 | 631 |
622 static BOOL th_regex_do_match_list(const th_regex_list *list, const th_regex_char cch) | 632 static BOOL th_regex_match_list(const th_regex_list_t *list, const th_regex_char_t cch) |
623 { | 633 { |
624 // Could be optimized, perhaps .. sort match.chars, binary search etc? | 634 // Could be optimized, perhaps .. sort match.chars, binary search etc? |
625 for (size_t nitem = 0; nitem < list->nitems; nitem++) | 635 for (size_t nitem = 0; nitem < list->nitems; nitem++) |
626 { | 636 { |
627 const th_regex_list_item *item = &list->items[nitem]; | 637 const th_regex_list_item_t *item = &list->items[nitem]; |
638 | |
628 if (item->type == 0) | 639 if (item->type == 0) |
629 { | 640 { |
630 for (size_t n = 0; n < item->nchars; n++) | 641 for (size_t n = 0; n < item->nchars; n++) |
631 if (item->chars[n] == cch) | 642 if (item->chars[n] == cch) |
632 return TRUE; | 643 return TRUE; |
640 | 651 |
641 return FALSE; | 652 return FALSE; |
642 } | 653 } |
643 | 654 |
644 | 655 |
645 static BOOL th_regex_do_match_expr(const th_regex_ctx *expr, | 656 static BOOL th_regex_match_expr( |
646 const th_regex_char *haystack, size_t *offs, const int flags); | 657 const th_regex_char_t *haystack, |
647 | 658 size_t *poffs, |
648 | 659 const th_regex_t *expr, |
649 static BOOL th_regex_do_match_node(const th_regex_char *haystack, | 660 const size_t startnode, |
650 size_t *offs, const th_regex_node *node, const int flags) | 661 const int flags |
651 { | 662 ); |
652 th_regex_char cch; | 663 |
664 | |
665 static BOOL th_regex_match_one( | |
666 const th_regex_char_t *haystack, | |
667 size_t *poffs, | |
668 const th_regex_node_t *node, | |
669 const int flags | |
670 ) | |
671 { | |
672 th_regex_char_t cch; | |
653 BOOL res = FALSE; | 673 BOOL res = FALSE; |
654 | 674 |
655 DBG_RE_MATCH(" node_START [%s]: '%s': ", | |
656 re_match_types[node->type], haystack + *offs); | |
657 | |
658 switch (node->type) | 675 switch (node->type) |
659 { | 676 { |
660 case TH_RE_TYPE_SUBEXPR: | 677 case TH_RE_TYPE_SUBEXPR: |
661 DBG_RE_MATCH("subexpr ..\n"); | 678 // push current context in stack |
662 res = th_regex_do_match_expr(node->match.expr, haystack, offs, flags); | 679 // new context for subexpr |
663 return res; | 680 res = th_regex_match_expr(haystack, poffs, node->match.expr, 0, flags); |
664 break; | 681 break; |
665 | 682 |
666 case TH_RE_TYPE_LIST: | 683 case TH_RE_TYPE_LIST: |
667 case TH_RE_TYPE_LIST_REVERSE: | 684 case TH_RE_TYPE_LIST_REVERSE: |
668 DBG_RE_MATCH("[%s]", node->match.list.chars); | 685 if ((cch = haystack[*poffs]) == 0) |
669 if ((cch = haystack[*offs]) == 0) | 686 res = FALSE; |
670 goto out; | 687 else |
671 | 688 { |
672 // Could be optimized, perhaps .. sort match.chars, binary search etc? | 689 res = th_regex_match_list(&node->match.list, cch); |
673 // XXX TODO Ranges are not supported yet | 690 |
674 for (size_t n = 0; n < node->match.list.nchars; n++) | 691 if (node->type == TH_RE_TYPE_LIST_REVERSE) |
675 if (node->match.list.chars[n] == cch) | 692 res = !res; |
693 | |
694 (*poffs)++; | |
695 } | |
696 break; | |
697 | |
698 case TH_RE_TYPE_ANY_CHAR: | |
699 if ((cch = haystack[*poffs]) == 0) | |
700 res = FALSE; | |
701 else | |
676 { | 702 { |
677 res = TRUE; | 703 res = TRUE; |
678 break; | 704 (*poffs)++; |
679 } | 705 } |
680 | |
681 if (node->type == TH_RE_TYPE_LIST_REVERSE) | |
682 res = !res; | |
683 | |
684 (*offs)++; | |
685 break; | 706 break; |
686 | 707 |
687 case TH_RE_TYPE_ANY_CHAR: | 708 case TH_RE_TYPE_CHAR: |
688 if ((cch = haystack[*offs]) == 0) | 709 if ((cch = haystack[*poffs]) == 0) |
689 goto out; | 710 res = FALSE; |
690 | 711 else |
691 res = TRUE; | 712 { |
692 (*offs)++; | 713 res = (cch == node->match.chr); |
714 (*poffs)++; | |
715 } | |
693 break; | 716 break; |
694 | 717 } |
695 case TH_RE_TYPE_CHAR: | |
696 DBG_RE_MATCH("'%c'", node->match.chr); | |
697 if ((cch = haystack[*offs]) == 0) | |
698 goto out; | |
699 | |
700 res = (cch == node->match.chr); | |
701 (*offs)++; | |
702 break; | |
703 } | |
704 | |
705 out: | |
706 DBG_RE_MATCH(", match=%s\n", | |
707 res ? "YES" : "NO"); | |
708 | 718 |
709 return res; | 719 return res; |
710 } | 720 } |
711 | 721 |
712 | 722 |
713 static BOOL th_regex_do_match_expr(const th_regex_ctx *expr, | 723 static BOOL th_regex_match_expr( |
714 const th_regex_char *haystack, size_t *offs, const int flags) | 724 const th_regex_char_t *haystack, |
715 { | 725 size_t *poffs, |
716 for (size_t n = 0; n < expr->nnodes; n++) | 726 const th_regex_t *expr, |
717 { | 727 const size_t startnode, |
718 const th_regex_node *node = &expr->nodes[n]; | 728 const int flags |
719 size_t soffs; | 729 ) |
720 | 730 { |
721 DBG_RE_MATCH(" expr [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s ", | 731 BOOL res = TRUE; |
722 n, expr->nnodes, re_match_modes[node->mode]); | 732 size_t soffs = *poffs; |
733 | |
734 for (size_t nnode = startnode; res && nnode < expr->nnodes; nnode++) | |
735 { | |
736 const th_regex_node_t *node = &expr->nodes[nnode]; | |
737 | |
738 DBG_RE_MATCH(" expr [%" PRIu_SIZE_T "/%" PRIu_SIZE_T "]: %s %s '%s'\n", | |
739 nnode, expr->nnodes, | |
740 re_match_modes[node->mode], re_match_types[node->type], | |
741 haystack + *poffs); | |
723 | 742 |
724 switch (node->mode) | 743 switch (node->mode) |
725 { | 744 { |
726 case TH_RE_MATCH_ONCE: | 745 case TH_RE_MATCH_ONCE: |
727 { | 746 res = th_regex_match_one(haystack, poffs, node, flags); |
728 DBG_RE_MATCH("\n"); | |
729 soffs = *offs; | |
730 if (!th_regex_do_match_node(haystack, &soffs, node, flags)) | |
731 return FALSE; | |
732 *offs = soffs; | |
733 } | |
734 break; | 747 break; |
735 | 748 |
736 case TH_RE_MATCH_COUNT_GREEDY: | 749 case TH_RE_MATCH_COUNT_GREEDY: |
737 case TH_RE_MATCH_COUNT_NONGREEDY: | 750 case TH_RE_MATCH_COUNT_NONGREEDY: |
738 { | 751 { |
739 BOOL done = FALSE; | |
740 ssize_t count = 0; | 752 ssize_t count = 0; |
741 | 753 |
742 DBG_RE_MATCH("min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", node->repeatMin, node->repeatMax); | 754 DBG_RE_MATCH(" COUNT min=%" PRId_SSIZE_T ", max=%" PRId_SSIZE_T "\n", |
755 node->repeatMin, node->repeatMax); | |
743 | 756 |
744 do | 757 do |
745 { | 758 { |
746 BOOL match; | 759 BOOL match; |
747 soffs = *offs; | 760 size_t toffs = *poffs, tnode = nnode; |
748 | 761 |
749 match = th_regex_do_match_node(haystack, &soffs, node, flags); | 762 do { |
750 for (size_t qn = n + 1; qn < expr->nnodes && haystack[soffs] != 0; qn++) | 763 match = th_regex_match_one(haystack, &toffs, node, flags); |
751 { | 764 if (match && haystack[toffs] != 0) |
752 const th_regex_node *next = &expr->nodes[qn]; | 765 { |
753 do { | 766 for (tnode = nnode + 1; match && tnode < expr->nnodes && haystack[toffs] != 0; tnode++) |
754 match = th_regex_do_match_node(haystack, &soffs, next, flags); | 767 { |
755 } while (haystack[soffs] != 0 && !match); | 768 size_t noffs = toffs; |
756 } | 769 match = th_regex_match_one(haystack, &toffs, &expr->nodes[tnode], flags); |
770 DBG_RE_MATCH(" '%s': %d\n", haystack + noffs, match); | |
771 } | |
772 } | |
773 } while (!match && haystack[toffs] != 0); | |
757 | 774 |
758 if (match) | 775 if (match) |
759 { | 776 { |
760 // Node matched | 777 // Node matched |
761 count++; | 778 count++; |
762 done = (node->repeatMax > 0 && count >= node->repeatMax); | 779 *poffs = soffs; |
763 *offs = soffs; | 780 nnode = tnode; |
781 res = (node->repeatMax > 0 && count >= node->repeatMax); | |
764 } | 782 } |
765 else | 783 else |
766 { | 784 { |
767 // Node did not match, check if we got the minimum if set | 785 // Node did not match, check if we got the minimum if set |
768 done = (node->repeatMin >= 0 && count >= node->repeatMin); | 786 res = (node->repeatMin >= 0 && count >= node->repeatMin); |
769 soffs = *offs; | |
770 break; | |
771 } | 787 } |
772 } while (!done); | 788 |
773 | 789 } while (!res); |
774 DBG_RE_MATCH("RESULT: %" PRId_SSIZE_T " = %s\n", count, done ? "YES" : "NO"); | 790 |
775 | 791 DBG_RE_MATCH(" RESULT: count=%" PRId_SSIZE_T ", done=%s\n", |
776 if (!done) | 792 count, res ? "YES" : "NO"); |
777 return FALSE; | 793 } |
778 | 794 break; |
779 *offs = soffs; | 795 |
780 } | |
781 break; | |
782 | 796 |
783 case TH_RE_MATCH_ANCHOR_START: | 797 case TH_RE_MATCH_ANCHOR_START: |
784 DBG_RE_MATCH("offs=%" PRIu_SIZE_T "\n", *offs); | 798 res = (*poffs == 0); |
785 if (*offs != 0) | |
786 return FALSE; | |
787 break; | 799 break; |
788 | 800 |
789 case TH_RE_MATCH_ANCHOR_END: | 801 case TH_RE_MATCH_ANCHOR_END: |
790 DBG_RE_MATCH("is end=%d\n", haystack[*offs]); | 802 res = (haystack[*poffs] == 0); |
791 if (haystack[*offs] != 0) | 803 break; |
792 return FALSE; | 804 } |
793 break; | 805 } |
794 } | 806 |
795 } | 807 if (!res) |
796 | 808 *poffs = soffs; |
797 return TRUE; | 809 |
798 } | 810 return res; |
799 | 811 } |
800 | 812 |
801 int th_regex_match(const th_regex_ctx *expr, const th_regex_char *haystack, | 813 |
802 size_t *pnmatches, th_regex_match_node **pmatches, const size_t maxmatches, | 814 int th_regex_match(const th_regex_t *expr, const th_regex_char_t *haystack, |
815 size_t *pnmatches, th_regex_match_t **pmatches, const size_t maxmatches, | |
803 const int flags) | 816 const int flags) |
804 { | 817 { |
805 size_t nmatches = 0; | 818 size_t nmatches = 0; |
819 (void) flags; | |
806 | 820 |
807 if (pnmatches != NULL) | 821 if (pnmatches != NULL) |
808 *pnmatches = 0; | 822 *pnmatches = 0; |
809 | 823 |
810 // Check given pattern and string | 824 // Check given pattern and string |
816 // - anchored and unanchored expressions | 830 // - anchored and unanchored expressions |
817 // - how to check if the expression has consumed all possibilities? | 831 // - how to check if the expression has consumed all possibilities? |
818 // .. | 832 // .. |
819 for (size_t soffs = 0; haystack[soffs] != 0; ) | 833 for (size_t soffs = 0; haystack[soffs] != 0; ) |
820 { | 834 { |
821 BOOL matched; | |
822 size_t coffs = soffs; | 835 size_t coffs = soffs; |
823 | 836 |
824 DBG_RE_MATCH("\nTRY_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n", | 837 DBG_RE_MATCH("\nTRY_MATCH @ startoffs=%" PRIu_SIZE_T ": '%s'\n", |
825 soffs, haystack + soffs); | 838 soffs, haystack + soffs); |
826 | 839 |
827 if ((matched = th_regex_do_match_expr(expr, haystack, &coffs, flags))) | 840 if (th_regex_match_expr(haystack, &coffs, expr, 0, flags)) |
828 { | 841 { |
829 // A match was found, increase count | 842 // A match was found, increase count |
830 nmatches++; | 843 nmatches++; |
831 | 844 |
832 // Deliver to caller if required | 845 // Deliver to caller if required |
833 if (pnmatches != NULL) | 846 if (pnmatches != NULL) |
834 *pnmatches = nmatches; | 847 *pnmatches = nmatches; |
835 | 848 |
836 if (pmatches != NULL) | 849 if (pmatches != NULL) |
837 { | 850 { |
838 th_regex_match_node *match = th_malloc0(sizeof(th_regex_match_node)); | 851 th_regex_match_t *match = th_malloc0(sizeof(th_regex_match_t)); |
839 if (match == NULL) | 852 if (match == NULL) |
840 return THERR_MALLOC; | 853 return THERR_MALLOC; |
841 | 854 |
842 match->start = soffs; | 855 match->start = soffs; |
843 match->len = coffs - soffs; | 856 match->len = coffs - soffs; |
864 | 877 |
865 return THERR_OK; | 878 return THERR_OK; |
866 } | 879 } |
867 | 880 |
868 | 881 |
869 static void th_regex_free_match(th_regex_match_node *node) | 882 static void th_regex_free_match(th_regex_match_t *node) |
870 { | 883 { |
871 (void) node; | 884 (void) node; |
872 // Nothing to do here at the moment | 885 // Nothing to do here at the moment |
873 } | 886 } |
874 | 887 |
875 | 888 |
876 void th_regex_free_matches(th_regex_match_node *matches) | 889 void th_regex_free_matches(th_regex_match_t *matches) |
877 { | 890 { |
878 th_llist_free_func_node((th_llist_t *) matches, | 891 th_llist_free_func_node((th_llist_t *) matches, |
879 (void (*)(th_llist_t *)) th_regex_free_match); | 892 (void (*)(th_llist_t *)) th_regex_free_match); |
880 } | 893 } |
881 | 894 |