blob: 5c2e0ddd06aa34c19810d6d830096098c7a0fdd6 [file] [log] [blame]
xf.libdd93d52023-05-12 07:10:14 -07001/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2016 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20#include <stdint.h>
21
22static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
23 int n) internal_function;
24static void match_ctx_clean (re_match_context_t *mctx) internal_function;
25static void match_ctx_free (re_match_context_t *cache) internal_function;
26static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
27 int str_idx, int from, int to)
28 internal_function;
29static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
30 internal_function;
31static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
32 int str_idx) internal_function;
33static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
34 int node, int str_idx)
35 internal_function;
36static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
37 re_dfastate_t **limited_sts, int last_node,
38 int last_str_idx)
39 internal_function;
40static reg_errcode_t re_search_internal (const regex_t *preg,
41 const char *string, int length,
42 int start, int range, int stop,
43 size_t nmatch, regmatch_t pmatch[],
44 int eflags) internal_function;
45static int re_search_2_stub (struct re_pattern_buffer *bufp,
46 const char *string1, int length1,
47 const char *string2, int length2,
48 int start, int range, struct re_registers *regs,
49 int stop, int ret_len) internal_function;
50static int re_search_stub (struct re_pattern_buffer *bufp,
51 const char *string, int length, int start,
52 int range, int stop, struct re_registers *regs,
53 int ret_len) internal_function;
54static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
55 int nregs, int regs_allocated) internal_function;
56static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
57 internal_function;
58static int check_matching (re_match_context_t *mctx, int fl_longest_match,
59 int *p_match_first) internal_function;
60static int check_halt_state_context (const re_match_context_t *mctx,
61 const re_dfastate_t *state, int idx)
62 internal_function;
63static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
64 regmatch_t *prev_idx_match, int cur_node,
65 int cur_idx, int nmatch) internal_function;
66static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
67 int str_idx, int dest_node, int nregs,
68 regmatch_t *regs,
69 re_node_set *eps_via_nodes)
70 internal_function;
71static reg_errcode_t set_regs (const regex_t *preg,
72 const re_match_context_t *mctx,
73 size_t nmatch, regmatch_t *pmatch,
74 int fl_backtrack) internal_function;
75static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
76 internal_function;
77
78#ifdef RE_ENABLE_I18N
79static int sift_states_iter_mb (const re_match_context_t *mctx,
80 re_sift_context_t *sctx,
81 int node_idx, int str_idx, int max_str_idx)
82 internal_function;
83#endif /* RE_ENABLE_I18N */
84static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
85 re_sift_context_t *sctx)
86 internal_function;
87static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
88 re_sift_context_t *sctx, int str_idx,
89 re_node_set *cur_dest)
90 internal_function;
91static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
92 re_sift_context_t *sctx,
93 int str_idx,
94 re_node_set *dest_nodes)
95 internal_function;
96static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
97 re_node_set *dest_nodes,
98 const re_node_set *candidates)
99 internal_function;
100static int check_dst_limits (const re_match_context_t *mctx,
101 re_node_set *limits,
102 int dst_node, int dst_idx, int src_node,
103 int src_idx) internal_function;
104static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
105 int boundaries, int subexp_idx,
106 int from_node, int bkref_idx)
107 internal_function;
108static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
109 int limit, int subexp_idx,
110 int node, int str_idx,
111 int bkref_idx) internal_function;
112static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
113 re_node_set *dest_nodes,
114 const re_node_set *candidates,
115 re_node_set *limits,
116 struct re_backref_cache_entry *bkref_ents,
117 int str_idx) internal_function;
118static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
119 re_sift_context_t *sctx,
120 int str_idx, const re_node_set *candidates)
121 internal_function;
122static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
123 re_dfastate_t **dst,
124 re_dfastate_t **src, int num)
125 internal_function;
126static re_dfastate_t *find_recover_state (reg_errcode_t *err,
127 re_match_context_t *mctx) internal_function;
128static re_dfastate_t *transit_state (reg_errcode_t *err,
129 re_match_context_t *mctx,
130 re_dfastate_t *state) internal_function;
131static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
132 re_match_context_t *mctx,
133 re_dfastate_t *next_state)
134 internal_function;
135static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
136 re_node_set *cur_nodes,
137 int str_idx) internal_function;
138#if 0
139static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
140 re_match_context_t *mctx,
141 re_dfastate_t *pstate)
142 internal_function;
143#endif
144#ifdef RE_ENABLE_I18N
145static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
146 re_dfastate_t *pstate)
147 internal_function;
148#endif /* RE_ENABLE_I18N */
149static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
150 const re_node_set *nodes)
151 internal_function;
152static reg_errcode_t get_subexp (re_match_context_t *mctx,
153 int bkref_node, int bkref_str_idx)
154 internal_function;
155static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
156 const re_sub_match_top_t *sub_top,
157 re_sub_match_last_t *sub_last,
158 int bkref_node, int bkref_str)
159 internal_function;
160static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
161 int subexp_idx, int type) internal_function;
162static reg_errcode_t check_arrival (re_match_context_t *mctx,
163 state_array_t *path, int top_node,
164 int top_str, int last_node, int last_str,
165 int type) internal_function;
166static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
167 int str_idx,
168 re_node_set *cur_nodes,
169 re_node_set *next_nodes)
170 internal_function;
171static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
172 re_node_set *cur_nodes,
173 int ex_subexp, int type)
174 internal_function;
175static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
176 re_node_set *dst_nodes,
177 int target, int ex_subexp,
178 int type) internal_function;
179static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
180 re_node_set *cur_nodes, int cur_str,
181 int subexp_num, int type)
182 internal_function;
183static int build_trtable (const re_dfa_t *dfa,
184 re_dfastate_t *state) internal_function;
185#ifdef RE_ENABLE_I18N
186static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
187 const re_string_t *input, int idx)
188 internal_function;
189# ifdef _LIBC
190static unsigned int find_collation_sequence_value (const unsigned char *mbs,
191 size_t name_len)
192 internal_function;
193# endif /* _LIBC */
194#endif /* RE_ENABLE_I18N */
195static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
196 const re_dfastate_t *state,
197 re_node_set *states_node,
198 bitset_t *states_ch) internal_function;
199static int check_node_accept (const re_match_context_t *mctx,
200 const re_token_t *node, int idx)
201 internal_function;
202static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len)
203 internal_function;
204
205/* Entry point for POSIX code. */
206
207/* regexec searches for a given pattern, specified by PREG, in the
208 string STRING.
209
210 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
211 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
212 least NMATCH elements, and we set them to the offsets of the
213 corresponding matched substrings.
214
215 EFLAGS specifies `execution flags' which affect matching: if
216 REG_NOTBOL is set, then ^ does not match at the beginning of the
217 string; if REG_NOTEOL is set, then $ does not match at the end.
218
219 We return 0 if we find a match and REG_NOMATCH if not. */
220
221int
222regexec (const regex_t *__restrict preg, const char *__restrict string,
223 size_t nmatch, regmatch_t pmatch[], int eflags)
224{
225 reg_errcode_t err;
226 int start, length;
227 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
228
229 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
230 return REG_BADPAT;
231
232 if (eflags & REG_STARTEND)
233 {
234 start = pmatch[0].rm_so;
235 length = pmatch[0].rm_eo;
236 }
237 else
238 {
239 start = 0;
240 length = strlen (string);
241 }
242
243 __libc_lock_lock (dfa->lock);
244 if (preg->no_sub)
245 err = re_search_internal (preg, string, length, start, length - start,
246 length, 0, NULL, eflags);
247 else
248 err = re_search_internal (preg, string, length, start, length - start,
249 length, nmatch, pmatch, eflags);
250 __libc_lock_unlock (dfa->lock);
251 return err != REG_NOERROR;
252}
253
254#ifdef _LIBC
255# include <shlib-compat.h>
256versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
257
258# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
259__typeof__ (__regexec) __compat_regexec;
260
261int
262attribute_compat_text_section
263__compat_regexec (const regex_t *__restrict preg,
264 const char *__restrict string, size_t nmatch,
265 regmatch_t pmatch[], int eflags)
266{
267 return regexec (preg, string, nmatch, pmatch,
268 eflags & (REG_NOTBOL | REG_NOTEOL));
269}
270compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
271# endif
272#endif
273
274/* Entry points for GNU code. */
275
276/* re_match, re_search, re_match_2, re_search_2
277
278 The former two functions operate on STRING with length LENGTH,
279 while the later two operate on concatenation of STRING1 and STRING2
280 with lengths LENGTH1 and LENGTH2, respectively.
281
282 re_match() matches the compiled pattern in BUFP against the string,
283 starting at index START.
284
285 re_search() first tries matching at index START, then it tries to match
286 starting from index START + 1, and so on. The last start position tried
287 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
288 way as re_match().)
289
290 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
291 the first STOP characters of the concatenation of the strings should be
292 concerned.
293
294 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
295 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
296 computed relative to the concatenation, not relative to the individual
297 strings.)
298
299 On success, re_match* functions return the length of the match, re_search*
300 return the position of the start of the match. Return value -1 means no
301 match was found and -2 indicates an internal error. */
302
303int
304re_match (struct re_pattern_buffer *bufp, const char *string, int length,
305 int start, struct re_registers *regs)
306{
307 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
308}
309#ifdef _LIBC
310weak_alias (__re_match, re_match)
311#endif
312
313int
314re_search (struct re_pattern_buffer *bufp, const char *string, int length,
315 int start, int range, struct re_registers *regs)
316{
317 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
318}
319#ifdef _LIBC
320weak_alias (__re_search, re_search)
321#endif
322
323int
324re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int length1,
325 const char *string2, int length2, int start,
326 struct re_registers *regs, int stop)
327{
328 return re_search_2_stub (bufp, string1, length1, string2, length2,
329 start, 0, regs, stop, 1);
330}
331#ifdef _LIBC
332weak_alias (__re_match_2, re_match_2)
333#endif
334
335int
336re_search_2 (struct re_pattern_buffer *bufp, const char *string1, int length1,
337 const char *string2, int length2, int start, int range,
338 struct re_registers *regs, int stop)
339{
340 return re_search_2_stub (bufp, string1, length1, string2, length2,
341 start, range, regs, stop, 0);
342}
343#ifdef _LIBC
344weak_alias (__re_search_2, re_search_2)
345#endif
346
347static int
348internal_function
349re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
350 int length1, const char *string2, int length2, int start,
351 int range, struct re_registers *regs,
352 int stop, int ret_len)
353{
354 const char *str;
355 int rval;
356 int len = length1 + length2;
357 char *s = NULL;
358
359 if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
360 return -2;
361
362 /* Concatenate the strings. */
363 if (length2 > 0)
364 if (length1 > 0)
365 {
366 s = re_malloc (char, len);
367
368 if (BE (s == NULL, 0))
369 return -2;
370#ifdef _LIBC
371 memcpy (__mempcpy (s, string1, length1), string2, length2);
372#else
373 memcpy (s, string1, length1);
374 memcpy (s + length1, string2, length2);
375#endif
376 str = s;
377 }
378 else
379 str = string2;
380 else
381 str = string1;
382
383 rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
384 re_free (s);
385 return rval;
386}
387
388/* The parameters have the same meaning as those of re_search.
389 Additional parameters:
390 If RET_LEN is nonzero the length of the match is returned (re_match style);
391 otherwise the position of the match is returned. */
392
393static int
394internal_function
395re_search_stub (struct re_pattern_buffer *bufp, const char *string, int length,
396 int start, int range, int stop, struct re_registers *regs,
397 int ret_len)
398{
399 reg_errcode_t result;
400 regmatch_t *pmatch;
401 int nregs, rval;
402 int eflags = 0;
403 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
404
405 /* Check for out-of-range. */
406 if (BE (start < 0 || start > length, 0))
407 return -1;
408 if (BE (start + range > length, 0))
409 range = length - start;
410 else if (BE (start + range < 0, 0))
411 range = -start;
412
413 __libc_lock_lock (dfa->lock);
414
415 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
416 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
417
418 /* Compile fastmap if we haven't yet. */
419 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
420 re_compile_fastmap (bufp);
421
422 if (BE (bufp->no_sub, 0))
423 regs = NULL;
424
425 /* We need at least 1 register. */
426 if (regs == NULL)
427 nregs = 1;
428 else if (BE (bufp->regs_allocated == REGS_FIXED &&
429 regs->num_regs < bufp->re_nsub + 1, 0))
430 {
431 nregs = regs->num_regs;
432 if (BE (nregs < 1, 0))
433 {
434 /* Nothing can be copied to regs. */
435 regs = NULL;
436 nregs = 1;
437 }
438 }
439 else
440 nregs = bufp->re_nsub + 1;
441 pmatch = re_malloc (regmatch_t, nregs);
442 if (BE (pmatch == NULL, 0))
443 {
444 rval = -2;
445 goto out;
446 }
447
448 result = re_search_internal (bufp, string, length, start, range, stop,
449 nregs, pmatch, eflags);
450
451 rval = 0;
452
453 /* I hope we needn't fill ther regs with -1's when no match was found. */
454 if (result != REG_NOERROR)
455 rval = -1;
456 else if (regs != NULL)
457 {
458 /* If caller wants register contents data back, copy them. */
459 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
460 bufp->regs_allocated);
461 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
462 rval = -2;
463 }
464
465 if (BE (rval == 0, 1))
466 {
467 if (ret_len)
468 {
469 assert (pmatch[0].rm_so == start);
470 rval = pmatch[0].rm_eo - start;
471 }
472 else
473 rval = pmatch[0].rm_so;
474 }
475 re_free (pmatch);
476 out:
477 __libc_lock_unlock (dfa->lock);
478 return rval;
479}
480
481static unsigned
482internal_function
483re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, int nregs,
484 int regs_allocated)
485{
486 int rval = REGS_REALLOCATE;
487 int i;
488 int need_regs = nregs + 1;
489 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
490 uses. */
491
492 /* Have the register data arrays been allocated? */
493 if (regs_allocated == REGS_UNALLOCATED)
494 { /* No. So allocate them with malloc. */
495 regs->start = re_malloc (regoff_t, need_regs);
496 if (BE (regs->start == NULL, 0))
497 return REGS_UNALLOCATED;
498 regs->end = re_malloc (regoff_t, need_regs);
499 if (BE (regs->end == NULL, 0))
500 {
501 re_free (regs->start);
502 return REGS_UNALLOCATED;
503 }
504 regs->num_regs = need_regs;
505 }
506 else if (regs_allocated == REGS_REALLOCATE)
507 { /* Yes. If we need more elements than were already
508 allocated, reallocate them. If we need fewer, just
509 leave it alone. */
510 if (BE (need_regs > regs->num_regs, 0))
511 {
512 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
513 regoff_t *new_end;
514 if (BE (new_start == NULL, 0))
515 return REGS_UNALLOCATED;
516 new_end = re_realloc (regs->end, regoff_t, need_regs);
517 if (BE (new_end == NULL, 0))
518 {
519 re_free (new_start);
520 return REGS_UNALLOCATED;
521 }
522 regs->start = new_start;
523 regs->end = new_end;
524 regs->num_regs = need_regs;
525 }
526 }
527 else
528 {
529 assert (regs_allocated == REGS_FIXED);
530 /* This function may not be called with REGS_FIXED and nregs too big. */
531 assert (regs->num_regs >= nregs);
532 rval = REGS_FIXED;
533 }
534
535 /* Copy the regs. */
536 for (i = 0; i < nregs; ++i)
537 {
538 regs->start[i] = pmatch[i].rm_so;
539 regs->end[i] = pmatch[i].rm_eo;
540 }
541 for ( ; i < regs->num_regs; ++i)
542 regs->start[i] = regs->end[i] = -1;
543
544 return rval;
545}
546
547/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
548 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
549 this memory for recording register information. STARTS and ENDS
550 must be allocated using the malloc library routine, and must each
551 be at least NUM_REGS * sizeof (regoff_t) bytes long.
552
553 If NUM_REGS == 0, then subsequent matches should allocate their own
554 register data.
555
556 Unless this function is called, the first search or match using
557 PATTERN_BUFFER will allocate its own register data, without
558 freeing the old data. */
559
560void
561re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
562 unsigned num_regs, regoff_t *starts, regoff_t *ends)
563{
564 if (num_regs)
565 {
566 bufp->regs_allocated = REGS_REALLOCATE;
567 regs->num_regs = num_regs;
568 regs->start = starts;
569 regs->end = ends;
570 }
571 else
572 {
573 bufp->regs_allocated = REGS_UNALLOCATED;
574 regs->num_regs = 0;
575 regs->start = regs->end = (regoff_t *) 0;
576 }
577}
578#ifdef _LIBC
579weak_alias (__re_set_registers, re_set_registers)
580#endif
581
582/* Entry points compatible with 4.2 BSD regex library. We don't define
583 them unless specifically requested. */
584
585#if defined _REGEX_RE_COMP || defined _LIBC
586int
587# ifdef _LIBC
588weak_function
589# endif
590re_exec (const char *s)
591{
592 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
593}
594#endif /* _REGEX_RE_COMP */
595
596/* Internal entry point. */
597
598/* Searches for a compiled pattern PREG in the string STRING, whose
599 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
600 mingings with regexec. START, and RANGE have the same meanings
601 with re_search.
602 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
603 otherwise return the error code.
604 Note: We assume front end functions already check ranges.
605 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
606
607static reg_errcode_t
608__attribute_warn_unused_result__ internal_function
609re_search_internal (const regex_t *preg, const char *string, int length,
610 int start, int range, int stop, size_t nmatch,
611 regmatch_t pmatch[], int eflags)
612{
613 reg_errcode_t err;
614 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
615 int left_lim, right_lim, incr;
616 int fl_longest_match, match_first, match_kind, match_last = -1;
617 int extra_nmatch;
618 int sb, ch;
619#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
620 re_match_context_t mctx = { .dfa = dfa };
621#else
622 re_match_context_t mctx;
623#endif
624 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
625 && range && !preg->can_be_null) ? preg->fastmap : NULL;
626 RE_TRANSLATE_TYPE t = preg->translate;
627
628#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
629 memset (&mctx, '\0', sizeof (re_match_context_t));
630 mctx.dfa = dfa;
631#endif
632
633 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
634 nmatch -= extra_nmatch;
635
636 /* Check if the DFA haven't been compiled. */
637 if (BE (preg->used == 0 || dfa->init_state == NULL
638 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
639 || dfa->init_state_begbuf == NULL, 0))
640 return REG_NOMATCH;
641
642#ifdef DEBUG
643 /* We assume front-end functions already check them. */
644 assert (start + range >= 0 && start + range <= length);
645#endif
646
647 /* If initial states with non-begbuf contexts have no elements,
648 the regex must be anchored. If preg->newline_anchor is set,
649 we'll never use init_state_nl, so do not check it. */
650 if (dfa->init_state->nodes.nelem == 0
651 && dfa->init_state_word->nodes.nelem == 0
652 && (dfa->init_state_nl->nodes.nelem == 0
653 || !preg->newline_anchor))
654 {
655 if (start != 0 && start + range != 0)
656 return REG_NOMATCH;
657 start = range = 0;
658 }
659
660 /* We must check the longest matching, if nmatch > 0. */
661 fl_longest_match = (nmatch != 0 || dfa->nbackref);
662
663 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
664 preg->translate, preg->syntax & RE_ICASE, dfa);
665 if (BE (err != REG_NOERROR, 0))
666 goto free_return;
667 mctx.input.stop = stop;
668 mctx.input.raw_stop = stop;
669 mctx.input.newline_anchor = preg->newline_anchor;
670
671 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
672 if (BE (err != REG_NOERROR, 0))
673 goto free_return;
674
675 /* We will log all the DFA states through which the dfa pass,
676 if nmatch > 1, or this dfa has "multibyte node", which is a
677 back-reference or a node which can accept multibyte character or
678 multi character collating element. */
679 if (nmatch > 1 || dfa->has_mb_node)
680 {
681 /* Avoid overflow. */
682 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
683 {
684 err = REG_ESPACE;
685 goto free_return;
686 }
687
688 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
689 if (BE (mctx.state_log == NULL, 0))
690 {
691 err = REG_ESPACE;
692 goto free_return;
693 }
694 }
695 else
696 mctx.state_log = NULL;
697
698 match_first = start;
699 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
700 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
701
702 /* Check incrementally whether of not the input string match. */
703 incr = (range < 0) ? -1 : 1;
704 left_lim = (range < 0) ? start + range : start;
705 right_lim = (range < 0) ? start : start + range;
706 sb = dfa->mb_cur_max == 1;
707 match_kind =
708 (fastmap
709 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
710 | (range >= 0 ? 2 : 0)
711 | (t != NULL ? 1 : 0))
712 : 8);
713
714 for (;; match_first += incr)
715 {
716 err = REG_NOMATCH;
717 if (match_first < left_lim || right_lim < match_first)
718 goto free_return;
719
720 /* Advance as rapidly as possible through the string, until we
721 find a plausible place to start matching. This may be done
722 with varying efficiency, so there are various possibilities:
723 only the most common of them are specialized, in order to
724 save on code size. We use a switch statement for speed. */
725 switch (match_kind)
726 {
727 case 8:
728 /* No fastmap. */
729 break;
730
731 case 7:
732 /* Fastmap with single-byte translation, match forward. */
733 while (BE (match_first < right_lim, 1)
734 && !fastmap[t[(unsigned char) string[match_first]]])
735 ++match_first;
736 goto forward_match_found_start_or_reached_end;
737
738 case 6:
739 /* Fastmap without translation, match forward. */
740 while (BE (match_first < right_lim, 1)
741 && !fastmap[(unsigned char) string[match_first]])
742 ++match_first;
743
744 forward_match_found_start_or_reached_end:
745 if (BE (match_first == right_lim, 0))
746 {
747 ch = match_first >= length
748 ? 0 : (unsigned char) string[match_first];
749 if (!fastmap[t ? t[ch] : ch])
750 goto free_return;
751 }
752 break;
753
754 case 4:
755 case 5:
756 /* Fastmap without multi-byte translation, match backwards. */
757 while (match_first >= left_lim)
758 {
759 ch = match_first >= length
760 ? 0 : (unsigned char) string[match_first];
761 if (fastmap[t ? t[ch] : ch])
762 break;
763 --match_first;
764 }
765 if (match_first < left_lim)
766 goto free_return;
767 break;
768
769 default:
770 /* In this case, we can't determine easily the current byte,
771 since it might be a component byte of a multibyte
772 character. Then we use the constructed buffer instead. */
773 for (;;)
774 {
775 /* If MATCH_FIRST is out of the valid range, reconstruct the
776 buffers. */
777 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
778 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
779 {
780 err = re_string_reconstruct (&mctx.input, match_first,
781 eflags);
782 if (BE (err != REG_NOERROR, 0))
783 goto free_return;
784
785 offset = match_first - mctx.input.raw_mbs_idx;
786 }
787 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
788 Note that MATCH_FIRST must not be smaller than 0. */
789 ch = (match_first >= length
790 ? 0 : re_string_byte_at (&mctx.input, offset));
791 if (fastmap[ch])
792 break;
793 match_first += incr;
794 if (match_first < left_lim || match_first > right_lim)
795 {
796 err = REG_NOMATCH;
797 goto free_return;
798 }
799 }
800 break;
801 }
802
803 /* Reconstruct the buffers so that the matcher can assume that
804 the matching starts from the beginning of the buffer. */
805 err = re_string_reconstruct (&mctx.input, match_first, eflags);
806 if (BE (err != REG_NOERROR, 0))
807 goto free_return;
808
809#ifdef RE_ENABLE_I18N
810 /* Don't consider this char as a possible match start if it part,
811 yet isn't the head, of a multibyte character. */
812 if (!sb && !re_string_first_byte (&mctx.input, 0))
813 continue;
814#endif
815
816 /* It seems to be appropriate one, then use the matcher. */
817 /* We assume that the matching starts from 0. */
818 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
819 match_last = check_matching (&mctx, fl_longest_match,
820 range >= 0 ? &match_first : NULL);
821 if (match_last != -1)
822 {
823 if (BE (match_last == -2, 0))
824 {
825 err = REG_ESPACE;
826 goto free_return;
827 }
828 else
829 {
830 mctx.match_last = match_last;
831 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
832 {
833 re_dfastate_t *pstate = mctx.state_log[match_last];
834 mctx.last_node = check_halt_state_context (&mctx, pstate,
835 match_last);
836 }
837 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
838 || dfa->nbackref)
839 {
840 err = prune_impossible_nodes (&mctx);
841 if (err == REG_NOERROR)
842 break;
843 if (BE (err != REG_NOMATCH, 0))
844 goto free_return;
845 match_last = -1;
846 }
847 else
848 break; /* We found a match. */
849 }
850 }
851
852 match_ctx_clean (&mctx);
853 }
854
855#ifdef DEBUG
856 assert (match_last != -1);
857 assert (err == REG_NOERROR);
858#endif
859
860 /* Set pmatch[] if we need. */
861 if (nmatch > 0)
862 {
863 int reg_idx;
864
865 /* Initialize registers. */
866 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
867 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
868
869 /* Set the points where matching start/end. */
870 pmatch[0].rm_so = 0;
871 pmatch[0].rm_eo = mctx.match_last;
872
873 if (!preg->no_sub && nmatch > 1)
874 {
875 err = set_regs (preg, &mctx, nmatch, pmatch,
876 dfa->has_plural_match && dfa->nbackref > 0);
877 if (BE (err != REG_NOERROR, 0))
878 goto free_return;
879 }
880
881 /* At last, add the offset to the each registers, since we slided
882 the buffers so that we could assume that the matching starts
883 from 0. */
884 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
885 if (pmatch[reg_idx].rm_so != -1)
886 {
887#ifdef RE_ENABLE_I18N
888 if (BE (mctx.input.offsets_needed != 0, 0))
889 {
890 pmatch[reg_idx].rm_so =
891 (pmatch[reg_idx].rm_so == mctx.input.valid_len
892 ? mctx.input.valid_raw_len
893 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
894 pmatch[reg_idx].rm_eo =
895 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
896 ? mctx.input.valid_raw_len
897 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
898 }
899#else
900 assert (mctx.input.offsets_needed == 0);
901#endif
902 pmatch[reg_idx].rm_so += match_first;
903 pmatch[reg_idx].rm_eo += match_first;
904 }
905 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
906 {
907 pmatch[nmatch + reg_idx].rm_so = -1;
908 pmatch[nmatch + reg_idx].rm_eo = -1;
909 }
910
911 if (dfa->subexp_map)
912 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
913 if (dfa->subexp_map[reg_idx] != reg_idx)
914 {
915 pmatch[reg_idx + 1].rm_so
916 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
917 pmatch[reg_idx + 1].rm_eo
918 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
919 }
920 }
921
922 free_return:
923 re_free (mctx.state_log);
924 if (dfa->nbackref)
925 match_ctx_free (&mctx);
926 re_string_destruct (&mctx.input);
927 return err;
928}
929
930static reg_errcode_t
931internal_function __attribute_warn_unused_result__
932prune_impossible_nodes (re_match_context_t *mctx)
933{
934 const re_dfa_t *const dfa = mctx->dfa;
935 int halt_node, match_last;
936 reg_errcode_t ret;
937 re_dfastate_t **sifted_states;
938 re_dfastate_t **lim_states = NULL;
939 re_sift_context_t sctx;
940#ifdef DEBUG
941 assert (mctx->state_log != NULL);
942#endif
943 match_last = mctx->match_last;
944 halt_node = mctx->last_node;
945
946 /* Avoid overflow. */
947 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
948 return REG_ESPACE;
949
950 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
951 if (BE (sifted_states == NULL, 0))
952 {
953 ret = REG_ESPACE;
954 goto free_return;
955 }
956 if (dfa->nbackref)
957 {
958 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
959 if (BE (lim_states == NULL, 0))
960 {
961 ret = REG_ESPACE;
962 goto free_return;
963 }
964 while (1)
965 {
966 memset (lim_states, '\0',
967 sizeof (re_dfastate_t *) * (match_last + 1));
968 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
969 match_last);
970 ret = sift_states_backward (mctx, &sctx);
971 re_node_set_free (&sctx.limits);
972 if (BE (ret != REG_NOERROR, 0))
973 goto free_return;
974 if (sifted_states[0] != NULL || lim_states[0] != NULL)
975 break;
976 do
977 {
978 --match_last;
979 if (match_last < 0)
980 {
981 ret = REG_NOMATCH;
982 goto free_return;
983 }
984 } while (mctx->state_log[match_last] == NULL
985 || !mctx->state_log[match_last]->halt);
986 halt_node = check_halt_state_context (mctx,
987 mctx->state_log[match_last],
988 match_last);
989 }
990 ret = merge_state_array (dfa, sifted_states, lim_states,
991 match_last + 1);
992 re_free (lim_states);
993 lim_states = NULL;
994 if (BE (ret != REG_NOERROR, 0))
995 goto free_return;
996 }
997 else
998 {
999 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1000 ret = sift_states_backward (mctx, &sctx);
1001 re_node_set_free (&sctx.limits);
1002 if (BE (ret != REG_NOERROR, 0))
1003 goto free_return;
1004 if (sifted_states[0] == NULL)
1005 {
1006 ret = REG_NOMATCH;
1007 goto free_return;
1008 }
1009 }
1010 re_free (mctx->state_log);
1011 mctx->state_log = sifted_states;
1012 sifted_states = NULL;
1013 mctx->last_node = halt_node;
1014 mctx->match_last = match_last;
1015 ret = REG_NOERROR;
1016 free_return:
1017 re_free (sifted_states);
1018 re_free (lim_states);
1019 return ret;
1020}
1021
1022/* Acquire an initial state and return it.
1023 We must select appropriate initial state depending on the context,
1024 since initial states may have constraints like "\<", "^", etc.. */
1025
1026static inline re_dfastate_t *
1027__attribute ((always_inline)) internal_function
1028acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1029 int idx)
1030{
1031 const re_dfa_t *const dfa = mctx->dfa;
1032 if (dfa->init_state->has_constraint)
1033 {
1034 unsigned int context;
1035 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1036 if (IS_WORD_CONTEXT (context))
1037 return dfa->init_state_word;
1038 else if (IS_ORDINARY_CONTEXT (context))
1039 return dfa->init_state;
1040 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1041 return dfa->init_state_begbuf;
1042 else if (IS_NEWLINE_CONTEXT (context))
1043 return dfa->init_state_nl;
1044 else if (IS_BEGBUF_CONTEXT (context))
1045 {
1046 /* It is relatively rare case, then calculate on demand. */
1047 return re_acquire_state_context (err, dfa,
1048 dfa->init_state->entrance_nodes,
1049 context);
1050 }
1051 else
1052 /* Must not happen? */
1053 return dfa->init_state;
1054 }
1055 else
1056 return dfa->init_state;
1057}
1058
1059/* Check whether the regular expression match input string INPUT or not,
1060 and return the index where the matching end, return -1 if not match,
1061 or return -2 in case of an error.
1062 FL_LONGEST_MATCH means we want the POSIX longest matching.
1063 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1064 next place where we may want to try matching.
1065 Note that the matcher assume that the maching starts from the current
1066 index of the buffer. */
1067
1068static int
1069internal_function __attribute_warn_unused_result__
1070check_matching (re_match_context_t *mctx, int fl_longest_match,
1071 int *p_match_first)
1072{
1073 const re_dfa_t *const dfa = mctx->dfa;
1074 reg_errcode_t err;
1075 int match = 0;
1076 int match_last = -1;
1077 int cur_str_idx = re_string_cur_idx (&mctx->input);
1078 re_dfastate_t *cur_state;
1079 int at_init_state = p_match_first != NULL;
1080 int next_start_idx = cur_str_idx;
1081
1082 err = REG_NOERROR;
1083 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1084 /* An initial state must not be NULL (invalid). */
1085 if (BE (cur_state == NULL, 0))
1086 {
1087 assert (err == REG_ESPACE);
1088 return -2;
1089 }
1090
1091 if (mctx->state_log != NULL)
1092 {
1093 mctx->state_log[cur_str_idx] = cur_state;
1094
1095 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1096 later. E.g. Processing back references. */
1097 if (BE (dfa->nbackref, 0))
1098 {
1099 at_init_state = 0;
1100 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1101 if (BE (err != REG_NOERROR, 0))
1102 return err;
1103
1104 if (cur_state->has_backref)
1105 {
1106 err = transit_state_bkref (mctx, &cur_state->nodes);
1107 if (BE (err != REG_NOERROR, 0))
1108 return err;
1109 }
1110 }
1111 }
1112
1113 /* If the RE accepts NULL string. */
1114 if (BE (cur_state->halt, 0))
1115 {
1116 if (!cur_state->has_constraint
1117 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1118 {
1119 if (!fl_longest_match)
1120 return cur_str_idx;
1121 else
1122 {
1123 match_last = cur_str_idx;
1124 match = 1;
1125 }
1126 }
1127 }
1128
1129 while (!re_string_eoi (&mctx->input))
1130 {
1131 re_dfastate_t *old_state = cur_state;
1132 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1133
1134 if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1135 && mctx->input.bufs_len < mctx->input.len)
1136 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1137 && mctx->input.valid_len < mctx->input.len))
1138 {
1139 err = extend_buffers (mctx, next_char_idx + 1);
1140 if (BE (err != REG_NOERROR, 0))
1141 {
1142 assert (err == REG_ESPACE);
1143 return -2;
1144 }
1145 }
1146
1147 cur_state = transit_state (&err, mctx, cur_state);
1148 if (mctx->state_log != NULL)
1149 cur_state = merge_state_with_log (&err, mctx, cur_state);
1150
1151 if (cur_state == NULL)
1152 {
1153 /* Reached the invalid state or an error. Try to recover a valid
1154 state using the state log, if available and if we have not
1155 already found a valid (even if not the longest) match. */
1156 if (BE (err != REG_NOERROR, 0))
1157 return -2;
1158
1159 if (mctx->state_log == NULL
1160 || (match && !fl_longest_match)
1161 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1162 break;
1163 }
1164
1165 if (BE (at_init_state, 0))
1166 {
1167 if (old_state == cur_state)
1168 next_start_idx = next_char_idx;
1169 else
1170 at_init_state = 0;
1171 }
1172
1173 if (cur_state->halt)
1174 {
1175 /* Reached a halt state.
1176 Check the halt state can satisfy the current context. */
1177 if (!cur_state->has_constraint
1178 || check_halt_state_context (mctx, cur_state,
1179 re_string_cur_idx (&mctx->input)))
1180 {
1181 /* We found an appropriate halt state. */
1182 match_last = re_string_cur_idx (&mctx->input);
1183 match = 1;
1184
1185 /* We found a match, do not modify match_first below. */
1186 p_match_first = NULL;
1187 if (!fl_longest_match)
1188 break;
1189 }
1190 }
1191 }
1192
1193 if (p_match_first)
1194 *p_match_first += next_start_idx;
1195
1196 return match_last;
1197}
1198
1199/* Check NODE match the current context. */
1200
1201static int
1202internal_function
1203check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1204{
1205 re_token_type_t type = dfa->nodes[node].type;
1206 unsigned int constraint = dfa->nodes[node].constraint;
1207 if (type != END_OF_RE)
1208 return 0;
1209 if (!constraint)
1210 return 1;
1211 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1212 return 0;
1213 return 1;
1214}
1215
1216/* Check the halt state STATE match the current context.
1217 Return 0 if not match, if the node, STATE has, is a halt node and
1218 match the context, return the node. */
1219
1220static int
1221internal_function
1222check_halt_state_context (const re_match_context_t *mctx,
1223 const re_dfastate_t *state, int idx)
1224{
1225 int i;
1226 unsigned int context;
1227#ifdef DEBUG
1228 assert (state->halt);
1229#endif
1230 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1231 for (i = 0; i < state->nodes.nelem; ++i)
1232 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1233 return state->nodes.elems[i];
1234 return 0;
1235}
1236
1237/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1238 corresponding to the DFA).
1239 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1240 of errors. */
1241
1242static int
1243internal_function
1244proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1245 int *pidx, int node, re_node_set *eps_via_nodes,
1246 struct re_fail_stack_t *fs)
1247{
1248 const re_dfa_t *const dfa = mctx->dfa;
1249 int i, err;
1250 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1251 {
1252 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1253 re_node_set *edests = &dfa->edests[node];
1254 int dest_node;
1255 err = re_node_set_insert (eps_via_nodes, node);
1256 if (BE (err < 0, 0))
1257 return -2;
1258 /* Pick up a valid destination, or return -1 if none is found. */
1259 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1260 {
1261 int candidate = edests->elems[i];
1262 if (!re_node_set_contains (cur_nodes, candidate))
1263 continue;
1264 if (dest_node == -1)
1265 dest_node = candidate;
1266
1267 else
1268 {
1269 /* In order to avoid infinite loop like "(a*)*", return the second
1270 epsilon-transition if the first was already considered. */
1271 if (re_node_set_contains (eps_via_nodes, dest_node))
1272 return candidate;
1273
1274 /* Otherwise, push the second epsilon-transition on the fail stack. */
1275 else if (fs != NULL
1276 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1277 eps_via_nodes))
1278 return -2;
1279
1280 /* We know we are going to exit. */
1281 break;
1282 }
1283 }
1284 return dest_node;
1285 }
1286 else
1287 {
1288 int naccepted = 0;
1289 re_token_type_t type = dfa->nodes[node].type;
1290
1291#ifdef RE_ENABLE_I18N
1292 if (dfa->nodes[node].accept_mb)
1293 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1294 else
1295#endif /* RE_ENABLE_I18N */
1296 if (type == OP_BACK_REF)
1297 {
1298 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1299 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1300 if (fs != NULL)
1301 {
1302 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1303 return -1;
1304 else if (naccepted)
1305 {
1306 char *buf = (char *) re_string_get_buffer (&mctx->input);
xf.li84027492024-04-09 00:17:51 -07001307 if (mctx->input.valid_len - *pidx < naccepted
1308 || (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1309 naccepted)
1310 != 0))
xf.libdd93d52023-05-12 07:10:14 -07001311 return -1;
1312 }
1313 }
1314
1315 if (naccepted == 0)
1316 {
1317 int dest_node;
1318 err = re_node_set_insert (eps_via_nodes, node);
1319 if (BE (err < 0, 0))
1320 return -2;
1321 dest_node = dfa->edests[node].elems[0];
1322 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1323 dest_node))
1324 return dest_node;
1325 }
1326 }
1327
1328 if (naccepted != 0
1329 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1330 {
1331 int dest_node = dfa->nexts[node];
1332 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1333 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1334 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1335 dest_node)))
1336 return -1;
1337 re_node_set_empty (eps_via_nodes);
1338 return dest_node;
1339 }
1340 }
1341 return -1;
1342}
1343
1344static reg_errcode_t
1345internal_function __attribute_warn_unused_result__
1346push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1347 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1348{
1349 reg_errcode_t err;
1350 int num = fs->num++;
1351 if (fs->num == fs->alloc)
1352 {
1353 struct re_fail_stack_ent_t *new_array;
1354 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1355 * fs->alloc * 2));
1356 if (new_array == NULL)
1357 return REG_ESPACE;
1358 fs->alloc *= 2;
1359 fs->stack = new_array;
1360 }
1361 fs->stack[num].idx = str_idx;
1362 fs->stack[num].node = dest_node;
1363 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1364 if (fs->stack[num].regs == NULL)
1365 return REG_ESPACE;
1366 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1367 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1368 return err;
1369}
1370
1371static int
1372internal_function
1373pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1374 regmatch_t *regs, re_node_set *eps_via_nodes)
1375{
1376 int num = --fs->num;
1377 assert (num >= 0);
1378 *pidx = fs->stack[num].idx;
1379 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1380 re_node_set_free (eps_via_nodes);
1381 re_free (fs->stack[num].regs);
1382 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1383 return fs->stack[num].node;
1384}
1385
1386/* Set the positions where the subexpressions are starts/ends to registers
1387 PMATCH.
1388 Note: We assume that pmatch[0] is already set, and
1389 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1390
1391static reg_errcode_t
1392internal_function __attribute_warn_unused_result__
1393set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1394 regmatch_t *pmatch, int fl_backtrack)
1395{
1396 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1397 int idx, cur_node;
1398 re_node_set eps_via_nodes;
1399 struct re_fail_stack_t *fs;
1400 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1401 regmatch_t *prev_idx_match;
1402 int prev_idx_match_malloced = 0;
1403
1404#ifdef DEBUG
1405 assert (nmatch > 1);
1406 assert (mctx->state_log != NULL);
1407#endif
1408 if (fl_backtrack)
1409 {
1410 fs = &fs_body;
1411 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1412 if (fs->stack == NULL)
1413 return REG_ESPACE;
1414 }
1415 else
1416 fs = NULL;
1417
1418 cur_node = dfa->init_node;
1419 re_node_set_init_empty (&eps_via_nodes);
1420
1421 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1422 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1423 else
1424 {
1425 prev_idx_match = re_malloc (regmatch_t, nmatch);
1426 if (prev_idx_match == NULL)
1427 {
1428 free_fail_stack_return (fs);
1429 return REG_ESPACE;
1430 }
1431 prev_idx_match_malloced = 1;
1432 }
1433 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1434
1435 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1436 {
1437 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1438
1439 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1440 {
1441 int reg_idx;
1442 if (fs)
1443 {
1444 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1445 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1446 break;
1447 if (reg_idx == nmatch)
1448 {
1449 re_node_set_free (&eps_via_nodes);
1450 if (prev_idx_match_malloced)
1451 re_free (prev_idx_match);
1452 return free_fail_stack_return (fs);
1453 }
1454 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1455 &eps_via_nodes);
1456 }
1457 else
1458 {
1459 re_node_set_free (&eps_via_nodes);
1460 if (prev_idx_match_malloced)
1461 re_free (prev_idx_match);
1462 return REG_NOERROR;
1463 }
1464 }
1465
1466 /* Proceed to next node. */
1467 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1468 &eps_via_nodes, fs);
1469
1470 if (BE (cur_node < 0, 0))
1471 {
1472 if (BE (cur_node == -2, 0))
1473 {
1474 re_node_set_free (&eps_via_nodes);
1475 if (prev_idx_match_malloced)
1476 re_free (prev_idx_match);
1477 free_fail_stack_return (fs);
1478 return REG_ESPACE;
1479 }
1480 if (fs)
1481 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1482 &eps_via_nodes);
1483 else
1484 {
1485 re_node_set_free (&eps_via_nodes);
1486 if (prev_idx_match_malloced)
1487 re_free (prev_idx_match);
1488 return REG_NOMATCH;
1489 }
1490 }
1491 }
1492 re_node_set_free (&eps_via_nodes);
1493 if (prev_idx_match_malloced)
1494 re_free (prev_idx_match);
1495 return free_fail_stack_return (fs);
1496}
1497
1498static reg_errcode_t
1499internal_function
1500free_fail_stack_return (struct re_fail_stack_t *fs)
1501{
1502 if (fs)
1503 {
1504 int fs_idx;
1505 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1506 {
1507 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1508 re_free (fs->stack[fs_idx].regs);
1509 }
1510 re_free (fs->stack);
1511 }
1512 return REG_NOERROR;
1513}
1514
1515static void
1516internal_function
1517update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1518 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1519{
1520 int type = dfa->nodes[cur_node].type;
1521 if (type == OP_OPEN_SUBEXP)
1522 {
1523 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1524
1525 /* We are at the first node of this sub expression. */
1526 if (reg_num < nmatch)
1527 {
1528 pmatch[reg_num].rm_so = cur_idx;
1529 pmatch[reg_num].rm_eo = -1;
1530 }
1531 }
1532 else if (type == OP_CLOSE_SUBEXP)
1533 {
1534 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1535 if (reg_num < nmatch)
1536 {
1537 /* We are at the last node of this sub expression. */
1538 if (pmatch[reg_num].rm_so < cur_idx)
1539 {
1540 pmatch[reg_num].rm_eo = cur_idx;
1541 /* This is a non-empty match or we are not inside an optional
1542 subexpression. Accept this right away. */
1543 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1544 }
1545 else
1546 {
1547 if (dfa->nodes[cur_node].opt_subexp
1548 && prev_idx_match[reg_num].rm_so != -1)
1549 /* We transited through an empty match for an optional
1550 subexpression, like (a?)*, and this is not the subexp's
1551 first match. Copy back the old content of the registers
1552 so that matches of an inner subexpression are undone as
1553 well, like in ((a?))*. */
1554 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1555 else
1556 /* We completed a subexpression, but it may be part of
1557 an optional one, so do not update PREV_IDX_MATCH. */
1558 pmatch[reg_num].rm_eo = cur_idx;
1559 }
1560 }
1561 }
1562}
1563
1564/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1565 and sift the nodes in each states according to the following rules.
1566 Updated state_log will be wrote to STATE_LOG.
1567
1568 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1569 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1570 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1571 the LAST_NODE, we throw away the node `a'.
1572 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1573 string `s' and transit to `b':
1574 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1575 away the node `a'.
1576 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1577 thrown away, we throw away the node `a'.
1578 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1579 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1580 node `a'.
1581 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1582 we throw away the node `a'. */
1583
1584#define STATE_NODE_CONTAINS(state,node) \
1585 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1586
1587static reg_errcode_t
1588internal_function
1589sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1590{
1591 reg_errcode_t err;
1592 int null_cnt = 0;
1593 int str_idx = sctx->last_str_idx;
1594 re_node_set cur_dest;
1595
1596#ifdef DEBUG
1597 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1598#endif
1599
1600 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1601 transit to the last_node and the last_node itself. */
1602 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1603 if (BE (err != REG_NOERROR, 0))
1604 return err;
1605 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1606 if (BE (err != REG_NOERROR, 0))
1607 goto free_return;
1608
1609 /* Then check each states in the state_log. */
1610 while (str_idx > 0)
1611 {
1612 /* Update counters. */
1613 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1614 if (null_cnt > mctx->max_mb_elem_len)
1615 {
1616 memset (sctx->sifted_states, '\0',
1617 sizeof (re_dfastate_t *) * str_idx);
1618 re_node_set_free (&cur_dest);
1619 return REG_NOERROR;
1620 }
1621 re_node_set_empty (&cur_dest);
1622 --str_idx;
1623
1624 if (mctx->state_log[str_idx])
1625 {
1626 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1627 if (BE (err != REG_NOERROR, 0))
1628 goto free_return;
1629 }
1630
1631 /* Add all the nodes which satisfy the following conditions:
1632 - It can epsilon transit to a node in CUR_DEST.
1633 - It is in CUR_SRC.
1634 And update state_log. */
1635 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1636 if (BE (err != REG_NOERROR, 0))
1637 goto free_return;
1638 }
1639 err = REG_NOERROR;
1640 free_return:
1641 re_node_set_free (&cur_dest);
1642 return err;
1643}
1644
1645static reg_errcode_t
1646internal_function __attribute_warn_unused_result__
1647build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1648 int str_idx, re_node_set *cur_dest)
1649{
1650 const re_dfa_t *const dfa = mctx->dfa;
1651 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1652 int i;
1653
1654 /* Then build the next sifted state.
1655 We build the next sifted state on `cur_dest', and update
1656 `sifted_states[str_idx]' with `cur_dest'.
1657 Note:
1658 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1659 `cur_src' points the node_set of the old `state_log[str_idx]'
1660 (with the epsilon nodes pre-filtered out). */
1661 for (i = 0; i < cur_src->nelem; i++)
1662 {
1663 int prev_node = cur_src->elems[i];
1664 int naccepted = 0;
1665 int ret;
1666
1667#ifdef DEBUG
1668 re_token_type_t type = dfa->nodes[prev_node].type;
1669 assert (!IS_EPSILON_NODE (type));
1670#endif
1671#ifdef RE_ENABLE_I18N
1672 /* If the node may accept `multi byte'. */
1673 if (dfa->nodes[prev_node].accept_mb)
1674 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1675 str_idx, sctx->last_str_idx);
1676#endif /* RE_ENABLE_I18N */
1677
1678 /* We don't check backreferences here.
1679 See update_cur_sifted_state(). */
1680 if (!naccepted
1681 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1682 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1683 dfa->nexts[prev_node]))
1684 naccepted = 1;
1685
1686 if (naccepted == 0)
1687 continue;
1688
1689 if (sctx->limits.nelem)
1690 {
1691 int to_idx = str_idx + naccepted;
1692 if (check_dst_limits (mctx, &sctx->limits,
1693 dfa->nexts[prev_node], to_idx,
1694 prev_node, str_idx))
1695 continue;
1696 }
1697 ret = re_node_set_insert (cur_dest, prev_node);
1698 if (BE (ret == -1, 0))
1699 return REG_ESPACE;
1700 }
1701
1702 return REG_NOERROR;
1703}
1704
1705/* Helper functions. */
1706
1707static reg_errcode_t
1708internal_function
1709clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1710{
1711 int top = mctx->state_log_top;
1712
1713 if ((next_state_log_idx >= mctx->input.bufs_len
1714 && mctx->input.bufs_len < mctx->input.len)
1715 || (next_state_log_idx >= mctx->input.valid_len
1716 && mctx->input.valid_len < mctx->input.len))
1717 {
1718 reg_errcode_t err;
1719 err = extend_buffers (mctx, next_state_log_idx + 1);
1720 if (BE (err != REG_NOERROR, 0))
1721 return err;
1722 }
1723
1724 if (top < next_state_log_idx)
1725 {
1726 memset (mctx->state_log + top + 1, '\0',
1727 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1728 mctx->state_log_top = next_state_log_idx;
1729 }
1730 return REG_NOERROR;
1731}
1732
1733static reg_errcode_t
1734internal_function
1735merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1736 re_dfastate_t **src, int num)
1737{
1738 int st_idx;
1739 reg_errcode_t err;
1740 for (st_idx = 0; st_idx < num; ++st_idx)
1741 {
1742 if (dst[st_idx] == NULL)
1743 dst[st_idx] = src[st_idx];
1744 else if (src[st_idx] != NULL)
1745 {
1746 re_node_set merged_set;
1747 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1748 &src[st_idx]->nodes);
1749 if (BE (err != REG_NOERROR, 0))
1750 return err;
1751 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1752 re_node_set_free (&merged_set);
1753 if (BE (err != REG_NOERROR, 0))
1754 return err;
1755 }
1756 }
1757 return REG_NOERROR;
1758}
1759
1760static reg_errcode_t
1761internal_function
1762update_cur_sifted_state (const re_match_context_t *mctx,
1763 re_sift_context_t *sctx, int str_idx,
1764 re_node_set *dest_nodes)
1765{
1766 const re_dfa_t *const dfa = mctx->dfa;
1767 reg_errcode_t err = REG_NOERROR;
1768 const re_node_set *candidates;
1769 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1770 : &mctx->state_log[str_idx]->nodes);
1771
1772 if (dest_nodes->nelem == 0)
1773 sctx->sifted_states[str_idx] = NULL;
1774 else
1775 {
1776 if (candidates)
1777 {
1778 /* At first, add the nodes which can epsilon transit to a node in
1779 DEST_NODE. */
1780 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1781 if (BE (err != REG_NOERROR, 0))
1782 return err;
1783
1784 /* Then, check the limitations in the current sift_context. */
1785 if (sctx->limits.nelem)
1786 {
1787 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1788 mctx->bkref_ents, str_idx);
1789 if (BE (err != REG_NOERROR, 0))
1790 return err;
1791 }
1792 }
1793
1794 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1795 if (BE (err != REG_NOERROR, 0))
1796 return err;
1797 }
1798
1799 if (candidates && mctx->state_log[str_idx]->has_backref)
1800 {
1801 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1802 if (BE (err != REG_NOERROR, 0))
1803 return err;
1804 }
1805 return REG_NOERROR;
1806}
1807
1808static reg_errcode_t
1809internal_function __attribute_warn_unused_result__
1810add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1811 const re_node_set *candidates)
1812{
1813 reg_errcode_t err = REG_NOERROR;
1814 int i;
1815
1816 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1817 if (BE (err != REG_NOERROR, 0))
1818 return err;
1819
1820 if (!state->inveclosure.alloc)
1821 {
1822 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1823 if (BE (err != REG_NOERROR, 0))
1824 return REG_ESPACE;
1825 for (i = 0; i < dest_nodes->nelem; i++)
1826 {
1827 err = re_node_set_merge (&state->inveclosure,
1828 dfa->inveclosures + dest_nodes->elems[i]);
1829 if (BE (err != REG_NOERROR, 0))
1830 return REG_ESPACE;
1831 }
1832 }
1833 return re_node_set_add_intersect (dest_nodes, candidates,
1834 &state->inveclosure);
1835}
1836
1837static reg_errcode_t
1838internal_function
1839sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1840 const re_node_set *candidates)
1841{
1842 int ecl_idx;
1843 reg_errcode_t err;
1844 re_node_set *inv_eclosure = dfa->inveclosures + node;
1845 re_node_set except_nodes;
1846 re_node_set_init_empty (&except_nodes);
1847 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1848 {
1849 int cur_node = inv_eclosure->elems[ecl_idx];
1850 if (cur_node == node)
1851 continue;
1852 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1853 {
1854 int edst1 = dfa->edests[cur_node].elems[0];
1855 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1856 ? dfa->edests[cur_node].elems[1] : -1);
1857 if ((!re_node_set_contains (inv_eclosure, edst1)
1858 && re_node_set_contains (dest_nodes, edst1))
1859 || (edst2 > 0
1860 && !re_node_set_contains (inv_eclosure, edst2)
1861 && re_node_set_contains (dest_nodes, edst2)))
1862 {
1863 err = re_node_set_add_intersect (&except_nodes, candidates,
1864 dfa->inveclosures + cur_node);
1865 if (BE (err != REG_NOERROR, 0))
1866 {
1867 re_node_set_free (&except_nodes);
1868 return err;
1869 }
1870 }
1871 }
1872 }
1873 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1874 {
1875 int cur_node = inv_eclosure->elems[ecl_idx];
1876 if (!re_node_set_contains (&except_nodes, cur_node))
1877 {
1878 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1879 re_node_set_remove_at (dest_nodes, idx);
1880 }
1881 }
1882 re_node_set_free (&except_nodes);
1883 return REG_NOERROR;
1884}
1885
1886static int
1887internal_function
1888check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1889 int dst_node, int dst_idx, int src_node, int src_idx)
1890{
1891 const re_dfa_t *const dfa = mctx->dfa;
1892 int lim_idx, src_pos, dst_pos;
1893
1894 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1895 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1896 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1897 {
1898 int subexp_idx;
1899 struct re_backref_cache_entry *ent;
1900 ent = mctx->bkref_ents + limits->elems[lim_idx];
1901 subexp_idx = dfa->nodes[ent->node].opr.idx;
1902
1903 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1904 subexp_idx, dst_node, dst_idx,
1905 dst_bkref_idx);
1906 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1907 subexp_idx, src_node, src_idx,
1908 src_bkref_idx);
1909
1910 /* In case of:
1911 <src> <dst> ( <subexp> )
1912 ( <subexp> ) <src> <dst>
1913 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1914 if (src_pos == dst_pos)
1915 continue; /* This is unrelated limitation. */
1916 else
1917 return 1;
1918 }
1919 return 0;
1920}
1921
1922static int
1923internal_function
1924check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1925 int subexp_idx, int from_node, int bkref_idx)
1926{
1927 const re_dfa_t *const dfa = mctx->dfa;
1928 const re_node_set *eclosures = dfa->eclosures + from_node;
1929 int node_idx;
1930
1931 /* Else, we are on the boundary: examine the nodes on the epsilon
1932 closure. */
1933 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1934 {
1935 int node = eclosures->elems[node_idx];
1936 switch (dfa->nodes[node].type)
1937 {
1938 case OP_BACK_REF:
1939 if (bkref_idx != -1)
1940 {
1941 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1942 do
1943 {
1944 int dst, cpos;
1945
1946 if (ent->node != node)
1947 continue;
1948
1949 if (subexp_idx < BITSET_WORD_BITS
1950 && !(ent->eps_reachable_subexps_map
1951 & ((bitset_word_t) 1 << subexp_idx)))
1952 continue;
1953
1954 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1955 OP_CLOSE_SUBEXP cases below. But, if the
1956 destination node is the same node as the source
1957 node, don't recurse because it would cause an
1958 infinite loop: a regex that exhibits this behavior
1959 is ()\1*\1* */
1960 dst = dfa->edests[node].elems[0];
1961 if (dst == from_node)
1962 {
1963 if (boundaries & 1)
1964 return -1;
1965 else /* if (boundaries & 2) */
1966 return 0;
1967 }
1968
1969 cpos =
1970 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1971 dst, bkref_idx);
1972 if (cpos == -1 /* && (boundaries & 1) */)
1973 return -1;
1974 if (cpos == 0 && (boundaries & 2))
1975 return 0;
1976
1977 if (subexp_idx < BITSET_WORD_BITS)
1978 ent->eps_reachable_subexps_map
1979 &= ~((bitset_word_t) 1 << subexp_idx);
1980 }
1981 while (ent++->more);
1982 }
1983 break;
1984
1985 case OP_OPEN_SUBEXP:
1986 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1987 return -1;
1988 break;
1989
1990 case OP_CLOSE_SUBEXP:
1991 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1992 return 0;
1993 break;
1994
1995 default:
1996 break;
1997 }
1998 }
1999
2000 return (boundaries & 2) ? 1 : 0;
2001}
2002
2003static int
2004internal_function
2005check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2006 int subexp_idx, int from_node, int str_idx,
2007 int bkref_idx)
2008{
2009 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2010 int boundaries;
2011
2012 /* If we are outside the range of the subexpression, return -1 or 1. */
2013 if (str_idx < lim->subexp_from)
2014 return -1;
2015
2016 if (lim->subexp_to < str_idx)
2017 return 1;
2018
2019 /* If we are within the subexpression, return 0. */
2020 boundaries = (str_idx == lim->subexp_from);
2021 boundaries |= (str_idx == lim->subexp_to) << 1;
2022 if (boundaries == 0)
2023 return 0;
2024
2025 /* Else, examine epsilon closure. */
2026 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2027 from_node, bkref_idx);
2028}
2029
2030/* Check the limitations of sub expressions LIMITS, and remove the nodes
2031 which are against limitations from DEST_NODES. */
2032
2033static reg_errcode_t
2034internal_function
2035check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2036 const re_node_set *candidates, re_node_set *limits,
2037 struct re_backref_cache_entry *bkref_ents, int str_idx)
2038{
2039 reg_errcode_t err;
2040 int node_idx, lim_idx;
2041
2042 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2043 {
2044 int subexp_idx;
2045 struct re_backref_cache_entry *ent;
2046 ent = bkref_ents + limits->elems[lim_idx];
2047
2048 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2049 continue; /* This is unrelated limitation. */
2050
2051 subexp_idx = dfa->nodes[ent->node].opr.idx;
2052 if (ent->subexp_to == str_idx)
2053 {
2054 int ops_node = -1;
2055 int cls_node = -1;
2056 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2057 {
2058 int node = dest_nodes->elems[node_idx];
2059 re_token_type_t type = dfa->nodes[node].type;
2060 if (type == OP_OPEN_SUBEXP
2061 && subexp_idx == dfa->nodes[node].opr.idx)
2062 ops_node = node;
2063 else if (type == OP_CLOSE_SUBEXP
2064 && subexp_idx == dfa->nodes[node].opr.idx)
2065 cls_node = node;
2066 }
2067
2068 /* Check the limitation of the open subexpression. */
2069 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2070 if (ops_node >= 0)
2071 {
2072 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2073 candidates);
2074 if (BE (err != REG_NOERROR, 0))
2075 return err;
2076 }
2077
2078 /* Check the limitation of the close subexpression. */
2079 if (cls_node >= 0)
2080 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2081 {
2082 int node = dest_nodes->elems[node_idx];
2083 if (!re_node_set_contains (dfa->inveclosures + node,
2084 cls_node)
2085 && !re_node_set_contains (dfa->eclosures + node,
2086 cls_node))
2087 {
2088 /* It is against this limitation.
2089 Remove it form the current sifted state. */
2090 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2091 candidates);
2092 if (BE (err != REG_NOERROR, 0))
2093 return err;
2094 --node_idx;
2095 }
2096 }
2097 }
2098 else /* (ent->subexp_to != str_idx) */
2099 {
2100 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2101 {
2102 int node = dest_nodes->elems[node_idx];
2103 re_token_type_t type = dfa->nodes[node].type;
2104 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2105 {
2106 if (subexp_idx != dfa->nodes[node].opr.idx)
2107 continue;
2108 /* It is against this limitation.
2109 Remove it form the current sifted state. */
2110 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2111 candidates);
2112 if (BE (err != REG_NOERROR, 0))
2113 return err;
2114 }
2115 }
2116 }
2117 }
2118 return REG_NOERROR;
2119}
2120
2121static reg_errcode_t
2122internal_function __attribute_warn_unused_result__
2123sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2124 int str_idx, const re_node_set *candidates)
2125{
2126 const re_dfa_t *const dfa = mctx->dfa;
2127 reg_errcode_t err;
2128 int node_idx, node;
2129 re_sift_context_t local_sctx;
2130 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2131
2132 if (first_idx == -1)
2133 return REG_NOERROR;
2134
2135 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2136
2137 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2138 {
2139 int enabled_idx;
2140 re_token_type_t type;
2141 struct re_backref_cache_entry *entry;
2142 node = candidates->elems[node_idx];
2143 type = dfa->nodes[node].type;
2144 /* Avoid infinite loop for the REs like "()\1+". */
2145 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2146 continue;
2147 if (type != OP_BACK_REF)
2148 continue;
2149
2150 entry = mctx->bkref_ents + first_idx;
2151 enabled_idx = first_idx;
2152 do
2153 {
2154 int subexp_len;
2155 int to_idx;
2156 int dst_node;
2157 int ret;
2158 re_dfastate_t *cur_state;
2159
2160 if (entry->node != node)
2161 continue;
2162 subexp_len = entry->subexp_to - entry->subexp_from;
2163 to_idx = str_idx + subexp_len;
2164 dst_node = (subexp_len ? dfa->nexts[node]
2165 : dfa->edests[node].elems[0]);
2166
2167 if (to_idx > sctx->last_str_idx
2168 || sctx->sifted_states[to_idx] == NULL
2169 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2170 || check_dst_limits (mctx, &sctx->limits, node,
2171 str_idx, dst_node, to_idx))
2172 continue;
2173
2174 if (local_sctx.sifted_states == NULL)
2175 {
2176 local_sctx = *sctx;
2177 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2178 if (BE (err != REG_NOERROR, 0))
2179 goto free_return;
2180 }
2181 local_sctx.last_node = node;
2182 local_sctx.last_str_idx = str_idx;
2183 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2184 if (BE (ret < 0, 0))
2185 {
2186 err = REG_ESPACE;
2187 goto free_return;
2188 }
2189 cur_state = local_sctx.sifted_states[str_idx];
2190 err = sift_states_backward (mctx, &local_sctx);
2191 if (BE (err != REG_NOERROR, 0))
2192 goto free_return;
2193 if (sctx->limited_states != NULL)
2194 {
2195 err = merge_state_array (dfa, sctx->limited_states,
2196 local_sctx.sifted_states,
2197 str_idx + 1);
2198 if (BE (err != REG_NOERROR, 0))
2199 goto free_return;
2200 }
2201 local_sctx.sifted_states[str_idx] = cur_state;
2202 re_node_set_remove (&local_sctx.limits, enabled_idx);
2203
2204 /* mctx->bkref_ents may have changed, reload the pointer. */
2205 entry = mctx->bkref_ents + enabled_idx;
2206 }
2207 while (enabled_idx++, entry++->more);
2208 }
2209 err = REG_NOERROR;
2210 free_return:
2211 if (local_sctx.sifted_states != NULL)
2212 {
2213 re_node_set_free (&local_sctx.limits);
2214 }
2215
2216 return err;
2217}
2218
2219
2220#ifdef RE_ENABLE_I18N
2221static int
2222internal_function
2223sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2224 int node_idx, int str_idx, int max_str_idx)
2225{
2226 const re_dfa_t *const dfa = mctx->dfa;
2227 int naccepted;
2228 /* Check the node can accept `multi byte'. */
2229 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2230 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2231 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2232 dfa->nexts[node_idx]))
2233 /* The node can't accept the `multi byte', or the
2234 destination was already thrown away, then the node
2235 could't accept the current input `multi byte'. */
2236 naccepted = 0;
2237 /* Otherwise, it is sure that the node could accept
2238 `naccepted' bytes input. */
2239 return naccepted;
2240}
2241#endif /* RE_ENABLE_I18N */
2242
2243
2244/* Functions for state transition. */
2245
2246/* Return the next state to which the current state STATE will transit by
2247 accepting the current input byte, and update STATE_LOG if necessary.
2248 If STATE can accept a multibyte char/collating element/back reference
2249 update the destination of STATE_LOG. */
2250
2251static re_dfastate_t *
2252internal_function __attribute_warn_unused_result__
2253transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2254 re_dfastate_t *state)
2255{
2256 re_dfastate_t **trtable;
2257 unsigned char ch;
2258
2259#ifdef RE_ENABLE_I18N
2260 /* If the current state can accept multibyte. */
2261 if (BE (state->accept_mb, 0))
2262 {
2263 *err = transit_state_mb (mctx, state);
2264 if (BE (*err != REG_NOERROR, 0))
2265 return NULL;
2266 }
2267#endif /* RE_ENABLE_I18N */
2268
2269 /* Then decide the next state with the single byte. */
2270#if 0
2271 if (0)
2272 /* don't use transition table */
2273 return transit_state_sb (err, mctx, state);
2274#endif
2275
2276 /* Use transition table */
2277 ch = re_string_fetch_byte (&mctx->input);
2278 for (;;)
2279 {
2280 trtable = state->trtable;
2281 if (BE (trtable != NULL, 1))
2282 return trtable[ch];
2283
2284 trtable = state->word_trtable;
2285 if (BE (trtable != NULL, 1))
2286 {
2287 unsigned int context;
2288 context
2289 = re_string_context_at (&mctx->input,
2290 re_string_cur_idx (&mctx->input) - 1,
2291 mctx->eflags);
2292 if (IS_WORD_CONTEXT (context))
2293 return trtable[ch + SBC_MAX];
2294 else
2295 return trtable[ch];
2296 }
2297
2298 if (!build_trtable (mctx->dfa, state))
2299 {
2300 *err = REG_ESPACE;
2301 return NULL;
2302 }
2303
2304 /* Retry, we now have a transition table. */
2305 }
2306}
2307
2308/* Update the state_log if we need */
2309re_dfastate_t *
2310internal_function
2311merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2312 re_dfastate_t *next_state)
2313{
2314 const re_dfa_t *const dfa = mctx->dfa;
2315 int cur_idx = re_string_cur_idx (&mctx->input);
2316
2317 if (cur_idx > mctx->state_log_top)
2318 {
2319 mctx->state_log[cur_idx] = next_state;
2320 mctx->state_log_top = cur_idx;
2321 }
2322 else if (mctx->state_log[cur_idx] == 0)
2323 {
2324 mctx->state_log[cur_idx] = next_state;
2325 }
2326 else
2327 {
2328 re_dfastate_t *pstate;
2329 unsigned int context;
2330 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2331 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2332 the destination of a multibyte char/collating element/
2333 back reference. Then the next state is the union set of
2334 these destinations and the results of the transition table. */
2335 pstate = mctx->state_log[cur_idx];
2336 log_nodes = pstate->entrance_nodes;
2337 if (next_state != NULL)
2338 {
2339 table_nodes = next_state->entrance_nodes;
2340 *err = re_node_set_init_union (&next_nodes, table_nodes,
2341 log_nodes);
2342 if (BE (*err != REG_NOERROR, 0))
2343 return NULL;
2344 }
2345 else
2346 next_nodes = *log_nodes;
2347 /* Note: We already add the nodes of the initial state,
2348 then we don't need to add them here. */
2349
2350 context = re_string_context_at (&mctx->input,
2351 re_string_cur_idx (&mctx->input) - 1,
2352 mctx->eflags);
2353 next_state = mctx->state_log[cur_idx]
2354 = re_acquire_state_context (err, dfa, &next_nodes, context);
2355 /* We don't need to check errors here, since the return value of
2356 this function is next_state and ERR is already set. */
2357
2358 if (table_nodes != NULL)
2359 re_node_set_free (&next_nodes);
2360 }
2361
2362 if (BE (dfa->nbackref, 0) && next_state != NULL)
2363 {
2364 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2365 later. We must check them here, since the back references in the
2366 next state might use them. */
2367 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2368 cur_idx);
2369 if (BE (*err != REG_NOERROR, 0))
2370 return NULL;
2371
2372 /* If the next state has back references. */
2373 if (next_state->has_backref)
2374 {
2375 *err = transit_state_bkref (mctx, &next_state->nodes);
2376 if (BE (*err != REG_NOERROR, 0))
2377 return NULL;
2378 next_state = mctx->state_log[cur_idx];
2379 }
2380 }
2381
2382 return next_state;
2383}
2384
2385/* Skip bytes in the input that correspond to part of a
2386 multi-byte match, then look in the log for a state
2387 from which to restart matching. */
2388re_dfastate_t *
2389internal_function
2390find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2391{
2392 re_dfastate_t *cur_state;
2393 do
2394 {
2395 int max = mctx->state_log_top;
2396 int cur_str_idx = re_string_cur_idx (&mctx->input);
2397
2398 do
2399 {
2400 if (++cur_str_idx > max)
2401 return NULL;
2402 re_string_skip_bytes (&mctx->input, 1);
2403 }
2404 while (mctx->state_log[cur_str_idx] == NULL);
2405
2406 cur_state = merge_state_with_log (err, mctx, NULL);
2407 }
2408 while (*err == REG_NOERROR && cur_state == NULL);
2409 return cur_state;
2410}
2411
2412/* Helper functions for transit_state. */
2413
2414/* From the node set CUR_NODES, pick up the nodes whose types are
2415 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2416 expression. And register them to use them later for evaluating the
2417 correspoding back references. */
2418
2419static reg_errcode_t
2420internal_function
2421check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2422 int str_idx)
2423{
2424 const re_dfa_t *const dfa = mctx->dfa;
2425 int node_idx;
2426 reg_errcode_t err;
2427
2428 /* TODO: This isn't efficient.
2429 Because there might be more than one nodes whose types are
2430 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2431 nodes.
2432 E.g. RE: (a){2} */
2433 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2434 {
2435 int node = cur_nodes->elems[node_idx];
2436 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2437 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2438 && (dfa->used_bkref_map
2439 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2440 {
2441 err = match_ctx_add_subtop (mctx, node, str_idx);
2442 if (BE (err != REG_NOERROR, 0))
2443 return err;
2444 }
2445 }
2446 return REG_NOERROR;
2447}
2448
2449#if 0
2450/* Return the next state to which the current state STATE will transit by
2451 accepting the current input byte. */
2452
2453static re_dfastate_t *
2454transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2455 re_dfastate_t *state)
2456{
2457 const re_dfa_t *const dfa = mctx->dfa;
2458 re_node_set next_nodes;
2459 re_dfastate_t *next_state;
2460 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2461 unsigned int context;
2462
2463 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2464 if (BE (*err != REG_NOERROR, 0))
2465 return NULL;
2466 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2467 {
2468 int cur_node = state->nodes.elems[node_cnt];
2469 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2470 {
2471 *err = re_node_set_merge (&next_nodes,
2472 dfa->eclosures + dfa->nexts[cur_node]);
2473 if (BE (*err != REG_NOERROR, 0))
2474 {
2475 re_node_set_free (&next_nodes);
2476 return NULL;
2477 }
2478 }
2479 }
2480 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2481 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2482 /* We don't need to check errors here, since the return value of
2483 this function is next_state and ERR is already set. */
2484
2485 re_node_set_free (&next_nodes);
2486 re_string_skip_bytes (&mctx->input, 1);
2487 return next_state;
2488}
2489#endif
2490
2491#ifdef RE_ENABLE_I18N
2492static reg_errcode_t
2493internal_function
2494transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2495{
2496 const re_dfa_t *const dfa = mctx->dfa;
2497 reg_errcode_t err;
2498 int i;
2499
2500 for (i = 0; i < pstate->nodes.nelem; ++i)
2501 {
2502 re_node_set dest_nodes, *new_nodes;
2503 int cur_node_idx = pstate->nodes.elems[i];
2504 int naccepted, dest_idx;
2505 unsigned int context;
2506 re_dfastate_t *dest_state;
2507
2508 if (!dfa->nodes[cur_node_idx].accept_mb)
2509 continue;
2510
2511 if (dfa->nodes[cur_node_idx].constraint)
2512 {
2513 context = re_string_context_at (&mctx->input,
2514 re_string_cur_idx (&mctx->input),
2515 mctx->eflags);
2516 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2517 context))
2518 continue;
2519 }
2520
2521 /* How many bytes the node can accept? */
2522 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2523 re_string_cur_idx (&mctx->input));
2524 if (naccepted == 0)
2525 continue;
2526
2527 /* The node can accepts `naccepted' bytes. */
2528 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2529 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2530 : mctx->max_mb_elem_len);
2531 err = clean_state_log_if_needed (mctx, dest_idx);
2532 if (BE (err != REG_NOERROR, 0))
2533 return err;
2534#ifdef DEBUG
2535 assert (dfa->nexts[cur_node_idx] != -1);
2536#endif
2537 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2538
2539 dest_state = mctx->state_log[dest_idx];
2540 if (dest_state == NULL)
2541 dest_nodes = *new_nodes;
2542 else
2543 {
2544 err = re_node_set_init_union (&dest_nodes,
2545 dest_state->entrance_nodes, new_nodes);
2546 if (BE (err != REG_NOERROR, 0))
2547 return err;
2548 }
2549 context = re_string_context_at (&mctx->input, dest_idx - 1,
2550 mctx->eflags);
2551 mctx->state_log[dest_idx]
2552 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2553 if (dest_state != NULL)
2554 re_node_set_free (&dest_nodes);
2555 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2556 return err;
2557 }
2558 return REG_NOERROR;
2559}
2560#endif /* RE_ENABLE_I18N */
2561
2562static reg_errcode_t
2563internal_function
2564transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2565{
2566 const re_dfa_t *const dfa = mctx->dfa;
2567 reg_errcode_t err;
2568 int i;
2569 int cur_str_idx = re_string_cur_idx (&mctx->input);
2570
2571 for (i = 0; i < nodes->nelem; ++i)
2572 {
2573 int dest_str_idx, prev_nelem, bkc_idx;
2574 int node_idx = nodes->elems[i];
2575 unsigned int context;
2576 const re_token_t *node = dfa->nodes + node_idx;
2577 re_node_set *new_dest_nodes;
2578
2579 /* Check whether `node' is a backreference or not. */
2580 if (node->type != OP_BACK_REF)
2581 continue;
2582
2583 if (node->constraint)
2584 {
2585 context = re_string_context_at (&mctx->input, cur_str_idx,
2586 mctx->eflags);
2587 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2588 continue;
2589 }
2590
2591 /* `node' is a backreference.
2592 Check the substring which the substring matched. */
2593 bkc_idx = mctx->nbkref_ents;
2594 err = get_subexp (mctx, node_idx, cur_str_idx);
2595 if (BE (err != REG_NOERROR, 0))
2596 goto free_return;
2597
2598 /* And add the epsilon closures (which is `new_dest_nodes') of
2599 the backreference to appropriate state_log. */
2600#ifdef DEBUG
2601 assert (dfa->nexts[node_idx] != -1);
2602#endif
2603 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2604 {
2605 int subexp_len;
2606 re_dfastate_t *dest_state;
2607 struct re_backref_cache_entry *bkref_ent;
2608 bkref_ent = mctx->bkref_ents + bkc_idx;
2609 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2610 continue;
2611 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2612 new_dest_nodes = (subexp_len == 0
2613 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2614 : dfa->eclosures + dfa->nexts[node_idx]);
2615 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2616 - bkref_ent->subexp_from);
2617 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2618 mctx->eflags);
2619 dest_state = mctx->state_log[dest_str_idx];
2620 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2621 : mctx->state_log[cur_str_idx]->nodes.nelem);
2622 /* Add `new_dest_node' to state_log. */
2623 if (dest_state == NULL)
2624 {
2625 mctx->state_log[dest_str_idx]
2626 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2627 context);
2628 if (BE (mctx->state_log[dest_str_idx] == NULL
2629 && err != REG_NOERROR, 0))
2630 goto free_return;
2631 }
2632 else
2633 {
2634 re_node_set dest_nodes;
2635 err = re_node_set_init_union (&dest_nodes,
2636 dest_state->entrance_nodes,
2637 new_dest_nodes);
2638 if (BE (err != REG_NOERROR, 0))
2639 {
2640 re_node_set_free (&dest_nodes);
2641 goto free_return;
2642 }
2643 mctx->state_log[dest_str_idx]
2644 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2645 re_node_set_free (&dest_nodes);
2646 if (BE (mctx->state_log[dest_str_idx] == NULL
2647 && err != REG_NOERROR, 0))
2648 goto free_return;
2649 }
2650 /* We need to check recursively if the backreference can epsilon
2651 transit. */
2652 if (subexp_len == 0
2653 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2654 {
2655 err = check_subexp_matching_top (mctx, new_dest_nodes,
2656 cur_str_idx);
2657 if (BE (err != REG_NOERROR, 0))
2658 goto free_return;
2659 err = transit_state_bkref (mctx, new_dest_nodes);
2660 if (BE (err != REG_NOERROR, 0))
2661 goto free_return;
2662 }
2663 }
2664 }
2665 err = REG_NOERROR;
2666 free_return:
2667 return err;
2668}
2669
2670/* Enumerate all the candidates which the backreference BKREF_NODE can match
2671 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2672 Note that we might collect inappropriate candidates here.
2673 However, the cost of checking them strictly here is too high, then we
2674 delay these checking for prune_impossible_nodes(). */
2675
2676static reg_errcode_t
2677internal_function __attribute_warn_unused_result__
2678get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2679{
2680 const re_dfa_t *const dfa = mctx->dfa;
2681 int subexp_num, sub_top_idx;
2682 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2683 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2684 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2685 if (cache_idx != -1)
2686 {
2687 const struct re_backref_cache_entry *entry
2688 = mctx->bkref_ents + cache_idx;
2689 do
2690 if (entry->node == bkref_node)
2691 return REG_NOERROR; /* We already checked it. */
2692 while (entry++->more);
2693 }
2694
2695 subexp_num = dfa->nodes[bkref_node].opr.idx;
2696
2697 /* For each sub expression */
2698 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2699 {
2700 reg_errcode_t err;
2701 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2702 re_sub_match_last_t *sub_last;
2703 int sub_last_idx, sl_str, bkref_str_off;
2704
2705 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2706 continue; /* It isn't related. */
2707
2708 sl_str = sub_top->str_idx;
2709 bkref_str_off = bkref_str_idx;
2710 /* At first, check the last node of sub expressions we already
2711 evaluated. */
2712 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2713 {
2714 int sl_str_diff;
2715 sub_last = sub_top->lasts[sub_last_idx];
2716 sl_str_diff = sub_last->str_idx - sl_str;
2717 /* The matched string by the sub expression match with the substring
2718 at the back reference? */
2719 if (sl_str_diff > 0)
2720 {
2721 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2722 {
2723 /* Not enough chars for a successful match. */
2724 if (bkref_str_off + sl_str_diff > mctx->input.len)
2725 break;
2726
2727 err = clean_state_log_if_needed (mctx,
2728 bkref_str_off
2729 + sl_str_diff);
2730 if (BE (err != REG_NOERROR, 0))
2731 return err;
2732 buf = (const char *) re_string_get_buffer (&mctx->input);
2733 }
2734 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2735 /* We don't need to search this sub expression any more. */
2736 break;
2737 }
2738 bkref_str_off += sl_str_diff;
2739 sl_str += sl_str_diff;
2740 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2741 bkref_str_idx);
2742
2743 /* Reload buf, since the preceding call might have reallocated
2744 the buffer. */
2745 buf = (const char *) re_string_get_buffer (&mctx->input);
2746
2747 if (err == REG_NOMATCH)
2748 continue;
2749 if (BE (err != REG_NOERROR, 0))
2750 return err;
2751 }
2752
2753 if (sub_last_idx < sub_top->nlasts)
2754 continue;
2755 if (sub_last_idx > 0)
2756 ++sl_str;
2757 /* Then, search for the other last nodes of the sub expression. */
2758 for (; sl_str <= bkref_str_idx; ++sl_str)
2759 {
2760 int cls_node, sl_str_off;
2761 const re_node_set *nodes;
2762 sl_str_off = sl_str - sub_top->str_idx;
2763 /* The matched string by the sub expression match with the substring
2764 at the back reference? */
2765 if (sl_str_off > 0)
2766 {
2767 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2768 {
2769 /* If we are at the end of the input, we cannot match. */
2770 if (bkref_str_off >= mctx->input.len)
2771 break;
2772
2773 err = extend_buffers (mctx, bkref_str_off + 1);
2774 if (BE (err != REG_NOERROR, 0))
2775 return err;
2776
2777 buf = (const char *) re_string_get_buffer (&mctx->input);
2778 }
2779 if (buf [bkref_str_off++] != buf[sl_str - 1])
2780 break; /* We don't need to search this sub expression
2781 any more. */
2782 }
2783 if (mctx->state_log[sl_str] == NULL)
2784 continue;
2785 /* Does this state have a ')' of the sub expression? */
2786 nodes = &mctx->state_log[sl_str]->nodes;
2787 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2788 OP_CLOSE_SUBEXP);
2789 if (cls_node == -1)
2790 continue; /* No. */
2791 if (sub_top->path == NULL)
2792 {
2793 sub_top->path = calloc (sizeof (state_array_t),
2794 sl_str - sub_top->str_idx + 1);
2795 if (sub_top->path == NULL)
2796 return REG_ESPACE;
2797 }
2798 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2799 in the current context? */
2800 err = check_arrival (mctx, sub_top->path, sub_top->node,
2801 sub_top->str_idx, cls_node, sl_str,
2802 OP_CLOSE_SUBEXP);
2803 if (err == REG_NOMATCH)
2804 continue;
2805 if (BE (err != REG_NOERROR, 0))
2806 return err;
2807 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2808 if (BE (sub_last == NULL, 0))
2809 return REG_ESPACE;
2810 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2811 bkref_str_idx);
2812 if (err == REG_NOMATCH)
2813 continue;
2814 }
2815 }
2816 return REG_NOERROR;
2817}
2818
2819/* Helper functions for get_subexp(). */
2820
2821/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2822 If it can arrive, register the sub expression expressed with SUB_TOP
2823 and SUB_LAST. */
2824
2825static reg_errcode_t
2826internal_function
2827get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2828 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2829{
2830 reg_errcode_t err;
2831 int to_idx;
2832 /* Can the subexpression arrive the back reference? */
2833 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2834 sub_last->str_idx, bkref_node, bkref_str,
2835 OP_OPEN_SUBEXP);
2836 if (err != REG_NOERROR)
2837 return err;
2838 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2839 sub_last->str_idx);
2840 if (BE (err != REG_NOERROR, 0))
2841 return err;
2842 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2843 return clean_state_log_if_needed (mctx, to_idx);
2844}
2845
2846/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2847 Search '(' if FL_OPEN, or search ')' otherwise.
2848 TODO: This function isn't efficient...
2849 Because there might be more than one nodes whose types are
2850 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2851 nodes.
2852 E.g. RE: (a){2} */
2853
2854static int
2855internal_function
2856find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2857 int subexp_idx, int type)
2858{
2859 int cls_idx;
2860 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2861 {
2862 int cls_node = nodes->elems[cls_idx];
2863 const re_token_t *node = dfa->nodes + cls_node;
2864 if (node->type == type
2865 && node->opr.idx == subexp_idx)
2866 return cls_node;
2867 }
2868 return -1;
2869}
2870
2871/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2872 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2873 heavily reused.
2874 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2875
2876static reg_errcode_t
2877internal_function __attribute_warn_unused_result__
2878check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2879 int top_str, int last_node, int last_str, int type)
2880{
2881 const re_dfa_t *const dfa = mctx->dfa;
2882 reg_errcode_t err = REG_NOERROR;
2883 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2884 re_dfastate_t *cur_state = NULL;
2885 re_node_set *cur_nodes, next_nodes;
2886 re_dfastate_t **backup_state_log;
2887 unsigned int context;
2888
2889 subexp_num = dfa->nodes[top_node].opr.idx;
2890 /* Extend the buffer if we need. */
2891 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2892 {
2893 re_dfastate_t **new_array;
2894 int old_alloc = path->alloc;
2895 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2896 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2897 if (BE (new_array == NULL, 0))
2898 {
2899 path->alloc = old_alloc;
2900 return REG_ESPACE;
2901 }
2902 path->array = new_array;
2903 memset (new_array + old_alloc, '\0',
2904 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2905 }
2906
2907 str_idx = path->next_idx ?: top_str;
2908
2909 /* Temporary modify MCTX. */
2910 backup_state_log = mctx->state_log;
2911 backup_cur_idx = mctx->input.cur_idx;
2912 mctx->state_log = path->array;
2913 mctx->input.cur_idx = str_idx;
2914
2915 /* Setup initial node set. */
2916 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2917 if (str_idx == top_str)
2918 {
2919 err = re_node_set_init_1 (&next_nodes, top_node);
2920 if (BE (err != REG_NOERROR, 0))
2921 return err;
2922 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2923 if (BE (err != REG_NOERROR, 0))
2924 {
2925 re_node_set_free (&next_nodes);
2926 return err;
2927 }
2928 }
2929 else
2930 {
2931 cur_state = mctx->state_log[str_idx];
2932 if (cur_state && cur_state->has_backref)
2933 {
2934 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2935 if (BE (err != REG_NOERROR, 0))
2936 return err;
2937 }
2938 else
2939 re_node_set_init_empty (&next_nodes);
2940 }
2941 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2942 {
2943 if (next_nodes.nelem)
2944 {
2945 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2946 subexp_num, type);
2947 if (BE (err != REG_NOERROR, 0))
2948 {
2949 re_node_set_free (&next_nodes);
2950 return err;
2951 }
2952 }
2953 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2954 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2955 {
2956 re_node_set_free (&next_nodes);
2957 return err;
2958 }
2959 mctx->state_log[str_idx] = cur_state;
2960 }
2961
2962 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2963 {
2964 re_node_set_empty (&next_nodes);
2965 if (mctx->state_log[str_idx + 1])
2966 {
2967 err = re_node_set_merge (&next_nodes,
2968 &mctx->state_log[str_idx + 1]->nodes);
2969 if (BE (err != REG_NOERROR, 0))
2970 {
2971 re_node_set_free (&next_nodes);
2972 return err;
2973 }
2974 }
2975 if (cur_state)
2976 {
2977 err = check_arrival_add_next_nodes (mctx, str_idx,
2978 &cur_state->non_eps_nodes,
2979 &next_nodes);
2980 if (BE (err != REG_NOERROR, 0))
2981 {
2982 re_node_set_free (&next_nodes);
2983 return err;
2984 }
2985 }
2986 ++str_idx;
2987 if (next_nodes.nelem)
2988 {
2989 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2990 if (BE (err != REG_NOERROR, 0))
2991 {
2992 re_node_set_free (&next_nodes);
2993 return err;
2994 }
2995 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2996 subexp_num, type);
2997 if (BE (err != REG_NOERROR, 0))
2998 {
2999 re_node_set_free (&next_nodes);
3000 return err;
3001 }
3002 }
3003 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3004 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3005 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3006 {
3007 re_node_set_free (&next_nodes);
3008 return err;
3009 }
3010 mctx->state_log[str_idx] = cur_state;
3011 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3012 }
3013 re_node_set_free (&next_nodes);
3014 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3015 : &mctx->state_log[last_str]->nodes);
3016 path->next_idx = str_idx;
3017
3018 /* Fix MCTX. */
3019 mctx->state_log = backup_state_log;
3020 mctx->input.cur_idx = backup_cur_idx;
3021
3022 /* Then check the current node set has the node LAST_NODE. */
3023 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3024 return REG_NOERROR;
3025
3026 return REG_NOMATCH;
3027}
3028
3029/* Helper functions for check_arrival. */
3030
3031/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3032 to NEXT_NODES.
3033 TODO: This function is similar to the functions transit_state*(),
3034 however this function has many additional works.
3035 Can't we unify them? */
3036
3037static reg_errcode_t
3038internal_function __attribute_warn_unused_result__
3039check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3040 re_node_set *cur_nodes, re_node_set *next_nodes)
3041{
3042 const re_dfa_t *const dfa = mctx->dfa;
3043 int result;
3044 int cur_idx;
3045 reg_errcode_t err = REG_NOERROR;
3046 re_node_set union_set;
3047 re_node_set_init_empty (&union_set);
3048 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3049 {
3050 int naccepted = 0;
3051 int cur_node = cur_nodes->elems[cur_idx];
3052#ifdef DEBUG
3053 re_token_type_t type = dfa->nodes[cur_node].type;
3054 assert (!IS_EPSILON_NODE (type));
3055#endif
3056#ifdef RE_ENABLE_I18N
3057 /* If the node may accept `multi byte'. */
3058 if (dfa->nodes[cur_node].accept_mb)
3059 {
3060 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3061 str_idx);
3062 if (naccepted > 1)
3063 {
3064 re_dfastate_t *dest_state;
3065 int next_node = dfa->nexts[cur_node];
3066 int next_idx = str_idx + naccepted;
3067 dest_state = mctx->state_log[next_idx];
3068 re_node_set_empty (&union_set);
3069 if (dest_state)
3070 {
3071 err = re_node_set_merge (&union_set, &dest_state->nodes);
3072 if (BE (err != REG_NOERROR, 0))
3073 {
3074 re_node_set_free (&union_set);
3075 return err;
3076 }
3077 }
3078 result = re_node_set_insert (&union_set, next_node);
3079 if (BE (result < 0, 0))
3080 {
3081 re_node_set_free (&union_set);
3082 return REG_ESPACE;
3083 }
3084 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3085 &union_set);
3086 if (BE (mctx->state_log[next_idx] == NULL
3087 && err != REG_NOERROR, 0))
3088 {
3089 re_node_set_free (&union_set);
3090 return err;
3091 }
3092 }
3093 }
3094#endif /* RE_ENABLE_I18N */
3095 if (naccepted
3096 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3097 {
3098 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3099 if (BE (result < 0, 0))
3100 {
3101 re_node_set_free (&union_set);
3102 return REG_ESPACE;
3103 }
3104 }
3105 }
3106 re_node_set_free (&union_set);
3107 return REG_NOERROR;
3108}
3109
3110/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3111 CUR_NODES, however exclude the nodes which are:
3112 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3113 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3114*/
3115
3116static reg_errcode_t
3117internal_function
3118check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3119 int ex_subexp, int type)
3120{
3121 reg_errcode_t err;
3122 int idx, outside_node;
3123 re_node_set new_nodes;
3124#ifdef DEBUG
3125 assert (cur_nodes->nelem);
3126#endif
3127 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3128 if (BE (err != REG_NOERROR, 0))
3129 return err;
3130 /* Create a new node set NEW_NODES with the nodes which are epsilon
3131 closures of the node in CUR_NODES. */
3132
3133 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3134 {
3135 int cur_node = cur_nodes->elems[idx];
3136 const re_node_set *eclosure = dfa->eclosures + cur_node;
3137 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3138 if (outside_node == -1)
3139 {
3140 /* There are no problematic nodes, just merge them. */
3141 err = re_node_set_merge (&new_nodes, eclosure);
3142 if (BE (err != REG_NOERROR, 0))
3143 {
3144 re_node_set_free (&new_nodes);
3145 return err;
3146 }
3147 }
3148 else
3149 {
3150 /* There are problematic nodes, re-calculate incrementally. */
3151 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3152 ex_subexp, type);
3153 if (BE (err != REG_NOERROR, 0))
3154 {
3155 re_node_set_free (&new_nodes);
3156 return err;
3157 }
3158 }
3159 }
3160 re_node_set_free (cur_nodes);
3161 *cur_nodes = new_nodes;
3162 return REG_NOERROR;
3163}
3164
3165/* Helper function for check_arrival_expand_ecl.
3166 Check incrementally the epsilon closure of TARGET, and if it isn't
3167 problematic append it to DST_NODES. */
3168
3169static reg_errcode_t
3170internal_function __attribute_warn_unused_result__
3171check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3172 int target, int ex_subexp, int type)
3173{
3174 int cur_node;
3175 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3176 {
3177 int err;
3178
3179 if (dfa->nodes[cur_node].type == type
3180 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3181 {
3182 if (type == OP_CLOSE_SUBEXP)
3183 {
3184 err = re_node_set_insert (dst_nodes, cur_node);
3185 if (BE (err == -1, 0))
3186 return REG_ESPACE;
3187 }
3188 break;
3189 }
3190 err = re_node_set_insert (dst_nodes, cur_node);
3191 if (BE (err == -1, 0))
3192 return REG_ESPACE;
3193 if (dfa->edests[cur_node].nelem == 0)
3194 break;
3195 if (dfa->edests[cur_node].nelem == 2)
3196 {
3197 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3198 dfa->edests[cur_node].elems[1],
3199 ex_subexp, type);
3200 if (BE (err != REG_NOERROR, 0))
3201 return err;
3202 }
3203 cur_node = dfa->edests[cur_node].elems[0];
3204 }
3205 return REG_NOERROR;
3206}
3207
3208
3209/* For all the back references in the current state, calculate the
3210 destination of the back references by the appropriate entry
3211 in MCTX->BKREF_ENTS. */
3212
3213static reg_errcode_t
3214internal_function __attribute_warn_unused_result__
3215expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3216 int cur_str, int subexp_num, int type)
3217{
3218 const re_dfa_t *const dfa = mctx->dfa;
3219 reg_errcode_t err;
3220 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3221 struct re_backref_cache_entry *ent;
3222
3223 if (cache_idx_start == -1)
3224 return REG_NOERROR;
3225
3226 restart:
3227 ent = mctx->bkref_ents + cache_idx_start;
3228 do
3229 {
3230 int to_idx, next_node;
3231
3232 /* Is this entry ENT is appropriate? */
3233 if (!re_node_set_contains (cur_nodes, ent->node))
3234 continue; /* No. */
3235
3236 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3237 /* Calculate the destination of the back reference, and append it
3238 to MCTX->STATE_LOG. */
3239 if (to_idx == cur_str)
3240 {
3241 /* The backreference did epsilon transit, we must re-check all the
3242 node in the current state. */
3243 re_node_set new_dests;
3244 reg_errcode_t err2, err3;
3245 next_node = dfa->edests[ent->node].elems[0];
3246 if (re_node_set_contains (cur_nodes, next_node))
3247 continue;
3248 err = re_node_set_init_1 (&new_dests, next_node);
3249 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3250 err3 = re_node_set_merge (cur_nodes, &new_dests);
3251 re_node_set_free (&new_dests);
3252 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3253 || err3 != REG_NOERROR, 0))
3254 {
3255 err = (err != REG_NOERROR ? err
3256 : (err2 != REG_NOERROR ? err2 : err3));
3257 return err;
3258 }
3259 /* TODO: It is still inefficient... */
3260 goto restart;
3261 }
3262 else
3263 {
3264 re_node_set union_set;
3265 next_node = dfa->nexts[ent->node];
3266 if (mctx->state_log[to_idx])
3267 {
3268 int ret;
3269 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3270 next_node))
3271 continue;
3272 err = re_node_set_init_copy (&union_set,
3273 &mctx->state_log[to_idx]->nodes);
3274 ret = re_node_set_insert (&union_set, next_node);
3275 if (BE (err != REG_NOERROR || ret < 0, 0))
3276 {
3277 re_node_set_free (&union_set);
3278 err = err != REG_NOERROR ? err : REG_ESPACE;
3279 return err;
3280 }
3281 }
3282 else
3283 {
3284 err = re_node_set_init_1 (&union_set, next_node);
3285 if (BE (err != REG_NOERROR, 0))
3286 return err;
3287 }
3288 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3289 re_node_set_free (&union_set);
3290 if (BE (mctx->state_log[to_idx] == NULL
3291 && err != REG_NOERROR, 0))
3292 return err;
3293 }
3294 }
3295 while (ent++->more);
3296 return REG_NOERROR;
3297}
3298
3299/* Build transition table for the state.
3300 Return 1 if succeeded, otherwise return NULL. */
3301
3302static int
3303internal_function
3304build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3305{
3306 reg_errcode_t err;
3307 int i, j, ch, need_word_trtable = 0;
3308 bitset_word_t elem, mask;
3309 bool dests_node_malloced = false;
3310 bool dest_states_malloced = false;
3311 int ndests; /* Number of the destination states from `state'. */
3312 re_dfastate_t **trtable;
3313 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3314 re_node_set follows, *dests_node;
3315 bitset_t *dests_ch;
3316 bitset_t acceptable;
3317
3318 struct dests_alloc
3319 {
3320 re_node_set dests_node[SBC_MAX];
3321 bitset_t dests_ch[SBC_MAX];
3322 } *dests_alloc;
3323
3324 /* We build DFA states which corresponds to the destination nodes
3325 from `state'. `dests_node[i]' represents the nodes which i-th
3326 destination state contains, and `dests_ch[i]' represents the
3327 characters which i-th destination state accepts. */
3328 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3329 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3330 else
3331 {
3332 dests_alloc = re_malloc (struct dests_alloc, 1);
3333 if (BE (dests_alloc == NULL, 0))
3334 return 0;
3335 dests_node_malloced = true;
3336 }
3337 dests_node = dests_alloc->dests_node;
3338 dests_ch = dests_alloc->dests_ch;
3339
3340 /* Initialize transiton table. */
3341 state->word_trtable = state->trtable = NULL;
3342
3343 /* At first, group all nodes belonging to `state' into several
3344 destinations. */
3345 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3346 if (BE (ndests <= 0, 0))
3347 {
3348 if (dests_node_malloced)
3349 free (dests_alloc);
3350 /* Return 0 in case of an error, 1 otherwise. */
3351 if (ndests == 0)
3352 {
3353 state->trtable = (re_dfastate_t **)
3354 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3355 if (BE (state->trtable == NULL, 0))
3356 return 0;
3357 return 1;
3358 }
3359 return 0;
3360 }
3361
3362 err = re_node_set_alloc (&follows, ndests + 1);
3363 if (BE (err != REG_NOERROR, 0))
3364 goto out_free;
3365
3366 /* Avoid arithmetic overflow in size calculation. */
3367 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3368 / (3 * sizeof (re_dfastate_t *)))
3369 < ndests),
3370 0))
3371 goto out_free;
3372
3373 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3374 + ndests * 3 * sizeof (re_dfastate_t *)))
3375 dest_states = (re_dfastate_t **)
3376 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3377 else
3378 {
3379 dest_states = (re_dfastate_t **)
3380 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3381 if (BE (dest_states == NULL, 0))
3382 {
3383out_free:
3384 if (dest_states_malloced)
3385 free (dest_states);
3386 re_node_set_free (&follows);
3387 for (i = 0; i < ndests; ++i)
3388 re_node_set_free (dests_node + i);
3389 if (dests_node_malloced)
3390 free (dests_alloc);
3391 return 0;
3392 }
3393 dest_states_malloced = true;
3394 }
3395 dest_states_word = dest_states + ndests;
3396 dest_states_nl = dest_states_word + ndests;
3397 bitset_empty (acceptable);
3398
3399 /* Then build the states for all destinations. */
3400 for (i = 0; i < ndests; ++i)
3401 {
3402 int next_node;
3403 re_node_set_empty (&follows);
3404 /* Merge the follows of this destination states. */
3405 for (j = 0; j < dests_node[i].nelem; ++j)
3406 {
3407 next_node = dfa->nexts[dests_node[i].elems[j]];
3408 if (next_node != -1)
3409 {
3410 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3411 if (BE (err != REG_NOERROR, 0))
3412 goto out_free;
3413 }
3414 }
3415 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3416 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3417 goto out_free;
3418 /* If the new state has context constraint,
3419 build appropriate states for these contexts. */
3420 if (dest_states[i]->has_constraint)
3421 {
3422 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3423 CONTEXT_WORD);
3424 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3425 goto out_free;
3426
3427 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3428 need_word_trtable = 1;
3429
3430 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3431 CONTEXT_NEWLINE);
3432 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3433 goto out_free;
3434 }
3435 else
3436 {
3437 dest_states_word[i] = dest_states[i];
3438 dest_states_nl[i] = dest_states[i];
3439 }
3440 bitset_merge (acceptable, dests_ch[i]);
3441 }
3442
3443 if (!BE (need_word_trtable, 0))
3444 {
3445 /* We don't care about whether the following character is a word
3446 character, or we are in a single-byte character set so we can
3447 discern by looking at the character code: allocate a
3448 256-entry transition table. */
3449 trtable = state->trtable =
3450 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3451 if (BE (trtable == NULL, 0))
3452 goto out_free;
3453
3454 /* For all characters ch...: */
3455 for (i = 0; i < BITSET_WORDS; ++i)
3456 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3457 elem;
3458 mask <<= 1, elem >>= 1, ++ch)
3459 if (BE (elem & 1, 0))
3460 {
3461 /* There must be exactly one destination which accepts
3462 character ch. See group_nodes_into_DFAstates. */
3463 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3464 ;
3465
3466 /* j-th destination accepts the word character ch. */
3467 if (dfa->word_char[i] & mask)
3468 trtable[ch] = dest_states_word[j];
3469 else
3470 trtable[ch] = dest_states[j];
3471 }
3472 }
3473 else
3474 {
3475 /* We care about whether the following character is a word
3476 character, and we are in a multi-byte character set: discern
3477 by looking at the character code: build two 256-entry
3478 transition tables, one starting at trtable[0] and one
3479 starting at trtable[SBC_MAX]. */
3480 trtable = state->word_trtable =
3481 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3482 if (BE (trtable == NULL, 0))
3483 goto out_free;
3484
3485 /* For all characters ch...: */
3486 for (i = 0; i < BITSET_WORDS; ++i)
3487 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3488 elem;
3489 mask <<= 1, elem >>= 1, ++ch)
3490 if (BE (elem & 1, 0))
3491 {
3492 /* There must be exactly one destination which accepts
3493 character ch. See group_nodes_into_DFAstates. */
3494 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3495 ;
3496
3497 /* j-th destination accepts the word character ch. */
3498 trtable[ch] = dest_states[j];
3499 trtable[ch + SBC_MAX] = dest_states_word[j];
3500 }
3501 }
3502
3503 /* new line */
3504 if (bitset_contain (acceptable, NEWLINE_CHAR))
3505 {
3506 /* The current state accepts newline character. */
3507 for (j = 0; j < ndests; ++j)
3508 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3509 {
3510 /* k-th destination accepts newline character. */
3511 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3512 if (need_word_trtable)
3513 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3514 /* There must be only one destination which accepts
3515 newline. See group_nodes_into_DFAstates. */
3516 break;
3517 }
3518 }
3519
3520 if (dest_states_malloced)
3521 free (dest_states);
3522
3523 re_node_set_free (&follows);
3524 for (i = 0; i < ndests; ++i)
3525 re_node_set_free (dests_node + i);
3526
3527 if (dests_node_malloced)
3528 free (dests_alloc);
3529
3530 return 1;
3531}
3532
3533/* Group all nodes belonging to STATE into several destinations.
3534 Then for all destinations, set the nodes belonging to the destination
3535 to DESTS_NODE[i] and set the characters accepted by the destination
3536 to DEST_CH[i]. This function return the number of destinations. */
3537
3538static int
3539internal_function
3540group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3541 re_node_set *dests_node, bitset_t *dests_ch)
3542{
3543 reg_errcode_t err;
3544 int result;
3545 int i, j, k;
3546 int ndests; /* Number of the destinations from `state'. */
3547 bitset_t accepts; /* Characters a node can accept. */
3548 const re_node_set *cur_nodes = &state->nodes;
3549 bitset_empty (accepts);
3550 ndests = 0;
3551
3552 /* For all the nodes belonging to `state', */
3553 for (i = 0; i < cur_nodes->nelem; ++i)
3554 {
3555 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3556 re_token_type_t type = node->type;
3557 unsigned int constraint = node->constraint;
3558
3559 /* Enumerate all single byte character this node can accept. */
3560 if (type == CHARACTER)
3561 bitset_set (accepts, node->opr.c);
3562 else if (type == SIMPLE_BRACKET)
3563 {
3564 bitset_merge (accepts, node->opr.sbcset);
3565 }
3566 else if (type == OP_PERIOD)
3567 {
3568#ifdef RE_ENABLE_I18N
3569 if (dfa->mb_cur_max > 1)
3570 bitset_merge (accepts, dfa->sb_char);
3571 else
3572#endif
3573 bitset_set_all (accepts);
3574 if (!(dfa->syntax & RE_DOT_NEWLINE))
3575 bitset_clear (accepts, '\n');
3576 if (dfa->syntax & RE_DOT_NOT_NULL)
3577 bitset_clear (accepts, '\0');
3578 }
3579#ifdef RE_ENABLE_I18N
3580 else if (type == OP_UTF8_PERIOD)
3581 {
3582 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3583 if (!(dfa->syntax & RE_DOT_NEWLINE))
3584 bitset_clear (accepts, '\n');
3585 if (dfa->syntax & RE_DOT_NOT_NULL)
3586 bitset_clear (accepts, '\0');
3587 }
3588#endif
3589 else
3590 continue;
3591
3592 /* Check the `accepts' and sift the characters which are not
3593 match it the context. */
3594 if (constraint)
3595 {
3596 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3597 {
3598 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3599 bitset_empty (accepts);
3600 if (accepts_newline)
3601 bitset_set (accepts, NEWLINE_CHAR);
3602 else
3603 continue;
3604 }
3605 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3606 {
3607 bitset_empty (accepts);
3608 continue;
3609 }
3610
3611 if (constraint & NEXT_WORD_CONSTRAINT)
3612 {
3613 bitset_word_t any_set = 0;
3614 if (type == CHARACTER && !node->word_char)
3615 {
3616 bitset_empty (accepts);
3617 continue;
3618 }
3619#ifdef RE_ENABLE_I18N
3620 if (dfa->mb_cur_max > 1)
3621 for (j = 0; j < BITSET_WORDS; ++j)
3622 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3623 else
3624#endif
3625 for (j = 0; j < BITSET_WORDS; ++j)
3626 any_set |= (accepts[j] &= dfa->word_char[j]);
3627 if (!any_set)
3628 continue;
3629 }
3630 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3631 {
3632 bitset_word_t any_set = 0;
3633 if (type == CHARACTER && node->word_char)
3634 {
3635 bitset_empty (accepts);
3636 continue;
3637 }
3638#ifdef RE_ENABLE_I18N
3639 if (dfa->mb_cur_max > 1)
3640 for (j = 0; j < BITSET_WORDS; ++j)
3641 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3642 else
3643#endif
3644 for (j = 0; j < BITSET_WORDS; ++j)
3645 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3646 if (!any_set)
3647 continue;
3648 }
3649 }
3650
3651 /* Then divide `accepts' into DFA states, or create a new
3652 state. Above, we make sure that accepts is not empty. */
3653 for (j = 0; j < ndests; ++j)
3654 {
3655 bitset_t intersec; /* Intersection sets, see below. */
3656 bitset_t remains;
3657 /* Flags, see below. */
3658 bitset_word_t has_intersec, not_subset, not_consumed;
3659
3660 /* Optimization, skip if this state doesn't accept the character. */
3661 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3662 continue;
3663
3664 /* Enumerate the intersection set of this state and `accepts'. */
3665 has_intersec = 0;
3666 for (k = 0; k < BITSET_WORDS; ++k)
3667 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3668 /* And skip if the intersection set is empty. */
3669 if (!has_intersec)
3670 continue;
3671
3672 /* Then check if this state is a subset of `accepts'. */
3673 not_subset = not_consumed = 0;
3674 for (k = 0; k < BITSET_WORDS; ++k)
3675 {
3676 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3677 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3678 }
3679
3680 /* If this state isn't a subset of `accepts', create a
3681 new group state, which has the `remains'. */
3682 if (not_subset)
3683 {
3684 bitset_copy (dests_ch[ndests], remains);
3685 bitset_copy (dests_ch[j], intersec);
3686 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3687 if (BE (err != REG_NOERROR, 0))
3688 goto error_return;
3689 ++ndests;
3690 }
3691
3692 /* Put the position in the current group. */
3693 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3694 if (BE (result < 0, 0))
3695 goto error_return;
3696
3697 /* If all characters are consumed, go to next node. */
3698 if (!not_consumed)
3699 break;
3700 }
3701 /* Some characters remain, create a new group. */
3702 if (j == ndests)
3703 {
3704 bitset_copy (dests_ch[ndests], accepts);
3705 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3706 if (BE (err != REG_NOERROR, 0))
3707 goto error_return;
3708 ++ndests;
3709 bitset_empty (accepts);
3710 }
3711 }
3712 return ndests;
3713 error_return:
3714 for (j = 0; j < ndests; ++j)
3715 re_node_set_free (dests_node + j);
3716 return -1;
3717}
3718
3719#ifdef RE_ENABLE_I18N
3720/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3721 Return the number of the bytes the node accepts.
3722 STR_IDX is the current index of the input string.
3723
3724 This function handles the nodes which can accept one character, or
3725 one collating element like '.', '[a-z]', opposite to the other nodes
3726 can only accept one byte. */
3727
3728# ifdef _LIBC
3729# include <locale/weight.h>
3730# endif
3731
3732static int
3733internal_function
3734check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3735 const re_string_t *input, int str_idx)
3736{
3737 const re_token_t *node = dfa->nodes + node_idx;
3738 int char_len, elem_len;
3739 int i;
3740
3741 if (BE (node->type == OP_UTF8_PERIOD, 0))
3742 {
3743 unsigned char c = re_string_byte_at (input, str_idx), d;
3744 if (BE (c < 0xc2, 1))
3745 return 0;
3746
3747 if (str_idx + 2 > input->len)
3748 return 0;
3749
3750 d = re_string_byte_at (input, str_idx + 1);
3751 if (c < 0xe0)
3752 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3753 else if (c < 0xf0)
3754 {
3755 char_len = 3;
3756 if (c == 0xe0 && d < 0xa0)
3757 return 0;
3758 }
3759 else if (c < 0xf8)
3760 {
3761 char_len = 4;
3762 if (c == 0xf0 && d < 0x90)
3763 return 0;
3764 }
3765 else if (c < 0xfc)
3766 {
3767 char_len = 5;
3768 if (c == 0xf8 && d < 0x88)
3769 return 0;
3770 }
3771 else if (c < 0xfe)
3772 {
3773 char_len = 6;
3774 if (c == 0xfc && d < 0x84)
3775 return 0;
3776 }
3777 else
3778 return 0;
3779
3780 if (str_idx + char_len > input->len)
3781 return 0;
3782
3783 for (i = 1; i < char_len; ++i)
3784 {
3785 d = re_string_byte_at (input, str_idx + i);
3786 if (d < 0x80 || d > 0xbf)
3787 return 0;
3788 }
3789 return char_len;
3790 }
3791
3792 char_len = re_string_char_size_at (input, str_idx);
3793 if (node->type == OP_PERIOD)
3794 {
3795 if (char_len <= 1)
3796 return 0;
3797 /* FIXME: I don't think this if is needed, as both '\n'
3798 and '\0' are char_len == 1. */
3799 /* '.' accepts any one character except the following two cases. */
3800 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3801 re_string_byte_at (input, str_idx) == '\n') ||
3802 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3803 re_string_byte_at (input, str_idx) == '\0'))
3804 return 0;
3805 return char_len;
3806 }
3807
3808 elem_len = re_string_elem_size_at (input, str_idx);
3809 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3810 return 0;
3811
3812 if (node->type == COMPLEX_BRACKET)
3813 {
3814 const re_charset_t *cset = node->opr.mbcset;
3815# ifdef _LIBC
3816 const unsigned char *pin
3817 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3818 int j;
3819 uint32_t nrules;
3820# endif /* _LIBC */
3821 int match_len = 0;
3822 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3823 ? re_string_wchar_at (input, str_idx) : 0);
3824
3825 /* match with multibyte character? */
3826 for (i = 0; i < cset->nmbchars; ++i)
3827 if (wc == cset->mbchars[i])
3828 {
3829 match_len = char_len;
3830 goto check_node_accept_bytes_match;
3831 }
3832 /* match with character_class? */
3833 for (i = 0; i < cset->nchar_classes; ++i)
3834 {
3835 wctype_t wt = cset->char_classes[i];
3836 if (__iswctype (wc, wt))
3837 {
3838 match_len = char_len;
3839 goto check_node_accept_bytes_match;
3840 }
3841 }
3842
3843# ifdef _LIBC
3844 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3845 if (nrules != 0)
3846 {
3847 unsigned int in_collseq = 0;
3848 const int32_t *table, *indirect;
3849 const unsigned char *weights, *extra;
3850 const char *collseqwc;
3851
3852 /* match with collating_symbol? */
3853 if (cset->ncoll_syms)
3854 extra = (const unsigned char *)
3855 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3856 for (i = 0; i < cset->ncoll_syms; ++i)
3857 {
3858 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3859 /* Compare the length of input collating element and
3860 the length of current collating element. */
3861 if (*coll_sym != elem_len)
3862 continue;
3863 /* Compare each bytes. */
3864 for (j = 0; j < *coll_sym; j++)
3865 if (pin[j] != coll_sym[1 + j])
3866 break;
3867 if (j == *coll_sym)
3868 {
3869 /* Match if every bytes is equal. */
3870 match_len = j;
3871 goto check_node_accept_bytes_match;
3872 }
3873 }
3874
3875 if (cset->nranges)
3876 {
3877 if (elem_len <= char_len)
3878 {
3879 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3880 in_collseq = __collseq_table_lookup (collseqwc, wc);
3881 }
3882 else
3883 in_collseq = find_collation_sequence_value (pin, elem_len);
3884 }
3885 /* match with range expression? */
3886 for (i = 0; i < cset->nranges; ++i)
3887 if (cset->range_starts[i] <= in_collseq
3888 && in_collseq <= cset->range_ends[i])
3889 {
3890 match_len = elem_len;
3891 goto check_node_accept_bytes_match;
3892 }
3893
3894 /* match with equivalence_class? */
3895 if (cset->nequiv_classes)
3896 {
3897 const unsigned char *cp = pin;
3898 table = (const int32_t *)
3899 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3900 weights = (const unsigned char *)
3901 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3902 extra = (const unsigned char *)
3903 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3904 indirect = (const int32_t *)
3905 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3906 int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3907 if (idx > 0)
3908 for (i = 0; i < cset->nequiv_classes; ++i)
3909 {
3910 int32_t equiv_class_idx = cset->equiv_classes[i];
3911 size_t weight_len = weights[idx & 0xffffff];
3912 if (weight_len == weights[equiv_class_idx & 0xffffff]
3913 && (idx >> 24) == (equiv_class_idx >> 24))
3914 {
3915 int cnt = 0;
3916
3917 idx &= 0xffffff;
3918 equiv_class_idx &= 0xffffff;
3919
3920 while (cnt <= weight_len
3921 && (weights[equiv_class_idx + 1 + cnt]
3922 == weights[idx + 1 + cnt]))
3923 ++cnt;
3924 if (cnt > weight_len)
3925 {
3926 match_len = elem_len;
3927 goto check_node_accept_bytes_match;
3928 }
3929 }
3930 }
3931 }
3932 }
3933 else
3934# endif /* _LIBC */
3935 {
3936 /* match with range expression? */
3937#if __GNUC__ >= 2
3938 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3939#else
3940 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3941 cmp_buf[2] = wc;
3942#endif
3943 for (i = 0; i < cset->nranges; ++i)
3944 {
3945 cmp_buf[0] = cset->range_starts[i];
3946 cmp_buf[4] = cset->range_ends[i];
3947 if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0
3948 && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3949 {
3950 match_len = char_len;
3951 goto check_node_accept_bytes_match;
3952 }
3953 }
3954 }
3955 check_node_accept_bytes_match:
3956 if (!cset->non_match)
3957 return match_len;
3958 else
3959 {
3960 if (match_len > 0)
3961 return 0;
3962 else
3963 return (elem_len > char_len) ? elem_len : char_len;
3964 }
3965 }
3966 return 0;
3967}
3968
3969# ifdef _LIBC
3970static unsigned int
3971internal_function
3972find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3973{
3974 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3975 if (nrules == 0)
3976 {
3977 if (mbs_len == 1)
3978 {
3979 /* No valid character. Match it as a single byte character. */
3980 const unsigned char *collseq = (const unsigned char *)
3981 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3982 return collseq[mbs[0]];
3983 }
3984 return UINT_MAX;
3985 }
3986 else
3987 {
3988 int32_t idx;
3989 const unsigned char *extra = (const unsigned char *)
3990 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3991 int32_t extrasize = (const unsigned char *)
3992 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3993
3994 for (idx = 0; idx < extrasize;)
3995 {
3996 int mbs_cnt, found = 0;
3997 int32_t elem_mbs_len;
3998 /* Skip the name of collating element name. */
3999 idx = idx + extra[idx] + 1;
4000 elem_mbs_len = extra[idx++];
4001 if (mbs_len == elem_mbs_len)
4002 {
4003 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4004 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4005 break;
4006 if (mbs_cnt == elem_mbs_len)
4007 /* Found the entry. */
4008 found = 1;
4009 }
4010 /* Skip the byte sequence of the collating element. */
4011 idx += elem_mbs_len;
4012 /* Adjust for the alignment. */
4013 idx = (idx + 3) & ~3;
4014 /* Skip the collation sequence value. */
4015 idx += sizeof (uint32_t);
4016 /* Skip the wide char sequence of the collating element. */
4017 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4018 /* If we found the entry, return the sequence value. */
4019 if (found)
4020 return *(uint32_t *) (extra + idx);
4021 /* Skip the collation sequence value. */
4022 idx += sizeof (uint32_t);
4023 }
4024 return UINT_MAX;
4025 }
4026}
4027# endif /* _LIBC */
4028#endif /* RE_ENABLE_I18N */
4029
4030/* Check whether the node accepts the byte which is IDX-th
4031 byte of the INPUT. */
4032
4033static int
4034internal_function
4035check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4036 int idx)
4037{
4038 unsigned char ch;
4039 ch = re_string_byte_at (&mctx->input, idx);
4040 switch (node->type)
4041 {
4042 case CHARACTER:
4043 if (node->opr.c != ch)
4044 return 0;
4045 break;
4046
4047 case SIMPLE_BRACKET:
4048 if (!bitset_contain (node->opr.sbcset, ch))
4049 return 0;
4050 break;
4051
4052#ifdef RE_ENABLE_I18N
4053 case OP_UTF8_PERIOD:
4054 if (ch >= 0x80)
4055 return 0;
4056 /* FALLTHROUGH */
4057#endif
4058 case OP_PERIOD:
4059 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4060 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4061 return 0;
4062 break;
4063
4064 default:
4065 return 0;
4066 }
4067
4068 if (node->constraint)
4069 {
4070 /* The node has constraints. Check whether the current context
4071 satisfies the constraints. */
4072 unsigned int context = re_string_context_at (&mctx->input, idx,
4073 mctx->eflags);
4074 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4075 return 0;
4076 }
4077
4078 return 1;
4079}
4080
4081/* Extend the buffers, if the buffers have run out. */
4082
4083static reg_errcode_t
4084internal_function __attribute_warn_unused_result__
4085extend_buffers (re_match_context_t *mctx, int min_len)
4086{
4087 reg_errcode_t ret;
4088 re_string_t *pstr = &mctx->input;
4089
4090 /* Avoid overflow. */
4091 if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4092 return REG_ESPACE;
4093
4094 /* Double the lengthes of the buffers, but allocate at least MIN_LEN. */
4095 ret = re_string_realloc_buffers (pstr,
4096 MAX (min_len,
4097 MIN (pstr->len, pstr->bufs_len * 2)));
4098 if (BE (ret != REG_NOERROR, 0))
4099 return ret;
4100
4101 if (mctx->state_log != NULL)
4102 {
4103 /* And double the length of state_log. */
4104 /* XXX We have no indication of the size of this buffer. If this
4105 allocation fail we have no indication that the state_log array
4106 does not have the right size. */
4107 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4108 pstr->bufs_len + 1);
4109 if (BE (new_array == NULL, 0))
4110 return REG_ESPACE;
4111 mctx->state_log = new_array;
4112 }
4113
4114 /* Then reconstruct the buffers. */
4115 if (pstr->icase)
4116 {
4117#ifdef RE_ENABLE_I18N
4118 if (pstr->mb_cur_max > 1)
4119 {
4120 ret = build_wcs_upper_buffer (pstr);
4121 if (BE (ret != REG_NOERROR, 0))
4122 return ret;
4123 }
4124 else
4125#endif /* RE_ENABLE_I18N */
4126 build_upper_buffer (pstr);
4127 }
4128 else
4129 {
4130#ifdef RE_ENABLE_I18N
4131 if (pstr->mb_cur_max > 1)
4132 build_wcs_buffer (pstr);
4133 else
4134#endif /* RE_ENABLE_I18N */
4135 {
4136 if (pstr->trans != NULL)
4137 re_string_translate_buffer (pstr);
4138 }
4139 }
4140 return REG_NOERROR;
4141}
4142
4143
4144/* Functions for matching context. */
4145
4146/* Initialize MCTX. */
4147
4148static reg_errcode_t
4149internal_function __attribute_warn_unused_result__
4150match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4151{
4152 mctx->eflags = eflags;
4153 mctx->match_last = -1;
4154 if (n > 0)
4155 {
4156 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4157 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4158 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4159 return REG_ESPACE;
4160 }
4161 /* Already zero-ed by the caller.
4162 else
4163 mctx->bkref_ents = NULL;
4164 mctx->nbkref_ents = 0;
4165 mctx->nsub_tops = 0; */
4166 mctx->abkref_ents = n;
4167 mctx->max_mb_elem_len = 1;
4168 mctx->asub_tops = n;
4169 return REG_NOERROR;
4170}
4171
4172/* Clean the entries which depend on the current input in MCTX.
4173 This function must be invoked when the matcher changes the start index
4174 of the input, or changes the input string. */
4175
4176static void
4177internal_function
4178match_ctx_clean (re_match_context_t *mctx)
4179{
4180 int st_idx;
4181 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4182 {
4183 int sl_idx;
4184 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4185 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4186 {
4187 re_sub_match_last_t *last = top->lasts[sl_idx];
4188 re_free (last->path.array);
4189 re_free (last);
4190 }
4191 re_free (top->lasts);
4192 if (top->path)
4193 {
4194 re_free (top->path->array);
4195 re_free (top->path);
4196 }
4197 free (top);
4198 }
4199
4200 mctx->nsub_tops = 0;
4201 mctx->nbkref_ents = 0;
4202}
4203
4204/* Free all the memory associated with MCTX. */
4205
4206static void
4207internal_function
4208match_ctx_free (re_match_context_t *mctx)
4209{
4210 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4211 match_ctx_clean (mctx);
4212 re_free (mctx->sub_tops);
4213 re_free (mctx->bkref_ents);
4214}
4215
4216/* Add a new backreference entry to MCTX.
4217 Note that we assume that caller never call this function with duplicate
4218 entry, and call with STR_IDX which isn't smaller than any existing entry.
4219*/
4220
4221static reg_errcode_t
4222internal_function __attribute_warn_unused_result__
4223match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4224 int to)
4225{
4226 if (mctx->nbkref_ents >= mctx->abkref_ents)
4227 {
4228 struct re_backref_cache_entry* new_entry;
4229 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4230 mctx->abkref_ents * 2);
4231 if (BE (new_entry == NULL, 0))
4232 {
4233 re_free (mctx->bkref_ents);
4234 return REG_ESPACE;
4235 }
4236 mctx->bkref_ents = new_entry;
4237 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4238 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4239 mctx->abkref_ents *= 2;
4240 }
4241 if (mctx->nbkref_ents > 0
4242 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4243 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4244
4245 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4246 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4247 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4248 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4249
4250 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4251 If bit N is clear, means that this entry won't epsilon-transition to
4252 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4253 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4254 such node.
4255
4256 A backreference does not epsilon-transition unless it is empty, so set
4257 to all zeros if FROM != TO. */
4258 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4259 = (from == to ? ~0 : 0);
4260
4261 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4262 if (mctx->max_mb_elem_len < to - from)
4263 mctx->max_mb_elem_len = to - from;
4264 return REG_NOERROR;
4265}
4266
4267/* Search for the first entry which has the same str_idx, or -1 if none is
4268 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4269
4270static int
4271internal_function
4272search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4273{
4274 int left, right, mid, last;
4275 last = right = mctx->nbkref_ents;
4276 for (left = 0; left < right;)
4277 {
4278 mid = (left + right) / 2;
4279 if (mctx->bkref_ents[mid].str_idx < str_idx)
4280 left = mid + 1;
4281 else
4282 right = mid;
4283 }
4284 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4285 return left;
4286 else
4287 return -1;
4288}
4289
4290/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4291 at STR_IDX. */
4292
4293static reg_errcode_t
4294internal_function __attribute_warn_unused_result__
4295match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4296{
4297#ifdef DEBUG
4298 assert (mctx->sub_tops != NULL);
4299 assert (mctx->asub_tops > 0);
4300#endif
4301 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4302 {
4303 int new_asub_tops = mctx->asub_tops * 2;
4304 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4305 re_sub_match_top_t *,
4306 new_asub_tops);
4307 if (BE (new_array == NULL, 0))
4308 return REG_ESPACE;
4309 mctx->sub_tops = new_array;
4310 mctx->asub_tops = new_asub_tops;
4311 }
4312 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4313 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4314 return REG_ESPACE;
4315 mctx->sub_tops[mctx->nsub_tops]->node = node;
4316 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4317 return REG_NOERROR;
4318}
4319
4320/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4321 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4322
4323static re_sub_match_last_t *
4324internal_function
4325match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4326{
4327 re_sub_match_last_t *new_entry;
4328 if (BE (subtop->nlasts == subtop->alasts, 0))
4329 {
4330 int new_alasts = 2 * subtop->alasts + 1;
4331 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4332 re_sub_match_last_t *,
4333 new_alasts);
4334 if (BE (new_array == NULL, 0))
4335 return NULL;
4336 subtop->lasts = new_array;
4337 subtop->alasts = new_alasts;
4338 }
4339 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4340 if (BE (new_entry != NULL, 1))
4341 {
4342 subtop->lasts[subtop->nlasts] = new_entry;
4343 new_entry->node = node;
4344 new_entry->str_idx = str_idx;
4345 ++subtop->nlasts;
4346 }
4347 return new_entry;
4348}
4349
4350static void
4351internal_function
4352sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4353 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4354{
4355 sctx->sifted_states = sifted_sts;
4356 sctx->limited_states = limited_sts;
4357 sctx->last_node = last_node;
4358 sctx->last_str_idx = last_str_idx;
4359 re_node_set_init_empty (&sctx->limits);
4360}