blob: c13c64ef485e97c21ead9cdc134374a2bbca52bc [file] [log] [blame]
lh9ed821d2023-04-07 01:36:19 -07001/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24static void match_ctx_free (re_match_context_t *cache) internal_function;
25static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
27 internal_function;
28static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
29 internal_function;
30static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
34 internal_function;
35static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
37 int last_str_idx)
38 internal_function;
39static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len) internal_function;
49static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len) internal_function;
53static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated) internal_function;
55static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
56 internal_function;
57static int check_matching (re_match_context_t *mctx, int fl_longest_match,
58 int *p_match_first) internal_function;
59static int check_halt_state_context (const re_match_context_t *mctx,
60 const re_dfastate_t *state, int idx)
61 internal_function;
62static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
63 regmatch_t *prev_idx_match, int cur_node,
64 int cur_idx, int nmatch) internal_function;
65static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
66 int str_idx, int dest_node, int nregs,
67 regmatch_t *regs,
68 re_node_set *eps_via_nodes)
69 internal_function;
70static reg_errcode_t set_regs (const regex_t *preg,
71 const re_match_context_t *mctx,
72 size_t nmatch, regmatch_t *pmatch,
73 int fl_backtrack) internal_function;
74static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
75 internal_function;
76
77#ifdef RE_ENABLE_I18N
78static int sift_states_iter_mb (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 int node_idx, int str_idx, int max_str_idx)
81 internal_function;
82#endif
83static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
84 re_sift_context_t *sctx)
85 internal_function;
86static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
87 re_sift_context_t *sctx, int str_idx,
88 re_node_set *cur_dest)
89 internal_function;
90static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
91 re_sift_context_t *sctx,
92 int str_idx,
93 re_node_set *dest_nodes)
94 internal_function;
95static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
96 re_node_set *dest_nodes,
97 const re_node_set *candidates)
98 internal_function;
99static int check_dst_limits (const re_match_context_t *mctx,
100 re_node_set *limits,
101 int dst_node, int dst_idx, int src_node,
102 int src_idx) internal_function;
103static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
104 int boundaries, int subexp_idx,
105 int from_node, int bkref_idx)
106 internal_function;
107static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
108 int limit, int subexp_idx,
109 int node, int str_idx,
110 int bkref_idx) internal_function;
111static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
112 re_node_set *dest_nodes,
113 const re_node_set *candidates,
114 re_node_set *limits,
115 struct re_backref_cache_entry *bkref_ents,
116 int str_idx) internal_function;
117static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
118 re_sift_context_t *sctx,
119 int str_idx, const re_node_set *candidates)
120 internal_function;
121static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
122 re_dfastate_t **dst,
123 re_dfastate_t **src, int num)
124 internal_function;
125static re_dfastate_t *find_recover_state (reg_errcode_t *err,
126 re_match_context_t *mctx) internal_function;
127static re_dfastate_t *transit_state (reg_errcode_t *err,
128 re_match_context_t *mctx,
129 re_dfastate_t *state) internal_function;
130static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
131 re_match_context_t *mctx,
132 re_dfastate_t *next_state)
133 internal_function;
134static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
135 re_node_set *cur_nodes,
136 int str_idx) internal_function;
137#if 0
138static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
139 re_match_context_t *mctx,
140 re_dfastate_t *pstate)
141 internal_function;
142#endif
143#ifdef RE_ENABLE_I18N
144static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
145 re_dfastate_t *pstate)
146 internal_function;
147#endif
148static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
149 const re_node_set *nodes)
150 internal_function;
151static reg_errcode_t get_subexp (re_match_context_t *mctx,
152 int bkref_node, int bkref_str_idx)
153 internal_function;
154static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 const re_sub_match_top_t *sub_top,
156 re_sub_match_last_t *sub_last,
157 int bkref_node, int bkref_str)
158 internal_function;
159static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
160 int subexp_idx, int type) internal_function;
161static reg_errcode_t check_arrival (re_match_context_t *mctx,
162 state_array_t *path, int top_node,
163 int top_str, int last_node, int last_str,
164 int type) internal_function;
165static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
166 int str_idx,
167 re_node_set *cur_nodes,
168 re_node_set *next_nodes)
169 internal_function;
170static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
171 re_node_set *cur_nodes,
172 int ex_subexp, int type)
173 internal_function;
174static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
175 re_node_set *dst_nodes,
176 int target, int ex_subexp,
177 int type) internal_function;
178static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
179 re_node_set *cur_nodes, int cur_str,
180 int subexp_num, int type)
181 internal_function;
182static int build_trtable (const re_dfa_t *dfa,
183 re_dfastate_t *state) internal_function;
184#ifdef RE_ENABLE_I18N
185static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
186 const re_string_t *input, int idx)
187 internal_function;
188#endif
189static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
190 const re_dfastate_t *state,
191 re_node_set *states_node,
192 bitset_t *states_ch) internal_function;
193static int check_node_accept (const re_match_context_t *mctx,
194 const re_token_t *node, int idx)
195 internal_function;
196static reg_errcode_t extend_buffers (re_match_context_t *mctx)
197 internal_function;
198
199/* Entry point for POSIX code. */
200
201/* regexec searches for a given pattern, specified by PREG, in the
202 string STRING.
203
204 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
205 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
206 least NMATCH elements, and we set them to the offsets of the
207 corresponding matched substrings.
208
209 EFLAGS specifies `execution flags' which affect matching: if
210 REG_NOTBOL is set, then ^ does not match at the beginning of the
211 string; if REG_NOTEOL is set, then $ does not match at the end.
212
213 We return 0 if we find a match and REG_NOMATCH if not. */
214
215int
216regexec (const regex_t *__restrict preg, const char *__restrict string,
217 size_t nmatch, regmatch_t pmatch[], int eflags)
218{
219 reg_errcode_t err;
220 int start, length;
221#ifdef __UCLIBC_HAS_THREADS__
222 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
223#endif
224
225 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
226 return REG_BADPAT;
227
228 if (eflags & REG_STARTEND)
229 {
230 start = pmatch[0].rm_so;
231 length = pmatch[0].rm_eo;
232 }
233 else
234 {
235 start = 0;
236 length = strlen (string);
237 }
238
239 __libc_lock_lock (dfa->lock);
240 if (preg->no_sub)
241 err = re_search_internal (preg, string, length, start, length - start,
242 length, 0, NULL, eflags);
243 else
244 err = re_search_internal (preg, string, length, start, length - start,
245 length, nmatch, pmatch, eflags);
246 __libc_lock_unlock (dfa->lock);
247 return err != REG_NOERROR;
248}
249libc_hidden_def(regexec)
250
251/* Entry points for GNU code. */
252
253/* re_match, re_search, re_match_2, re_search_2
254
255 The former two functions operate on STRING with length LENGTH,
256 while the later two operate on concatenation of STRING1 and STRING2
257 with lengths LENGTH1 and LENGTH2, respectively.
258
259 re_match() matches the compiled pattern in BUFP against the string,
260 starting at index START.
261
262 re_search() first tries matching at index START, then it tries to match
263 starting from index START + 1, and so on. The last start position tried
264 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
265 way as re_match().)
266
267 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
268 the first STOP characters of the concatenation of the strings should be
269 concerned.
270
271 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
272 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
273 computed relative to the concatenation, not relative to the individual
274 strings.)
275
276 On success, re_match* functions return the length of the match, re_search*
277 return the position of the start of the match. Return value -1 means no
278 match was found and -2 indicates an internal error. */
279
280int
281re_match (struct re_pattern_buffer *bufp, const char *string, int length,
282 int start, struct re_registers *regs)
283{
284 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
285}
286
287int
288re_search (struct re_pattern_buffer *bufp, const char *string, int length,
289 int start, int range, struct re_registers *regs)
290{
291 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
292}
293libc_hidden_def(re_search)
294
295int
296re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int length1,
297 const char *string2, int length2, int start,
298 struct re_registers *regs, int stop)
299{
300 return re_search_2_stub (bufp, string1, length1, string2, length2,
301 start, 0, regs, stop, 1);
302}
303
304int
305re_search_2 (struct re_pattern_buffer *bufp, const char *string1, int lenght1,
306 const char *string2, int length2, int start, int range,
307 struct re_registers *regs, int stop)
308{
309 return re_search_2_stub (bufp, string1, lenght1, string2, length2,
310 start, range, regs, stop, 0);
311}
312libc_hidden_def(re_search_2)
313
314static int internal_function
315re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
316 int length1, const char *string2, int length2, int start,
317 int range, struct re_registers *regs, int stop, int ret_len)
318{
319 const char *str;
320 int rval;
321 int len = length1 + length2;
322 int free_str = 0;
323
324 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
325 return -2;
326
327 /* Concatenate the strings. */
328 if (length2 > 0)
329 if (length1 > 0)
330 {
331 char *s = re_malloc (char, len);
332
333 if (BE (s == NULL, 0))
334 return -2;
335 memcpy (s, string1, length1);
336 memcpy (s + length1, string2, length2);
337 str = s;
338 free_str = 1;
339 }
340 else
341 str = string2;
342 else
343 str = string1;
344
345 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
346 ret_len);
347 if (free_str)
348 re_free ((char *) str);
349 return rval;
350}
351
352/* The parameters have the same meaning as those of re_search.
353 Additional parameters:
354 If RET_LEN is nonzero the length of the match is returned (re_match style);
355 otherwise the position of the match is returned. */
356
357static int internal_function
358re_search_stub (struct re_pattern_buffer *bufp, const char *string, int length,
359 int start, int range, int stop, struct re_registers *regs,
360 int ret_len)
361{
362 reg_errcode_t result;
363 regmatch_t *pmatch;
364 int nregs, rval;
365 int eflags = 0;
366#ifdef __UCLIBC_HAS_THREADS__
367 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
368#endif
369 /* Check for out-of-range. */
370 if (BE (start < 0 || start > length, 0))
371 return -1;
372 if (BE (start + range > length, 0))
373 range = length - start;
374 else if (BE (start + range < 0, 0))
375 range = -start;
376
377 __libc_lock_lock (dfa->lock);
378
379 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
380 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
381
382 /* Compile fastmap if we haven't yet. */
383 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
384 re_compile_fastmap (bufp);
385
386 if (BE (bufp->no_sub, 0))
387 regs = NULL;
388
389 /* We need at least 1 register. */
390 if (regs == NULL)
391 nregs = 1;
392 else if (BE (bufp->regs_allocated == REGS_FIXED &&
393 regs->num_regs < bufp->re_nsub + 1, 0))
394 {
395 nregs = regs->num_regs;
396 if (BE (nregs < 1, 0))
397 {
398 /* Nothing can be copied to regs. */
399 regs = NULL;
400 nregs = 1;
401 }
402 }
403 else
404 nregs = bufp->re_nsub + 1;
405 pmatch = re_malloc (regmatch_t, nregs);
406 if (BE (pmatch == NULL, 0))
407 {
408 rval = -2;
409 goto out;
410 }
411
412 result = re_search_internal (bufp, string, length, start, range, stop,
413 nregs, pmatch, eflags);
414
415 rval = 0;
416
417 /* I hope we needn't fill ther regs with -1's when no match was found. */
418 if (result != REG_NOERROR)
419 rval = -1;
420 else if (regs != NULL)
421 {
422 /* If caller wants register contents data back, copy them. */
423 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
424 bufp->regs_allocated);
425 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
426 rval = -2;
427 }
428
429 if (BE (rval == 0, 1))
430 {
431 if (ret_len)
432 {
433 assert (pmatch[0].rm_so == start);
434 rval = pmatch[0].rm_eo - start;
435 }
436 else
437 rval = pmatch[0].rm_so;
438 }
439 re_free (pmatch);
440 out:
441 __libc_lock_unlock (dfa->lock);
442 return rval;
443}
444
445static unsigned internal_function
446re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, int nregs,
447 int regs_allocated)
448{
449 int rval = REGS_REALLOCATE;
450 int i;
451 int need_regs = nregs + 1;
452 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
453 uses. */
454
455 /* Have the register data arrays been allocated? */
456 if (regs_allocated == REGS_UNALLOCATED)
457 { /* No. So allocate them with malloc. */
458 regs->start = re_malloc (regoff_t, need_regs);
459 regs->end = re_malloc (regoff_t, need_regs);
460 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
461 return REGS_UNALLOCATED;
462 regs->num_regs = need_regs;
463 }
464 else if (regs_allocated == REGS_REALLOCATE)
465 { /* Yes. If we need more elements than were already
466 allocated, reallocate them. If we need fewer, just
467 leave it alone. */
468 if (BE (need_regs > regs->num_regs, 0))
469 {
470 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
471 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
472 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
473 return REGS_UNALLOCATED;
474 regs->start = new_start;
475 regs->end = new_end;
476 regs->num_regs = need_regs;
477 }
478 }
479 else
480 {
481 assert (regs_allocated == REGS_FIXED);
482 /* This function may not be called with REGS_FIXED and nregs too big. */
483 assert (regs->num_regs >= nregs);
484 rval = REGS_FIXED;
485 }
486
487 /* Copy the regs. */
488 for (i = 0; i < nregs; ++i)
489 {
490 regs->start[i] = pmatch[i].rm_so;
491 regs->end[i] = pmatch[i].rm_eo;
492 }
493 for ( ; i < regs->num_regs; ++i)
494 regs->start[i] = regs->end[i] = -1;
495
496 return rval;
497}
498
499/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
500 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
501 this memory for recording register information. STARTS and ENDS
502 must be allocated using the malloc library routine, and must each
503 be at least NUM_REGS * sizeof (regoff_t) bytes long.
504
505 If NUM_REGS == 0, then subsequent matches should allocate their own
506 register data.
507
508 Unless this function is called, the first search or match using
509 PATTERN_BUFFER will allocate its own register data, without
510 freeing the old data. */
511
512void
513re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
514 unsigned num_regs, regoff_t *starts, regoff_t *ends)
515{
516 if (num_regs)
517 {
518 bufp->regs_allocated = REGS_REALLOCATE;
519 regs->num_regs = num_regs;
520 regs->start = starts;
521 regs->end = ends;
522 }
523 else
524 {
525 bufp->regs_allocated = REGS_UNALLOCATED;
526 regs->num_regs = 0;
527 regs->start = regs->end = (regoff_t *) 0;
528 }
529}
530
531/* Entry points compatible with 4.2 BSD regex library. We don't define
532 them unless specifically requested. */
533
534#if defined _REGEX_RE_COMP || defined __UCLIBC__
535int
536weak_function
537re_exec (const char *s)
538{
539 return 0 == regexec (re_comp_buf, s, 0, NULL, 0);
540}
541#endif
542
543/* Internal entry point. */
544
545/* Searches for a compiled pattern PREG in the string STRING, whose
546 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
547 mingings with regexec. START, and RANGE have the same meanings
548 with re_search.
549 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
550 otherwise return the error code.
551 Note: We assume front end functions already check ranges.
552 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
553static reg_errcode_t internal_function
554re_search_internal (const regex_t *preg, const char *string, int length,
555 int start, int range, int stop, size_t nmatch,
556 regmatch_t pmatch[], int eflags)
557{
558 reg_errcode_t err;
559 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
560 int left_lim, right_lim, incr;
561 int fl_longest_match, match_first, match_kind, match_last = -1;
562 int extra_nmatch;
563 int sb, ch;
564 re_match_context_t mctx;
565 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
566 && range && !preg->can_be_null) ? preg->fastmap : NULL;
567 RE_TRANSLATE_TYPE t = preg->translate;
568
569 memset (&mctx, '\0', sizeof (re_match_context_t));
570 mctx.dfa = dfa;
571
572 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
573 nmatch -= extra_nmatch;
574
575 /* Check if the DFA haven't been compiled. */
576 if (BE (preg->used == 0 || dfa->init_state == NULL
577 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
578 || dfa->init_state_begbuf == NULL, 0))
579 return REG_NOMATCH;
580
581#ifdef DEBUG
582 /* We assume front-end functions already check them. */
583 assert (start + range >= 0 && start + range <= length);
584#endif
585
586 /* If initial states with non-begbuf contexts have no elements,
587 the regex must be anchored. If preg->newline_anchor is set,
588 we'll never use init_state_nl, so do not check it. */
589 if (dfa->init_state->nodes.nelem == 0
590 && dfa->init_state_word->nodes.nelem == 0
591 && (dfa->init_state_nl->nodes.nelem == 0
592 || !preg->newline_anchor))
593 {
594 if (start != 0 && start + range != 0)
595 return REG_NOMATCH;
596 start = range = 0;
597 }
598
599 /* We must check the longest matching, if nmatch > 0. */
600 fl_longest_match = (nmatch != 0 || dfa->nbackref);
601
602 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
603 preg->translate, preg->syntax & RE_ICASE, dfa);
604 if (BE (err != REG_NOERROR, 0))
605 goto free_return;
606 mctx.input.stop = stop;
607 mctx.input.raw_stop = stop;
608 mctx.input.newline_anchor = preg->newline_anchor;
609
610 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
611 if (BE (err != REG_NOERROR, 0))
612 goto free_return;
613
614 /* We will log all the DFA states through which the dfa pass,
615 if nmatch > 1, or this dfa has "multibyte node", which is a
616 back-reference or a node which can accept multibyte character or
617 multi character collating element. */
618 if (nmatch > 1 || dfa->has_mb_node)
619 {
620 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
621 if (BE (mctx.state_log == NULL, 0))
622 {
623 err = REG_ESPACE;
624 goto free_return;
625 }
626 }
627 else
628 mctx.state_log = NULL;
629
630 match_first = start;
631 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
632 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
633
634 /* Check incrementally whether of not the input string match. */
635 incr = (range < 0) ? -1 : 1;
636 left_lim = (range < 0) ? start + range : start;
637 right_lim = (range < 0) ? start : start + range;
638 sb = dfa->mb_cur_max == 1;
639 match_kind =
640 (fastmap
641 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
642 | (range >= 0 ? 2 : 0)
643 | (t != NULL ? 1 : 0))
644 : 8);
645
646 for (;; match_first += incr)
647 {
648 err = REG_NOMATCH;
649 if (match_first < left_lim || right_lim < match_first)
650 goto free_return;
651
652 /* Advance as rapidly as possible through the string, until we
653 find a plausible place to start matching. This may be done
654 with varying efficiency, so there are various possibilities:
655 only the most common of them are specialized, in order to
656 save on code size. We use a switch statement for speed. */
657 switch (match_kind)
658 {
659 case 8:
660 /* No fastmap. */
661 break;
662
663 case 7:
664 /* Fastmap with single-byte translation, match forward. */
665 while (BE (match_first < right_lim, 1)
666 && !fastmap[t[(unsigned char) string[match_first]]])
667 ++match_first;
668 goto forward_match_found_start_or_reached_end;
669
670 case 6:
671 /* Fastmap without translation, match forward. */
672 while (BE (match_first < right_lim, 1)
673 && !fastmap[(unsigned char) string[match_first]])
674 ++match_first;
675
676 forward_match_found_start_or_reached_end:
677 if (BE (match_first == right_lim, 0))
678 {
679 ch = match_first >= length
680 ? 0 : (unsigned char) string[match_first];
681 if (!fastmap[t ? t[ch] : ch])
682 goto free_return;
683 }
684 break;
685
686 case 4:
687 case 5:
688 /* Fastmap without multi-byte translation, match backwards. */
689 while (match_first >= left_lim)
690 {
691 ch = match_first >= length
692 ? 0 : (unsigned char) string[match_first];
693 if (fastmap[t ? t[ch] : ch])
694 break;
695 --match_first;
696 }
697 if (match_first < left_lim)
698 goto free_return;
699 break;
700
701 default:
702 /* In this case, we can't determine easily the current byte,
703 since it might be a component byte of a multibyte
704 character. Then we use the constructed buffer instead. */
705 for (;;)
706 {
707 /* If MATCH_FIRST is out of the valid range, reconstruct the
708 buffers. */
709 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
710 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
711 {
712 err = re_string_reconstruct (&mctx.input, match_first,
713 eflags);
714 if (BE (err != REG_NOERROR, 0))
715 goto free_return;
716
717 offset = match_first - mctx.input.raw_mbs_idx;
718 }
719 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
720 Note that MATCH_FIRST must not be smaller than 0. */
721 ch = (match_first >= length
722 ? 0 : re_string_byte_at (&mctx.input, offset));
723 if (fastmap[ch])
724 break;
725 match_first += incr;
726 if (match_first < left_lim || match_first > right_lim)
727 {
728 err = REG_NOMATCH;
729 goto free_return;
730 }
731 }
732 break;
733 }
734
735 /* Reconstruct the buffers so that the matcher can assume that
736 the matching starts from the beginning of the buffer. */
737 err = re_string_reconstruct (&mctx.input, match_first, eflags);
738 if (BE (err != REG_NOERROR, 0))
739 goto free_return;
740
741#ifdef RE_ENABLE_I18N
742 /* Don't consider this char as a possible match start if it part,
743 yet isn't the head, of a multibyte character. */
744 if (!sb && !re_string_first_byte (&mctx.input, 0))
745 continue;
746#endif
747
748 /* It seems to be appropriate one, then use the matcher. */
749 /* We assume that the matching starts from 0. */
750 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
751 match_last = check_matching (&mctx, fl_longest_match,
752 range >= 0 ? &match_first : NULL);
753 if (match_last != -1)
754 {
755 if (BE (match_last == -2, 0))
756 {
757 err = REG_ESPACE;
758 goto free_return;
759 }
760 else
761 {
762 mctx.match_last = match_last;
763 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
764 {
765 re_dfastate_t *pstate = mctx.state_log[match_last];
766 mctx.last_node = check_halt_state_context (&mctx, pstate,
767 match_last);
768 }
769 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
770 || dfa->nbackref)
771 {
772 err = prune_impossible_nodes (&mctx);
773 if (err == REG_NOERROR)
774 break;
775 if (BE (err != REG_NOMATCH, 0))
776 goto free_return;
777 match_last = -1;
778 }
779 else
780 break; /* We found a match. */
781 }
782 }
783
784 match_ctx_clean (&mctx);
785 }
786
787#ifdef DEBUG
788 assert (match_last != -1);
789 assert (err == REG_NOERROR);
790#endif
791
792 /* Set pmatch[] if we need. */
793 if (nmatch > 0)
794 {
795 int reg_idx;
796
797 /* Initialize registers. */
798 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
799 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
800
801 /* Set the points where matching start/end. */
802 pmatch[0].rm_so = 0;
803 pmatch[0].rm_eo = mctx.match_last;
804
805 if (!preg->no_sub && nmatch > 1)
806 {
807 err = set_regs (preg, &mctx, nmatch, pmatch,
808 dfa->has_plural_match && dfa->nbackref > 0);
809 if (BE (err != REG_NOERROR, 0))
810 goto free_return;
811 }
812
813 /* At last, add the offset to the each registers, since we slided
814 the buffers so that we could assume that the matching starts
815 from 0. */
816 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
817 if (pmatch[reg_idx].rm_so != -1)
818 {
819#ifdef RE_ENABLE_I18N
820 if (BE (mctx.input.offsets_needed != 0, 0))
821 {
822 pmatch[reg_idx].rm_so =
823 (pmatch[reg_idx].rm_so == mctx.input.valid_len
824 ? mctx.input.valid_raw_len
825 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
826 pmatch[reg_idx].rm_eo =
827 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
828 ? mctx.input.valid_raw_len
829 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
830 }
831#else
832 assert (mctx.input.offsets_needed == 0);
833#endif
834 pmatch[reg_idx].rm_so += match_first;
835 pmatch[reg_idx].rm_eo += match_first;
836 }
837 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
838 {
839 pmatch[nmatch + reg_idx].rm_so = -1;
840 pmatch[nmatch + reg_idx].rm_eo = -1;
841 }
842
843 if (dfa->subexp_map)
844 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
845 if (dfa->subexp_map[reg_idx] != reg_idx)
846 {
847 pmatch[reg_idx + 1].rm_so
848 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
849 pmatch[reg_idx + 1].rm_eo
850 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
851 }
852 }
853
854 free_return:
855 re_free (mctx.state_log);
856 if (dfa->nbackref)
857 match_ctx_free (&mctx);
858 re_string_destruct (&mctx.input);
859 return err;
860}
861
862static reg_errcode_t internal_function
863prune_impossible_nodes (re_match_context_t *mctx)
864{
865 const re_dfa_t *const dfa = mctx->dfa;
866 int halt_node, match_last;
867 reg_errcode_t ret;
868 re_dfastate_t **sifted_states;
869 re_dfastate_t **lim_states = NULL;
870 re_sift_context_t sctx;
871#ifdef DEBUG
872 assert (mctx->state_log != NULL);
873#endif
874 match_last = mctx->match_last;
875 halt_node = mctx->last_node;
876 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
877 if (BE (sifted_states == NULL, 0))
878 {
879 ret = REG_ESPACE;
880 goto free_return;
881 }
882 if (dfa->nbackref)
883 {
884 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
885 if (BE (lim_states == NULL, 0))
886 {
887 ret = REG_ESPACE;
888 goto free_return;
889 }
890 while (1)
891 {
892 memset (lim_states, '\0',
893 sizeof (re_dfastate_t *) * (match_last + 1));
894 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
895 match_last);
896 ret = sift_states_backward (mctx, &sctx);
897 re_node_set_free (&sctx.limits);
898 if (BE (ret != REG_NOERROR, 0))
899 goto free_return;
900 if (sifted_states[0] != NULL || lim_states[0] != NULL)
901 break;
902 do
903 {
904 --match_last;
905 if (match_last < 0)
906 {
907 ret = REG_NOMATCH;
908 goto free_return;
909 }
910 } while (mctx->state_log[match_last] == NULL
911 || !mctx->state_log[match_last]->halt);
912 halt_node = check_halt_state_context (mctx,
913 mctx->state_log[match_last],
914 match_last);
915 }
916 ret = merge_state_array (dfa, sifted_states, lim_states,
917 match_last + 1);
918 re_free (lim_states);
919 lim_states = NULL;
920 if (BE (ret != REG_NOERROR, 0))
921 goto free_return;
922 }
923 else
924 {
925 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
926 ret = sift_states_backward (mctx, &sctx);
927 re_node_set_free (&sctx.limits);
928 if (BE (ret != REG_NOERROR, 0))
929 goto free_return;
930 }
931 re_free (mctx->state_log);
932 mctx->state_log = sifted_states;
933 sifted_states = NULL;
934 mctx->last_node = halt_node;
935 mctx->match_last = match_last;
936 ret = REG_NOERROR;
937 free_return:
938 re_free (sifted_states);
939 re_free (lim_states);
940 return ret;
941}
942
943/* Acquire an initial state and return it.
944 We must select appropriate initial state depending on the context,
945 since initial states may have constraints like "\<", "^", etc.. */
946
947static __inline__ re_dfastate_t *
948__attribute ((always_inline)) internal_function
949acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
950 int idx)
951{
952 const re_dfa_t *const dfa = mctx->dfa;
953 if (dfa->init_state->has_constraint)
954 {
955 unsigned int context;
956 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
957 if (IS_WORD_CONTEXT (context))
958 return dfa->init_state_word;
959 else if (IS_ORDINARY_CONTEXT (context))
960 return dfa->init_state;
961 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
962 return dfa->init_state_begbuf;
963 else if (IS_NEWLINE_CONTEXT (context))
964 return dfa->init_state_nl;
965 else if (IS_BEGBUF_CONTEXT (context))
966 {
967 /* It is relatively rare case, then calculate on demand. */
968 return re_acquire_state_context (err, dfa,
969 dfa->init_state->entrance_nodes,
970 context);
971 }
972 else
973 /* Must not happen? */
974 return dfa->init_state;
975 }
976 else
977 return dfa->init_state;
978}
979
980/* Check whether the regular expression match input string INPUT or not,
981 and return the index where the matching end, return -1 if not match,
982 or return -2 in case of an error.
983 FL_LONGEST_MATCH means we want the POSIX longest matching.
984 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
985 next place where we may want to try matching.
986 Note that the matcher assume that the maching starts from the current
987 index of the buffer. */
988
989static int
990internal_function
991check_matching (re_match_context_t *mctx, int fl_longest_match,
992 int *p_match_first)
993{
994 const re_dfa_t *const dfa = mctx->dfa;
995 reg_errcode_t err;
996 int match = 0;
997 int match_last = -1;
998 int cur_str_idx = re_string_cur_idx (&mctx->input);
999 re_dfastate_t *cur_state;
1000 int at_init_state = p_match_first != NULL;
1001 int next_start_idx = cur_str_idx;
1002
1003 err = REG_NOERROR;
1004 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1005 /* An initial state must not be NULL (invalid). */
1006 if (BE (cur_state == NULL, 0))
1007 {
1008 assert (err == REG_ESPACE);
1009 return -2;
1010 }
1011
1012 if (mctx->state_log != NULL)
1013 {
1014 mctx->state_log[cur_str_idx] = cur_state;
1015
1016 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1017 later. E.g. Processing back references. */
1018 if (BE (dfa->nbackref, 0))
1019 {
1020 at_init_state = 0;
1021 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1022 if (BE (err != REG_NOERROR, 0))
1023 return err;
1024
1025 if (cur_state->has_backref)
1026 {
1027 err = transit_state_bkref (mctx, &cur_state->nodes);
1028 if (BE (err != REG_NOERROR, 0))
1029 return err;
1030 }
1031 }
1032 }
1033
1034 /* If the RE accepts NULL string. */
1035 if (BE (cur_state->halt, 0))
1036 {
1037 if (!cur_state->has_constraint
1038 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1039 {
1040 if (!fl_longest_match)
1041 return cur_str_idx;
1042 else
1043 {
1044 match_last = cur_str_idx;
1045 match = 1;
1046 }
1047 }
1048 }
1049
1050 while (!re_string_eoi (&mctx->input))
1051 {
1052 re_dfastate_t *old_state = cur_state;
1053 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1054
1055 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1056 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1057 && mctx->input.valid_len < mctx->input.len))
1058 {
1059 err = extend_buffers (mctx);
1060 if (BE (err != REG_NOERROR, 0))
1061 {
1062 assert (err == REG_ESPACE);
1063 return -2;
1064 }
1065 }
1066
1067 cur_state = transit_state (&err, mctx, cur_state);
1068 if (mctx->state_log != NULL)
1069 cur_state = merge_state_with_log (&err, mctx, cur_state);
1070
1071 if (cur_state == NULL)
1072 {
1073 /* Reached the invalid state or an error. Try to recover a valid
1074 state using the state log, if available and if we have not
1075 already found a valid (even if not the longest) match. */
1076 if (BE (err != REG_NOERROR, 0))
1077 return -2;
1078
1079 if (mctx->state_log == NULL
1080 || (match && !fl_longest_match)
1081 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1082 break;
1083 }
1084
1085 if (BE (at_init_state, 0))
1086 {
1087 if (old_state == cur_state)
1088 next_start_idx = next_char_idx;
1089 else
1090 at_init_state = 0;
1091 }
1092
1093 if (cur_state->halt)
1094 {
1095 /* Reached a halt state.
1096 Check the halt state can satisfy the current context. */
1097 if (!cur_state->has_constraint
1098 || check_halt_state_context (mctx, cur_state,
1099 re_string_cur_idx (&mctx->input)))
1100 {
1101 /* We found an appropriate halt state. */
1102 match_last = re_string_cur_idx (&mctx->input);
1103 match = 1;
1104
1105 /* We found a match, do not modify match_first below. */
1106 p_match_first = NULL;
1107 if (!fl_longest_match)
1108 break;
1109 }
1110 }
1111 }
1112
1113 if (p_match_first)
1114 *p_match_first += next_start_idx;
1115
1116 return match_last;
1117}
1118
1119/* Check NODE match the current context. */
1120
1121static int
1122internal_function
1123check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1124{
1125 re_token_type_t type = dfa->nodes[node].type;
1126 unsigned int constraint = dfa->nodes[node].constraint;
1127 if (type != END_OF_RE)
1128 return 0;
1129 if (!constraint)
1130 return 1;
1131 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1132 return 0;
1133 return 1;
1134}
1135
1136/* Check the halt state STATE match the current context.
1137 Return 0 if not match, if the node, STATE has, is a halt node and
1138 match the context, return the node. */
1139
1140static int
1141internal_function
1142check_halt_state_context (const re_match_context_t *mctx,
1143 const re_dfastate_t *state, int idx)
1144{
1145 int i;
1146 unsigned int context;
1147#ifdef DEBUG
1148 assert (state->halt);
1149#endif
1150 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1151 for (i = 0; i < state->nodes.nelem; ++i)
1152 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1153 return state->nodes.elems[i];
1154 return 0;
1155}
1156
1157/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1158 corresponding to the DFA).
1159 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1160 of errors. */
1161
1162static int
1163internal_function
1164proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1165 int *pidx, int node, re_node_set *eps_via_nodes,
1166 struct re_fail_stack_t *fs)
1167{
1168 const re_dfa_t *const dfa = mctx->dfa;
1169 int i, err;
1170 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1171 {
1172 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1173 re_node_set *edests = &dfa->edests[node];
1174 int dest_node;
1175 err = re_node_set_insert (eps_via_nodes, node);
1176 if (BE (err < 0, 0))
1177 return -2;
1178 /* Pick up a valid destination, or return -1 if none is found. */
1179 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1180 {
1181 int candidate = edests->elems[i];
1182 if (!re_node_set_contains (cur_nodes, candidate))
1183 continue;
1184 if (dest_node == -1)
1185 dest_node = candidate;
1186
1187 else
1188 {
1189 /* In order to avoid infinite loop like "(a*)*", return the second
1190 epsilon-transition if the first was already considered. */
1191 if (re_node_set_contains (eps_via_nodes, dest_node))
1192 return candidate;
1193
1194 /* Otherwise, push the second epsilon-transition on the fail stack. */
1195 else if (fs != NULL
1196 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1197 eps_via_nodes))
1198 return -2;
1199
1200 /* We know we are going to exit. */
1201 break;
1202 }
1203 }
1204 return dest_node;
1205 }
1206 else
1207 {
1208 int naccepted = 0;
1209 re_token_type_t type = dfa->nodes[node].type;
1210
1211#ifdef RE_ENABLE_I18N
1212 if (dfa->nodes[node].accept_mb)
1213 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1214 else
1215#endif /* RE_ENABLE_I18N */
1216 if (type == OP_BACK_REF)
1217 {
1218 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1219 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1220 if (fs != NULL)
1221 {
1222 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1223 return -1;
1224 else if (naccepted)
1225 {
1226 char *buf = (char *) re_string_get_buffer (&mctx->input);
1227 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1228 naccepted) != 0)
1229 return -1;
1230 }
1231 }
1232
1233 if (naccepted == 0)
1234 {
1235 int dest_node;
1236 err = re_node_set_insert (eps_via_nodes, node);
1237 if (BE (err < 0, 0))
1238 return -2;
1239 dest_node = dfa->edests[node].elems[0];
1240 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1241 dest_node))
1242 return dest_node;
1243 }
1244 }
1245
1246 if (naccepted != 0
1247 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1248 {
1249 int dest_node = dfa->nexts[node];
1250 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1251 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1252 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1253 dest_node)))
1254 return -1;
1255 re_node_set_empty (eps_via_nodes);
1256 return dest_node;
1257 }
1258 }
1259 return -1;
1260}
1261
1262static reg_errcode_t
1263internal_function
1264push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1265 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1266{
1267 reg_errcode_t err;
1268 int num = fs->num++;
1269 if (fs->num == fs->alloc)
1270 {
1271 struct re_fail_stack_ent_t *new_array;
1272 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1273 * fs->alloc * 2));
1274 if (new_array == NULL)
1275 return REG_ESPACE;
1276 fs->alloc *= 2;
1277 fs->stack = new_array;
1278 }
1279 fs->stack[num].idx = str_idx;
1280 fs->stack[num].node = dest_node;
1281 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1282 if (fs->stack[num].regs == NULL)
1283 return REG_ESPACE;
1284 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1285 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1286 return err;
1287}
1288
1289static int
1290internal_function
1291pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1292 regmatch_t *regs, re_node_set *eps_via_nodes)
1293{
1294 int num = --fs->num;
1295 assert (num >= 0);
1296 *pidx = fs->stack[num].idx;
1297 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1298 re_node_set_free (eps_via_nodes);
1299 re_free (fs->stack[num].regs);
1300 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1301 return fs->stack[num].node;
1302}
1303
1304/* Set the positions where the subexpressions are starts/ends to registers
1305 PMATCH.
1306 Note: We assume that pmatch[0] is already set, and
1307 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1308
1309static reg_errcode_t
1310internal_function
1311set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1312 regmatch_t *pmatch, int fl_backtrack)
1313{
1314 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1315 int idx, cur_node;
1316 re_node_set eps_via_nodes;
1317 struct re_fail_stack_t *fs;
1318 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1319 regmatch_t *prev_idx_match;
1320 int prev_idx_match_malloced = 0;
1321
1322#ifdef DEBUG
1323 assert (nmatch > 1);
1324 assert (mctx->state_log != NULL);
1325#endif
1326 if (fl_backtrack)
1327 {
1328 fs = &fs_body;
1329 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1330 if (fs->stack == NULL)
1331 return REG_ESPACE;
1332 }
1333 else
1334 fs = NULL;
1335
1336 cur_node = dfa->init_node;
1337 re_node_set_init_empty (&eps_via_nodes);
1338
1339 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1340 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1341 else
1342 {
1343 prev_idx_match = re_malloc (regmatch_t, nmatch);
1344 if (prev_idx_match == NULL)
1345 {
1346 free_fail_stack_return (fs);
1347 return REG_ESPACE;
1348 }
1349 prev_idx_match_malloced = 1;
1350 }
1351 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1352
1353 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1354 {
1355 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1356
1357 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1358 {
1359 int reg_idx;
1360 if (fs)
1361 {
1362 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1363 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1364 break;
1365 if (reg_idx == nmatch)
1366 {
1367 re_node_set_free (&eps_via_nodes);
1368 if (prev_idx_match_malloced)
1369 re_free (prev_idx_match);
1370 return free_fail_stack_return (fs);
1371 }
1372 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1373 &eps_via_nodes);
1374 }
1375 else
1376 {
1377 re_node_set_free (&eps_via_nodes);
1378 if (prev_idx_match_malloced)
1379 re_free (prev_idx_match);
1380 return REG_NOERROR;
1381 }
1382 }
1383
1384 /* Proceed to next node. */
1385 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1386 &eps_via_nodes, fs);
1387
1388 if (BE (cur_node < 0, 0))
1389 {
1390 if (BE (cur_node == -2, 0))
1391 {
1392 re_node_set_free (&eps_via_nodes);
1393 if (prev_idx_match_malloced)
1394 re_free (prev_idx_match);
1395 free_fail_stack_return (fs);
1396 return REG_ESPACE;
1397 }
1398 if (fs)
1399 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1400 &eps_via_nodes);
1401 else
1402 {
1403 re_node_set_free (&eps_via_nodes);
1404 if (prev_idx_match_malloced)
1405 re_free (prev_idx_match);
1406 return REG_NOMATCH;
1407 }
1408 }
1409 }
1410 re_node_set_free (&eps_via_nodes);
1411 if (prev_idx_match_malloced)
1412 re_free (prev_idx_match);
1413 return free_fail_stack_return (fs);
1414}
1415
1416static reg_errcode_t
1417internal_function
1418free_fail_stack_return (struct re_fail_stack_t *fs)
1419{
1420 if (fs)
1421 {
1422 int fs_idx;
1423 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1424 {
1425 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1426 re_free (fs->stack[fs_idx].regs);
1427 }
1428 re_free (fs->stack);
1429 }
1430 return REG_NOERROR;
1431}
1432
1433static void
1434internal_function
1435update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1436 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1437{
1438 int type = dfa->nodes[cur_node].type;
1439 if (type == OP_OPEN_SUBEXP)
1440 {
1441 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1442
1443 /* We are at the first node of this sub expression. */
1444 if (reg_num < nmatch)
1445 {
1446 pmatch[reg_num].rm_so = cur_idx;
1447 pmatch[reg_num].rm_eo = -1;
1448 }
1449 }
1450 else if (type == OP_CLOSE_SUBEXP)
1451 {
1452 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1453 if (reg_num < nmatch)
1454 {
1455 /* We are at the last node of this sub expression. */
1456 if (pmatch[reg_num].rm_so < cur_idx)
1457 {
1458 pmatch[reg_num].rm_eo = cur_idx;
1459 /* This is a non-empty match or we are not inside an optional
1460 subexpression. Accept this right away. */
1461 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1462 }
1463 else
1464 {
1465 if (dfa->nodes[cur_node].opt_subexp
1466 && prev_idx_match[reg_num].rm_so != -1)
1467 /* We transited through an empty match for an optional
1468 subexpression, like (a?)*, and this is not the subexp's
1469 first match. Copy back the old content of the registers
1470 so that matches of an inner subexpression are undone as
1471 well, like in ((a?))*. */
1472 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1473 else
1474 /* We completed a subexpression, but it may be part of
1475 an optional one, so do not update PREV_IDX_MATCH. */
1476 pmatch[reg_num].rm_eo = cur_idx;
1477 }
1478 }
1479 }
1480}
1481
1482/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1483 and sift the nodes in each states according to the following rules.
1484 Updated state_log will be wrote to STATE_LOG.
1485
1486 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1487 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1488 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1489 the LAST_NODE, we throw away the node `a'.
1490 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1491 string `s' and transit to `b':
1492 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1493 away the node `a'.
1494 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1495 thrown away, we throw away the node `a'.
1496 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1497 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1498 node `a'.
1499 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1500 we throw away the node `a'. */
1501
1502#define STATE_NODE_CONTAINS(state,node) \
1503 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1504
1505static reg_errcode_t
1506internal_function
1507sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1508{
1509 reg_errcode_t err;
1510 int null_cnt = 0;
1511 int str_idx = sctx->last_str_idx;
1512 re_node_set cur_dest;
1513
1514#ifdef DEBUG
1515 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1516#endif
1517
1518 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1519 transit to the last_node and the last_node itself. */
1520 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1521 if (BE (err != REG_NOERROR, 0))
1522 return err;
1523 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1524 if (BE (err != REG_NOERROR, 0))
1525 goto free_return;
1526
1527 /* Then check each states in the state_log. */
1528 while (str_idx > 0)
1529 {
1530 /* Update counters. */
1531 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1532 if (null_cnt > mctx->max_mb_elem_len)
1533 {
1534 memset (sctx->sifted_states, '\0',
1535 sizeof (re_dfastate_t *) * str_idx);
1536 re_node_set_free (&cur_dest);
1537 return REG_NOERROR;
1538 }
1539 re_node_set_empty (&cur_dest);
1540 --str_idx;
1541
1542 if (mctx->state_log[str_idx])
1543 {
1544 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1545 if (BE (err != REG_NOERROR, 0))
1546 goto free_return;
1547 }
1548
1549 /* Add all the nodes which satisfy the following conditions:
1550 - It can epsilon transit to a node in CUR_DEST.
1551 - It is in CUR_SRC.
1552 And update state_log. */
1553 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1554 if (BE (err != REG_NOERROR, 0))
1555 goto free_return;
1556 }
1557 err = REG_NOERROR;
1558 free_return:
1559 re_node_set_free (&cur_dest);
1560 return err;
1561}
1562
1563static reg_errcode_t
1564internal_function
1565build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1566 int str_idx, re_node_set *cur_dest)
1567{
1568 const re_dfa_t *const dfa = mctx->dfa;
1569 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1570 int i;
1571
1572 /* Then build the next sifted state.
1573 We build the next sifted state on `cur_dest', and update
1574 `sifted_states[str_idx]' with `cur_dest'.
1575 Note:
1576 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1577 `cur_src' points the node_set of the old `state_log[str_idx]'
1578 (with the epsilon nodes pre-filtered out). */
1579 for (i = 0; i < cur_src->nelem; i++)
1580 {
1581 int prev_node = cur_src->elems[i];
1582 int naccepted = 0;
1583 int ret;
1584
1585#ifdef DEBUG
1586 re_token_type_t type = dfa->nodes[prev_node].type;
1587 assert (!IS_EPSILON_NODE (type));
1588#endif
1589#ifdef RE_ENABLE_I18N
1590 /* If the node may accept `multi byte'. */
1591 if (dfa->nodes[prev_node].accept_mb)
1592 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1593 str_idx, sctx->last_str_idx);
1594#endif /* RE_ENABLE_I18N */
1595
1596 /* We don't check backreferences here.
1597 See update_cur_sifted_state(). */
1598 if (!naccepted
1599 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1600 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1601 dfa->nexts[prev_node]))
1602 naccepted = 1;
1603
1604 if (naccepted == 0)
1605 continue;
1606
1607 if (sctx->limits.nelem)
1608 {
1609 int to_idx = str_idx + naccepted;
1610 if (check_dst_limits (mctx, &sctx->limits,
1611 dfa->nexts[prev_node], to_idx,
1612 prev_node, str_idx))
1613 continue;
1614 }
1615 ret = re_node_set_insert (cur_dest, prev_node);
1616 if (BE (ret == -1, 0))
1617 return REG_ESPACE;
1618 }
1619
1620 return REG_NOERROR;
1621}
1622
1623/* Helper functions. */
1624
1625static reg_errcode_t
1626internal_function
1627clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1628{
1629 int top = mctx->state_log_top;
1630
1631 if (next_state_log_idx >= mctx->input.bufs_len
1632 || (next_state_log_idx >= mctx->input.valid_len
1633 && mctx->input.valid_len < mctx->input.len))
1634 {
1635 reg_errcode_t err;
1636 err = extend_buffers (mctx);
1637 if (BE (err != REG_NOERROR, 0))
1638 return err;
1639 }
1640
1641 if (top < next_state_log_idx)
1642 {
1643 memset (mctx->state_log + top + 1, '\0',
1644 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1645 mctx->state_log_top = next_state_log_idx;
1646 }
1647 return REG_NOERROR;
1648}
1649
1650static reg_errcode_t
1651internal_function
1652merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1653 re_dfastate_t **src, int num)
1654{
1655 int st_idx;
1656 reg_errcode_t err;
1657 for (st_idx = 0; st_idx < num; ++st_idx)
1658 {
1659 if (dst[st_idx] == NULL)
1660 dst[st_idx] = src[st_idx];
1661 else if (src[st_idx] != NULL)
1662 {
1663 re_node_set merged_set;
1664 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1665 &src[st_idx]->nodes);
1666 if (BE (err != REG_NOERROR, 0))
1667 return err;
1668 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1669 re_node_set_free (&merged_set);
1670 if (BE (err != REG_NOERROR, 0))
1671 return err;
1672 }
1673 }
1674 return REG_NOERROR;
1675}
1676
1677static reg_errcode_t
1678internal_function
1679update_cur_sifted_state (const re_match_context_t *mctx,
1680 re_sift_context_t *sctx, int str_idx,
1681 re_node_set *dest_nodes)
1682{
1683 const re_dfa_t *const dfa = mctx->dfa;
1684 reg_errcode_t err = REG_NOERROR;
1685 const re_node_set *candidates;
1686 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1687 : &mctx->state_log[str_idx]->nodes);
1688
1689 if (dest_nodes->nelem == 0)
1690 sctx->sifted_states[str_idx] = NULL;
1691 else
1692 {
1693 if (candidates)
1694 {
1695 /* At first, add the nodes which can epsilon transit to a node in
1696 DEST_NODE. */
1697 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1698 if (BE (err != REG_NOERROR, 0))
1699 return err;
1700
1701 /* Then, check the limitations in the current sift_context. */
1702 if (sctx->limits.nelem)
1703 {
1704 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1705 mctx->bkref_ents, str_idx);
1706 if (BE (err != REG_NOERROR, 0))
1707 return err;
1708 }
1709 }
1710
1711 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1712 if (BE (err != REG_NOERROR, 0))
1713 return err;
1714 }
1715
1716 if (candidates && mctx->state_log[str_idx]->has_backref)
1717 {
1718 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1719 if (BE (err != REG_NOERROR, 0))
1720 return err;
1721 }
1722 return REG_NOERROR;
1723}
1724
1725static reg_errcode_t
1726internal_function
1727add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1728 const re_node_set *candidates)
1729{
1730 reg_errcode_t err = REG_NOERROR;
1731 int i;
1732
1733 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1734 if (BE (err != REG_NOERROR, 0))
1735 return err;
1736
1737 if (!state->inveclosure.alloc)
1738 {
1739 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1740 if (BE (err != REG_NOERROR, 0))
1741 return REG_ESPACE;
1742 for (i = 0; i < dest_nodes->nelem; i++)
1743 re_node_set_merge (&state->inveclosure,
1744 dfa->inveclosures + dest_nodes->elems[i]);
1745 }
1746 return re_node_set_add_intersect (dest_nodes, candidates,
1747 &state->inveclosure);
1748}
1749
1750static reg_errcode_t
1751internal_function
1752sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1753 const re_node_set *candidates)
1754{
1755 int ecl_idx;
1756 reg_errcode_t err;
1757 re_node_set *inv_eclosure = dfa->inveclosures + node;
1758 re_node_set except_nodes;
1759 re_node_set_init_empty (&except_nodes);
1760 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1761 {
1762 int cur_node = inv_eclosure->elems[ecl_idx];
1763 if (cur_node == node)
1764 continue;
1765 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1766 {
1767 int edst1 = dfa->edests[cur_node].elems[0];
1768 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1769 ? dfa->edests[cur_node].elems[1] : -1);
1770 if ((!re_node_set_contains (inv_eclosure, edst1)
1771 && re_node_set_contains (dest_nodes, edst1))
1772 || (edst2 > 0
1773 && !re_node_set_contains (inv_eclosure, edst2)
1774 && re_node_set_contains (dest_nodes, edst2)))
1775 {
1776 err = re_node_set_add_intersect (&except_nodes, candidates,
1777 dfa->inveclosures + cur_node);
1778 if (BE (err != REG_NOERROR, 0))
1779 {
1780 re_node_set_free (&except_nodes);
1781 return err;
1782 }
1783 }
1784 }
1785 }
1786 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1787 {
1788 int cur_node = inv_eclosure->elems[ecl_idx];
1789 if (!re_node_set_contains (&except_nodes, cur_node))
1790 {
1791 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1792 re_node_set_remove_at (dest_nodes, idx);
1793 }
1794 }
1795 re_node_set_free (&except_nodes);
1796 return REG_NOERROR;
1797}
1798
1799static int
1800internal_function
1801check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1802 int dst_node, int dst_idx, int src_node, int src_idx)
1803{
1804 const re_dfa_t *const dfa = mctx->dfa;
1805 int lim_idx, src_pos, dst_pos;
1806
1807 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1808 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1809 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1810 {
1811 int subexp_idx;
1812 struct re_backref_cache_entry *ent;
1813 ent = mctx->bkref_ents + limits->elems[lim_idx];
1814 subexp_idx = dfa->nodes[ent->node].opr.idx;
1815
1816 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1817 subexp_idx, dst_node, dst_idx,
1818 dst_bkref_idx);
1819 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1820 subexp_idx, src_node, src_idx,
1821 src_bkref_idx);
1822
1823 /* In case of:
1824 <src> <dst> ( <subexp> )
1825 ( <subexp> ) <src> <dst>
1826 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1827 if (src_pos == dst_pos)
1828 continue; /* This is unrelated limitation. */
1829 else
1830 return 1;
1831 }
1832 return 0;
1833}
1834
1835static int
1836internal_function
1837check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1838 int subexp_idx, int from_node, int bkref_idx)
1839{
1840 const re_dfa_t *const dfa = mctx->dfa;
1841 const re_node_set *eclosures = dfa->eclosures + from_node;
1842 int node_idx;
1843
1844 /* Else, we are on the boundary: examine the nodes on the epsilon
1845 closure. */
1846 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1847 {
1848 int node = eclosures->elems[node_idx];
1849 switch (dfa->nodes[node].type)
1850 {
1851 case OP_BACK_REF:
1852 if (bkref_idx != -1)
1853 {
1854 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1855 do
1856 {
1857 int dst, cpos;
1858
1859 if (ent->node != node)
1860 continue;
1861
1862 if (subexp_idx < BITSET_WORD_BITS
1863 && !(ent->eps_reachable_subexps_map
1864 & ((bitset_word_t) 1 << subexp_idx)))
1865 continue;
1866
1867 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1868 OP_CLOSE_SUBEXP cases below. But, if the
1869 destination node is the same node as the source
1870 node, don't recurse because it would cause an
1871 infinite loop: a regex that exhibits this behavior
1872 is ()\1*\1* */
1873 dst = dfa->edests[node].elems[0];
1874 if (dst == from_node)
1875 {
1876 if (boundaries & 1)
1877 return -1;
1878 else /* if (boundaries & 2) */
1879 return 0;
1880 }
1881
1882 cpos =
1883 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1884 dst, bkref_idx);
1885 if (cpos == -1 /* && (boundaries & 1) */)
1886 return -1;
1887 if (cpos == 0 && (boundaries & 2))
1888 return 0;
1889
1890 if (subexp_idx < BITSET_WORD_BITS)
1891 ent->eps_reachable_subexps_map
1892 &= ~((bitset_word_t) 1 << subexp_idx);
1893 }
1894 while (ent++->more);
1895 }
1896 break;
1897
1898 case OP_OPEN_SUBEXP:
1899 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1900 return -1;
1901 break;
1902
1903 case OP_CLOSE_SUBEXP:
1904 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1905 return 0;
1906 break;
1907
1908 default:
1909 break;
1910 }
1911 }
1912
1913 return (boundaries & 2) ? 1 : 0;
1914}
1915
1916static int
1917internal_function
1918check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
1919 int subexp_idx, int from_node, int str_idx,
1920 int bkref_idx)
1921{
1922 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1923 int boundaries;
1924
1925 /* If we are outside the range of the subexpression, return -1 or 1. */
1926 if (str_idx < lim->subexp_from)
1927 return -1;
1928
1929 if (lim->subexp_to < str_idx)
1930 return 1;
1931
1932 /* If we are within the subexpression, return 0. */
1933 boundaries = (str_idx == lim->subexp_from);
1934 boundaries |= (str_idx == lim->subexp_to) << 1;
1935 if (boundaries == 0)
1936 return 0;
1937
1938 /* Else, examine epsilon closure. */
1939 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1940 from_node, bkref_idx);
1941}
1942
1943/* Check the limitations of sub expressions LIMITS, and remove the nodes
1944 which are against limitations from DEST_NODES. */
1945
1946static reg_errcode_t
1947internal_function
1948check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1949 const re_node_set *candidates, re_node_set *limits,
1950 struct re_backref_cache_entry *bkref_ents, int str_idx)
1951{
1952 reg_errcode_t err;
1953 int node_idx, lim_idx;
1954
1955 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1956 {
1957 int subexp_idx;
1958 struct re_backref_cache_entry *ent;
1959 ent = bkref_ents + limits->elems[lim_idx];
1960
1961 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1962 continue; /* This is unrelated limitation. */
1963
1964 subexp_idx = dfa->nodes[ent->node].opr.idx;
1965 if (ent->subexp_to == str_idx)
1966 {
1967 int ops_node = -1;
1968 int cls_node = -1;
1969 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1970 {
1971 int node = dest_nodes->elems[node_idx];
1972 re_token_type_t type = dfa->nodes[node].type;
1973 if (type == OP_OPEN_SUBEXP
1974 && subexp_idx == dfa->nodes[node].opr.idx)
1975 ops_node = node;
1976 else if (type == OP_CLOSE_SUBEXP
1977 && subexp_idx == dfa->nodes[node].opr.idx)
1978 cls_node = node;
1979 }
1980
1981 /* Check the limitation of the open subexpression. */
1982 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1983 if (ops_node >= 0)
1984 {
1985 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
1986 candidates);
1987 if (BE (err != REG_NOERROR, 0))
1988 return err;
1989 }
1990
1991 /* Check the limitation of the close subexpression. */
1992 if (cls_node >= 0)
1993 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1994 {
1995 int node = dest_nodes->elems[node_idx];
1996 if (!re_node_set_contains (dfa->inveclosures + node,
1997 cls_node)
1998 && !re_node_set_contains (dfa->eclosures + node,
1999 cls_node))
2000 {
2001 /* It is against this limitation.
2002 Remove it form the current sifted state. */
2003 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2004 candidates);
2005 if (BE (err != REG_NOERROR, 0))
2006 return err;
2007 --node_idx;
2008 }
2009 }
2010 }
2011 else /* (ent->subexp_to != str_idx) */
2012 {
2013 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2014 {
2015 int node = dest_nodes->elems[node_idx];
2016 re_token_type_t type = dfa->nodes[node].type;
2017 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2018 {
2019 if (subexp_idx != dfa->nodes[node].opr.idx)
2020 continue;
2021 /* It is against this limitation.
2022 Remove it form the current sifted state. */
2023 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2024 candidates);
2025 if (BE (err != REG_NOERROR, 0))
2026 return err;
2027 }
2028 }
2029 }
2030 }
2031 return REG_NOERROR;
2032}
2033
2034static reg_errcode_t
2035internal_function
2036sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2037 int str_idx, const re_node_set *candidates)
2038{
2039 const re_dfa_t *const dfa = mctx->dfa;
2040 reg_errcode_t err;
2041 int node_idx, node;
2042 re_sift_context_t local_sctx;
2043 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2044
2045 if (first_idx == -1)
2046 return REG_NOERROR;
2047
2048 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2049
2050 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2051 {
2052 int enabled_idx;
2053 re_token_type_t type;
2054 struct re_backref_cache_entry *entry;
2055 node = candidates->elems[node_idx];
2056 type = dfa->nodes[node].type;
2057 /* Avoid infinite loop for the REs like "()\1+". */
2058 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2059 continue;
2060 if (type != OP_BACK_REF)
2061 continue;
2062
2063 entry = mctx->bkref_ents + first_idx;
2064 enabled_idx = first_idx;
2065 do
2066 {
2067 int subexp_len;
2068 int to_idx;
2069 int dst_node;
2070 int ret;
2071 re_dfastate_t *cur_state;
2072
2073 if (entry->node != node)
2074 continue;
2075 subexp_len = entry->subexp_to - entry->subexp_from;
2076 to_idx = str_idx + subexp_len;
2077 dst_node = (subexp_len ? dfa->nexts[node]
2078 : dfa->edests[node].elems[0]);
2079
2080 if (to_idx > sctx->last_str_idx
2081 || sctx->sifted_states[to_idx] == NULL
2082 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2083 || check_dst_limits (mctx, &sctx->limits, node,
2084 str_idx, dst_node, to_idx))
2085 continue;
2086
2087 if (local_sctx.sifted_states == NULL)
2088 {
2089 local_sctx = *sctx;
2090 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2091 if (BE (err != REG_NOERROR, 0))
2092 goto free_return;
2093 }
2094 local_sctx.last_node = node;
2095 local_sctx.last_str_idx = str_idx;
2096 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2097 if (BE (ret < 0, 0))
2098 {
2099 err = REG_ESPACE;
2100 goto free_return;
2101 }
2102 cur_state = local_sctx.sifted_states[str_idx];
2103 err = sift_states_backward (mctx, &local_sctx);
2104 if (BE (err != REG_NOERROR, 0))
2105 goto free_return;
2106 if (sctx->limited_states != NULL)
2107 {
2108 err = merge_state_array (dfa, sctx->limited_states,
2109 local_sctx.sifted_states,
2110 str_idx + 1);
2111 if (BE (err != REG_NOERROR, 0))
2112 goto free_return;
2113 }
2114 local_sctx.sifted_states[str_idx] = cur_state;
2115 re_node_set_remove (&local_sctx.limits, enabled_idx);
2116
2117 /* mctx->bkref_ents may have changed, reload the pointer. */
2118 entry = mctx->bkref_ents + enabled_idx;
2119 }
2120 while (enabled_idx++, entry++->more);
2121 }
2122 err = REG_NOERROR;
2123 free_return:
2124 if (local_sctx.sifted_states != NULL)
2125 {
2126 re_node_set_free (&local_sctx.limits);
2127 }
2128
2129 return err;
2130}
2131
2132
2133#ifdef RE_ENABLE_I18N
2134static int
2135internal_function
2136sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2137 int node_idx, int str_idx, int max_str_idx)
2138{
2139 const re_dfa_t *const dfa = mctx->dfa;
2140 int naccepted;
2141 /* Check the node can accept `multi byte'. */
2142 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2143 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2144 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2145 dfa->nexts[node_idx]))
2146 /* The node can't accept the `multi byte', or the
2147 destination was already thrown away, then the node
2148 could't accept the current input `multi byte'. */
2149 naccepted = 0;
2150 /* Otherwise, it is sure that the node could accept
2151 `naccepted' bytes input. */
2152 return naccepted;
2153}
2154#endif /* RE_ENABLE_I18N */
2155
2156
2157/* Functions for state transition. */
2158
2159/* Return the next state to which the current state STATE will transit by
2160 accepting the current input byte, and update STATE_LOG if necessary.
2161 If STATE can accept a multibyte char/collating element/back reference
2162 update the destination of STATE_LOG. */
2163
2164static re_dfastate_t *
2165internal_function
2166transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2167 re_dfastate_t *state)
2168{
2169 re_dfastate_t **trtable;
2170 unsigned char ch;
2171
2172#ifdef RE_ENABLE_I18N
2173 /* If the current state can accept multibyte. */
2174 if (BE (state->accept_mb, 0))
2175 {
2176 *err = transit_state_mb (mctx, state);
2177 if (BE (*err != REG_NOERROR, 0))
2178 return NULL;
2179 }
2180#endif /* RE_ENABLE_I18N */
2181
2182 /* Then decide the next state with the single byte. */
2183#if 0
2184 if (0)
2185 /* don't use transition table */
2186 return transit_state_sb (err, mctx, state);
2187#endif
2188
2189 /* Use transition table */
2190 ch = re_string_fetch_byte (&mctx->input);
2191 for (;;)
2192 {
2193 trtable = state->trtable;
2194 if (BE (trtable != NULL, 1))
2195 return trtable[ch];
2196
2197 trtable = state->word_trtable;
2198 if (BE (trtable != NULL, 1))
2199 {
2200 unsigned int context;
2201 context
2202 = re_string_context_at (&mctx->input,
2203 re_string_cur_idx (&mctx->input) - 1,
2204 mctx->eflags);
2205 if (IS_WORD_CONTEXT (context))
2206 return trtable[ch + SBC_MAX];
2207 else
2208 return trtable[ch];
2209 }
2210
2211 if (!build_trtable (mctx->dfa, state))
2212 {
2213 *err = REG_ESPACE;
2214 return NULL;
2215 }
2216
2217 /* Retry, we now have a transition table. */
2218 }
2219}
2220
2221/* Update the state_log if we need */
2222re_dfastate_t *
2223internal_function
2224merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2225 re_dfastate_t *next_state)
2226{
2227 const re_dfa_t *const dfa = mctx->dfa;
2228 int cur_idx = re_string_cur_idx (&mctx->input);
2229
2230 if (cur_idx > mctx->state_log_top)
2231 {
2232 mctx->state_log[cur_idx] = next_state;
2233 mctx->state_log_top = cur_idx;
2234 }
2235 else if (mctx->state_log[cur_idx] == 0)
2236 {
2237 mctx->state_log[cur_idx] = next_state;
2238 }
2239 else
2240 {
2241 re_dfastate_t *pstate;
2242 unsigned int context;
2243 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2244 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2245 the destination of a multibyte char/collating element/
2246 back reference. Then the next state is the union set of
2247 these destinations and the results of the transition table. */
2248 pstate = mctx->state_log[cur_idx];
2249 log_nodes = pstate->entrance_nodes;
2250 if (next_state != NULL)
2251 {
2252 table_nodes = next_state->entrance_nodes;
2253 *err = re_node_set_init_union (&next_nodes, table_nodes,
2254 log_nodes);
2255 if (BE (*err != REG_NOERROR, 0))
2256 return NULL;
2257 }
2258 else
2259 next_nodes = *log_nodes;
2260 /* Note: We already add the nodes of the initial state,
2261 then we don't need to add them here. */
2262
2263 context = re_string_context_at (&mctx->input,
2264 re_string_cur_idx (&mctx->input) - 1,
2265 mctx->eflags);
2266 next_state = mctx->state_log[cur_idx]
2267 = re_acquire_state_context (err, dfa, &next_nodes, context);
2268 /* We don't need to check errors here, since the return value of
2269 this function is next_state and ERR is already set. */
2270
2271 if (table_nodes != NULL)
2272 re_node_set_free (&next_nodes);
2273 }
2274
2275 if (BE (dfa->nbackref, 0) && next_state != NULL)
2276 {
2277 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2278 later. We must check them here, since the back references in the
2279 next state might use them. */
2280 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2281 cur_idx);
2282 if (BE (*err != REG_NOERROR, 0))
2283 return NULL;
2284
2285 /* If the next state has back references. */
2286 if (next_state->has_backref)
2287 {
2288 *err = transit_state_bkref (mctx, &next_state->nodes);
2289 if (BE (*err != REG_NOERROR, 0))
2290 return NULL;
2291 next_state = mctx->state_log[cur_idx];
2292 }
2293 }
2294
2295 return next_state;
2296}
2297
2298/* Skip bytes in the input that correspond to part of a
2299 multi-byte match, then look in the log for a state
2300 from which to restart matching. */
2301re_dfastate_t *
2302internal_function
2303find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2304{
2305 re_dfastate_t *cur_state;
2306 do
2307 {
2308 int max = mctx->state_log_top;
2309 int cur_str_idx = re_string_cur_idx (&mctx->input);
2310
2311 do
2312 {
2313 if (++cur_str_idx > max)
2314 return NULL;
2315 re_string_skip_bytes (&mctx->input, 1);
2316 }
2317 while (mctx->state_log[cur_str_idx] == NULL);
2318
2319 cur_state = merge_state_with_log (err, mctx, NULL);
2320 }
2321 while (*err == REG_NOERROR && cur_state == NULL);
2322 return cur_state;
2323}
2324
2325/* Helper functions for transit_state. */
2326
2327/* From the node set CUR_NODES, pick up the nodes whose types are
2328 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2329 expression. And register them to use them later for evaluating the
2330 correspoding back references. */
2331
2332static reg_errcode_t
2333internal_function
2334check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2335 int str_idx)
2336{
2337 const re_dfa_t *const dfa = mctx->dfa;
2338 int node_idx;
2339 reg_errcode_t err;
2340
2341 /* TODO: This isn't efficient.
2342 Because there might be more than one nodes whose types are
2343 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2344 nodes.
2345 E.g. RE: (a){2} */
2346 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2347 {
2348 int node = cur_nodes->elems[node_idx];
2349 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2350 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2351 && (dfa->used_bkref_map
2352 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2353 {
2354 err = match_ctx_add_subtop (mctx, node, str_idx);
2355 if (BE (err != REG_NOERROR, 0))
2356 return err;
2357 }
2358 }
2359 return REG_NOERROR;
2360}
2361
2362#if 0
2363/* Return the next state to which the current state STATE will transit by
2364 accepting the current input byte. */
2365
2366static re_dfastate_t *
2367transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2368 re_dfastate_t *state)
2369{
2370 const re_dfa_t *const dfa = mctx->dfa;
2371 re_node_set next_nodes;
2372 re_dfastate_t *next_state;
2373 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2374 unsigned int context;
2375
2376 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2377 if (BE (*err != REG_NOERROR, 0))
2378 return NULL;
2379 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2380 {
2381 int cur_node = state->nodes.elems[node_cnt];
2382 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2383 {
2384 *err = re_node_set_merge (&next_nodes,
2385 dfa->eclosures + dfa->nexts[cur_node]);
2386 if (BE (*err != REG_NOERROR, 0))
2387 {
2388 re_node_set_free (&next_nodes);
2389 return NULL;
2390 }
2391 }
2392 }
2393 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2394 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2395 /* We don't need to check errors here, since the return value of
2396 this function is next_state and ERR is already set. */
2397
2398 re_node_set_free (&next_nodes);
2399 re_string_skip_bytes (&mctx->input, 1);
2400 return next_state;
2401}
2402#endif
2403
2404#ifdef RE_ENABLE_I18N
2405static reg_errcode_t
2406internal_function
2407transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2408{
2409 const re_dfa_t *const dfa = mctx->dfa;
2410 reg_errcode_t err;
2411 int i;
2412
2413 for (i = 0; i < pstate->nodes.nelem; ++i)
2414 {
2415 re_node_set dest_nodes, *new_nodes;
2416 int cur_node_idx = pstate->nodes.elems[i];
2417 int naccepted, dest_idx;
2418 unsigned int context;
2419 re_dfastate_t *dest_state;
2420
2421 if (!dfa->nodes[cur_node_idx].accept_mb)
2422 continue;
2423
2424 if (dfa->nodes[cur_node_idx].constraint)
2425 {
2426 context = re_string_context_at (&mctx->input,
2427 re_string_cur_idx (&mctx->input),
2428 mctx->eflags);
2429 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2430 context))
2431 continue;
2432 }
2433
2434 /* How many bytes the node can accept? */
2435 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2436 re_string_cur_idx (&mctx->input));
2437 if (naccepted == 0)
2438 continue;
2439
2440 /* The node can accepts `naccepted' bytes. */
2441 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2442 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2443 : mctx->max_mb_elem_len);
2444 err = clean_state_log_if_needed (mctx, dest_idx);
2445 if (BE (err != REG_NOERROR, 0))
2446 return err;
2447#ifdef DEBUG
2448 assert (dfa->nexts[cur_node_idx] != -1);
2449#endif
2450 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2451
2452 dest_state = mctx->state_log[dest_idx];
2453 if (dest_state == NULL)
2454 dest_nodes = *new_nodes;
2455 else
2456 {
2457 err = re_node_set_init_union (&dest_nodes,
2458 dest_state->entrance_nodes, new_nodes);
2459 if (BE (err != REG_NOERROR, 0))
2460 return err;
2461 }
2462 context = re_string_context_at (&mctx->input, dest_idx - 1,
2463 mctx->eflags);
2464 mctx->state_log[dest_idx]
2465 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2466 if (dest_state != NULL)
2467 re_node_set_free (&dest_nodes);
2468 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2469 return err;
2470 }
2471 return REG_NOERROR;
2472}
2473#endif /* RE_ENABLE_I18N */
2474
2475static reg_errcode_t
2476internal_function
2477transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2478{
2479 const re_dfa_t *const dfa = mctx->dfa;
2480 reg_errcode_t err;
2481 int i;
2482 int cur_str_idx = re_string_cur_idx (&mctx->input);
2483
2484 for (i = 0; i < nodes->nelem; ++i)
2485 {
2486 int dest_str_idx, prev_nelem, bkc_idx;
2487 int node_idx = nodes->elems[i];
2488 unsigned int context;
2489 const re_token_t *node = dfa->nodes + node_idx;
2490 re_node_set *new_dest_nodes;
2491
2492 /* Check whether `node' is a backreference or not. */
2493 if (node->type != OP_BACK_REF)
2494 continue;
2495
2496 if (node->constraint)
2497 {
2498 context = re_string_context_at (&mctx->input, cur_str_idx,
2499 mctx->eflags);
2500 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2501 continue;
2502 }
2503
2504 /* `node' is a backreference.
2505 Check the substring which the substring matched. */
2506 bkc_idx = mctx->nbkref_ents;
2507 err = get_subexp (mctx, node_idx, cur_str_idx);
2508 if (BE (err != REG_NOERROR, 0))
2509 goto free_return;
2510
2511 /* And add the epsilon closures (which is `new_dest_nodes') of
2512 the backreference to appropriate state_log. */
2513#ifdef DEBUG
2514 assert (dfa->nexts[node_idx] != -1);
2515#endif
2516 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2517 {
2518 int subexp_len;
2519 re_dfastate_t *dest_state;
2520 struct re_backref_cache_entry *bkref_ent;
2521 bkref_ent = mctx->bkref_ents + bkc_idx;
2522 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2523 continue;
2524 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2525 new_dest_nodes = (subexp_len == 0
2526 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2527 : dfa->eclosures + dfa->nexts[node_idx]);
2528 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2529 - bkref_ent->subexp_from);
2530 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2531 mctx->eflags);
2532 dest_state = mctx->state_log[dest_str_idx];
2533 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2534 : mctx->state_log[cur_str_idx]->nodes.nelem);
2535 /* Add `new_dest_node' to state_log. */
2536 if (dest_state == NULL)
2537 {
2538 mctx->state_log[dest_str_idx]
2539 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2540 context);
2541 if (BE (mctx->state_log[dest_str_idx] == NULL
2542 && err != REG_NOERROR, 0))
2543 goto free_return;
2544 }
2545 else
2546 {
2547 re_node_set dest_nodes;
2548 err = re_node_set_init_union (&dest_nodes,
2549 dest_state->entrance_nodes,
2550 new_dest_nodes);
2551 if (BE (err != REG_NOERROR, 0))
2552 {
2553 re_node_set_free (&dest_nodes);
2554 goto free_return;
2555 }
2556 mctx->state_log[dest_str_idx]
2557 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2558 re_node_set_free (&dest_nodes);
2559 if (BE (mctx->state_log[dest_str_idx] == NULL
2560 && err != REG_NOERROR, 0))
2561 goto free_return;
2562 }
2563 /* We need to check recursively if the backreference can epsilon
2564 transit. */
2565 if (subexp_len == 0
2566 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2567 {
2568 err = check_subexp_matching_top (mctx, new_dest_nodes,
2569 cur_str_idx);
2570 if (BE (err != REG_NOERROR, 0))
2571 goto free_return;
2572 err = transit_state_bkref (mctx, new_dest_nodes);
2573 if (BE (err != REG_NOERROR, 0))
2574 goto free_return;
2575 }
2576 }
2577 }
2578 err = REG_NOERROR;
2579 free_return:
2580 return err;
2581}
2582
2583/* Enumerate all the candidates which the backreference BKREF_NODE can match
2584 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2585 Note that we might collect inappropriate candidates here.
2586 However, the cost of checking them strictly here is too high, then we
2587 delay these checking for prune_impossible_nodes(). */
2588
2589static reg_errcode_t
2590internal_function
2591get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2592{
2593 const re_dfa_t *const dfa = mctx->dfa;
2594 int subexp_num, sub_top_idx;
2595 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2596 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2597 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2598 if (cache_idx != -1)
2599 {
2600 const struct re_backref_cache_entry *entry
2601 = mctx->bkref_ents + cache_idx;
2602 do
2603 if (entry->node == bkref_node)
2604 return REG_NOERROR; /* We already checked it. */
2605 while (entry++->more);
2606 }
2607
2608 subexp_num = dfa->nodes[bkref_node].opr.idx;
2609
2610 /* For each sub expression */
2611 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2612 {
2613 reg_errcode_t err;
2614 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2615 re_sub_match_last_t *sub_last;
2616 int sub_last_idx, sl_str, bkref_str_off;
2617
2618 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2619 continue; /* It isn't related. */
2620
2621 sl_str = sub_top->str_idx;
2622 bkref_str_off = bkref_str_idx;
2623 /* At first, check the last node of sub expressions we already
2624 evaluated. */
2625 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2626 {
2627 int sl_str_diff;
2628 sub_last = sub_top->lasts[sub_last_idx];
2629 sl_str_diff = sub_last->str_idx - sl_str;
2630 /* The matched string by the sub expression match with the substring
2631 at the back reference? */
2632 if (sl_str_diff > 0)
2633 {
2634 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2635 {
2636 /* Not enough chars for a successful match. */
2637 if (bkref_str_off + sl_str_diff > mctx->input.len)
2638 break;
2639
2640 err = clean_state_log_if_needed (mctx,
2641 bkref_str_off
2642 + sl_str_diff);
2643 if (BE (err != REG_NOERROR, 0))
2644 return err;
2645 buf = (const char *) re_string_get_buffer (&mctx->input);
2646 }
2647 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2648 /* We don't need to search this sub expression any more. */
2649 break;
2650 }
2651 bkref_str_off += sl_str_diff;
2652 sl_str += sl_str_diff;
2653 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2654 bkref_str_idx);
2655
2656 /* Reload buf, since the preceding call might have reallocated
2657 the buffer. */
2658 buf = (const char *) re_string_get_buffer (&mctx->input);
2659
2660 if (err == REG_NOMATCH)
2661 continue;
2662 if (BE (err != REG_NOERROR, 0))
2663 return err;
2664 }
2665
2666 if (sub_last_idx < sub_top->nlasts)
2667 continue;
2668 if (sub_last_idx > 0)
2669 ++sl_str;
2670 /* Then, search for the other last nodes of the sub expression. */
2671 for (; sl_str <= bkref_str_idx; ++sl_str)
2672 {
2673 int cls_node, sl_str_off;
2674 const re_node_set *nodes;
2675 sl_str_off = sl_str - sub_top->str_idx;
2676 /* The matched string by the sub expression match with the substring
2677 at the back reference? */
2678 if (sl_str_off > 0)
2679 {
2680 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2681 {
2682 /* If we are at the end of the input, we cannot match. */
2683 if (bkref_str_off >= mctx->input.len)
2684 break;
2685
2686 err = extend_buffers (mctx);
2687 if (BE (err != REG_NOERROR, 0))
2688 return err;
2689
2690 buf = (const char *) re_string_get_buffer (&mctx->input);
2691 }
2692 if (buf [bkref_str_off++] != buf[sl_str - 1])
2693 break; /* We don't need to search this sub expression
2694 any more. */
2695 }
2696 if (mctx->state_log[sl_str] == NULL)
2697 continue;
2698 /* Does this state have a ')' of the sub expression? */
2699 nodes = &mctx->state_log[sl_str]->nodes;
2700 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2701 OP_CLOSE_SUBEXP);
2702 if (cls_node == -1)
2703 continue; /* No. */
2704 if (sub_top->path == NULL)
2705 {
2706 sub_top->path = calloc (sizeof (state_array_t),
2707 sl_str - sub_top->str_idx + 1);
2708 if (sub_top->path == NULL)
2709 return REG_ESPACE;
2710 }
2711 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2712 in the current context? */
2713 err = check_arrival (mctx, sub_top->path, sub_top->node,
2714 sub_top->str_idx, cls_node, sl_str,
2715 OP_CLOSE_SUBEXP);
2716 if (err == REG_NOMATCH)
2717 continue;
2718 if (BE (err != REG_NOERROR, 0))
2719 return err;
2720 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2721 if (BE (sub_last == NULL, 0))
2722 return REG_ESPACE;
2723 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2724 bkref_str_idx);
2725 if (err == REG_NOMATCH)
2726 continue;
2727 }
2728 }
2729 return REG_NOERROR;
2730}
2731
2732/* Helper functions for get_subexp(). */
2733
2734/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2735 If it can arrive, register the sub expression expressed with SUB_TOP
2736 and SUB_LAST. */
2737
2738static reg_errcode_t
2739internal_function
2740get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2741 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2742{
2743 reg_errcode_t err;
2744 int to_idx;
2745 /* Can the subexpression arrive the back reference? */
2746 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2747 sub_last->str_idx, bkref_node, bkref_str,
2748 OP_OPEN_SUBEXP);
2749 if (err != REG_NOERROR)
2750 return err;
2751 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2752 sub_last->str_idx);
2753 if (BE (err != REG_NOERROR, 0))
2754 return err;
2755 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2756 return clean_state_log_if_needed (mctx, to_idx);
2757}
2758
2759/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2760 Search '(' if FL_OPEN, or search ')' otherwise.
2761 TODO: This function isn't efficient...
2762 Because there might be more than one nodes whose types are
2763 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2764 nodes.
2765 E.g. RE: (a){2} */
2766
2767static int
2768internal_function
2769find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2770 int subexp_idx, int type)
2771{
2772 int cls_idx;
2773 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2774 {
2775 int cls_node = nodes->elems[cls_idx];
2776 const re_token_t *node = dfa->nodes + cls_node;
2777 if (node->type == type
2778 && node->opr.idx == subexp_idx)
2779 return cls_node;
2780 }
2781 return -1;
2782}
2783
2784/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2785 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2786 heavily reused.
2787 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2788
2789static reg_errcode_t
2790internal_function
2791check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2792 int top_str, int last_node, int last_str, int type)
2793{
2794 const re_dfa_t *const dfa = mctx->dfa;
2795 reg_errcode_t err = REG_NOERROR;
2796 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2797 re_dfastate_t *cur_state = NULL;
2798 re_node_set *cur_nodes, next_nodes;
2799 re_dfastate_t **backup_state_log;
2800 unsigned int context;
2801
2802 subexp_num = dfa->nodes[top_node].opr.idx;
2803 /* Extend the buffer if we need. */
2804 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2805 {
2806 re_dfastate_t **new_array;
2807 int old_alloc = path->alloc;
2808 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2809 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2810 if (BE (new_array == NULL, 0))
2811 {
2812 path->alloc = old_alloc;
2813 return REG_ESPACE;
2814 }
2815 path->array = new_array;
2816 memset (new_array + old_alloc, '\0',
2817 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2818 }
2819
2820 str_idx = path->next_idx ?: top_str;
2821
2822 /* Temporary modify MCTX. */
2823 backup_state_log = mctx->state_log;
2824 backup_cur_idx = mctx->input.cur_idx;
2825 mctx->state_log = path->array;
2826 mctx->input.cur_idx = str_idx;
2827
2828 /* Setup initial node set. */
2829 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2830 if (str_idx == top_str)
2831 {
2832 err = re_node_set_init_1 (&next_nodes, top_node);
2833 if (BE (err != REG_NOERROR, 0))
2834 return err;
2835 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2836 if (BE (err != REG_NOERROR, 0))
2837 {
2838 re_node_set_free (&next_nodes);
2839 return err;
2840 }
2841 }
2842 else
2843 {
2844 cur_state = mctx->state_log[str_idx];
2845 if (cur_state && cur_state->has_backref)
2846 {
2847 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2848 if (BE (err != REG_NOERROR, 0))
2849 return err;
2850 }
2851 else
2852 re_node_set_init_empty (&next_nodes);
2853 }
2854 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2855 {
2856 if (next_nodes.nelem)
2857 {
2858 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2859 subexp_num, type);
2860 if (BE (err != REG_NOERROR, 0))
2861 {
2862 re_node_set_free (&next_nodes);
2863 return err;
2864 }
2865 }
2866 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2867 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2868 {
2869 re_node_set_free (&next_nodes);
2870 return err;
2871 }
2872 mctx->state_log[str_idx] = cur_state;
2873 }
2874
2875 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2876 {
2877 re_node_set_empty (&next_nodes);
2878 if (mctx->state_log[str_idx + 1])
2879 {
2880 err = re_node_set_merge (&next_nodes,
2881 &mctx->state_log[str_idx + 1]->nodes);
2882 if (BE (err != REG_NOERROR, 0))
2883 {
2884 re_node_set_free (&next_nodes);
2885 return err;
2886 }
2887 }
2888 if (cur_state)
2889 {
2890 err = check_arrival_add_next_nodes (mctx, str_idx,
2891 &cur_state->non_eps_nodes,
2892 &next_nodes);
2893 if (BE (err != REG_NOERROR, 0))
2894 {
2895 re_node_set_free (&next_nodes);
2896 return err;
2897 }
2898 }
2899 ++str_idx;
2900 if (next_nodes.nelem)
2901 {
2902 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2903 if (BE (err != REG_NOERROR, 0))
2904 {
2905 re_node_set_free (&next_nodes);
2906 return err;
2907 }
2908 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2909 subexp_num, type);
2910 if (BE (err != REG_NOERROR, 0))
2911 {
2912 re_node_set_free (&next_nodes);
2913 return err;
2914 }
2915 }
2916 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2917 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2918 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2919 {
2920 re_node_set_free (&next_nodes);
2921 return err;
2922 }
2923 mctx->state_log[str_idx] = cur_state;
2924 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2925 }
2926 re_node_set_free (&next_nodes);
2927 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2928 : &mctx->state_log[last_str]->nodes);
2929 path->next_idx = str_idx;
2930
2931 /* Fix MCTX. */
2932 mctx->state_log = backup_state_log;
2933 mctx->input.cur_idx = backup_cur_idx;
2934
2935 /* Then check the current node set has the node LAST_NODE. */
2936 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2937 return REG_NOERROR;
2938
2939 return REG_NOMATCH;
2940}
2941
2942/* Helper functions for check_arrival. */
2943
2944/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2945 to NEXT_NODES.
2946 TODO: This function is similar to the functions transit_state*(),
2947 however this function has many additional works.
2948 Can't we unify them? */
2949
2950static reg_errcode_t
2951internal_function
2952check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
2953 re_node_set *cur_nodes, re_node_set *next_nodes)
2954{
2955 const re_dfa_t *const dfa = mctx->dfa;
2956 int result;
2957 int cur_idx;
2958#ifdef RE_ENABLE_I18N
2959 reg_errcode_t err = REG_NOERROR;
2960#endif
2961 re_node_set union_set;
2962 re_node_set_init_empty (&union_set);
2963 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2964 {
2965 int naccepted = 0;
2966 int cur_node = cur_nodes->elems[cur_idx];
2967#ifdef DEBUG
2968 re_token_type_t type = dfa->nodes[cur_node].type;
2969 assert (!IS_EPSILON_NODE (type));
2970#endif
2971#ifdef RE_ENABLE_I18N
2972 /* If the node may accept `multi byte'. */
2973 if (dfa->nodes[cur_node].accept_mb)
2974 {
2975 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2976 str_idx);
2977 if (naccepted > 1)
2978 {
2979 re_dfastate_t *dest_state;
2980 int next_node = dfa->nexts[cur_node];
2981 int next_idx = str_idx + naccepted;
2982 dest_state = mctx->state_log[next_idx];
2983 re_node_set_empty (&union_set);
2984 if (dest_state)
2985 {
2986 err = re_node_set_merge (&union_set, &dest_state->nodes);
2987 if (BE (err != REG_NOERROR, 0))
2988 {
2989 re_node_set_free (&union_set);
2990 return err;
2991 }
2992 }
2993 result = re_node_set_insert (&union_set, next_node);
2994 if (BE (result < 0, 0))
2995 {
2996 re_node_set_free (&union_set);
2997 return REG_ESPACE;
2998 }
2999 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3000 &union_set);
3001 if (BE (mctx->state_log[next_idx] == NULL
3002 && err != REG_NOERROR, 0))
3003 {
3004 re_node_set_free (&union_set);
3005 return err;
3006 }
3007 }
3008 }
3009#endif /* RE_ENABLE_I18N */
3010 if (naccepted
3011 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3012 {
3013 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3014 if (BE (result < 0, 0))
3015 {
3016 re_node_set_free (&union_set);
3017 return REG_ESPACE;
3018 }
3019 }
3020 }
3021 re_node_set_free (&union_set);
3022 return REG_NOERROR;
3023}
3024
3025/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3026 CUR_NODES, however exclude the nodes which are:
3027 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3028 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3029*/
3030
3031static reg_errcode_t
3032internal_function
3033check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3034 int ex_subexp, int type)
3035{
3036 reg_errcode_t err;
3037 int idx, outside_node;
3038 re_node_set new_nodes;
3039#ifdef DEBUG
3040 assert (cur_nodes->nelem);
3041#endif
3042 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3043 if (BE (err != REG_NOERROR, 0))
3044 return err;
3045 /* Create a new node set NEW_NODES with the nodes which are epsilon
3046 closures of the node in CUR_NODES. */
3047
3048 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3049 {
3050 int cur_node = cur_nodes->elems[idx];
3051 const re_node_set *eclosure = dfa->eclosures + cur_node;
3052 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3053 if (outside_node == -1)
3054 {
3055 /* There are no problematic nodes, just merge them. */
3056 err = re_node_set_merge (&new_nodes, eclosure);
3057 if (BE (err != REG_NOERROR, 0))
3058 {
3059 re_node_set_free (&new_nodes);
3060 return err;
3061 }
3062 }
3063 else
3064 {
3065 /* There are problematic nodes, re-calculate incrementally. */
3066 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3067 ex_subexp, type);
3068 if (BE (err != REG_NOERROR, 0))
3069 {
3070 re_node_set_free (&new_nodes);
3071 return err;
3072 }
3073 }
3074 }
3075 re_node_set_free (cur_nodes);
3076 *cur_nodes = new_nodes;
3077 return REG_NOERROR;
3078}
3079
3080/* Helper function for check_arrival_expand_ecl.
3081 Check incrementally the epsilon closure of TARGET, and if it isn't
3082 problematic append it to DST_NODES. */
3083
3084static reg_errcode_t
3085internal_function
3086check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3087 int target, int ex_subexp, int type)
3088{
3089 int cur_node;
3090 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3091 {
3092 int err;
3093
3094 if (dfa->nodes[cur_node].type == type
3095 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3096 {
3097 if (type == OP_CLOSE_SUBEXP)
3098 {
3099 err = re_node_set_insert (dst_nodes, cur_node);
3100 if (BE (err == -1, 0))
3101 return REG_ESPACE;
3102 }
3103 break;
3104 }
3105 err = re_node_set_insert (dst_nodes, cur_node);
3106 if (BE (err == -1, 0))
3107 return REG_ESPACE;
3108 if (dfa->edests[cur_node].nelem == 0)
3109 break;
3110 if (dfa->edests[cur_node].nelem == 2)
3111 {
3112 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3113 dfa->edests[cur_node].elems[1],
3114 ex_subexp, type);
3115 if (BE (err != REG_NOERROR, 0))
3116 return err;
3117 }
3118 cur_node = dfa->edests[cur_node].elems[0];
3119 }
3120 return REG_NOERROR;
3121}
3122
3123
3124/* For all the back references in the current state, calculate the
3125 destination of the back references by the appropriate entry
3126 in MCTX->BKREF_ENTS. */
3127
3128static reg_errcode_t
3129internal_function
3130expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3131 int cur_str, int subexp_num, int type)
3132{
3133 const re_dfa_t *const dfa = mctx->dfa;
3134 reg_errcode_t err;
3135 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3136 struct re_backref_cache_entry *ent;
3137
3138 if (cache_idx_start == -1)
3139 return REG_NOERROR;
3140
3141 restart:
3142 ent = mctx->bkref_ents + cache_idx_start;
3143 do
3144 {
3145 int to_idx, next_node;
3146
3147 /* Is this entry ENT is appropriate? */
3148 if (!re_node_set_contains (cur_nodes, ent->node))
3149 continue; /* No. */
3150
3151 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3152 /* Calculate the destination of the back reference, and append it
3153 to MCTX->STATE_LOG. */
3154 if (to_idx == cur_str)
3155 {
3156 /* The backreference did epsilon transit, we must re-check all the
3157 node in the current state. */
3158 re_node_set new_dests;
3159 reg_errcode_t err2, err3;
3160 next_node = dfa->edests[ent->node].elems[0];
3161 if (re_node_set_contains (cur_nodes, next_node))
3162 continue;
3163 err = re_node_set_init_1 (&new_dests, next_node);
3164 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3165 err3 = re_node_set_merge (cur_nodes, &new_dests);
3166 re_node_set_free (&new_dests);
3167 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3168 || err3 != REG_NOERROR, 0))
3169 {
3170 err = (err != REG_NOERROR ? err
3171 : (err2 != REG_NOERROR ? err2 : err3));
3172 return err;
3173 }
3174 /* TODO: It is still inefficient... */
3175 goto restart;
3176 }
3177 else
3178 {
3179 re_node_set union_set;
3180 next_node = dfa->nexts[ent->node];
3181 if (mctx->state_log[to_idx])
3182 {
3183 int ret;
3184 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3185 next_node))
3186 continue;
3187 err = re_node_set_init_copy (&union_set,
3188 &mctx->state_log[to_idx]->nodes);
3189 ret = re_node_set_insert (&union_set, next_node);
3190 if (BE (err != REG_NOERROR || ret < 0, 0))
3191 {
3192 re_node_set_free (&union_set);
3193 err = err != REG_NOERROR ? err : REG_ESPACE;
3194 return err;
3195 }
3196 }
3197 else
3198 {
3199 err = re_node_set_init_1 (&union_set, next_node);
3200 if (BE (err != REG_NOERROR, 0))
3201 return err;
3202 }
3203 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3204 re_node_set_free (&union_set);
3205 if (BE (mctx->state_log[to_idx] == NULL
3206 && err != REG_NOERROR, 0))
3207 return err;
3208 }
3209 }
3210 while (ent++->more);
3211 return REG_NOERROR;
3212}
3213
3214/* Build transition table for the state.
3215 Return 1 if succeeded, otherwise return NULL. */
3216
3217static int
3218internal_function
3219build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3220{
3221 reg_errcode_t err;
3222 int i, j, ch, need_word_trtable = 0;
3223 bitset_word_t elem, mask;
3224 bool dests_node_malloced = false;
3225 bool dest_states_malloced = false;
3226 int ndests; /* Number of the destination states from `state'. */
3227 re_dfastate_t **trtable;
3228 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3229 re_node_set follows, *dests_node;
3230 bitset_t *dests_ch;
3231 bitset_t acceptable;
3232
3233 struct dests_alloc
3234 {
3235 re_node_set dests_node[SBC_MAX];
3236 bitset_t dests_ch[SBC_MAX];
3237 } *dests_alloc;
3238
3239 /* We build DFA states which corresponds to the destination nodes
3240 from `state'. `dests_node[i]' represents the nodes which i-th
3241 destination state contains, and `dests_ch[i]' represents the
3242 characters which i-th destination state accepts. */
3243 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3244 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3245 else
3246 {
3247 dests_alloc = re_malloc (struct dests_alloc, 1);
3248 if (BE (dests_alloc == NULL, 0))
3249 return 0;
3250 dests_node_malloced = true;
3251 }
3252 dests_node = dests_alloc->dests_node;
3253 dests_ch = dests_alloc->dests_ch;
3254
3255 /* Initialize transiton table. */
3256 state->word_trtable = state->trtable = NULL;
3257
3258 /* At first, group all nodes belonging to `state' into several
3259 destinations. */
3260 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3261 if (BE (ndests <= 0, 0))
3262 {
3263 if (dests_node_malloced)
3264 free (dests_alloc);
3265 /* Return 0 in case of an error, 1 otherwise. */
3266 if (ndests == 0)
3267 {
3268 state->trtable = (re_dfastate_t **)
3269 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3270 return 1;
3271 }
3272 return 0;
3273 }
3274
3275 err = re_node_set_alloc (&follows, ndests + 1);
3276 if (BE (err != REG_NOERROR, 0))
3277 goto out_free;
3278
3279 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3280 + ndests * 3 * sizeof (re_dfastate_t *)))
3281 dest_states = (re_dfastate_t **)
3282 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3283 else
3284 {
3285 dest_states = (re_dfastate_t **)
3286 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3287 if (BE (dest_states == NULL, 0))
3288 {
3289out_free:
3290 if (dest_states_malloced)
3291 free (dest_states);
3292 re_node_set_free (&follows);
3293 for (i = 0; i < ndests; ++i)
3294 re_node_set_free (dests_node + i);
3295 if (dests_node_malloced)
3296 free (dests_alloc);
3297 return 0;
3298 }
3299 dest_states_malloced = true;
3300 }
3301 dest_states_word = dest_states + ndests;
3302 dest_states_nl = dest_states_word + ndests;
3303 bitset_empty (acceptable);
3304
3305 /* Then build the states for all destinations. */
3306 for (i = 0; i < ndests; ++i)
3307 {
3308 int next_node;
3309 re_node_set_empty (&follows);
3310 /* Merge the follows of this destination states. */
3311 for (j = 0; j < dests_node[i].nelem; ++j)
3312 {
3313 next_node = dfa->nexts[dests_node[i].elems[j]];
3314 if (next_node != -1)
3315 {
3316 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3317 if (BE (err != REG_NOERROR, 0))
3318 goto out_free;
3319 }
3320 }
3321 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3322 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3323 goto out_free;
3324 /* If the new state has context constraint,
3325 build appropriate states for these contexts. */
3326 if (dest_states[i]->has_constraint)
3327 {
3328 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3329 CONTEXT_WORD);
3330 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3331 goto out_free;
3332
3333 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3334 need_word_trtable = 1;
3335
3336 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3337 CONTEXT_NEWLINE);
3338 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3339 goto out_free;
3340 }
3341 else
3342 {
3343 dest_states_word[i] = dest_states[i];
3344 dest_states_nl[i] = dest_states[i];
3345 }
3346 bitset_merge (acceptable, dests_ch[i]);
3347 }
3348
3349 if (!BE (need_word_trtable, 0))
3350 {
3351 /* We don't care about whether the following character is a word
3352 character, or we are in a single-byte character set so we can
3353 discern by looking at the character code: allocate a
3354 256-entry transition table. */
3355 trtable = state->trtable = calloc (sizeof (re_dfastate_t *), SBC_MAX);
3356 if (BE (trtable == NULL, 0))
3357 goto out_free;
3358
3359 /* For all characters ch...: */
3360 for (i = 0; i < BITSET_WORDS; ++i)
3361 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3362 elem;
3363 mask <<= 1, elem >>= 1, ++ch)
3364 if (BE (elem & 1, 0))
3365 {
3366 /* There must be exactly one destination which accepts
3367 character ch. See group_nodes_into_DFAstates. */
3368 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3369 ;
3370
3371 /* j-th destination accepts the word character ch. */
3372 if (dfa->word_char[i] & mask)
3373 trtable[ch] = dest_states_word[j];
3374 else
3375 trtable[ch] = dest_states[j];
3376 }
3377 }
3378 else
3379 {
3380 /* We care about whether the following character is a word
3381 character, and we are in a multi-byte character set: discern
3382 by looking at the character code: build two 256-entry
3383 transition tables, one starting at trtable[0] and one
3384 starting at trtable[SBC_MAX]. */
3385 trtable = state->word_trtable = calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3386 if (BE (trtable == NULL, 0))
3387 goto out_free;
3388
3389 /* For all characters ch...: */
3390 for (i = 0; i < BITSET_WORDS; ++i)
3391 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3392 elem;
3393 mask <<= 1, elem >>= 1, ++ch)
3394 if (BE (elem & 1, 0))
3395 {
3396 /* There must be exactly one destination which accepts
3397 character ch. See group_nodes_into_DFAstates. */
3398 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3399 ;
3400
3401 /* j-th destination accepts the word character ch. */
3402 trtable[ch] = dest_states[j];
3403 trtable[ch + SBC_MAX] = dest_states_word[j];
3404 }
3405 }
3406
3407 /* new line */
3408 if (bitset_contain (acceptable, NEWLINE_CHAR))
3409 {
3410 /* The current state accepts newline character. */
3411 for (j = 0; j < ndests; ++j)
3412 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3413 {
3414 /* k-th destination accepts newline character. */
3415 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3416 if (need_word_trtable)
3417 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3418 /* There must be only one destination which accepts
3419 newline. See group_nodes_into_DFAstates. */
3420 break;
3421 }
3422 }
3423
3424 if (dest_states_malloced)
3425 free (dest_states);
3426
3427 re_node_set_free (&follows);
3428 for (i = 0; i < ndests; ++i)
3429 re_node_set_free (dests_node + i);
3430
3431 if (dests_node_malloced)
3432 free (dests_alloc);
3433
3434 return 1;
3435}
3436
3437/* Group all nodes belonging to STATE into several destinations.
3438 Then for all destinations, set the nodes belonging to the destination
3439 to DESTS_NODE[i] and set the characters accepted by the destination
3440 to DEST_CH[i]. This function return the number of destinations. */
3441
3442static int
3443internal_function
3444group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3445 re_node_set *dests_node, bitset_t *dests_ch)
3446{
3447 reg_errcode_t err;
3448 int result;
3449 int i, j, k;
3450 int ndests; /* Number of the destinations from `state'. */
3451 bitset_t accepts; /* Characters a node can accept. */
3452 const re_node_set *cur_nodes = &state->nodes;
3453 bitset_empty (accepts);
3454 ndests = 0;
3455
3456 /* For all the nodes belonging to `state', */
3457 for (i = 0; i < cur_nodes->nelem; ++i)
3458 {
3459 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3460 re_token_type_t type = node->type;
3461 unsigned int constraint = node->constraint;
3462
3463 /* Enumerate all single byte character this node can accept. */
3464 if (type == CHARACTER)
3465 bitset_set (accepts, node->opr.c);
3466 else if (type == SIMPLE_BRACKET)
3467 {
3468 bitset_merge (accepts, node->opr.sbcset);
3469 }
3470 else if (type == OP_PERIOD)
3471 {
3472#ifdef RE_ENABLE_I18N
3473 if (dfa->mb_cur_max > 1)
3474 bitset_merge (accepts, dfa->sb_char);
3475 else
3476#endif
3477 bitset_set_all (accepts);
3478 if (!(dfa->syntax & RE_DOT_NEWLINE))
3479 bitset_clear (accepts, '\n');
3480 if (dfa->syntax & RE_DOT_NOT_NULL)
3481 bitset_clear (accepts, '\0');
3482 }
3483#ifdef RE_ENABLE_I18N
3484 else if (type == OP_UTF8_PERIOD)
3485 {
3486 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3487 if (!(dfa->syntax & RE_DOT_NEWLINE))
3488 bitset_clear (accepts, '\n');
3489 if (dfa->syntax & RE_DOT_NOT_NULL)
3490 bitset_clear (accepts, '\0');
3491 }
3492#endif
3493 else
3494 continue;
3495
3496 /* Check the `accepts' and sift the characters which are not
3497 match it the context. */
3498 if (constraint)
3499 {
3500 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3501 {
3502 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3503 bitset_empty (accepts);
3504 if (accepts_newline)
3505 bitset_set (accepts, NEWLINE_CHAR);
3506 else
3507 continue;
3508 }
3509 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3510 {
3511 bitset_empty (accepts);
3512 continue;
3513 }
3514
3515 if (constraint & NEXT_WORD_CONSTRAINT)
3516 {
3517 bitset_word_t any_set = 0;
3518 if (type == CHARACTER && !node->word_char)
3519 {
3520 bitset_empty (accepts);
3521 continue;
3522 }
3523#ifdef RE_ENABLE_I18N
3524 if (dfa->mb_cur_max > 1)
3525 for (j = 0; j < BITSET_WORDS; ++j)
3526 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3527 else
3528#endif
3529 for (j = 0; j < BITSET_WORDS; ++j)
3530 any_set |= (accepts[j] &= dfa->word_char[j]);
3531 if (!any_set)
3532 continue;
3533 }
3534 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3535 {
3536 bitset_word_t any_set = 0;
3537 if (type == CHARACTER && node->word_char)
3538 {
3539 bitset_empty (accepts);
3540 continue;
3541 }
3542#ifdef RE_ENABLE_I18N
3543 if (dfa->mb_cur_max > 1)
3544 for (j = 0; j < BITSET_WORDS; ++j)
3545 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3546 else
3547#endif
3548 for (j = 0; j < BITSET_WORDS; ++j)
3549 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3550 if (!any_set)
3551 continue;
3552 }
3553 }
3554
3555 /* Then divide `accepts' into DFA states, or create a new
3556 state. Above, we make sure that accepts is not empty. */
3557 for (j = 0; j < ndests; ++j)
3558 {
3559 bitset_t intersec; /* Intersection sets, see below. */
3560 bitset_t remains;
3561 /* Flags, see below. */
3562 bitset_word_t has_intersec, not_subset, not_consumed;
3563
3564 /* Optimization, skip if this state doesn't accept the character. */
3565 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3566 continue;
3567
3568 /* Enumerate the intersection set of this state and `accepts'. */
3569 has_intersec = 0;
3570 for (k = 0; k < BITSET_WORDS; ++k)
3571 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3572 /* And skip if the intersection set is empty. */
3573 if (!has_intersec)
3574 continue;
3575
3576 /* Then check if this state is a subset of `accepts'. */
3577 not_subset = not_consumed = 0;
3578 for (k = 0; k < BITSET_WORDS; ++k)
3579 {
3580 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3581 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3582 }
3583
3584 /* If this state isn't a subset of `accepts', create a
3585 new group state, which has the `remains'. */
3586 if (not_subset)
3587 {
3588 bitset_copy (dests_ch[ndests], remains);
3589 bitset_copy (dests_ch[j], intersec);
3590 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3591 if (BE (err != REG_NOERROR, 0))
3592 goto error_return;
3593 ++ndests;
3594 }
3595
3596 /* Put the position in the current group. */
3597 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3598 if (BE (result < 0, 0))
3599 goto error_return;
3600
3601 /* If all characters are consumed, go to next node. */
3602 if (!not_consumed)
3603 break;
3604 }
3605 /* Some characters remain, create a new group. */
3606 if (j == ndests)
3607 {
3608 bitset_copy (dests_ch[ndests], accepts);
3609 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3610 if (BE (err != REG_NOERROR, 0))
3611 goto error_return;
3612 ++ndests;
3613 bitset_empty (accepts);
3614 }
3615 }
3616 return ndests;
3617 error_return:
3618 for (j = 0; j < ndests; ++j)
3619 re_node_set_free (dests_node + j);
3620 return -1;
3621}
3622
3623#ifdef RE_ENABLE_I18N
3624/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3625 Return the number of the bytes the node accepts.
3626 STR_IDX is the current index of the input string.
3627
3628 This function handles the nodes which can accept one character, or
3629 one collating element like '.', '[a-z]', opposite to the other nodes
3630 can only accept one byte. */
3631
3632static int
3633internal_function
3634check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3635 const re_string_t *input, int str_idx)
3636{
3637 const re_token_t *node = dfa->nodes + node_idx;
3638 int char_len, elem_len;
3639 int i;
3640
3641 if (BE (node->type == OP_UTF8_PERIOD, 0))
3642 {
3643 unsigned char c = re_string_byte_at (input, str_idx), d;
3644 if (BE (c < 0xc2, 1))
3645 return 0;
3646
3647 if (str_idx + 2 > input->len)
3648 return 0;
3649
3650 d = re_string_byte_at (input, str_idx + 1);
3651 if (c < 0xe0)
3652 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3653 else if (c < 0xf0)
3654 {
3655 char_len = 3;
3656 if (c == 0xe0 && d < 0xa0)
3657 return 0;
3658 }
3659 else if (c < 0xf8)
3660 {
3661 char_len = 4;
3662 if (c == 0xf0 && d < 0x90)
3663 return 0;
3664 }
3665 else if (c < 0xfc)
3666 {
3667 char_len = 5;
3668 if (c == 0xf8 && d < 0x88)
3669 return 0;
3670 }
3671 else if (c < 0xfe)
3672 {
3673 char_len = 6;
3674 if (c == 0xfc && d < 0x84)
3675 return 0;
3676 }
3677 else
3678 return 0;
3679
3680 if (str_idx + char_len > input->len)
3681 return 0;
3682
3683 for (i = 1; i < char_len; ++i)
3684 {
3685 d = re_string_byte_at (input, str_idx + i);
3686 if (d < 0x80 || d > 0xbf)
3687 return 0;
3688 }
3689 return char_len;
3690 }
3691
3692 char_len = re_string_char_size_at (input, str_idx);
3693 if (node->type == OP_PERIOD)
3694 {
3695 if (char_len <= 1)
3696 return 0;
3697 /* FIXME: I don't think this if is needed, as both '\n'
3698 and '\0' are char_len == 1. */
3699 /* '.' accepts any one character except the following two cases. */
3700 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3701 re_string_byte_at (input, str_idx) == '\n') ||
3702 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3703 re_string_byte_at (input, str_idx) == '\0'))
3704 return 0;
3705 return char_len;
3706 }
3707
3708 elem_len = re_string_elem_size_at (input, str_idx);
3709 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3710 return 0;
3711
3712 if (node->type == COMPLEX_BRACKET)
3713 {
3714 const re_charset_t *cset = node->opr.mbcset;
3715# if 0
3716 const unsigned char *pin
3717 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3718 int j;
3719 uint32_t nrules;
3720# endif
3721 int match_len = 0;
3722 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3723 ? re_string_wchar_at (input, str_idx) : 0);
3724
3725 /* match with multibyte character? */
3726 for (i = 0; i < cset->nmbchars; ++i)
3727 if (wc == cset->mbchars[i])
3728 {
3729 match_len = char_len;
3730 goto check_node_accept_bytes_match;
3731 }
3732 /* match with character_class? */
3733 for (i = 0; i < cset->nchar_classes; ++i)
3734 {
3735 wctype_t wt = cset->char_classes[i];
3736 if (__iswctype (wc, wt))
3737 {
3738 match_len = char_len;
3739 goto check_node_accept_bytes_match;
3740 }
3741 }
3742
3743# if 0
3744 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3745 if (nrules != 0)
3746 {
3747 unsigned int in_collseq = 0;
3748 const int32_t *table, *indirect;
3749 const unsigned char *weights, *extra;
3750 const char *collseqwc;
3751 int32_t idx;
3752 /* This #include defines a local function! */
3753# include <locale/weight.h>
3754
3755 /* match with collating_symbol? */
3756 if (cset->ncoll_syms)
3757 extra = (const unsigned char *)
3758 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3759 for (i = 0; i < cset->ncoll_syms; ++i)
3760 {
3761 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3762 /* Compare the length of input collating element and
3763 the length of current collating element. */
3764 if (*coll_sym != elem_len)
3765 continue;
3766 /* Compare each bytes. */
3767 for (j = 0; j < *coll_sym; j++)
3768 if (pin[j] != coll_sym[1 + j])
3769 break;
3770 if (j == *coll_sym)
3771 {
3772 /* Match if every bytes is equal. */
3773 match_len = j;
3774 goto check_node_accept_bytes_match;
3775 }
3776 }
3777
3778 if (cset->nranges)
3779 {
3780 if (elem_len <= char_len)
3781 {
3782 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3783 in_collseq = __collseq_table_lookup (collseqwc, wc);
3784 }
3785 else
3786 in_collseq = find_collation_sequence_value (pin, elem_len);
3787 }
3788 /* match with range expression? */
3789 for (i = 0; i < cset->nranges; ++i)
3790 if (cset->range_starts[i] <= in_collseq
3791 && in_collseq <= cset->range_ends[i])
3792 {
3793 match_len = elem_len;
3794 goto check_node_accept_bytes_match;
3795 }
3796
3797 /* match with equivalence_class? */
3798 if (cset->nequiv_classes)
3799 {
3800 const unsigned char *cp = pin;
3801 table = (const int32_t *)
3802 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3803 weights = (const unsigned char *)
3804 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3805 extra = (const unsigned char *)
3806 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3807 indirect = (const int32_t *)
3808 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3809 idx = findidx (&cp);
3810 if (idx > 0)
3811 for (i = 0; i < cset->nequiv_classes; ++i)
3812 {
3813 int32_t equiv_class_idx = cset->equiv_classes[i];
3814 size_t weight_len = weights[idx];
3815 if (weight_len == weights[equiv_class_idx])
3816 {
3817 int cnt = 0;
3818 while (cnt <= weight_len
3819 && (weights[equiv_class_idx + 1 + cnt]
3820 == weights[idx + 1 + cnt]))
3821 ++cnt;
3822 if (cnt > weight_len)
3823 {
3824 match_len = elem_len;
3825 goto check_node_accept_bytes_match;
3826 }
3827 }
3828 }
3829 }
3830 }
3831 else
3832# endif
3833 {
3834 /* match with range expression? */
3835 wchar_t cmp_buf[6];
3836
3837 memset (cmp_buf, 0, sizeof(cmp_buf));
3838 cmp_buf[2] = wc;
3839 for (i = 0; i < cset->nranges; ++i)
3840 {
3841 cmp_buf[0] = cset->range_starts[i];
3842 cmp_buf[4] = cset->range_ends[i];
3843 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3844 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3845 {
3846 match_len = char_len;
3847 goto check_node_accept_bytes_match;
3848 }
3849 }
3850 }
3851
3852 check_node_accept_bytes_match:
3853 if (!cset->non_match)
3854 return match_len;
3855 if (match_len > 0)
3856 return 0;
3857 return (elem_len > char_len) ? elem_len : char_len;
3858 }
3859 return 0;
3860}
3861
3862# if 0
3863static unsigned int
3864internal_function
3865find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3866{
3867 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3868 if (nrules == 0)
3869 {
3870 if (mbs_len == 1)
3871 {
3872 /* No valid character. Match it as a single byte character. */
3873 const unsigned char *collseq = (const unsigned char *)
3874 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3875 return collseq[mbs[0]];
3876 }
3877 return UINT_MAX;
3878 }
3879 else
3880 {
3881 int32_t idx;
3882 const unsigned char *extra = (const unsigned char *)
3883 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3884 int32_t extrasize = (const unsigned char *)
3885 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3886
3887 for (idx = 0; idx < extrasize;)
3888 {
3889 int mbs_cnt, found = 0;
3890 int32_t elem_mbs_len;
3891 /* Skip the name of collating element name. */
3892 idx = idx + extra[idx] + 1;
3893 elem_mbs_len = extra[idx++];
3894 if (mbs_len == elem_mbs_len)
3895 {
3896 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3897 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3898 break;
3899 if (mbs_cnt == elem_mbs_len)
3900 /* Found the entry. */
3901 found = 1;
3902 }
3903 /* Skip the byte sequence of the collating element. */
3904 idx += elem_mbs_len;
3905 /* Adjust for the alignment. */
3906 idx = (idx + 3) & ~3;
3907 /* Skip the collation sequence value. */
3908 idx += sizeof (uint32_t);
3909 /* Skip the wide char sequence of the collating element. */
3910 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3911 /* If we found the entry, return the sequence value. */
3912 if (found)
3913 return *(uint32_t *) (extra + idx);
3914 /* Skip the collation sequence value. */
3915 idx += sizeof (uint32_t);
3916 }
3917 return UINT_MAX;
3918 }
3919}
3920# endif
3921#endif /* RE_ENABLE_I18N */
3922
3923/* Check whether the node accepts the byte which is IDX-th
3924 byte of the INPUT. */
3925
3926static int
3927internal_function
3928check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3929 int idx)
3930{
3931 unsigned char ch;
3932 ch = re_string_byte_at (&mctx->input, idx);
3933 switch (node->type)
3934 {
3935 case CHARACTER:
3936 if (node->opr.c != ch)
3937 return 0;
3938 break;
3939
3940 case SIMPLE_BRACKET:
3941 if (!bitset_contain (node->opr.sbcset, ch))
3942 return 0;
3943 break;
3944
3945#ifdef RE_ENABLE_I18N
3946 case OP_UTF8_PERIOD:
3947 if (ch >= 0x80)
3948 return 0;
3949 /* FALLTHROUGH */
3950#endif
3951 case OP_PERIOD:
3952 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3953 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3954 return 0;
3955 break;
3956
3957 default:
3958 return 0;
3959 }
3960
3961 if (node->constraint)
3962 {
3963 /* The node has constraints. Check whether the current context
3964 satisfies the constraints. */
3965 unsigned int context = re_string_context_at (&mctx->input, idx,
3966 mctx->eflags);
3967 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3968 return 0;
3969 }
3970
3971 return 1;
3972}
3973
3974/* Extend the buffers, if the buffers have run out. */
3975
3976static reg_errcode_t
3977internal_function
3978extend_buffers (re_match_context_t *mctx)
3979{
3980 reg_errcode_t ret;
3981 re_string_t *pstr = &mctx->input;
3982
3983 /* Double the lengthes of the buffers. */
3984 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3985 if (BE (ret != REG_NOERROR, 0))
3986 return ret;
3987
3988 if (mctx->state_log != NULL)
3989 {
3990 /* And double the length of state_log. */
3991 /* XXX We have no indication of the size of this buffer. If this
3992 allocation fail we have no indication that the state_log array
3993 does not have the right size. */
3994 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
3995 pstr->bufs_len + 1);
3996 if (BE (new_array == NULL, 0))
3997 return REG_ESPACE;
3998 mctx->state_log = new_array;
3999 }
4000
4001 /* Then reconstruct the buffers. */
4002 if (pstr->icase)
4003 {
4004#ifdef RE_ENABLE_I18N
4005 if (pstr->mb_cur_max > 1)
4006 {
4007 ret = build_wcs_upper_buffer (pstr);
4008 if (BE (ret != REG_NOERROR, 0))
4009 return ret;
4010 }
4011 else
4012#endif /* RE_ENABLE_I18N */
4013 build_upper_buffer (pstr);
4014 }
4015 else
4016 {
4017#ifdef RE_ENABLE_I18N
4018 if (pstr->mb_cur_max > 1)
4019 build_wcs_buffer (pstr);
4020 else
4021#endif /* RE_ENABLE_I18N */
4022 {
4023 if (pstr->trans != NULL)
4024 re_string_translate_buffer (pstr);
4025 }
4026 }
4027 return REG_NOERROR;
4028}
4029
4030
4031/* Functions for matching context. */
4032
4033/* Initialize MCTX. */
4034
4035static reg_errcode_t
4036internal_function
4037match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4038{
4039 mctx->eflags = eflags;
4040 mctx->match_last = -1;
4041 if (n > 0)
4042 {
4043 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4044 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4045 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4046 return REG_ESPACE;
4047 }
4048 /* Already zero-ed by the caller.
4049 else
4050 mctx->bkref_ents = NULL;
4051 mctx->nbkref_ents = 0;
4052 mctx->nsub_tops = 0; */
4053 mctx->abkref_ents = n;
4054 mctx->max_mb_elem_len = 1;
4055 mctx->asub_tops = n;
4056 return REG_NOERROR;
4057}
4058
4059/* Clean the entries which depend on the current input in MCTX.
4060 This function must be invoked when the matcher changes the start index
4061 of the input, or changes the input string. */
4062
4063static void
4064internal_function
4065match_ctx_clean (re_match_context_t *mctx)
4066{
4067 int st_idx;
4068 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4069 {
4070 int sl_idx;
4071 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4072 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4073 {
4074 re_sub_match_last_t *last = top->lasts[sl_idx];
4075 re_free (last->path.array);
4076 re_free (last);
4077 }
4078 re_free (top->lasts);
4079 if (top->path)
4080 {
4081 re_free (top->path->array);
4082 re_free (top->path);
4083 }
4084 free (top);
4085 }
4086
4087 mctx->nsub_tops = 0;
4088 mctx->nbkref_ents = 0;
4089}
4090
4091/* Free all the memory associated with MCTX. */
4092
4093static void
4094internal_function
4095match_ctx_free (re_match_context_t *mctx)
4096{
4097 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4098 match_ctx_clean (mctx);
4099 re_free (mctx->sub_tops);
4100 re_free (mctx->bkref_ents);
4101}
4102
4103/* Add a new backreference entry to MCTX.
4104 Note that we assume that caller never call this function with duplicate
4105 entry, and call with STR_IDX which isn't smaller than any existing entry.
4106*/
4107
4108static reg_errcode_t
4109internal_function
4110match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4111 int to)
4112{
4113 if (mctx->nbkref_ents >= mctx->abkref_ents)
4114 {
4115 struct re_backref_cache_entry* new_entry;
4116 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4117 mctx->abkref_ents * 2);
4118 if (BE (new_entry == NULL, 0))
4119 {
4120 re_free (mctx->bkref_ents);
4121 return REG_ESPACE;
4122 }
4123 mctx->bkref_ents = new_entry;
4124 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4125 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4126 mctx->abkref_ents *= 2;
4127 }
4128 if (mctx->nbkref_ents > 0
4129 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4130 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4131
4132 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4133 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4134 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4135 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4136
4137 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4138 If bit N is clear, means that this entry won't epsilon-transition to
4139 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4140 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4141 such node.
4142
4143 A backreference does not epsilon-transition unless it is empty, so set
4144 to all zeros if FROM != TO. */
4145 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4146 = (from == to ? ~0 : 0);
4147
4148 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4149 if (mctx->max_mb_elem_len < to - from)
4150 mctx->max_mb_elem_len = to - from;
4151 return REG_NOERROR;
4152}
4153
4154/* Search for the first entry which has the same str_idx, or -1 if none is
4155 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4156
4157static int
4158internal_function
4159search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4160{
4161 int left, right, mid, last;
4162 last = right = mctx->nbkref_ents;
4163 for (left = 0; left < right;)
4164 {
4165 mid = (left + right) / 2;
4166 if (mctx->bkref_ents[mid].str_idx < str_idx)
4167 left = mid + 1;
4168 else
4169 right = mid;
4170 }
4171 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4172 return left;
4173 else
4174 return -1;
4175}
4176
4177/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4178 at STR_IDX. */
4179
4180static reg_errcode_t
4181internal_function
4182match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4183{
4184#ifdef DEBUG
4185 assert (mctx->sub_tops != NULL);
4186 assert (mctx->asub_tops > 0);
4187#endif
4188 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4189 {
4190 int new_asub_tops = mctx->asub_tops * 2;
4191 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4192 re_sub_match_top_t *,
4193 new_asub_tops);
4194 if (BE (new_array == NULL, 0))
4195 return REG_ESPACE;
4196 mctx->sub_tops = new_array;
4197 mctx->asub_tops = new_asub_tops;
4198 }
4199 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4200 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4201 return REG_ESPACE;
4202 mctx->sub_tops[mctx->nsub_tops]->node = node;
4203 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4204 return REG_NOERROR;
4205}
4206
4207/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4208 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4209
4210static re_sub_match_last_t *
4211internal_function
4212match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4213{
4214 re_sub_match_last_t *new_entry;
4215 if (BE (subtop->nlasts == subtop->alasts, 0))
4216 {
4217 int new_alasts = 2 * subtop->alasts + 1;
4218 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4219 re_sub_match_last_t *,
4220 new_alasts);
4221 if (BE (new_array == NULL, 0))
4222 return NULL;
4223 subtop->lasts = new_array;
4224 subtop->alasts = new_alasts;
4225 }
4226 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4227 if (BE (new_entry != NULL, 1))
4228 {
4229 subtop->lasts[subtop->nlasts] = new_entry;
4230 new_entry->node = node;
4231 new_entry->str_idx = str_idx;
4232 ++subtop->nlasts;
4233 }
4234 return new_entry;
4235}
4236
4237static void
4238internal_function
4239sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4240 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4241{
4242 sctx->sifted_states = sifted_sts;
4243 sctx->limited_states = limited_sts;
4244 sctx->last_node = last_node;
4245 sctx->last_str_idx = last_str_idx;
4246 re_node_set_init_empty (&sctx->limits);
4247}