blob: b25850ccd68fb27c82ad9063201e01ee2c625d08 [file] [log] [blame]
lh9ed821d2023-04-07 01:36:19 -07001/* Extended regular expression matching and search library.
2 Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27#ifdef RE_ENABLE_I18N
28static void free_charset (re_charset_t *cset);
29#endif
30static void free_workarea_compile (regex_t *preg);
31static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32#ifdef RE_ENABLE_I18N
33static void optimize_utf8 (re_dfa_t *dfa);
34#endif
35static reg_errcode_t analyze (regex_t *preg);
36static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 int node, int root);
55static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56static int fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89#ifdef RE_ENABLE_I18N
90static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *char_class_alloc,
98 const unsigned char *class_name,
99 reg_syntax_t syntax);
100#else /* not RE_ENABLE_I18N */
101static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const unsigned char *class_name,
106 reg_syntax_t syntax);
107#endif /* not RE_ENABLE_I18N */
108static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const unsigned char *class_name,
111 const unsigned char *extra,
112 int non_match, reg_errcode_t *err);
113static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120static void free_token (re_token_t *node);
121static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123
124/* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
128
129static const char __re_error_msgid[] =
130 {
131#define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
133 "\0"
134#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
136 "\0"
137#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 "\0"
140#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 "\0"
143#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 "\0"
146#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 "\0"
149#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 "\0"
152#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 "\0"
155#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 "\0"
158#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 "\0"
161#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 "\0"
164#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 "\0"
167#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 "\0"
170#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 "\0"
173#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 "\0"
176#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 "\0"
179#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181 };
182
183static const uint16_t __re_error_msgid_idx[] =
184 {
185 REG_NOERROR_IDX,
186 REG_NOMATCH_IDX,
187 REG_BADPAT_IDX,
188 REG_ECOLLATE_IDX,
189 REG_ECTYPE_IDX,
190 REG_EESCAPE_IDX,
191 REG_ESUBREG_IDX,
192 REG_EBRACK_IDX,
193 REG_EPAREN_IDX,
194 REG_EBRACE_IDX,
195 REG_BADBR_IDX,
196 REG_ERANGE_IDX,
197 REG_ESPACE_IDX,
198 REG_BADRPT_IDX,
199 REG_EEND_IDX,
200 REG_ESIZE_IDX,
201 REG_ERPAREN_IDX
202 };
203
204/* Entry points for GNU code. */
205
206/* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
209
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
212
213const char *
214re_compile_pattern (const char *pattern,
215 size_t length,
216 struct re_pattern_buffer *bufp)
217{
218 reg_errcode_t ret;
219
220 /* And GNU code determines whether or not to get register information
221 by passing null for the REGS argument to re_match, etc., not by
222 setting no_sub, unless RE_NO_SUB is set. */
223 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
224
225 /* Match anchors at newline. */
226 bufp->newline_anchor = 1;
227
228 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
229
230 if (!ret)
231 return NULL;
232 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
233}
234
235/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
236 also be assigned to arbitrarily: each pattern buffer stores its own
237 syntax, so it can be changed between regex compilations. */
238/* This has no initializer because initialized variables in Emacs
239 become read-only after dumping. */
240reg_syntax_t re_syntax_options;
241
242
243/* Specify the precise syntax of regexps for compilation. This provides
244 for compatibility for various utilities which historically have
245 different, incompatible syntaxes.
246
247 The argument SYNTAX is a bit mask comprised of the various bits
248 defined in regex.h. We return the old syntax. */
249
250reg_syntax_t
251re_set_syntax (reg_syntax_t syntax)
252{
253 reg_syntax_t ret = re_syntax_options;
254
255 re_syntax_options = syntax;
256 return ret;
257}
258
259int
260re_compile_fastmap (struct re_pattern_buffer *bufp)
261{
262 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
263 char *fastmap = bufp->fastmap;
264
265 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
266 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
267 if (dfa->init_state != dfa->init_state_word)
268 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
269 if (dfa->init_state != dfa->init_state_nl)
270 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
271 if (dfa->init_state != dfa->init_state_begbuf)
272 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
273 bufp->fastmap_accurate = 1;
274 return 0;
275}
276libc_hidden_def(re_compile_fastmap)
277
278static __inline__ void
279__attribute ((always_inline))
280re_set_fastmap (char *fastmap, int icase, int ch)
281{
282 fastmap[ch] = 1;
283 if (icase)
284 fastmap[tolower (ch)] = 1;
285}
286
287/* Helper function for re_compile_fastmap.
288 Compile fastmap for the initial_state INIT_STATE. */
289
290static void
291re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
292 char *fastmap)
293{
294 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
295 int node_cnt;
296 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
297 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
298 {
299 int node = init_state->nodes.elems[node_cnt];
300 re_token_type_t type = dfa->nodes[node].type;
301
302 if (type == CHARACTER)
303 {
304 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
305#ifdef RE_ENABLE_I18N
306 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
307 {
308 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
309 wchar_t wc;
310 mbstate_t state;
311
312 p = buf;
313 *p++ = dfa->nodes[node].opr.c;
314 while (++node < dfa->nodes_len
315 && dfa->nodes[node].type == CHARACTER
316 && dfa->nodes[node].mb_partial)
317 *p++ = dfa->nodes[node].opr.c;
318 memset (&state, '\0', sizeof (state));
319 if (mbrtowc (&wc, (const char *) buf, p - buf,
320 &state) == p - buf
321 && (__wcrtomb ((char *) buf, towlower (wc), &state)
322 != (size_t) -1))
323 re_set_fastmap (fastmap, 0, buf[0]);
324 }
325#endif
326 }
327 else if (type == SIMPLE_BRACKET)
328 {
329 int i, ch;
330 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
331 {
332 int j;
333 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
334 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
335 if (w & ((bitset_word_t) 1 << j))
336 re_set_fastmap (fastmap, icase, ch);
337 }
338 }
339#ifdef RE_ENABLE_I18N
340 else if (type == COMPLEX_BRACKET)
341 {
342 int i;
343 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
344 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
345 || cset->nranges || cset->nchar_classes)
346 {
347# if 0
348 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
349 {
350 /* In this case we want to catch the bytes which are
351 the first byte of any collation elements.
352 e.g. In da_DK, we want to catch 'a' since "aa"
353 is a valid collation element, and don't catch
354 'b' since 'b' is the only collation element
355 which starts from 'b'. */
356 const int32_t *table = (const int32_t *)
357 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
358 for (i = 0; i < SBC_MAX; ++i)
359 if (table[i] < 0)
360 re_set_fastmap (fastmap, icase, i);
361 }
362# else
363 if (dfa->mb_cur_max > 1)
364 for (i = 0; i < SBC_MAX; ++i)
365 if (__btowc (i) == WEOF)
366 re_set_fastmap (fastmap, icase, i);
367# endif
368 }
369 for (i = 0; i < cset->nmbchars; ++i)
370 {
371 char buf[256];
372 mbstate_t state;
373 memset (&state, '\0', sizeof (state));
374 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
375 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
376 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
377 {
378 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
379 != (size_t) -1)
380 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
381 }
382 }
383 }
384#endif /* RE_ENABLE_I18N */
385 else if (type == OP_PERIOD
386#ifdef RE_ENABLE_I18N
387 || type == OP_UTF8_PERIOD
388#endif
389 || type == END_OF_RE)
390 {
391 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
392 if (type == END_OF_RE)
393 bufp->can_be_null = 1;
394 return;
395 }
396 }
397}
398
399/* Entry point for POSIX code. */
400/* regcomp takes a regular expression as a string and compiles it.
401
402 PREG is a regex_t *. We do not expect any fields to be initialized,
403 since POSIX says we shouldn't. Thus, we set
404
405 `buffer' to the compiled pattern;
406 `used' to the length of the compiled pattern;
407 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
408 REG_EXTENDED bit in CFLAGS is set; otherwise, to
409 RE_SYNTAX_POSIX_BASIC;
410 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
411 `fastmap' to an allocated space for the fastmap;
412 `fastmap_accurate' to zero;
413 `re_nsub' to the number of subexpressions in PATTERN.
414
415 PATTERN is the address of the pattern string.
416
417 CFLAGS is a series of bits which affect compilation.
418
419 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
420 use POSIX basic syntax.
421
422 If REG_NEWLINE is set, then . and [^...] don't match newline.
423 Also, regexec will try a match beginning after every newline.
424
425 If REG_ICASE is set, then we considers upper- and lowercase
426 versions of letters to be equivalent when matching.
427
428 If REG_NOSUB is set, then when PREG is passed to regexec, that
429 routine will report only success or failure, and nothing about the
430 registers.
431
432 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
433 the return codes and their meanings.) */
434
435int
436regcomp (regex_t *__restrict preg,
437 const char *__restrict pattern,
438 int cflags)
439{
440 reg_errcode_t ret;
441 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
442 : RE_SYNTAX_POSIX_BASIC);
443
444 preg->buffer = NULL;
445 preg->allocated = 0;
446 preg->used = 0;
447
448 /* Try to allocate space for the fastmap. */
449 preg->fastmap = re_malloc (char, SBC_MAX);
450 if (BE (preg->fastmap == NULL, 0))
451 return REG_ESPACE;
452
453 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
454
455 /* If REG_NEWLINE is set, newlines are treated differently. */
456 if (cflags & REG_NEWLINE)
457 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
458 syntax &= ~RE_DOT_NEWLINE;
459 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
460 /* It also changes the matching behavior. */
461 preg->newline_anchor = 1;
462 }
463 else
464 preg->newline_anchor = 0;
465 preg->no_sub = !!(cflags & REG_NOSUB);
466 preg->translate = NULL;
467
468 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
469
470 /* POSIX doesn't distinguish between an unmatched open-group and an
471 unmatched close-group: both are REG_EPAREN. */
472 if (ret == REG_ERPAREN)
473 ret = REG_EPAREN;
474
475 /* We have already checked preg->fastmap != NULL. */
476 if (BE (ret == REG_NOERROR, 1))
477 /* Compute the fastmap now, since regexec cannot modify the pattern
478 buffer. This function never fails in this implementation. */
479 (void) re_compile_fastmap (preg);
480 else
481 {
482 /* Some error occurred while compiling the expression. */
483 re_free (preg->fastmap);
484 preg->fastmap = NULL;
485 }
486
487 return (int) ret;
488}
489
490/* Returns a message corresponding to an error code, ERRCODE, returned
491 from either regcomp or regexec. We don't use PREG here. */
492
493size_t
494regerror (int errcode,
495 const regex_t *__restrict preg,
496 char *__restrict errbuf,
497 size_t errbuf_size)
498{
499 const char *msg;
500 size_t msg_size;
501
502 if (BE (errcode < 0
503 || errcode >= (int) (sizeof (__re_error_msgid_idx)
504 / sizeof (__re_error_msgid_idx[0])), 0))
505 /* Only error codes returned by the rest of the code should be passed
506 to this routine. If we are given anything else, or if other regex
507 code generates an invalid error code, then the program has a bug.
508 Dump core so we can fix it. */
509 abort ();
510
511 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
512
513 msg_size = strlen (msg) + 1; /* Includes the null. */
514
515 if (BE (errbuf_size != 0, 1))
516 {
517 if (BE (msg_size > errbuf_size, 0))
518 {
519 memcpy (errbuf, msg, errbuf_size - 1);
520 errbuf[errbuf_size - 1] = 0;
521 }
522 else
523 memcpy (errbuf, msg, msg_size);
524 }
525
526 return msg_size;
527}
528
529
530#ifdef RE_ENABLE_I18N
531/* This static array is used for the map to single-byte characters when
532 UTF-8 is used. Otherwise we would allocate memory just to initialize
533 it the same all the time. UTF-8 is the preferred encoding so this is
534 a worthwhile optimization. */
535static const bitset_t utf8_sb_map =
536{
537 /* Set the first 128 bits. */
538 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
539};
540#endif
541
542
543static void
544free_dfa_content (re_dfa_t *dfa)
545{
546 int i, j;
547
548 if (dfa->nodes)
549 for (i = 0; i < dfa->nodes_len; ++i)
550 free_token (dfa->nodes + i);
551 re_free (dfa->nexts);
552 for (i = 0; i < dfa->nodes_len; ++i)
553 {
554 if (dfa->eclosures != NULL)
555 re_node_set_free (dfa->eclosures + i);
556 if (dfa->inveclosures != NULL)
557 re_node_set_free (dfa->inveclosures + i);
558 if (dfa->edests != NULL)
559 re_node_set_free (dfa->edests + i);
560 }
561 re_free (dfa->edests);
562 re_free (dfa->eclosures);
563 re_free (dfa->inveclosures);
564 re_free (dfa->nodes);
565
566 if (dfa->state_table)
567 for (i = 0; i <= dfa->state_hash_mask; ++i)
568 {
569 struct re_state_table_entry *entry = dfa->state_table + i;
570 for (j = 0; j < entry->num; ++j)
571 {
572 re_dfastate_t *state = entry->array[j];
573 free_state (state);
574 }
575 re_free (entry->array);
576 }
577 re_free (dfa->state_table);
578#ifdef RE_ENABLE_I18N
579 if (dfa->sb_char != utf8_sb_map)
580 re_free (dfa->sb_char);
581#endif
582 re_free (dfa->subexp_map);
583#ifdef DEBUG
584 re_free (dfa->re_str);
585#endif
586
587 re_free (dfa);
588}
589
590
591/* Free dynamically allocated space used by PREG. */
592
593void
594regfree (regex_t *preg)
595{
596 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
597 if (BE (dfa != NULL, 1))
598 free_dfa_content (dfa);
599 preg->buffer = NULL;
600 preg->allocated = 0;
601
602 re_free (preg->fastmap);
603 preg->fastmap = NULL;
604
605 re_free (preg->translate);
606 preg->translate = NULL;
607}
608libc_hidden_def(regfree)
609
610/* Entry points compatible with 4.2 BSD regex library. We don't define
611 them unless specifically requested. */
612
613#if defined _REGEX_RE_COMP || defined __UCLIBC__
614
615/* BSD has one and only one pattern buffer. */
616static struct re_pattern_buffer *re_comp_buf;
617
618char *
619/* Make BCD definitions weak in libc, so POSIX programs can redefine
620 these names if they don't use our functions, and still use
621 regcomp/regexec above without link errors. */
622weak_function
623re_comp (const char *s)
624{
625 reg_errcode_t ret;
626
627 /* "If re_comp() is passed NULL or a null string, it returns
628 * without changing the currently compiled regular expression." */
629 if (!s || !s[0])
630 {
631 if (!re_comp_buf)
632 return gettext ("No previous regular expression");
633 return NULL;
634 }
635
636 if (!re_comp_buf)
637 {
638 re_comp_buf = calloc (1, sizeof(*re_comp_buf));
639 if (!re_comp_buf)
640 {
641 ret = REG_ESPACE;
642 goto err;
643 }
644 }
645
646 if (re_comp_buf->buffer)
647 {
648 regfree (re_comp_buf);
649 memset (re_comp_buf, '\0', sizeof(*re_comp_buf));
650 }
651
652 if (re_comp_buf->fastmap == NULL)
653 {
654 re_comp_buf->fastmap = malloc (SBC_MAX);
655 if (re_comp_buf->fastmap == NULL)
656 {
657 ret = REG_ESPACE;
658 goto err;
659 }
660 }
661
662 /* Since `re_exec' always passes NULL for the `regs' argument, we
663 don't need to initialize the pattern buffer fields which affect it. */
664
665 /* Match anchors at newlines. */
666 re_comp_buf->newline_anchor = 1;
667
668 ret = re_compile_internal (re_comp_buf, s, strlen (s), re_syntax_options);
669
670 if (!ret)
671 return NULL;
672 free (re_comp_buf);
673 re_comp_buf = NULL;
674
675 err:
676 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
677 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
678}
679
680#if 0
681libc_freeres_fn (free_mem)
682{
683 regfree (re_comp_buf);
684 free (re_comp_buf);
685 re_comp_buf = NULL;
686}
687#endif
688
689#endif /* _REGEX_RE_COMP */
690
691/* Internal entry point.
692 Compile the regular expression PATTERN, whose length is LENGTH.
693 SYNTAX indicate regular expression's syntax. */
694
695static reg_errcode_t
696re_compile_internal (regex_t *preg, const char * pattern, size_t length,
697 reg_syntax_t syntax)
698{
699 reg_errcode_t err = REG_NOERROR;
700 re_dfa_t *dfa;
701 re_string_t regexp;
702
703 /* Initialize the pattern buffer. */
704 preg->fastmap_accurate = 0;
705 preg->syntax = syntax;
706 preg->not_bol = preg->not_eol = 0;
707 preg->used = 0;
708 preg->re_nsub = 0;
709 preg->can_be_null = 0;
710 preg->regs_allocated = REGS_UNALLOCATED;
711
712 /* Initialize the dfa. */
713 dfa = (re_dfa_t *) preg->buffer;
714 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
715 {
716 /* If zero allocated, but buffer is non-null, try to realloc
717 enough space. This loses if buffer's address is bogus, but
718 that is the user's responsibility. If ->buffer is NULL this
719 is a simple allocation. */
720 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
721 if (dfa == NULL)
722 return REG_ESPACE;
723 preg->allocated = sizeof (re_dfa_t);
724 preg->buffer = (unsigned char *) dfa;
725 }
726 preg->used = sizeof (re_dfa_t);
727
728 err = init_dfa (dfa, length);
729 if (BE (err != REG_NOERROR, 0))
730 {
731 free_dfa_content (dfa);
732 preg->buffer = NULL;
733 preg->allocated = 0;
734 return err;
735 }
736#ifdef DEBUG
737 /* Note: length+1 will not overflow since it is checked in init_dfa. */
738 dfa->re_str = re_malloc (char, length + 1);
739 strncpy (dfa->re_str, pattern, length + 1);
740#endif
741
742 __libc_lock_init (dfa->lock);
743
744 err = re_string_construct (&regexp, pattern, length, preg->translate,
745 syntax & RE_ICASE, dfa);
746 if (BE (err != REG_NOERROR, 0))
747 {
748 re_compile_internal_free_return:
749 free_workarea_compile (preg);
750 re_string_destruct (&regexp);
751 free_dfa_content (dfa);
752 preg->buffer = NULL;
753 preg->allocated = 0;
754 return err;
755 }
756
757 /* Parse the regular expression, and build a structure tree. */
758 preg->re_nsub = 0;
759 dfa->str_tree = parse (&regexp, preg, syntax, &err);
760 if (BE (dfa->str_tree == NULL, 0))
761 goto re_compile_internal_free_return;
762
763 /* Analyze the tree and create the nfa. */
764 err = analyze (preg);
765 if (BE (err != REG_NOERROR, 0))
766 goto re_compile_internal_free_return;
767
768#ifdef RE_ENABLE_I18N
769 /* If possible, do searching in single byte encoding to speed things up. */
770 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
771 optimize_utf8 (dfa);
772#endif
773
774 /* Then create the initial state of the dfa. */
775 err = create_initial_state (dfa);
776
777 /* Release work areas. */
778 free_workarea_compile (preg);
779 re_string_destruct (&regexp);
780
781 if (BE (err != REG_NOERROR, 0))
782 {
783 free_dfa_content (dfa);
784 preg->buffer = NULL;
785 preg->allocated = 0;
786 }
787
788 return err;
789}
790
791/* Initialize DFA. We use the length of the regular expression PAT_LEN
792 as the initial length of some arrays. */
793
794static reg_errcode_t
795init_dfa (re_dfa_t *dfa, size_t pat_len)
796{
797 unsigned int table_size;
798#if 1
799 char *codeset_name;
800#endif
801
802 memset (dfa, '\0', sizeof (re_dfa_t));
803
804 /* Force allocation of str_tree_storage the first time. */
805 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
806
807 /* Avoid overflows. */
808 if (pat_len == SIZE_MAX)
809 return REG_ESPACE;
810
811 dfa->nodes_alloc = pat_len + 1;
812 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
813
814 /* table_size = 2 ^ ceil(log pat_len) */
815 for (table_size = 1; ; table_size <<= 1)
816 if (table_size > pat_len)
817 break;
818
819 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
820 dfa->state_hash_mask = table_size - 1;
821
822 dfa->mb_cur_max = MB_CUR_MAX;
823#if 0
824 if (dfa->mb_cur_max == 6
825 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
826 dfa->is_utf8 = 1;
827 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
828 != 0);
829#else
830# ifdef HAVE_LANGINFO_CODESET
831 codeset_name = nl_langinfo (CODESET);
832# else
833 codeset_name = getenv ("LC_ALL");
834 if (codeset_name == NULL || codeset_name[0] == '\0')
835 codeset_name = getenv ("LC_CTYPE");
836 if (codeset_name == NULL || codeset_name[0] == '\0')
837 codeset_name = getenv ("LANG");
838 if (codeset_name == NULL)
839 codeset_name = "";
840 else if (strchr (codeset_name, '.') != NULL)
841 codeset_name = strchr (codeset_name, '.') + 1;
842# endif
843
844 if (strcasecmp (codeset_name, "UTF-8") == 0
845 || strcasecmp (codeset_name, "UTF8") == 0)
846 dfa->is_utf8 = 1;
847
848 /* We check exhaustively in the loop below if this charset is a
849 superset of ASCII. */
850 dfa->map_notascii = 0;
851#endif
852
853#ifdef RE_ENABLE_I18N
854 if (dfa->mb_cur_max > 1)
855 {
856 if (dfa->is_utf8)
857 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
858 else
859 {
860 int i, j, ch;
861
862 dfa->sb_char = calloc (sizeof (bitset_t), 1);
863 if (BE (dfa->sb_char == NULL, 0))
864 return REG_ESPACE;
865
866 /* Set the bits corresponding to single byte chars. */
867 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
868 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
869 {
870 wint_t wch = __btowc (ch);
871 if (wch != WEOF)
872 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
873# if 1
874 if (isascii (ch) && wch != ch)
875 dfa->map_notascii = 1;
876# endif
877 }
878 }
879 }
880#endif
881
882 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
883 return REG_ESPACE;
884 return REG_NOERROR;
885}
886
887/* Initialize WORD_CHAR table, which indicate which character is
888 "word". In this case "word" means that it is the word construction
889 character used by some operators like "\<", "\>", etc. */
890
891static void
892internal_function
893init_word_char (re_dfa_t *dfa)
894{
895 int i, j, ch;
896 dfa->word_ops_used = 1;
897 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
898 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
899 if (isalnum (ch) || ch == '_')
900 dfa->word_char[i] |= (bitset_word_t) 1 << j;
901}
902
903/* Free the work area which are only used while compiling. */
904
905static void
906free_workarea_compile (regex_t *preg)
907{
908 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
909 bin_tree_storage_t *storage, *next;
910 for (storage = dfa->str_tree_storage; storage; storage = next)
911 {
912 next = storage->next;
913 re_free (storage);
914 }
915 dfa->str_tree_storage = NULL;
916 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
917 dfa->str_tree = NULL;
918 re_free (dfa->org_indices);
919 dfa->org_indices = NULL;
920}
921
922/* Create initial states for all contexts. */
923
924static reg_errcode_t
925create_initial_state (re_dfa_t *dfa)
926{
927 int first, i;
928 reg_errcode_t err;
929 re_node_set init_nodes;
930
931 /* Initial states have the epsilon closure of the node which is
932 the first node of the regular expression. */
933 first = dfa->str_tree->first->node_idx;
934 dfa->init_node = first;
935 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
936 if (BE (err != REG_NOERROR, 0))
937 return err;
938
939 /* The back-references which are in initial states can epsilon transit,
940 since in this case all of the subexpressions can be null.
941 Then we add epsilon closures of the nodes which are the next nodes of
942 the back-references. */
943 if (dfa->nbackref > 0)
944 for (i = 0; i < init_nodes.nelem; ++i)
945 {
946 int node_idx = init_nodes.elems[i];
947 re_token_type_t type = dfa->nodes[node_idx].type;
948
949 int clexp_idx;
950 if (type != OP_BACK_REF)
951 continue;
952 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
953 {
954 re_token_t *clexp_node;
955 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
956 if (clexp_node->type == OP_CLOSE_SUBEXP
957 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
958 break;
959 }
960 if (clexp_idx == init_nodes.nelem)
961 continue;
962
963 if (type == OP_BACK_REF)
964 {
965 int dest_idx = dfa->edests[node_idx].elems[0];
966 if (!re_node_set_contains (&init_nodes, dest_idx))
967 {
968 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
969 i = 0;
970 }
971 }
972 }
973
974 /* It must be the first time to invoke acquire_state. */
975 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
976 /* We don't check ERR here, since the initial state must not be NULL. */
977 if (BE (dfa->init_state == NULL, 0))
978 return err;
979 if (dfa->init_state->has_constraint)
980 {
981 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
982 CONTEXT_WORD);
983 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
984 CONTEXT_NEWLINE);
985 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
986 &init_nodes,
987 CONTEXT_NEWLINE
988 | CONTEXT_BEGBUF);
989 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
990 || dfa->init_state_begbuf == NULL, 0))
991 return err;
992 }
993 else
994 dfa->init_state_word = dfa->init_state_nl
995 = dfa->init_state_begbuf = dfa->init_state;
996
997 re_node_set_free (&init_nodes);
998 return REG_NOERROR;
999}
1000
1001#ifdef RE_ENABLE_I18N
1002/* If it is possible to do searching in single byte encoding instead of UTF-8
1003 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1004 DFA nodes where needed. */
1005
1006static void
1007optimize_utf8 (re_dfa_t *dfa)
1008{
1009 int node, i, mb_chars = 0, has_period = 0;
1010
1011 for (node = 0; node < dfa->nodes_len; ++node)
1012 switch (dfa->nodes[node].type)
1013 {
1014 case CHARACTER:
1015 if (dfa->nodes[node].opr.c >= 0x80)
1016 mb_chars = 1;
1017 break;
1018 case ANCHOR:
1019 switch (dfa->nodes[node].opr.idx)
1020 {
1021 case LINE_FIRST:
1022 case LINE_LAST:
1023 case BUF_FIRST:
1024 case BUF_LAST:
1025 break;
1026 default:
1027 /* Word anchors etc. cannot be handled. */
1028 return;
1029 }
1030 break;
1031 case OP_PERIOD:
1032 has_period = 1;
1033 break;
1034 case OP_BACK_REF:
1035 case OP_ALT:
1036 case END_OF_RE:
1037 case OP_DUP_ASTERISK:
1038 case OP_OPEN_SUBEXP:
1039 case OP_CLOSE_SUBEXP:
1040 break;
1041 case COMPLEX_BRACKET:
1042 return;
1043 case SIMPLE_BRACKET:
1044 /* Just double check. The non-ASCII range starts at 0x80. */
1045 assert (0x80 % BITSET_WORD_BITS == 0);
1046 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1047 if (dfa->nodes[node].opr.sbcset[i])
1048 return;
1049 break;
1050 default:
1051 abort ();
1052 }
1053
1054 if (mb_chars || has_period)
1055 for (node = 0; node < dfa->nodes_len; ++node)
1056 {
1057 if (dfa->nodes[node].type == CHARACTER
1058 && dfa->nodes[node].opr.c >= 0x80)
1059 dfa->nodes[node].mb_partial = 0;
1060 else if (dfa->nodes[node].type == OP_PERIOD)
1061 dfa->nodes[node].type = OP_UTF8_PERIOD;
1062 }
1063
1064 /* The search can be in single byte locale. */
1065 dfa->mb_cur_max = 1;
1066 dfa->is_utf8 = 0;
1067 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1068}
1069#endif
1070
1071/* Analyze the structure tree, and calculate "first", "next", "edest",
1072 "eclosure", and "inveclosure". */
1073
1074static reg_errcode_t
1075analyze (regex_t *preg)
1076{
1077 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1078 reg_errcode_t ret;
1079
1080 /* Allocate arrays. */
1081 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1082 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1083 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1084 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1085 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1086 || dfa->eclosures == NULL, 0))
1087 return REG_ESPACE;
1088
1089 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1090 if (dfa->subexp_map != NULL)
1091 {
1092 int i;
1093 for (i = 0; i < preg->re_nsub; i++)
1094 dfa->subexp_map[i] = i;
1095 preorder (dfa->str_tree, optimize_subexps, dfa);
1096 for (i = 0; i < preg->re_nsub; i++)
1097 if (dfa->subexp_map[i] != i)
1098 break;
1099 if (i == preg->re_nsub)
1100 {
1101 free (dfa->subexp_map);
1102 dfa->subexp_map = NULL;
1103 }
1104 }
1105
1106 ret = postorder (dfa->str_tree, lower_subexps, preg);
1107 if (BE (ret != REG_NOERROR, 0))
1108 return ret;
1109 ret = postorder (dfa->str_tree, calc_first, dfa);
1110 if (BE (ret != REG_NOERROR, 0))
1111 return ret;
1112 preorder (dfa->str_tree, calc_next, dfa);
1113 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1114 if (BE (ret != REG_NOERROR, 0))
1115 return ret;
1116 ret = calc_eclosure (dfa);
1117 if (BE (ret != REG_NOERROR, 0))
1118 return ret;
1119
1120 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1121 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1122 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1123 || dfa->nbackref)
1124 {
1125 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1126 if (BE (dfa->inveclosures == NULL, 0))
1127 return REG_ESPACE;
1128 ret = calc_inveclosure (dfa);
1129 }
1130
1131 return ret;
1132}
1133
1134/* Our parse trees are very unbalanced, so we cannot use a stack to
1135 implement parse tree visits. Instead, we use parent pointers and
1136 some hairy code in these two functions. */
1137static reg_errcode_t
1138postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1139 void *extra)
1140{
1141 bin_tree_t *node, *prev;
1142
1143 for (node = root; ; )
1144 {
1145 /* Descend down the tree, preferably to the left (or to the right
1146 if that's the only child). */
1147 while (node->left || node->right)
1148 if (node->left)
1149 node = node->left;
1150 else
1151 node = node->right;
1152
1153 do
1154 {
1155 reg_errcode_t err = fn (extra, node);
1156 if (BE (err != REG_NOERROR, 0))
1157 return err;
1158 if (node->parent == NULL)
1159 return REG_NOERROR;
1160 prev = node;
1161 node = node->parent;
1162 }
1163 /* Go up while we have a node that is reached from the right. */
1164 while (node->right == prev || node->right == NULL);
1165 node = node->right;
1166 }
1167}
1168
1169static reg_errcode_t
1170preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1171 void *extra)
1172{
1173 bin_tree_t *node;
1174
1175 for (node = root; ; )
1176 {
1177 reg_errcode_t err = fn (extra, node);
1178 if (BE (err != REG_NOERROR, 0))
1179 return err;
1180
1181 /* Go to the left node, or up and to the right. */
1182 if (node->left)
1183 node = node->left;
1184 else
1185 {
1186 bin_tree_t *prev = NULL;
1187 while (node->right == prev || node->right == NULL)
1188 {
1189 prev = node;
1190 node = node->parent;
1191 if (!node)
1192 return REG_NOERROR;
1193 }
1194 node = node->right;
1195 }
1196 }
1197}
1198
1199/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1200 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1201 backreferences as well. Requires a preorder visit. */
1202static reg_errcode_t
1203optimize_subexps (void *extra, bin_tree_t *node)
1204{
1205 re_dfa_t *dfa = (re_dfa_t *) extra;
1206
1207 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1208 {
1209 int idx = node->token.opr.idx;
1210 node->token.opr.idx = dfa->subexp_map[idx];
1211 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1212 }
1213
1214 else if (node->token.type == SUBEXP
1215 && node->left && node->left->token.type == SUBEXP)
1216 {
1217 int other_idx = node->left->token.opr.idx;
1218
1219 node->left = node->left->left;
1220 if (node->left)
1221 node->left->parent = node;
1222
1223 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1224 if (other_idx < BITSET_WORD_BITS)
1225 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1226 }
1227
1228 return REG_NOERROR;
1229}
1230
1231/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1232 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1233static reg_errcode_t
1234lower_subexps (void *extra, bin_tree_t *node)
1235{
1236 regex_t *preg = (regex_t *) extra;
1237 reg_errcode_t err = REG_NOERROR;
1238
1239 if (node->left && node->left->token.type == SUBEXP)
1240 {
1241 node->left = lower_subexp (&err, preg, node->left);
1242 if (node->left)
1243 node->left->parent = node;
1244 }
1245 if (node->right && node->right->token.type == SUBEXP)
1246 {
1247 node->right = lower_subexp (&err, preg, node->right);
1248 if (node->right)
1249 node->right->parent = node;
1250 }
1251
1252 return err;
1253}
1254
1255static bin_tree_t *
1256lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1257{
1258 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1259 bin_tree_t *body = node->left;
1260 bin_tree_t *op, *cls, *tree1, *tree;
1261
1262 if (preg->no_sub
1263 /* We do not optimize empty subexpressions, because otherwise we may
1264 have bad CONCAT nodes with NULL children. This is obviously not
1265 very common, so we do not lose much. An example that triggers
1266 this case is the sed "script" /\(\)/x. */
1267 && node->left != NULL
1268 && (node->token.opr.idx >= BITSET_WORD_BITS
1269 || !(dfa->used_bkref_map
1270 & ((bitset_word_t) 1 << node->token.opr.idx))))
1271 return node->left;
1272
1273 /* Convert the SUBEXP node to the concatenation of an
1274 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1275 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1276 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1277 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1278 tree = create_tree (dfa, op, tree1, CONCAT);
1279 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1280 {
1281 *err = REG_ESPACE;
1282 return NULL;
1283 }
1284
1285 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1286 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1287 return tree;
1288}
1289
1290/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1291 nodes. Requires a postorder visit. */
1292static reg_errcode_t
1293calc_first (void *extra, bin_tree_t *node)
1294{
1295 re_dfa_t *dfa = (re_dfa_t *) extra;
1296 if (node->token.type == CONCAT)
1297 {
1298 node->first = node->left->first;
1299 node->node_idx = node->left->node_idx;
1300 }
1301 else
1302 {
1303 node->first = node;
1304 node->node_idx = re_dfa_add_node (dfa, node->token);
1305 if (BE (node->node_idx == -1, 0))
1306 return REG_ESPACE;
1307 }
1308 return REG_NOERROR;
1309}
1310
1311/* Pass 2: compute NEXT on the tree. Preorder visit. */
1312static reg_errcode_t
1313calc_next (void *extra, bin_tree_t *node)
1314{
1315 switch (node->token.type)
1316 {
1317 case OP_DUP_ASTERISK:
1318 node->left->next = node;
1319 break;
1320 case CONCAT:
1321 node->left->next = node->right->first;
1322 node->right->next = node->next;
1323 break;
1324 default:
1325 if (node->left)
1326 node->left->next = node->next;
1327 if (node->right)
1328 node->right->next = node->next;
1329 break;
1330 }
1331 return REG_NOERROR;
1332}
1333
1334/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1335static reg_errcode_t
1336link_nfa_nodes (void *extra, bin_tree_t *node)
1337{
1338 re_dfa_t *dfa = (re_dfa_t *) extra;
1339 int idx = node->node_idx;
1340 reg_errcode_t err = REG_NOERROR;
1341
1342 switch (node->token.type)
1343 {
1344 case CONCAT:
1345 break;
1346
1347 case END_OF_RE:
1348 assert (node->next == NULL);
1349 break;
1350
1351 case OP_DUP_ASTERISK:
1352 case OP_ALT:
1353 {
1354 int left, right;
1355 dfa->has_plural_match = 1;
1356 if (node->left != NULL)
1357 left = node->left->first->node_idx;
1358 else
1359 left = node->next->node_idx;
1360 if (node->right != NULL)
1361 right = node->right->first->node_idx;
1362 else
1363 right = node->next->node_idx;
1364 assert (left > -1);
1365 assert (right > -1);
1366 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1367 }
1368 break;
1369
1370 case ANCHOR:
1371 case OP_OPEN_SUBEXP:
1372 case OP_CLOSE_SUBEXP:
1373 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1374 break;
1375
1376 case OP_BACK_REF:
1377 dfa->nexts[idx] = node->next->node_idx;
1378 if (node->token.type == OP_BACK_REF)
1379 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1380 break;
1381
1382 default:
1383 assert (!IS_EPSILON_NODE (node->token.type));
1384 dfa->nexts[idx] = node->next->node_idx;
1385 break;
1386 }
1387
1388 return err;
1389}
1390
1391/* Duplicate the epsilon closure of the node ROOT_NODE.
1392 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1393 to their own constraint. */
1394
1395static reg_errcode_t
1396internal_function
1397duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1398 int root_node, unsigned int init_constraint)
1399{
1400 int org_node, clone_node, ret;
1401 unsigned int constraint = init_constraint;
1402 for (org_node = top_org_node, clone_node = top_clone_node;;)
1403 {
1404 int org_dest, clone_dest;
1405 if (dfa->nodes[org_node].type == OP_BACK_REF)
1406 {
1407 /* If the back reference epsilon-transit, its destination must
1408 also have the constraint. Then duplicate the epsilon closure
1409 of the destination of the back reference, and store it in
1410 edests of the back reference. */
1411 org_dest = dfa->nexts[org_node];
1412 re_node_set_empty (dfa->edests + clone_node);
1413 clone_dest = duplicate_node (dfa, org_dest, constraint);
1414 if (BE (clone_dest == -1, 0))
1415 return REG_ESPACE;
1416 dfa->nexts[clone_node] = dfa->nexts[org_node];
1417 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1418 if (BE (ret < 0, 0))
1419 return REG_ESPACE;
1420 }
1421 else if (dfa->edests[org_node].nelem == 0)
1422 {
1423 /* In case of the node can't epsilon-transit, don't duplicate the
1424 destination and store the original destination as the
1425 destination of the node. */
1426 dfa->nexts[clone_node] = dfa->nexts[org_node];
1427 break;
1428 }
1429 else if (dfa->edests[org_node].nelem == 1)
1430 {
1431 /* In case of the node can epsilon-transit, and it has only one
1432 destination. */
1433 org_dest = dfa->edests[org_node].elems[0];
1434 re_node_set_empty (dfa->edests + clone_node);
1435 if (dfa->nodes[org_node].type == ANCHOR)
1436 {
1437 /* In case of the node has another constraint, append it. */
1438 if (org_node == root_node && clone_node != org_node)
1439 {
1440 /* ...but if the node is root_node itself, it means the
1441 epsilon closure have a loop, then tie it to the
1442 destination of the root_node. */
1443 ret = re_node_set_insert (dfa->edests + clone_node,
1444 org_dest);
1445 if (BE (ret < 0, 0))
1446 return REG_ESPACE;
1447 break;
1448 }
1449 constraint |= dfa->nodes[org_node].opr.ctx_type;
1450 }
1451 clone_dest = duplicate_node (dfa, org_dest, constraint);
1452 if (BE (clone_dest == -1, 0))
1453 return REG_ESPACE;
1454 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1455 if (BE (ret < 0, 0))
1456 return REG_ESPACE;
1457 }
1458 else /* dfa->edests[org_node].nelem == 2 */
1459 {
1460 /* In case of the node can epsilon-transit, and it has two
1461 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1462 org_dest = dfa->edests[org_node].elems[0];
1463 re_node_set_empty (dfa->edests + clone_node);
1464 /* Search for a duplicated node which satisfies the constraint. */
1465 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1466 if (clone_dest == -1)
1467 {
1468 /* There are no such a duplicated node, create a new one. */
1469 reg_errcode_t err;
1470 clone_dest = duplicate_node (dfa, org_dest, constraint);
1471 if (BE (clone_dest == -1, 0))
1472 return REG_ESPACE;
1473 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1474 if (BE (ret < 0, 0))
1475 return REG_ESPACE;
1476 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1477 root_node, constraint);
1478 if (BE (err != REG_NOERROR, 0))
1479 return err;
1480 }
1481 else
1482 {
1483 /* There are a duplicated node which satisfy the constraint,
1484 use it to avoid infinite loop. */
1485 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1486 if (BE (ret < 0, 0))
1487 return REG_ESPACE;
1488 }
1489
1490 org_dest = dfa->edests[org_node].elems[1];
1491 clone_dest = duplicate_node (dfa, org_dest, constraint);
1492 if (BE (clone_dest == -1, 0))
1493 return REG_ESPACE;
1494 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1495 if (BE (ret < 0, 0))
1496 return REG_ESPACE;
1497 }
1498 org_node = org_dest;
1499 clone_node = clone_dest;
1500 }
1501 return REG_NOERROR;
1502}
1503
1504/* Search for a node which is duplicated from the node ORG_NODE, and
1505 satisfies the constraint CONSTRAINT. */
1506
1507static int
1508search_duplicated_node (const re_dfa_t *dfa, int org_node,
1509 unsigned int constraint)
1510{
1511 int idx;
1512 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1513 {
1514 if (org_node == dfa->org_indices[idx]
1515 && constraint == dfa->nodes[idx].constraint)
1516 return idx; /* Found. */
1517 }
1518 return -1; /* Not found. */
1519}
1520
1521/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1522 Return the index of the new node, or -1 if insufficient storage is
1523 available. */
1524
1525static int
1526duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1527{
1528 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1529 if (BE (dup_idx != -1, 1))
1530 {
1531 dfa->nodes[dup_idx].constraint = constraint;
1532 if (dfa->nodes[org_idx].type == ANCHOR)
1533 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1534 dfa->nodes[dup_idx].duplicated = 1;
1535
1536 /* Store the index of the original node. */
1537 dfa->org_indices[dup_idx] = org_idx;
1538 }
1539 return dup_idx;
1540}
1541
1542static reg_errcode_t
1543calc_inveclosure (re_dfa_t *dfa)
1544{
1545 int src, idx, ret;
1546 for (idx = 0; idx < dfa->nodes_len; ++idx)
1547 re_node_set_init_empty (dfa->inveclosures + idx);
1548
1549 for (src = 0; src < dfa->nodes_len; ++src)
1550 {
1551 int *elems = dfa->eclosures[src].elems;
1552 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1553 {
1554 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1555 if (BE (ret == -1, 0))
1556 return REG_ESPACE;
1557 }
1558 }
1559
1560 return REG_NOERROR;
1561}
1562
1563/* Calculate "eclosure" for all the node in DFA. */
1564
1565static reg_errcode_t
1566calc_eclosure (re_dfa_t *dfa)
1567{
1568 int node_idx, incomplete;
1569#ifdef DEBUG
1570 assert (dfa->nodes_len > 0);
1571#endif
1572 incomplete = 0;
1573 /* For each nodes, calculate epsilon closure. */
1574 for (node_idx = 0; ; ++node_idx)
1575 {
1576 reg_errcode_t err;
1577 re_node_set eclosure_elem;
1578 if (node_idx == dfa->nodes_len)
1579 {
1580 if (!incomplete)
1581 break;
1582 incomplete = 0;
1583 node_idx = 0;
1584 }
1585
1586#ifdef DEBUG
1587 assert (dfa->eclosures[node_idx].nelem != -1);
1588#endif
1589
1590 /* If we have already calculated, skip it. */
1591 if (dfa->eclosures[node_idx].nelem != 0)
1592 continue;
1593 /* Calculate epsilon closure of `node_idx'. */
1594 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1595 if (BE (err != REG_NOERROR, 0))
1596 return err;
1597
1598 if (dfa->eclosures[node_idx].nelem == 0)
1599 {
1600 incomplete = 1;
1601 re_node_set_free (&eclosure_elem);
1602 }
1603 }
1604 return REG_NOERROR;
1605}
1606
1607/* Calculate epsilon closure of NODE. */
1608
1609static reg_errcode_t
1610calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1611{
1612 reg_errcode_t err;
1613 unsigned int constraint;
1614 int i, incomplete;
1615 re_node_set eclosure;
1616 incomplete = 0;
1617 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1618 if (BE (err != REG_NOERROR, 0))
1619 return err;
1620
1621 /* This indicates that we are calculating this node now.
1622 We reference this value to avoid infinite loop. */
1623 dfa->eclosures[node].nelem = -1;
1624
1625 constraint = ((dfa->nodes[node].type == ANCHOR)
1626 ? dfa->nodes[node].opr.ctx_type : 0);
1627 /* If the current node has constraints, duplicate all nodes.
1628 Since they must inherit the constraints. */
1629 if (constraint
1630 && dfa->edests[node].nelem
1631 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1632 {
1633 err = duplicate_node_closure (dfa, node, node, node, constraint);
1634 if (BE (err != REG_NOERROR, 0))
1635 return err;
1636 }
1637
1638 /* Expand each epsilon destination nodes. */
1639 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1640 for (i = 0; i < dfa->edests[node].nelem; ++i)
1641 {
1642 re_node_set eclosure_elem;
1643 int edest = dfa->edests[node].elems[i];
1644 /* If calculating the epsilon closure of `edest' is in progress,
1645 return intermediate result. */
1646 if (dfa->eclosures[edest].nelem == -1)
1647 {
1648 incomplete = 1;
1649 continue;
1650 }
1651 /* If we haven't calculated the epsilon closure of `edest' yet,
1652 calculate now. Otherwise use calculated epsilon closure. */
1653 if (dfa->eclosures[edest].nelem == 0)
1654 {
1655 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1656 if (BE (err != REG_NOERROR, 0))
1657 return err;
1658 }
1659 else
1660 eclosure_elem = dfa->eclosures[edest];
1661 /* Merge the epsilon closure of `edest'. */
1662 re_node_set_merge (&eclosure, &eclosure_elem);
1663 /* If the epsilon closure of `edest' is incomplete,
1664 the epsilon closure of this node is also incomplete. */
1665 if (dfa->eclosures[edest].nelem == 0)
1666 {
1667 incomplete = 1;
1668 re_node_set_free (&eclosure_elem);
1669 }
1670 }
1671
1672 /* Epsilon closures include itself. */
1673 re_node_set_insert (&eclosure, node);
1674 if (incomplete && !root)
1675 dfa->eclosures[node].nelem = 0;
1676 else
1677 dfa->eclosures[node] = eclosure;
1678 *new_set = eclosure;
1679 return REG_NOERROR;
1680}
1681
1682/* Functions for token which are used in the parser. */
1683
1684/* Fetch a token from INPUT.
1685 We must not use this function inside bracket expressions. */
1686
1687static void
1688internal_function
1689fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1690{
1691 re_string_skip_bytes (input, peek_token (result, input, syntax));
1692}
1693
1694/* Peek a token from INPUT, and return the length of the token.
1695 We must not use this function inside bracket expressions. */
1696
1697static int
1698internal_function
1699peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1700{
1701 unsigned char c;
1702
1703 if (re_string_eoi (input))
1704 {
1705 token->type = END_OF_RE;
1706 return 0;
1707 }
1708
1709 c = re_string_peek_byte (input, 0);
1710 token->opr.c = c;
1711
1712 token->word_char = 0;
1713#ifdef RE_ENABLE_I18N
1714 token->mb_partial = 0;
1715 if (input->mb_cur_max > 1 &&
1716 !re_string_first_byte (input, re_string_cur_idx (input)))
1717 {
1718 token->type = CHARACTER;
1719 token->mb_partial = 1;
1720 return 1;
1721 }
1722#endif
1723 if (c == '\\')
1724 {
1725 unsigned char c2;
1726 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1727 {
1728 token->type = BACK_SLASH;
1729 return 1;
1730 }
1731
1732 c2 = re_string_peek_byte_case (input, 1);
1733 token->opr.c = c2;
1734 token->type = CHARACTER;
1735#ifdef RE_ENABLE_I18N
1736 if (input->mb_cur_max > 1)
1737 {
1738 wint_t wc = re_string_wchar_at (input,
1739 re_string_cur_idx (input) + 1);
1740 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1741 }
1742 else
1743#endif
1744 token->word_char = IS_WORD_CHAR (c2) != 0;
1745
1746 switch (c2)
1747 {
1748 case '|':
1749 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1750 token->type = OP_ALT;
1751 break;
1752 case '1': case '2': case '3': case '4': case '5':
1753 case '6': case '7': case '8': case '9':
1754 if (!(syntax & RE_NO_BK_REFS))
1755 {
1756 token->type = OP_BACK_REF;
1757 token->opr.idx = c2 - '1';
1758 }
1759 break;
1760 case '<':
1761 if (!(syntax & RE_NO_GNU_OPS))
1762 {
1763 token->type = ANCHOR;
1764 token->opr.ctx_type = WORD_FIRST;
1765 }
1766 break;
1767 case '>':
1768 if (!(syntax & RE_NO_GNU_OPS))
1769 {
1770 token->type = ANCHOR;
1771 token->opr.ctx_type = WORD_LAST;
1772 }
1773 break;
1774 case 'b':
1775 if (!(syntax & RE_NO_GNU_OPS))
1776 {
1777 token->type = ANCHOR;
1778 token->opr.ctx_type = WORD_DELIM;
1779 }
1780 break;
1781 case 'B':
1782 if (!(syntax & RE_NO_GNU_OPS))
1783 {
1784 token->type = ANCHOR;
1785 token->opr.ctx_type = NOT_WORD_DELIM;
1786 }
1787 break;
1788 case 'w':
1789 if (!(syntax & RE_NO_GNU_OPS))
1790 token->type = OP_WORD;
1791 break;
1792 case 'W':
1793 if (!(syntax & RE_NO_GNU_OPS))
1794 token->type = OP_NOTWORD;
1795 break;
1796 case 's':
1797 if (!(syntax & RE_NO_GNU_OPS))
1798 token->type = OP_SPACE;
1799 break;
1800 case 'S':
1801 if (!(syntax & RE_NO_GNU_OPS))
1802 token->type = OP_NOTSPACE;
1803 break;
1804 case '`':
1805 if (!(syntax & RE_NO_GNU_OPS))
1806 {
1807 token->type = ANCHOR;
1808 token->opr.ctx_type = BUF_FIRST;
1809 }
1810 break;
1811 case '\'':
1812 if (!(syntax & RE_NO_GNU_OPS))
1813 {
1814 token->type = ANCHOR;
1815 token->opr.ctx_type = BUF_LAST;
1816 }
1817 break;
1818 case '(':
1819 if (!(syntax & RE_NO_BK_PARENS))
1820 token->type = OP_OPEN_SUBEXP;
1821 break;
1822 case ')':
1823 if (!(syntax & RE_NO_BK_PARENS))
1824 token->type = OP_CLOSE_SUBEXP;
1825 break;
1826 case '+':
1827 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1828 token->type = OP_DUP_PLUS;
1829 break;
1830 case '?':
1831 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1832 token->type = OP_DUP_QUESTION;
1833 break;
1834 case '{':
1835 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1836 token->type = OP_OPEN_DUP_NUM;
1837 break;
1838 case '}':
1839 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1840 token->type = OP_CLOSE_DUP_NUM;
1841 break;
1842 default:
1843 break;
1844 }
1845 return 2;
1846 }
1847
1848 token->type = CHARACTER;
1849#ifdef RE_ENABLE_I18N
1850 if (input->mb_cur_max > 1)
1851 {
1852 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1853 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1854 }
1855 else
1856#endif
1857 token->word_char = IS_WORD_CHAR (token->opr.c);
1858
1859 switch (c)
1860 {
1861 case '\n':
1862 if (syntax & RE_NEWLINE_ALT)
1863 token->type = OP_ALT;
1864 break;
1865 case '|':
1866 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1867 token->type = OP_ALT;
1868 break;
1869 case '*':
1870 token->type = OP_DUP_ASTERISK;
1871 break;
1872 case '+':
1873 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1874 token->type = OP_DUP_PLUS;
1875 break;
1876 case '?':
1877 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1878 token->type = OP_DUP_QUESTION;
1879 break;
1880 case '{':
1881 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1882 token->type = OP_OPEN_DUP_NUM;
1883 break;
1884 case '}':
1885 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1886 token->type = OP_CLOSE_DUP_NUM;
1887 break;
1888 case '(':
1889 if (syntax & RE_NO_BK_PARENS)
1890 token->type = OP_OPEN_SUBEXP;
1891 break;
1892 case ')':
1893 if (syntax & RE_NO_BK_PARENS)
1894 token->type = OP_CLOSE_SUBEXP;
1895 break;
1896 case '[':
1897 token->type = OP_OPEN_BRACKET;
1898 break;
1899 case '.':
1900 token->type = OP_PERIOD;
1901 break;
1902 case '^':
1903 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1904 re_string_cur_idx (input) != 0)
1905 {
1906 char prev = re_string_peek_byte (input, -1);
1907 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1908 break;
1909 }
1910 token->type = ANCHOR;
1911 token->opr.ctx_type = LINE_FIRST;
1912 break;
1913 case '$':
1914 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1915 re_string_cur_idx (input) + 1 != re_string_length (input))
1916 {
1917 re_token_t next;
1918 re_string_skip_bytes (input, 1);
1919 peek_token (&next, input, syntax);
1920 re_string_skip_bytes (input, -1);
1921 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1922 break;
1923 }
1924 token->type = ANCHOR;
1925 token->opr.ctx_type = LINE_LAST;
1926 break;
1927 default:
1928 break;
1929 }
1930 return 1;
1931}
1932
1933/* Peek a token from INPUT, and return the length of the token.
1934 We must not use this function out of bracket expressions. */
1935
1936static int
1937internal_function
1938peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1939{
1940 unsigned char c;
1941 if (re_string_eoi (input))
1942 {
1943 token->type = END_OF_RE;
1944 return 0;
1945 }
1946 c = re_string_peek_byte (input, 0);
1947 token->opr.c = c;
1948
1949#ifdef RE_ENABLE_I18N
1950 if (input->mb_cur_max > 1 &&
1951 !re_string_first_byte (input, re_string_cur_idx (input)))
1952 {
1953 token->type = CHARACTER;
1954 return 1;
1955 }
1956#endif
1957
1958 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1959 && re_string_cur_idx (input) + 1 < re_string_length (input))
1960 {
1961 /* In this case, '\' escape a character. */
1962 unsigned char c2;
1963 re_string_skip_bytes (input, 1);
1964 c2 = re_string_peek_byte (input, 0);
1965 token->opr.c = c2;
1966 token->type = CHARACTER;
1967 return 1;
1968 }
1969 if (c == '[') /* '[' is a special char in a bracket exps. */
1970 {
1971 unsigned char c2;
1972 int token_len;
1973 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1974 c2 = re_string_peek_byte (input, 1);
1975 else
1976 c2 = 0;
1977 token->opr.c = c2;
1978 token_len = 2;
1979 switch (c2)
1980 {
1981 case '.':
1982 token->type = OP_OPEN_COLL_ELEM;
1983 break;
1984 case '=':
1985 token->type = OP_OPEN_EQUIV_CLASS;
1986 break;
1987 case ':':
1988 if (syntax & RE_CHAR_CLASSES)
1989 {
1990 token->type = OP_OPEN_CHAR_CLASS;
1991 break;
1992 }
1993 /* else fall through. */
1994 default:
1995 token->type = CHARACTER;
1996 token->opr.c = c;
1997 token_len = 1;
1998 break;
1999 }
2000 return token_len;
2001 }
2002 switch (c)
2003 {
2004 case '-':
2005 token->type = OP_CHARSET_RANGE;
2006 break;
2007 case ']':
2008 token->type = OP_CLOSE_BRACKET;
2009 break;
2010 case '^':
2011 token->type = OP_NON_MATCH_LIST;
2012 break;
2013 default:
2014 token->type = CHARACTER;
2015 }
2016 return 1;
2017}
2018
2019/* Functions for parser. */
2020
2021/* Entry point of the parser.
2022 Parse the regular expression REGEXP and return the structure tree.
2023 If an error is occured, ERR is set by error code, and return NULL.
2024 This function build the following tree, from regular expression <reg_exp>:
2025 CAT
2026 / \
2027 / \
2028 <reg_exp> EOR
2029
2030 CAT means concatenation.
2031 EOR means end of regular expression. */
2032
2033static bin_tree_t *
2034parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2035 reg_errcode_t *err)
2036{
2037 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2038 bin_tree_t *tree, *eor, *root;
2039 re_token_t current_token;
2040 dfa->syntax = syntax;
2041 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2042 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2043 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2044 return NULL;
2045 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2046 if (tree != NULL)
2047 root = create_tree (dfa, tree, eor, CONCAT);
2048 else
2049 root = eor;
2050 if (BE (eor == NULL || root == NULL, 0))
2051 {
2052 *err = REG_ESPACE;
2053 return NULL;
2054 }
2055 return root;
2056}
2057
2058/* This function build the following tree, from regular expression
2059 <branch1>|<branch2>:
2060 ALT
2061 / \
2062 / \
2063 <branch1> <branch2>
2064
2065 ALT means alternative, which represents the operator `|'. */
2066
2067static bin_tree_t *
2068parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2069 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2070{
2071 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2072 bin_tree_t *tree, *branch = NULL;
2073 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2074 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2075 return NULL;
2076
2077 while (token->type == OP_ALT)
2078 {
2079 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2080 if (token->type != OP_ALT && token->type != END_OF_RE
2081 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2082 {
2083 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2084 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2085 return NULL;
2086 }
2087 else
2088 branch = NULL;
2089 tree = create_tree (dfa, tree, branch, OP_ALT);
2090 if (BE (tree == NULL, 0))
2091 {
2092 *err = REG_ESPACE;
2093 return NULL;
2094 }
2095 }
2096 return tree;
2097}
2098
2099/* This function build the following tree, from regular expression
2100 <exp1><exp2>:
2101 CAT
2102 / \
2103 / \
2104 <exp1> <exp2>
2105
2106 CAT means concatenation. */
2107
2108static bin_tree_t *
2109parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2110 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2111{
2112 bin_tree_t *tree, *exp;
2113 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2114 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2115 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2116 return NULL;
2117
2118 while (token->type != OP_ALT && token->type != END_OF_RE
2119 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2120 {
2121 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2122 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2123 {
2124 return NULL;
2125 }
2126 if (tree != NULL && exp != NULL)
2127 {
2128 tree = create_tree (dfa, tree, exp, CONCAT);
2129 if (tree == NULL)
2130 {
2131 *err = REG_ESPACE;
2132 return NULL;
2133 }
2134 }
2135 else if (tree == NULL)
2136 tree = exp;
2137 /* Otherwise exp == NULL, we don't need to create new tree. */
2138 }
2139 return tree;
2140}
2141
2142/* This function build the following tree, from regular expression a*:
2143 *
2144 |
2145 a
2146*/
2147
2148static bin_tree_t *
2149parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2150 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2151{
2152 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2153 bin_tree_t *tree;
2154 switch (token->type)
2155 {
2156 case CHARACTER:
2157 tree = create_token_tree (dfa, NULL, NULL, token);
2158 if (BE (tree == NULL, 0))
2159 {
2160 *err = REG_ESPACE;
2161 return NULL;
2162 }
2163#ifdef RE_ENABLE_I18N
2164 if (dfa->mb_cur_max > 1)
2165 {
2166 while (!re_string_eoi (regexp)
2167 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2168 {
2169 bin_tree_t *mbc_remain;
2170 fetch_token (token, regexp, syntax);
2171 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2172 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2173 if (BE (mbc_remain == NULL || tree == NULL, 0))
2174 {
2175 *err = REG_ESPACE;
2176 return NULL;
2177 }
2178 }
2179 }
2180#endif
2181 break;
2182 case OP_OPEN_SUBEXP:
2183 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2184 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2185 return NULL;
2186 break;
2187 case OP_OPEN_BRACKET:
2188 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2189 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2190 return NULL;
2191 break;
2192 case OP_BACK_REF:
2193 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2194 {
2195 *err = REG_ESUBREG;
2196 return NULL;
2197 }
2198 dfa->used_bkref_map |= 1 << token->opr.idx;
2199 tree = create_token_tree (dfa, NULL, NULL, token);
2200 if (BE (tree == NULL, 0))
2201 {
2202 *err = REG_ESPACE;
2203 return NULL;
2204 }
2205 ++dfa->nbackref;
2206 dfa->has_mb_node = 1;
2207 break;
2208 case OP_OPEN_DUP_NUM:
2209 if (syntax & RE_CONTEXT_INVALID_DUP)
2210 {
2211 *err = REG_BADRPT;
2212 return NULL;
2213 }
2214 /* FALLTHROUGH */
2215 case OP_DUP_ASTERISK:
2216 case OP_DUP_PLUS:
2217 case OP_DUP_QUESTION:
2218 if (syntax & RE_CONTEXT_INVALID_OPS)
2219 {
2220 *err = REG_BADRPT;
2221 return NULL;
2222 }
2223 else if (syntax & RE_CONTEXT_INDEP_OPS)
2224 {
2225 fetch_token (token, regexp, syntax);
2226 return parse_expression (regexp, preg, token, syntax, nest, err);
2227 }
2228 /* else fall through */
2229 case OP_CLOSE_SUBEXP:
2230 if ((token->type == OP_CLOSE_SUBEXP) &&
2231 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2232 {
2233 *err = REG_ERPAREN;
2234 return NULL;
2235 }
2236 /* else fall through */
2237 case OP_CLOSE_DUP_NUM:
2238 /* We treat it as a normal character. */
2239
2240 /* Then we can these characters as normal characters. */
2241 token->type = CHARACTER;
2242 /* mb_partial and word_char bits should be initialized already
2243 by peek_token. */
2244 tree = create_token_tree (dfa, NULL, NULL, token);
2245 if (BE (tree == NULL, 0))
2246 {
2247 *err = REG_ESPACE;
2248 return NULL;
2249 }
2250 break;
2251 case ANCHOR:
2252 if ((token->opr.ctx_type
2253 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2254 && dfa->word_ops_used == 0)
2255 init_word_char (dfa);
2256 if (token->opr.ctx_type == WORD_DELIM
2257 || token->opr.ctx_type == NOT_WORD_DELIM)
2258 {
2259 bin_tree_t *tree_first, *tree_last;
2260 if (token->opr.ctx_type == WORD_DELIM)
2261 {
2262 token->opr.ctx_type = WORD_FIRST;
2263 tree_first = create_token_tree (dfa, NULL, NULL, token);
2264 token->opr.ctx_type = WORD_LAST;
2265 }
2266 else
2267 {
2268 token->opr.ctx_type = INSIDE_WORD;
2269 tree_first = create_token_tree (dfa, NULL, NULL, token);
2270 token->opr.ctx_type = INSIDE_NOTWORD;
2271 }
2272 tree_last = create_token_tree (dfa, NULL, NULL, token);
2273 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2274 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2275 {
2276 *err = REG_ESPACE;
2277 return NULL;
2278 }
2279 }
2280 else
2281 {
2282 tree = create_token_tree (dfa, NULL, NULL, token);
2283 if (BE (tree == NULL, 0))
2284 {
2285 *err = REG_ESPACE;
2286 return NULL;
2287 }
2288 }
2289 /* We must return here, since ANCHORs can't be followed
2290 by repetition operators.
2291 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2292 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2293 fetch_token (token, regexp, syntax);
2294 return tree;
2295 case OP_PERIOD:
2296 tree = create_token_tree (dfa, NULL, NULL, token);
2297 if (BE (tree == NULL, 0))
2298 {
2299 *err = REG_ESPACE;
2300 return NULL;
2301 }
2302 if (dfa->mb_cur_max > 1)
2303 dfa->has_mb_node = 1;
2304 break;
2305 case OP_WORD:
2306 case OP_NOTWORD:
2307 tree = build_charclass_op (dfa, regexp->trans,
2308 (const unsigned char *) "alnum",
2309 (const unsigned char *) "_",
2310 token->type == OP_NOTWORD, err);
2311 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2312 return NULL;
2313 break;
2314 case OP_SPACE:
2315 case OP_NOTSPACE:
2316 tree = build_charclass_op (dfa, regexp->trans,
2317 (const unsigned char *) "space",
2318 (const unsigned char *) "",
2319 token->type == OP_NOTSPACE, err);
2320 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2321 return NULL;
2322 break;
2323 case OP_ALT:
2324 case END_OF_RE:
2325 return NULL;
2326 case BACK_SLASH:
2327 *err = REG_EESCAPE;
2328 return NULL;
2329 default:
2330 /* Must not happen? */
2331#ifdef DEBUG
2332 assert (0);
2333#endif
2334 return NULL;
2335 }
2336 fetch_token (token, regexp, syntax);
2337
2338 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2339 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2340 {
2341 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2342 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2343 return NULL;
2344 /* In BRE consecutive duplications are not allowed. */
2345 if ((syntax & RE_CONTEXT_INVALID_DUP)
2346 && (token->type == OP_DUP_ASTERISK
2347 || token->type == OP_OPEN_DUP_NUM))
2348 {
2349 *err = REG_BADRPT;
2350 return NULL;
2351 }
2352 }
2353
2354 return tree;
2355}
2356
2357/* This function build the following tree, from regular expression
2358 (<reg_exp>):
2359 SUBEXP
2360 |
2361 <reg_exp>
2362*/
2363
2364static bin_tree_t *
2365parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2366 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2367{
2368 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2369 bin_tree_t *tree;
2370 size_t cur_nsub;
2371 cur_nsub = preg->re_nsub++;
2372
2373 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2374
2375 /* The subexpression may be a null string. */
2376 if (token->type == OP_CLOSE_SUBEXP)
2377 tree = NULL;
2378 else
2379 {
2380 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2381 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2382 *err = REG_EPAREN;
2383 if (BE (*err != REG_NOERROR, 0))
2384 return NULL;
2385 }
2386
2387 if (cur_nsub <= '9' - '1')
2388 dfa->completed_bkref_map |= 1 << cur_nsub;
2389
2390 tree = create_tree (dfa, tree, NULL, SUBEXP);
2391 if (BE (tree == NULL, 0))
2392 {
2393 *err = REG_ESPACE;
2394 return NULL;
2395 }
2396 tree->token.opr.idx = cur_nsub;
2397 return tree;
2398}
2399
2400/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2401
2402static bin_tree_t *
2403parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2404 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2405{
2406 bin_tree_t *tree = NULL, *old_tree = NULL;
2407 int i, start, end, start_idx = re_string_cur_idx (regexp);
2408 re_token_t start_token = *token;
2409
2410 if (token->type == OP_OPEN_DUP_NUM)
2411 {
2412 end = 0;
2413 start = fetch_number (regexp, token, syntax);
2414 if (start == -1)
2415 {
2416 if (token->type == CHARACTER && token->opr.c == ',')
2417 start = 0; /* We treat "{,m}" as "{0,m}". */
2418 else
2419 {
2420 *err = REG_BADBR; /* <re>{} is invalid. */
2421 return NULL;
2422 }
2423 }
2424 if (BE (start != -2, 1))
2425 {
2426 /* We treat "{n}" as "{n,n}". */
2427 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2428 : ((token->type == CHARACTER && token->opr.c == ',')
2429 ? fetch_number (regexp, token, syntax) : -2));
2430 }
2431 if (BE (start == -2 || end == -2, 0))
2432 {
2433 /* Invalid sequence. */
2434 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2435 {
2436 if (token->type == END_OF_RE)
2437 *err = REG_EBRACE;
2438 else
2439 *err = REG_BADBR;
2440
2441 return NULL;
2442 }
2443
2444 /* If the syntax bit is set, rollback. */
2445 re_string_set_index (regexp, start_idx);
2446 *token = start_token;
2447 token->type = CHARACTER;
2448 /* mb_partial and word_char bits should be already initialized by
2449 peek_token. */
2450 return elem;
2451 }
2452
2453 if (BE (end != -1 && start > end, 0))
2454 {
2455 /* First number greater than second. */
2456 *err = REG_BADBR;
2457 return NULL;
2458 }
2459 }
2460 else
2461 {
2462 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2463 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2464 }
2465
2466 fetch_token (token, regexp, syntax);
2467
2468 if (BE (elem == NULL, 0))
2469 return NULL;
2470 if (BE (start == 0 && end == 0, 0))
2471 {
2472 postorder (elem, free_tree, NULL);
2473 return NULL;
2474 }
2475
2476 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2477 if (BE (start > 0, 0))
2478 {
2479 tree = elem;
2480 for (i = 2; i <= start; ++i)
2481 {
2482 elem = duplicate_tree (elem, dfa);
2483 tree = create_tree (dfa, tree, elem, CONCAT);
2484 if (BE (elem == NULL || tree == NULL, 0))
2485 goto parse_dup_op_espace;
2486 }
2487
2488 if (start == end)
2489 return tree;
2490
2491 /* Duplicate ELEM before it is marked optional. */
2492 elem = duplicate_tree (elem, dfa);
2493 old_tree = tree;
2494 }
2495 else
2496 old_tree = NULL;
2497
2498 if (elem->token.type == SUBEXP)
2499 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2500
2501 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2502 if (BE (tree == NULL, 0))
2503 goto parse_dup_op_espace;
2504
2505 /* This loop is actually executed only when end != -1,
2506 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2507 already created the start+1-th copy. */
2508 for (i = start + 2; i <= end; ++i)
2509 {
2510 elem = duplicate_tree (elem, dfa);
2511 tree = create_tree (dfa, tree, elem, CONCAT);
2512 if (BE (elem == NULL || tree == NULL, 0))
2513 goto parse_dup_op_espace;
2514
2515 tree = create_tree (dfa, tree, NULL, OP_ALT);
2516 if (BE (tree == NULL, 0))
2517 goto parse_dup_op_espace;
2518 }
2519
2520 if (old_tree)
2521 tree = create_tree (dfa, old_tree, tree, CONCAT);
2522
2523 return tree;
2524
2525 parse_dup_op_espace:
2526 *err = REG_ESPACE;
2527 return NULL;
2528}
2529
2530/* Size of the names for collating symbol/equivalence_class/character_class.
2531 I'm not sure, but maybe enough. */
2532#define BRACKET_NAME_BUF_SIZE 32
2533
2534#if 1
2535 /* Local function for parse_bracket_exp only used in case of NOT glibc.
2536 Build the range expression which starts from START_ELEM, and ends
2537 at END_ELEM. The result are written to MBCSET and SBCSET.
2538 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2539 mbcset->range_ends, is a pointer argument sinse we may
2540 update it. */
2541
2542static reg_errcode_t
2543internal_function
2544# ifdef RE_ENABLE_I18N
2545build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2546 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2547# else
2548build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2549 bracket_elem_t *end_elem)
2550# endif
2551{
2552 unsigned int start_ch, end_ch;
2553 /* Equivalence Classes and Character Classes can't be a range start/end. */
2554 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2555 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2556 0))
2557 return REG_ERANGE;
2558
2559 /* We can handle no multi character collating elements without libc
2560 support. */
2561 if (BE ((start_elem->type == COLL_SYM
2562 && strlen ((char *) start_elem->opr.name) > 1)
2563 || (end_elem->type == COLL_SYM
2564 && strlen ((char *) end_elem->opr.name) > 1), 0))
2565 return REG_ECOLLATE;
2566
2567# ifdef RE_ENABLE_I18N
2568 {
2569 wchar_t wc;
2570 wint_t start_wc;
2571 wint_t end_wc;
2572 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2573
2574 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2575 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2576 : 0));
2577 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2578 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2579 : 0));
2580 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2581 ? __btowc (start_ch) : start_elem->opr.wch);
2582 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2583 ? __btowc (end_ch) : end_elem->opr.wch);
2584 if (start_wc == WEOF || end_wc == WEOF)
2585 return REG_ECOLLATE;
2586 cmp_buf[0] = start_wc;
2587 cmp_buf[4] = end_wc;
2588 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2589 return REG_ERANGE;
2590
2591 /* Got valid collation sequence values, add them as a new entry.
2592 However, for !glibc we have no collation elements: if the
2593 character set is single byte, the single byte character set
2594 that we build below suffices. parse_bracket_exp passes
2595 no MBCSET if dfa->mb_cur_max == 1. */
2596 if (mbcset)
2597 {
2598 /* Check the space of the arrays. */
2599 if (BE (*range_alloc == mbcset->nranges, 0))
2600 {
2601 /* There is not enough space, need realloc. */
2602 wchar_t *new_array_start, *new_array_end;
2603 int new_nranges;
2604
2605 /* +1 in case of mbcset->nranges is 0. */
2606 new_nranges = 2 * mbcset->nranges + 1;
2607 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2608 are NULL if *range_alloc == 0. */
2609 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2610 new_nranges);
2611 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2612 new_nranges);
2613
2614 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2615 return REG_ESPACE;
2616
2617 mbcset->range_starts = new_array_start;
2618 mbcset->range_ends = new_array_end;
2619 *range_alloc = new_nranges;
2620 }
2621
2622 mbcset->range_starts[mbcset->nranges] = start_wc;
2623 mbcset->range_ends[mbcset->nranges++] = end_wc;
2624 }
2625
2626 /* Build the table for single byte characters. */
2627 for (wc = 0; wc < SBC_MAX; ++wc)
2628 {
2629 cmp_buf[2] = wc;
2630 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2631 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2632 bitset_set (sbcset, wc);
2633 }
2634 }
2635# else /* not RE_ENABLE_I18N */
2636 {
2637 unsigned int ch;
2638 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2639 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2640 : 0));
2641 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2642 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2643 : 0));
2644 if (start_ch > end_ch)
2645 return REG_ERANGE;
2646 /* Build the table for single byte characters. */
2647 for (ch = 0; ch < SBC_MAX; ++ch)
2648 if (start_ch <= ch && ch <= end_ch)
2649 bitset_set (sbcset, ch);
2650 }
2651# endif /* not RE_ENABLE_I18N */
2652 return REG_NOERROR;
2653}
2654#endif
2655
2656#if 1
2657/* Helper function for parse_bracket_exp only used in case of NOT glibc.
2658 Build the collating element which is represented by NAME.
2659 The result are written to MBCSET and SBCSET.
2660 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2661 pointer argument since we may update it. */
2662
2663static reg_errcode_t
2664internal_function
2665# ifdef RE_ENABLE_I18N
2666build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2667 int *coll_sym_alloc, const unsigned char *name)
2668# else
2669build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2670# endif
2671{
2672 size_t name_len = strlen ((const char *) name);
2673 if (BE (name_len != 1, 0))
2674 return REG_ECOLLATE;
2675 bitset_set (sbcset, name[0]);
2676 return REG_NOERROR;
2677}
2678#endif
2679
2680/* This function parse bracket expression like "[abc]", "[a-c]",
2681 "[[.a-a.]]" etc. */
2682
2683static bin_tree_t *
2684parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2685 reg_syntax_t syntax, reg_errcode_t *err)
2686{
2687#if 0
2688 const unsigned char *collseqmb;
2689 const char *collseqwc;
2690 uint32_t nrules;
2691 int32_t table_size;
2692 const int32_t *symb_table;
2693 const unsigned char *extra;
2694
2695 /* Local function for parse_bracket_exp used in glibc.
2696 Seek the collating symbol entry correspondings to NAME.
2697 Return the index of the symbol in the SYMB_TABLE. */
2698
2699 auto __inline__ int32_t
2700 __attribute ((always_inline))
2701 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2702 {
2703 int32_t hash = elem_hash ((const char *) name, name_len);
2704 int32_t elem = hash % table_size;
2705 if (symb_table[2 * elem] != 0)
2706 {
2707 int32_t second = hash % (table_size - 2) + 1;
2708
2709 do
2710 {
2711 /* First compare the hashing value. */
2712 if (symb_table[2 * elem] == hash
2713 /* Compare the length of the name. */
2714 && name_len == extra[symb_table[2 * elem + 1]]
2715 /* Compare the name. */
2716 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2717 name_len) == 0)
2718 {
2719 /* Yep, this is the entry. */
2720 break;
2721 }
2722
2723 /* Next entry. */
2724 elem += second;
2725 }
2726 while (symb_table[2 * elem] != 0);
2727 }
2728 return elem;
2729 }
2730
2731 /* Local function for parse_bracket_exp used in glibc.
2732 Look up the collation sequence value of BR_ELEM.
2733 Return the value if succeeded, UINT_MAX otherwise. */
2734
2735 auto __inline__ unsigned int
2736 __attribute ((always_inline))
2737 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2738 {
2739 if (br_elem->type == SB_CHAR)
2740 {
2741 /*
2742 if (MB_CUR_MAX == 1)
2743 */
2744 if (nrules == 0)
2745 return collseqmb[br_elem->opr.ch];
2746 else
2747 {
2748 wint_t wc = __btowc (br_elem->opr.ch);
2749 return __collseq_table_lookup (collseqwc, wc);
2750 }
2751 }
2752 else if (br_elem->type == MB_CHAR)
2753 {
2754 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2755 }
2756 else if (br_elem->type == COLL_SYM)
2757 {
2758 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2759 if (nrules != 0)
2760 {
2761 int32_t elem, idx;
2762 elem = seek_collating_symbol_entry (br_elem->opr.name,
2763 sym_name_len);
2764 if (symb_table[2 * elem] != 0)
2765 {
2766 /* We found the entry. */
2767 idx = symb_table[2 * elem + 1];
2768 /* Skip the name of collating element name. */
2769 idx += 1 + extra[idx];
2770 /* Skip the byte sequence of the collating element. */
2771 idx += 1 + extra[idx];
2772 /* Adjust for the alignment. */
2773 idx = (idx + 3) & ~3;
2774 /* Skip the multibyte collation sequence value. */
2775 idx += sizeof (unsigned int);
2776 /* Skip the wide char sequence of the collating element. */
2777 idx += sizeof (unsigned int) *
2778 (1 + *(unsigned int *) (extra + idx));
2779 /* Return the collation sequence value. */
2780 return *(unsigned int *) (extra + idx);
2781 }
2782 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2783 {
2784 /* No valid character. Match it as a single byte
2785 character. */
2786 return collseqmb[br_elem->opr.name[0]];
2787 }
2788 }
2789 else if (sym_name_len == 1)
2790 return collseqmb[br_elem->opr.name[0]];
2791 }
2792 return UINT_MAX;
2793 }
2794
2795 /* Local function for parse_bracket_exp used in glibc.
2796 Build the range expression which starts from START_ELEM, and ends
2797 at END_ELEM. The result are written to MBCSET and SBCSET.
2798 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2799 mbcset->range_ends, is a pointer argument sinse we may
2800 update it. */
2801
2802 auto __inline__ reg_errcode_t
2803 __attribute ((always_inline))
2804 build_range_exp (re_charset_t *mbcset,
2805 int *range_alloc,
2806 bitset_t sbcset,
2807 bracket_elem_t *start_elem,
2808 bracket_elem_t *end_elem)
2809 {
2810 unsigned int ch;
2811 uint32_t start_collseq;
2812 uint32_t end_collseq;
2813
2814 /* Equivalence Classes and Character Classes can't be a range
2815 start/end. */
2816 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2817 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2818 0))
2819 return REG_ERANGE;
2820
2821 start_collseq = lookup_collation_sequence_value (start_elem);
2822 end_collseq = lookup_collation_sequence_value (end_elem);
2823 /* Check start/end collation sequence values. */
2824 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2825 return REG_ECOLLATE;
2826 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2827 return REG_ERANGE;
2828
2829 /* Got valid collation sequence values, add them as a new entry.
2830 However, if we have no collation elements, and the character set
2831 is single byte, the single byte character set that we
2832 build below suffices. */
2833 if (nrules > 0 || dfa->mb_cur_max > 1)
2834 {
2835 /* Check the space of the arrays. */
2836 if (BE (*range_alloc == mbcset->nranges, 0))
2837 {
2838 /* There is not enough space, need realloc. */
2839 uint32_t *new_array_start;
2840 uint32_t *new_array_end;
2841 int new_nranges;
2842
2843 /* +1 in case of mbcset->nranges is 0. */
2844 new_nranges = 2 * mbcset->nranges + 1;
2845 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2846 new_nranges);
2847 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2848 new_nranges);
2849
2850 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2851 return REG_ESPACE;
2852
2853 mbcset->range_starts = new_array_start;
2854 mbcset->range_ends = new_array_end;
2855 *range_alloc = new_nranges;
2856 }
2857
2858 mbcset->range_starts[mbcset->nranges] = start_collseq;
2859 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2860 }
2861
2862 /* Build the table for single byte characters. */
2863 for (ch = 0; ch < SBC_MAX; ch++)
2864 {
2865 uint32_t ch_collseq;
2866 /*
2867 if (MB_CUR_MAX == 1)
2868 */
2869 if (nrules == 0)
2870 ch_collseq = collseqmb[ch];
2871 else
2872 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2873 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2874 bitset_set (sbcset, ch);
2875 }
2876 return REG_NOERROR;
2877 }
2878
2879 /* Local function for parse_bracket_exp used in glibc.
2880 Build the collating element which is represented by NAME.
2881 The result are written to MBCSET and SBCSET.
2882 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2883 pointer argument sinse we may update it. */
2884
2885 auto __inline__ reg_errcode_t
2886 __attribute ((always_inline))
2887 build_collating_symbol (re_charset_t *mbcset,
2888 int *coll_sym_alloc,
2889 bitset_t sbcset,
2890 const unsigned char *name)
2891 {
2892 int32_t elem, idx;
2893 size_t name_len = strlen ((const char *) name);
2894 if (nrules != 0)
2895 {
2896 elem = seek_collating_symbol_entry (name, name_len);
2897 if (symb_table[2 * elem] != 0)
2898 {
2899 /* We found the entry. */
2900 idx = symb_table[2 * elem + 1];
2901 /* Skip the name of collating element name. */
2902 idx += 1 + extra[idx];
2903 }
2904 else if (symb_table[2 * elem] == 0 && name_len == 1)
2905 {
2906 /* No valid character, treat it as a normal
2907 character. */
2908 bitset_set (sbcset, name[0]);
2909 return REG_NOERROR;
2910 }
2911 else
2912 return REG_ECOLLATE;
2913
2914 /* Got valid collation sequence, add it as a new entry. */
2915 /* Check the space of the arrays. */
2916 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2917 {
2918 /* Not enough, realloc it. */
2919 /* +1 in case of mbcset->ncoll_syms is 0. */
2920 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2921 /* Use realloc since mbcset->coll_syms is NULL
2922 if *alloc == 0. */
2923 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2924 new_coll_sym_alloc);
2925 if (BE (new_coll_syms == NULL, 0))
2926 return REG_ESPACE;
2927 mbcset->coll_syms = new_coll_syms;
2928 *coll_sym_alloc = new_coll_sym_alloc;
2929 }
2930 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2931 return REG_NOERROR;
2932 }
2933 else
2934 {
2935 if (BE (name_len != 1, 0))
2936 return REG_ECOLLATE;
2937 else
2938 {
2939 bitset_set (sbcset, name[0]);
2940 return REG_NOERROR;
2941 }
2942 }
2943 }
2944#endif
2945
2946 re_token_t br_token;
2947 re_bitset_ptr_t sbcset;
2948#ifdef RE_ENABLE_I18N
2949 re_charset_t *mbcset;
2950 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2951 int equiv_class_alloc = 0, char_class_alloc = 0;
2952#endif
2953 int non_match = 0;
2954 bin_tree_t *work_tree;
2955 int token_len;
2956 int first_round = 1;
2957#if 0
2958 collseqmb = (const unsigned char *)
2959 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2960 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2961 if (nrules)
2962 {
2963 /*
2964 if (MB_CUR_MAX > 1)
2965 */
2966 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2967 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2968 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2969 _NL_COLLATE_SYMB_TABLEMB);
2970 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2971 _NL_COLLATE_SYMB_EXTRAMB);
2972 }
2973#endif
2974 sbcset = calloc (sizeof (bitset_t), 1);
2975#ifdef RE_ENABLE_I18N
2976 mbcset = calloc (sizeof (re_charset_t), 1);
2977#endif
2978#ifdef RE_ENABLE_I18N
2979 if (BE (sbcset == NULL || mbcset == NULL, 0))
2980#else
2981 if (BE (sbcset == NULL, 0))
2982#endif
2983 {
2984 *err = REG_ESPACE;
2985 return NULL;
2986 }
2987
2988 token_len = peek_token_bracket (token, regexp, syntax);
2989 if (BE (token->type == END_OF_RE, 0))
2990 {
2991 *err = REG_BADPAT;
2992 goto parse_bracket_exp_free_return;
2993 }
2994 if (token->type == OP_NON_MATCH_LIST)
2995 {
2996#ifdef RE_ENABLE_I18N
2997 mbcset->non_match = 1;
2998#endif
2999 non_match = 1;
3000 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3001 bitset_set (sbcset, '\0');
3002 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3003 token_len = peek_token_bracket (token, regexp, syntax);
3004 if (BE (token->type == END_OF_RE, 0))
3005 {
3006 *err = REG_BADPAT;
3007 goto parse_bracket_exp_free_return;
3008 }
3009 }
3010
3011 /* We treat the first ']' as a normal character. */
3012 if (token->type == OP_CLOSE_BRACKET)
3013 token->type = CHARACTER;
3014
3015 while (1)
3016 {
3017 bracket_elem_t start_elem, end_elem;
3018 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3019 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3020 reg_errcode_t ret;
3021 int token_len2 = 0, is_range_exp = 0;
3022 re_token_t token2;
3023
3024 start_elem.opr.name = start_name_buf;
3025 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3026 syntax, first_round);
3027 if (BE (ret != REG_NOERROR, 0))
3028 {
3029 *err = ret;
3030 goto parse_bracket_exp_free_return;
3031 }
3032 first_round = 0;
3033
3034 /* Get information about the next token. We need it in any case. */
3035 token_len = peek_token_bracket (token, regexp, syntax);
3036
3037 /* Do not check for ranges if we know they are not allowed. */
3038 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3039 {
3040 if (BE (token->type == END_OF_RE, 0))
3041 {
3042 *err = REG_EBRACK;
3043 goto parse_bracket_exp_free_return;
3044 }
3045 if (token->type == OP_CHARSET_RANGE)
3046 {
3047 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3048 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3049 if (BE (token2.type == END_OF_RE, 0))
3050 {
3051 *err = REG_EBRACK;
3052 goto parse_bracket_exp_free_return;
3053 }
3054 if (token2.type == OP_CLOSE_BRACKET)
3055 {
3056 /* We treat the last '-' as a normal character. */
3057 re_string_skip_bytes (regexp, -token_len);
3058 token->type = CHARACTER;
3059 }
3060 else
3061 is_range_exp = 1;
3062 }
3063 }
3064
3065 if (is_range_exp == 1)
3066 {
3067 end_elem.opr.name = end_name_buf;
3068 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3069 dfa, syntax, 1);
3070 if (BE (ret != REG_NOERROR, 0))
3071 {
3072 *err = ret;
3073 goto parse_bracket_exp_free_return;
3074 }
3075
3076 token_len = peek_token_bracket (token, regexp, syntax);
3077
3078#if 0
3079 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3080 &start_elem, &end_elem);
3081#else
3082# ifdef RE_ENABLE_I18N
3083 *err = build_range_exp (sbcset,
3084 dfa->mb_cur_max > 1 ? mbcset : NULL,
3085 &range_alloc, &start_elem, &end_elem);
3086# else
3087 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3088# endif
3089#endif
3090 if (BE (*err != REG_NOERROR, 0))
3091 goto parse_bracket_exp_free_return;
3092 }
3093 else
3094 {
3095 switch (start_elem.type)
3096 {
3097 case SB_CHAR:
3098 bitset_set (sbcset, start_elem.opr.ch);
3099 break;
3100#ifdef RE_ENABLE_I18N
3101 case MB_CHAR:
3102 /* Check whether the array has enough space. */
3103 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3104 {
3105 wchar_t *new_mbchars;
3106 /* Not enough, realloc it. */
3107 /* +1 in case of mbcset->nmbchars is 0. */
3108 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3109 /* Use realloc since array is NULL if *alloc == 0. */
3110 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3111 mbchar_alloc);
3112 if (BE (new_mbchars == NULL, 0))
3113 goto parse_bracket_exp_espace;
3114 mbcset->mbchars = new_mbchars;
3115 }
3116 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3117 break;
3118#endif /* RE_ENABLE_I18N */
3119 case EQUIV_CLASS:
3120 *err = build_equiv_class (sbcset,
3121#ifdef RE_ENABLE_I18N
3122 mbcset, &equiv_class_alloc,
3123#endif
3124 start_elem.opr.name);
3125 if (BE (*err != REG_NOERROR, 0))
3126 goto parse_bracket_exp_free_return;
3127 break;
3128 case COLL_SYM:
3129 *err = build_collating_symbol (sbcset,
3130#ifdef RE_ENABLE_I18N
3131 mbcset, &coll_sym_alloc,
3132#endif
3133 start_elem.opr.name);
3134 if (BE (*err != REG_NOERROR, 0))
3135 goto parse_bracket_exp_free_return;
3136 break;
3137 case CHAR_CLASS:
3138 *err = build_charclass (regexp->trans, sbcset,
3139#ifdef RE_ENABLE_I18N
3140 mbcset, &char_class_alloc,
3141#endif
3142 start_elem.opr.name, syntax);
3143 if (BE (*err != REG_NOERROR, 0))
3144 goto parse_bracket_exp_free_return;
3145 break;
3146 default:
3147 assert (0);
3148 break;
3149 }
3150 }
3151 if (BE (token->type == END_OF_RE, 0))
3152 {
3153 *err = REG_EBRACK;
3154 goto parse_bracket_exp_free_return;
3155 }
3156 if (token->type == OP_CLOSE_BRACKET)
3157 break;
3158 }
3159
3160 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3161
3162 /* If it is non-matching list. */
3163 if (non_match)
3164 bitset_not (sbcset);
3165
3166#ifdef RE_ENABLE_I18N
3167 /* Ensure only single byte characters are set. */
3168 if (dfa->mb_cur_max > 1)
3169 bitset_mask (sbcset, dfa->sb_char);
3170
3171 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3172 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3173 || mbcset->non_match)))
3174 {
3175 bin_tree_t *mbc_tree;
3176 int sbc_idx;
3177 /* Build a tree for complex bracket. */
3178 dfa->has_mb_node = 1;
3179 br_token.type = COMPLEX_BRACKET;
3180 br_token.opr.mbcset = mbcset;
3181 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3182 if (BE (mbc_tree == NULL, 0))
3183 goto parse_bracket_exp_espace;
3184 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3185 if (sbcset[sbc_idx])
3186 break;
3187 /* If there are no bits set in sbcset, there is no point
3188 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3189 if (sbc_idx < BITSET_WORDS)
3190 {
3191 /* Build a tree for simple bracket. */
3192 br_token.type = SIMPLE_BRACKET;
3193 br_token.opr.sbcset = sbcset;
3194 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3195 if (BE (work_tree == NULL, 0))
3196 goto parse_bracket_exp_espace;
3197
3198 /* Then join them by ALT node. */
3199 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3200 if (BE (work_tree == NULL, 0))
3201 goto parse_bracket_exp_espace;
3202 }
3203 else
3204 {
3205 re_free (sbcset);
3206 work_tree = mbc_tree;
3207 }
3208 }
3209 else
3210#endif /* not RE_ENABLE_I18N */
3211 {
3212#ifdef RE_ENABLE_I18N
3213 free_charset (mbcset);
3214#endif
3215 /* Build a tree for simple bracket. */
3216 br_token.type = SIMPLE_BRACKET;
3217 br_token.opr.sbcset = sbcset;
3218 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3219 if (BE (work_tree == NULL, 0))
3220 goto parse_bracket_exp_espace;
3221 }
3222 return work_tree;
3223
3224 parse_bracket_exp_espace:
3225 *err = REG_ESPACE;
3226 parse_bracket_exp_free_return:
3227 re_free (sbcset);
3228#ifdef RE_ENABLE_I18N
3229 free_charset (mbcset);
3230#endif
3231 return NULL;
3232}
3233
3234/* Parse an element in the bracket expression. */
3235
3236static reg_errcode_t
3237parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3238 re_token_t *token, int token_len, re_dfa_t *dfa,
3239 reg_syntax_t syntax, int accept_hyphen)
3240{
3241#ifdef RE_ENABLE_I18N
3242 int cur_char_size;
3243 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3244 if (cur_char_size > 1)
3245 {
3246 elem->type = MB_CHAR;
3247 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3248 re_string_skip_bytes (regexp, cur_char_size);
3249 return REG_NOERROR;
3250 }
3251#endif /* RE_ENABLE_I18N */
3252 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3253 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3254 || token->type == OP_OPEN_EQUIV_CLASS)
3255 return parse_bracket_symbol (elem, regexp, token);
3256 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3257 {
3258 /* A '-' must only appear as anything but a range indicator before
3259 the closing bracket. Everything else is an error. */
3260 re_token_t token2;
3261 (void) peek_token_bracket (&token2, regexp, syntax);
3262 if (token2.type != OP_CLOSE_BRACKET)
3263 /* The actual error value is not standardized since this whole
3264 case is undefined. But ERANGE makes good sense. */
3265 return REG_ERANGE;
3266 }
3267 elem->type = SB_CHAR;
3268 elem->opr.ch = token->opr.c;
3269 return REG_NOERROR;
3270}
3271
3272/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3273 such as [:<character_class>:], [.<collating_element>.], and
3274 [=<equivalent_class>=]. */
3275
3276static reg_errcode_t
3277parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3278 re_token_t *token)
3279{
3280 unsigned char ch, delim = token->opr.c;
3281 int i = 0;
3282 if (re_string_eoi(regexp))
3283 return REG_EBRACK;
3284 for (;; ++i)
3285 {
3286 if (i >= BRACKET_NAME_BUF_SIZE)
3287 return REG_EBRACK;
3288 if (token->type == OP_OPEN_CHAR_CLASS)
3289 ch = re_string_fetch_byte_case (regexp);
3290 else
3291 ch = re_string_fetch_byte (regexp);
3292 if (re_string_eoi(regexp))
3293 return REG_EBRACK;
3294 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3295 break;
3296 elem->opr.name[i] = ch;
3297 }
3298 re_string_skip_bytes (regexp, 1);
3299 elem->opr.name[i] = '\0';
3300 switch (token->type)
3301 {
3302 case OP_OPEN_COLL_ELEM:
3303 elem->type = COLL_SYM;
3304 break;
3305 case OP_OPEN_EQUIV_CLASS:
3306 elem->type = EQUIV_CLASS;
3307 break;
3308 case OP_OPEN_CHAR_CLASS:
3309 elem->type = CHAR_CLASS;
3310 break;
3311 default:
3312 break;
3313 }
3314 return REG_NOERROR;
3315}
3316
3317 /* Helper function for parse_bracket_exp.
3318 Build the equivalence class which is represented by NAME.
3319 The result are written to MBCSET and SBCSET.
3320 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3321 is a pointer argument sinse we may update it. */
3322
3323static reg_errcode_t
3324#ifdef RE_ENABLE_I18N
3325build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3326 int *equiv_class_alloc, const unsigned char *name)
3327#else
3328build_equiv_class (bitset_t sbcset, const unsigned char *name)
3329#endif
3330{
3331#if 0
3332 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3333 if (nrules != 0)
3334 {
3335 const int32_t *table, *indirect;
3336 const unsigned char *weights, *extra, *cp;
3337 unsigned char char_buf[2];
3338 int32_t idx1, idx2;
3339 unsigned int ch;
3340 size_t len;
3341 /* This #include defines a local function! */
3342# include <locale/weight.h>
3343 /* Calculate the index for equivalence class. */
3344 cp = name;
3345 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3346 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3347 _NL_COLLATE_WEIGHTMB);
3348 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3349 _NL_COLLATE_EXTRAMB);
3350 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3351 _NL_COLLATE_INDIRECTMB);
3352 idx1 = findidx (&cp);
3353 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3354 /* This isn't a valid character. */
3355 return REG_ECOLLATE;
3356
3357 /* Build single byte matcing table for this equivalence class. */
3358 char_buf[1] = (unsigned char) '\0';
3359 len = weights[idx1];
3360 for (ch = 0; ch < SBC_MAX; ++ch)
3361 {
3362 char_buf[0] = ch;
3363 cp = char_buf;
3364 idx2 = findidx (&cp);
3365/*
3366 idx2 = table[ch];
3367*/
3368 if (idx2 == 0)
3369 /* This isn't a valid character. */
3370 continue;
3371 if (len == weights[idx2])
3372 {
3373 int cnt = 0;
3374 while (cnt <= len &&
3375 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3376 ++cnt;
3377
3378 if (cnt > len)
3379 bitset_set (sbcset, ch);
3380 }
3381 }
3382 /* Check whether the array has enough space. */
3383 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3384 {
3385 /* Not enough, realloc it. */
3386 /* +1 in case of mbcset->nequiv_classes is 0. */
3387 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3388 /* Use realloc since the array is NULL if *alloc == 0. */
3389 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3390 int32_t,
3391 new_equiv_class_alloc);
3392 if (BE (new_equiv_classes == NULL, 0))
3393 return REG_ESPACE;
3394 mbcset->equiv_classes = new_equiv_classes;
3395 *equiv_class_alloc = new_equiv_class_alloc;
3396 }
3397 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3398 }
3399 else
3400#endif
3401 {
3402 if (BE (strlen ((const char *) name) != 1, 0))
3403 return REG_ECOLLATE;
3404 bitset_set (sbcset, *name);
3405 }
3406 return REG_NOERROR;
3407}
3408
3409 /* Helper function for parse_bracket_exp.
3410 Build the character class which is represented by NAME.
3411 The result are written to MBCSET and SBCSET.
3412 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3413 is a pointer argument sinse we may update it. */
3414
3415static reg_errcode_t
3416#ifdef RE_ENABLE_I18N
3417build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3418 re_charset_t *mbcset, int *char_class_alloc,
3419 const unsigned char *class_name, reg_syntax_t syntax)
3420#else
3421build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3422 const unsigned char *class_name, reg_syntax_t syntax)
3423#endif
3424{
3425 int i;
3426 const char *name = (const char *) class_name;
3427
3428 /* In case of REG_ICASE "upper" and "lower" match the both of
3429 upper and lower cases. */
3430 if ((syntax & RE_ICASE)
3431 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3432 name = "alpha";
3433
3434#ifdef RE_ENABLE_I18N
3435 /* Check the space of the arrays. */
3436 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3437 {
3438 /* Not enough, realloc it. */
3439 /* +1 in case of mbcset->nchar_classes is 0. */
3440 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3441 /* Use realloc since array is NULL if *alloc == 0. */
3442 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3443 new_char_class_alloc);
3444 if (BE (new_char_classes == NULL, 0))
3445 return REG_ESPACE;
3446 mbcset->char_classes = new_char_classes;
3447 *char_class_alloc = new_char_class_alloc;
3448 }
3449 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3450#endif /* RE_ENABLE_I18N */
3451
3452#define BUILD_CHARCLASS_LOOP(ctype_func) \
3453 do { \
3454 if (BE (trans != NULL, 0)) \
3455 { \
3456 for (i = 0; i < SBC_MAX; ++i) \
3457 if (ctype_func (i)) \
3458 bitset_set (sbcset, trans[i]); \
3459 } \
3460 else \
3461 { \
3462 for (i = 0; i < SBC_MAX; ++i) \
3463 if (ctype_func (i)) \
3464 bitset_set (sbcset, i); \
3465 } \
3466 } while (0)
3467
3468 if (strcmp (name, "alnum") == 0)
3469 BUILD_CHARCLASS_LOOP (isalnum);
3470 else if (strcmp (name, "cntrl") == 0)
3471 BUILD_CHARCLASS_LOOP (iscntrl);
3472 else if (strcmp (name, "lower") == 0)
3473 BUILD_CHARCLASS_LOOP (islower);
3474 else if (strcmp (name, "space") == 0)
3475 BUILD_CHARCLASS_LOOP (isspace);
3476 else if (strcmp (name, "alpha") == 0)
3477 BUILD_CHARCLASS_LOOP (isalpha);
3478 else if (strcmp (name, "digit") == 0)
3479 BUILD_CHARCLASS_LOOP (isdigit);
3480 else if (strcmp (name, "print") == 0)
3481 BUILD_CHARCLASS_LOOP (isprint);
3482 else if (strcmp (name, "upper") == 0)
3483 BUILD_CHARCLASS_LOOP (isupper);
3484 else if (strcmp (name, "blank") == 0)
3485 BUILD_CHARCLASS_LOOP (isblank);
3486 else if (strcmp (name, "graph") == 0)
3487 BUILD_CHARCLASS_LOOP (isgraph);
3488 else if (strcmp (name, "punct") == 0)
3489 BUILD_CHARCLASS_LOOP (ispunct);
3490 else if (strcmp (name, "xdigit") == 0)
3491 BUILD_CHARCLASS_LOOP (isxdigit);
3492 else
3493 return REG_ECTYPE;
3494
3495 return REG_NOERROR;
3496}
3497
3498static bin_tree_t *
3499build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3500 const unsigned char *class_name,
3501 const unsigned char *extra, int non_match,
3502 reg_errcode_t *err)
3503{
3504 re_bitset_ptr_t sbcset;
3505#ifdef RE_ENABLE_I18N
3506 re_charset_t *mbcset;
3507 int alloc = 0;
3508#endif
3509 reg_errcode_t ret;
3510 re_token_t br_token;
3511 bin_tree_t *tree;
3512
3513 sbcset = calloc (sizeof (bitset_t), 1);
3514#ifdef RE_ENABLE_I18N
3515 mbcset = calloc (sizeof (re_charset_t), 1);
3516#endif
3517
3518#ifdef RE_ENABLE_I18N
3519 if (BE (sbcset == NULL || mbcset == NULL, 0))
3520#else
3521 if (BE (sbcset == NULL, 0))
3522#endif
3523 {
3524 *err = REG_ESPACE;
3525 return NULL;
3526 }
3527
3528 if (non_match)
3529 {
3530#ifdef RE_ENABLE_I18N
3531 /*
3532 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3533 bitset_set(cset->sbcset, '\0');
3534 */
3535 mbcset->non_match = 1;
3536#endif
3537 }
3538
3539 /* We don't care the syntax in this case. */
3540 ret = build_charclass (trans, sbcset,
3541#ifdef RE_ENABLE_I18N
3542 mbcset, &alloc,
3543#endif
3544 class_name, 0);
3545
3546 if (BE (ret != REG_NOERROR, 0))
3547 {
3548 re_free (sbcset);
3549#ifdef RE_ENABLE_I18N
3550 free_charset (mbcset);
3551#endif
3552 *err = ret;
3553 return NULL;
3554 }
3555 /* \w match '_' also. */
3556 for (; *extra; extra++)
3557 bitset_set (sbcset, *extra);
3558
3559 /* If it is non-matching list. */
3560 if (non_match)
3561 bitset_not (sbcset);
3562
3563#ifdef RE_ENABLE_I18N
3564 /* Ensure only single byte characters are set. */
3565 if (dfa->mb_cur_max > 1)
3566 bitset_mask (sbcset, dfa->sb_char);
3567#endif
3568
3569 /* Build a tree for simple bracket. */
3570 br_token.type = SIMPLE_BRACKET;
3571 br_token.opr.sbcset = sbcset;
3572 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3573 if (BE (tree == NULL, 0))
3574 goto build_word_op_espace;
3575
3576#ifdef RE_ENABLE_I18N
3577 if (dfa->mb_cur_max > 1)
3578 {
3579 bin_tree_t *mbc_tree;
3580 /* Build a tree for complex bracket. */
3581 br_token.type = COMPLEX_BRACKET;
3582 br_token.opr.mbcset = mbcset;
3583 dfa->has_mb_node = 1;
3584 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3585 if (BE (mbc_tree == NULL, 0))
3586 goto build_word_op_espace;
3587 /* Then join them by ALT node. */
3588 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3589 if (BE (mbc_tree != NULL, 1))
3590 return tree;
3591 }
3592 else
3593 {
3594 free_charset (mbcset);
3595 return tree;
3596 }
3597#else /* not RE_ENABLE_I18N */
3598 return tree;
3599#endif
3600
3601 build_word_op_espace:
3602 re_free (sbcset);
3603#ifdef RE_ENABLE_I18N
3604 free_charset (mbcset);
3605#endif
3606 *err = REG_ESPACE;
3607 return NULL;
3608}
3609
3610/* This is intended for the expressions like "a{1,3}".
3611 Fetch a number from `input', and return the number.
3612 Return -1, if the number field is empty like "{,1}".
3613 Return -2, If an error is occured. */
3614
3615static int
3616fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3617{
3618 int num = -1;
3619 unsigned char c;
3620 while (1)
3621 {
3622 fetch_token (token, input, syntax);
3623 c = token->opr.c;
3624 if (BE (token->type == END_OF_RE, 0))
3625 return -2;
3626 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3627 break;
3628 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3629 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3630 num = (num > RE_DUP_MAX) ? -2 : num;
3631 }
3632 return num;
3633}
3634
3635#ifdef RE_ENABLE_I18N
3636static void
3637free_charset (re_charset_t *cset)
3638{
3639 re_free (cset->mbchars);
3640# if 0
3641 re_free (cset->coll_syms);
3642 re_free (cset->equiv_classes);
3643 re_free (cset->range_starts);
3644 re_free (cset->range_ends);
3645# endif
3646 re_free (cset->char_classes);
3647 re_free (cset);
3648}
3649#endif /* RE_ENABLE_I18N */
3650
3651/* Functions for binary tree operation. */
3652
3653/* Create a tree node. */
3654
3655static bin_tree_t *
3656create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3657 re_token_type_t type)
3658{
3659 re_token_t t;
3660 t.type = type;
3661 return create_token_tree (dfa, left, right, &t);
3662}
3663
3664static bin_tree_t *
3665create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3666 const re_token_t *token)
3667{
3668 bin_tree_t *tree;
3669 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3670 {
3671 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3672
3673 if (storage == NULL)
3674 return NULL;
3675 storage->next = dfa->str_tree_storage;
3676 dfa->str_tree_storage = storage;
3677 dfa->str_tree_storage_idx = 0;
3678 }
3679 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3680
3681 tree->parent = NULL;
3682 tree->left = left;
3683 tree->right = right;
3684 tree->token = *token;
3685 tree->token.duplicated = 0;
3686 tree->token.opt_subexp = 0;
3687 tree->first = NULL;
3688 tree->next = NULL;
3689 tree->node_idx = -1;
3690
3691 if (left != NULL)
3692 left->parent = tree;
3693 if (right != NULL)
3694 right->parent = tree;
3695 return tree;
3696}
3697
3698/* Mark the tree SRC as an optional subexpression.
3699 To be called from preorder or postorder. */
3700
3701static reg_errcode_t
3702mark_opt_subexp (void *extra, bin_tree_t *node)
3703{
3704 int idx = (int) (long) extra;
3705 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3706 node->token.opt_subexp = 1;
3707
3708 return REG_NOERROR;
3709}
3710
3711/* Free the allocated memory inside NODE. */
3712
3713static void
3714free_token (re_token_t *node)
3715{
3716#ifdef RE_ENABLE_I18N
3717 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3718 free_charset (node->opr.mbcset);
3719 else
3720#endif
3721 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3722 re_free (node->opr.sbcset);
3723}
3724
3725/* Worker function for tree walking. Free the allocated memory inside NODE
3726 and its children. */
3727
3728static reg_errcode_t
3729free_tree (void *extra, bin_tree_t *node)
3730{
3731 free_token (&node->token);
3732 return REG_NOERROR;
3733}
3734
3735
3736/* Duplicate the node SRC, and return new node. This is a preorder
3737 visit similar to the one implemented by the generic visitor, but
3738 we need more infrastructure to maintain two parallel trees --- so,
3739 it's easier to duplicate. */
3740
3741static bin_tree_t *
3742duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3743{
3744 const bin_tree_t *node;
3745 bin_tree_t *dup_root;
3746 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3747
3748 for (node = root; ; )
3749 {
3750 /* Create a new tree and link it back to the current parent. */
3751 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3752 if (*p_new == NULL)
3753 return NULL;
3754 (*p_new)->parent = dup_node;
3755 (*p_new)->token.duplicated = 1;
3756 dup_node = *p_new;
3757
3758 /* Go to the left node, or up and to the right. */
3759 if (node->left)
3760 {
3761 node = node->left;
3762 p_new = &dup_node->left;
3763 }
3764 else
3765 {
3766 const bin_tree_t *prev = NULL;
3767 while (node->right == prev || node->right == NULL)
3768 {
3769 prev = node;
3770 node = node->parent;
3771 dup_node = dup_node->parent;
3772 if (!node)
3773 return dup_root;
3774 }
3775 node = node->right;
3776 p_new = &dup_node->right;
3777 }
3778 }
3779}