/* Extended regular expression matching and search library,
   version 0.12.
   (Implements POSIX draft P1003.2/D11.2, except for some of the
   internationalization features.)
   Copyright (C) 1993-1999, 2000, 2001 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */
22
23/* To exclude some unwanted junk.... */
24#undef emacs
25#include <features.h>
26/* unistd.h must be included with _LIBC defined: we need smallint */
27#include <unistd.h>
28#ifdef __UCLIBC__
29# undef _LIBC
30# define _REGEX_RE_COMP
31# define STDC_HEADERS
32# define RE_TRANSLATE_TYPE char *
33#endif
34#include <stdlib.h>
35#include <stdint.h>
36#include <string.h>
37#include <stdio.h>
38
39/* AIX requires this to be the first thing in the file. */
40#if defined _AIX && !defined REGEX_MALLOC
41# pragma alloca
42#endif
43
44#ifdef HAVE_CONFIG_H
45# include <config.h>
46#endif
47
48#ifndef INSIDE_RECURSION
49
50# if defined STDC_HEADERS && !defined emacs
51# include <stddef.h>
52# else
53/* We need this for `regex.h', and perhaps for the Emacs include files. */
54# include <sys/types.h>
55# endif
56
57
/* For platforms which support the ISO C Amendment 1 functionality we
   support user-defined character classes.  */
60# if defined __UCLIBC_HAS_WCHAR__
61# define WIDE_CHAR_SUPPORT 1
62/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
63# include <wchar.h>
64# include <wctype.h>
65# endif
66
67# if defined _LIBC || defined __UCLIBC__
68/* We have to keep the namespace clean. */
69
70# ifndef __UCLIBC__
71# define btowc __btowc
72
73/* We are also using some library internals. */
74# include <locale/localeinfo.h>
75# include <locale/elem-hash.h>
76# include <langinfo.h>
77# include <locale/coll-lookup.h>
78# endif
79# endif
80
81/* This is for other GNU distributions with internationalized messages. */
82# if defined HAVE_LIBINTL_H || defined _LIBC
83# include <libintl.h>
84# ifdef _LIBC
85# undef gettext
86# define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES)
87# endif
88# else
89# define gettext(msgid) (msgid)
90# endif
91
92# ifndef gettext_noop
93/* This define is so xgettext can find the internationalizable
94 strings. */
95# define gettext_noop(String) String
96# endif
97
98/* The `emacs' switch turns on certain matching commands
99 that make sense only in Emacs. */
100# ifdef emacs
101
102# include "lisp.h"
103# include "buffer.h"
104# include "syntax.h"
105
106# else /* not emacs */
107
108/* If we are not linking with Emacs proper,
109 we can't use the relocating allocator
110 even if config.h says that we can. */
111# undef REL_ALLOC
112
113# if defined STDC_HEADERS || defined _LIBC
114# include <stdlib.h>
115# else
116char *malloc ();
117char *realloc ();
118# endif
119
120/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
121 If nothing else has been done, use the method below. */
122# ifdef INHIBIT_STRING_HEADER
123# if !(defined HAVE_BZERO && defined HAVE_BCOPY)
124# if !defined bzero && !defined bcopy
125# undef INHIBIT_STRING_HEADER
126# endif
127# endif
128# endif
129
130/* This is the normal way of making sure we have a bcopy and a bzero.
131 This is used in most programs--a few other programs avoid this
132 by defining INHIBIT_STRING_HEADER. */
133# ifndef INHIBIT_STRING_HEADER
134# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
135# include <string.h>
136# ifndef bzero
137# ifndef _LIBC
138# define bzero(s, n) (memset (s, '\0', n), (s))
139# else
140# define bzero(s, n) __bzero (s, n)
141# endif
142# endif
143# else
144# include <strings.h>
145# ifndef memcmp
146# define memcmp(s1, s2, n) bcmp (s1, s2, n)
147# endif
148# ifndef memcpy
149# define memcpy(d, s, n) (bcopy (s, d, n), (d))
150# endif
151# endif
152# endif
153
154/* Define the syntax stuff for \<, \>, etc. */
155
156/* This must be nonzero for the wordchar and notwordchar pattern
157 commands in re_match_2. */
158# ifndef Sword
159# define Sword 1
160# endif
161
162# ifdef SWITCH_ENUM_BUG
163# define SWITCH_ENUM_CAST(x) ((int)(x))
164# else
165# define SWITCH_ENUM_CAST(x) (x)
166# endif
167
168# endif /* not emacs */
169
170# if defined _LIBC || defined HAVE_LIMITS_H
171# include <limits.h>
172# endif
173
174# ifndef MB_LEN_MAX
175# define MB_LEN_MAX 1
176# endif
177
178/* Get the interface, including the syntax bits. */
179# include <regex.h>
180
181/* isalpha etc. are used for the character classes. */
182# include <ctype.h>
183
184/* Jim Meyering writes:
185
186 "... Some ctype macros are valid only for character codes that
187 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
188 using /bin/cc or gcc but without giving an ansi option). So, all
189 ctype uses should be through macros like ISPRINT... If
190 STDC_HEADERS is defined, then autoconf has verified that the ctype
191 macros don't need to be guarded with references to isascii. ...
192 Defining isascii to 1 should let any compiler worth its salt
193 eliminate the && through constant folding."
194 Solaris defines some of these symbols so we must undefine them first. */
195
196# undef ISASCII
197# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
198# define ISASCII(c) 1
199# else
200# define ISASCII(c) isascii(c)
201# endif
202
203# ifdef isblank
204# define ISBLANK(c) (ISASCII (c) && isblank (c))
205# else
206# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
207# endif
208# ifdef isgraph
209# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
210# else
211# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
212# endif
213
214# undef ISPRINT
215# define ISPRINT(c) (ISASCII (c) && isprint (c))
216# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
217# define ISALNUM(c) (ISASCII (c) && isalnum (c))
218# define ISALPHA(c) (ISASCII (c) && isalpha (c))
219# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
220# define ISLOWER(c) (ISASCII (c) && islower (c))
221# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
222# define ISSPACE(c) (ISASCII (c) && isspace (c))
223# define ISUPPER(c) (ISASCII (c) && isupper (c))
224# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
225
226# ifdef _tolower
227# define TOLOWER(c) _tolower(c)
228# else
229# define TOLOWER(c) tolower(c)
230# endif
231
232# ifndef NULL
233# define NULL (void *)0
234# endif
235
236/* We remove any previous definition of `SIGN_EXTEND_CHAR',
237 since ours (we hope) works properly with all combinations of
238 machines, compilers, `char' and `unsigned char' argument types.
239 (Per Bothner suggested the basic approach.) */
240# undef SIGN_EXTEND_CHAR
241# if __STDC__
242# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
243# else /* not __STDC__ */
244/* As in Harbison and Steele. */
245# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
246# endif
247
248# ifndef emacs
249/* How many characters in the character set. */
250# define CHAR_SET_SIZE 256
251
252# ifdef SYNTAX_TABLE
253
254extern char *re_syntax_table;
255
256# else /* not SYNTAX_TABLE */
257
258static char re_syntax_table[CHAR_SET_SIZE];
259
260static void init_syntax_once (void);
261
262static void
263init_syntax_once (void)
264{
265 register int c;
266 static smallint done = 0;
267
268 if (done)
269 return;
270 bzero (re_syntax_table, sizeof re_syntax_table);
271
272 for (c = 0; c < CHAR_SET_SIZE; ++c)
273 if (ISALNUM (c))
274 re_syntax_table[c] = Sword;
275
276 re_syntax_table['_'] = Sword;
277
278 done = 1;
279}
280
281# endif /* not SYNTAX_TABLE */
282
283# define SYNTAX(c) re_syntax_table[(unsigned char) (c)]
284
285# endif /* emacs */
286
287/* Integer type for pointers. */
288# if !defined _LIBC && !defined __intptr_t_defined
289typedef unsigned long int uintptr_t;
290# endif
291
292/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
293 use `alloca' instead of `malloc'. This is because using malloc in
294 re_search* or re_match* could cause memory leaks when C-g is used in
295 Emacs; also, malloc is slower and causes storage fragmentation. On
296 the other hand, malloc is more portable, and easier to debug.
297
298 Because we sometimes use alloca, some routines have to be macros,
299 not functions -- `alloca'-allocated space disappears at the end of the
300 function it is called in. */
301
302# ifdef REGEX_MALLOC
303
304# define REGEX_ALLOCATE malloc
305# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
306# define REGEX_FREE free
307
308# else /* not REGEX_MALLOC */
309
310/* Emacs already defines alloca, sometimes. */
311# ifndef alloca
312
313/* Make alloca work the best possible way. */
314# ifdef __GNUC__
315# define alloca __builtin_alloca
316# else /* not __GNUC__ */
317# if HAVE_ALLOCA_H
318# include <alloca.h>
319# endif /* HAVE_ALLOCA_H */
320# endif /* not __GNUC__ */
321
322# endif /* not alloca */
323
324# define REGEX_ALLOCATE alloca
325
326/* Assumes a `char *destination' variable. */
327# define REGEX_REALLOCATE(source, osize, nsize) \
328 (destination = (char *) alloca (nsize), \
329 memcpy (destination, source, osize))
330
331/* No need to do anything to free, after alloca. */
332# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
333
334# endif /* not REGEX_MALLOC */
335
336/* Define how to allocate the failure stack. */
337
338# if defined REL_ALLOC && defined REGEX_MALLOC
339
340# define REGEX_ALLOCATE_STACK(size) \
341 r_alloc (&failure_stack_ptr, (size))
342# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
343 r_re_alloc (&failure_stack_ptr, (nsize))
344# define REGEX_FREE_STACK(ptr) \
345 r_alloc_free (&failure_stack_ptr)
346
347# else /* not using relocating allocator */
348
349# ifdef REGEX_MALLOC
350
351# define REGEX_ALLOCATE_STACK malloc
352# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
353# define REGEX_FREE_STACK free
354
355# else /* not REGEX_MALLOC */
356
357# define REGEX_ALLOCATE_STACK alloca
358
359# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
360 REGEX_REALLOCATE (source, osize, nsize)
361/* No need to explicitly free anything. */
362# define REGEX_FREE_STACK(arg)
363
364# endif /* not REGEX_MALLOC */
365# endif /* not using relocating allocator */
366
367
368/* True if `size1' is non-NULL and PTR is pointing anywhere inside
369 `string1' or just past its end. This works if PTR is NULL, which is
370 a good thing. */
371# define FIRST_STRING_P(ptr) \
372 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
373
374/* (Re)Allocate N items of type T using malloc, or fail. */
375# define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
376# define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
377# define RETALLOC_IF(addr, n, t) \
378 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
379# define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
380
381# define BYTEWIDTH 8 /* In bits. */
382
383# define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
384
385# undef MAX
386# undef MIN
387# define MAX(a, b) ((a) > (b) ? (a) : (b))
388# define MIN(a, b) ((a) < (b) ? (a) : (b))
389
390typedef char boolean;
391# define false 0
392# define true 1
393
394static reg_errcode_t byte_regex_compile (const char *pattern, size_t size,
395 reg_syntax_t syntax,
396 struct re_pattern_buffer *bufp);
397
398static int byte_re_match_2_internal (struct re_pattern_buffer *bufp,
399 const char *string1, int size1,
400 const char *string2, int size2,
401 int pos,
402 struct re_registers *regs,
403 int stop);
404static int byte_re_search_2 (struct re_pattern_buffer *bufp,
405 const char *string1, int size1,
406 const char *string2, int size2,
407 int startpos, int range,
408 struct re_registers *regs, int stop);
409static int byte_re_compile_fastmap (struct re_pattern_buffer *bufp);
410
411#ifdef MBS_SUPPORT
412static reg_errcode_t wcs_regex_compile (const char *pattern, size_t size,
413 reg_syntax_t syntax,
414 struct re_pattern_buffer *bufp);
415
416
417static int wcs_re_match_2_internal (struct re_pattern_buffer *bufp,
418 const char *cstring1, int csize1,
419 const char *cstring2, int csize2,
420 int pos,
421 struct re_registers *regs,
422 int stop,
423 wchar_t *string1, int size1,
424 wchar_t *string2, int size2,
425 int *mbs_offset1, int *mbs_offset2);
426static int wcs_re_search_2 (struct re_pattern_buffer *bufp,
427 const char *string1, int size1,
428 const char *string2, int size2,
429 int startpos, int range,
430 struct re_registers *regs, int stop);
431static int wcs_re_compile_fastmap (struct re_pattern_buffer *bufp);
432#endif
433
434/* These are the command codes that appear in compiled regular
435 expressions. Some opcodes are followed by argument bytes. A
436 command code can specify any interpretation whatsoever for its
437 arguments. Zero bytes may appear in the compiled regular expression. */
438
/* Opcodes of the compiled pattern.  Only no_op is given an explicit
   value; every other enumerator takes its value from its position, so
   the order of the members (and the conditionally compiled ones)
   determines the numeric opcode values.

   Where a comment below speaks of a "two-byte" address or number, that
   is the BYTE build (OFFSET_ADDRESS_SIZE is 2); in the WCHAR build the
   same operand occupies a single element (OFFSET_ADDRESS_SIZE is 1).  */
typedef enum
{
  no_op = 0,

  /* Succeed right away--no more backtracking.  */
  succeed,

        /* Followed by one byte giving n, then by n literal bytes.  */
  exactn,

# ifdef MBS_SUPPORT
        /* Same as exactn, but contains binary data.  */
  exactn_bin,
# endif

        /* Matches any (more or less) character.  */
  anychar,

        /* Matches any one char belonging to specified set.  First
           following byte is number of bitmap bytes.  Then come bytes
           for a bitmap saying which chars are in.  Bits in each byte
           are ordered low-bit-first.  A character is in the set if its
           bit is 1.  A character too large to have a bit in the map is
           automatically not in the set.  */
        /* If MBS_SUPPORT is defined, the following elements are instead
           the length of the character classes, the length of the
           collating symbols, the length of the equivalence classes, the
           length of the character ranges, and the length of the
           characters.  After those come the character class elements,
           collating symbol elements, equivalence class elements, range
           elements, and character elements, in that order.
           See the regex_compile function.  */
  charset,

        /* Same parameters as charset, but match any character that is
           not one of those specified.  */
  charset_not,

        /* Start remembering the text that is matched, for storing in a
           register.  Followed by one byte with the register number, in
           the range 0 to one less than the pattern buffer's re_nsub
           field.  Then followed by one byte with the number of groups
           inner to this one.  (This last has to be part of the
           start_memory only because we need it in the on_failure_jump
           of re_match_2.)  */
  start_memory,

        /* Stop remembering the text that is matched and store it in a
           memory register.  Followed by one byte with the register
           number, in the range 0 to one less than `re_nsub' in the
           pattern buffer, and one byte with the number of inner groups,
           just like `start_memory'.  (We need the number of inner
           groups here because we don't have any easy way of finding the
           corresponding start_memory when we're at a stop_memory.)  */
  stop_memory,

        /* Match a duplicate of something remembered.  Followed by one
           byte containing the register number.  */
  duplicate,

        /* Fail unless at beginning of line.  */
  begline,

        /* Fail unless at end of line.  */
  endline,

        /* Succeeds if at beginning of buffer (if emacs) or at beginning
           of string to be matched (if not).  */
  begbuf,

        /* Analogously, for end of buffer/string.  */
  endbuf,

        /* Followed by a two-byte relative address to which to jump.  */
  jump,

        /* Same as jump, but marks the end of an alternative.  */
  jump_past_alt,

        /* Followed by two-byte relative address of place to resume at
           in case of failure.  */
        /* In the WCHAR build the address occupies one element.  */
  on_failure_jump,

        /* Like on_failure_jump, but pushes a placeholder instead of the
           current string position when executed.  */
  on_failure_keep_string_jump,

        /* Throw away latest failure point and then jump to following
           two-byte relative address.  */
        /* In the WCHAR build the address occupies one element.  */
  pop_failure_jump,

        /* Change to pop_failure_jump if we know we won't have to
           backtrack to match; otherwise change to jump.  This is used
           to jump back to the beginning of a repeat.  If what follows
           this jump clearly won't match what the repeat does, such that
           we can be sure that there is no use backtracking out of
           repetitions already matched, then we change it to a
           pop_failure_jump.  Followed by two-byte address.  */
        /* In the WCHAR build the address occupies one element.  */
  maybe_pop_jump,

        /* Jump to following two-byte address, and push a dummy failure
           point.  This failure point will be thrown away if an attempt
           is made to use it for a failure.  A `+' construct makes this
           before the first repeat.  Also used as an intermediary kind
           of jump when compiling an alternative.  */
        /* In the WCHAR build the address occupies one element.  */
  dummy_failure_jump,

        /* Push a dummy failure point and continue.  Used at the end of
           alternatives.  */
  push_dummy_failure,

        /* Followed by two-byte relative address and two-byte number n.
           After matching N times, jump to the address upon failure.  */
        /* In the WCHAR build the address occupies one element.  */
  succeed_n,

        /* Followed by two-byte relative address, and two-byte number n.
           Jump to the address N times, then fail.  */
        /* In the WCHAR build the address occupies one element.  */
  jump_n,

        /* Set the following two-byte relative address to the
           subsequent two-byte number.  The address *includes* the two
           bytes of number.  */
        /* In the WCHAR build the address occupies one element.  */
  set_number_at,

  wordchar,	/* Matches any word-constituent character.  */
  notwordchar,	/* Matches any char that is not a word-constituent.  */

  wordbeg,	/* Succeeds if at word beginning.  */
  wordend,	/* Succeeds if at word end.  */

  wordbound,	/* Succeeds if at a word boundary.  */
  notwordbound	/* Succeeds if not at a word boundary.  */

# ifdef emacs
  ,before_dot,	/* Succeeds if before point.  */
  at_dot,	/* Succeeds if at point.  */
  after_dot,	/* Succeeds if after point.  */

	/* Matches any character whose syntax is specified.  Followed by
           a byte which contains a syntax code, e.g., Sword.  */
  syntaxspec,

	/* Matches any character whose syntax is not that specified.  */
  notsyntaxspec
# endif /* emacs */
} re_opcode_t;
591#endif /* not INSIDE_RECURSION */
592
593
594#ifdef BYTE
595# define CHAR_T char
596# define UCHAR_T unsigned char
597# define COMPILED_BUFFER_VAR bufp->buffer
598# define OFFSET_ADDRESS_SIZE 2
599# define PREFIX(name) byte_##name
600# define ARG_PREFIX(name) name
601# define PUT_CHAR(c) putchar (c)
602#else
603# ifdef WCHAR
604# define CHAR_T wchar_t
605# define UCHAR_T wchar_t
606# define COMPILED_BUFFER_VAR wc_buffer
607# define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */
608# define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_T)+1)
609# define PREFIX(name) wcs_##name
610# define ARG_PREFIX(name) c##name
/* Print one wide character C on stdout.  (Should we use a wide stream??)
   The expansion deliberately has no trailing semicolon -- the caller
   supplies it, exactly as with the BYTE definition of PUT_CHAR -- so the
   macro is usable as an expression and inside an unbraced if/else.  */
#  define PUT_CHAR(c) printf ("%C", c)
613# define TRUE 1
614# define FALSE 0
615# else
616# ifdef MBS_SUPPORT
617# define WCHAR
618# define INSIDE_RECURSION
619# include "regex_old.c"
620# undef INSIDE_RECURSION
621# endif
622# define BYTE
623# define INSIDE_RECURSION
624# include "regex_old.c"
625# undef INSIDE_RECURSION
626# endif
627#endif
628
629#ifdef INSIDE_RECURSION
630/* Common operations on the compiled pattern. */
631
632/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
633/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
634
635# ifdef WCHAR
636# define STORE_NUMBER(destination, number) \
637 do { \
638 *(destination) = (UCHAR_T)(number); \
639 } while (0)
640# else /* BYTE */
641# define STORE_NUMBER(destination, number) \
642 do { \
643 (destination)[0] = (number) & 0377; \
644 (destination)[1] = (number) >> 8; \
645 } while (0)
646# endif /* WCHAR */
647
648/* Same as STORE_NUMBER, except increment DESTINATION to
649 the byte after where the number is stored. Therefore, DESTINATION
650 must be an lvalue. */
651/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
652
653# define STORE_NUMBER_AND_INCR(destination, number) \
654 do { \
655 STORE_NUMBER (destination, number); \
656 (destination) += OFFSET_ADDRESS_SIZE; \
657 } while (0)
658
/* Put into DESTINATION the number stored at SOURCE.  In the BYTE build
   the number occupies two contiguous bytes: the low-order byte first,
   followed by a high-order byte that is sign-extended.  In the WCHAR
   build the number occupies a single element.  */

# ifdef WCHAR
#  define EXTRACT_NUMBER(destination, source)				\
  do {									\
    (destination) = *(source);						\
  } while (0)
# else /* BYTE */
/* The high byte is scaled with `* 256' rather than `<< 8': the
   sign-extended byte may be negative, and left-shifting a negative
   value is undefined behavior in ISO C.  */
#  define EXTRACT_NUMBER(destination, source)				\
  do {									\
    (destination) = *(source) & 0377;					\
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * 256;		\
  } while (0)
# endif
675
676# ifdef DEBUG
677static void PREFIX(extract_number) (int *dest, UCHAR_T *source)
678{
679# ifdef WCHAR
680 *dest = *source;
681# else /* BYTE */
682 int temp = SIGN_EXTEND_CHAR (*(source + 1));
683 *dest = *source & 0377;
684 *dest += temp << 8;
685# endif
686}
687
688# ifndef EXTRACT_MACROS /* To debug the macros. */
689# undef EXTRACT_NUMBER
690# define EXTRACT_NUMBER(dest, src) PREFIX(extract_number) (&dest, src)
691# endif /* not EXTRACT_MACROS */
692
693# endif /* DEBUG */
694
695/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
696 SOURCE must be an lvalue. */
697
698# define EXTRACT_NUMBER_AND_INCR(destination, source) \
699 do { \
700 EXTRACT_NUMBER (destination, source); \
701 (source) += OFFSET_ADDRESS_SIZE; \
702 } while (0)
703
704# ifdef DEBUG
705static void PREFIX(extract_number_and_incr) (int *destination,
706 UCHAR_T **source)
707{
708 PREFIX(extract_number) (destination, *source);
709 *source += OFFSET_ADDRESS_SIZE;
710}
711
712# ifndef EXTRACT_MACROS
713# undef EXTRACT_NUMBER_AND_INCR
714# define EXTRACT_NUMBER_AND_INCR(dest, src) \
715 PREFIX(extract_number_and_incr) (&dest, &src)
716# endif /* not EXTRACT_MACROS */
717
718# endif /* DEBUG */
719
720
721
722/* If DEBUG is defined, Regex prints many voluminous messages about what
723 it is doing (if the variable `debug' is nonzero). If linked with the
724 main program in `iregex.c', you can enter patterns and strings
725 interactively. And if linked with the main program in `main.c' and
726 the other test files, you can run the already-written tests. */
727
728# ifdef DEBUG
729
730# ifndef DEFINED_ONCE
731
732/* We use standard I/O for debugging. */
733# include <stdio.h>
734
735/* It is useful to test things that ``must'' be true when debugging. */
736# include <assert.h>
737
738static smallint debug;
739
740# define DEBUG_STATEMENT(e) e
741# define DEBUG_PRINT1(x) if (debug) printf (x)
742# define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
743# define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
744# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
745# endif /* not DEFINED_ONCE */
746
747# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
748 if (debug) PREFIX(print_partial_compiled_pattern) (s, e)
749# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
750 if (debug) PREFIX(print_double_string) (w, s1, sz1, s2, sz2)
751
752
753/* Print the fastmap in human-readable form. */
754
755# ifndef DEFINED_ONCE
756static void
757print_fastmap (char *fastmap)
758{
759 unsigned was_a_range = 0;
760 unsigned i = 0;
761
762 while (i < (1 << BYTEWIDTH))
763 {
764 if (fastmap[i++])
765 {
766 was_a_range = 0;
767 putchar (i - 1);
768 while (i < (1 << BYTEWIDTH) && fastmap[i])
769 {
770 was_a_range = 1;
771 i++;
772 }
773 if (was_a_range)
774 {
775 printf ("-");
776 putchar (i - 1);
777 }
778 }
779 }
780 putchar ('\n');
781}
782# endif /* not DEFINED_ONCE */
783
784
785/* Print a compiled pattern string in human-readable form, starting at
786 the START pointer into it and ending just before the pointer END. */
787
788static void
789PREFIX(print_partial_compiled_pattern) (UCHAR_T *start, UCHAR_T *end)
790{
791 int mcnt, mcnt2;
792 UCHAR_T *p1;
793 UCHAR_T *p = start;
794 UCHAR_T *pend = end;
795
796 if (start == NULL)
797 {
798 printf ("(null)\n");
799 return;
800 }
801
802 /* Loop over pattern commands. */
803 while (p < pend)
804 {
805# ifdef _LIBC
806 printf ("%td:\t", p - start);
807# else
808 printf ("%ld:\t", (long int) (p - start));
809# endif
810
811 switch ((re_opcode_t) *p++)
812 {
813 case no_op:
814 printf ("/no_op");
815 break;
816
817 case exactn:
818 mcnt = *p++;
819 printf ("/exactn/%d", mcnt);
820 do
821 {
822 putchar ('/');
823 PUT_CHAR (*p++);
824 }
825 while (--mcnt);
826 break;
827
828# ifdef MBS_SUPPORT
829 case exactn_bin:
830 mcnt = *p++;
831 printf ("/exactn_bin/%d", mcnt);
832 do
833 {
834 printf("/%lx", (long int) *p++);
835 }
836 while (--mcnt);
837 break;
838# endif /* MBS_SUPPORT */
839
840 case start_memory:
841 mcnt = *p++;
842 printf ("/start_memory/%d/%ld", mcnt, (long int) *p++);
843 break;
844
845 case stop_memory:
846 mcnt = *p++;
847 printf ("/stop_memory/%d/%ld", mcnt, (long int) *p++);
848 break;
849
850 case duplicate:
851 printf ("/duplicate/%ld", (long int) *p++);
852 break;
853
854 case anychar:
855 printf ("/anychar");
856 break;
857
858 case charset:
859 case charset_not:
860 {
861# ifdef WCHAR
862 int i, length;
863 wchar_t *workp = p;
864 printf ("/charset [%s",
865 (re_opcode_t) *(workp - 1) == charset_not ? "^" : "");
866 p += 5;
867 length = *workp++; /* the length of char_classes */
868 for (i=0 ; i<length ; i++)
869 printf("[:%lx:]", (long int) *p++);
870 length = *workp++; /* the length of collating_symbol */
871 for (i=0 ; i<length ;)
872 {
873 printf("[.");
874 while(*p != 0)
875 PUT_CHAR((i++,*p++));
876 i++,p++;
877 printf(".]");
878 }
879 length = *workp++; /* the length of equivalence_class */
880 for (i=0 ; i<length ;)
881 {
882 printf("[=");
883 while(*p != 0)
884 PUT_CHAR((i++,*p++));
885 i++,p++;
886 printf("=]");
887 }
888 length = *workp++; /* the length of char_range */
889 for (i=0 ; i<length ; i++)
890 {
891 wchar_t range_start = *p++;
892 wchar_t range_end = *p++;
893 printf("%C-%C", range_start, range_end);
894 }
895 length = *workp++; /* the length of char */
896 for (i=0 ; i<length ; i++)
897 printf("%C", *p++);
898 putchar (']');
899# else
900 register int c, last = -100;
901 register int in_range = 0;
902
903 printf ("/charset [%s",
904 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
905
906 assert (p + *p < pend);
907
908 for (c = 0; c < 256; c++)
909 if (c / 8 < *p
910 && (p[1 + (c/8)] & (1 << (c % 8))))
911 {
912 /* Are we starting a range? */
913 if (last + 1 == c && ! in_range)
914 {
915 putchar ('-');
916 in_range = 1;
917 }
918 /* Have we broken a range? */
919 else if (last + 1 != c && in_range)
920 {
921 putchar (last);
922 in_range = 0;
923 }
924
925 if (! in_range)
926 putchar (c);
927
928 last = c;
929 }
930
931 if (in_range)
932 putchar (last);
933
934 putchar (']');
935
936 p += 1 + *p;
937# endif /* WCHAR */
938 }
939 break;
940
941 case begline:
942 printf ("/begline");
943 break;
944
945 case endline:
946 printf ("/endline");
947 break;
948
949 case on_failure_jump:
950 PREFIX(extract_number_and_incr) (&mcnt, &p);
951# ifdef _LIBC
952 printf ("/on_failure_jump to %td", p + mcnt - start);
953# else
954 printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start));
955# endif
956 break;
957
958 case on_failure_keep_string_jump:
959 PREFIX(extract_number_and_incr) (&mcnt, &p);
960# ifdef _LIBC
961 printf ("/on_failure_keep_string_jump to %td", p + mcnt - start);
962# else
963 printf ("/on_failure_keep_string_jump to %ld",
964 (long int) (p + mcnt - start));
965# endif
966 break;
967
968 case dummy_failure_jump:
969 PREFIX(extract_number_and_incr) (&mcnt, &p);
970# ifdef _LIBC
971 printf ("/dummy_failure_jump to %td", p + mcnt - start);
972# else
973 printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start));
974# endif
975 break;
976
977 case push_dummy_failure:
978 printf ("/push_dummy_failure");
979 break;
980
981 case maybe_pop_jump:
982 PREFIX(extract_number_and_incr) (&mcnt, &p);
983# ifdef _LIBC
984 printf ("/maybe_pop_jump to %td", p + mcnt - start);
985# else
986 printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start));
987# endif
988 break;
989
990 case pop_failure_jump:
991 PREFIX(extract_number_and_incr) (&mcnt, &p);
992# ifdef _LIBC
993 printf ("/pop_failure_jump to %td", p + mcnt - start);
994# else
995 printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start));
996# endif
997 break;
998
999 case jump_past_alt:
1000 PREFIX(extract_number_and_incr) (&mcnt, &p);
1001# ifdef _LIBC
1002 printf ("/jump_past_alt to %td", p + mcnt - start);
1003# else
1004 printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start));
1005# endif
1006 break;
1007
1008 case jump:
1009 PREFIX(extract_number_and_incr) (&mcnt, &p);
1010# ifdef _LIBC
1011 printf ("/jump to %td", p + mcnt - start);
1012# else
1013 printf ("/jump to %ld", (long int) (p + mcnt - start));
1014# endif
1015 break;
1016
1017 case succeed_n:
1018 PREFIX(extract_number_and_incr) (&mcnt, &p);
1019 p1 = p + mcnt;
1020 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1021# ifdef _LIBC
1022 printf ("/succeed_n to %td, %d times", p1 - start, mcnt2);
1023# else
1024 printf ("/succeed_n to %ld, %d times",
1025 (long int) (p1 - start), mcnt2);
1026# endif
1027 break;
1028
1029 case jump_n:
1030 PREFIX(extract_number_and_incr) (&mcnt, &p);
1031 p1 = p + mcnt;
1032 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1033 printf ("/jump_n to %d, %d times", p1 - start, mcnt2);
1034 break;
1035
1036 case set_number_at:
1037 PREFIX(extract_number_and_incr) (&mcnt, &p);
1038 p1 = p + mcnt;
1039 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1040# ifdef _LIBC
1041 printf ("/set_number_at location %td to %d", p1 - start, mcnt2);
1042# else
1043 printf ("/set_number_at location %ld to %d",
1044 (long int) (p1 - start), mcnt2);
1045# endif
1046 break;
1047
1048 case wordbound:
1049 printf ("/wordbound");
1050 break;
1051
1052 case notwordbound:
1053 printf ("/notwordbound");
1054 break;
1055
1056 case wordbeg:
1057 printf ("/wordbeg");
1058 break;
1059
1060 case wordend:
1061 printf ("/wordend");
1062 break;
1063
1064# ifdef emacs
1065 case before_dot:
1066 printf ("/before_dot");
1067 break;
1068
1069 case at_dot:
1070 printf ("/at_dot");
1071 break;
1072
1073 case after_dot:
1074 printf ("/after_dot");
1075 break;
1076
1077 case syntaxspec:
1078 printf ("/syntaxspec");
1079 mcnt = *p++;
1080 printf ("/%d", mcnt);
1081 break;
1082
1083 case notsyntaxspec:
1084 printf ("/notsyntaxspec");
1085 mcnt = *p++;
1086 printf ("/%d", mcnt);
1087 break;
1088# endif /* emacs */
1089
1090 case wordchar:
1091 printf ("/wordchar");
1092 break;
1093
1094 case notwordchar:
1095 printf ("/notwordchar");
1096 break;
1097
1098 case begbuf:
1099 printf ("/begbuf");
1100 break;
1101
1102 case endbuf:
1103 printf ("/endbuf");
1104 break;
1105
1106 default:
1107 printf ("?%ld", (long int) *(p-1));
1108 }
1109
1110 putchar ('\n');
1111 }
1112
1113# ifdef _LIBC
1114 printf ("%td:\tend of pattern.\n", p - start);
1115# else
1116 printf ("%ld:\tend of pattern.\n", (long int) (p - start));
1117# endif
1118}
1119
1120
1121static void
1122PREFIX(print_compiled_pattern) (struct re_pattern_buffer *bufp)
1123{
1124 UCHAR_T *buffer = (UCHAR_T*) bufp->buffer;
1125
1126 PREFIX(print_partial_compiled_pattern) (buffer, buffer
1127 + bufp->used / sizeof(UCHAR_T));
1128 printf ("%ld bytes used/%ld bytes allocated.\n",
1129 bufp->used, bufp->allocated);
1130
1131 if (bufp->fastmap_accurate && bufp->fastmap)
1132 {
1133 printf ("fastmap: ");
1134 print_fastmap (bufp->fastmap);
1135 }
1136
1137# ifdef _LIBC
1138 printf ("re_nsub: %Zd\t", bufp->re_nsub);
1139# else
1140 printf ("re_nsub: %ld\t", (long int) bufp->re_nsub);
1141# endif
1142 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1143 printf ("can_be_null: %d\t", bufp->can_be_null);
1144 printf ("newline_anchor: %d\n", bufp->newline_anchor);
1145 printf ("no_sub: %d\t", bufp->no_sub);
1146 printf ("not_bol: %d\t", bufp->not_bol);
1147 printf ("not_eol: %d\t", bufp->not_eol);
1148 printf ("syntax: %lx\n", bufp->syntax);
1149 /* Perhaps we should print the translate table? */
1150}
1151
1152
1153static void
1154PREFIX(print_double_string) (
1155 const CHAR_T *where,
1156 const CHAR_T *string1,
1157 int size1,
1158 const CHAR_T *string2,
1159 int size2)
1160{
1161 int this_char;
1162
1163 if (where == NULL)
1164 printf ("(null)");
1165 else
1166 {
1167 int cnt;
1168
1169 if (FIRST_STRING_P (where))
1170 {
1171 for (this_char = where - string1; this_char < size1; this_char++)
1172 PUT_CHAR (string1[this_char]);
1173
1174 where = string2;
1175 }
1176
1177 cnt = 0;
1178 for (this_char = where - string2; this_char < size2; this_char++)
1179 {
1180 PUT_CHAR (string2[this_char]);
1181 if (++cnt > 100)
1182 {
1183 fputs ("...", stdout);
1184 break;
1185 }
1186 }
1187 }
1188}
1189
1190# if 0 /* ndef DEFINED_ONCE */
/* Write the single character C to stderr.  */
void
printchar (int c)
{
  fputc (c, stderr);
}
1195}
1196# endif
1197
1198# else /* not DEBUG */
1199
# ifndef DEFINED_ONCE
/* Debugging is compiled out: assert and every tracing macro expand to
   nothing, so their arguments are never evaluated and call sites need
   no conditionals of their own.  */
#  undef assert
#  define assert(e)

#  define DEBUG_STATEMENT(e)
#  define DEBUG_PRINT1(x)
#  define DEBUG_PRINT2(x1, x2)
#  define DEBUG_PRINT3(x1, x2, x3)
#  define DEBUG_PRINT4(x1, x2, x3, x4)
# endif /* not DEFINED_ONCE */
/* These two sit outside the DEFINED_ONCE guard.  */
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1212
1213# endif /* not DEBUG */
1214
1215
1216
1217# ifdef WCHAR
1218/* This convert a multibyte string to a wide character string.
1219 And write their correspondances to offset_buffer(see below)
1220 and write whether each wchar_t is binary data to is_binary.
1221 This assume invalid multibyte sequences as binary data.
1222 We assume offset_buffer and is_binary is already allocated
1223 enough space. */
1224
1225static size_t
1226convert_mbs_to_wcs (
1227 CHAR_T *dest,
1228 const unsigned char* src,
1229 size_t len, /* the length of multibyte string. */
1230
1231 /* It hold correspondances between src(char string) and
1232 dest(wchar_t string) for optimization.
1233 e.g. src = "xxxyzz"
1234 dest = {'X', 'Y', 'Z'}
1235 (each "xxx", "y" and "zz" represent one multibyte character
1236 corresponding to 'X', 'Y' and 'Z'.)
1237 offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")}
1238 = {0, 3, 4, 6}
1239 */
1240 int *offset_buffer,
1241 char *is_binary)
1242{
1243 wchar_t *pdest = dest;
1244 const unsigned char *psrc = src;
1245 size_t wc_count = 0;
1246
1247 mbstate_t mbs;
1248 int i, consumed;
1249 size_t mb_remain = len;
1250 size_t mb_count = 0;
1251
1252 /* Initialize the conversion state. */
1253 memset (&mbs, 0, sizeof (mbstate_t));
1254
1255 offset_buffer[0] = 0;
1256 for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed,
1257 psrc += consumed)
1258 {
1259#ifdef _LIBC
1260 consumed = __mbrtowc (pdest, psrc, mb_remain, &mbs);
1261#else
1262 consumed = mbrtowc (pdest, psrc, mb_remain, &mbs);
1263#endif
1264
1265 if (consumed <= 0)
1266 /* failed to convert. maybe src contains binary data.
1267 So we consume 1 byte manualy. */
1268 {
1269 *pdest = *psrc;
1270 consumed = 1;
1271 is_binary[wc_count] = TRUE;
1272 }
1273 else
1274 is_binary[wc_count] = FALSE;
1275 /* In sjis encoding, we use yen sign as escape character in
1276 place of reverse solidus. So we convert 0x5c(yen sign in
1277 sjis) to not 0xa5(yen sign in UCS2) but 0x5c(reverse
1278 solidus in UCS2). */
1279 if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5)
1280 *pdest = (wchar_t) *psrc;
1281
1282 offset_buffer[wc_count + 1] = mb_count += consumed;
1283 }
1284
1285 /* Fill remain of the buffer with sentinel. */
1286 for (i = wc_count + 1 ; i <= len ; i++)
1287 offset_buffer[i] = mb_count + 1;
1288
1289 return wc_count;
1290}
1291
1292# endif /* WCHAR */
1293
1294#else /* not INSIDE_RECURSION */
1295
1296/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1297 also be assigned to arbitrarily: each pattern buffer stores its own
1298 syntax, so it can be changed between regex compilations. */
1299/* This has no initializer because initialized variables in Emacs
1300 become read-only after dumping. */
1301reg_syntax_t re_syntax_options;
1302
1303
1304/* Specify the precise syntax of regexps for compilation. This provides
1305 for compatibility for various utilities which historically have
1306 different, incompatible syntaxes.
1307
1308 The argument SYNTAX is a bit mask comprised of the various bits
1309 defined in regex.h. We return the old syntax. */
1310
1311reg_syntax_t
1312re_set_syntax (reg_syntax_t syntax)
1313{
1314 reg_syntax_t ret = re_syntax_options;
1315
1316 re_syntax_options = syntax;
1317# ifdef DEBUG
1318 if (syntax & RE_DEBUG)
1319 debug = 1;
1320 else if (debug) /* was on but now is not */
1321 debug = 0;
1322# endif /* DEBUG */
1323 return ret;
1324}
1325
1326/* This table gives an error message for each of the error codes listed
1327 in regex.h. Obviously the order here has to be same as there.
1328 POSIX doesn't require that we do anything for REG_NOERROR,
1329 but why not be nice? */
1330
1331static const char re_error_msgid[] =
1332 {
1333# define REG_NOERROR_IDX 0
1334 gettext_noop ("Success") /* REG_NOERROR */
1335 "\0"
1336# define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
1337 gettext_noop ("No match") /* REG_NOMATCH */
1338 "\0"
1339# define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
1340 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
1341 "\0"
1342# define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
1343 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
1344 "\0"
1345# define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
1346 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
1347 "\0"
1348# define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
1349 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
1350 "\0"
1351# define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
1352 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
1353 "\0"
1354# define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
1355 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
1356 "\0"
1357# define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
1358 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
1359 "\0"
1360# define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
1361 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
1362 "\0"
1363# define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
1364 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
1365 "\0"
1366# define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
1367 gettext_noop ("Invalid range end") /* REG_ERANGE */
1368 "\0"
1369# define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
1370 gettext_noop ("Memory exhausted") /* REG_ESPACE */
1371 "\0"
1372# define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
1373 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
1374 "\0"
1375# define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
1376 gettext_noop ("Premature end of regular expression") /* REG_EEND */
1377 "\0"
1378# define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
1379 gettext_noop ("Regular expression too big") /* REG_ESIZE */
1380 "\0"
1381# define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
1382 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
1383 };
1384
1385static const uint16_t re_error_msgid_idx[] =
1386 {
1387 REG_NOERROR_IDX,
1388 REG_NOMATCH_IDX,
1389 REG_BADPAT_IDX,
1390 REG_ECOLLATE_IDX,
1391 REG_ECTYPE_IDX,
1392 REG_EESCAPE_IDX,
1393 REG_ESUBREG_IDX,
1394 REG_EBRACK_IDX,
1395 REG_EPAREN_IDX,
1396 REG_EBRACE_IDX,
1397 REG_BADBR_IDX,
1398 REG_ERANGE_IDX,
1399 REG_ESPACE_IDX,
1400 REG_BADRPT_IDX,
1401 REG_EEND_IDX,
1402 REG_ESIZE_IDX,
1403 REG_ERPAREN_IDX
1404 };
1405
1406#endif /* INSIDE_RECURSION */
1407
#ifndef DEFINED_ONCE
/* Avoiding alloca during matching, to placate r_alloc.  */

/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
   searching and matching functions should not call alloca.  On some
   systems, alloca is implemented in terms of malloc, and if we're
   using the relocating allocator routines, then malloc could cause a
   relocation, which might (if the strings being searched are in the
   ralloc heap) shift the data out from underneath the regexp
   routines.

   Here's another reason to avoid allocation: Emacs
   processes input from X in a signal handler; processing X input may
   call malloc; if input arrives while a matching routine is calling
   malloc, then we're scrod.  But Emacs can't just block input while
   calling matching routines; then we don't notice interrupts when
   they come in.  So, Emacs blocks input around all regexp calls
   except the matching calls, which it leaves unprotected, in the
   faith that they will not malloc.  */

/* Normally, this is fine.  */
# define MATCH_MAY_ALLOCATE

/* When using GNU C, we are not REALLY using the C alloca, no matter
   what config.h may say.  So don't take precautions for it.  */
# ifdef __GNUC__
#  undef C_ALLOCA
# endif

/* The match routines may not allocate if (1) they would do it with malloc
   and (2) it's not safe for them to use malloc.
   Note that if REL_ALLOC is defined, matching would not use malloc for the
   failure stack, but we would still use it for the register vectors;
   so REL_ALLOC should not affect this.  */
# if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
#  undef MATCH_MAY_ALLOCATE
# endif
#endif /* not DEFINED_ONCE */
1446
1447#ifdef INSIDE_RECURSION
1448/* Failure stack declarations and macros; both re_compile_fastmap and
1449 re_match_2 use a failure stack. These have to be macros because of
1450 REGEX_ALLOCATE_STACK. */
1451
1452
/* Number of failure points for which to initially allocate space
   when matching.  If this number is exceeded, we allocate more
   space, so it is not a hard limit.  A definition made before this
   point takes precedence.  */
# ifndef INIT_FAILURE_ALLOC
#  define INIT_FAILURE_ALLOC 5
# endif
1459
1460/* Roughly the maximum number of failure points on the stack. Would be
1461 exactly that if always used MAX_FAILURE_ITEMS items each time we failed.
1462 This is a variable only so users of regex can assign to it; we never
1463 change it ourselves. */
1464
1465# ifdef INT_IS_16BIT
1466
1467# ifndef DEFINED_ONCE
1468# if defined MATCH_MAY_ALLOCATE
1469/* 4400 was enough to cause a crash on Alpha OSF/1,
1470 whose default stack limit is 2mb. */
1471long int re_max_failures = 4000;
1472# else
1473long int re_max_failures = 2000;
1474# endif
1475# endif
1476
1477union PREFIX(fail_stack_elt)
1478{
1479 UCHAR_T *pointer;
1480 long int integer;
1481};
1482
1483typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);
1484
1485typedef struct
1486{
1487 PREFIX(fail_stack_elt_t) *stack;
1488 unsigned long int size;
1489 unsigned long int avail; /* Offset of next open position. */
1490} PREFIX(fail_stack_type);
1491
1492# else /* not INT_IS_16BIT */
1493
1494# ifndef DEFINED_ONCE
1495# if defined MATCH_MAY_ALLOCATE
1496/* 4400 was enough to cause a crash on Alpha OSF/1,
1497 whose default stack limit is 2mb. */
1498int re_max_failures = 4000;
1499# else
1500int re_max_failures = 2000;
1501# endif
1502# endif
1503
1504union PREFIX(fail_stack_elt)
1505{
1506 UCHAR_T *pointer;
1507 int integer;
1508};
1509
1510typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);
1511
1512typedef struct
1513{
1514 PREFIX(fail_stack_elt_t) *stack;
1515 unsigned size;
1516 unsigned avail; /* Offset of next open position. */
1517} PREFIX(fail_stack_type);
1518
1519# endif /* INT_IS_16BIT */
1520
# ifndef DEFINED_ONCE
/* Occupancy tests.  The first and last read the variable `fail_stack';
   the middle one reads through the pointer `fail_stack_ptr'.  */
#  define FAIL_STACK_EMPTY()     (fail_stack.avail == 0)
#  define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
#  define FAIL_STACK_FULL()      (fail_stack.avail == fail_stack.size)
# endif
1526
1527
/* Define macros to initialize and free the failure stack.
   When matching may allocate, INIT_FAIL_STACK obtains room for
   INIT_FAILURE_ALLOC elements and does `return -2' from the enclosing
   function if the alloc fails.  Otherwise it only resets the fill
   offset and RESET_FAIL_STACK expands to nothing.  */

# ifdef MATCH_MAY_ALLOCATE
#  define INIT_FAIL_STACK() \
  do { \
    fail_stack.stack = (PREFIX(fail_stack_elt_t) *) \
      REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (PREFIX(fail_stack_elt_t))); \
 \
    if (!fail_stack.stack) \
      return -2; \
 \
    fail_stack.avail = 0; \
    fail_stack.size = INIT_FAILURE_ALLOC; \
  } while (0)

#  define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
# else
#  define INIT_FAIL_STACK() \
  do { \
    fail_stack.avail = 0; \
  } while (0)

#  define RESET_FAIL_STACK()
# endif
1553
1554
/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.

   Return 1 if succeeds, and 0 if either ran out of memory
   allocating space for it or it was already too large.  The size is
   only refused once it already exceeds
   re_max_failures * MAX_FAILURE_ITEMS, which is why the limit is
   approximate.

   Note that the result of the reallocation is stored straight into
   (fail_stack).stack, so after an allocation failure that member is
   NULL.

   REGEX_REALLOCATE_STACK requires `destination' be declared.  */

# define DOUBLE_FAIL_STACK(fail_stack) \
  ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \
   ? 0 \
   : ((fail_stack).stack = (PREFIX(fail_stack_elt_t) *) \
        REGEX_REALLOCATE_STACK ((fail_stack).stack, \
          (fail_stack).size * sizeof (PREFIX(fail_stack_elt_t)), \
          ((fail_stack).size << 1) * sizeof (PREFIX(fail_stack_elt_t))), \
 \
      (fail_stack).stack == NULL \
      ? 0 \
      : ((fail_stack).size <<= 1, \
         1)))
1574
1575
/* Push pointer POINTER on FAIL_STACK, doubling the stack first if it is
   full.  Return 1 if was able to do so and 0 if ran out of memory
   allocating space to do so.
   The fullness test is FAIL_STACK_FULL, which always inspects the
   variable `fail_stack' rather than the FAIL_STACK argument.  */
# define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
  ((!FAIL_STACK_FULL () \
    || DOUBLE_FAIL_STACK (FAIL_STACK)) \
   ? ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
      1) \
   : 0)
1585
/* Primitive pushes and pops on the failure stack.  All of them assume
   the variable `fail_stack' and perform no bounds checking of their
   own.  Every expansion is fully parenthesized so that it behaves as a
   single operand wherever it is written.  */

/* Push a pointer value onto the failure stack.
   Probably should only be called from within `PUSH_FAILURE_POINT'.  */
# define PUSH_FAILURE_POINTER(item) \
  (fail_stack.stack[fail_stack.avail++].pointer = (UCHAR_T *) (item))

/* This pushes an integer-valued item onto the failure stack.
   Probably should only be called from within `PUSH_FAILURE_POINT'.  */
# define PUSH_FAILURE_INT(item) \
  (fail_stack.stack[fail_stack.avail++].integer = (item))

/* Push a fail_stack_elt_t value onto the failure stack.
   Probably should only be called from within `PUSH_FAILURE_POINT'.  */
# define PUSH_FAILURE_ELT(item) \
  (fail_stack.stack[fail_stack.avail++] = (item))

/* These three POP... operations complement the three PUSH... operations.
   All assume that `fail_stack' is nonempty.  */
# define POP_FAILURE_POINTER() (fail_stack.stack[--fail_stack.avail].pointer)
# define POP_FAILURE_INT() (fail_stack.stack[--fail_stack.avail].integer)
# define POP_FAILURE_ELT() (fail_stack.stack[--fail_stack.avail])
1609
/* Used to omit pushing failure point id's when we're not debugging:
   with DEBUG the id travels on the failure stack as an integer,
   without it both macros expand to nothing.  */
# ifdef DEBUG
#  define DEBUG_PUSH PUSH_FAILURE_INT
#  define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
# else
#  define DEBUG_PUSH(item)
#  define DEBUG_POP(item_addr)
# endif
1618
1619
/* Push the information about the state we will need
   if we ever fail back to it.

   Pushed, in this order: for every register from lowest_active_reg to
   highest_active_reg its start, end and info word; then
   lowest_active_reg, highest_active_reg, PATTERN_PLACE, STRING_PLACE
   and, when debugging, the failure id.

   Requires variables fail_stack, regstart, regend, reg_info, and
   num_regs_pushed be declared.  DOUBLE_FAIL_STACK requires `destination'
   be declared.

   Does `return FAILURE_CODE' if runs out of memory.  */

# define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
  do { \
    /* Needed by DOUBLE_FAIL_STACK.  */ \
    char *destination; \
    /* Register index.  One historical note asked for int so that the \
       arithmetic of 0 + -1 is not done as unsigned when no registers \
       are saved; a later one objected that int is not guaranteed to \
       be wide enough.  The declared type is active_reg_t.  */ \
    active_reg_t this_reg; \
 \
    DEBUG_STATEMENT (failure_id++); \
    DEBUG_STATEMENT (nfailure_points_pushed++); \
    DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
    DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
    DEBUG_PRINT2 (" size: %d\n", (fail_stack).size); \
 \
    DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \
    DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
 \
    /* Ensure we have enough space allocated for what we will push.  */ \
    while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
      { \
        if (!DOUBLE_FAIL_STACK (fail_stack)) \
          return failure_code; \
 \
        DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
                      (fail_stack).size); \
        DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS); \
      } \
 \
    /* Push the info, starting with the registers.  */ \
    DEBUG_PRINT1 ("\n"); \
 \
    for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
         this_reg++) \
      { \
        DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \
        DEBUG_STATEMENT (num_regs_pushed++); \
 \
        DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
        PUSH_FAILURE_POINTER (regstart[this_reg]); \
 \
        DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
        PUSH_FAILURE_POINTER (regend[this_reg]); \
 \
        DEBUG_PRINT2 (" info: %p\n ", \
                      reg_info[this_reg].word.pointer); \
        DEBUG_PRINT2 (" match_null=%d", \
                      REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
        DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
        DEBUG_PRINT2 (" matched_something=%d", \
                      MATCHED_SOMETHING (reg_info[this_reg])); \
        DEBUG_PRINT2 (" ever_matched=%d", \
                      EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
        DEBUG_PRINT1 ("\n"); \
        PUSH_FAILURE_ELT (reg_info[this_reg].word); \
      } \
 \
    DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg); \
    PUSH_FAILURE_INT (lowest_active_reg); \
 \
    DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg); \
    PUSH_FAILURE_INT (highest_active_reg); \
 \
    DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \
    DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
    PUSH_FAILURE_POINTER (pattern_place); \
 \
    DEBUG_PRINT2 (" Pushing string %p: `", string_place); \
    DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
                               size2); \
    DEBUG_PRINT1 ("'\n"); \
    PUSH_FAILURE_POINTER (string_place); \
 \
    DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
    DEBUG_PUSH (failure_id); \
  } while (0)
1707
# ifndef DEFINED_ONCE
/* This is the number of items that are pushed and popped on the stack
   for each register.  */
#  define NUM_REG_ITEMS 3

/* Individual items aside from the registers.  */
#  ifdef DEBUG
#   define NUM_NONREG_ITEMS 5 /* Includes failure point id.  */
#  else
#   define NUM_NONREG_ITEMS 4
#  endif

/* We push at most this many items on the stack.  */
/* We used to use (num_regs - 1), which is the number of registers
   this regexp will save; but that was changed to 5
   to avoid stack overflow for a regexp with lots of parens.  */
#  define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)

/* We actually push this many items: NUM_REG_ITEMS for every register in
   the active range, plus the non-register items.  */
#  define NUM_FAILURE_ITEMS \
  (((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS) \
   + NUM_NONREG_ITEMS)

/* How many items can still be added to the stack without overflowing it.  */
#  define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
# endif /* not DEFINED_ONCE */
1736
1737
/* Pops what PUSH_FAILURE_POINT pushes, in the reverse order.

   We restore into the parameters, all of which should be lvalues:
     STR -- the saved data position.  Left untouched when the saved
            value is NULL.
     PAT -- the saved pattern position.
     LOW_REG, HIGH_REG -- the lowest and highest active registers.
     REGSTART, REGEND -- arrays of string positions.
     REG_INFO -- array of information about each subexpression.

   Also assumes the variables `fail_stack', `set_regs_matched_done' and
   (if debugging), `bufp', `pend', `string1', `size1', `string2', and
   `size2'.  */
# define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info) \
{ \
  DEBUG_STATEMENT (unsigned failure_id;) \
  active_reg_t this_reg; \
  const UCHAR_T *string_temp; \
 \
  assert (!FAIL_STACK_EMPTY ()); \
 \
  /* Remove failure points and point to how many regs pushed.  */ \
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
  DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
  DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
 \
  assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
 \
  DEBUG_POP (&failure_id); \
  DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
 \
  /* If the saved string location is NULL, it came from an \
     on_failure_keep_string_jump opcode, and we want to throw away the \
     saved NULL, thus retaining our current position in the string.  */ \
  string_temp = POP_FAILURE_POINTER (); \
  if (string_temp != NULL) \
    str = (const CHAR_T *) string_temp; \
 \
  DEBUG_PRINT2 (" Popping string %p: `", str); \
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
  DEBUG_PRINT1 ("'\n"); \
 \
  pat = (UCHAR_T *) POP_FAILURE_POINTER (); \
  DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
 \
  /* Restore register info.  */ \
  high_reg = (active_reg_t) POP_FAILURE_INT (); \
  DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \
 \
  low_reg = (active_reg_t) POP_FAILURE_INT (); \
  DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \
 \
  /* Registers come off highest first, each as info, end, start.  */ \
  for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
    { \
      DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \
 \
      reg_info[this_reg].word = POP_FAILURE_ELT (); \
      DEBUG_PRINT2 (" info: %p\n", \
                    reg_info[this_reg].word.pointer); \
 \
      regend[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \
      DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
 \
      regstart[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \
      DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
    } \
 \
  set_regs_matched_done = 0; \
  DEBUG_STATEMENT (nfailure_points_popped++); \
} /* POP_FAILURE_POINT */
1818
1819/* Structure for per-register (a.k.a. per-group) information.
1820 Other register information, such as the
1821 starting and ending positions (which are addresses), and the list of
1822 inner groups (which is a bits list) are maintained in separate
1823 variables.
1824
1825 We are making a (strictly speaking) nonportable assumption here: that
1826 the compiler will pack our bit fields into something that fits into
1827 the type of `word', i.e., is something that fits into one item on the
1828 failure stack. */
1829
1830
1831/* Declarations and macros for re_match_2. */
1832
1833typedef union
1834{
1835 PREFIX(fail_stack_elt_t) word;
1836 struct
1837 {
1838 /* This field is one if this group can match the empty string,
1839 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1840# define MATCH_NULL_UNSET_VALUE 3
1841 unsigned match_null_string_p : 2;
1842 unsigned is_active : 1;
1843 unsigned matched_something : 1;
1844 unsigned ever_matched_something : 1;
1845 } bits;
1846} PREFIX(register_info_type);
1847
# ifndef DEFINED_ONCE
/* Accessors for the bit fields of a register_info_type R.  */
#  define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
#  define IS_ACTIVE(R) ((R).bits.is_active)
#  define MATCHED_SOMETHING(R) ((R).bits.matched_something)
#  define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)


/* Call this when have matched a real character; it sets `matched' flags
   for the subexpressions which we are currently inside.  Also records
   that those subexprs have matched.  The work is skipped while
   `set_regs_matched_done' is nonzero.  */
#  define SET_REGS_MATCHED() \
  do \
    { \
      if (!set_regs_matched_done) \
        { \
          active_reg_t r; \
          set_regs_matched_done = 1; \
          for (r = lowest_active_reg; r <= highest_active_reg; r++) \
            { \
              EVER_MATCHED_SOMETHING (reg_info[r]) = 1; \
              MATCHED_SOMETHING (reg_info[r]) = 1; \
            } \
        } \
    } \
  while (0)
# endif /* not DEFINED_ONCE */
1875
1876/* Registers are set to a sentinel when they haven't yet matched. */
1877static CHAR_T PREFIX(reg_unset_dummy);
1878# define REG_UNSET_VALUE (&PREFIX(reg_unset_dummy))
1879# define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1880
1881/* Subroutine declarations and macros for regex_compile. */
1882static void PREFIX(store_op1) (re_opcode_t op, UCHAR_T *loc, int arg);
1883static void PREFIX(store_op2) (re_opcode_t op, UCHAR_T *loc,
1884 int arg1, int arg2);
1885static void PREFIX(insert_op1) (re_opcode_t op, UCHAR_T *loc,
1886 int arg, UCHAR_T *end);
1887static void PREFIX(insert_op2) (re_opcode_t op, UCHAR_T *loc,
1888 int arg1, int arg2, UCHAR_T *end);
1889static boolean PREFIX(at_begline_loc_p) (const CHAR_T *pattern,
1890 const CHAR_T *p,
1891 reg_syntax_t syntax);
1892static boolean PREFIX(at_endline_loc_p) (const CHAR_T *p,
1893 const CHAR_T *pend,
1894 reg_syntax_t syntax);
1895# ifdef WCHAR
1896static reg_errcode_t wcs_compile_range (CHAR_T range_start,
1897 const CHAR_T **p_ptr,
1898 const CHAR_T *pend,
1899 char *translate,
1900 reg_syntax_t syntax,
1901 UCHAR_T *b,
1902 CHAR_T *char_set);
1903static void insert_space (int num, CHAR_T *loc, CHAR_T *end);
1904# else /* BYTE */
1905static reg_errcode_t byte_compile_range (unsigned int range_start,
1906 const char **p_ptr,
1907 const char *pend,
1908 char *translate,
1909 reg_syntax_t syntax,
1910 unsigned char *b);
1911# endif /* WCHAR */
1912
/* Fetch the next character in the uncompiled pattern---translating it
   if necessary.  Also cast from a signed character in the constant
   string passed to us by the user to an unsigned char that we can use
   as an array index (in, e.g., `translate').
   Does `return REG_EEND' from the enclosing function when the pattern
   is exhausted (p == pend).  */
/* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
   because it is impossible to allocate 4GB array for some encodings
   which have 4 byte character_set like UCS4.  */
# ifndef PATFETCH
#  ifdef WCHAR
#   define PATFETCH(c) \
  do { \
    if (p == pend) \
      return REG_EEND; \
    c = (UCHAR_T) *p++; \
    if (translate && (c <= 0xff)) \
      c = (UCHAR_T) translate[c]; \
  } while (0)
#  else /* BYTE */
#   define PATFETCH(c) \
  do { \
    if (p == pend) \
      return REG_EEND; \
    c = (unsigned char) *p++; \
    if (translate) \
      c = (unsigned char) translate[c]; \
  } while (0)
#  endif /* WCHAR */
# endif
1935
/* Fetch the next character in the uncompiled pattern, with no
   translation.  Does `return REG_EEND' from the enclosing function when
   the pattern is exhausted (p == pend).  */
# define PATFETCH_RAW(c) \
  do { \
    if (p == pend) \
      return REG_EEND; \
    c = (UCHAR_T) *p++; \
  } while (0)

/* Go backwards one character in the pattern.  */
# define PATUNFETCH p--
1945
1946
/* If `translate' is non-null, return translate[D], else just D.  We
   cast the subscript to translate because some data is declared as
   `char *', to avoid warnings when a string constant is passed.  But
   when we use a character as a subscript we must make it unsigned.  */
/* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
   because it is impossible to allocate 4GB array for some encodings
   which have 4 byte character_set like UCS4.  */

# ifndef TRANSLATE
#  ifdef WCHAR
#   define TRANSLATE(d) \
  ((translate && ((UCHAR_T) (d)) <= 0xff) \
   ? (char) translate[(unsigned char) (d)] : (d))
#  else /* BYTE */
#   define TRANSLATE(d) \
  (translate ? (char) translate[(unsigned char) (d)] : (d))
#  endif /* WCHAR */
# endif
1965
1966
/* Macros for outputting the compiled pattern into `buffer'.  */

/* If the buffer isn't allocated when it comes in, use this.  */
# define INIT_BUF_SIZE (32 * sizeof(UCHAR_T))

/* Make sure we have room for at least N more pattern units in the
   buffer, extending it as often as necessary.  The units are CHAR_T
   sized in the WCHAR build and single bytes otherwise.  The expansion
   is a bare `while' statement, so write it as a statement of its
   own.  */
# ifdef WCHAR
#  define GET_BUFFER_SPACE(n) \
    while (((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR \
            + (n)*sizeof(CHAR_T)) > bufp->allocated) \
      EXTEND_BUFFER ()
# else /* BYTE */
#  define GET_BUFFER_SPACE(n) \
    while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \
      EXTEND_BUFFER ()
# endif /* WCHAR */
1983
/* Append to the compiled pattern at `b'.  Each macro first reserves the
   space it needs with GET_BUFFER_SPACE and then stores its arguments in
   order, converting each to UCHAR_T.  */

/* Make sure we have one more unit of buffer space and then add C to it.  */
# define BUF_PUSH(c) \
  do { \
    GET_BUFFER_SPACE (1); \
    *b++ = (UCHAR_T) (c); \
  } while (0)


/* Ensure we have two more units of buffer space and then append C1 and C2.  */
# define BUF_PUSH_2(c1, c2) \
  do { \
    GET_BUFFER_SPACE (2); \
    *b++ = (UCHAR_T) (c1); \
    *b++ = (UCHAR_T) (c2); \
  } while (0)


/* As with BUF_PUSH_2, except for three units.  */
# define BUF_PUSH_3(c1, c2, c3) \
  do { \
    GET_BUFFER_SPACE (3); \
    *b++ = (UCHAR_T) (c1); \
    *b++ = (UCHAR_T) (c2); \
    *b++ = (UCHAR_T) (c3); \
  } while (0)
2009
/* Store a jump with opcode OP at LOC to location TO.  We store a
   relative address offset by the 1 + OFFSET_ADDRESS_SIZE units the jump
   itself occupies.  */
# define STORE_JUMP(op, loc, to) \
  PREFIX(store_op1) (op, loc, \
                     (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)))

/* Likewise, for a two-argument jump.  */
# define STORE_JUMP2(op, loc, to, arg) \
  PREFIX(store_op2) (op, loc, \
                     (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), arg)

/* Like `STORE_JUMP', but for inserting.  Assume `b' is the buffer end.  */
# define INSERT_JUMP(op, loc, to) \
  PREFIX(insert_op1) (op, loc, \
                      (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), b)

/* Like `STORE_JUMP2', but for inserting.  Assume `b' is the buffer end.  */
# define INSERT_JUMP2(op, loc, to, arg) \
  PREFIX(insert_op2) (op, loc, \
                      (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), \
                      arg, b)
2027
/* This is not an arbitrary limit: the arguments which represent offsets
   into the pattern are two bytes long.  So if 2^16 bytes turns out to
   be too small, many things would have to change.  */
/* Any other compiler which, like MSC, has an allocation limit below 2^16
   bytes will have to use an approach similar to what is done below for
   MSC and drop MAX_BUF_SIZE a bit.  Otherwise you may end up
   reallocating to 0 bytes.  Such a thing is not going to work too well.
   You have been warned!!  */
# ifndef DEFINED_ONCE
#  if defined _MSC_VER && !defined WIN32
/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.
   The REALLOC define eliminates a flurry of conversion warnings,
   but is not required.  */
#   define MAX_BUF_SIZE 65500L
#   define REALLOC(p,s) realloc ((p), (size_t) (s))
#  else
#   define MAX_BUF_SIZE (1L << 16)
#   define REALLOC(p,s) realloc ((p), (s))
#  endif
# endif /* not DEFINED_ONCE */
2048
/* Double the size of the compiled-pattern buffer via realloc (capped at
   MAX_BUF_SIZE) and relocate every pointer that pointed into the old
   block -- `b', `begalt', `fixup_alt_jump', `laststart' and
   `pending_exact' -- so it refers to the same element of the new one.

   This macro returns from the enclosing function on failure:
     REG_ESIZE   the buffer is already at its maximum size;
     REG_ESPACE  the reallocation failed.
   Both exits go through FREE_STACK_RETURN so that the storage it is
   responsible for is released as well.  On reallocation failure the old
   block is freed explicitly: bufp->buffer has just been overwritten
   with NULL, so nothing else could free it afterwards.

   The relocation distance is held in an intptr_t (from <stdint.h>);
   an `int' can truncate the distance between two heap blocks on
   targets where pointers are wider than int.  */
# ifdef WCHAR
#  define EXTEND_BUFFER() \
  do { \
    UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \
    int wchar_count; \
    if (bufp->allocated + sizeof(UCHAR_T) > MAX_BUF_SIZE) \
      FREE_STACK_RETURN (REG_ESIZE); \
    bufp->allocated <<= 1; \
    if (bufp->allocated > MAX_BUF_SIZE) \
      bufp->allocated = MAX_BUF_SIZE; \
    /* How many characters can the new buffer hold?  */ \
    wchar_count = bufp->allocated / sizeof(UCHAR_T); \
    if (wchar_count == 0) wchar_count = 1; \
    /* Round the size down to a whole number of UCHAR_T elements.  */ \
    bufp->allocated = wchar_count * sizeof(UCHAR_T); \
    RETALLOC (COMPILED_BUFFER_VAR, wchar_count, UCHAR_T); \
    bufp->buffer = (char*)COMPILED_BUFFER_VAR; \
    if (COMPILED_BUFFER_VAR == NULL) \
      { \
        /* A failed realloc leaves the old block allocated.  */ \
        free (old_buffer); \
        FREE_STACK_RETURN (REG_ESPACE); \
      } \
    /* If the buffer moved, move all the pointers into it.  */ \
    if (old_buffer != COMPILED_BUFFER_VAR) \
      { \
        intptr_t incr = COMPILED_BUFFER_VAR - old_buffer; \
        b += incr; \
        begalt += incr; \
        if (fixup_alt_jump) \
          fixup_alt_jump += incr; \
        if (laststart) \
          laststart += incr; \
        if (pending_exact) \
          pending_exact += incr; \
      } \
  } while (0)
# else /* BYTE */
#  define EXTEND_BUFFER() \
  do { \
    UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \
    if (bufp->allocated == MAX_BUF_SIZE) \
      FREE_STACK_RETURN (REG_ESIZE); \
    bufp->allocated <<= 1; \
    if (bufp->allocated > MAX_BUF_SIZE) \
      bufp->allocated = MAX_BUF_SIZE; \
    bufp->buffer = (UCHAR_T *) REALLOC (COMPILED_BUFFER_VAR, \
                                        bufp->allocated); \
    if (COMPILED_BUFFER_VAR == NULL) \
      { \
        /* A failed realloc leaves the old block allocated.  */ \
        free (old_buffer); \
        FREE_STACK_RETURN (REG_ESPACE); \
      } \
    /* If the buffer moved, move all the pointers into it.  */ \
    if (old_buffer != COMPILED_BUFFER_VAR) \
      { \
        intptr_t incr = COMPILED_BUFFER_VAR - old_buffer; \
        b += incr; \
        begalt += incr; \
        if (fixup_alt_jump) \
          fixup_alt_jump += incr; \
        if (laststart) \
          laststart += incr; \
        if (pending_exact) \
          pending_exact += incr; \
      } \
  } while (0)
# endif /* WCHAR */
2114
# ifndef DEFINED_ONCE
/* The register-number argument of {start,stop}_memory is one byte, so
   the highest group number we can report things about is whatever fits
   in that byte.  */
#  define MAX_REGNUM 255

/* A pattern may nevertheless have more than MAX_REGNUM registers; the
   excess is simply ignored.  */
typedef unsigned int regnum_t;


/* Types and macros for the compile stack.  */

/* Offsets run both forwards and backwards, so this type has to cover
   -(MAX_BUF_SIZE - 1) through MAX_BUF_SIZE - 1.  `int' may be too
   narrow for that when sizeof (int) == 2, hence `long'.  */
typedef long pattern_offset_t;

/* One entry of the compile stack.  */
typedef struct
{
  pattern_offset_t begalt_offset;
  pattern_offset_t fixup_alt_jump;
  pattern_offset_t inner_group_offset;
  pattern_offset_t laststart_offset;
  regnum_t regnum;
} compile_stack_elt_t;


/* The compile stack itself: an array of entries plus bookkeeping.  */
typedef struct
{
  compile_stack_elt_t *stack;
  unsigned int size;
  unsigned int avail;           /* Offset of next open position.  */
} compile_stack_type;


#  define INIT_COMPILE_STACK_SIZE 32

/* True when no entry is in use / when every entry is in use.  Both
   refer to a variable named `compile_stack' at the point of use.  */
#  define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
#  define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)

/* The next available element.  */
#  define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])

# endif /* not DEFINED_ONCE */
2160
/* Set the bit for character C in the bitmap that starts at `b'.  C is
   converted to unsigned char, then byte C / BYTEWIDTH and bit
   C % BYTEWIDTH within that byte are selected.

   C is parenthesized at both uses so that the cast applies to the whole
   argument.  Without that, an expression argument has only its first
   operand cast, and a negative result would make the shift count
   negative.  */
# ifndef DEFINED_ONCE
#  define SET_LIST_BIT(c) \
  (b[((unsigned char) (c)) / BYTEWIDTH] \
   |= 1 << (((unsigned char) (c)) % BYTEWIDTH))
# endif /* not DEFINED_ONCE */
2167
/* Read the next unsigned decimal number from the uncompiled pattern
   into NUM.

   Characters are fetched with PATFETCH into `c' until the pattern ends
   (p == pend) or a non-digit has been fetched; that non-digit is left
   in `c'.  A negative NUM is reset to 0 before the first digit is
   added, so NUM keeps its incoming value when no digit is found.  Once
   NUM exceeds RE_DUP_MAX the remaining digits are still consumed but no
   longer accumulated.  */
# define GET_UNSIGNED_NUMBER(num) \
  { \
    for (;;) \
      { \
        if (p == pend) \
          break; \
        PATFETCH (c); \
        if (!(c >= '0' && c <= '9')) \
          break; \
        if ((num) > RE_DUP_MAX) \
          continue; \
        if ((num) < 0) \
          (num) = 0; \
        (num) = (num) * 10 + c - '0'; \
      } \
  }
2184
# ifndef DEFINED_ONCE
#  if defined _LIBC || defined WIDE_CHAR_SUPPORT
/* The GNU C library supports user-defined character classes and the
   functions from ISO C Amendment 1, so class names are looked up with
   wctype.  */
#   if defined CHARCLASS_NAME_MAX
#    define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
#   else
/* This should not happen, but some implementations might still lack
   CHARCLASS_NAME_MAX.  Fall back to a reasonable default.  */
#    define CHAR_CLASS_MAX_LENGTH 256
#   endif

#   if defined _LIBC
#    define IS_CHAR_CLASS(string) __wctype (string)
#   else
#    define IS_CHAR_CLASS(string) wctype (string)
#   endif
#  else
/* No wide-character support: only the fixed set of class names below
   is recognized.  The longest of them is `xdigit'.  */
#   define CHAR_CLASS_MAX_LENGTH 6

#   define IS_CHAR_CLASS(string) \
   (STREQ (string, "alpha") \
    || STREQ (string, "upper") \
    || STREQ (string, "lower") \
    || STREQ (string, "digit") \
    || STREQ (string, "alnum") \
    || STREQ (string, "xdigit") \
    || STREQ (string, "space") \
    || STREQ (string, "print") \
    || STREQ (string, "punct") \
    || STREQ (string, "graph") \
    || STREQ (string, "cntrl") \
    || STREQ (string, "blank"))
#  endif
# endif /* not DEFINED_ONCE */
2214
2215# ifndef MATCH_MAY_ALLOCATE
2216
2217/* If we cannot allocate large objects within re_match_2_internal,
2218 we make the fail stack and register vectors global.
2219 The fail stack, we grow to the maximum size when a regexp
2220 is compiled.
2221 The register vectors, we adjust in size each time we
2222 compile a regexp, according to the number of registers it needs. */
2223
2224static PREFIX(fail_stack_type) fail_stack;
2225
2226/* Size with which the following vectors are currently allocated.
2227 That is so we can make them bigger as needed,
2228 but never make them smaller. */
2229# ifdef DEFINED_ONCE
2230static int regs_allocated_size;
2231
2232static const char ** regstart, ** regend;
2233static const char ** old_regstart, ** old_regend;
2234static const char **best_regstart, **best_regend;
2235static const char **reg_dummy;
2236# endif /* DEFINED_ONCE */
2237
2238static PREFIX(register_info_type) *PREFIX(reg_info);
2239static PREFIX(register_info_type) *PREFIX(reg_info_dummy);
2240
2241/* Make the register vectors big enough for NUM_REGS registers,
2242 but don't make them smaller. */
2243
2244static void
2245PREFIX(regex_grow_registers) (int num_regs)
2246{
2247 if (num_regs > regs_allocated_size)
2248 {
2249 RETALLOC_IF (regstart, num_regs, const char *);
2250 RETALLOC_IF (regend, num_regs, const char *);
2251 RETALLOC_IF (old_regstart, num_regs, const char *);
2252 RETALLOC_IF (old_regend, num_regs, const char *);
2253 RETALLOC_IF (best_regstart, num_regs, const char *);
2254 RETALLOC_IF (best_regend, num_regs, const char *);
2255 RETALLOC_IF (PREFIX(reg_info), num_regs, PREFIX(register_info_type));
2256 RETALLOC_IF (reg_dummy, num_regs, const char *);
2257 RETALLOC_IF (PREFIX(reg_info_dummy), num_regs, PREFIX(register_info_type));
2258
2259 regs_allocated_size = num_regs;
2260 }
2261}
2262
2263# endif /* not MATCH_MAY_ALLOCATE */
2264
2265# ifndef DEFINED_ONCE
2266static boolean group_in_compile_stack (compile_stack_type
2267 compile_stack,
2268 regnum_t regnum);
2269# endif /* not DEFINED_ONCE */
2270
2271/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2272 Returns one of error codes defined in `regex.h', or zero for success.
2273
2274 Assumes the `allocated' (and perhaps `buffer') and `translate'
2275 fields are set in BUFP on entry.
2276
2277 If it succeeds, results are put in BUFP (if it returns an error, the
2278 contents of BUFP are undefined):
2279 `buffer' is the compiled pattern;
2280 `syntax' is set to SYNTAX;
2281 `used' is set to the length of the compiled pattern;
2282 `fastmap_accurate' is zero;
2283 `re_nsub' is the number of subexpressions in PATTERN;
2284 `not_bol' and `not_eol' are zero;
2285
2286 The `fastmap' and `newline_anchor' fields are neither
2287 examined nor set. */
2288
/* Return VALUE from the enclosing function after freeing the storage
   allocated for compilation: the compile stack and, in the WCHAR
   build, also `pattern', `mbs_offset' and `is_binary'.  Everything is
   freed before VALUE is evaluated.  */
# ifdef WCHAR
#  define FREE_STACK_RETURN(value) \
  do { \
    free (pattern); \
    free (mbs_offset); \
    free (is_binary); \
    free (compile_stack.stack); \
    return (value); \
  } while (0)
# else
#  define FREE_STACK_RETURN(value) \
  do { \
    free (compile_stack.stack); \
    return (value); \
  } while (0)
# endif /* WCHAR */
2297
2298static reg_errcode_t
2299PREFIX(regex_compile) (
2300 const char *ARG_PREFIX(pattern),
2301 size_t ARG_PREFIX(size),
2302 reg_syntax_t syntax,
2303 struct re_pattern_buffer *bufp)
2304{
2305 /* We fetch characters from PATTERN here. Even though PATTERN is
2306 `char *' (i.e., signed), we declare these variables as unsigned, so
2307 they can be reliably used as array indices. */
2308 register UCHAR_T c, c1;
2309
2310#ifdef WCHAR
2311 /* A temporary space to keep wchar_t pattern and compiled pattern. */
2312 CHAR_T *pattern, *COMPILED_BUFFER_VAR;
2313 size_t size;
2314 /* offset buffer for optimization. See convert_mbs_to_wc. */
2315 int *mbs_offset = NULL;
2316 /* It hold whether each wchar_t is binary data or not. */
2317 char *is_binary = NULL;
2318 /* A flag whether exactn is handling binary data or not. */
2319 char is_exactn_bin = FALSE;
2320#endif /* WCHAR */
2321
2322 /* A random temporary spot in PATTERN. */
2323 const CHAR_T *p1;
2324
2325 /* Points to the end of the buffer, where we should append. */
2326 register UCHAR_T *b;
2327
2328 /* Keeps track of unclosed groups. */
2329 compile_stack_type compile_stack;
2330
2331 /* Points to the current (ending) position in the pattern. */
2332#ifdef WCHAR
2333 const CHAR_T *p;
2334 const CHAR_T *pend;
2335#else /* BYTE */
2336 const CHAR_T *p = pattern;
2337 const CHAR_T *pend = pattern + size;
2338#endif /* WCHAR */
2339
2340 /* How to translate the characters in the pattern. */
2341 RE_TRANSLATE_TYPE translate = bufp->translate;
2342
2343 /* Address of the count-byte of the most recently inserted `exactn'
2344 command. This makes it possible to tell if a new exact-match
2345 character can be added to that command or if the character requires
2346 a new `exactn' command. */
2347 UCHAR_T *pending_exact = 0;
2348
2349 /* Address of start of the most recently finished expression.
2350 This tells, e.g., postfix * where to find the start of its
2351 operand. Reset at the beginning of groups and alternatives. */
2352 UCHAR_T *laststart = 0;
2353
2354 /* Address of beginning of regexp, or inside of last group. */
2355 UCHAR_T *begalt;
2356
2357 /* Address of the place where a forward jump should go to the end of
2358 the containing expression. Each alternative of an `or' -- except the
2359 last -- ends with a forward jump of this sort. */
2360 UCHAR_T *fixup_alt_jump = 0;
2361
2362 /* Counts open-groups as they are encountered. Remembered for the
2363 matching close-group on the compile stack, so the same register
2364 number is put in the stop_memory as the start_memory. */
2365 regnum_t regnum = 0;
2366
2367#ifdef WCHAR
2368 /* Initialize the wchar_t PATTERN and offset_buffer. */
2369 p = pend = pattern = TALLOC(csize + 1, CHAR_T);
2370 mbs_offset = TALLOC(csize + 1, int);
2371 is_binary = TALLOC(csize + 1, char);
2372 if (pattern == NULL || mbs_offset == NULL || is_binary == NULL)
2373 {
2374 free(pattern);
2375 free(mbs_offset);
2376 free(is_binary);
2377 return REG_ESPACE;
2378 }
2379 pattern[csize] = L'\0'; /* sentinel */
2380 size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary);
2381 pend = p + size;
2382 if (size < 0)
2383 {
2384 free(pattern);
2385 free(mbs_offset);
2386 free(is_binary);
2387 return REG_BADPAT;
2388 }
2389#endif
2390
2391#ifdef DEBUG
2392 DEBUG_PRINT1 ("\nCompiling pattern: ");
2393 if (debug)
2394 {
2395 unsigned debug_count;
2396
2397 for (debug_count = 0; debug_count < size; debug_count++)
2398 PUT_CHAR (pattern[debug_count]);
2399 putchar ('\n');
2400 }
2401#endif /* DEBUG */
2402
2403 /* Initialize the compile stack. */
2404 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2405 if (compile_stack.stack == NULL)
2406 {
2407#ifdef WCHAR
2408 free(pattern);
2409 free(mbs_offset);
2410 free(is_binary);
2411#endif
2412 return REG_ESPACE;
2413 }
2414
2415 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2416 compile_stack.avail = 0;
2417
2418 /* Initialize the pattern buffer. */
2419 bufp->syntax = syntax;
2420 bufp->fastmap_accurate = 0;
2421 bufp->not_bol = bufp->not_eol = 0;
2422
2423 /* Set `used' to zero, so that if we return an error, the pattern
2424 printer (for debugging) will think there's no pattern. We reset it
2425 at the end. */
2426 bufp->used = 0;
2427
2428 /* Always count groups, whether or not bufp->no_sub is set. */
2429 bufp->re_nsub = 0;
2430
2431#if !defined emacs && !defined SYNTAX_TABLE
2432 /* Initialize the syntax table. */
2433 init_syntax_once ();
2434#endif
2435
2436 if (bufp->allocated == 0)
2437 {
2438 if (bufp->buffer)
2439 { /* If zero allocated, but buffer is non-null, try to realloc
2440 enough space. This loses if buffer's address is bogus, but
2441 that is the user's responsibility. */
2442#ifdef WCHAR
2443 /* Free bufp->buffer and allocate an array for wchar_t pattern
2444 buffer. */
2445 free(bufp->buffer);
2446 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(UCHAR_T),
2447 UCHAR_T);
2448#else
2449 RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, UCHAR_T);
2450#endif /* WCHAR */
2451 }
2452 else
2453 { /* Caller did not allocate a buffer. Do it for them. */
2454 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(UCHAR_T),
2455 UCHAR_T);
2456 }
2457
2458 if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE);
2459#ifdef WCHAR
2460 bufp->buffer = (char*)COMPILED_BUFFER_VAR;
2461#endif /* WCHAR */
2462 bufp->allocated = INIT_BUF_SIZE;
2463 }
2464#ifdef WCHAR
2465 else
2466 COMPILED_BUFFER_VAR = (UCHAR_T*) bufp->buffer;
2467#endif
2468
2469 begalt = b = COMPILED_BUFFER_VAR;
2470
2471 /* Loop through the uncompiled pattern until we're at the end. */
2472 while (p != pend)
2473 {
2474 PATFETCH (c);
2475
2476 switch (c)
2477 {
2478 case '^':
2479 {
2480 if ( /* If at start of pattern, it's an operator. */
2481 p == pattern + 1
2482 /* If context independent, it's an operator. */
2483 || syntax & RE_CONTEXT_INDEP_ANCHORS
2484 /* Otherwise, depends on what's come before. */
2485 || PREFIX(at_begline_loc_p) (pattern, p, syntax))
2486 BUF_PUSH (begline);
2487 else
2488 goto normal_char;
2489 }
2490 break;
2491
2492
2493 case '$':
2494 {
2495 if ( /* If at end of pattern, it's an operator. */
2496 p == pend
2497 /* If context independent, it's an operator. */
2498 || syntax & RE_CONTEXT_INDEP_ANCHORS
2499 /* Otherwise, depends on what's next. */
2500 || PREFIX(at_endline_loc_p) (p, pend, syntax))
2501 BUF_PUSH (endline);
2502 else
2503 goto normal_char;
2504 }
2505 break;
2506
2507
2508 case '+':
2509 case '?':
2510 if ((syntax & RE_BK_PLUS_QM)
2511 || (syntax & RE_LIMITED_OPS))
2512 goto normal_char;
2513 handle_plus:
2514 case '*':
2515 /* If there is no previous pattern... */
2516 if (!laststart)
2517 {
2518 if (syntax & RE_CONTEXT_INVALID_OPS)
2519 FREE_STACK_RETURN (REG_BADRPT);
2520 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2521 goto normal_char;
2522 }
2523
2524 {
2525 /* Are we optimizing this jump? */
2526 boolean keep_string_p = false;
2527
2528 /* 1 means zero (many) matches is allowed. */
2529 char zero_times_ok = 0, many_times_ok = 0;
2530
2531 /* If there is a sequence of repetition chars, collapse it
2532 down to just one (the right one). We can't combine
2533 interval operators with these because of, e.g., `a{2}*',
2534 which should only match an even number of `a's. */
2535
2536 for (;;)
2537 {
2538 zero_times_ok |= c != '+';
2539 many_times_ok |= c != '?';
2540
2541 if (p == pend)
2542 break;
2543
2544 PATFETCH (c);
2545
2546 if (c == '*'
2547 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2548 ;
2549
2550 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2551 {
2552 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2553
2554 PATFETCH (c1);
2555 if (!(c1 == '+' || c1 == '?'))
2556 {
2557 PATUNFETCH;
2558 PATUNFETCH;
2559 break;
2560 }
2561
2562 c = c1;
2563 }
2564 else
2565 {
2566 PATUNFETCH;
2567 break;
2568 }
2569
2570 /* If we get here, we found another repeat character. */
2571 }
2572
2573 /* Star, etc. applied to an empty pattern is equivalent
2574 to an empty pattern. */
2575 if (!laststart)
2576 break;
2577
2578 /* Now we know whether or not zero matches is allowed
2579 and also whether or not two or more matches is allowed. */
2580 if (many_times_ok)
2581 { /* More than one repetition is allowed, so put in at the
2582 end a backward relative jump from `b' to before the next
2583 jump we're going to put in below (which jumps from
2584 laststart to after this jump).
2585
2586 But if we are at the `*' in the exact sequence `.*\n',
2587 insert an unconditional jump backwards to the .,
2588 instead of the beginning of the loop. This way we only
2589 push a failure point once, instead of every time
2590 through the loop. */
2591 assert (p - 1 > pattern);
2592
2593 /* Allocate the space for the jump. */
2594 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2595
2596 /* We know we are not at the first character of the pattern,
2597 because laststart was nonzero. And we've already
2598 incremented `p', by the way, to be the character after
2599 the `*'. Do we have to do something analogous here
2600 for null bytes, because of RE_DOT_NOT_NULL? */
2601 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
2602 && zero_times_ok
2603 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
2604 && !(syntax & RE_DOT_NEWLINE))
2605 { /* We have .*\n. */
2606 STORE_JUMP (jump, b, laststart);
2607 keep_string_p = true;
2608 }
2609 else
2610 /* Anything else. */
2611 STORE_JUMP (maybe_pop_jump, b, laststart -
2612 (1 + OFFSET_ADDRESS_SIZE));
2613
2614 /* We've added more stuff to the buffer. */
2615 b += 1 + OFFSET_ADDRESS_SIZE;
2616 }
2617
2618 /* On failure, jump from laststart to b + 3, which will be the
2619 end of the buffer after this jump is inserted. */
2620 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of
2621 'b + 3'. */
2622 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2623 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2624 : on_failure_jump,
2625 laststart, b + 1 + OFFSET_ADDRESS_SIZE);
2626 pending_exact = 0;
2627 b += 1 + OFFSET_ADDRESS_SIZE;
2628
2629 if (!zero_times_ok)
2630 {
2631 /* At least one repetition is required, so insert a
2632 `dummy_failure_jump' before the initial
2633 `on_failure_jump' instruction of the loop. This
2634 effects a skip over that instruction the first time
2635 we hit that loop. */
2636 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2637 INSERT_JUMP (dummy_failure_jump, laststart, laststart +
2638 2 + 2 * OFFSET_ADDRESS_SIZE);
2639 b += 1 + OFFSET_ADDRESS_SIZE;
2640 }
2641 }
2642 break;
2643
2644
2645 case '.':
2646 laststart = b;
2647 BUF_PUSH (anychar);
2648 break;
2649
2650
2651 case '[':
2652 {
2653 boolean had_char_class = false;
2654#ifdef WCHAR
2655 CHAR_T range_start = 0xffffffff;
2656#else
2657 unsigned int range_start = 0xffffffff;
2658#endif
2659 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2660
2661#ifdef WCHAR
2662 /* We assume a charset(_not) structure as a wchar_t array.
2663 charset[0] = (re_opcode_t) charset(_not)
2664 charset[1] = l (= length of char_classes)
2665 charset[2] = m (= length of collating_symbols)
2666 charset[3] = n (= length of equivalence_classes)
2667 charset[4] = o (= length of char_ranges)
2668 charset[5] = p (= length of chars)
2669
2670 charset[6] = char_class (wctype_t)
2671 charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t)
2672 ...
2673 charset[l+5] = char_class (wctype_t)
2674
2675 charset[l+6] = collating_symbol (wchar_t)
2676 ...
2677 charset[l+m+5] = collating_symbol (wchar_t)
2678 ifdef _LIBC we use the index if
2679 _NL_COLLATE_SYMB_EXTRAMB instead of
2680 wchar_t string.
2681
2682 charset[l+m+6] = equivalence_classes (wchar_t)
2683 ...
2684 charset[l+m+n+5] = equivalence_classes (wchar_t)
2685 ifdef _LIBC we use the index in
2686 _NL_COLLATE_WEIGHT instead of
2687 wchar_t string.
2688
2689 charset[l+m+n+6] = range_start
2690 charset[l+m+n+7] = range_end
2691 ...
2692 charset[l+m+n+2o+4] = range_start
2693 charset[l+m+n+2o+5] = range_end
2694 ifdef _LIBC we use the value looked up
2695 in _NL_COLLATE_COLLSEQ instead of
2696 wchar_t character.
2697
2698 charset[l+m+n+2o+6] = char
2699 ...
2700 charset[l+m+n+2o+p+5] = char
2701
2702 */
2703
2704 /* We need at least 6 spaces: the opcode, the length of
2705 char_classes, the length of collating_symbols, the length of
2706 equivalence_classes, the length of char_ranges, the length of
2707 chars. */
2708 GET_BUFFER_SPACE (6);
2709
2710 /* Save b as laststart. And We use laststart as the pointer
2711 to the first element of the charset here.
2712 In other words, laststart[i] indicates charset[i]. */
2713 laststart = b;
2714
2715 /* We test `*p == '^' twice, instead of using an if
2716 statement, so we only need one BUF_PUSH. */
2717 BUF_PUSH (*p == '^' ? charset_not : charset);
2718 if (*p == '^')
2719 p++;
2720
2721 /* Push the length of char_classes, the length of
2722 collating_symbols, the length of equivalence_classes, the
2723 length of char_ranges and the length of chars. */
2724 BUF_PUSH_3 (0, 0, 0);
2725 BUF_PUSH_2 (0, 0);
2726
2727 /* Remember the first position in the bracket expression. */
2728 p1 = p;
2729
2730 /* charset_not matches newline according to a syntax bit. */
2731 if ((re_opcode_t) b[-6] == charset_not
2732 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2733 {
2734 BUF_PUSH('\n');
2735 laststart[5]++; /* Update the length of characters */
2736 }
2737
2738 /* Read in characters and ranges, setting map bits. */
2739 for (;;)
2740 {
2741 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2742
2743 PATFETCH (c);
2744
2745 /* \ might escape characters inside [...] and [^...]. */
2746 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2747 {
2748 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2749
2750 PATFETCH (c1);
2751 BUF_PUSH(c1);
2752 laststart[5]++; /* Update the length of chars */
2753 range_start = c1;
2754 continue;
2755 }
2756
2757 /* Could be the end of the bracket expression. If it's
2758 not (i.e., when the bracket expression is `[]' so
2759 far), the ']' character bit gets set way below. */
2760 if (c == ']' && p != p1 + 1)
2761 break;
2762
2763 /* Look ahead to see if it's a range when the last thing
2764 was a character class. */
2765 if (had_char_class && c == '-' && *p != ']')
2766 FREE_STACK_RETURN (REG_ERANGE);
2767
2768 /* Look ahead to see if it's a range when the last thing
2769 was a character: if this is a hyphen not at the
2770 beginning or the end of a list, then it's the range
2771 operator. */
2772 if (c == '-'
2773 && !(p - 2 >= pattern && p[-2] == '[')
2774 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2775 && *p != ']')
2776 {
2777 reg_errcode_t ret;
2778 /* Allocate the space for range_start and range_end. */
2779 GET_BUFFER_SPACE (2);
2780 /* Update the pointer to indicate end of buffer. */
2781 b += 2;
2782 ret = wcs_compile_range (range_start, &p, pend, translate,
2783 syntax, b, laststart);
2784 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2785 range_start = 0xffffffff;
2786 }
2787 else if (p[0] == '-' && p[1] != ']')
2788 { /* This handles ranges made up of characters only. */
2789 reg_errcode_t ret;
2790
2791 /* Move past the `-'. */
2792 PATFETCH (c1);
2793 /* Allocate the space for range_start and range_end. */
2794 GET_BUFFER_SPACE (2);
2795 /* Update the pointer to indicate end of buffer. */
2796 b += 2;
2797 ret = wcs_compile_range (c, &p, pend, translate, syntax, b,
2798 laststart);
2799 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2800 range_start = 0xffffffff;
2801 }
2802
2803 /* See if we're at the beginning of a possible character
2804 class. */
2805 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2806 { /* Leave room for the null. */
2807 char str[CHAR_CLASS_MAX_LENGTH + 1];
2808
2809 PATFETCH (c);
2810 c1 = 0;
2811
2812 /* If pattern is `[[:'. */
2813 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2814
2815 for (;;)
2816 {
2817 PATFETCH (c);
2818 if ((c == ':' && *p == ']') || p == pend)
2819 break;
2820 if (c1 < CHAR_CLASS_MAX_LENGTH)
2821 str[c1++] = c;
2822 else
2823 /* This is in any case an invalid class name. */
2824 str[0] = '\0';
2825 }
2826 str[c1] = '\0';
2827
2828 /* If isn't a word bracketed by `[:' and `:]':
2829 undo the ending character, the letters, and leave
2830 the leading `:' and `[' (but store them as character). */
2831 if (c == ':' && *p == ']')
2832 {
2833 wctype_t wt;
2834 uintptr_t alignedp;
2835
2836 /* Query the character class as wctype_t. */
2837 wt = IS_CHAR_CLASS (str);
2838 if (wt == 0)
2839 FREE_STACK_RETURN (REG_ECTYPE);
2840
2841 /* Throw away the ] at the end of the character
2842 class. */
2843 PATFETCH (c);
2844
2845 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2846
2847 /* Allocate the space for character class. */
2848 GET_BUFFER_SPACE(CHAR_CLASS_SIZE);
2849 /* Update the pointer to indicate end of buffer. */
2850 b += CHAR_CLASS_SIZE;
2851 /* Move data which follow character classes
2852 not to violate the data. */
2853 insert_space(CHAR_CLASS_SIZE,
2854 laststart + 6 + laststart[1],
2855 b - 1);
2856 alignedp = ((uintptr_t)(laststart + 6 + laststart[1])
2857 + __alignof__(wctype_t) - 1)
2858 & ~(uintptr_t)(__alignof__(wctype_t) - 1);
2859 /* Store the character class. */
2860 *((wctype_t*)alignedp) = wt;
2861 /* Update length of char_classes */
2862 laststart[1] += CHAR_CLASS_SIZE;
2863
2864 had_char_class = true;
2865 }
2866 else
2867 {
2868 c1++;
2869 while (c1--)
2870 PATUNFETCH;
2871 BUF_PUSH ('[');
2872 BUF_PUSH (':');
2873 laststart[5] += 2; /* Update the length of characters */
2874 range_start = ':';
2875 had_char_class = false;
2876 }
2877 }
2878 else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '='
2879 || *p == '.'))
2880 {
2881 CHAR_T str[128]; /* Should be large enough. */
2882 CHAR_T delim = *p; /* '=' or '.' */
2883# ifdef _LIBC
2884 uint32_t nrules =
2885 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2886# endif
2887 PATFETCH (c);
2888 c1 = 0;
2889
2890 /* If pattern is `[[=' or '[[.'. */
2891 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2892
2893 for (;;)
2894 {
2895 PATFETCH (c);
2896 if ((c == delim && *p == ']') || p == pend)
2897 break;
2898 if (c1 < sizeof (str) - 1)
2899 str[c1++] = c;
2900 else
2901 /* This is in any case an invalid class name. */
2902 str[0] = '\0';
2903 }
2904 str[c1] = '\0';
2905
2906 if (c == delim && *p == ']' && str[0] != '\0')
2907 {
2908 unsigned int i, offset;
2909 /* If we have no collation data we use the default
2910 collation in which each character is in a class
2911 by itself. It also means that ASCII is the
2912 character set and therefore we cannot have character
2913 with more than one byte in the multibyte
2914 representation. */
2915
2916 /* If not defined _LIBC, we push the name and
2917 `\0' for the sake of matching performance. */
2918 int datasize = c1 + 1;
2919
2920# ifdef _LIBC
2921 int32_t idx = 0;
2922 if (nrules == 0)
2923# endif
2924 {
2925 if (c1 != 1)
2926 FREE_STACK_RETURN (REG_ECOLLATE);
2927 }
2928# ifdef _LIBC
2929 else
2930 {
2931 const int32_t *table;
2932 const int32_t *weights;
2933 const int32_t *extra;
2934 const int32_t *indirect;
2935 wint_t *cp;
2936
2937 /* This #include defines a local function! */
2938# include <locale/weightwc.h>
2939
2940 if(delim == '=')
2941 {
2942 /* We push the index for equivalence class. */
2943 cp = (wint_t*)str;
2944
2945 table = (const int32_t *)
2946 _NL_CURRENT (LC_COLLATE,
2947 _NL_COLLATE_TABLEWC);
2948 weights = (const int32_t *)
2949 _NL_CURRENT (LC_COLLATE,
2950 _NL_COLLATE_WEIGHTWC);
2951 extra = (const int32_t *)
2952 _NL_CURRENT (LC_COLLATE,
2953 _NL_COLLATE_EXTRAWC);
2954 indirect = (const int32_t *)
2955 _NL_CURRENT (LC_COLLATE,
2956 _NL_COLLATE_INDIRECTWC);
2957
2958 idx = findidx ((const wint_t**)&cp);
2959 if (idx == 0 || cp < (wint_t*) str + c1)
2960 /* This is no valid character. */
2961 FREE_STACK_RETURN (REG_ECOLLATE);
2962
2963 str[0] = (wchar_t)idx;
2964 }
2965 else /* delim == '.' */
2966 {
2967 /* We push collation sequence value
2968 for collating symbol. */
2969 int32_t table_size;
2970 const int32_t *symb_table;
2971 const unsigned char *extra;
2972 int32_t idx;
2973 int32_t elem;
2974 int32_t second;
2975 int32_t hash;
2976 char char_str[c1];
2977
2978 /* We have to convert the name to a single-byte
2979 string. This is possible since the names
2980 consist of ASCII characters and the internal
2981 representation is UCS4. */
2982 for (i = 0; i < c1; ++i)
2983 char_str[i] = str[i];
2984
2985 table_size =
2986 _NL_CURRENT_WORD (LC_COLLATE,
2987 _NL_COLLATE_SYMB_HASH_SIZEMB);
2988 symb_table = (const int32_t *)
2989 _NL_CURRENT (LC_COLLATE,
2990 _NL_COLLATE_SYMB_TABLEMB);
2991 extra = (const unsigned char *)
2992 _NL_CURRENT (LC_COLLATE,
2993 _NL_COLLATE_SYMB_EXTRAMB);
2994
2995 /* Locate the character in the hashing table. */
2996 hash = elem_hash (char_str, c1);
2997
2998 idx = 0;
2999 elem = hash % table_size;
3000 second = hash % (table_size - 2);
3001 while (symb_table[2 * elem] != 0)
3002 {
3003 /* First compare the hashing value. */
3004 if (symb_table[2 * elem] == hash
3005 && c1 == extra[symb_table[2 * elem + 1]]
3006 && memcmp (char_str,
3007 &extra[symb_table[2 * elem + 1]
3008 + 1], c1) == 0)
3009 {
3010 /* Yep, this is the entry. */
3011 idx = symb_table[2 * elem + 1];
3012 idx += 1 + extra[idx];
3013 break;
3014 }
3015
3016 /* Next entry. */
3017 elem += second;
3018 }
3019
3020 if (symb_table[2 * elem] != 0)
3021 {
3022 /* Compute the index of the byte sequence
3023 in the table. */
3024 idx += 1 + extra[idx];
3025 /* Adjust for the alignment. */
3026 idx = (idx + 3) & ~3;
3027
3028 str[0] = (wchar_t) idx + 4;
3029 }
3030 else if (symb_table[2 * elem] == 0 && c1 == 1)
3031 {
3032 /* No valid character. Match it as a
3033 single byte character. */
3034 had_char_class = false;
3035 BUF_PUSH(str[0]);
3036 /* Update the length of characters */
3037 laststart[5]++;
3038 range_start = str[0];
3039
3040 /* Throw away the ] at the end of the
3041 collating symbol. */
3042 PATFETCH (c);
3043 /* exit from the switch block. */
3044 continue;
3045 }
3046 else
3047 FREE_STACK_RETURN (REG_ECOLLATE);
3048 }
3049 datasize = 1;
3050 }
3051# endif
3052 /* Throw away the ] at the end of the equivalence
3053 class (or collating symbol). */
3054 PATFETCH (c);
3055
3056 /* Allocate the space for the equivalence class
3057 (or collating symbol) (and '\0' if needed). */
3058 GET_BUFFER_SPACE(datasize);
3059 /* Update the pointer to indicate end of buffer. */
3060 b += datasize;
3061
3062 if (delim == '=')
3063 { /* equivalence class */
3064 /* Calculate the offset of char_ranges,
3065 which is next to equivalence_classes. */
3066 offset = laststart[1] + laststart[2]
3067 + laststart[3] +6;
3068 /* Insert space. */
3069 insert_space(datasize, laststart + offset, b - 1);
3070
3071 /* Write the equivalence_class and \0. */
3072 for (i = 0 ; i < datasize ; i++)
3073 laststart[offset + i] = str[i];
3074
3075 /* Update the length of equivalence_classes. */
3076 laststart[3] += datasize;
3077 had_char_class = true;
3078 }
3079 else /* delim == '.' */
3080 { /* collating symbol */
3081 /* Calculate the offset of the equivalence_classes,
3082 which is next to collating_symbols. */
3083 offset = laststart[1] + laststart[2] + 6;
3084 /* Insert space and write the collating_symbol
3085 and \0. */
3086 insert_space(datasize, laststart + offset, b-1);
3087 for (i = 0 ; i < datasize ; i++)
3088 laststart[offset + i] = str[i];
3089
3090 /* In re_match_2_internal if range_start < -1, we
3091 assume -range_start is the offset of the
3092 collating symbol which is specified as
3093 the character of the range start. So we assign
3094 -(laststart[1] + laststart[2] + 6) to
3095 range_start. */
3096 range_start = -(laststart[1] + laststart[2] + 6);
3097 /* Update the length of collating_symbol. */
3098 laststart[2] += datasize;
3099 had_char_class = false;
3100 }
3101 }
3102 else
3103 {
3104 c1++;
3105 while (c1--)
3106 PATUNFETCH;
3107 BUF_PUSH ('[');
3108 BUF_PUSH (delim);
3109 laststart[5] += 2; /* Update the length of characters */
3110 range_start = delim;
3111 had_char_class = false;
3112 }
3113 }
3114 else
3115 {
3116 had_char_class = false;
3117 BUF_PUSH(c);
3118 laststart[5]++; /* Update the length of characters */
3119 range_start = c;
3120 }
3121 }
3122
3123#else /* BYTE */
3124 /* Ensure that we have enough space to push a charset: the
3125 opcode, the length count, and the bitset; 34 bytes in all. */
3126 GET_BUFFER_SPACE (34);
3127
3128 laststart = b;
3129
3130 /* We test `*p == '^' twice, instead of using an if
3131 statement, so we only need one BUF_PUSH. */
3132 BUF_PUSH (*p == '^' ? charset_not : charset);
3133 if (*p == '^')
3134 p++;
3135
3136 /* Remember the first position in the bracket expression. */
3137 p1 = p;
3138
3139 /* Push the number of bytes in the bitmap. */
3140 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
3141
3142 /* Clear the whole map. */
3143 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
3144
3145 /* charset_not matches newline according to a syntax bit. */
3146 if ((re_opcode_t) b[-2] == charset_not
3147 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
3148 SET_LIST_BIT ('\n');
3149
3150 /* Read in characters and ranges, setting map bits. */
3151 for (;;)
3152 {
3153 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3154
3155 PATFETCH (c);
3156
3157 /* \ might escape characters inside [...] and [^...]. */
3158 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
3159 {
3160 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3161
3162 PATFETCH (c1);
3163 SET_LIST_BIT (c1);
3164 range_start = c1;
3165 continue;
3166 }
3167
3168 /* Could be the end of the bracket expression. If it's
3169 not (i.e., when the bracket expression is `[]' so
3170 far), the ']' character bit gets set way below. */
3171 if (c == ']' && p != p1 + 1)
3172 break;
3173
3174 /* Look ahead to see if it's a range when the last thing
3175 was a character class. */
3176 if (had_char_class && c == '-' && *p != ']')
3177 FREE_STACK_RETURN (REG_ERANGE);
3178
3179 /* Look ahead to see if it's a range when the last thing
3180 was a character: if this is a hyphen not at the
3181 beginning or the end of a list, then it's the range
3182 operator. */
3183 if (c == '-'
3184 && !(p - 2 >= pattern && p[-2] == '[')
3185 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
3186 && *p != ']')
3187 {
3188 reg_errcode_t ret
3189 = byte_compile_range (range_start, &p, pend, translate,
3190 syntax, b);
3191 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3192 range_start = 0xffffffff;
3193 }
3194
3195 else if (p[0] == '-' && p[1] != ']')
3196 { /* This handles ranges made up of characters only. */
3197 reg_errcode_t ret;
3198
3199 /* Move past the `-'. */
3200 PATFETCH (c1);
3201
3202 ret = byte_compile_range (c, &p, pend, translate, syntax, b);
3203 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3204 range_start = 0xffffffff;
3205 }
3206
3207 /* See if we're at the beginning of a possible character
3208 class. */
3209
3210 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
3211 { /* Leave room for the null. */
3212 char str[CHAR_CLASS_MAX_LENGTH + 1];
3213
3214 PATFETCH (c);
3215 c1 = 0;
3216
3217 /* If pattern is `[[:'. */
3218 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3219
3220 for (;;)
3221 {
3222 PATFETCH (c);
3223 if ((c == ':' && *p == ']') || p == pend)
3224 break;
3225#if CHAR_CLASS_MAX_LENGTH != 256
3226 if (c1 < CHAR_CLASS_MAX_LENGTH)
3227 str[c1++] = c;
3228 else
3229 /* This is in any case an invalid class name. */
3230 str[0] = '\0';
3231#else
3232 str[c1++] = c;
3233#endif
3234 }
3235 str[c1] = '\0';
3236
3237 /* If isn't a word bracketed by `[:' and `:]':
3238 undo the ending character, the letters, and leave
3239 the leading `:' and `[' (but set bits for them). */
3240 if (c == ':' && *p == ']')
3241 {
3242# if defined _LIBC || defined WIDE_CHAR_SUPPORT
3243 boolean is_lower = STREQ (str, "lower");
3244 boolean is_upper = STREQ (str, "upper");
3245 wctype_t wt;
3246 int ch;
3247
3248 wt = IS_CHAR_CLASS (str);
3249 if (wt == 0)
3250 FREE_STACK_RETURN (REG_ECTYPE);
3251
3252 /* Throw away the ] at the end of the character
3253 class. */
3254 PATFETCH (c);
3255
3256 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3257
3258 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
3259 {
3260# ifdef _LIBC
3261 if (__iswctype (__btowc (ch), wt))
3262 SET_LIST_BIT (ch);
3263# else
3264 if (iswctype (btowc (ch), wt))
3265 SET_LIST_BIT (ch);
3266# endif
3267
3268 if (translate && (is_upper || is_lower)
3269 && (ISUPPER (ch) || ISLOWER (ch)))
3270 SET_LIST_BIT (ch);
3271 }
3272
3273 had_char_class = true;
3274# else
3275 int ch;
3276 boolean is_alnum = STREQ (str, "alnum");
3277 boolean is_alpha = STREQ (str, "alpha");
3278 boolean is_blank = STREQ (str, "blank");
3279 boolean is_cntrl = STREQ (str, "cntrl");
3280 boolean is_digit = STREQ (str, "digit");
3281 boolean is_graph = STREQ (str, "graph");
3282 boolean is_lower = STREQ (str, "lower");
3283 boolean is_print = STREQ (str, "print");
3284 boolean is_punct = STREQ (str, "punct");
3285 boolean is_space = STREQ (str, "space");
3286 boolean is_upper = STREQ (str, "upper");
3287 boolean is_xdigit = STREQ (str, "xdigit");
3288
3289 if (!IS_CHAR_CLASS (str))
3290 FREE_STACK_RETURN (REG_ECTYPE);
3291
3292 /* Throw away the ] at the end of the character
3293 class. */
3294 PATFETCH (c);
3295
3296 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3297
3298 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
3299 {
3300 /* This was split into 3 if's to
3301 avoid an arbitrary limit in some compiler. */
3302 if ( (is_alnum && ISALNUM (ch))
3303 || (is_alpha && ISALPHA (ch))
3304 || (is_blank && ISBLANK (ch))
3305 || (is_cntrl && ISCNTRL (ch)))
3306 SET_LIST_BIT (ch);
3307 if ( (is_digit && ISDIGIT (ch))
3308 || (is_graph && ISGRAPH (ch))
3309 || (is_lower && ISLOWER (ch))
3310 || (is_print && ISPRINT (ch)))
3311 SET_LIST_BIT (ch);
3312 if ( (is_punct && ISPUNCT (ch))
3313 || (is_space && ISSPACE (ch))
3314 || (is_upper && ISUPPER (ch))
3315 || (is_xdigit && ISXDIGIT (ch)))
3316 SET_LIST_BIT (ch);
3317 if ( translate && (is_upper || is_lower)
3318 && (ISUPPER (ch) || ISLOWER (ch)))
3319 SET_LIST_BIT (ch);
3320 }
3321 had_char_class = true;
3322# endif /* libc || wctype.h */
3323 }
3324 else
3325 {
3326 c1++;
3327 while (c1--)
3328 PATUNFETCH;
3329 SET_LIST_BIT ('[');
3330 SET_LIST_BIT (':');
3331 range_start = ':';
3332 had_char_class = false;
3333 }
3334 }
3335 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=')
3336 {
3337 unsigned char str[MB_LEN_MAX + 1];
3338# ifdef _LIBC
3339 uint32_t nrules =
3340 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3341# endif
3342
3343 PATFETCH (c);
3344 c1 = 0;
3345
3346 /* If pattern is `[[='. */
3347 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3348
3349 for (;;)
3350 {
3351 PATFETCH (c);
3352 if ((c == '=' && *p == ']') || p == pend)
3353 break;
3354 if (c1 < MB_LEN_MAX)
3355 str[c1++] = c;
3356 else
3357 /* This is in any case an invalid class name. */
3358 str[0] = '\0';
3359 }
3360 str[c1] = '\0';
3361
3362 if (c == '=' && *p == ']' && str[0] != '\0')
3363 {
3364 /* If we have no collation data we use the default
3365 collation in which each character is in a class
3366 by itself. It also means that ASCII is the
3367 character set and therefore we cannot have character
3368 with more than one byte in the multibyte
3369 representation. */
3370# ifdef _LIBC
3371 if (nrules == 0)
3372# endif
3373 {
3374 if (c1 != 1)
3375 FREE_STACK_RETURN (REG_ECOLLATE);
3376
3377 /* Throw away the ] at the end of the equivalence
3378 class. */
3379 PATFETCH (c);
3380
3381 /* Set the bit for the character. */
3382 SET_LIST_BIT (str[0]);
3383 }
3384# ifdef _LIBC
3385 else
3386 {
3387 /* Try to match the byte sequence in `str' against
3388 those known to the collate implementation.
3389 First find out whether the bytes in `str' are
3390 actually from exactly one character. */
3391 const int32_t *table;
3392 const unsigned char *weights;
3393 const unsigned char *extra;
3394 const int32_t *indirect;
3395 int32_t idx;
3396 const unsigned char *cp = str;
3397 int ch;
3398
3399 /* This #include defines a local function! */
3400# include <locale/weight.h>
3401
3402 table = (const int32_t *)
3403 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3404 weights = (const unsigned char *)
3405 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3406 extra = (const unsigned char *)
3407 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3408 indirect = (const int32_t *)
3409 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3410
3411 idx = findidx (&cp);
3412 if (idx == 0 || cp < str + c1)
3413 /* This is no valid character. */
3414 FREE_STACK_RETURN (REG_ECOLLATE);
3415
3416 /* Throw away the ] at the end of the equivalence
3417 class. */
3418 PATFETCH (c);
3419
3420 /* Now we have to go through the whole table
3421 and find all characters which have the same
3422 first level weight.
3423
3424 XXX Note that this is not entirely correct.
3425 we would have to match multibyte sequences
3426 but this is not possible with the current
3427 implementation. */
3428 for (ch = 1; ch < 256; ++ch)
3429 /* XXX This test would have to be changed if we
3430 would allow matching multibyte sequences. */
3431 if (table[ch] > 0)
3432 {
3433 int32_t idx2 = table[ch];
3434 size_t len = weights[idx2];
3435
3436 /* Test whether the lengths match. */
3437 if (weights[idx] == len)
3438 {
3439 /* They do. Now compare the bytes of
3440 the weight. */
3441 size_t cnt = 0;
3442
3443 while (cnt < len
3444 && (weights[idx + 1 + cnt]
3445 == weights[idx2 + 1 + cnt]))
3446 ++cnt;
3447
3448 if (cnt == len)
3449 /* They match. Mark the character as
3450 acceptable. */
3451 SET_LIST_BIT (ch);
3452 }
3453 }
3454 }
3455# endif
3456 had_char_class = true;
3457 }
3458 else
3459 {
3460 c1++;
3461 while (c1--)
3462 PATUNFETCH;
3463 SET_LIST_BIT ('[');
3464 SET_LIST_BIT ('=');
3465 range_start = '=';
3466 had_char_class = false;
3467 }
3468 }
3469 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.')
3470 {
3471 unsigned char str[128]; /* Should be large enough. */
3472# ifdef _LIBC
3473 uint32_t nrules =
3474 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3475# endif
3476
3477 PATFETCH (c);
3478 c1 = 0;
3479
3480 /* If pattern is `[[.'. */
3481 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3482
3483 for (;;)
3484 {
3485 PATFETCH (c);
3486 if ((c == '.' && *p == ']') || p == pend)
3487 break;
3488 if (c1 < sizeof (str))
3489 str[c1++] = c;
3490 else
3491 /* This is in any case an invalid class name. */
3492 str[0] = '\0';
3493 }
3494 str[c1] = '\0';
3495
3496 if (c == '.' && *p == ']' && str[0] != '\0')
3497 {
3498 /* If we have no collation data we use the default
3499 collation in which each character is the name
3500 for its own class which contains only the one
3501 character. It also means that ASCII is the
3502 character set and therefore we cannot have character
3503 with more than one byte in the multibyte
3504 representation. */
3505# ifdef _LIBC
3506 if (nrules == 0)
3507# endif
3508 {
3509 if (c1 != 1)
3510 FREE_STACK_RETURN (REG_ECOLLATE);
3511
3512 /* Throw away the ] at the end of the equivalence
3513 class. */
3514 PATFETCH (c);
3515
3516 /* Set the bit for the character. */
3517 SET_LIST_BIT (str[0]);
3518 range_start = ((const unsigned char *) str)[0];
3519 }
3520# ifdef _LIBC
3521 else
3522 {
3523 /* Try to match the byte sequence in `str' against
3524 those known to the collate implementation.
3525 First find out whether the bytes in `str' are
3526 actually from exactly one character. */
3527 int32_t table_size;
3528 const int32_t *symb_table;
3529 const unsigned char *extra;
3530 int32_t idx;
3531 int32_t elem;
3532 int32_t second;
3533 int32_t hash;
3534
3535 table_size =
3536 _NL_CURRENT_WORD (LC_COLLATE,
3537 _NL_COLLATE_SYMB_HASH_SIZEMB);
3538 symb_table = (const int32_t *)
3539 _NL_CURRENT (LC_COLLATE,
3540 _NL_COLLATE_SYMB_TABLEMB);
3541 extra = (const unsigned char *)
3542 _NL_CURRENT (LC_COLLATE,
3543 _NL_COLLATE_SYMB_EXTRAMB);
3544
3545 /* Locate the character in the hashing table. */
3546 hash = elem_hash (str, c1);
3547
3548 idx = 0;
3549 elem = hash % table_size;
3550 second = hash % (table_size - 2);
3551 while (symb_table[2 * elem] != 0)
3552 {
3553 /* First compare the hashing value. */
3554 if (symb_table[2 * elem] == hash
3555 && c1 == extra[symb_table[2 * elem + 1]]
3556 && memcmp (str,
3557 &extra[symb_table[2 * elem + 1]
3558 + 1],
3559 c1) == 0)
3560 {
3561 /* Yep, this is the entry. */
3562 idx = symb_table[2 * elem + 1];
3563 idx += 1 + extra[idx];
3564 break;
3565 }
3566
3567 /* Next entry. */
3568 elem += second;
3569 }
3570
3571 if (symb_table[2 * elem] == 0)
3572 /* This is no valid character. */
3573 FREE_STACK_RETURN (REG_ECOLLATE);
3574
3575 /* Throw away the ] at the end of the equivalence
3576 class. */
3577 PATFETCH (c);
3578
3579 /* Now add the multibyte character(s) we found
3580 to the accept list.
3581
3582 XXX Note that this is not entirely correct.
3583 we would have to match multibyte sequences
3584 but this is not possible with the current
3585 implementation. Also, we have to match
3586 collating symbols, which expand to more than
3587 one byte, as a whole and not allow the
3588 individual bytes. */
3589 c1 = extra[idx++];
3590 if (c1 == 1)
3591 range_start = extra[idx];
3592 while (c1-- > 0)
3593 {
3594 SET_LIST_BIT (extra[idx]);
3595 ++idx;
3596 }
3597 }
3598# endif
3599 had_char_class = false;
3600 }
3601 else
3602 {
3603 c1++;
3604 while (c1--)
3605 PATUNFETCH;
3606 SET_LIST_BIT ('[');
3607 SET_LIST_BIT ('.');
3608 range_start = '.';
3609 had_char_class = false;
3610 }
3611 }
3612 else
3613 {
3614 had_char_class = false;
3615 SET_LIST_BIT (c);
3616 range_start = c;
3617 }
3618 }
3619
3620 /* Discard any (non)matching list bytes that are all 0 at the
3621 end of the map. Decrease the map-length byte too. */
3622 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3623 b[-1]--;
3624 b += b[-1];
3625#endif /* WCHAR */
3626 }
3627 break;
3628
3629
3630 case '(':
3631 if (syntax & RE_NO_BK_PARENS)
3632 goto handle_open;
3633 else
3634 goto normal_char;
3635
3636
3637 case ')':
3638 if (syntax & RE_NO_BK_PARENS)
3639 goto handle_close;
3640 else
3641 goto normal_char;
3642
3643
3644 case '\n':
3645 if (syntax & RE_NEWLINE_ALT)
3646 goto handle_alt;
3647 else
3648 goto normal_char;
3649
3650
3651 case '|':
3652 if (syntax & RE_NO_BK_VBAR)
3653 goto handle_alt;
3654 else
3655 goto normal_char;
3656
3657
3658 case '{':
3659 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3660 goto handle_interval;
3661 else
3662 goto normal_char;
3663
3664
3665 case '\\':
3666 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3667
3668 /* Do not translate the character after the \, so that we can
3669 distinguish, e.g., \B from \b, even if we normally would
3670 translate, e.g., B to b. */
3671 PATFETCH_RAW (c);
3672
3673 switch (c)
3674 {
3675 case '(':
3676 if (syntax & RE_NO_BK_PARENS)
3677 goto normal_backslash;
3678
3679 handle_open:
3680 bufp->re_nsub++;
3681 regnum++;
3682
3683 if (COMPILE_STACK_FULL)
3684 {
3685 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3686 compile_stack_elt_t);
3687 if (compile_stack.stack == NULL) return REG_ESPACE;
3688
3689 compile_stack.size <<= 1;
3690 }
3691
3692 /* These are the values to restore when we hit end of this
3693 group. They are all relative offsets, so that if the
3694 whole pattern moves because of realloc, they will still
3695 be valid. */
3696 COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR;
3697 COMPILE_STACK_TOP.fixup_alt_jump
3698 = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0;
3699 COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR;
3700 COMPILE_STACK_TOP.regnum = regnum;
3701
3702 /* We will eventually replace the 0 with the number of
3703 groups inner to this one. But do not push a
3704 start_memory for groups beyond the last one we can
3705 represent in the compiled pattern. */
3706 if (regnum <= MAX_REGNUM)
3707 {
3708 COMPILE_STACK_TOP.inner_group_offset = b
3709 - COMPILED_BUFFER_VAR + 2;
3710 BUF_PUSH_3 (start_memory, regnum, 0);
3711 }
3712
3713 compile_stack.avail++;
3714
3715 fixup_alt_jump = 0;
3716 laststart = 0;
3717 begalt = b;
3718 /* If we've reached MAX_REGNUM groups, then this open
3719 won't actually generate any code, so we'll have to
3720 clear pending_exact explicitly. */
3721 pending_exact = 0;
3722 break;
3723
3724
3725 case ')':
3726 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3727
3728 if (COMPILE_STACK_EMPTY)
3729 {
3730 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3731 goto normal_backslash;
3732 else
3733 FREE_STACK_RETURN (REG_ERPAREN);
3734 }
3735
3736 handle_close:
3737 if (fixup_alt_jump)
3738 { /* Push a dummy failure point at the end of the
3739 alternative for a possible future
3740 `pop_failure_jump' to pop. See comments at
3741 `push_dummy_failure' in `re_match_2'. */
3742 BUF_PUSH (push_dummy_failure);
3743
3744 /* We allocated space for this jump when we assigned
3745 to `fixup_alt_jump', in the `handle_alt' case below. */
3746 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
3747 }
3748
3749 /* See similar code for backslashed left paren above. */
3750 if (COMPILE_STACK_EMPTY)
3751 {
3752 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3753 goto normal_char;
3754 else
3755 FREE_STACK_RETURN (REG_ERPAREN);
3756 }
3757
3758 /* Since we just checked for an empty stack above, this
3759 ``can't happen''. */
3760 assert (compile_stack.avail != 0);
3761 {
3762 /* We don't just want to restore into `regnum', because
3763 later groups should continue to be numbered higher,
3764 as in `(ab)c(de)' -- the second group is #2. */
3765 regnum_t this_group_regnum;
3766
3767 compile_stack.avail--;
3768 begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset;
3769 fixup_alt_jump
3770 = COMPILE_STACK_TOP.fixup_alt_jump
3771 ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1
3772 : 0;
3773 laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset;
3774 this_group_regnum = COMPILE_STACK_TOP.regnum;
3775 /* If we've reached MAX_REGNUM groups, then this open
3776 won't actually generate any code, so we'll have to
3777 clear pending_exact explicitly. */
3778 pending_exact = 0;
3779
3780 /* We're at the end of the group, so now we know how many
3781 groups were inside this one. */
3782 if (this_group_regnum <= MAX_REGNUM)
3783 {
3784 UCHAR_T *inner_group_loc
3785 = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset;
3786
3787 *inner_group_loc = regnum - this_group_regnum;
3788 BUF_PUSH_3 (stop_memory, this_group_regnum,
3789 regnum - this_group_regnum);
3790 }
3791 }
3792 break;
3793
3794
3795 case '|': /* `\|'. */
3796 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3797 goto normal_backslash;
3798 handle_alt:
3799 if (syntax & RE_LIMITED_OPS)
3800 goto normal_char;
3801
3802 /* Insert before the previous alternative a jump which
3803 jumps to this alternative if the former fails. */
3804 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3805 INSERT_JUMP (on_failure_jump, begalt,
3806 b + 2 + 2 * OFFSET_ADDRESS_SIZE);
3807 pending_exact = 0;
3808 b += 1 + OFFSET_ADDRESS_SIZE;
3809
3810 /* The alternative before this one has a jump after it
3811 which gets executed if it gets matched. Adjust that
3812 jump so it will jump to this alternative's analogous
3813 jump (put in below, which in turn will jump to the next
3814 (if any) alternative's such jump, etc.). The last such
3815 jump jumps to the correct final destination. A picture:
3816 _____ _____
3817 | | | |
3818 | v | v
3819 a | b | c
3820
3821 If we are at `b', then fixup_alt_jump right now points to a
3822 three-byte space after `a'. We'll put in the jump, set
3823 fixup_alt_jump to right after `b', and leave behind three
3824 bytes which we'll fill in when we get to after `c'. */
3825
3826 if (fixup_alt_jump)
3827 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
3828
3829 /* Mark and leave space for a jump after this alternative,
3830 to be filled in later either by next alternative or
3831 when know we're at the end of a series of alternatives. */
3832 fixup_alt_jump = b;
3833 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3834 b += 1 + OFFSET_ADDRESS_SIZE;
3835
3836 laststart = 0;
3837 begalt = b;
3838 break;
3839
3840
3841 case '{':
3842 /* If \{ is a literal. */
3843 if (!(syntax & RE_INTERVALS)
3844 /* If we're at `\{' and it's not the open-interval
3845 operator. */
3846 || (syntax & RE_NO_BK_BRACES))
3847 goto normal_backslash;
3848
3849 handle_interval:
3850 {
3851 /* If got here, then the syntax allows intervals. */
3852
3853 /* At least (most) this many matches must be made. */
3854 int lower_bound = -1, upper_bound = -1;
3855
3856 /* Place in the uncompiled pattern (i.e., just after
3857 the '{') to go back to if the interval is invalid. */
3858 const CHAR_T *beg_interval = p;
3859
3860 if (p == pend)
3861 goto invalid_interval;
3862
3863 GET_UNSIGNED_NUMBER (lower_bound);
3864
3865 if (c == ',')
3866 {
3867 GET_UNSIGNED_NUMBER (upper_bound);
3868 if (upper_bound < 0)
3869 upper_bound = RE_DUP_MAX;
3870 }
3871 else
3872 /* Interval such as `{1}' => match exactly once. */
3873 upper_bound = lower_bound;
3874
3875 if (! (0 <= lower_bound && lower_bound <= upper_bound))
3876 goto invalid_interval;
3877
3878 if (!(syntax & RE_NO_BK_BRACES))
3879 {
3880 if (c != '\\' || p == pend)
3881 goto invalid_interval;
3882 PATFETCH (c);
3883 }
3884
3885 if (c != '}')
3886 goto invalid_interval;
3887
3888 /* If it's invalid to have no preceding re. */
3889 if (!laststart)
3890 {
3891 if (syntax & RE_CONTEXT_INVALID_OPS
3892 && !(syntax & RE_INVALID_INTERVAL_ORD))
3893 FREE_STACK_RETURN (REG_BADRPT);
3894 else if (syntax & RE_CONTEXT_INDEP_OPS)
3895 laststart = b;
3896 else
3897 goto unfetch_interval;
3898 }
3899
3900 /* We just parsed a valid interval. */
3901
3902 if (RE_DUP_MAX < upper_bound)
3903 FREE_STACK_RETURN (REG_BADBR);
3904
3905 /* If the upper bound is zero, don't want to succeed at
3906 all; jump from `laststart' to `b + 3', which will be
3907 the end of the buffer after we insert the jump. */
3908 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE'
3909 instead of 'b + 3'. */
3910 if (upper_bound == 0)
3911 {
3912 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3913 INSERT_JUMP (jump, laststart, b + 1
3914 + OFFSET_ADDRESS_SIZE);
3915 b += 1 + OFFSET_ADDRESS_SIZE;
3916 }
3917
3918 /* Otherwise, we have a nontrivial interval. When
3919 we're all done, the pattern will look like:
3920 set_number_at <jump count> <upper bound>
3921 set_number_at <succeed_n count> <lower bound>
3922 succeed_n <after jump addr> <succeed_n count>
3923 <body of loop>
3924 jump_n <succeed_n addr> <jump count>
3925 (The upper bound and `jump_n' are omitted if
3926 `upper_bound' is 1, though.) */
3927 else
3928 { /* If the upper bound is > 1, we need to insert
3929 more at the end of the loop. */
3930 unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE +
3931 (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE);
3932
3933 GET_BUFFER_SPACE (nbytes);
3934
3935 /* Initialize lower bound of the `succeed_n', even
3936 though it will be set during matching by its
3937 attendant `set_number_at' (inserted next),
3938 because `re_compile_fastmap' needs to know.
3939 Jump to the `jump_n' we might insert below. */
3940 INSERT_JUMP2 (succeed_n, laststart,
3941 b + 1 + 2 * OFFSET_ADDRESS_SIZE
3942 + (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE)
3943 , lower_bound);
3944 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3945
3946 /* Code to initialize the lower bound. Insert
3947 before the `succeed_n'. The `5' is the last two
3948 bytes of this `set_number_at', plus 3 bytes of
3949 the following `succeed_n'. */
3950 /* ifdef WCHAR, The '1+2*OFFSET_ADDRESS_SIZE'
3951 is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE'
3952 of the following `succeed_n'. */
3953 PREFIX(insert_op2) (set_number_at, laststart, 1
3954 + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b);
3955 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3956
3957 if (upper_bound > 1)
3958 { /* More than one repetition is allowed, so
3959 append a backward jump to the `succeed_n'
3960 that starts this interval.
3961
3962 When we've reached this during matching,
3963 we'll have matched the interval once, so
3964 jump back only `upper_bound - 1' times. */
3965 STORE_JUMP2 (jump_n, b, laststart
3966 + 2 * OFFSET_ADDRESS_SIZE + 1,
3967 upper_bound - 1);
3968 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3969
3970 /* The location we want to set is the second
3971 parameter of the `jump_n'; that is `b-2' as
3972 an absolute address. `laststart' will be
3973 the `set_number_at' we're about to insert;
3974 `laststart+3' the number to set, the source
3975 for the relative address. But we are
3976 inserting into the middle of the pattern --
3977 so everything is getting moved up by 5.
3978 Conclusion: (b - 2) - (laststart + 3) + 5,
3979 i.e., b - laststart.
3980
3981 We insert this at the beginning of the loop
3982 so that if we fail during matching, we'll
3983 reinitialize the bounds. */
3984 PREFIX(insert_op2) (set_number_at, laststart,
3985 b - laststart,
3986 upper_bound - 1, b);
3987 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3988 }
3989 }
3990 pending_exact = 0;
3991 break;
3992
3993 invalid_interval:
3994 if (!(syntax & RE_INVALID_INTERVAL_ORD))
3995 FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR);
3996 unfetch_interval:
3997 /* Match the characters as literals. */
3998 p = beg_interval;
3999 c = '{';
4000 if (syntax & RE_NO_BK_BRACES)
4001 goto normal_char;
4002 else
4003 goto normal_backslash;
4004 }
4005
4006#ifdef emacs
4007 /* There is no way to specify the before_dot and after_dot
4008 operators. rms says this is ok. --karl */
4009 case '=':
4010 BUF_PUSH (at_dot);
4011 break;
4012
4013 case 's':
4014 laststart = b;
4015 PATFETCH (c);
4016 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
4017 break;
4018
4019 case 'S':
4020 laststart = b;
4021 PATFETCH (c);
4022 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
4023 break;
4024#endif /* emacs */
4025
4026
4027 case 'w':
4028 if (syntax & RE_NO_GNU_OPS)
4029 goto normal_char;
4030 laststart = b;
4031 BUF_PUSH (wordchar);
4032 break;
4033
4034
4035 case 'W':
4036 if (syntax & RE_NO_GNU_OPS)
4037 goto normal_char;
4038 laststart = b;
4039 BUF_PUSH (notwordchar);
4040 break;
4041
4042
4043 case '<':
4044 if (syntax & RE_NO_GNU_OPS)
4045 goto normal_char;
4046 BUF_PUSH (wordbeg);
4047 break;
4048
4049 case '>':
4050 if (syntax & RE_NO_GNU_OPS)
4051 goto normal_char;
4052 BUF_PUSH (wordend);
4053 break;
4054
4055 case 'b':
4056 if (syntax & RE_NO_GNU_OPS)
4057 goto normal_char;
4058 BUF_PUSH (wordbound);
4059 break;
4060
4061 case 'B':
4062 if (syntax & RE_NO_GNU_OPS)
4063 goto normal_char;
4064 BUF_PUSH (notwordbound);
4065 break;
4066
4067 case '`':
4068 if (syntax & RE_NO_GNU_OPS)
4069 goto normal_char;
4070 BUF_PUSH (begbuf);
4071 break;
4072
4073 case '\'':
4074 if (syntax & RE_NO_GNU_OPS)
4075 goto normal_char;
4076 BUF_PUSH (endbuf);
4077 break;
4078
4079 case '1': case '2': case '3': case '4': case '5':
4080 case '6': case '7': case '8': case '9':
4081 if (syntax & RE_NO_BK_REFS)
4082 goto normal_char;
4083
4084 c1 = c - '0';
4085
4086 if (c1 > regnum)
4087 FREE_STACK_RETURN (REG_ESUBREG);
4088
4089 /* Can't back reference to a subexpression if inside of it. */
4090 if (group_in_compile_stack (compile_stack, (regnum_t) c1))
4091 goto normal_char;
4092
4093 laststart = b;
4094 BUF_PUSH_2 (duplicate, c1);
4095 break;
4096
4097
4098 case '+':
4099 case '?':
4100 if (syntax & RE_BK_PLUS_QM)
4101 goto handle_plus;
4102 else
4103 goto normal_backslash;
4104
4105 default:
4106 normal_backslash:
4107 /* You might think it would be useful for \ to mean
4108 not to translate; but if we don't translate it
4109 it will never match anything. */
4110 c = TRANSLATE (c);
4111 goto normal_char;
4112 }
4113 break;
4114
4115
4116 default:
4117 /* Expects the character in `c'. */
4118 normal_char:
4119 /* If no exactn currently being built. */
4120 if (!pending_exact
4121#ifdef WCHAR
4122 /* If last exactn handle binary(or character) and
4123 new exactn handle character(or binary). */
4124 || is_exactn_bin != is_binary[p - 1 - pattern]
4125#endif /* WCHAR */
4126
4127 /* If last exactn not at current position. */
4128 || pending_exact + *pending_exact + 1 != b
4129
4130 /* We have only one byte following the exactn for the count. */
4131 || *pending_exact == (1 << BYTEWIDTH) - 1
4132
4133 /* If followed by a repetition operator. */
4134 || *p == '*' || *p == '^'
4135 || ((syntax & RE_BK_PLUS_QM)
4136 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
4137 : (*p == '+' || *p == '?'))
4138 || ((syntax & RE_INTERVALS)
4139 && ((syntax & RE_NO_BK_BRACES)
4140 ? *p == '{'
4141 : (p[0] == '\\' && p[1] == '{'))))
4142 {
4143 /* Start building a new exactn. */
4144
4145 laststart = b;
4146
4147#ifdef WCHAR
4148 /* Is this exactn binary data or character? */
4149 is_exactn_bin = is_binary[p - 1 - pattern];
4150 if (is_exactn_bin)
4151 BUF_PUSH_2 (exactn_bin, 0);
4152 else
4153 BUF_PUSH_2 (exactn, 0);
4154#else
4155 BUF_PUSH_2 (exactn, 0);
4156#endif /* WCHAR */
4157 pending_exact = b - 1;
4158 }
4159
4160 BUF_PUSH (c);
4161 (*pending_exact)++;
4162 break;
4163 } /* switch (c) */
4164 } /* while p != pend */
4165
4166
4167 /* Through the pattern now. */
4168
4169 if (fixup_alt_jump)
4170 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
4171
4172 if (!COMPILE_STACK_EMPTY)
4173 FREE_STACK_RETURN (REG_EPAREN);
4174
4175 /* If we don't want backtracking, force success
4176 the first time we reach the end of the compiled pattern. */
4177 if (syntax & RE_NO_POSIX_BACKTRACKING)
4178 BUF_PUSH (succeed);
4179
4180#ifdef WCHAR
4181 free (pattern);
4182 free (mbs_offset);
4183 free (is_binary);
4184#endif
4185 free (compile_stack.stack);
4186
4187 /* We have succeeded; set the length of the buffer. */
4188#ifdef WCHAR
4189 bufp->used = (uintptr_t) b - (uintptr_t) COMPILED_BUFFER_VAR;
4190#else
4191 bufp->used = b - bufp->buffer;
4192#endif
4193
4194#ifdef DEBUG
4195 if (debug)
4196 {
4197 DEBUG_PRINT1 ("\nCompiled pattern: \n");
4198 PREFIX(print_compiled_pattern) (bufp);
4199 }
4200#endif /* DEBUG */
4201
4202#ifndef MATCH_MAY_ALLOCATE
4203 /* Initialize the failure stack to the largest possible stack. This
4204 isn't necessary unless we're trying to avoid calling alloca in
4205 the search and match routines. */
4206 {
4207 int num_regs = bufp->re_nsub + 1;
4208
4209 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
4210 is strictly greater than re_max_failures, the largest possible stack
4211 is 2 * re_max_failures failure points. */
4212 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
4213 {
4214 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
4215
4216# ifdef emacs
4217 if (! fail_stack.stack)
4218 fail_stack.stack
4219 = (PREFIX(fail_stack_elt_t) *) xmalloc (fail_stack.size
4220 * sizeof (PREFIX(fail_stack_elt_t)));
4221 else
4222 fail_stack.stack
4223 = (PREFIX(fail_stack_elt_t) *) xrealloc (fail_stack.stack,
4224 (fail_stack.size
4225 * sizeof (PREFIX(fail_stack_elt_t))));
4226# else /* not emacs */
4227 if (! fail_stack.stack)
4228 fail_stack.stack
4229 = (PREFIX(fail_stack_elt_t) *) malloc (fail_stack.size
4230 * sizeof (PREFIX(fail_stack_elt_t)));
4231 else
4232 fail_stack.stack
4233 = (PREFIX(fail_stack_elt_t) *) realloc (fail_stack.stack,
4234 (fail_stack.size
4235 * sizeof (PREFIX(fail_stack_elt_t))));
4236# endif /* not emacs */
4237 }
4238
4239 PREFIX(regex_grow_registers) (num_regs);
4240 }
4241#endif /* not MATCH_MAY_ALLOCATE */
4242
4243 return REG_NOERROR;
4244} /* regex_compile */
4245
4246/* Subroutines for `regex_compile'. */
4247
4248/* Store OP at LOC followed by two-byte integer parameter ARG. */
4249/* ifdef WCHAR, integer parameter is 1 wchar_t. */
4250
4251static void
4252PREFIX(store_op1) (
4253 re_opcode_t op,
4254 UCHAR_T *loc,
4255 int arg)
4256{
4257 *loc = (UCHAR_T) op;
4258 STORE_NUMBER (loc + 1, arg);
4259}
4260
4261
4262/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
4263/* ifdef WCHAR, integer parameter is 1 wchar_t. */
4264
4265static void
4266PREFIX(store_op2) (
4267 re_opcode_t op,
4268 UCHAR_T *loc,
4269 int arg1, int arg2)
4270{
4271 *loc = (UCHAR_T) op;
4272 STORE_NUMBER (loc + 1, arg1);
4273 STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2);
4274}
4275
4276
4277/* Copy the bytes from LOC to END to open up three bytes of space at LOC
4278 for OP followed by two-byte integer parameter ARG. */
4279/* ifdef WCHAR, integer parameter is 1 wchar_t. */
4280
4281static void
4282PREFIX(insert_op1) (
4283 re_opcode_t op,
4284 UCHAR_T *loc,
4285 int arg,
4286 UCHAR_T *end)
4287{
4288 register UCHAR_T *pfrom = end;
4289 register UCHAR_T *pto = end + 1 + OFFSET_ADDRESS_SIZE;
4290
4291 while (pfrom != loc)
4292 *--pto = *--pfrom;
4293
4294 PREFIX(store_op1) (op, loc, arg);
4295}
4296
4297
4298/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
4299/* ifdef WCHAR, integer parameter is 1 wchar_t. */
4300
4301static void
4302PREFIX(insert_op2) (
4303 re_opcode_t op,
4304 UCHAR_T *loc,
4305 int arg1, int arg2,
4306 UCHAR_T *end)
4307{
4308 register UCHAR_T *pfrom = end;
4309 register UCHAR_T *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE;
4310
4311 while (pfrom != loc)
4312 *--pto = *--pfrom;
4313
4314 PREFIX(store_op2) (op, loc, arg1, arg2);
4315}
4316
4317
4318/* P points to just after a ^ in PATTERN. Return true if that ^ comes
4319 after an alternative or a begin-subexpression. We assume there is at
4320 least one character before the ^. */
4321
4322static boolean
4323PREFIX(at_begline_loc_p) (
4324 const CHAR_T *pattern, const CHAR_T *p,
4325 reg_syntax_t syntax)
4326{
4327 const CHAR_T *prev = p - 2;
4328 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
4329
4330 return
4331 /* After a subexpression? */
4332 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
4333 /* After an alternative? */
4334 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
4335}
4336
4337
4338/* The dual of at_begline_loc_p. This one is for $. We assume there is
4339 at least one character after the $, i.e., `P < PEND'. */
4340
4341static boolean
4342PREFIX(at_endline_loc_p) (
4343 const CHAR_T *p, const CHAR_T *pend,
4344 reg_syntax_t syntax)
4345{
4346 const CHAR_T *next = p;
4347 boolean next_backslash = *next == '\\';
4348 const CHAR_T *next_next = p + 1 < pend ? p + 1 : 0;
4349
4350 return
4351 /* Before a subexpression? */
4352 (syntax & RE_NO_BK_PARENS ? *next == ')'
4353 : next_backslash && next_next && *next_next == ')')
4354 /* Before an alternative? */
4355 || (syntax & RE_NO_BK_VBAR ? *next == '|'
4356 : next_backslash && next_next && *next_next == '|');
4357}
4358
4359#else /* not INSIDE_RECURSION */
4360
4361/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
4362 false if it's not. */
4363
4364static boolean
4365group_in_compile_stack (
4366 compile_stack_type compile_stack,
4367 regnum_t regnum)
4368{
4369 int this_element;
4370
4371 for (this_element = compile_stack.avail - 1;
4372 this_element >= 0;
4373 this_element--)
4374 if (compile_stack.stack[this_element].regnum == regnum)
4375 return true;
4376
4377 return false;
4378}
4379#endif /* not INSIDE_RECURSION */
4380
4381#ifdef INSIDE_RECURSION
4382
4383#ifdef WCHAR
4384/* This insert space, which size is "num", into the pattern at "loc".
4385 "end" must point the end of the allocated buffer. */
4386static void
4387insert_space (
4388 int num,
4389 CHAR_T *loc,
4390 CHAR_T *end)
4391{
4392 register CHAR_T *pto = end;
4393 register CHAR_T *pfrom = end - num;
4394
4395 while (pfrom >= loc)
4396 *pto-- = *pfrom--;
4397}
4398#endif /* WCHAR */
4399
4400#ifdef WCHAR
4401static reg_errcode_t
4402wcs_compile_range (
4403 CHAR_T range_start_char,
4404 const CHAR_T **p_ptr, const CHAR_T *pend,
4405 RE_TRANSLATE_TYPE translate,
4406 reg_syntax_t syntax,
4407 CHAR_T *b, CHAR_T *char_set)
4408{
4409 const CHAR_T *p = *p_ptr;
4410 CHAR_T range_start, range_end;
4411 reg_errcode_t ret;
4412# ifdef _LIBC
4413 uint32_t nrules;
4414 uint32_t start_val, end_val;
4415# endif
4416 if (p == pend)
4417 return REG_ERANGE;
4418
4419# ifdef _LIBC
4420 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4421 if (nrules != 0)
4422 {
4423 const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE,
4424 _NL_COLLATE_COLLSEQWC);
4425 const unsigned char *extra = (const unsigned char *)
4426 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4427
4428 if (range_start_char < -1)
4429 {
4430 /* range_start is a collating symbol. */
4431 int32_t *wextra;
4432 /* Retreive the index and get collation sequence value. */
4433 wextra = (int32_t*)(extra + char_set[-range_start_char]);
4434 start_val = wextra[1 + *wextra];
4435 }
4436 else
4437 start_val = collseq_table_lookup(collseq, TRANSLATE(range_start_char));
4438
4439 end_val = collseq_table_lookup (collseq, TRANSLATE (p[0]));
4440
4441 /* Report an error if the range is empty and the syntax prohibits
4442 this. */
4443 ret = ((syntax & RE_NO_EMPTY_RANGES)
4444 && (start_val > end_val))? REG_ERANGE : REG_NOERROR;
4445
4446 /* Insert space to the end of the char_ranges. */
4447 insert_space(2, b - char_set[5] - 2, b - 1);
4448 *(b - char_set[5] - 2) = (wchar_t)start_val;
4449 *(b - char_set[5] - 1) = (wchar_t)end_val;
4450 char_set[4]++; /* ranges_index */
4451 }
4452 else
4453# endif
4454 {
4455 range_start = (range_start_char >= 0)? TRANSLATE (range_start_char):
4456 range_start_char;
4457 range_end = TRANSLATE (p[0]);
4458 /* Report an error if the range is empty and the syntax prohibits
4459 this. */
4460 ret = ((syntax & RE_NO_EMPTY_RANGES)
4461 && (range_start > range_end))? REG_ERANGE : REG_NOERROR;
4462
4463 /* Insert space to the end of the char_ranges. */
4464 insert_space(2, b - char_set[5] - 2, b - 1);
4465 *(b - char_set[5] - 2) = range_start;
4466 *(b - char_set[5] - 1) = range_end;
4467 char_set[4]++; /* ranges_index */
4468 }
4469 /* Have to increment the pointer into the pattern string, so the
4470 caller isn't still at the ending character. */
4471 (*p_ptr)++;
4472
4473 return ret;
4474}
4475#else /* BYTE */
4476/* Read the ending character of a range (in a bracket expression) from the
4477 uncompiled pattern *P_PTR (which ends at PEND). We assume the
4478 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
4479 Then we set the translation of all bits between the starting and
4480 ending characters (inclusive) in the compiled pattern B.
4481
4482 Return an error code.
4483
4484 We use these short variable names so we can use the same macros as
4485 `regex_compile' itself. */
4486
4487static reg_errcode_t
4488byte_compile_range (
4489 unsigned int range_start_char,
4490 const char **p_ptr, const char *pend,
4491 RE_TRANSLATE_TYPE translate,
4492 reg_syntax_t syntax,
4493 unsigned char *b)
4494{
4495 unsigned this_char;
4496 const char *p = *p_ptr;
4497 reg_errcode_t ret;
4498# ifdef _LIBC
4499 const unsigned char *collseq;
4500 unsigned int start_colseq;
4501 unsigned int end_colseq;
4502# else
4503 unsigned end_char;
4504# endif
4505
4506 if (p == pend)
4507 return REG_ERANGE;
4508
4509 /* Have to increment the pointer into the pattern string, so the
4510 caller isn't still at the ending character. */
4511 (*p_ptr)++;
4512
4513 /* Report an error if the range is empty and the syntax prohibits this. */
4514 ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
4515
4516# ifdef _LIBC
4517 collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
4518 _NL_COLLATE_COLLSEQMB);
4519
4520 start_colseq = collseq[(unsigned char) TRANSLATE (range_start_char)];
4521 end_colseq = collseq[(unsigned char) TRANSLATE (p[0])];
4522 for (this_char = 0; this_char <= (unsigned char) -1; ++this_char)
4523 {
4524 unsigned int this_colseq = collseq[(unsigned char) TRANSLATE (this_char)];
4525
4526 if (start_colseq <= this_colseq && this_colseq <= end_colseq)
4527 {
4528 SET_LIST_BIT (TRANSLATE (this_char));
4529 ret = REG_NOERROR;
4530 }
4531 }
4532# else
4533 /* Here we see why `this_char' has to be larger than an `unsigned
4534 char' -- we would otherwise go into an infinite loop, since all
4535 characters <= 0xff. */
4536 range_start_char = TRANSLATE (range_start_char);
4537 /* TRANSLATE(p[0]) is casted to char (not unsigned char) in TRANSLATE,
4538 and some compilers cast it to int implicitly, so following for_loop
4539 may fall to (almost) infinite loop.
4540 e.g. If translate[p[0]] = 0xff, end_char may equals to 0xffffffff.
4541 To avoid this, we cast p[0] to unsigned int and truncate it. */
4542 end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1));
4543
4544 for (this_char = range_start_char; this_char <= end_char; ++this_char)
4545 {
4546 SET_LIST_BIT (TRANSLATE (this_char));
4547 ret = REG_NOERROR;
4548 }
4549# endif
4550
4551 return ret;
4552}
4553#endif /* WCHAR */
4554
4555/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4556 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4557 characters can start a string that matches the pattern. This fastmap
4558 is used by re_search to skip quickly over impossible starting points.
4559
4560 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4561 area as BUFP->fastmap.
4562
4563 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4564 the pattern buffer.
4565
4566 Returns 0 if we succeed, -2 if an internal error. */
4567
4568#ifdef WCHAR
4569/* local function for re_compile_fastmap.
4570 truncate wchar_t character to char. */
4571static unsigned char truncate_wchar (CHAR_T c)
4572{
4573 unsigned char buf[MB_CUR_MAX];
4574 mbstate_t state;
4575 int retval;
4576 memset (&state, '\0', sizeof (state));
4577# ifdef _LIBC
4578 retval = __wcrtomb (buf, c, &state);
4579# else
4580 retval = wcrtomb (buf, c, &state);
4581# endif
4582 return retval > 0 ? buf[0] : (unsigned char) c;
4583}
4584#endif /* WCHAR */
4585
4586static int
4587PREFIX(re_compile_fastmap) (struct re_pattern_buffer *bufp)
4588{
4589 int j, k;
4590#ifdef MATCH_MAY_ALLOCATE
4591 PREFIX(fail_stack_type) fail_stack;
4592#endif
4593#ifndef REGEX_MALLOC
4594 char *destination;
4595#endif
4596
4597 register char *fastmap = bufp->fastmap;
4598
4599#ifdef WCHAR
4600 /* We need to cast pattern to (wchar_t*), because we casted this compiled
4601 pattern to (char*) in regex_compile. */
4602 UCHAR_T *pattern = (UCHAR_T*)bufp->buffer;
4603 register UCHAR_T *pend = (UCHAR_T*) (bufp->buffer + bufp->used);
4604#else /* BYTE */
4605 UCHAR_T *pattern = bufp->buffer;
4606 register UCHAR_T *pend = pattern + bufp->used;
4607#endif /* WCHAR */
4608 UCHAR_T *p = pattern;
4609
4610#ifdef REL_ALLOC
4611 /* This holds the pointer to the failure stack, when
4612 it is allocated relocatably. */
4613 fail_stack_elt_t *failure_stack_ptr;
4614#endif
4615
4616 /* Assume that each path through the pattern can be null until
4617 proven otherwise. We set this false at the bottom of switch
4618 statement, to which we get only if a particular path doesn't
4619 match the empty string. */
4620 boolean path_can_be_null = true;
4621
4622 /* We aren't doing a `succeed_n' to begin with. */
4623 boolean succeed_n_p = false;
4624
4625 assert (fastmap != NULL && p != NULL);
4626
4627 INIT_FAIL_STACK ();
4628 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
4629 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4630 bufp->can_be_null = 0;
4631
4632 while (1)
4633 {
4634 if (p == pend || *p == succeed)
4635 {
4636 /* We have reached the (effective) end of pattern. */
4637 if (!FAIL_STACK_EMPTY ())
4638 {
4639 bufp->can_be_null |= path_can_be_null;
4640
4641 /* Reset for next path. */
4642 path_can_be_null = true;
4643
4644 p = fail_stack.stack[--fail_stack.avail].pointer;
4645
4646 continue;
4647 }
4648 else
4649 break;
4650 }
4651
4652 /* We should never be about to go beyond the end of the pattern. */
4653 assert (p < pend);
4654
4655 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4656 {
4657
4658 /* I guess the idea here is to simply not bother with a fastmap
4659 if a backreference is used, since it's too hard to figure out
4660 the fastmap for the corresponding group. Setting
4661 `can_be_null' stops `re_search_2' from using the fastmap, so
4662 that is all we do. */
4663 case duplicate:
4664 bufp->can_be_null = 1;
4665 goto done;
4666
4667
4668 /* Following are the cases which match a character. These end
4669 with `break'. */
4670
4671#ifdef WCHAR
4672 case exactn:
4673 fastmap[truncate_wchar(p[1])] = 1;
4674 break;
4675#else /* BYTE */
4676 case exactn:
4677 fastmap[p[1]] = 1;
4678 break;
4679#endif /* WCHAR */
4680#ifdef MBS_SUPPORT
4681 case exactn_bin:
4682 fastmap[p[1]] = 1;
4683 break;
4684#endif
4685
4686#ifdef WCHAR
4687 /* It is hard to distinguish fastmap from (multi byte) characters
4688 which depends on current locale. */
4689 case charset:
4690 case charset_not:
4691 case wordchar:
4692 case notwordchar:
4693 bufp->can_be_null = 1;
4694 goto done;
4695#else /* BYTE */
4696 case charset:
4697 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4698 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
4699 fastmap[j] = 1;
4700 break;
4701
4702
4703 case charset_not:
4704 /* Chars beyond end of map must be allowed. */
4705 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
4706 fastmap[j] = 1;
4707
4708 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4709 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
4710 fastmap[j] = 1;
4711 break;
4712
4713
4714 case wordchar:
4715 for (j = 0; j < (1 << BYTEWIDTH); j++)
4716 if (SYNTAX (j) == Sword)
4717 fastmap[j] = 1;
4718 break;
4719
4720
4721 case notwordchar:
4722 for (j = 0; j < (1 << BYTEWIDTH); j++)
4723 if (SYNTAX (j) != Sword)
4724 fastmap[j] = 1;
4725 break;
4726#endif /* WCHAR */
4727
4728 case anychar:
4729 {
4730 int fastmap_newline = fastmap['\n'];
4731
4732 /* `.' matches anything ... */
4733 for (j = 0; j < (1 << BYTEWIDTH); j++)
4734 fastmap[j] = 1;
4735
4736 /* ... except perhaps newline. */
4737 if (!(bufp->syntax & RE_DOT_NEWLINE))
4738 fastmap['\n'] = fastmap_newline;
4739
4740 /* Return if we have already set `can_be_null'; if we have,
4741 then the fastmap is irrelevant. Something's wrong here. */
4742 else if (bufp->can_be_null)
4743 goto done;
4744
4745 /* Otherwise, have to check alternative paths. */
4746 break;
4747 }
4748
4749#ifdef emacs
4750 case syntaxspec:
4751 k = *p++;
4752 for (j = 0; j < (1 << BYTEWIDTH); j++)
4753 if (SYNTAX (j) == (enum syntaxcode) k)
4754 fastmap[j] = 1;
4755 break;
4756
4757
4758 case notsyntaxspec:
4759 k = *p++;
4760 for (j = 0; j < (1 << BYTEWIDTH); j++)
4761 if (SYNTAX (j) != (enum syntaxcode) k)
4762 fastmap[j] = 1;
4763 break;
4764
4765
4766 /* All cases after this match the empty string. These end with
4767 `continue'. */
4768
4769
4770 case before_dot:
4771 case at_dot:
4772 case after_dot:
4773 continue;
4774#endif /* emacs */
4775
4776
4777 case no_op:
4778 case begline:
4779 case endline:
4780 case begbuf:
4781 case endbuf:
4782 case wordbound:
4783 case notwordbound:
4784 case wordbeg:
4785 case wordend:
4786 case push_dummy_failure:
4787 continue;
4788
4789
4790 case jump_n:
4791 case pop_failure_jump:
4792 case maybe_pop_jump:
4793 case jump:
4794 case jump_past_alt:
4795 case dummy_failure_jump:
4796 EXTRACT_NUMBER_AND_INCR (j, p);
4797 p += j;
4798 if (j > 0)
4799 continue;
4800
4801 /* Jump backward implies we just went through the body of a
4802 loop and matched nothing. Opcode jumped to should be
4803 `on_failure_jump' or `succeed_n'. Just treat it like an
4804 ordinary jump. For a * loop, it has pushed its failure
4805 point already; if so, discard that as redundant. */
4806 if ((re_opcode_t) *p != on_failure_jump
4807 && (re_opcode_t) *p != succeed_n)
4808 continue;
4809
4810 p++;
4811 EXTRACT_NUMBER_AND_INCR (j, p);
4812 p += j;
4813
4814 /* If what's on the stack is where we are now, pop it. */
4815 if (!FAIL_STACK_EMPTY ()
4816 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
4817 fail_stack.avail--;
4818
4819 continue;
4820
4821
4822 case on_failure_jump:
4823 case on_failure_keep_string_jump:
4824 handle_on_failure_jump:
4825 EXTRACT_NUMBER_AND_INCR (j, p);
4826
4827 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
4828 end of the pattern. We don't want to push such a point,
4829 since when we restore it above, entering the switch will
4830 increment `p' past the end of the pattern. We don't need
4831 to push such a point since we obviously won't find any more
4832 fastmap entries beyond `pend'. Such a pattern can match
4833 the null string, though. */
4834 if (p + j < pend)
4835 {
4836 if (!PUSH_PATTERN_OP (p + j, fail_stack))
4837 {
4838 RESET_FAIL_STACK ();
4839 return -2;
4840 }
4841 }
4842 else
4843 bufp->can_be_null = 1;
4844
4845 if (succeed_n_p)
4846 {
4847 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
4848 succeed_n_p = false;
4849 }
4850
4851 continue;
4852
4853
4854 case succeed_n:
4855 /* Get to the number of times to succeed. */
4856 p += OFFSET_ADDRESS_SIZE;
4857
4858 /* Increment p past the n for when k != 0. */
4859 EXTRACT_NUMBER_AND_INCR (k, p);
4860 if (k == 0)
4861 {
4862 p -= 2 * OFFSET_ADDRESS_SIZE;
4863 succeed_n_p = true; /* Spaghetti code alert. */
4864 goto handle_on_failure_jump;
4865 }
4866 continue;
4867
4868
4869 case set_number_at:
4870 p += 2 * OFFSET_ADDRESS_SIZE;
4871 continue;
4872
4873
4874 case start_memory:
4875 case stop_memory:
4876 p += 2;
4877 continue;
4878
4879
4880 default:
4881 abort (); /* We have listed all the cases. */
4882 } /* switch *p++ */
4883
4884 /* Getting here means we have found the possible starting
4885 characters for one path of the pattern -- and that the empty
4886 string does not match. We need not follow this path further.
4887 Instead, look at the next alternative (remembered on the
4888 stack), or quit if no more. The test at the top of the loop
4889 does these things. */
4890 path_can_be_null = false;
4891 p = pend;
4892 } /* while p */
4893
4894 /* Set `can_be_null' for the last path (also the first path, if the
4895 pattern is empty). */
4896 bufp->can_be_null |= path_can_be_null;
4897
4898 done:
4899 RESET_FAIL_STACK ();
4900 return 0;
4901}
4902
4903#else /* not INSIDE_RECURSION */
4904
/* Public entry point: compute the fastmap for BUFP with the
   wide-character implementation when MBS_SUPPORT is compiled in and
   the current locale is multibyte (MB_CUR_MAX != 1), and with the byte
   implementation otherwise.  Returns whatever the chosen
   implementation returns.  */
int
re_compile_fastmap (struct re_pattern_buffer *bufp)
{
  int status;

# ifdef MBS_SUPPORT
  if (MB_CUR_MAX != 1)
    status = wcs_re_compile_fastmap(bufp);
  else
# endif
    status = byte_re_compile_fastmap(bufp);

  return status;
}
4914libc_hidden_def(re_compile_fastmap)
4915
4916
4917/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4918 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4919 this memory for recording register information. STARTS and ENDS
4920 must be allocated using the malloc library routine, and must each
4921 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4922
4923 If NUM_REGS == 0, then subsequent matches should allocate their own
4924 register data.
4925
4926 Unless this function is called, the first search or match using
4927 PATTERN_BUFFER will allocate its own register data, without
4928 freeing the old data. */
4929
4930void
4931re_set_registers (
4932 struct re_pattern_buffer *bufp,
4933 struct re_registers *regs,
4934 unsigned num_regs,
4935 regoff_t *starts, regoff_t *ends)
4936{
4937 if (num_regs)
4938 {
4939 bufp->regs_allocated = REGS_REALLOCATE;
4940 regs->num_regs = num_regs;
4941 regs->start = starts;
4942 regs->end = ends;
4943 }
4944 else
4945 {
4946 bufp->regs_allocated = REGS_UNALLOCATED;
4947 regs->num_regs = 0;
4948 regs->start = regs->end = (regoff_t *) 0;
4949 }
4950}
4951
4952/* Searching routines. */
4953
/* Single-string convenience wrapper around re_search_2: STRING (SIZE
   bytes) is passed as the second string with an empty first string,
   and SIZE is used as the stop position, so the caller cannot say
   where to stop matching.  */

int
re_search (
    struct re_pattern_buffer *bufp,
    const char *string,
    int size, int startpos, int range,
    struct re_registers *regs)
{
  const char *no_first_string = NULL;
  const int no_first_size = 0;

  return re_search_2 (bufp, no_first_string, no_first_size,
                      string, size, startpos, range, regs, size);
}
4967libc_hidden_def(re_search)
4968
4969
/* Search for the compiled pattern in BUFP->buffer within the virtual
   concatenation of STRING1 and STRING2, trying first at index
   STARTPOS, then at STARTPOS + 1, and so on.

   STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.

   RANGE is how far to scan while trying to match.  RANGE = 0 means try
   only at STARTPOS; in general, the last start tried is STARTPOS +
   RANGE.

   In REGS, return the indices of the virtual concatenation of STRING1
   and STRING2 that matched the entire BUFP->buffer and its contained
   subexpressions.

   Do not consider matching one past the index STOP in the virtual
   concatenation of STRING1 and STRING2.

   Returns the position in the strings at which the match was found,
   -1 if there is no match, or -2 on error (such as failure stack
   overflow).

   The work is done by the wide-character implementation when
   MBS_SUPPORT is compiled in and the locale is multibyte
   (MB_CUR_MAX != 1), and by the byte implementation otherwise.  */

int
re_search_2 (
    struct re_pattern_buffer *bufp,
    const char *string1, int size1,
    const char *string2, int size2,
    int startpos,
    int range,
    struct re_registers *regs,
    int stop)
{
  int result;

# ifdef MBS_SUPPORT
  if (MB_CUR_MAX != 1)
    result = wcs_re_search_2 (bufp, string1, size1, string2, size2,
                              startpos, range, regs, stop);
  else
# endif
    result = byte_re_search_2 (bufp, string1, size1, string2, size2,
                               startpos, range, regs, stop);

  return result;
}
5009libc_hidden_def(re_search_2)
5010
5011#endif /* not INSIDE_RECURSION */
5012
5013#ifdef INSIDE_RECURSION
5014
5015#ifdef MATCH_MAY_ALLOCATE
5016# define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
5017#else
5018# define FREE_VAR(var) free (var); var = NULL
5019#endif
5020
5021#ifdef WCHAR
5022# define MAX_ALLOCA_SIZE 2000
5023
5024# define FREE_WCS_BUFFERS() \
5025 do { \
5026 if (size1 > MAX_ALLOCA_SIZE) \
5027 { \
5028 free (wcs_string1); \
5029 free (mbs_offset1); \
5030 } \
5031 else \
5032 { \
5033 FREE_VAR (wcs_string1); \
5034 FREE_VAR (mbs_offset1); \
5035 } \
5036 if (size2 > MAX_ALLOCA_SIZE) \
5037 { \
5038 free (wcs_string2); \
5039 free (mbs_offset2); \
5040 } \
5041 else \
5042 { \
5043 FREE_VAR (wcs_string2); \
5044 FREE_VAR (mbs_offset2); \
5045 } \
5046 } while (0)
5047
5048#endif
5049
5050
5051static int
5052PREFIX(re_search_2) (
5053 struct re_pattern_buffer *bufp,
5054 const char *string1, int size1,
5055 const char *string2, int size2,
5056 int startpos,
5057 int range,
5058 struct re_registers *regs,
5059 int stop)
5060{
5061 int val;
5062 register char *fastmap = bufp->fastmap;
5063 register RE_TRANSLATE_TYPE translate = bufp->translate;
5064 int total_size = size1 + size2;
5065 int endpos = startpos + range;
5066#ifdef WCHAR
5067 /* We need wchar_t* buffers correspond to cstring1, cstring2. */
5068 wchar_t *wcs_string1 = NULL, *wcs_string2 = NULL;
5069 /* We need the size of wchar_t buffers correspond to csize1, csize2. */
5070 int wcs_size1 = 0, wcs_size2 = 0;
5071 /* offset buffer for optimization. See convert_mbs_to_wc. */
5072 int *mbs_offset1 = NULL, *mbs_offset2 = NULL;
5073 /* They hold whether each wchar_t is binary data or not. */
5074 char *is_binary = NULL;
5075#endif /* WCHAR */
5076
5077 /* Check for out-of-range STARTPOS. */
5078 if (startpos < 0 || startpos > total_size)
5079 return -1;
5080
5081 /* Fix up RANGE if it might eventually take us outside
5082 the virtual concatenation of STRING1 and STRING2.
5083 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
5084 if (endpos < 0)
5085 range = 0 - startpos;
5086 else if (endpos > total_size)
5087 range = total_size - startpos;
5088
5089 /* If the search isn't to be a backwards one, don't waste time in a
5090 search for a pattern that must be anchored. */
5091 if (bufp->used > 0 && range > 0
5092 && ((re_opcode_t) bufp->buffer[0] == begbuf
5093 /* `begline' is like `begbuf' if it cannot match at newlines. */
5094 || ((re_opcode_t) bufp->buffer[0] == begline
5095 && !bufp->newline_anchor)))
5096 {
5097 if (startpos > 0)
5098 return -1;
5099 else
5100 range = 1;
5101 }
5102
5103#ifdef emacs
5104 /* In a forward search for something that starts with \=.
5105 don't keep searching past point. */
5106 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
5107 {
5108 range = PT - startpos;
5109 if (range <= 0)
5110 return -1;
5111 }
5112#endif /* emacs */
5113
5114 /* Update the fastmap now if not correct already. */
5115 if (fastmap && !bufp->fastmap_accurate)
5116 if (re_compile_fastmap (bufp) == -2)
5117 return -2;
5118
5119#ifdef WCHAR
5120 /* Allocate wchar_t array for wcs_string1 and wcs_string2 and
5121 fill them with converted string. */
5122 if (size1 != 0)
5123 {
5124 if (size1 > MAX_ALLOCA_SIZE)
5125 {
5126 wcs_string1 = TALLOC (size1 + 1, CHAR_T);
5127 mbs_offset1 = TALLOC (size1 + 1, int);
5128 is_binary = TALLOC (size1 + 1, char);
5129 }
5130 else
5131 {
5132 wcs_string1 = REGEX_TALLOC (size1 + 1, CHAR_T);
5133 mbs_offset1 = REGEX_TALLOC (size1 + 1, int);
5134 is_binary = REGEX_TALLOC (size1 + 1, char);
5135 }
5136 if (!wcs_string1 || !mbs_offset1 || !is_binary)
5137 {
5138 if (size1 > MAX_ALLOCA_SIZE)
5139 {
5140 free (wcs_string1);
5141 free (mbs_offset1);
5142 free (is_binary);
5143 }
5144 else
5145 {
5146 FREE_VAR (wcs_string1);
5147 FREE_VAR (mbs_offset1);
5148 FREE_VAR (is_binary);
5149 }
5150 return -2;
5151 }
5152 wcs_size1 = convert_mbs_to_wcs(wcs_string1, string1, size1,
5153 mbs_offset1, is_binary);
5154 wcs_string1[wcs_size1] = L'\0'; /* for a sentinel */
5155 if (size1 > MAX_ALLOCA_SIZE)
5156 free (is_binary);
5157 else
5158 FREE_VAR (is_binary);
5159 }
5160 if (size2 != 0)
5161 {
5162 if (size2 > MAX_ALLOCA_SIZE)
5163 {
5164 wcs_string2 = TALLOC (size2 + 1, CHAR_T);
5165 mbs_offset2 = TALLOC (size2 + 1, int);
5166 is_binary = TALLOC (size2 + 1, char);
5167 }
5168 else
5169 {
5170 wcs_string2 = REGEX_TALLOC (size2 + 1, CHAR_T);
5171 mbs_offset2 = REGEX_TALLOC (size2 + 1, int);
5172 is_binary = REGEX_TALLOC (size2 + 1, char);
5173 }
5174 if (!wcs_string2 || !mbs_offset2 || !is_binary)
5175 {
5176 FREE_WCS_BUFFERS ();
5177 if (size2 > MAX_ALLOCA_SIZE)
5178 free (is_binary);
5179 else
5180 FREE_VAR (is_binary);
5181 return -2;
5182 }
5183 wcs_size2 = convert_mbs_to_wcs(wcs_string2, string2, size2,
5184 mbs_offset2, is_binary);
5185 wcs_string2[wcs_size2] = L'\0'; /* for a sentinel */
5186 if (size2 > MAX_ALLOCA_SIZE)
5187 free (is_binary);
5188 else
5189 FREE_VAR (is_binary);
5190 }
5191#endif /* WCHAR */
5192
5193
5194 /* Loop through the string, looking for a place to start matching. */
5195 for (;;)
5196 {
5197 /* If a fastmap is supplied, skip quickly over characters that
5198 cannot be the start of a match. If the pattern can match the
5199 null string, however, we don't need to skip characters; we want
5200 the first null string. */
5201 if (fastmap && startpos < total_size && !bufp->can_be_null)
5202 {
5203 if (range > 0) /* Searching forwards. */
5204 {
5205 register const char *d;
5206 register int lim = 0;
5207 int irange = range;
5208
5209 if (startpos < size1 && startpos + range >= size1)
5210 lim = range - (size1 - startpos);
5211
5212 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
5213
5214 /* Written out as an if-else to avoid testing `translate'
5215 inside the loop. */
5216 if (translate)
5217 while (range > lim
5218 && !fastmap[(unsigned char)
5219 translate[(unsigned char) *d++]])
5220 range--;
5221 else
5222 while (range > lim && !fastmap[(unsigned char) *d++])
5223 range--;
5224
5225 startpos += irange - range;
5226 }
5227 else /* Searching backwards. */
5228 {
5229 register CHAR_T c = (size1 == 0 || startpos >= size1
5230 ? string2[startpos - size1]
5231 : string1[startpos]);
5232
5233 if (!fastmap[(unsigned char) TRANSLATE (c)])
5234 goto advance;
5235 }
5236 }
5237
5238 /* If can't match the null string, and that's all we have left, fail. */
5239 if (range >= 0 && startpos == total_size && fastmap
5240 && !bufp->can_be_null)
5241 {
5242#ifdef WCHAR
5243 FREE_WCS_BUFFERS ();
5244#endif
5245 return -1;
5246 }
5247
5248#ifdef WCHAR
5249 val = wcs_re_match_2_internal (bufp, string1, size1, string2,
5250 size2, startpos, regs, stop,
5251 wcs_string1, wcs_size1,
5252 wcs_string2, wcs_size2,
5253 mbs_offset1, mbs_offset2);
5254#else /* BYTE */
5255 val = byte_re_match_2_internal (bufp, string1, size1, string2,
5256 size2, startpos, regs, stop);
5257#endif /* BYTE */
5258
5259#ifndef REGEX_MALLOC
5260# ifdef C_ALLOCA
5261 alloca (0);
5262# endif
5263#endif
5264
5265 if (val >= 0)
5266 {
5267#ifdef WCHAR
5268 FREE_WCS_BUFFERS ();
5269#endif
5270 return startpos;
5271 }
5272
5273 if (val == -2)
5274 {
5275#ifdef WCHAR
5276 FREE_WCS_BUFFERS ();
5277#endif
5278 return -2;
5279 }
5280
5281 advance:
5282 if (!range)
5283 break;
5284 else if (range > 0)
5285 {
5286 range--;
5287 startpos++;
5288 }
5289 else
5290 {
5291 range++;
5292 startpos--;
5293 }
5294 }
5295#ifdef WCHAR
5296 FREE_WCS_BUFFERS ();
5297#endif
5298 return -1;
5299}
5300
5301#ifdef WCHAR
/* This converts PTR, a pointer into one of the search wchar_t strings
   `string1' and `string2', into a multibyte string offset from the
   beginning of that string.  We use mbs_offset to optimize.
   See convert_mbs_to_wcs.  */
5306# define POINTER_TO_OFFSET(ptr) \
5307 (FIRST_STRING_P (ptr) \
5308 ? ((regoff_t)(mbs_offset1 != NULL? mbs_offset1[(ptr)-string1] : 0)) \
5309 : ((regoff_t)((mbs_offset2 != NULL? mbs_offset2[(ptr)-string2] : 0) \
5310 + csize1)))
5311#else /* BYTE */
5312/* This converts PTR, a pointer into one of the search strings `string1'
5313 and `string2' into an offset from the beginning of that string. */
5314# define POINTER_TO_OFFSET(ptr) \
5315 (FIRST_STRING_P (ptr) \
5316 ? ((regoff_t) ((ptr) - string1)) \
5317 : ((regoff_t) ((ptr) - string2 + size1)))
5318#endif /* WCHAR */
5319
5320/* Macros for dealing with the split strings in re_match_2. */
5321
5322#define MATCHING_IN_FIRST_STRING (dend == end_match_1)
5323
5324/* Call before fetching a character with *d. This switches over to
5325 string2 if necessary. */
5326#define PREFETCH() \
5327 while (d == dend) \
5328 { \
5329 /* End of string2 => fail. */ \
5330 if (dend == end_match_2) \
5331 goto fail; \
5332 /* End of string1 => advance to string2. */ \
5333 d = string2; \
5334 dend = end_match_2; \
5335 }
5336
5337/* Test if at very beginning or at very end of the virtual concatenation
5338 of `string1' and `string2'. If only one string, it's `string2'. */
5339#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5340#define AT_STRINGS_END(d) ((d) == end2)
5341
5342
/* Test if D points to a character which is word-constituent.  We have
   two special cases to check for: if past the end of string1, look at
   the first character in string2; and if before the beginning of
   string2, look at the last character in string1.

   D is evaluated several times, so it must not have side effects.  */
#ifdef WCHAR
/* Use internationalized API instead of SYNTAX: a wide character is
   word-constituent if iswalnum accepts it or if it is L'_'.  */
# define WORDCHAR_P(d) \
  (iswalnum ((wint_t)((d) == end1 ? *string2 \
             : (d) == string2 - 1 ? *(end1 - 1) : *(d))) != 0 \
   || ((d) == end1 ? *string2 \
       : (d) == string2 - 1 ? *(end1 - 1) : *(d)) == L'_')
#else /* BYTE */
/* A byte is word-constituent if its SYNTAX class is Sword.  */
# define WORDCHAR_P(d) \
  (SYNTAX ((d) == end1 ? *string2 \
           : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
   == Sword)
#endif /* WCHAR */
5360
5361/* Disabled due to a compiler bug -- see comment at case wordbound */
5362#if 0
5363/* Test if the character before D and the one at D differ with respect
5364 to being word-constituent. */
5365#define AT_WORD_BOUNDARY(d) \
5366 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
5367 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
5368#endif
5369
/* Free everything we malloc.

   FREE_VARIABLES comes in four variants, selected by MATCH_MAY_ALLOCATE
   and WCHAR:

   - With MATCH_MAY_ALLOCATE, the failure stack and every per-call
     register vector of the matcher are released through
     REGEX_FREE_STACK / FREE_VAR.
   - With WCHAR, the wide-character copies of the subject strings and
     their offset tables (string1, string2, mbs_offset1, mbs_offset2)
     are released as well, but only when `cant_free_wcs_buf' is zero.
   - Without either, the macro expands to a no-op expression.

   The macro refers to local variables of the function that uses it.  */
#ifdef MATCH_MAY_ALLOCATE
# ifdef WCHAR
#  define FREE_VARIABLES() \
  do { \
    REGEX_FREE_STACK (fail_stack.stack); \
    FREE_VAR (regstart); \
    FREE_VAR (regend); \
    FREE_VAR (old_regstart); \
    FREE_VAR (old_regend); \
    FREE_VAR (best_regstart); \
    FREE_VAR (best_regend); \
    FREE_VAR (reg_info); \
    FREE_VAR (reg_dummy); \
    FREE_VAR (reg_info_dummy); \
    if (!cant_free_wcs_buf) \
      { \
        FREE_VAR (string1); \
        FREE_VAR (string2); \
        FREE_VAR (mbs_offset1); \
        FREE_VAR (mbs_offset2); \
      } \
  } while (0)
# else /* BYTE */
#  define FREE_VARIABLES() \
  do { \
    REGEX_FREE_STACK (fail_stack.stack); \
    FREE_VAR (regstart); \
    FREE_VAR (regend); \
    FREE_VAR (old_regstart); \
    FREE_VAR (old_regend); \
    FREE_VAR (best_regstart); \
    FREE_VAR (best_regend); \
    FREE_VAR (reg_info); \
    FREE_VAR (reg_dummy); \
    FREE_VAR (reg_info_dummy); \
  } while (0)
# endif /* WCHAR */
#else
# ifdef WCHAR
#  define FREE_VARIABLES() \
  do { \
    if (!cant_free_wcs_buf) \
      { \
        FREE_VAR (string1); \
        FREE_VAR (string2); \
        FREE_VAR (mbs_offset1); \
        FREE_VAR (mbs_offset2); \
      } \
  } while (0)
# else /* BYTE */
#  define FREE_VARIABLES() ((void)0) /* Do nothing!  But inhibit gcc warning. */
# endif /* WCHAR */
#endif /* not MATCH_MAY_ALLOCATE */
5424
/* Sentinel values meaning "no register is currently active"; the
   matcher initializes `highest_active_reg' and `lowest_active_reg' to
   them.

   These values must meet several constraints.  They must not be valid
   register values; since we have a limit of 255 registers (because
   we use only one byte in the pattern for the register number), we can
   use numbers larger than 255.  They must differ by 1, because of
   NUM_FAILURE_ITEMS above.  And the value for the lowest register must
   be larger than the value for the highest register, so we do not try
   to actually save any registers when none are active.  */
#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
5434
5435#else /* not INSIDE_RECURSION */
5436/* Matching routines. */
5437
5438#ifndef emacs /* Emacs never uses this. */
/* re_match is the single-string form of re_match_2: the compiled
   pattern in BUFP is matched against STRING (SIZE bytes long) starting
   at POS.  STRING is handed to the internal matcher as the second
   string with an empty first string, and SIZE is used as the stop
   limit.  The internal matcher's result is returned unchanged.

   When MBS_SUPPORT is compiled in, the wide-character matcher is used
   unless MB_CUR_MAX is 1; otherwise the byte matcher always runs.  */

int
re_match (struct re_pattern_buffer *bufp, const char *string,
          int size, int pos, struct re_registers *regs)
{
  int match_len;

# ifdef MBS_SUPPORT
  if (MB_CUR_MAX == 1)
# endif
    match_len = byte_re_match_2_internal (bufp, NULL, 0, string, size,
                                          pos, regs, size);
# ifdef MBS_SUPPORT
  else
    match_len = wcs_re_match_2_internal (bufp, NULL, 0, string, size,
                                         pos, regs, size,
                                         NULL, 0, NULL, 0, NULL, NULL);
# endif

  /* With the C emulation of alloca, force a cleanup now that the
     internal matcher has returned.  */
# if !defined REGEX_MALLOC && defined C_ALLOCA
  alloca (0);
# endif

  return match_len;
}
5465#endif /* not emacs */
5466
5467#endif /* not INSIDE_RECURSION */
5468
5469#ifdef INSIDE_RECURSION
/* Forward declarations of static helpers, emitted once per inclusion
   of this file with INSIDE_RECURSION defined.  Each name is built with
   PREFIX() and the character types are CHAR_T / UCHAR_T.
   NOTE(review): PREFIX() presumably yields the byte_/wcs_ name prefix
   seen on the *_re_match_2_internal functions -- confirm against its
   definition.  */
static boolean PREFIX(group_match_null_string_p) (UCHAR_T **p,
                                                  UCHAR_T *end,
                                                  PREFIX(register_info_type) *reg_info);
static boolean PREFIX(alt_match_null_string_p) (UCHAR_T *p,
                                                UCHAR_T *end,
                                                PREFIX(register_info_type) *reg_info);
static boolean PREFIX(common_op_match_null_string_p) (UCHAR_T **p,
                                                      UCHAR_T *end,
                                                      PREFIX(register_info_type) *reg_info);
static int PREFIX(bcmp_translate) (const CHAR_T *s1, const CHAR_T *s2,
                                   int len, char *translate);
5481#else /* not INSIDE_RECURSION */
5482
/* re_match_2 matches the compiled pattern in BUFP against the
   (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
   and SIZE2, respectively).  Matching starts at POS and stops at STOP.

   If REGS is non-null and the `no_sub' field of BUFP is zero, the
   offsets of the substring each group matched are stored in REGS.  See
   the documentation for exactly how many groups we fill.

   The return value is -1 if there is no match, -2 on an internal error
   (such as the failure stack overflowing), and otherwise the length of
   the matched substring.

   When MBS_SUPPORT is compiled in, the wide-character matcher is used
   unless MB_CUR_MAX is 1; otherwise the byte matcher always runs.  */

int
re_match_2 (struct re_pattern_buffer *bufp,
            const char *string1, int size1,
            const char *string2, int size2,
            int pos, struct re_registers *regs, int stop)
{
  int match_len;

# ifdef MBS_SUPPORT
  if (MB_CUR_MAX == 1)
# endif
    match_len = byte_re_match_2_internal (bufp, string1, size1,
                                          string2, size2,
                                          pos, regs, stop);
# ifdef MBS_SUPPORT
  else
    match_len = wcs_re_match_2_internal (bufp, string1, size1,
                                         string2, size2,
                                         pos, regs, stop,
                                         NULL, 0, NULL, 0, NULL, NULL);
# endif

  /* With the C emulation of alloca, force a cleanup now that the
     internal matcher has returned.  */
#if !defined REGEX_MALLOC && defined C_ALLOCA
  alloca (0);
#endif

  return match_len;
}
5523
5524#endif /* not INSIDE_RECURSION */
5525
5526#ifdef INSIDE_RECURSION
5527
5528#ifdef WCHAR
5529static int count_mbs_length (int *, int);
5530
/* Count how many wchar_t characters the first LENGTH bytes of a
   multibyte string occupy.  OFFSET_BUFFER is the offset table that
   corresponds to that multibyte string (see convert_mbs_to_wcs); the
   function looks for the index I with OFFSET_BUFFER[I] == LENGTH.

   Returns:
     -1      if LENGTH is negative, or no index in the table maps
             exactly to LENGTH;
      0      if OFFSET_BUFFER is NULL;
      I      otherwise (I == LENGTH when the prefix contains no
             multibyte character).

   OFFSET_BUFFER[LENGTH] is read unconditionally, so the table must
   have at least LENGTH + 1 readable entries.  */

static int
count_mbs_length(
  int *offset_buffer,
  int length)
{
  int lower, upper;

  /* Check whether the size is valid.  */
  if (length < 0)
    return -1;

  if (offset_buffer == NULL)
    return 0;

  /* If there are no multibyte characters, offset_buffer[i] == i.
     Optimize for this case.  */
  if (offset_buffer[length] == length)
    return length;

  /* Binary search strictly between LOWER and UPPER.  LENGTH is a valid
     upper bound because offset_buffer[i] >= i for all i.  */
  lower = 0;
  upper = length;

  while (upper - lower > 1)
    {
      /* Computed this way so that the sum cannot overflow `int' for
         large LENGTH, unlike (lower + upper) / 2.  */
      int middle = lower + (upper - lower) / 2;

      if (offset_buffer[middle] > length)
        upper = middle;
      else if (offset_buffer[middle] < length)
        lower = middle;
      else
        return middle;
    }

  /* LENGTH does not fall on a character boundary.  */
  return -1;
}
5574#endif /* WCHAR */
5575
5576/* This is a separate function so that we can force an alloca cleanup
5577 afterwards. */
5578#ifdef WCHAR
5579static int
5580wcs_re_match_2_internal (
5581 struct re_pattern_buffer *bufp,
5582 const char *cstring1, int csize1,
5583 const char *cstring2, int csize2,
5584 int pos,
5585 struct re_registers *regs,
5586 int stop,
5587 /* string1 == string2 == NULL means string1/2, size1/2 and
5588 mbs_offset1/2 need seting up in this function. */
5589 /* We need wchar_t* buffers correspond to cstring1, cstring2. */
5590 /* We need the size of wchar_t buffers correspond to csize1, csize2. */
5591 wchar_t *string1, int size1,
5592 wchar_t *string2, int size2,
5593 /* offset buffer for optimization. See convert_mbs_to_wc. */
5594 int *mbs_offset1, int *mbs_offset2)
5595#else /* BYTE */
5596static int
5597byte_re_match_2_internal (
5598 struct re_pattern_buffer *bufp,
5599 const char *string1, int size1,
5600 const char *string2, int size2,
5601 int pos,
5602 struct re_registers *regs,
5603 int stop)
5604#endif /* BYTE */
5605{
5606 /* General temporaries. */
5607 int mcnt;
5608 UCHAR_T *p1;
5609#ifdef WCHAR
5610 /* They hold whether each wchar_t is binary data or not. */
5611 char *is_binary = NULL;
5612 /* If true, we can't free string1/2, mbs_offset1/2. */
5613 int cant_free_wcs_buf = 1;
5614#endif /* WCHAR */
5615
5616 /* Just past the end of the corresponding string. */
5617 const CHAR_T *end1, *end2;
5618
5619 /* Pointers into string1 and string2, just past the last characters in
5620 each to consider matching. */
5621 const CHAR_T *end_match_1, *end_match_2;
5622
5623 /* Where we are in the data, and the end of the current string. */
5624 const CHAR_T *d, *dend;
5625
5626 /* Where we are in the pattern, and the end of the pattern. */
5627#ifdef WCHAR
5628 UCHAR_T *pattern, *p;
5629 register UCHAR_T *pend;
5630#else /* BYTE */
5631 UCHAR_T *p = bufp->buffer;
5632 register UCHAR_T *pend = p + bufp->used;
5633#endif /* WCHAR */
5634
5635 /* Mark the opcode just after a start_memory, so we can test for an
5636 empty subpattern when we get to the stop_memory. */
5637 UCHAR_T *just_past_start_mem = 0;
5638
5639 /* We use this to map every character in the string. */
5640 RE_TRANSLATE_TYPE translate = bufp->translate;
5641
5642 /* Failure point stack. Each place that can handle a failure further
5643 down the line pushes a failure point on this stack. It consists of
5644 restart, regend, and reg_info for all registers corresponding to
5645 the subexpressions we're currently inside, plus the number of such
5646 registers, and, finally, two char *'s. The first char * is where
5647 to resume scanning the pattern; the second one is where to resume
5648 scanning the strings. If the latter is zero, the failure point is
5649 a ``dummy''; if a failure happens and the failure point is a dummy,
5650 it gets discarded and the next next one is tried. */
5651#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5652 PREFIX(fail_stack_type) fail_stack;
5653#endif
5654#ifdef DEBUG
5655 static unsigned failure_id;
5656 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5657#endif
5658
5659#ifdef REL_ALLOC
5660 /* This holds the pointer to the failure stack, when
5661 it is allocated relocatably. */
5662 fail_stack_elt_t *failure_stack_ptr;
5663#endif
5664
5665 /* We fill all the registers internally, independent of what we
5666 return, for use in backreferences. The number here includes
5667 an element for register zero. */
5668 size_t num_regs = bufp->re_nsub + 1;
5669
5670 /* The currently active registers. */
5671 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5672 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5673
5674 /* Information on the contents of registers. These are pointers into
5675 the input strings; they record just what was matched (on this
5676 attempt) by a subexpression part of the pattern, that is, the
5677 regnum-th regstart pointer points to where in the pattern we began
5678 matching and the regnum-th regend points to right after where we
5679 stopped matching the regnum-th subexpression. (The zeroth register
5680 keeps track of what the whole pattern matches.) */
5681#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5682 const CHAR_T **regstart, **regend;
5683#endif
5684
5685 /* If a group that's operated upon by a repetition operator fails to
5686 match anything, then the register for its start will need to be
5687 restored because it will have been set to wherever in the string we
5688 are when we last see its open-group operator. Similarly for a
5689 register's end. */
5690#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5691 const CHAR_T **old_regstart, **old_regend;
5692#endif
5693
5694 /* The is_active field of reg_info helps us keep track of which (possibly
5695 nested) subexpressions we are currently in. The matched_something
5696 field of reg_info[reg_num] helps us tell whether or not we have
5697 matched any of the pattern so far this time through the reg_num-th
5698 subexpression. These two fields get reset each time through any
5699 loop their register is in. */
5700#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5701 PREFIX(register_info_type) *reg_info;
5702#endif
5703
5704 /* The following record the register info as found in the above
5705 variables when we find a match better than any we've seen before.
5706 This happens as we backtrack through the failure points, which in
5707 turn happens only if we have not yet matched the entire string. */
5708 unsigned best_regs_set = false;
5709#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5710 const CHAR_T **best_regstart, **best_regend;
5711#endif
5712
5713 /* Logically, this is `best_regend[0]'. But we don't want to have to
5714 allocate space for that if we're not allocating space for anything
5715 else (see below). Also, we never need info about register 0 for
5716 any of the other register vectors, and it seems rather a kludge to
5717 treat `best_regend' differently than the rest. So we keep track of
5718 the end of the best match so far in a separate variable. We
5719 initialize this to NULL so that when we backtrack the first time
5720 and need to test it, it's not garbage. */
5721 const CHAR_T *match_end = NULL;
5722
5723 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
5724 int set_regs_matched_done = 0;
5725
5726 /* Used when we pop values we don't care about. */
5727#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5728 const CHAR_T **reg_dummy;
5729 PREFIX(register_info_type) *reg_info_dummy;
5730#endif
5731
5732#ifdef DEBUG
5733 /* Counts the total number of registers pushed. */
5734 unsigned num_regs_pushed = 0;
5735#endif
5736
5737 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5738
5739 INIT_FAIL_STACK ();
5740
5741#ifdef MATCH_MAY_ALLOCATE
5742 /* Do not bother to initialize all the register variables if there are
5743 no groups in the pattern, as it takes a fair amount of time. If
5744 there are groups, we include space for register 0 (the whole
5745 pattern), even though we never use it, since it simplifies the
5746 array indexing. We should fix this. */
5747 if (bufp->re_nsub)
5748 {
5749 regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5750 regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5751 old_regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5752 old_regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5753 best_regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5754 best_regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5755 reg_info = REGEX_TALLOC (num_regs, PREFIX(register_info_type));
5756 reg_dummy = REGEX_TALLOC (num_regs, const CHAR_T *);
5757 reg_info_dummy = REGEX_TALLOC (num_regs, PREFIX(register_info_type));
5758
5759 if (!(regstart && regend && old_regstart && old_regend && reg_info
5760 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
5761 {
5762 FREE_VARIABLES ();
5763 return -2;
5764 }
5765 }
5766 else
5767 {
5768 /* We must initialize all our variables to NULL, so that
5769 `FREE_VARIABLES' doesn't try to free them. */
5770 regstart = regend = old_regstart = old_regend = best_regstart
5771 = best_regend = reg_dummy = NULL;
5772 reg_info = reg_info_dummy = (PREFIX(register_info_type) *) NULL;
5773 }
5774#endif /* MATCH_MAY_ALLOCATE */
5775
5776 /* The starting position is bogus. */
5777#ifdef WCHAR
5778 if (pos < 0 || pos > csize1 + csize2)
5779#else /* BYTE */
5780 if (pos < 0 || pos > size1 + size2)
5781#endif
5782 {
5783 FREE_VARIABLES ();
5784 return -1;
5785 }
5786
5787#ifdef WCHAR
5788 /* Allocate wchar_t array for string1 and string2 and
5789 fill them with converted string. */
5790 if (string1 == NULL && string2 == NULL)
5791 {
5792 /* We need seting up buffers here. */
5793
5794 /* We must free wcs buffers in this function. */
5795 cant_free_wcs_buf = 0;
5796
5797 if (csize1 != 0)
5798 {
5799 string1 = REGEX_TALLOC (csize1 + 1, CHAR_T);
5800 mbs_offset1 = REGEX_TALLOC (csize1 + 1, int);
5801 is_binary = REGEX_TALLOC (csize1 + 1, char);
5802 if (!string1 || !mbs_offset1 || !is_binary)
5803 {
5804 FREE_VAR (string1);
5805 FREE_VAR (mbs_offset1);
5806 FREE_VAR (is_binary);
5807 return -2;
5808 }
5809 }
5810 if (csize2 != 0)
5811 {
5812 string2 = REGEX_TALLOC (csize2 + 1, CHAR_T);
5813 mbs_offset2 = REGEX_TALLOC (csize2 + 1, int);
5814 is_binary = REGEX_TALLOC (csize2 + 1, char);
5815 if (!string2 || !mbs_offset2 || !is_binary)
5816 {
5817 FREE_VAR (string1);
5818 FREE_VAR (mbs_offset1);
5819 FREE_VAR (string2);
5820 FREE_VAR (mbs_offset2);
5821 FREE_VAR (is_binary);
5822 return -2;
5823 }
5824 size2 = convert_mbs_to_wcs(string2, cstring2, csize2,
5825 mbs_offset2, is_binary);
5826 string2[size2] = L'\0'; /* for a sentinel */
5827 FREE_VAR (is_binary);
5828 }
5829 }
5830
5831 /* We need to cast pattern to (wchar_t*), because we casted this compiled
5832 pattern to (char*) in regex_compile. */
5833 p = pattern = (CHAR_T*)bufp->buffer;
5834 pend = (CHAR_T*)(bufp->buffer + bufp->used);
5835
5836#endif /* WCHAR */
5837
5838 /* Initialize subexpression text positions to -1 to mark ones that no
5839 start_memory/stop_memory has been seen for. Also initialize the
5840 register information struct. */
5841 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5842 {
5843 regstart[mcnt] = regend[mcnt]
5844 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
5845
5846 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
5847 IS_ACTIVE (reg_info[mcnt]) = 0;
5848 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
5849 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
5850 }
5851
5852 /* We move `string1' into `string2' if the latter's empty -- but not if
5853 `string1' is null. */
5854 if (size2 == 0 && string1 != NULL)
5855 {
5856 string2 = string1;
5857 size2 = size1;
5858 string1 = 0;
5859 size1 = 0;
5860#ifdef WCHAR
5861 mbs_offset2 = mbs_offset1;
5862 csize2 = csize1;
5863 mbs_offset1 = NULL;
5864 csize1 = 0;
5865#endif
5866 }
5867 end1 = string1 + size1;
5868 end2 = string2 + size2;
5869
5870 /* Compute where to stop matching, within the two strings. */
5871#ifdef WCHAR
5872 if (stop <= csize1)
5873 {
5874 mcnt = count_mbs_length(mbs_offset1, stop);
5875 end_match_1 = string1 + mcnt;
5876 end_match_2 = string2;
5877 }
5878 else
5879 {
5880 if (stop > csize1 + csize2)
5881 stop = csize1 + csize2;
5882 end_match_1 = end1;
5883 mcnt = count_mbs_length(mbs_offset2, stop-csize1);
5884 end_match_2 = string2 + mcnt;
5885 }
5886 if (mcnt < 0)
5887 { /* count_mbs_length return error. */
5888 FREE_VARIABLES ();
5889 return -1;
5890 }
5891#else
5892 if (stop <= size1)
5893 {
5894 end_match_1 = string1 + stop;
5895 end_match_2 = string2;
5896 }
5897 else
5898 {
5899 end_match_1 = end1;
5900 end_match_2 = string2 + stop - size1;
5901 }
5902#endif /* WCHAR */
5903
5904 /* `p' scans through the pattern as `d' scans through the data.
5905 `dend' is the end of the input string that `d' points within. `d'
5906 is advanced into the following input string whenever necessary, but
5907 this happens before fetching; therefore, at the beginning of the
5908 loop, `d' can be pointing at the end of a string, but it cannot
5909 equal `string2'. */
5910#ifdef WCHAR
5911 if (size1 > 0 && pos <= csize1)
5912 {
5913 mcnt = count_mbs_length(mbs_offset1, pos);
5914 d = string1 + mcnt;
5915 dend = end_match_1;
5916 }
5917 else
5918 {
5919 mcnt = count_mbs_length(mbs_offset2, pos-csize1);
5920 d = string2 + mcnt;
5921 dend = end_match_2;
5922 }
5923
5924 if (mcnt < 0)
5925 { /* count_mbs_length return error. */
5926 FREE_VARIABLES ();
5927 return -1;
5928 }
5929#else
5930 if (size1 > 0 && pos <= size1)
5931 {
5932 d = string1 + pos;
5933 dend = end_match_1;
5934 }
5935 else
5936 {
5937 d = string2 + pos - size1;
5938 dend = end_match_2;
5939 }
5940#endif /* WCHAR */
5941
5942 DEBUG_PRINT1 ("The compiled pattern is:\n");
5943 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5944 DEBUG_PRINT1 ("The string to match is: `");
5945 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5946 DEBUG_PRINT1 ("'\n");
5947
5948 /* This loops over pattern commands. It exits by returning from the
5949 function if the match is complete, or it drops through if the match
5950 fails at this starting point in the input data. */
5951 for (;;)
5952 {
5953#ifdef _LIBC
5954 DEBUG_PRINT2 ("\n%p: ", p);
5955#else
5956 DEBUG_PRINT2 ("\n0x%x: ", p);
5957#endif
5958
5959 if (p == pend)
5960 { /* End of pattern means we might have succeeded. */
5961 DEBUG_PRINT1 ("end of pattern ... ");
5962
5963 /* If we haven't matched the entire string, and we want the
5964 longest match, try backtracking. */
5965 if (d != end_match_2)
5966 {
5967 /* 1 if this match ends in the same string (string1 or string2)
5968 as the best previous match. */
5969 boolean same_str_p = (FIRST_STRING_P (match_end)
5970 == MATCHING_IN_FIRST_STRING);
5971 /* 1 if this match is the best seen so far. */
5972 boolean best_match_p;
5973
5974 /* AIX compiler got confused when this was combined
5975 with the previous declaration. */
5976 if (same_str_p)
5977 best_match_p = d > match_end;
5978 else
5979 best_match_p = !MATCHING_IN_FIRST_STRING;
5980
5981 DEBUG_PRINT1 ("backtracking.\n");
5982
5983 if (!FAIL_STACK_EMPTY ())
5984 { /* More failure points to try. */
5985
5986 /* If exceeds best match so far, save it. */
5987 if (!best_regs_set || best_match_p)
5988 {
5989 best_regs_set = true;
5990 match_end = d;
5991
5992 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5993
5994 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5995 {
5996 best_regstart[mcnt] = regstart[mcnt];
5997 best_regend[mcnt] = regend[mcnt];
5998 }
5999 }
6000 goto fail;
6001 }
6002
6003 /* If no failure points, don't restore garbage. And if
6004 last match is real best match, don't restore second
6005 best one. */
6006 else if (best_regs_set && !best_match_p)
6007 {
6008 restore_best_regs:
6009 /* Restore best match. It may happen that `dend ==
6010 end_match_1' while the restored d is in string2.
6011 For example, the pattern `x.*y.*z' against the
6012 strings `x-' and `y-z-', if the two strings are
6013 not consecutive in memory. */
6014 DEBUG_PRINT1 ("Restoring best registers.\n");
6015
6016 d = match_end;
6017 dend = ((d >= string1 && d <= end1)
6018 ? end_match_1 : end_match_2);
6019
6020 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
6021 {
6022 regstart[mcnt] = best_regstart[mcnt];
6023 regend[mcnt] = best_regend[mcnt];
6024 }
6025 }
6026 } /* d != end_match_2 */
6027
6028 succeed_label:
6029 DEBUG_PRINT1 ("Accepting match.\n");
6030 /* If caller wants register contents data back, do it. */
6031 if (regs && !bufp->no_sub)
6032 {
6033 /* Have the register data arrays been allocated? */
6034 if (bufp->regs_allocated == REGS_UNALLOCATED)
6035 { /* No. So allocate them with malloc. We need one
6036 extra element beyond `num_regs' for the `-1' marker
6037 GNU code uses. */
6038/* regex specs say:
6039 * "If REGS_UNALLOCATED, allocate space in the regs structure
6040 * for max(RE_NREGS, re_nsub + 1) groups"
6041 * but real-world testsuites fail with contrived examples
6042 * with lots of groups.
6043 * I don't see why we can't just allocate exact needed number.
6044 * Incidentally, it makes RE_NREGS unused.
6045 *
6046 * regs->num_regs = MAX (RE_NREGS, num_regs + 1); - VERY WRONG
6047 * regs->num_regs = MIN (RE_NREGS, num_regs + 1); - slightly less wrong
6048 * good one which passes uclibc test/regex/tst-regex2.c:
6049 */
6050 regs->num_regs = num_regs + 1;
6051 regs->start = TALLOC (regs->num_regs, regoff_t);
6052 regs->end = TALLOC (regs->num_regs, regoff_t);
6053 if (regs->start == NULL || regs->end == NULL)
6054 {
6055 FREE_VARIABLES ();
6056 return -2;
6057 }
6058 bufp->regs_allocated = REGS_REALLOCATE;
6059 }
6060 else if (bufp->regs_allocated == REGS_REALLOCATE)
6061 { /* Yes. If we need more elements than were already
6062 allocated, reallocate them. If we need fewer, just
6063 leave it alone. */
6064 if (regs->num_regs < num_regs + 1)
6065 {
6066 regs->num_regs = num_regs + 1;
6067 RETALLOC (regs->start, regs->num_regs, regoff_t);
6068 RETALLOC (regs->end, regs->num_regs, regoff_t);
6069 if (regs->start == NULL || regs->end == NULL)
6070 {
6071 FREE_VARIABLES ();
6072 return -2;
6073 }
6074 }
6075 }
6076 else
6077 {
6078 /* These braces fend off a "empty body in an else-statement"
6079 warning under GCC when assert expands to nothing. */
6080 assert (bufp->regs_allocated == REGS_FIXED);
6081 }
6082
6083 /* Convert the pointer data in `regstart' and `regend' to
6084 indices. Register zero has to be set differently,
6085 since we haven't kept track of any info for it. */
6086 if (regs->num_regs > 0)
6087 {
6088 regs->start[0] = pos;
6089#ifdef WCHAR
6090 if (MATCHING_IN_FIRST_STRING)
6091 regs->end[0] = mbs_offset1 != NULL ?
6092 mbs_offset1[d-string1] : 0;
6093 else
6094 regs->end[0] = csize1 + (mbs_offset2 != NULL ?
6095 mbs_offset2[d-string2] : 0);
6096#else
6097 regs->end[0] = (MATCHING_IN_FIRST_STRING
6098 ? ((regoff_t) (d - string1))
6099 : ((regoff_t) (d - string2 + size1)));
6100#endif /* WCHAR */
6101 }
6102
6103 /* Go through the first `min (num_regs, regs->num_regs)'
6104 registers, since that is all we initialized. */
6105 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
6106 mcnt++)
6107 {
6108 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
6109 regs->start[mcnt] = regs->end[mcnt] = -1;
6110 else
6111 {
6112 regs->start[mcnt]
6113 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
6114 regs->end[mcnt]
6115 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
6116 }
6117 }
6118
6119 /* If the regs structure we return has more elements than
6120 were in the pattern, set the extra elements to -1. If
6121 we (re)allocated the registers, this is the case,
6122 because we always allocate enough to have at least one
6123 -1 at the end. */
6124 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
6125 regs->start[mcnt] = regs->end[mcnt] = -1;
6126 } /* regs && !bufp->no_sub */
6127
6128 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
6129 nfailure_points_pushed, nfailure_points_popped,
6130 nfailure_points_pushed - nfailure_points_popped);
6131 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
6132
6133#ifdef WCHAR
6134 if (MATCHING_IN_FIRST_STRING)
6135 mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0;
6136 else
6137 mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) +
6138 csize1;
6139 mcnt -= pos;
6140#else
6141 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
6142 ? string1
6143 : string2 - size1);
6144#endif /* WCHAR */
6145
6146 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
6147
6148 FREE_VARIABLES ();
6149 return mcnt;
6150 }
6151
6152 /* Otherwise match next pattern command. */
6153 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
6154 {
6155 /* Ignore these. Used to ignore the n of succeed_n's which
6156 currently have n == 0. */
6157 case no_op:
6158 DEBUG_PRINT1 ("EXECUTING no_op.\n");
6159 break;
6160
6161 case succeed:
6162 DEBUG_PRINT1 ("EXECUTING succeed.\n");
6163 goto succeed_label;
6164
6165 /* Match the next n pattern characters exactly. The following
6166 byte in the pattern defines n, and the n bytes after that
6167 are the characters to match. */
6168 case exactn:
6169#ifdef MBS_SUPPORT
6170 case exactn_bin:
6171#endif
6172 mcnt = *p++;
6173 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
6174
6175 /* This is written out as an if-else so we don't waste time
6176 testing `translate' inside the loop. */
6177 if (translate)
6178 {
6179 do
6180 {
6181 PREFETCH ();
6182#ifdef WCHAR
6183 if (*d <= 0xff)
6184 {
6185 if ((UCHAR_T) translate[(unsigned char) *d++]
6186 != (UCHAR_T) *p++)
6187 goto fail;
6188 }
6189 else
6190 {
6191 if (*d++ != (CHAR_T) *p++)
6192 goto fail;
6193 }
6194#else
6195 if ((UCHAR_T) translate[(unsigned char) *d++]
6196 != (UCHAR_T) *p++)
6197 goto fail;
6198#endif /* WCHAR */
6199 }
6200 while (--mcnt);
6201 }
6202 else
6203 {
6204 do
6205 {
6206 PREFETCH ();
6207 if (*d++ != (CHAR_T) *p++) goto fail;
6208 }
6209 while (--mcnt);
6210 }
6211 SET_REGS_MATCHED ();
6212 break;
6213
6214
6215 /* Match any character except possibly a newline or a null. */
6216 case anychar:
6217 DEBUG_PRINT1 ("EXECUTING anychar.\n");
6218
6219 PREFETCH ();
6220
6221 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
6222 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
6223 goto fail;
6224
6225 SET_REGS_MATCHED ();
6226 DEBUG_PRINT2 (" Matched `%ld'.\n", (long int) *d);
6227 d++;
6228 break;
6229
6230
6231 case charset:
6232 case charset_not:
6233 {
6234 register UCHAR_T c;
6235#ifdef WCHAR
6236 unsigned int i, char_class_length, coll_symbol_length,
6237 equiv_class_length, ranges_length, chars_length, length;
6238 CHAR_T *workp, *workp2, *charset_top;
6239#define WORK_BUFFER_SIZE 128
6240 CHAR_T str_buf[WORK_BUFFER_SIZE];
6241# ifdef _LIBC
6242 uint32_t nrules;
6243# endif /* _LIBC */
6244#endif /* WCHAR */
6245 boolean not = (re_opcode_t) *(p - 1) == charset_not;
6246
6247 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
6248 PREFETCH ();
6249 c = TRANSLATE (*d); /* The character to match. */
6250#ifdef WCHAR
6251# ifdef _LIBC
6252 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
6253# endif /* _LIBC */
6254 charset_top = p - 1;
6255 char_class_length = *p++;
6256 coll_symbol_length = *p++;
6257 equiv_class_length = *p++;
6258 ranges_length = *p++;
6259 chars_length = *p++;
6260 /* p points charset[6], so the address of the next instruction
6261 (charset[l+m+n+2o+k+p']) equals p[l+m+n+2*o+p'],
6262 where l=length of char_classes, m=length of collating_symbol,
6263 n=equivalence_class, o=length of char_range,
6264 p'=length of character. */
6265 workp = p;
6266 /* Update p to indicate the next instruction. */
6267 p += char_class_length + coll_symbol_length+ equiv_class_length +
6268 2*ranges_length + chars_length;
6269
6270 /* match with char_class? */
6271 for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE)
6272 {
6273 wctype_t wctype;
6274 uintptr_t alignedp = ((uintptr_t)workp
6275 + __alignof__(wctype_t) - 1)
6276 & ~(uintptr_t)(__alignof__(wctype_t) - 1);
6277 wctype = *((wctype_t*)alignedp);
6278 workp += CHAR_CLASS_SIZE;
6279# ifdef _LIBC
6280 if (__iswctype((wint_t)c, wctype))
6281 goto char_set_matched;
6282# else
6283 if (iswctype((wint_t)c, wctype))
6284 goto char_set_matched;
6285# endif
6286 }
6287
6288 /* match with collating_symbol? */
6289# ifdef _LIBC
6290 if (nrules != 0)
6291 {
6292 const unsigned char *extra = (const unsigned char *)
6293 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
6294
6295 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;
6296 workp++)
6297 {
6298 int32_t *wextra;
6299 wextra = (int32_t*)(extra + *workp++);
6300 for (i = 0; i < *wextra; ++i)
6301 if (TRANSLATE(d[i]) != wextra[1 + i])
6302 break;
6303
6304 if (i == *wextra)
6305 {
6306 /* Update d, however d will be incremented at
6307 char_set_matched:, we decrement d here. */
6308 d += i - 1;
6309 goto char_set_matched;
6310 }
6311 }
6312 }
6313 else /* (nrules == 0) */
6314# endif
6315 /* If we can't look up collation data, we use wcscoll
6316 instead. */
6317 {
6318 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;)
6319 {
6320 const CHAR_T *backup_d = d, *backup_dend = dend;
6321# ifdef _LIBC
6322 length = __wcslen (workp);
6323# else
6324 length = wcslen (workp);
6325# endif
6326
6327 /* If wcscoll(the collating symbol, whole string) > 0,
6328 any substring of the string never match with the
6329 collating symbol. */
6330# ifdef _LIBC
6331 if (__wcscoll (workp, d) > 0)
6332# else
6333 if (wcscoll (workp, d) > 0)
6334# endif
6335 {
6336 workp += length + 1;
6337 continue;
6338 }
6339
6340 /* First, we compare the collating symbol with
6341 the first character of the string.
6342 If it don't match, we add the next character to
6343 the compare buffer in turn. */
6344 for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++)
6345 {
6346 int match;
6347 if (d == dend)
6348 {
6349 if (dend == end_match_2)
6350 break;
6351 d = string2;
6352 dend = end_match_2;
6353 }
6354
6355 /* add next character to the compare buffer. */
6356 str_buf[i] = TRANSLATE(*d);
6357 str_buf[i+1] = '\0';
6358
6359# ifdef _LIBC
6360 match = __wcscoll (workp, str_buf);
6361# else
6362 match = wcscoll (workp, str_buf);
6363# endif
6364 if (match == 0)
6365 goto char_set_matched;
6366
6367 if (match < 0)
6368 /* (str_buf > workp) implies (str_buf + X > workp),
6369 because for all X (str_buf + X > str_buf).
6370 So we don't need to continue this loop. */
6371 break;
6372
6373 /* Otherwise (str_buf < workp),
6374 (str_buf + next_character) may equal (workp).
6375 So we continue this loop. */
6376 }
6377 /* not matched */
6378 d = backup_d;
6379 dend = backup_dend;
6380 workp += length + 1;
6381 }
6382 }
6383 /* match with equivalence_class? */
6384# ifdef _LIBC
6385 if (nrules != 0)
6386 {
6387 const CHAR_T *backup_d = d, *backup_dend = dend;
6388 /* Try to match the equivalence class against
6389 those known to the collate implementation. */
6390 const int32_t *table;
6391 const int32_t *weights;
6392 const int32_t *extra;
6393 const int32_t *indirect;
6394 int32_t idx, idx2;
6395 wint_t *cp;
6396 size_t len;
6397
6398 /* This #include defines a local function! */
6399# include <locale/weightwc.h>
6400
6401 table = (const int32_t *)
6402 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEWC);
6403 weights = (const wint_t *)
6404 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTWC);
6405 extra = (const wint_t *)
6406 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAWC);
6407 indirect = (const int32_t *)
6408 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTWC);
6409
6410 /* Write 1 collating element to str_buf, and
6411 get its index. */
6412 idx2 = 0;
6413
6414 for (i = 0 ; idx2 == 0 && i < WORK_BUFFER_SIZE - 1; i++)
6415 {
6416 cp = (wint_t*)str_buf;
6417 if (d == dend)
6418 {
6419 if (dend == end_match_2)
6420 break;
6421 d = string2;
6422 dend = end_match_2;
6423 }
6424 str_buf[i] = TRANSLATE(*(d+i));
6425 str_buf[i+1] = '\0'; /* sentinel */
6426 idx2 = findidx ((const wint_t**)&cp);
6427 }
6428
6429 /* Advance d over the collating element, minus one, because d is
6430 incremented once more after char_set_matched:. */
6431 d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1);
6432 if (d >= dend)
6433 {
6434 if (dend == end_match_2)
6435 d = dend;
6436 else
6437 {
6438 d = string2;
6439 dend = end_match_2;
6440 }
6441 }
6442
6443 len = weights[idx2];
6444
6445 for (workp2 = workp + equiv_class_length ; workp < workp2 ;
6446 workp++)
6447 {
6448 idx = (int32_t)*workp;
6449 /* We already checked idx != 0 in regex_compile. */
6450
6451 if (idx2 != 0 && len == weights[idx])
6452 {
6453 int cnt = 0;
6454 while (cnt < len && (weights[idx + 1 + cnt]
6455 == weights[idx2 + 1 + cnt]))
6456 ++cnt;
6457
6458 if (cnt == len)
6459 goto char_set_matched;
6460 }
6461 }
6462 /* not matched */
6463 d = backup_d;
6464 dend = backup_dend;
6465 }
6466 else /* (nrules == 0) */
6467# endif
6468 /* If we can't look up collation data, we use wcscoll
6469 instead. */
6470 {
6471 for (workp2 = workp + equiv_class_length ; workp < workp2 ;)
6472 {
6473 const CHAR_T *backup_d = d, *backup_dend = dend;
6474# ifdef _LIBC
6475 length = __wcslen (workp);
6476# else
6477 length = wcslen (workp);
6478# endif
6479
6480 /* If wcscoll(the equivalence class, whole string) > 0,
6481 no substring of the string can match the
6482 equivalence class. */
6483# ifdef _LIBC
6484 if (__wcscoll (workp, d) > 0)
6485# else
6486 if (wcscoll (workp, d) > 0)
6487# endif
6488 {
6489 workp += length + 1;
6490 break;
6491 }
6492
6493 /* First, we compare the equivalence class with
6494 the first character of the string.
6495 If it doesn't match, we add the next character to
6496 the compare buffer in turn. */
6497 for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++)
6498 {
6499 int match;
6500 if (d == dend)
6501 {
6502 if (dend == end_match_2)
6503 break;
6504 d = string2;
6505 dend = end_match_2;
6506 }
6507
6508 /* add next character to the compare buffer. */
6509 str_buf[i] = TRANSLATE(*d);
6510 str_buf[i+1] = '\0';
6511
6512# ifdef _LIBC
6513 match = __wcscoll (workp, str_buf);
6514# else
6515 match = wcscoll (workp, str_buf);
6516# endif
6517
6518 if (match == 0)
6519 goto char_set_matched;
6520
6521 if (match < 0)
6522 /* (str_buf > workp) implies (str_buf + X > workp),
6523 because for all X (str_buf + X > str_buf).
6524 So we don't need to continue this loop. */
6525 break;
6526
6527 /* Otherwise (str_buf < workp),
6528 (str_buf + next_character) may equal (workp).
6529 So we continue this loop. */
6530 }
6531 /* not matched */
6532 d = backup_d;
6533 dend = backup_dend;
6534 workp += length + 1;
6535 }
6536 }
6537
6538 /* match with char_range? */
6539# ifdef _LIBC
6540 if (nrules != 0)
6541 {
6542 uint32_t collseqval;
6543 const char *collseq = (const char *)
6544 _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQWC);
6545
6546 collseqval = collseq_table_lookup (collseq, c);
6547
6548 for (; workp < p - chars_length ;)
6549 {
6550 uint32_t start_val, end_val;
6551
6552 /* We have already computed the collation sequence values
6553 of the characters (or collating symbols). */
6554 start_val = (uint32_t) *workp++; /* range_start */
6555 end_val = (uint32_t) *workp++; /* range_end */
6556
6557 if (start_val <= collseqval && collseqval <= end_val)
6558 goto char_set_matched;
6559 }
6560 }
6561 else
6562# endif
6563 {
6564 /* We set range_start_char at str_buf[0], range_end_char
6565 at str_buf[4], and compared char at str_buf[2]. */
6566 str_buf[1] = 0;
6567 str_buf[2] = c;
6568 str_buf[3] = 0;
6569 str_buf[5] = 0;
6570 for (; workp < p - chars_length ;)
6571 {
6572 wchar_t *range_start_char, *range_end_char;
6573
6574 /* match if (range_start_char <= c <= range_end_char). */
6575
6576 /* If range_start(or end) < 0, we assume -range_start(end)
6577 is the offset of the collating symbol which is specified
6578 as the character of the range start(end). */
6579
6580 /* range_start */
6581 if (*workp < 0)
6582 range_start_char = charset_top - (*workp++);
6583 else
6584 {
6585 str_buf[0] = *workp++;
6586 range_start_char = str_buf;
6587 }
6588
6589 /* range_end */
6590 if (*workp < 0)
6591 range_end_char = charset_top - (*workp++);
6592 else
6593 {
6594 str_buf[4] = *workp++;
6595 range_end_char = str_buf + 4;
6596 }
6597
6598# ifdef _LIBC
6599 if (__wcscoll (range_start_char, str_buf+2) <= 0
6600 && __wcscoll (str_buf+2, range_end_char) <= 0)
6601# else
6602 if (wcscoll (range_start_char, str_buf+2) <= 0
6603 && wcscoll (str_buf+2, range_end_char) <= 0)
6604# endif
6605 goto char_set_matched;
6606 }
6607 }
6608
6609 /* match with char? */
6610 for (; workp < p ; workp++)
6611 if (c == *workp)
6612 goto char_set_matched;
6613
6614 not = !not;
6615
6616 char_set_matched:
6617 if (not) goto fail;
6618#else
6619 /* Cast to `unsigned' instead of `unsigned char' in case the
6620 bit list is a full 32 bytes long. */
6621 if (c < (unsigned) (*p * BYTEWIDTH)
6622 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
6623 not = !not;
6624
6625 p += 1 + *p;
6626
6627 if (!not) goto fail;
6628#undef WORK_BUFFER_SIZE
6629#endif /* WCHAR */
6630 SET_REGS_MATCHED ();
6631 d++;
6632 break;
6633 }
6634
6635
6636 /* The beginning of a group is represented by start_memory.
6637 The arguments are the register number in the next byte, and the
6638 number of groups inner to this one in the next. The text
6639 matched within the group is recorded (in the internal
6640 registers data structure) under the register number. */
6641 case start_memory:
6642 DEBUG_PRINT3 ("EXECUTING start_memory %ld (%ld):\n",
6643 (long int) *p, (long int) p[1]);
6644
6645 /* Find out if this group can match the empty string. */
6646 p1 = p; /* To send to group_match_null_string_p. */
6647
6648 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
6649 REG_MATCH_NULL_STRING_P (reg_info[*p])
6650 = PREFIX(group_match_null_string_p) (&p1, pend, reg_info);
6651
6652 /* Save the position in the string where we were the last time
6653 we were at this open-group operator in case the group is
6654 operated upon by a repetition operator, e.g., with `(a*)*b'
6655 against `ab'; then we want to ignore where we are now in
6656 the string in case this attempt to match fails. */
6657 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6658 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
6659 : regstart[*p];
6660 DEBUG_PRINT2 (" old_regstart: %d\n",
6661 POINTER_TO_OFFSET (old_regstart[*p]));
6662
6663 regstart[*p] = d;
6664 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
6665
6666 IS_ACTIVE (reg_info[*p]) = 1;
6667 MATCHED_SOMETHING (reg_info[*p]) = 0;
6668
6669 /* Clear this whenever we change the register activity status. */
6670 set_regs_matched_done = 0;
6671
6672 /* This is the new highest active register. */
6673 highest_active_reg = *p;
6674
6675 /* If nothing was active before, this is the new lowest active
6676 register. */
6677 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
6678 lowest_active_reg = *p;
6679
6680 /* Move past the register number and inner group count. */
6681 p += 2;
6682 just_past_start_mem = p;
6683
6684 break;
6685
6686
6687 /* The stop_memory opcode represents the end of a group. Its
6688 arguments are the same as start_memory's: the register
6689 number, and the number of inner groups. */
6690 case stop_memory:
6691 DEBUG_PRINT3 ("EXECUTING stop_memory %ld (%ld):\n",
6692 (long int) *p, (long int) p[1]);
6693
6694 /* We need to save the string position the last time we were at
6695 this close-group operator in case the group is operated
6696 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
6697 against `aba'; then we want to ignore where we are now in
6698 the string in case this attempt to match fails. */
6699 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6700 ? REG_UNSET (regend[*p]) ? d : regend[*p]
6701 : regend[*p];
6702 DEBUG_PRINT2 (" old_regend: %d\n",
6703 POINTER_TO_OFFSET (old_regend[*p]));
6704
6705 regend[*p] = d;
6706 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
6707
6708 /* This register isn't active anymore. */
6709 IS_ACTIVE (reg_info[*p]) = 0;
6710
6711 /* Clear this whenever we change the register activity status. */
6712 set_regs_matched_done = 0;
6713
6714 /* If this was the only register active, nothing is active
6715 anymore. */
6716 if (lowest_active_reg == highest_active_reg)
6717 {
6718 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6719 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6720 }
6721 else
6722 { /* We must scan for the new highest active register, since
6723 it isn't necessarily one less than now: consider
6724 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
6725 new highest active register is 1. */
6726 UCHAR_T r = *p - 1;
6727 while (r > 0 && !IS_ACTIVE (reg_info[r]))
6728 r--;
6729
6730 /* If we end up at register zero, that means that we saved
6731 the registers as the result of an `on_failure_jump', not
6732 a `start_memory', and we jumped past the innermost
6733 `stop_memory'. For example, in ((.)*) we save
6734 registers 1 and 2 as a result of the *, but when we pop
6735 back to the second ), we are at the stop_memory 1.
6736 Thus, nothing is active. */
6737 if (r == 0)
6738 {
6739 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6740 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6741 }
6742 else
6743 highest_active_reg = r;
6744 }
6745
6746 /* If we just failed to match something this time around with a
6747 group that's operated on by a repetition operator, try to
6748 force exit from the ``loop'', and restore the register
6749 information for this group that we had before trying this
6750 last match. */
6751 if ((!MATCHED_SOMETHING (reg_info[*p])
6752 || just_past_start_mem == p - 1)
6753 && (p + 2) < pend)
6754 {
6755 boolean is_a_jump_n = false;
6756
6757 p1 = p + 2;
6758 mcnt = 0;
6759 switch ((re_opcode_t) *p1++)
6760 {
6761 case jump_n:
6762 is_a_jump_n = true;
6763 case pop_failure_jump:
6764 case maybe_pop_jump:
6765 case jump:
6766 case dummy_failure_jump:
6767 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6768 if (is_a_jump_n)
6769 p1 += OFFSET_ADDRESS_SIZE;
6770 break;
6771
6772 default:
6773 /* do nothing */ ;
6774 }
6775 p1 += mcnt;
6776
6777 /* If the next operation is a jump backwards in the pattern
6778 to an on_failure_jump right before the start_memory
6779 corresponding to this stop_memory, exit from the loop
6780 by forcing a failure after pushing on the stack the
6781 on_failure_jump's jump in the pattern, and d. */
6782 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
6783 && (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == start_memory
6784 && p1[2+OFFSET_ADDRESS_SIZE] == *p)
6785 {
6786 /* If this group ever matched anything, then restore
6787 what its registers were before trying this last
6788 failed match, e.g., with `(a*)*b' against `ab' for
6789 regstart[1], and, e.g., with `((a*)*(b*)*)*'
6790 against `aba' for regend[3].
6791
6792 Also restore the registers for inner groups for,
6793 e.g., `((a*)(b*))*' against `aba' (register 3 would
6794 otherwise get trashed). */
6795
6796 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
6797 {
6798 unsigned r;
6799
6800 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
6801
6802 /* Restore this and inner groups' (if any) registers. */
6803 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
6804 r++)
6805 {
6806 regstart[r] = old_regstart[r];
6807
6808 /* xx why this test? */
6809 if (old_regend[r] >= regstart[r])
6810 regend[r] = old_regend[r];
6811 }
6812 }
6813 p1++;
6814 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6815 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
6816
6817 goto fail;
6818 }
6819 }
6820
6821 /* Move past the register number and the inner group count. */
6822 p += 2;
6823 break;
6824
6825
6826 /* \<digit> has been turned into a `duplicate' command which is
6827 followed by the numeric value of <digit> as the register number. */
6828 case duplicate:
6829 {
6830 register const CHAR_T *d2, *dend2;
6831 int regno = *p++; /* Get which register to match against. */
6832 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
6833
6834 /* Can't back reference a group which we've never matched. */
6835 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
6836 goto fail;
6837
6838 /* Where in input to try to start matching. */
6839 d2 = regstart[regno];
6840
6841 /* Where to stop matching; if both the place to start and
6842 the place to stop matching are in the same string, then
6843 set to the place to stop, otherwise, for now have to use
6844 the end of the first string. */
6845
6846 dend2 = ((FIRST_STRING_P (regstart[regno])
6847 == FIRST_STRING_P (regend[regno]))
6848 ? regend[regno] : end_match_1);
6849 for (;;)
6850 {
6851 /* If necessary, advance to next segment in register
6852 contents. */
6853 while (d2 == dend2)
6854 {
6855 if (dend2 == end_match_2) break;
6856 if (dend2 == regend[regno]) break;
6857
6858 /* End of string1 => advance to string2. */
6859 d2 = string2;
6860 dend2 = regend[regno];
6861 }
6862 /* At end of register contents => success */
6863 if (d2 == dend2) break;
6864
6865 /* If necessary, advance to next segment in data. */
6866 PREFETCH ();
6867
6868 /* How many characters left in this segment to match. */
6869 mcnt = dend - d;
6870
6871 /* Want how many consecutive characters we can match in
6872 one shot, so, if necessary, adjust the count. */
6873 if (mcnt > dend2 - d2)
6874 mcnt = dend2 - d2;
6875
6876 /* Compare that many; failure if mismatch, else move
6877 past them. */
6878 if (translate
6879 ? PREFIX(bcmp_translate) (d, d2, mcnt, translate)
6880 : memcmp (d, d2, mcnt*sizeof(UCHAR_T)))
6881 goto fail;
6882 d += mcnt, d2 += mcnt;
6883
6884 /* Do this because we've matched some characters. */
6885 SET_REGS_MATCHED ();
6886 }
6887 }
6888 break;
6889
6890
6891 /* begline matches the empty string at the beginning of the string
6892 (unless `not_bol' is set in `bufp'), and, if
6893 `newline_anchor' is set, after newlines. */
6894 case begline:
6895 DEBUG_PRINT1 ("EXECUTING begline.\n");
6896
6897 if (AT_STRINGS_BEG (d))
6898 {
6899 if (!bufp->not_bol) break;
6900 }
6901 else if (d[-1] == '\n' && bufp->newline_anchor)
6902 {
6903 break;
6904 }
6905 /* In all other cases, we fail. */
6906 goto fail;
6907
6908
6909 /* endline is the dual of begline. */
6910 case endline:
6911 DEBUG_PRINT1 ("EXECUTING endline.\n");
6912
6913 if (AT_STRINGS_END (d))
6914 {
6915 if (!bufp->not_eol) break;
6916 }
6917
6918 /* We have to ``prefetch'' the next character. */
6919 else if ((d == end1 ? *string2 : *d) == '\n'
6920 && bufp->newline_anchor)
6921 {
6922 break;
6923 }
6924 goto fail;
6925
6926
6927 /* Match at the very beginning of the data. */
6928 case begbuf:
6929 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
6930 if (AT_STRINGS_BEG (d))
6931 break;
6932 goto fail;
6933
6934
6935 /* Match at the very end of the data. */
6936 case endbuf:
6937 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
6938 if (AT_STRINGS_END (d))
6939 break;
6940 goto fail;
6941
6942
6943 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
6944 pushes NULL as the value for the string on the stack. Then
6945 `pop_failure_point' will keep the current value for the
6946 string, instead of restoring it. To see why, consider
6947 matching `foo\nbar' against `.*\n'. The .* matches the foo;
6948 then the . fails against the \n. But the next thing we want
6949 to do is match the \n against the \n; if we restored the
6950 string value, we would be back at the foo.
6951
6952 Because this is used only in specific cases, we don't need to
6953 check all the things that `on_failure_jump' does, to make
6954 sure the right things get saved on the stack. Hence we don't
6955 share its code. The only reason to push anything on the
6956 stack at all is that otherwise we would have to change
6957 `anychar's code to do something besides goto fail in this
6958 case; that seems worse than this. */
6959 case on_failure_keep_string_jump:
6960 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
6961
6962 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6963#ifdef _LIBC
6964 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt);
6965#else
6966 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
6967#endif
6968
6969 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
6970 break;
6971
6972
6973 /* Uses of on_failure_jump:
6974
6975 Each alternative starts with an on_failure_jump that points
6976 to the beginning of the next alternative. Each alternative
6977 except the last ends with a jump that in effect jumps past
6978 the rest of the alternatives. (They really jump to the
6979 ending jump of the following alternative, because tensioning
6980 these jumps is a hassle.)
6981
6982 Repeats start with an on_failure_jump that points past both
6983 the repetition text and either the following jump or
6984 pop_failure_jump back to this on_failure_jump. */
6985 case on_failure_jump:
6986 on_failure:
6987 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
6988
6989 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6990#ifdef _LIBC
6991 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt);
6992#else
6993 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
6994#endif
6995
6996 /* If this on_failure_jump comes right before a group (i.e.,
6997 the original * applied to a group), save the information
6998 for that group and all inner ones, so that if we fail back
6999 to this point, the group's information will be correct.
7000 For example, in \(a*\)*\1, we need the preceding group,
7001 and in \(zz\(a*\)b*\)\2, we need the inner group. */
7002
7003 /* We can't use `p' to check ahead because we push
7004 a failure point to `p + mcnt' after we do this. */
7005 p1 = p;
7006
7007 /* We need to skip no_op's before we look for the
7008 start_memory in case this on_failure_jump is happening as
7009 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
7010 against aba. */
7011 while (p1 < pend && (re_opcode_t) *p1 == no_op)
7012 p1++;
7013
7014 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
7015 {
7016 /* We have a new highest active register now. This will
7017 get reset at the start_memory we are about to get to,
7018 but we will have saved all the registers relevant to
7019 this repetition op, as described above. */
7020 highest_active_reg = *(p1 + 1) + *(p1 + 2);
7021 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
7022 lowest_active_reg = *(p1 + 1);
7023 }
7024
7025 DEBUG_PRINT1 (":\n");
7026 PUSH_FAILURE_POINT (p + mcnt, d, -2);
7027 break;
7028
7029
7030 /* A smart repeat ends with `maybe_pop_jump'.
7031 We change it to either `pop_failure_jump' or `jump'. */
7032 case maybe_pop_jump:
7033 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7034 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
7035 {
7036 register UCHAR_T *p2 = p;
7037
7038 /* Compare the beginning of the repeat with what in the
7039 pattern follows its end. If we can establish that there
7040 is nothing that they would both match, i.e., nothing that
7041 would force us to backtrack (as `a*a' would, for example),
7042 then we can change to pop_failure_jump, because we'll
7043 never have to backtrack.
7044
7045 This is not true in the case of alternatives: in
7046 `(a|ab)*' we do need to backtrack to the `ab' alternative
7047 (e.g., if the string was `ab'). But instead of trying to
7048 detect that here, the alternative has put on a dummy
7049 failure point which is what we will end up popping. */
7050
7051 /* Skip over open/close-group commands.
7052 If what follows this loop is a ...+ construct,
7053 look at what begins its body, since we will have to
7054 match at least one of that. */
7055 while (1)
7056 {
7057 if (p2 + 2 < pend
7058 && ((re_opcode_t) *p2 == stop_memory
7059 || (re_opcode_t) *p2 == start_memory))
7060 p2 += 3;
7061 else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend
7062 && (re_opcode_t) *p2 == dummy_failure_jump)
7063 p2 += 2 + 2 * OFFSET_ADDRESS_SIZE;
7064 else
7065 break;
7066 }
7067
7068 p1 = p + mcnt;
7069 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
7070 to the `maybe_pop_jump' of this case. Examine what
7071 follows. */
7072
7073 /* If we're at the end of the pattern, we can change. */
7074 if (p2 == pend)
7075 {
7076 /* Consider what happens when matching ":\(.*\)"
7077 against ":/". I don't really understand this code
7078 yet. */
7079 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T)
7080 pop_failure_jump;
7081 DEBUG_PRINT1
7082 (" End of pattern: change to `pop_failure_jump'.\n");
7083 }
7084
7085 else if ((re_opcode_t) *p2 == exactn
7086#ifdef MBS_SUPPORT
7087 || (re_opcode_t) *p2 == exactn_bin
7088#endif
7089 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
7090 {
7091 register UCHAR_T c
7092 = *p2 == (UCHAR_T) endline ? '\n' : p2[2];
7093
7094 if (((re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn
7095#ifdef MBS_SUPPORT
7096 || (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn_bin
7097#endif
7098 ) && p1[3+OFFSET_ADDRESS_SIZE] != c)
7099 {
7100 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T)
7101 pop_failure_jump;
7102#ifdef WCHAR
7103 DEBUG_PRINT3 (" %C != %C => pop_failure_jump.\n",
7104 (wint_t) c,
7105 (wint_t) p1[3+OFFSET_ADDRESS_SIZE]);
7106#else
7107 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
7108 (char) c,
7109 (char) p1[3+OFFSET_ADDRESS_SIZE]);
7110#endif
7111 }
7112
7113#ifndef WCHAR
7114 else if ((re_opcode_t) p1[3] == charset
7115 || (re_opcode_t) p1[3] == charset_not)
7116 {
7117 int not = (re_opcode_t) p1[3] == charset_not;
7118
7119 if (c < (unsigned) (p1[4] * BYTEWIDTH)
7120 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
7121 not = !not;
7122
7123 /* `not' is equal to 1 if c would match, which means
7124 that we can't change to pop_failure_jump. */
7125 if (!not)
7126 {
7127 p[-3] = (unsigned char) pop_failure_jump;
7128 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7129 }
7130 }
7131#endif /* not WCHAR */
7132 }
7133#ifndef WCHAR
7134 else if ((re_opcode_t) *p2 == charset)
7135 {
7136 /* We win if the first character of the loop is not part
7137 of the charset. */
7138 if ((re_opcode_t) p1[3] == exactn
7139 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
7140 && (p2[2 + p1[5] / BYTEWIDTH]
7141 & (1 << (p1[5] % BYTEWIDTH)))))
7142 {
7143 p[-3] = (unsigned char) pop_failure_jump;
7144 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7145 }
7146
7147 else if ((re_opcode_t) p1[3] == charset_not)
7148 {
7149 int idx;
7150 /* We win if the charset_not inside the loop
7151 lists every character listed in the charset after. */
7152 for (idx = 0; idx < (int) p2[1]; idx++)
7153 if (! (p2[2 + idx] == 0
7154 || (idx < (int) p1[4]
7155 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
7156 break;
7157
7158 if (idx == p2[1])
7159 {
7160 p[-3] = (unsigned char) pop_failure_jump;
7161 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7162 }
7163 }
7164 else if ((re_opcode_t) p1[3] == charset)
7165 {
7166 int idx;
7167 /* We win if the charset inside the loop
7168 has no overlap with the one after the loop. */
7169 for (idx = 0;
7170 idx < (int) p2[1] && idx < (int) p1[4];
7171 idx++)
7172 if ((p2[2 + idx] & p1[5 + idx]) != 0)
7173 break;
7174
7175 if (idx == p2[1] || idx == p1[4])
7176 {
7177 p[-3] = (unsigned char) pop_failure_jump;
7178 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7179 }
7180 }
7181 }
7182#endif /* not WCHAR */
7183 }
7184 p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */
7185 if ((re_opcode_t) p[-1] != pop_failure_jump)
7186 {
7187 p[-1] = (UCHAR_T) jump;
7188 DEBUG_PRINT1 (" Match => jump.\n");
7189 goto unconditional_jump;
7190 }
7191 /* Note fall through. */
7192
7193
7194 /* The end of a simple repeat has a pop_failure_jump back to
7195 its matching on_failure_jump, where the latter will push a
7196 failure point. The pop_failure_jump takes off failure
7197 points put on by this pop_failure_jump's matching
7198 on_failure_jump; we got through the pattern to here from the
7199 matching on_failure_jump, so didn't fail. */
7200 case pop_failure_jump:
7201 {
7202 /* We need to pass separate storage for the lowest and
7203 highest registers, even though we don't care about the
7204 actual values. Otherwise, we will restore only one
7205 register from the stack, since lowest will == highest in
7206 `pop_failure_point'. */
7207 active_reg_t dummy_low_reg, dummy_high_reg;
7208 UCHAR_T *pdummy = NULL;
7209 const CHAR_T *sdummy = NULL;
7210
7211 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
7212 POP_FAILURE_POINT (sdummy, pdummy,
7213 dummy_low_reg, dummy_high_reg,
7214 reg_dummy, reg_dummy, reg_info_dummy);
7215 }
7216 /* Note fall through. */
7217
7218 unconditional_jump:
7219#ifdef _LIBC
7220 DEBUG_PRINT2 ("\n%p: ", p);
7221#else
7222 DEBUG_PRINT2 ("\n0x%x: ", p);
7223#endif
7224 /* Note fall through. */
7225
7226 /* Unconditionally jump (without popping any failure points). */
7227 case jump:
7228 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
7229 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7230 p += mcnt; /* Do the jump. */
7231#ifdef _LIBC
7232 DEBUG_PRINT2 ("(to %p).\n", p);
7233#else
7234 DEBUG_PRINT2 ("(to 0x%x).\n", p);
7235#endif
7236 break;
7237
7238
7239 /* We need this opcode so we can detect where alternatives end
7240 in `group_match_null_string_p' et al. */
7241 case jump_past_alt:
7242 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
7243 goto unconditional_jump;
7244
7245
7246 /* Normally, the on_failure_jump pushes a failure point, which
7247 then gets popped at pop_failure_jump. We will end up at
7248 pop_failure_jump, also, and with a pattern of, say, `a+', we
7249 are skipping over the on_failure_jump, so we have to push
7250 something meaningless for pop_failure_jump to pop. */
7251 case dummy_failure_jump:
7252 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
7253 /* It doesn't matter what we push for the string here. What
7254 the code at `fail' tests is the value for the pattern. */
7255 PUSH_FAILURE_POINT (NULL, NULL, -2);
7256 goto unconditional_jump;
7257
7258
7259 /* At the end of an alternative, we need to push a dummy failure
7260 point in case we are followed by a `pop_failure_jump', because
7261 we don't want the failure point for the alternative to be
7262 popped. For example, matching `(a|ab)*' against `aab'
7263 requires that we match the `ab' alternative. */
7264 case push_dummy_failure:
7265 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
7266 /* See comments just above at `dummy_failure_jump' about the
7267 two NULL arguments. */
7268 PUSH_FAILURE_POINT (NULL, NULL, -2);
7269 break;
7270
7271 /* Have to succeed matching what follows at least n times.
7272 After that, handle like `on_failure_jump'. */
7273 case succeed_n:
7274 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
7275 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
7276
7277 assert (mcnt >= 0);
7278 /* Originally, this is how many times we HAVE to succeed. */
7279 if (mcnt > 0)
7280 {
7281 mcnt--;
7282 p += OFFSET_ADDRESS_SIZE;
7283 STORE_NUMBER_AND_INCR (p, mcnt);
7284#ifdef _LIBC
7285 DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE
7286 , mcnt);
7287#else
7288 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE
7289 , mcnt);
7290#endif
7291 }
7292 else if (mcnt == 0)
7293 {
7294#ifdef _LIBC
7295 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n",
7296 p + OFFSET_ADDRESS_SIZE);
7297#else
7298 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n",
7299 p + OFFSET_ADDRESS_SIZE);
7300#endif /* _LIBC */
7301
7302#ifdef WCHAR
7303 p[1] = (UCHAR_T) no_op;
7304#else
7305 p[2] = (UCHAR_T) no_op;
7306 p[3] = (UCHAR_T) no_op;
7307#endif /* WCHAR */
7308 goto on_failure;
7309 }
7310 break;
7311
7312 case jump_n:
7313 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
7314 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
7315
7316 /* Originally, this is how many times we CAN jump. */
7317 if (mcnt)
7318 {
7319 mcnt--;
7320 STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt);
7321
7322#ifdef _LIBC
7323 DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE,
7324 mcnt);
7325#else
7326 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE,
7327 mcnt);
7328#endif /* _LIBC */
7329 goto unconditional_jump;
7330 }
7331 /* If we don't have to jump any more, skip over the rest of the command. */
7332 else
7333 p += 2 * OFFSET_ADDRESS_SIZE;
7334 break;
7335
7336 case set_number_at:
7337 {
7338 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
7339
7340 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7341 p1 = p + mcnt;
7342 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7343#ifdef _LIBC
7344 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
7345#else
7346 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
7347#endif
7348 STORE_NUMBER (p1, mcnt);
7349 break;
7350 }
7351
7352#if 0
7353 /* The DEC Alpha C compiler 3.x generates incorrect code for the
7354 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7355 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
7356 macro and introducing temporary variables works around the bug. */
7357
7358 case wordbound:
7359 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7360 if (AT_WORD_BOUNDARY (d))
7361 break;
7362 goto fail;
7363
7364 case notwordbound:
7365 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7366 if (AT_WORD_BOUNDARY (d))
7367 goto fail;
7368 break;
7369#else
7370 case wordbound:
7371 {
7372 boolean prevchar, thischar;
7373
7374 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7375 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7376 break;
7377
7378 prevchar = WORDCHAR_P (d - 1);
7379 thischar = WORDCHAR_P (d);
7380 if (prevchar != thischar)
7381 break;
7382 goto fail;
7383 }
7384
7385 case notwordbound:
7386 {
7387 boolean prevchar, thischar;
7388
7389 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7390 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7391 goto fail;
7392
7393 prevchar = WORDCHAR_P (d - 1);
7394 thischar = WORDCHAR_P (d);
7395 if (prevchar != thischar)
7396 goto fail;
7397 break;
7398 }
7399#endif
7400
7401 case wordbeg:
7402 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
7403 if (!AT_STRINGS_END (d) && WORDCHAR_P (d)
7404 && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
7405 break;
7406 goto fail;
7407
7408 case wordend:
7409 DEBUG_PRINT1 ("EXECUTING wordend.\n");
7410 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
7411 && (AT_STRINGS_END (d) || !WORDCHAR_P (d)))
7412 break;
7413 goto fail;
7414
7415#ifdef emacs
7416 case before_dot:
7417 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
7418 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
7419 goto fail;
7420 break;
7421
7422 case at_dot:
7423 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
7424 if (PTR_CHAR_POS ((unsigned char *) d) != point)
7425 goto fail;
7426 break;
7427
7428 case after_dot:
7429 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
7430 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
7431 goto fail;
7432 break;
7433
7434 case syntaxspec:
7435 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
7436 mcnt = *p++;
7437 goto matchsyntax;
7438
7439 case wordchar:
7440 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
7441 mcnt = (int) Sword;
7442 matchsyntax:
7443 PREFETCH ();
7444 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7445 d++;
7446 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
7447 goto fail;
7448 SET_REGS_MATCHED ();
7449 break;
7450
7451 case notsyntaxspec:
7452 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
7453 mcnt = *p++;
7454 goto matchnotsyntax;
7455
7456 case notwordchar:
7457 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
7458 mcnt = (int) Sword;
7459 matchnotsyntax:
7460 PREFETCH ();
7461 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7462 d++;
7463 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
7464 goto fail;
7465 SET_REGS_MATCHED ();
7466 break;
7467
7468#else /* not emacs */
7469 case wordchar:
7470 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
7471 PREFETCH ();
7472 if (!WORDCHAR_P (d))
7473 goto fail;
7474 SET_REGS_MATCHED ();
7475 d++;
7476 break;
7477
7478 case notwordchar:
7479 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
7480 PREFETCH ();
7481 if (WORDCHAR_P (d))
7482 goto fail;
7483 SET_REGS_MATCHED ();
7484 d++;
7485 break;
7486#endif /* not emacs */
7487
7488 default:
7489 abort ();
7490 }
7491 continue; /* Successfully executed one pattern command; keep going. */
7492
7493
7494 /* We goto here if a matching operation fails. */
7495 fail:
7496 if (!FAIL_STACK_EMPTY ())
7497 { /* A restart point is known. Restore to that state. */
7498 DEBUG_PRINT1 ("\nFAIL:\n");
7499 POP_FAILURE_POINT (d, p,
7500 lowest_active_reg, highest_active_reg,
7501 regstart, regend, reg_info);
7502
7503 /* If this failure point is a dummy, try the next one. */
7504 if (!p)
7505 goto fail;
7506
7507 /* If we failed to the end of the pattern, don't examine *p. */
7508 assert (p <= pend);
7509 if (p < pend)
7510 {
7511 boolean is_a_jump_n = false;
7512
7513 /* If failed to a backwards jump that's part of a repetition
7514 loop, need to pop this failure point and use the next one. */
7515 switch ((re_opcode_t) *p)
7516 {
7517 case jump_n:
7518 is_a_jump_n = true;
7519 case maybe_pop_jump:
7520 case pop_failure_jump:
7521 case jump:
7522 p1 = p + 1;
7523 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7524 p1 += mcnt;
7525
7526 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
7527 || (!is_a_jump_n
7528 && (re_opcode_t) *p1 == on_failure_jump))
7529 goto fail;
7530 break;
7531 default:
7532 /* do nothing */ ;
7533 }
7534 }
7535
7536 if (d >= string1 && d <= end1)
7537 dend = end_match_1;
7538 }
7539 else
7540 break; /* Matching at this starting point really fails. */
7541 } /* for (;;) */
7542
7543 if (best_regs_set)
7544 goto restore_best_regs;
7545
7546 FREE_VARIABLES ();
7547
7548 return -1; /* Failure to match. */
7549} /* re_match_2 */
7550
7551/* Subroutine definitions for re_match_2. */
7552
7553
7554/* We are passed P pointing to a register number after a start_memory.
7555
7556 Return true if the pattern up to the corresponding stop_memory can
7557 match the empty string, and false otherwise.
7558
7559 If we find the matching stop_memory, sets P to point to one past its number.
7560 Otherwise, sets P to an undefined byte less than or equal to END.
7561
7562 We don't handle duplicates properly (yet). */
7563
7564static boolean
7565PREFIX(group_match_null_string_p) (
7566 UCHAR_T **p, UCHAR_T *end,
7567 PREFIX(register_info_type) *reg_info)
7568{
7569 int mcnt;
7570 /* Point to after the args to the start_memory. */
7571 UCHAR_T *p1 = *p + 2;
7572
7573 while (p1 < end)
7574 {
7575 /* Skip over opcodes that can match nothing, and return true or
7576 false, as appropriate, when we get to one that can't, or to the
7577 matching stop_memory. */
7578
7579 switch ((re_opcode_t) *p1)
7580 {
7581 /* Could be either a loop or a series of alternatives. */
7582 case on_failure_jump:
7583 p1++;
7584 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7585
7586 /* If the next operation is not a jump backwards in the
7587 pattern. */
7588
7589 if (mcnt >= 0)
7590 {
7591 /* Go through the on_failure_jumps of the alternatives,
7592 seeing if any of the alternatives cannot match nothing.
7593 The last alternative starts with only a jump,
7594 whereas the rest start with on_failure_jump and end
7595 with a jump, e.g., here is the pattern for `a|b|c':
7596
7597 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
7598 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
7599 /exactn/1/c
7600
7601 So, we have to first go through the first (n-1)
7602 alternatives and then deal with the last one separately. */
7603
7604
7605 /* Deal with the first (n-1) alternatives, which start
7606 with an on_failure_jump (see above) that jumps to right
7607 past a jump_past_alt. */
7608
7609 while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] ==
7610 jump_past_alt)
7611 {
7612 /* `mcnt' holds how many bytes long the alternative
7613 is, including the ending `jump_past_alt' and
7614 its number. */
7615
7616 if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt -
7617 (1 + OFFSET_ADDRESS_SIZE),
7618 reg_info))
7619 return false;
7620
7621 /* Move to right after this alternative, including the
7622 jump_past_alt. */
7623 p1 += mcnt;
7624
7625 /* Break if it's the beginning of an n-th alternative
7626 that doesn't begin with an on_failure_jump. */
7627 if ((re_opcode_t) *p1 != on_failure_jump)
7628 break;
7629
7630 /* Still have to check that it's not an n-th
7631 alternative that starts with an on_failure_jump. */
7632 p1++;
7633 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7634 if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] !=
7635 jump_past_alt)
7636 {
7637 /* Get to the beginning of the n-th alternative. */
7638 p1 -= 1 + OFFSET_ADDRESS_SIZE;
7639 break;
7640 }
7641 }
7642
7643 /* Deal with the last alternative: go back and get number
7644 of the `jump_past_alt' just before it. `mcnt' contains
7645 the length of the alternative. */
7646 EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE);
7647
7648 if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt, reg_info))
7649 return false;
7650
7651 p1 += mcnt; /* Get past the n-th alternative. */
7652 } /* if mcnt > 0 */
7653 break;
7654
7655
7656 case stop_memory:
7657 assert (p1[1] == **p);
7658 *p = p1 + 2;
7659 return true;
7660
7661
7662 default:
7663 if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
7664 return false;
7665 }
7666 } /* while p1 < end */
7667
7668 return false;
7669} /* group_match_null_string_p */
7670
7671
7672/* Similar to group_match_null_string_p, but doesn't deal with alternatives:
7673 It expects P to be the first byte of a single alternative and END one
7674 byte past the last. The alternative can contain groups. */
7675
7676static boolean
7677PREFIX(alt_match_null_string_p) (
7678 UCHAR_T *p, UCHAR_T *end,
7679 PREFIX(register_info_type) *reg_info)
7680{
7681 int mcnt;
7682 UCHAR_T *p1 = p;
7683
7684 while (p1 < end)
7685 {
7686 /* Skip over opcodes that can match nothing, and break when we get
7687 to one that can't. */
7688
7689 switch ((re_opcode_t) *p1)
7690 {
7691 /* It's a loop. */
7692 case on_failure_jump:
7693 p1++;
7694 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7695 p1 += mcnt;
7696 break;
7697
7698 default:
7699 if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
7700 return false;
7701 }
7702 } /* while p1 < end */
7703
7704 return true;
7705} /* alt_match_null_string_p */
7706
7707
7708/* Deals with the ops common to group_match_null_string_p and
7709 alt_match_null_string_p.
7710
7711 Sets P to one after the op and its arguments, if any. */
7712
7713static boolean
7714PREFIX(common_op_match_null_string_p) (
7715 UCHAR_T **p, UCHAR_T *end,
7716 PREFIX(register_info_type) *reg_info)
7717{
7718 int mcnt;
7719 boolean ret;
7720 int reg_no;
7721 UCHAR_T *p1 = *p;
7722
7723 switch ((re_opcode_t) *p1++)
7724 {
7725 case no_op:
7726 case begline:
7727 case endline:
7728 case begbuf:
7729 case endbuf:
7730 case wordbeg:
7731 case wordend:
7732 case wordbound:
7733 case notwordbound:
7734#ifdef emacs
7735 case before_dot:
7736 case at_dot:
7737 case after_dot:
7738#endif
7739 break;
7740
7741 case start_memory:
7742 reg_no = *p1;
7743 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
7744 ret = PREFIX(group_match_null_string_p) (&p1, end, reg_info);
7745
7746 /* Have to set this here in case we're checking a group which
7747 contains a group and a back reference to it. */
7748
7749 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
7750 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
7751
7752 if (!ret)
7753 return false;
7754 break;
7755
7756 /* If this is an optimized succeed_n for zero times, make the jump. */
7757 case jump:
7758 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7759 if (mcnt >= 0)
7760 p1 += mcnt;
7761 else
7762 return false;
7763 break;
7764
7765 case succeed_n:
7766 /* Get to the number of times to succeed. */
7767 p1 += OFFSET_ADDRESS_SIZE;
7768 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7769
7770 if (mcnt == 0)
7771 {
7772 p1 -= 2 * OFFSET_ADDRESS_SIZE;
7773 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7774 p1 += mcnt;
7775 }
7776 else
7777 return false;
7778 break;
7779
7780 case duplicate:
7781 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
7782 return false;
7783 break;
7784
7785 case set_number_at:
7786 p1 += 2 * OFFSET_ADDRESS_SIZE;
7787
7788 default:
7789 /* All other opcodes mean we cannot match the empty string. */
7790 return false;
7791 }
7792
7793 *p = p1;
7794 return true;
7795} /* common_op_match_null_string_p */
7796
7797
7798/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
7799 bytes; nonzero otherwise. */
7800
7801static int
7802PREFIX(bcmp_translate) (
7803 const CHAR_T *s1, const CHAR_T *s2,
7804 register int len,
7805 RE_TRANSLATE_TYPE translate)
7806{
7807 register const UCHAR_T *p1 = (const UCHAR_T *) s1;
7808 register const UCHAR_T *p2 = (const UCHAR_T *) s2;
7809 while (len)
7810 {
7811#ifdef WCHAR
7812 if (((*p1<=0xff)?translate[*p1++]:*p1++)
7813 != ((*p2<=0xff)?translate[*p2++]:*p2++))
7814 return 1;
7815#else /* BYTE */
7816 if (translate[*p1++] != translate[*p2++]) return 1;
7817#endif /* WCHAR */
7818 len--;
7819 }
7820 return 0;
7821}
7822
7823
7824#else /* not INSIDE_RECURSION */
7825
7826/* Entry points for GNU code. */
7827
7828/* re_compile_pattern is the GNU regular expression compiler: it
7829 compiles PATTERN (of length SIZE) and puts the result in BUFP.
7830 Returns 0 if the pattern was valid, otherwise an error string.
7831
7832 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
7833 are set in BUFP on entry.
7834
7835 We call regex_compile to do the actual compilation. */
7836
7837const char *
7838re_compile_pattern (const char *pattern,
7839 size_t length,
7840 struct re_pattern_buffer *bufp)
7841{
7842 reg_errcode_t ret;
7843
7844 /* GNU code is written to assume at least RE_NREGS registers will be set
7845 (and at least one extra will be -1). */
7846 bufp->regs_allocated = REGS_UNALLOCATED;
7847
7848 /* And GNU code determines whether or not to get register information
7849 by passing null for the REGS argument to re_match, etc., not by
7850 setting no_sub. */
7851 bufp->no_sub = 0;
7852
7853 /* Match anchors at newline. */
7854 bufp->newline_anchor = 1;
7855
7856# ifdef MBS_SUPPORT
7857 if (MB_CUR_MAX != 1)
7858 ret = wcs_regex_compile (pattern, length, re_syntax_options, bufp);
7859 else
7860# endif
7861 ret = byte_regex_compile (pattern, length, re_syntax_options, bufp);
7862
7863 if (!ret)
7864 return NULL;
7865 return gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
7866}
7867
7868/* Entry points compatible with 4.2 BSD regex library. We don't define
7869 them unless specifically requested. */
7870
7871#if defined _REGEX_RE_COMP || defined _LIBC
7872
7873/* BSD has one and only one pattern buffer. */
7874static struct re_pattern_buffer re_comp_buf;
7875
7876char *
7877#ifdef _LIBC
7878/* Make these definitions weak in libc, so POSIX programs can redefine
7879 these names if they don't use our functions, and still use
7880 regcomp/regexec below without link errors. */
7881weak_function
7882#endif
7883re_comp (const char *s)
7884{
7885 reg_errcode_t ret;
7886
7887 if (!s)
7888 {
7889 if (!re_comp_buf.buffer)
7890 return gettext ("No previous regular expression");
7891 return 0;
7892 }
7893
7894 if (!re_comp_buf.buffer)
7895 {
7896 re_comp_buf.buffer = (unsigned char *) malloc (200);
7897 if (re_comp_buf.buffer == NULL)
7898 return (char *) gettext (re_error_msgid
7899 + re_error_msgid_idx[(int) REG_ESPACE]);
7900 re_comp_buf.allocated = 200;
7901
7902 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
7903 if (re_comp_buf.fastmap == NULL)
7904 return (char *) gettext (re_error_msgid
7905 + re_error_msgid_idx[(int) REG_ESPACE]);
7906 }
7907
7908 /* Since `re_exec' always passes NULL for the `regs' argument, we
7909 don't need to initialize the pattern buffer fields which affect it. */
7910
7911 /* Match anchors at newlines. */
7912 re_comp_buf.newline_anchor = 1;
7913
7914# ifdef MBS_SUPPORT
7915 if (MB_CUR_MAX != 1)
7916 ret = wcs_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
7917 else
7918# endif
7919 ret = byte_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
7920
7921 if (!ret)
7922 return NULL;
7923
7924 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
7925 return (char *) gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
7926}
7927
7928
7929int
7930#if defined _LIBC || defined __UCLIBC__
7931weak_function
7932#endif
7933re_exec (const char *s)
7934{
7935 const int len = strlen (s);
7936 return
7937 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
7938}
7939
7940#endif /* _REGEX_RE_COMP */
7941
7942/* POSIX.2 functions. Don't define these for Emacs. */
7943
7944#ifndef emacs
7945
7946/* regcomp takes a regular expression as a string and compiles it.
7947
7948 PREG is a regex_t *. We do not expect any fields to be initialized,
7949 since POSIX says we shouldn't. Thus, we set
7950
7951 `buffer' to the compiled pattern;
7952 `used' to the length of the compiled pattern;
7953 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
7954 REG_EXTENDED bit in CFLAGS is set; otherwise, to
7955 RE_SYNTAX_POSIX_BASIC;
7956 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
7957 `fastmap' to an allocated space for the fastmap;
7958 `fastmap_accurate' to zero;
7959 `re_nsub' to the number of subexpressions in PATTERN.
7960
7961 PATTERN is the address of the pattern string.
7962
7963 CFLAGS is a series of bits which affect compilation.
7964
7965 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
7966 use POSIX basic syntax.
7967
7968 If REG_NEWLINE is set, then . and [^...] don't match newline.
7969 Also, regexec will try a match beginning after every newline.
7970
7971 If REG_ICASE is set, then we considers upper- and lowercase
7972 versions of letters to be equivalent when matching.
7973
7974 If REG_NOSUB is set, then when PREG is passed to regexec, that
7975 routine will report only success or failure, and nothing about the
7976 registers.
7977
7978 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
7979 the return codes and their meanings.) */
7980
7981int
7982regcomp (
7983 regex_t *preg,
7984 const char *pattern,
7985 int cflags)
7986{
7987 reg_errcode_t ret;
7988 reg_syntax_t syntax
7989 = (cflags & REG_EXTENDED) ?
7990 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
7991
7992 /* regex_compile will allocate the space for the compiled pattern. */
7993 preg->buffer = 0;
7994 preg->allocated = 0;
7995 preg->used = 0;
7996
7997 /* Try to allocate space for the fastmap. */
7998 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
7999
8000 if (cflags & REG_ICASE)
8001 {
8002 unsigned i;
8003
8004 preg->translate
8005 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
8006 * sizeof (*(RE_TRANSLATE_TYPE)0));
8007 if (preg->translate == NULL)
8008 return (int) REG_ESPACE;
8009
8010 /* Map uppercase characters to corresponding lowercase ones. */
8011 for (i = 0; i < CHAR_SET_SIZE; i++)
8012 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
8013 }
8014 else
8015 preg->translate = NULL;
8016
8017 /* If REG_NEWLINE is set, newlines are treated differently. */
8018 if (cflags & REG_NEWLINE)
8019 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
8020 syntax &= ~RE_DOT_NEWLINE;
8021 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
8022 /* It also changes the matching behavior. */
8023 preg->newline_anchor = 1;
8024 }
8025 else
8026 preg->newline_anchor = 0;
8027
8028 preg->no_sub = !!(cflags & REG_NOSUB);
8029
8030 /* POSIX says a null character in the pattern terminates it, so we
8031 can use strlen here in compiling the pattern. */
8032# ifdef MBS_SUPPORT
8033 if (MB_CUR_MAX != 1)
8034 ret = wcs_regex_compile (pattern, strlen (pattern), syntax, preg);
8035 else
8036# endif
8037 ret = byte_regex_compile (pattern, strlen (pattern), syntax, preg);
8038
8039 /* POSIX doesn't distinguish between an unmatched open-group and an
8040 unmatched close-group: both are REG_EPAREN. */
8041 if (ret == REG_ERPAREN) ret = REG_EPAREN;
8042
8043 if (ret == REG_NOERROR && preg->fastmap)
8044 {
8045 /* Compute the fastmap now, since regexec cannot modify the pattern
8046 buffer. */
8047 if (re_compile_fastmap (preg) == -2)
8048 {
8049 /* Some error occurred while computing the fastmap, just forget
8050 about it. */
8051 free (preg->fastmap);
8052 preg->fastmap = NULL;
8053 }
8054 }
8055
8056 return (int) ret;
8057}
8058
8059
8060/* regexec searches for a given pattern, specified by PREG, in the
8061 string STRING.
8062
8063 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
8064 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
8065 least NMATCH elements, and we set them to the offsets of the
8066 corresponding matched substrings.
8067
8068 EFLAGS specifies `execution flags' which affect matching: if
8069 REG_NOTBOL is set, then ^ does not match at the beginning of the
8070 string; if REG_NOTEOL is set, then $ does not match at the end.
8071
8072 We return 0 if we find a match and REG_NOMATCH if not. */
8073
8074int
8075regexec (
8076 const regex_t *preg,
8077 const char *string,
8078 size_t nmatch,
8079 regmatch_t pmatch[],
8080 int eflags)
8081{
8082 int ret;
8083 struct re_registers regs;
8084 regex_t private_preg;
8085 int len = strlen (string);
8086 boolean want_reg_info = !preg->no_sub && nmatch > 0;
8087
8088 /* use hidden memcpy() ourselves rather than gcc calling public memcpy() */
8089 memcpy(&private_preg, preg, sizeof(*preg));
8090
8091 private_preg.not_bol = !!(eflags & REG_NOTBOL);
8092 private_preg.not_eol = !!(eflags & REG_NOTEOL);
8093
8094 /* The user has told us exactly how many registers to return
8095 information about, via `nmatch'. We have to pass that on to the
8096 matching routines. */
8097 private_preg.regs_allocated = REGS_FIXED;
8098
8099 if (want_reg_info)
8100 {
8101 regs.num_regs = nmatch;
8102 regs.start = TALLOC (nmatch * 2, regoff_t);
8103 if (regs.start == NULL)
8104 return (int) REG_NOMATCH;
8105 regs.end = regs.start + nmatch;
8106 }
8107
8108 /* Perform the searching operation. */
8109 ret = re_search (&private_preg, string, len,
8110 /* start: */ 0, /* range: */ len,
8111 want_reg_info ? &regs : (struct re_registers *) 0);
8112
8113 /* Copy the register information to the POSIX structure. */
8114 if (want_reg_info)
8115 {
8116 if (ret >= 0)
8117 {
8118 unsigned r;
8119
8120 for (r = 0; r < nmatch; r++)
8121 {
8122 pmatch[r].rm_so = regs.start[r];
8123 pmatch[r].rm_eo = regs.end[r];
8124 }
8125 }
8126
8127 /* If we needed the temporary register info, free the space now. */
8128 free (regs.start);
8129 }
8130
8131 /* We want zero return to mean success, unlike `re_search'. */
8132 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
8133}
8134libc_hidden_def(regexec)
8135
8136
8137/* Returns a message corresponding to an error code, ERRCODE, returned
8138 from either regcomp or regexec. We don't use PREG here. */
8139
8140size_t
8141regerror (
8142 int errcode,
8143 const regex_t * preg attribute_unused,
8144 char *errbuf,
8145 size_t errbuf_size)
8146{
8147 const char *msg;
8148 size_t msg_size;
8149
8150 if (errcode < 0
8151 || errcode >= (int) (sizeof (re_error_msgid_idx)
8152 / sizeof (re_error_msgid_idx[0])))
8153 /* Only error codes returned by the rest of the code should be passed
8154 to this routine. If we are given anything else, or if other regex
8155 code generates an invalid error code, then the program has a bug.
8156 Dump core so we can fix it. */
8157 abort ();
8158
8159 msg = gettext (re_error_msgid + re_error_msgid_idx[errcode]);
8160
8161 msg_size = strlen (msg) + 1; /* Includes the null. */
8162
8163 if (errbuf_size != 0)
8164 {
8165 if (msg_size > errbuf_size)
8166 {
8167 memcpy (errbuf, msg, errbuf_size - 1);
8168 errbuf[errbuf_size - 1] = 0;
8169 }
8170 else
8171 memcpy (errbuf, msg, msg_size);
8172 }
8173
8174 return msg_size;
8175}
8176
8177
8178/* Free dynamically allocated space used by PREG. */
8179
8180void
8181regfree (regex_t *preg)
8182{
8183 free (preg->buffer);
8184 preg->buffer = NULL;
8185
8186 preg->allocated = 0;
8187 preg->used = 0;
8188
8189 free (preg->fastmap);
8190 preg->fastmap = NULL;
8191 preg->fastmap_accurate = 0;
8192
8193 free (preg->translate);
8194 preg->translate = NULL;
8195}
8196libc_hidden_def(regfree)
8197
8198#endif /* not emacs */
8199
8200#endif /* not INSIDE_RECURSION */
8201
8202
8203#undef STORE_NUMBER
8204#undef STORE_NUMBER_AND_INCR
8205#undef EXTRACT_NUMBER
8206#undef EXTRACT_NUMBER_AND_INCR
8207
8208#undef DEBUG_PRINT_COMPILED_PATTERN
8209#undef DEBUG_PRINT_DOUBLE_STRING
8210
8211#undef INIT_FAIL_STACK
8212#undef RESET_FAIL_STACK
8213#undef DOUBLE_FAIL_STACK
8214#undef PUSH_PATTERN_OP
8215#undef PUSH_FAILURE_POINTER
8216#undef PUSH_FAILURE_INT
8217#undef PUSH_FAILURE_ELT
8218#undef POP_FAILURE_POINTER
8219#undef POP_FAILURE_INT
8220#undef POP_FAILURE_ELT
8221#undef DEBUG_PUSH
8222#undef DEBUG_POP
8223#undef PUSH_FAILURE_POINT
8224#undef POP_FAILURE_POINT
8225
8226#undef REG_UNSET_VALUE
8227#undef REG_UNSET
8228
8229#undef PATFETCH
8230#undef PATFETCH_RAW
8231#undef PATUNFETCH
8232#undef TRANSLATE
8233
8234#undef INIT_BUF_SIZE
8235#undef GET_BUFFER_SPACE
8236#undef BUF_PUSH
8237#undef BUF_PUSH_2
8238#undef BUF_PUSH_3
8239#undef STORE_JUMP
8240#undef STORE_JUMP2
8241#undef INSERT_JUMP
8242#undef INSERT_JUMP2
8243#undef EXTEND_BUFFER
8244#undef GET_UNSIGNED_NUMBER
8245#undef FREE_STACK_RETURN
8246
8247# undef POINTER_TO_OFFSET
8248# undef MATCHING_IN_FRST_STRING
8249# undef PREFETCH
8250# undef AT_STRINGS_BEG
8251# undef AT_STRINGS_END
8252# undef WORDCHAR_P
8253# undef FREE_VAR
8254# undef FREE_VARIABLES
8255# undef NO_HIGHEST_ACTIVE_REG
8256# undef NO_LOWEST_ACTIVE_REG
8257
8258# undef CHAR_T
8259# undef UCHAR_T
8260# undef COMPILED_BUFFER_VAR
8261# undef OFFSET_ADDRESS_SIZE
8262# undef CHAR_CLASS_SIZE
8263# undef PREFIX
8264# undef ARG_PREFIX
8265# undef PUT_CHAR
8266# undef BYTE
8267# undef WCHAR
8268
8269# define DEFINED_ONCE