/* Test for UTF-8 regular expression optimizations.
   Copyright (C) 2003-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
| 19 | |
| 20 | #include <sys/types.h> |
| 21 | #include <mcheck.h> |
| 22 | #include <regex.h> |
| 23 | #include <stdio.h> |
| 24 | #include <stdlib.h> |
| 25 | #include <string.h> |
| 26 | #include <locale.h> |
| 27 | |
| 28 | #define RE_NO_INTERNAL_PROTOTYPES 1 |
| 29 | #include "regex_internal.h" |
| 30 | |
| 31 | #define BRE RE_SYNTAX_POSIX_BASIC |
| 32 | #define ERE RE_SYNTAX_POSIX_EXTENDED |
| 33 | |
| 34 | static struct |
| 35 | { |
| 36 | int syntax; |
| 37 | const char *pattern; |
| 38 | const char *string; |
| 39 | int res, optimize; |
| 40 | } tests[] = { |
| 41 | /* \xc3\x84 LATIN CAPITAL LETTER A WITH DIAERESIS |
| 42 | \xc3\x96 LATIN CAPITAL LETTER O WITH DIAERESIS |
| 43 | \xc3\xa4 LATIN SMALL LETTER A WITH DIAERESIS |
| 44 | \xc3\xb6 LATIN SMALL LETTER O WITH DIAERESIS |
| 45 | \xe2\x80\x94 EM DASH */ |
| 46 | /* Should be optimized. */ |
| 47 | {BRE, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, 1}, |
| 48 | {BRE, "b\xc3\xa4z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1}, |
| 49 | {BRE, "b\xc3\xa4*z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1}, |
| 50 | {BRE, "b\xc3\xa4*z", "b\xc3\xa4rfoobz", 7, 1}, |
| 51 | {BRE, "b\xc3\xa4\\+z", "b\xc3\xa4rfoob\xc3\xa4\xc3\xa4z", 7, 1}, |
| 52 | {BRE, "b\xc3\xa4\\?z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1}, |
| 53 | {BRE, "b\xc3\xa4\\{1,2\\}z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1}, |
| 54 | {BRE, "^x\\|xy*z$", "\xc3\xb6xyyz", 2, 1}, |
| 55 | {BRE, "^x\\\\y\\{6\\}z\\+", "x\\yyyyyyzz\xc3\xb6", 0, 1}, |
| 56 | {BRE, "^x\\\\y\\{2,36\\}z\\+", "x\\yzz\xc3\xb6", -1, 1}, |
| 57 | {BRE, "^x\\\\y\\{,3\\}z\\+", "x\\yyyzz\xc3\xb6", 0, 1}, |
| 58 | {BRE, "^x\\|x\xc3\xa4*z$", "\xc3\xb6x\xc3\xa4\xc3\xa4z", 2, 1}, |
| 59 | {BRE, "^x\\\\\xc3\x84\\{6\\}z\\+", |
| 60 | "x\\\xc3\x84\xc3\x84\xc3\x84\xc3\x84\xc3\x84\xc3\x84zz\xc3\xb6", 0, 1}, |
| 61 | {BRE, "^x\\\\\xc3\x84\\{2,36\\}z\\+", "x\\\xc3\x84zz\xc3\xb6", -1, 1}, |
| 62 | {BRE, "^x\\\\\xc3\x84\\{,3\\}z\\+", |
| 63 | "x\\\xc3\x84\xc3\x84\xc3\x84zz\xc3\xb6", 0, 1}, |
| 64 | {BRE, "x[C]y", "axCy", 1, 1}, |
| 65 | {BRE, "x[ABC]y", "axCy", 1, 1}, |
| 66 | {BRE, "\\`x\\|z\\'", "x\xe2\x80\x94", 0, 1}, |
| 67 | {BRE, "\\(xy\\)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, 1}, |
| 68 | {BRE, "xy\\?z", "\xc3\x84xz\xc3\xb6", 2, 1}, |
| 69 | {BRE, "\\`\xc3\x84\\|z\\'", "\xc3\x84\xe2\x80\x94", 0, 1}, |
| 70 | {BRE, "\\(x\xc3\x84\\)z\\1\x61\\1", |
| 71 | "\xe2\x80\x94x\xc3\x84zx\xc3\x84\x61x\xc3\x84\xc3\x96", 3, 1}, |
| 72 | {BRE, "x\xc3\x96\\?z", "\xc3\x84xz\xc3\xb6", 2, 1}, |
| 73 | {BRE, "x.y", "ax\xe2\x80\x94yz", 1, 1}, |
| 74 | {BRE, "x.*z", "\xc3\x84xz", 2, 1}, |
| 75 | {BRE, "x.*z", "\xc3\x84x\xe2\x80\x94z", 2, 1}, |
| 76 | {BRE, "x.*z", "\xc3\x84x\xe2\x80\x94y\xf1\x90\x80\x90z", 2, 1}, |
| 77 | {BRE, "x.*z", "\xc3\x84x\xe2\x80\x94\xc3\x94\xf1\x90\x80\x90z", 2, 1}, |
| 78 | {BRE, "x.\\?z", "axz", 1, 1}, |
| 79 | {BRE, "x.\\?z", "axyz", 1, 1}, |
| 80 | {BRE, "x.\\?z", "ax\xc3\x84z", 1, 1}, |
| 81 | {BRE, "x.\\?z", "ax\xe2\x80\x94z", 1, 1}, |
| 82 | {BRE, "x.\\?z", "ax\xf0\x9d\x80\x80z", 1, 1}, |
| 83 | {BRE, "x.\\?z", "ax\xf9\x81\x82\x83\x84z", 1, 1}, |
| 84 | {BRE, "x.\\?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfz", 1, 1}, |
| 85 | {BRE, ".", "y", 0, 1}, |
| 86 | {BRE, ".", "\xc3\x84", 0, 1}, |
| 87 | {BRE, ".", "\xe2\x80\x94", 0, 1}, |
| 88 | {BRE, ".", "\xf0\x9d\x80\x80", 0, 1}, |
| 89 | {BRE, ".", "\xf9\x81\x82\x83\x84", 0, 1}, |
| 90 | {BRE, ".", "\xfd\xbf\xbf\xbf\xbf\xbf", 0, 1}, |
| 91 | {BRE, "x.\\?z", "axyyz", -1, 1}, |
| 92 | {BRE, "x.\\?z", "ax\xc3\x84\xc3\x96z", -1, 1}, |
| 93 | {BRE, "x.\\?z", "ax\xe2\x80\x94\xc3\xa4z", -1, 1}, |
| 94 | {BRE, "x.\\?z", "ax\xf0\x9d\x80\x80yz", -1, 1}, |
| 95 | {BRE, "x.\\?z", "ax\xf9\x81\x82\x83\x84\xf0\x9d\x80\x81z", -1, 1}, |
| 96 | {BRE, "x.\\?z", "ax\xfd\xbf\xbf\xbf\xbf\xbf\xc3\x96z", -1, 1}, |
| 97 | {BRE, "x.\\+z", "\xe2\x80\x94xz", -1, 1}, |
| 98 | {BRE, "x.\\+z", "\xe2\x80\x94xyz", 3, 1}, |
| 99 | {BRE, "x.\\+z", "\xe2\x80\x94x\xc3\x84y\xe2\x80\x94z", 3, 1}, |
| 100 | {BRE, "x.\\+z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1}, |
| 101 | {BRE, "x.\\+z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1}, |
| 102 | {BRE, "x.\\+z", "\xe2\x80\x94x.~\xe2\x80\x94\xf9\x81\x82\x83\x84z", 3, 1}, |
| 103 | {BRE, "x.\\+z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1}, |
| 104 | {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94xz", -1, 1}, |
| 105 | {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xc3\x96y\xc3\xa4z", -1, 1}, |
| 106 | {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94xyz", 3, 1}, |
| 107 | {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xc3\x84\xe2\x80\x94z", 3, 1}, |
| 108 | {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1}, |
| 109 | {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1}, |
| 110 | {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x~\xe2\x80\x94z", 3, 1}, |
| 111 | {BRE, "x.\\{1,2\\}z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1}, |
| 112 | {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "axz", 1, 1}, |
| 113 | {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfwz", 1, 1}, |
| 114 | {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "ax\xc3\x86z", 1, 1}, |
| 115 | {BRE, "x\\(.w\\|\xc3\x86\\)\\?z", "ax\xe2\x80\x96wz", 1, 1}, |
| 116 | {ERE, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, 1}, |
| 117 | {ERE, "^x|xy*z$", "\xc3\xb6xyyz", 2, 1}, |
| 118 | {ERE, "^x\\\\y{6}z+", "x\\yyyyyyzz\xc3\xb6", 0, 1}, |
| 119 | {ERE, "^x\\\\y{2,36}z+", "x\\yzz\xc3\xb6", -1, 1}, |
| 120 | {ERE, "^x\\\\y{,3}z+", "x\\yyyzz\xc3\xb6", 0, 1}, |
| 121 | {ERE, "x[C]y", "axCy", 1, 1}, |
| 122 | {ERE, "x[ABC]y", "axCy", 1, 1}, |
| 123 | {ERE, "\\`x|z\\'", "x\xe2\x80\x94", 0, 1}, |
| 124 | {ERE, "(xy)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, 1}, |
| 125 | {ERE, "xy?z", "\xc3\x84xz\xc3\xb6", 2, 1}, |
| 126 | {ERE, "x.y", "ax\xe2\x80\x94yz", 1, 1}, |
| 127 | {ERE, "x.*z", "\xc3\x84xz", 2, 1}, |
| 128 | {ERE, "x.*z", "\xc3\x84x\xe2\x80\x94z", 2, 1}, |
| 129 | {ERE, "x.*z", "\xc3\x84x\xe2\x80\x94y\xf1\x90\x80\x90z", 2, 1}, |
| 130 | {ERE, "x.*z", "\xc3\x84x\xe2\x80\x94\xc3\x94\xf1\x90\x80\x90z", 2, 1}, |
| 131 | {ERE, "x.?z", "axz", 1, 1}, |
| 132 | {ERE, "x.?z", "axyz", 1, 1}, |
| 133 | {ERE, "x.?z", "ax\xc3\x84z", 1, 1}, |
| 134 | {ERE, "x.?z", "ax\xe2\x80\x94z", 1, 1}, |
| 135 | {ERE, "x.?z", "ax\xf0\x9d\x80\x80z", 1, 1}, |
| 136 | {ERE, "x.?z", "ax\xf9\x81\x82\x83\x84z", 1, 1}, |
| 137 | {ERE, "x.?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfz", 1, 1}, |
| 138 | {ERE, "x.?z", "axyyz", -1, 1}, |
| 139 | {ERE, "x.?z", "ax\xc3\x84\xc3\x96z", -1, 1}, |
| 140 | {ERE, "x.?z", "ax\xe2\x80\x94\xc3\xa4z", -1, 1}, |
| 141 | {ERE, "x.?z", "ax\xf0\x9d\x80\x80yz", -1, 1}, |
| 142 | {ERE, "x.?z", "ax\xf9\x81\x82\x83\x84\xf0\x9d\x80\x81z", -1, 1}, |
| 143 | {ERE, "x.?z", "ax\xfd\xbf\xbf\xbf\xbf\xbf\xc3\x96z", -1, 1}, |
| 144 | {ERE, "x.+z", "\xe2\x80\x94xz", -1, 1}, |
| 145 | {ERE, "x.+z", "\xe2\x80\x94xyz", 3, 1}, |
| 146 | {ERE, "x.+z", "\xe2\x80\x94x\xc3\x84y\xe2\x80\x94z", 3, 1}, |
| 147 | {ERE, "x.+z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1}, |
| 148 | {ERE, "x.+z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1}, |
| 149 | {ERE, "x.+z", "\xe2\x80\x94x.~\xe2\x80\x94\xf9\x81\x82\x83\x84z", 3, 1}, |
| 150 | {ERE, "x.+z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1}, |
| 151 | {ERE, "x.{1,2}z", "\xe2\x80\x94xz", -1, 1}, |
| 152 | {ERE, "x.{1,2}z", "\xe2\x80\x94x\xc3\x96y\xc3\xa4z", -1, 1}, |
| 153 | {ERE, "x.{1,2}z", "\xe2\x80\x94xyz", 3, 1}, |
| 154 | {ERE, "x.{1,2}z", "\xe2\x80\x94x\xc3\x84\xe2\x80\x94z", 3, 1}, |
| 155 | {ERE, "x.{1,2}z", "\xe2\x80\x94x\xe2\x80\x94z", 3, 1}, |
| 156 | {ERE, "x.{1,2}z", "\xe2\x80\x94x\xf0\x9d\x80\x80\xc3\x84z", 3, 1}, |
| 157 | {ERE, "x.{1,2}z", "\xe2\x80\x94x~\xe2\x80\x94z", 3, 1}, |
| 158 | {ERE, "x.{1,2}z", "\xe2\x80\x94x\xfd\xbf\xbf\xbf\xbf\xbfz", 3, 1}, |
| 159 | {ERE, "x(.w|\xc3\x86)?z", "axz", 1, 1}, |
| 160 | {ERE, "x(.w|\xc3\x86)?z", "ax\xfd\xbf\xbf\xbf\xbf\xbfwz", 1, 1}, |
| 161 | {ERE, "x(.w|\xc3\x86)?z", "ax\xc3\x86z", 1, 1}, |
| 162 | {ERE, "x(.w|\xc3\x86)?z", "ax\xe2\x80\x96wz", 1, 1}, |
| 163 | /* Should not be optimized. */ |
| 164 | {BRE, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, 0}, |
| 165 | {BRE, "x[A-Z,]y", "axCy", 1, 0}, |
| 166 | {BRE, "x[^y]z", "ax\xe2\x80\x94z", 1, 0}, |
| 167 | {BRE, "x[[:alnum:]]z", "ax\xc3\x96z", 1, 0}, |
| 168 | {BRE, "x[[=A=]]z", "axAz", 1, 0}, |
| 169 | {BRE, "x[[=\xc3\x84=]]z", "ax\xc3\x84z", 1, 0}, |
| 170 | {BRE, "\\<g", "\xe2\x80\x94g", 3, 0}, |
| 171 | {BRE, "\\bg\\b", "\xe2\x80\x94g", 3, 0}, |
| 172 | {BRE, "\\Bg\\B", "\xc3\xa4g\xc3\xa4", 2, 0}, |
| 173 | {BRE, "a\\wz", "a\xc3\x84z", 0, 0}, |
| 174 | {BRE, "x\\Wz", "\xc3\x96x\xe2\x80\x94z", 2, 0}, |
| 175 | {ERE, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, 0}, |
| 176 | {ERE, "x[A-Z,]y", "axCy", 1, 0}, |
| 177 | {ERE, "x[^y]z", "ax\xe2\x80\x94z", 1, 0}, |
| 178 | {ERE, "x[[:alnum:]]z", "ax\xc3\x96z", 1, 0}, |
| 179 | {ERE, "x[[=A=]]z", "axAz", 1, 0}, |
| 180 | {ERE, "x[[=\xc3\x84=]]z", "ax\xc3\x84z", 1, 0}, |
| 181 | {ERE, "\\<g", "\xe2\x80\x94g", 3, 0}, |
| 182 | {ERE, "\\bg\\b", "\xe2\x80\x94g", 3, 0}, |
| 183 | {ERE, "\\Bg\\B", "\xc3\xa4g\xc3\xa4", 2, 0}, |
| 184 | {ERE, "a\\wz", "a\xc3\x84z", 0, 0}, |
| 185 | {ERE, "x\\Wz", "\xc3\x96x\xe2\x80\x94z", 2, 0}, |
| 186 | }; |
| 187 | |
| 188 | int |
| 189 | main (void) |
| 190 | { |
| 191 | struct re_pattern_buffer regbuf; |
| 192 | const char *err; |
| 193 | size_t i; |
| 194 | int ret = 0; |
| 195 | |
| 196 | mtrace (); |
| 197 | |
| 198 | setlocale (LC_ALL, "de_DE.UTF-8"); |
| 199 | for (i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i) |
| 200 | { |
| 201 | int res, optimized; |
| 202 | |
| 203 | re_set_syntax (tests[i].syntax); |
| 204 | memset (®buf, '\0', sizeof (regbuf)); |
| 205 | err = re_compile_pattern (tests[i].pattern, strlen (tests[i].pattern), |
| 206 | ®buf); |
| 207 | if (err != NULL) |
| 208 | { |
| 209 | printf ("re_compile_pattern failed: %s\n", err); |
| 210 | ret = 1; |
| 211 | continue; |
| 212 | } |
| 213 | |
| 214 | /* Check if re_search will be done as multi-byte or single-byte. */ |
| 215 | optimized = ((re_dfa_t *) regbuf.buffer)->mb_cur_max == 1; |
| 216 | if (optimized != tests[i].optimize) |
| 217 | { |
| 218 | printf ("pattern %zd %soptimized while it should%s be\n", |
| 219 | i, optimized ? "" : "not ", tests[i].optimize ? "" : " not"); |
| 220 | ret = 1; |
| 221 | } |
| 222 | |
| 223 | int str_len = strlen (tests[i].string); |
| 224 | res = re_search (®buf, tests[i].string, str_len, 0, str_len, NULL); |
| 225 | if (res != tests[i].res) |
| 226 | { |
| 227 | printf ("re_search %zd failed: %d\n", i, res); |
| 228 | ret = 1; |
| 229 | regfree (®buf); |
| 230 | continue; |
| 231 | } |
| 232 | |
| 233 | res = re_search (®buf, tests[i].string, str_len, str_len, -str_len, |
| 234 | NULL); |
| 235 | if (res != tests[i].res) |
| 236 | { |
| 237 | printf ("backward re_search %zd failed: %d\n", i, res); |
| 238 | ret = 1; |
| 239 | regfree (®buf); |
| 240 | continue; |
| 241 | } |
| 242 | regfree (®buf); |
| 243 | |
| 244 | re_set_syntax (tests[i].syntax | RE_ICASE); |
| 245 | memset (®buf, '\0', sizeof (regbuf)); |
| 246 | err = re_compile_pattern (tests[i].pattern, strlen (tests[i].pattern), |
| 247 | ®buf); |
| 248 | if (err != NULL) |
| 249 | { |
| 250 | printf ("re_compile_pattern failed: %s\n", err); |
| 251 | ret = 1; |
| 252 | continue; |
| 253 | } |
| 254 | |
| 255 | /* Check if re_search will be done as multi-byte or single-byte. */ |
| 256 | optimized = ((re_dfa_t *) regbuf.buffer)->mb_cur_max == 1; |
| 257 | if (optimized) |
| 258 | { |
| 259 | printf ("pattern %zd optimized while it should not be when case insensitive\n", |
| 260 | i); |
| 261 | ret = 1; |
| 262 | } |
| 263 | |
| 264 | res = re_search (®buf, tests[i].string, str_len, 0, str_len, NULL); |
| 265 | if (res != tests[i].res) |
| 266 | { |
| 267 | printf ("ICASE re_search %zd failed: %d\n", i, res); |
| 268 | ret = 1; |
| 269 | regfree (®buf); |
| 270 | continue; |
| 271 | } |
| 272 | |
| 273 | res = re_search (®buf, tests[i].string, str_len, str_len, -str_len, |
| 274 | NULL); |
| 275 | if (res != tests[i].res) |
| 276 | { |
| 277 | printf ("ICASE backward re_search %zd failed: %d\n", i, res); |
| 278 | ret = 1; |
| 279 | regfree (®buf); |
| 280 | continue; |
| 281 | } |
| 282 | regfree (®buf); |
| 283 | } |
| 284 | |
| 285 | return ret; |
| 286 | } |