| xj | b04a402 | 2021-11-25 15:01:52 +0800 | [diff] [blame] | 1 | /* | 
|  | 2 | * Kernel module for testing utf-8 support. | 
|  | 3 | * | 
|  | 4 | * Copyright 2017 Collabora Ltd. | 
|  | 5 | * | 
|  | 6 | * This software is licensed under the terms of the GNU General Public | 
|  | 7 | * License version 2, as published by the Free Software Foundation, and | 
|  | 8 | * may be copied, distributed, and modified under those terms. | 
|  | 9 | * | 
|  | 10 | * This program is distributed in the hope that it will be useful, | 
|  | 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
|  | 13 | * GNU General Public License for more details. | 
|  | 14 | */ | 
|  | 15 |  | 
|  | 16 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 
|  | 17 |  | 
|  | 18 | #include <linux/module.h> | 
|  | 19 | #include <linux/printk.h> | 
|  | 20 | #include <linux/unicode.h> | 
|  | 21 | #include <linux/dcache.h> | 
|  | 22 |  | 
|  | 23 | #include "utf8n.h" | 
|  | 24 |  | 
/*
 * Running totals maintained by the test()/test_f() macros and reported by
 * init_test_ucd().  They are only used inside this file, so keep them out
 * of the kernel's global namespace.
 */
static unsigned int failed_tests;
static unsigned int total_tests;
|  | 27 |  | 
/*
 * Unicode version (major.minor.revision) that every check in this file is
 * run against.
 */
enum {
	latest_maj = 12,
	latest_min = 1,
	latest_rev = 0,
};
|  | 32 |  | 
/*
 * _test() - record one check and report it if it does not hold.
 * @cond: expression expected to be true.  It is parenthesized on expansion,
 *        so callers may pass a bare comparison such as "a == b".
 * @func: name of the calling function, printed in the failure report.
 * @line: source line of the check, printed in the failure report.
 * @fmt:  string-literal format for an optional detail message, "" for none.
 *
 * total_tests is always incremented.  On failure failed_tests is incremented
 * and "test <func>:<line> Failed: <cond>" is logged, followed on the same
 * line by the detail message when one was supplied.
 *
 * Use the test() and test_f() wrappers rather than calling this directly.
 */
#define _test(cond, func, line, fmt, ...) do {				\
		total_tests++;						\
		if (!(cond)) {						\
			failed_tests++;					\
			/* fmt is a literal: test its first char, not the pointer */ \
			pr_err("test %s:%d Failed: %s%s",		\
			       func, line, #cond,			\
			       ((fmt)[0] ? ": " : ".\n"));		\
			if ((fmt)[0])					\
				pr_cont(fmt, ##__VA_ARGS__);		\
		}							\
	} while (0)

/* Check @cond and, on failure, append a printf-style detail message. */
#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__)

/* Check @cond with no detail message. */
#define test(cond) _test(cond, __func__, __LINE__, "")
|  | 45 |  | 
/*
 * Canonical decomposition test vectors: @str is the input and @dec is the
 * byte sequence it is expected to normalize to.
 */
static const struct {
	/* UTF-8 strings in this vector _must_ be NUL-terminated. */
	unsigned char str[10];
	unsigned char dec[10];
} nfdi_test_data[] = {
	/* Trivial sequence */
	{
		/* "aBba" decomposes to itself */
		.str = "aBba",
		.dec = "aBba",
	},
	/* Simple equivalent sequences */
	{
		/* 'VULGAR FRACTION ONE QUARTER' cannot decompose to
		   'NUMBER 1' + 'FRACTION SLASH' + 'NUMBER 4' on
		   canonical decomposition */
		.str = {0xc2, 0xbc, 0x00},
		.dec = {0xc2, 0xbc, 0x00},
	},
	{
		/* 'LATIN SMALL LETTER A WITH DIAERESIS' decomposes to
		   'LETTER A' + 'COMBINING DIAERESIS' */
		.str = {0xc3, 0xa4, 0x00},
		.dec = {0x61, 0xcc, 0x88, 0x00},
	},
	{
		/* 'LATIN SMALL LETTER LJ' can't decompose to
		   'LETTER L' + 'LETTER J' on canonical decomposition */
		.str = {0xC7, 0x89, 0x00},
		.dec = {0xC7, 0x89, 0x00},
	},
	{
		/* GREEK ANO TELEIA decomposes to MIDDLE DOT */
		.str = {0xCE, 0x87, 0x00},
		.dec = {0xC2, 0xB7, 0x00}
	},
	/* Canonical ordering */
	{
		/* A + 'COMBINING ACUTE ACCENT' + 'COMBINING OGONEK' decomposes
		   to A + 'COMBINING OGONEK' + 'COMBINING ACUTE ACCENT' */
		.str = {0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x0},
		.dec = {0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x0},
	},
	{
		/* 'LATIN SMALL LETTER A WITH DIAERESIS' + 'COMBINING OGONEK'
		   decomposes to
		   'LETTER A' + 'COMBINING OGONEK' + 'COMBINING DIAERESIS' */
		.str = {0xc3, 0xa4, 0xCC, 0xA8, 0x00},

		.dec = {0x61, 0xCC, 0xA8, 0xcc, 0x88, 0x00},
	},

};
|  | 99 |  | 
/*
 * Decomposition + case-folding test vectors: @str is the input and @ncf is
 * the byte sequence it is expected to normalize and fold to.
 */
static const struct {
	/* UTF-8 strings in this vector _must_ be NUL-terminated. */
	unsigned char str[30];
	unsigned char ncf[30];
} nfdicf_test_data[] = {
	/* Trivial sequences */
	{
		/* "ABba" folds to lowercase */
		.str = {0x41, 0x42, 0x62, 0x61, 0x00},
		.ncf = {0x61, 0x62, 0x62, 0x61, 0x00},
	},
	{
		/* All ASCII folds to lower-case */
		.str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0.1",
		.ncf = "abcdefghijklmnopqrstuvwxyz0.1",
	},
	{
		/* LATIN SMALL LETTER SHARP S folds to
		   LATIN SMALL LETTER S + LATIN SMALL LETTER S */
		.str = {0xc3, 0x9f, 0x00},
		.ncf = {0x73, 0x73, 0x00},
	},
	{
		/* LATIN CAPITAL LETTER A WITH RING ABOVE folds to
		   LATIN SMALL LETTER A + COMBINING RING ABOVE */
		.str = {0xC3, 0x85, 0x00},
		.ncf = {0x61, 0xcc, 0x8a, 0x00},
	},
	/* Introduced by Unicode 8.0.0. */
	/* Cherokee letters are interesting test-cases because they fold
	   to upper-case.  Before 8.0.0, Cherokee lowercase were
	   undefined, thus, the folding from LC is not stable between
	   7.0.0 -> 8.0.0, but it is from UC. */
	{
		/* CHEROKEE SMALL LETTER A folds to CHEROKEE LETTER A */
		.str = {0xea, 0xad, 0xb0, 0x00},
		.ncf = {0xe1, 0x8e, 0xa0, 0x00},
	},
	{
		/* CHEROKEE SMALL LETTER YE folds to CHEROKEE LETTER YE */
		.str = {0xe1, 0x8f, 0xb8, 0x00},
		.ncf = {0xe1, 0x8f, 0xb0, 0x00},
	},
	{
		/* OLD HUNGARIAN CAPITAL LETTER AMB folds to
		   OLD HUNGARIAN SMALL LETTER AMB */
		.str = {0xf0, 0x90, 0xb2, 0x83, 0x00},
		.ncf = {0xf0, 0x90, 0xb3, 0x83, 0x00},
	},
	/* Introduced by Unicode 9.0.0. */
	{
		/* OSAGE CAPITAL LETTER CHA folds to
		   OSAGE SMALL LETTER CHA */
		.str = {0xf0, 0x90, 0x92, 0xb5, 0x00},
		.ncf = {0xf0, 0x90, 0x93, 0x9d, 0x00},
	},
	{
		/* LATIN CAPITAL LETTER SMALL CAPITAL I folds to
		   LATIN LETTER SMALL CAPITAL I */
		.str = {0xea, 0x9e, 0xae, 0x00},
		.ncf = {0xc9, 0xaa, 0x00},
	},
	/* Introduced by Unicode 11.0.0. */
	{
		/* GEORGIAN MTAVRULI CAPITAL LETTER AN (U+1C90) folds to
		   GEORGIAN LETTER AN (U+10D0) */
		.str = {0xe1, 0xb2, 0x90, 0x00},
		.ncf = {0xe1, 0x83, 0x90, 0x00},
	}
};
|  | 170 |  | 
|  | 171 | static void check_utf8_nfdi(void) | 
|  | 172 | { | 
|  | 173 | int i; | 
|  | 174 | struct utf8cursor u8c; | 
|  | 175 | const struct utf8data *data; | 
|  | 176 |  | 
|  | 177 | data = utf8nfdi(UNICODE_AGE(latest_maj, latest_min, latest_rev)); | 
|  | 178 | if (!data) { | 
|  | 179 | pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n", | 
|  | 180 | __func__, latest_maj, latest_min, latest_rev); | 
|  | 181 | return; | 
|  | 182 | } | 
|  | 183 |  | 
|  | 184 | for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { | 
|  | 185 | int len = strlen(nfdi_test_data[i].str); | 
|  | 186 | int nlen = strlen(nfdi_test_data[i].dec); | 
|  | 187 | int j = 0; | 
|  | 188 | unsigned char c; | 
|  | 189 |  | 
|  | 190 | test((utf8len(data, nfdi_test_data[i].str) == nlen)); | 
|  | 191 | test((utf8nlen(data, nfdi_test_data[i].str, len) == nlen)); | 
|  | 192 |  | 
|  | 193 | if (utf8cursor(&u8c, data, nfdi_test_data[i].str) < 0) | 
|  | 194 | pr_err("can't create cursor\n"); | 
|  | 195 |  | 
|  | 196 | while ((c = utf8byte(&u8c)) > 0) { | 
|  | 197 | test_f((c == nfdi_test_data[i].dec[j]), | 
|  | 198 | "Unexpected byte 0x%x should be 0x%x\n", | 
|  | 199 | c, nfdi_test_data[i].dec[j]); | 
|  | 200 | j++; | 
|  | 201 | } | 
|  | 202 |  | 
|  | 203 | test((j == nlen)); | 
|  | 204 | } | 
|  | 205 | } | 
|  | 206 |  | 
|  | 207 | static void check_utf8_nfdicf(void) | 
|  | 208 | { | 
|  | 209 | int i; | 
|  | 210 | struct utf8cursor u8c; | 
|  | 211 | const struct utf8data *data; | 
|  | 212 |  | 
|  | 213 | data = utf8nfdicf(UNICODE_AGE(latest_maj, latest_min, latest_rev)); | 
|  | 214 | if (!data) { | 
|  | 215 | pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n", | 
|  | 216 | __func__, latest_maj, latest_min, latest_rev); | 
|  | 217 | return; | 
|  | 218 | } | 
|  | 219 |  | 
|  | 220 | for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { | 
|  | 221 | int len = strlen(nfdicf_test_data[i].str); | 
|  | 222 | int nlen = strlen(nfdicf_test_data[i].ncf); | 
|  | 223 | int j = 0; | 
|  | 224 | unsigned char c; | 
|  | 225 |  | 
|  | 226 | test((utf8len(data, nfdicf_test_data[i].str) == nlen)); | 
|  | 227 | test((utf8nlen(data, nfdicf_test_data[i].str, len) == nlen)); | 
|  | 228 |  | 
|  | 229 | if (utf8cursor(&u8c, data, nfdicf_test_data[i].str) < 0) | 
|  | 230 | pr_err("can't create cursor\n"); | 
|  | 231 |  | 
|  | 232 | while ((c = utf8byte(&u8c)) > 0) { | 
|  | 233 | test_f((c == nfdicf_test_data[i].ncf[j]), | 
|  | 234 | "Unexpected byte 0x%x should be 0x%x\n", | 
|  | 235 | c, nfdicf_test_data[i].ncf[j]); | 
|  | 236 | j++; | 
|  | 237 | } | 
|  | 238 |  | 
|  | 239 | test((j == nlen)); | 
|  | 240 | } | 
|  | 241 | } | 
|  | 242 |  | 
|  | 243 | static void check_utf8_comparisons(void) | 
|  | 244 | { | 
|  | 245 | int i; | 
|  | 246 | struct unicode_map *table = utf8_load("12.1.0"); | 
|  | 247 |  | 
|  | 248 | if (IS_ERR(table)) { | 
|  | 249 | pr_err("%s: Unable to load utf8 %d.%d.%d. Skipping.\n", | 
|  | 250 | __func__, latest_maj, latest_min, latest_rev); | 
|  | 251 | return; | 
|  | 252 | } | 
|  | 253 |  | 
|  | 254 | for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { | 
|  | 255 | const struct qstr s1 = {.name = nfdi_test_data[i].str, | 
|  | 256 | .len = sizeof(nfdi_test_data[i].str)}; | 
|  | 257 | const struct qstr s2 = {.name = nfdi_test_data[i].dec, | 
|  | 258 | .len = sizeof(nfdi_test_data[i].dec)}; | 
|  | 259 |  | 
|  | 260 | test_f(!utf8_strncmp(table, &s1, &s2), | 
|  | 261 | "%s %s comparison mismatch\n", s1.name, s2.name); | 
|  | 262 | } | 
|  | 263 |  | 
|  | 264 | for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { | 
|  | 265 | const struct qstr s1 = {.name = nfdicf_test_data[i].str, | 
|  | 266 | .len = sizeof(nfdicf_test_data[i].str)}; | 
|  | 267 | const struct qstr s2 = {.name = nfdicf_test_data[i].ncf, | 
|  | 268 | .len = sizeof(nfdicf_test_data[i].ncf)}; | 
|  | 269 |  | 
|  | 270 | test_f(!utf8_strncasecmp(table, &s1, &s2), | 
|  | 271 | "%s %s comparison mismatch\n", s1.name, s2.name); | 
|  | 272 | } | 
|  | 273 |  | 
|  | 274 | utf8_unload(table); | 
|  | 275 | } | 
|  | 276 |  | 
|  | 277 | static void check_supported_versions(void) | 
|  | 278 | { | 
|  | 279 | /* Unicode 7.0.0 should be supported. */ | 
|  | 280 | test(utf8version_is_supported(7, 0, 0)); | 
|  | 281 |  | 
|  | 282 | /* Unicode 9.0.0 should be supported. */ | 
|  | 283 | test(utf8version_is_supported(9, 0, 0)); | 
|  | 284 |  | 
|  | 285 | /* Unicode 1x.0.0 (the latest version) should be supported. */ | 
|  | 286 | test(utf8version_is_supported(latest_maj, latest_min, latest_rev)); | 
|  | 287 |  | 
|  | 288 | /* Next versions don't exist. */ | 
|  | 289 | test(!utf8version_is_supported(13, 0, 0)); | 
|  | 290 | test(!utf8version_is_supported(0, 0, 0)); | 
|  | 291 | test(!utf8version_is_supported(-1, -1, -1)); | 
|  | 292 | } | 
|  | 293 |  | 
|  | 294 | static int __init init_test_ucd(void) | 
|  | 295 | { | 
|  | 296 | failed_tests = 0; | 
|  | 297 | total_tests = 0; | 
|  | 298 |  | 
|  | 299 | check_supported_versions(); | 
|  | 300 | check_utf8_nfdi(); | 
|  | 301 | check_utf8_nfdicf(); | 
|  | 302 | check_utf8_comparisons(); | 
|  | 303 |  | 
|  | 304 | if (!failed_tests) | 
|  | 305 | pr_info("All %u tests passed\n", total_tests); | 
|  | 306 | else | 
|  | 307 | pr_err("%u out of %u tests failed\n", failed_tests, | 
|  | 308 | total_tests); | 
|  | 309 | return 0; | 
|  | 310 | } | 
|  | 311 |  | 
|  | 312 | static void __exit exit_test_ucd(void) | 
|  | 313 | { | 
|  | 314 | } | 
|  | 315 |  | 
|  | 316 | module_init(init_test_ucd); | 
|  | 317 | module_exit(exit_test_ucd); | 
|  | 318 |  | 
|  | 319 | MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>"); | 
|  | 320 | MODULE_LICENSE("GPL"); |