| xf.li | bdd93d5 | 2023-05-12 07:10:14 -0700 | [diff] [blame] | 1 | /* Copyright (C) 2000-2016 Free Software Foundation, Inc. | 
|  | 2 | This file is part of the GNU C Library. | 
|  | 3 | Contributed by Bruno Haible <haible@clisp.cons.org>, 2000. | 
|  | 4 |  | 
|  | 5 | The GNU C Library is free software; you can redistribute it and/or | 
|  | 6 | modify it under the terms of the GNU Lesser General Public | 
|  | 7 | License as published by the Free Software Foundation; either | 
|  | 8 | version 2.1 of the License, or (at your option) any later version. | 
|  | 9 |  | 
|  | 10 | The GNU C Library is distributed in the hope that it will be useful, | 
|  | 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | 
|  | 13 | Lesser General Public License for more details. | 
|  | 14 |  | 
|  | 15 | You should have received a copy of the GNU Lesser General Public | 
|  | 16 | License along with the GNU C Library; if not, see | 
|  | 17 | <http://www.gnu.org/licenses/>.  */ | 
|  | 18 |  | 
|  | 19 | /* Create a table from CHARSET to Unicode. | 
|  | 20 | This is a good test for CHARSET's iconv() module, in particular the | 
|  | 21 | FROM_LOOP BODY macro.  */ | 
|  | 22 |  | 
|  | 23 | #include <stddef.h> | 
|  | 24 | #include <stdio.h> | 
|  | 25 | #include <stdlib.h> | 
|  | 26 | #include <string.h> | 
|  | 27 | #include <iconv.h> | 
|  | 28 | #include <errno.h> | 
|  | 29 |  | 
|  | 30 | /* If nonzero, ignore conversions outside Unicode plane 0.  Assigned once by main from the charset name; read by utf8_decode, which returns NULL as soon as a formatted code point needs more than four hex digits.  */ | 
|  | 31 | static int bmp_only; | 
|  | 32 |  | 
/* Formats the first BUFLEN bytes of BUF as one hexadecimal literal:
   "0x" followed by two uppercase hex digits per byte, in buffer order.
   BUFLEN must be between 1 and 4; any other length aborts the program.
   The result lives in a static buffer that the next call overwrites.  */
static const char*
hexbuf (unsigned char buf[], unsigned int buflen)
{
  static char msg[50];
  char *cursor = msg;
  unsigned int idx;

  /* Only sequences of one to four bytes are ever expected here.  */
  if (buflen < 1 || buflen > 4)
    abort ();

  cursor += sprintf (cursor, "0x");
  for (idx = 0; idx < buflen; idx++)
    cursor += sprintf (cursor, "%02X", buf[idx]);

  return msg;
}
|  | 58 |  | 
/* Runs the BUFLEN bytes of BUF through the conversion descriptor CD,
   storing the converted bytes in OUT, which must have room for 12 bytes.

   Returns the number of bytes written to OUT when the conversion
   succeeds, -1 when iconv fails with EILSEQ (invalid input), and 0 when
   it fails with EINVAL (the input is only a prefix of a longer
   sequence).  Any other iconv failure, or a successful call that leaves
   input bytes unconsumed, is reported on stderr and terminates the
   program with status 1.  */
static int
try (iconv_t cd, unsigned char buf[], unsigned int buflen, unsigned char *out)
{
  enum { out_capacity = 12 };

  const char *src = (const char *) buf;
  size_t src_left = buflen;
  char *dst = (char *) out;
  size_t dst_left = out_capacity;
  size_t status;

  /* Put the descriptor back into its initial shift state, so that the
     outcome does not depend on whatever was converted before.  */
  iconv (cd, NULL, NULL, NULL, NULL);

  status = iconv (cd, (char **) &src, &src_left, &dst, &dst_left);
  if (status != (size_t)(-1))
    /* Flush any pending output, such as a closing shift sequence.  */
    status = iconv (cd, NULL, NULL, &dst, &dst_left);

  if (status != (size_t)(-1))
    {
      if (src_left == 0)
	return out_capacity - dst_left;

      /* iconv claimed success without consuming the whole input.  */
      fprintf (stderr, "%s: inbytes = %ld, outbytes = %ld\n",
	       hexbuf (buf, buflen),
	       (long) (buflen - src_left),
	       (long) (out_capacity - dst_left));
      exit (1);
    }

  switch (errno)
    {
    case EILSEQ:
      return -1;

    case EINVAL:
      return 0;

    default:
      {
	/* fprintf may clobber errno, so keep it for perror.  */
	int saved_errno = errno;
	fprintf (stderr, "%s: iconv error: ", hexbuf (buf, buflen));
	errno = saved_errno;
	perror ("");
	exit (1);
      }
    }
}
|  | 108 |  | 
|  | 109 | /* Returns the out[] buffer as a Unicode value, formatted as 0x%04X.  */ | 
|  | 110 | static const char * | 
|  | 111 | utf8_decode (const unsigned char *out, unsigned int outlen) | 
|  | 112 | { | 
|  | 113 | static char hexbuf[84]; | 
|  | 114 | char *p = hexbuf; | 
|  | 115 |  | 
|  | 116 | while (outlen > 0) | 
|  | 117 | { | 
|  | 118 | if (p > hexbuf) | 
|  | 119 | *p++ = ' '; | 
|  | 120 |  | 
|  | 121 | if (out[0] < 0x80) | 
|  | 122 | { | 
|  | 123 | sprintf (p, "0x%04X", out[0]); | 
|  | 124 | out += 1; outlen -= 1; | 
|  | 125 | } | 
|  | 126 | else if (out[0] >= 0xc0 && out[0] < 0xe0 && outlen >= 2) | 
|  | 127 | { | 
|  | 128 | sprintf (p, "0x%04X", ((out[0] & 0x1f) << 6) + (out[1] & 0x3f)); | 
|  | 129 | out += 2; outlen -= 2; | 
|  | 130 | } | 
|  | 131 | else if (out[0] >= 0xe0 && out[0] < 0xf0 && outlen >= 3) | 
|  | 132 | { | 
|  | 133 | sprintf (p, "0x%04X", ((out[0] & 0x0f) << 12) | 
|  | 134 | + ((out[1] & 0x3f) << 6) + (out[2] & 0x3f)); | 
|  | 135 | out += 3; outlen -= 3; | 
|  | 136 | } | 
|  | 137 | else if (out[0] >= 0xf0 && out[0] < 0xf8 && outlen >= 4) | 
|  | 138 | { | 
|  | 139 | sprintf (p, "0x%04X", ((out[0] & 0x07) << 18) | 
|  | 140 | + ((out[1] & 0x3f) << 12) | 
|  | 141 | + ((out[2] & 0x3f) << 6) + (out[3] & 0x3f)); | 
|  | 142 | out += 4; outlen -= 4; | 
|  | 143 | } | 
|  | 144 | else if (out[0] >= 0xf8 && out[0] < 0xfc && outlen >= 5) | 
|  | 145 | { | 
|  | 146 | sprintf (p, "0x%04X", ((out[0] & 0x03) << 24) | 
|  | 147 | + ((out[1] & 0x3f) << 18) | 
|  | 148 | + ((out[2] & 0x3f) << 12) | 
|  | 149 | + ((out[3] & 0x3f) << 6) + (out[4] & 0x3f)); | 
|  | 150 | out += 5; outlen -= 5; | 
|  | 151 | } | 
|  | 152 | else if (out[0] >= 0xfc && out[0] < 0xfe && outlen >= 6) | 
|  | 153 | { | 
|  | 154 | sprintf (p, "0x%04X", ((out[0] & 0x01) << 30) | 
|  | 155 | + ((out[1] & 0x3f) << 24) | 
|  | 156 | + ((out[2] & 0x3f) << 18) | 
|  | 157 | + ((out[3] & 0x3f) << 12) | 
|  | 158 | + ((out[4] & 0x3f) << 6) + (out[5] & 0x3f)); | 
|  | 159 | out += 6; outlen -= 6; | 
|  | 160 | } | 
|  | 161 | else | 
|  | 162 | { | 
|  | 163 | sprintf (p, "0x????"); | 
|  | 164 | out += 1; outlen -= 1; | 
|  | 165 | } | 
|  | 166 |  | 
|  | 167 | if (bmp_only && strlen (p) > 6) | 
|  | 168 | /* Ignore conversions outside Unicode plane 0.  */ | 
|  | 169 | return NULL; | 
|  | 170 |  | 
|  | 171 | p += strlen (p); | 
|  | 172 | } | 
|  | 173 |  | 
|  | 174 | return hexbuf; | 
|  | 175 | } | 
|  | 176 |  | 
|  | 177 | int | 
|  | 178 | main (int argc, char *argv[]) | 
|  | 179 | { | 
|  | 180 | const char *charset; | 
|  | 181 | iconv_t cd; | 
|  | 182 | int search_depth; | 
|  | 183 |  | 
|  | 184 | if (argc != 2) | 
|  | 185 | { | 
|  | 186 | fprintf (stderr, "Usage: tst-table-from charset\n"); | 
|  | 187 | exit (1); | 
|  | 188 | } | 
|  | 189 | charset = argv[1]; | 
|  | 190 |  | 
|  | 191 | cd = iconv_open ("UTF-8", charset); | 
|  | 192 | if (cd == (iconv_t)(-1)) | 
|  | 193 | { | 
|  | 194 | perror ("iconv_open"); | 
|  | 195 | exit (1); | 
|  | 196 | } | 
|  | 197 |  | 
|  | 198 | /* When testing UTF-8 or GB18030, stop at 0x10000, otherwise the output | 
|  | 199 | file gets too big.  */ | 
|  | 200 | bmp_only = (strcmp (charset, "UTF-8") == 0 | 
|  | 201 | || strcmp (charset, "GB18030") == 0); | 
|  | 202 | search_depth = (strcmp (charset, "UTF-8") == 0 ? 3 : 4); | 
|  | 203 |  | 
|  | 204 | { | 
|  | 205 | unsigned char out[12]; | 
|  | 206 | unsigned char buf[4]; | 
|  | 207 | unsigned int i0, i1, i2, i3; | 
|  | 208 | int result; | 
|  | 209 |  | 
|  | 210 | for (i0 = 0; i0 < 0x100; i0++) | 
|  | 211 | { | 
|  | 212 | buf[0] = i0; | 
|  | 213 | result = try (cd, buf, 1, out); | 
|  | 214 | if (result < 0) | 
|  | 215 | { | 
|  | 216 | } | 
|  | 217 | else if (result > 0) | 
|  | 218 | { | 
|  | 219 | const char *unicode = utf8_decode (out, result); | 
|  | 220 | if (unicode != NULL) | 
|  | 221 | printf ("0x%02X\t%s\n", i0, unicode); | 
|  | 222 | } | 
|  | 223 | else | 
|  | 224 | { | 
|  | 225 | for (i1 = 0; i1 < 0x100; i1++) | 
|  | 226 | { | 
|  | 227 | buf[1] = i1; | 
|  | 228 | result = try (cd, buf, 2, out); | 
|  | 229 | if (result < 0) | 
|  | 230 | { | 
|  | 231 | } | 
|  | 232 | else if (result > 0) | 
|  | 233 | { | 
|  | 234 | const char *unicode = utf8_decode (out, result); | 
|  | 235 | if (unicode != NULL) | 
|  | 236 | printf ("0x%02X%02X\t%s\n", i0, i1, unicode); | 
|  | 237 | } | 
|  | 238 | else | 
|  | 239 | { | 
|  | 240 | for (i2 = 0; i2 < 0x100; i2++) | 
|  | 241 | { | 
|  | 242 | buf[2] = i2; | 
|  | 243 | result = try (cd, buf, 3, out); | 
|  | 244 | if (result < 0) | 
|  | 245 | { | 
|  | 246 | } | 
|  | 247 | else if (result > 0) | 
|  | 248 | { | 
|  | 249 | const char *unicode = utf8_decode (out, result); | 
|  | 250 | if (unicode != NULL) | 
|  | 251 | printf ("0x%02X%02X%02X\t%s\n", | 
|  | 252 | i0, i1, i2, unicode); | 
|  | 253 | } | 
|  | 254 | else if (search_depth > 3) | 
|  | 255 | { | 
|  | 256 | for (i3 = 0; i3 < 0x100; i3++) | 
|  | 257 | { | 
|  | 258 | buf[3] = i3; | 
|  | 259 | result = try (cd, buf, 4, out); | 
|  | 260 | if (result < 0) | 
|  | 261 | { | 
|  | 262 | } | 
|  | 263 | else if (result > 0) | 
|  | 264 | { | 
|  | 265 | const char *unicode = | 
|  | 266 | utf8_decode (out, result); | 
|  | 267 | if (unicode != NULL) | 
|  | 268 | printf ("0x%02X%02X%02X%02X\t%s\n", | 
|  | 269 | i0, i1, i2, i3, unicode); | 
|  | 270 | } | 
|  | 271 | else | 
|  | 272 | { | 
|  | 273 | fprintf (stderr, | 
|  | 274 | "%s: incomplete byte sequence\n", | 
|  | 275 | hexbuf (buf, 4)); | 
|  | 276 | exit (1); | 
|  | 277 | } | 
|  | 278 | } | 
|  | 279 | } | 
|  | 280 | } | 
|  | 281 | } | 
|  | 282 | } | 
|  | 283 | } | 
|  | 284 | } | 
|  | 285 | } | 
|  | 286 |  | 
|  | 287 | if (iconv_close (cd) < 0) | 
|  | 288 | { | 
|  | 289 | perror ("iconv_close"); | 
|  | 290 | exit (1); | 
|  | 291 | } | 
|  | 292 |  | 
|  | 293 | if (ferror (stdin) || fflush (stdout) || ferror (stdout)) | 
|  | 294 | { | 
|  | 295 | fprintf (stderr, "I/O error\n"); | 
|  | 296 | exit (1); | 
|  | 297 | } | 
|  | 298 |  | 
|  | 299 | return 0; | 
|  | 300 | } |