| xf.li | bdd93d5 | 2023-05-12 07:10:14 -0700 | [diff] [blame] | 1 | /* Copyright (C) 2000-2016 Free Software Foundation, Inc. | 
 | 2 |    This file is part of the GNU C Library. | 
 | 3 |    Contributed by Bruno Haible <haible@clisp.cons.org>, 2000. | 
 | 4 |  | 
 | 5 |    The GNU C Library is free software; you can redistribute it and/or | 
 | 6 |    modify it under the terms of the GNU Lesser General Public | 
 | 7 |    License as published by the Free Software Foundation; either | 
 | 8 |    version 2.1 of the License, or (at your option) any later version. | 
 | 9 |  | 
 | 10 |    The GNU C Library is distributed in the hope that it will be useful, | 
 | 11 |    but WITHOUT ANY WARRANTY; without even the implied warranty of | 
 | 12 |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | 
 | 13 |    Lesser General Public License for more details. | 
 | 14 |  | 
 | 15 |    You should have received a copy of the GNU Lesser General Public | 
 | 16 |    License along with the GNU C Library; if not, see | 
 | 17 |    <http://www.gnu.org/licenses/>.  */ | 
 | 18 |  | 
 | 19 | /* Create a table from CHARSET to Unicode. | 
 | 20 |    This is a good test for CHARSET's iconv() module, in particular the | 
 | 21 |    FROM_LOOP BODY macro.  */ | 
 | 22 |  | 
 | 23 | #include <stddef.h> | 
 | 24 | #include <stdio.h> | 
 | 25 | #include <stdlib.h> | 
 | 26 | #include <string.h> | 
 | 27 | #include <iconv.h> | 
 | 28 | #include <errno.h> | 
 | 29 |  | 
 | 30 | /* If nonzero, utf8_decode returns NULL for any conversion whose result lies outside Unicode plane 0, so main prints no table line for it.  main sets this flag for the UTF-8 and GB18030 charsets.  */ | 
 | 31 | static int bmp_only; | 
 | 32 |  | 
 | 33 | /* Converts a byte buffer to a hexadecimal string.  */ | 
 | 34 | static const char* | 
 | 35 | hexbuf (unsigned char buf[], unsigned int buflen) | 
 | 36 | { | 
 | 37 |   static char msg[50]; | 
 | 38 |  | 
 | 39 |   switch (buflen) | 
 | 40 |     { | 
 | 41 |     case 1: | 
 | 42 |       sprintf (msg, "0x%02X", buf[0]); | 
 | 43 |       break; | 
 | 44 |     case 2: | 
 | 45 |       sprintf (msg, "0x%02X%02X", buf[0], buf[1]); | 
 | 46 |       break; | 
 | 47 |     case 3: | 
 | 48 |       sprintf (msg, "0x%02X%02X%02X", buf[0], buf[1], buf[2]); | 
 | 49 |       break; | 
 | 50 |     case 4: | 
 | 51 |       sprintf (msg, "0x%02X%02X%02X%02X", buf[0], buf[1], buf[2], buf[3]); | 
 | 52 |       break; | 
 | 53 |     default: | 
 | 54 |       abort (); | 
 | 55 |     } | 
 | 56 |   return msg; | 
 | 57 | } | 
 | 58 |  | 
 | 59 | /* Attempts to convert a byte buffer BUF (BUFLEN bytes) to OUT (12 bytes) | 
 | 60 |    using the conversion descriptor CD.  Returns the number of written bytes, | 
 | 61 |    or 0 if ambiguous, or -1 if invalid.  */ | 
 | 62 | static int | 
 | 63 | try (iconv_t cd, unsigned char buf[], unsigned int buflen, unsigned char *out) | 
 | 64 | { | 
 | 65 |   const char *inbuf = (const char *) buf; | 
 | 66 |   size_t inbytesleft = buflen; | 
 | 67 |   char *outbuf = (char *) out; | 
 | 68 |   size_t outbytesleft = 12; | 
 | 69 |   size_t result; | 
 | 70 |  | 
 | 71 |   iconv (cd, NULL, NULL, NULL, NULL); | 
 | 72 |   result = iconv (cd, (char **) &inbuf, &inbytesleft, &outbuf, &outbytesleft); | 
 | 73 |   if (result != (size_t)(-1)) | 
 | 74 |     result = iconv (cd, NULL, NULL, &outbuf, &outbytesleft); | 
 | 75 |  | 
 | 76 |   if (result == (size_t)(-1)) | 
 | 77 |     { | 
 | 78 |       if (errno == EILSEQ) | 
 | 79 | 	{ | 
 | 80 | 	  return -1; | 
 | 81 | 	} | 
 | 82 |       else if (errno == EINVAL) | 
 | 83 | 	{ | 
 | 84 | 	  return 0; | 
 | 85 | 	} | 
 | 86 |       else | 
 | 87 | 	{ | 
 | 88 | 	  int saved_errno = errno; | 
 | 89 | 	  fprintf (stderr, "%s: iconv error: ", hexbuf (buf, buflen)); | 
 | 90 | 	  errno = saved_errno; | 
 | 91 | 	  perror (""); | 
 | 92 | 	  exit (1); | 
 | 93 | 	} | 
 | 94 |     } | 
 | 95 |   else | 
 | 96 |     { | 
 | 97 |       if (inbytesleft != 0) | 
 | 98 | 	{ | 
 | 99 | 	  fprintf (stderr, "%s: inbytes = %ld, outbytes = %ld\n", | 
 | 100 | 		   hexbuf (buf, buflen), | 
 | 101 | 		   (long) (buflen - inbytesleft), | 
 | 102 | 		   (long) (12 - outbytesleft)); | 
 | 103 | 	  exit (1); | 
 | 104 | 	} | 
 | 105 |       return 12 - outbytesleft; | 
 | 106 |     } | 
 | 107 | } | 
 | 108 |  | 
 | 109 | /* Returns the out[] buffer as a Unicode value, formatted as 0x%04X.  */ | 
 | 110 | static const char * | 
 | 111 | utf8_decode (const unsigned char *out, unsigned int outlen) | 
 | 112 | { | 
 | 113 |   static char hexbuf[84]; | 
 | 114 |   char *p = hexbuf; | 
 | 115 |  | 
 | 116 |   while (outlen > 0) | 
 | 117 |     { | 
 | 118 |       if (p > hexbuf) | 
 | 119 | 	*p++ = ' '; | 
 | 120 |  | 
 | 121 |       if (out[0] < 0x80) | 
 | 122 | 	{ | 
 | 123 | 	  sprintf (p, "0x%04X", out[0]); | 
 | 124 | 	  out += 1; outlen -= 1; | 
 | 125 | 	} | 
 | 126 |       else if (out[0] >= 0xc0 && out[0] < 0xe0 && outlen >= 2) | 
 | 127 | 	{ | 
 | 128 | 	  sprintf (p, "0x%04X", ((out[0] & 0x1f) << 6) + (out[1] & 0x3f)); | 
 | 129 | 	  out += 2; outlen -= 2; | 
 | 130 | 	} | 
 | 131 |       else if (out[0] >= 0xe0 && out[0] < 0xf0 && outlen >= 3) | 
 | 132 | 	{ | 
 | 133 | 	  sprintf (p, "0x%04X", ((out[0] & 0x0f) << 12) | 
 | 134 | 				+ ((out[1] & 0x3f) << 6) + (out[2] & 0x3f)); | 
 | 135 | 	  out += 3; outlen -= 3; | 
 | 136 | 	} | 
 | 137 |       else if (out[0] >= 0xf0 && out[0] < 0xf8 && outlen >= 4) | 
 | 138 | 	{ | 
 | 139 | 	  sprintf (p, "0x%04X", ((out[0] & 0x07) << 18) | 
 | 140 | 				+ ((out[1] & 0x3f) << 12) | 
 | 141 | 				+ ((out[2] & 0x3f) << 6) + (out[3] & 0x3f)); | 
 | 142 | 	  out += 4; outlen -= 4; | 
 | 143 | 	} | 
 | 144 |       else if (out[0] >= 0xf8 && out[0] < 0xfc && outlen >= 5) | 
 | 145 | 	{ | 
 | 146 | 	  sprintf (p, "0x%04X", ((out[0] & 0x03) << 24) | 
 | 147 | 				+ ((out[1] & 0x3f) << 18) | 
 | 148 | 				+ ((out[2] & 0x3f) << 12) | 
 | 149 | 				+ ((out[3] & 0x3f) << 6) + (out[4] & 0x3f)); | 
 | 150 | 	  out += 5; outlen -= 5; | 
 | 151 | 	} | 
 | 152 |       else if (out[0] >= 0xfc && out[0] < 0xfe && outlen >= 6) | 
 | 153 | 	{ | 
 | 154 | 	  sprintf (p, "0x%04X", ((out[0] & 0x01) << 30) | 
 | 155 | 				+ ((out[1] & 0x3f) << 24) | 
 | 156 | 				+ ((out[2] & 0x3f) << 18) | 
 | 157 | 				+ ((out[3] & 0x3f) << 12) | 
 | 158 | 				+ ((out[4] & 0x3f) << 6) + (out[5] & 0x3f)); | 
 | 159 | 	  out += 6; outlen -= 6; | 
 | 160 | 	} | 
 | 161 |       else | 
 | 162 | 	{ | 
 | 163 | 	  sprintf (p, "0x????"); | 
 | 164 | 	  out += 1; outlen -= 1; | 
 | 165 | 	} | 
 | 166 |  | 
 | 167 |       if (bmp_only && strlen (p) > 6) | 
 | 168 | 	/* Ignore conversions outside Unicode plane 0.  */ | 
 | 169 | 	return NULL; | 
 | 170 |  | 
 | 171 |       p += strlen (p); | 
 | 172 |     } | 
 | 173 |  | 
 | 174 |   return hexbuf; | 
 | 175 | } | 
 | 176 |  | 
 | 177 | int | 
 | 178 | main (int argc, char *argv[]) | 
 | 179 | { | 
 | 180 |   const char *charset; | 
 | 181 |   iconv_t cd; | 
 | 182 |   int search_depth; | 
 | 183 |  | 
 | 184 |   if (argc != 2) | 
 | 185 |     { | 
 | 186 |       fprintf (stderr, "Usage: tst-table-from charset\n"); | 
 | 187 |       exit (1); | 
 | 188 |     } | 
 | 189 |   charset = argv[1]; | 
 | 190 |  | 
 | 191 |   cd = iconv_open ("UTF-8", charset); | 
 | 192 |   if (cd == (iconv_t)(-1)) | 
 | 193 |     { | 
 | 194 |       perror ("iconv_open"); | 
 | 195 |       exit (1); | 
 | 196 |     } | 
 | 197 |  | 
 | 198 |   /* When testing UTF-8 or GB18030, stop at 0x10000, otherwise the output | 
 | 199 |      file gets too big.  */ | 
 | 200 |   bmp_only = (strcmp (charset, "UTF-8") == 0 | 
 | 201 | 	      || strcmp (charset, "GB18030") == 0); | 
 | 202 |   search_depth = (strcmp (charset, "UTF-8") == 0 ? 3 : 4); | 
 | 203 |  | 
 | 204 |   { | 
 | 205 |     unsigned char out[12]; | 
 | 206 |     unsigned char buf[4]; | 
 | 207 |     unsigned int i0, i1, i2, i3; | 
 | 208 |     int result; | 
 | 209 |  | 
 | 210 |     for (i0 = 0; i0 < 0x100; i0++) | 
 | 211 |       { | 
 | 212 | 	buf[0] = i0; | 
 | 213 | 	result = try (cd, buf, 1, out); | 
 | 214 | 	if (result < 0) | 
 | 215 | 	  { | 
 | 216 | 	  } | 
 | 217 | 	else if (result > 0) | 
 | 218 | 	  { | 
 | 219 | 	    const char *unicode = utf8_decode (out, result); | 
 | 220 | 	    if (unicode != NULL) | 
 | 221 | 	      printf ("0x%02X\t%s\n", i0, unicode); | 
 | 222 | 	  } | 
 | 223 | 	else | 
 | 224 | 	  { | 
 | 225 | 	    for (i1 = 0; i1 < 0x100; i1++) | 
 | 226 | 	      { | 
 | 227 | 		buf[1] = i1; | 
 | 228 | 		result = try (cd, buf, 2, out); | 
 | 229 | 		if (result < 0) | 
 | 230 | 		  { | 
 | 231 | 		  } | 
 | 232 | 		else if (result > 0) | 
 | 233 | 		  { | 
 | 234 | 		    const char *unicode = utf8_decode (out, result); | 
 | 235 | 		    if (unicode != NULL) | 
 | 236 | 		      printf ("0x%02X%02X\t%s\n", i0, i1, unicode); | 
 | 237 | 		  } | 
 | 238 | 		else | 
 | 239 | 		  { | 
 | 240 | 		    for (i2 = 0; i2 < 0x100; i2++) | 
 | 241 | 		      { | 
 | 242 | 			buf[2] = i2; | 
 | 243 | 			result = try (cd, buf, 3, out); | 
 | 244 | 			if (result < 0) | 
 | 245 | 			  { | 
 | 246 | 			  } | 
 | 247 | 			else if (result > 0) | 
 | 248 | 			  { | 
 | 249 | 			    const char *unicode = utf8_decode (out, result); | 
 | 250 | 			    if (unicode != NULL) | 
 | 251 | 			      printf ("0x%02X%02X%02X\t%s\n", | 
 | 252 | 				      i0, i1, i2, unicode); | 
 | 253 | 			  } | 
 | 254 | 			else if (search_depth > 3) | 
 | 255 | 			  { | 
 | 256 | 			    for (i3 = 0; i3 < 0x100; i3++) | 
 | 257 | 			      { | 
 | 258 | 				buf[3] = i3; | 
 | 259 | 				result = try (cd, buf, 4, out); | 
 | 260 | 				if (result < 0) | 
 | 261 | 				  { | 
 | 262 | 				  } | 
 | 263 | 				else if (result > 0) | 
 | 264 | 				  { | 
 | 265 | 				    const char *unicode = | 
 | 266 | 				      utf8_decode (out, result); | 
 | 267 | 				    if (unicode != NULL) | 
 | 268 | 				      printf ("0x%02X%02X%02X%02X\t%s\n", | 
 | 269 | 					      i0, i1, i2, i3, unicode); | 
 | 270 | 				  } | 
 | 271 | 				else | 
 | 272 | 				  { | 
 | 273 | 				    fprintf (stderr, | 
 | 274 | 					     "%s: incomplete byte sequence\n", | 
 | 275 | 					     hexbuf (buf, 4)); | 
 | 276 | 				    exit (1); | 
 | 277 | 				  } | 
 | 278 | 			      } | 
 | 279 | 			  } | 
 | 280 | 		      } | 
 | 281 | 		  } | 
 | 282 | 	      } | 
 | 283 | 	  } | 
 | 284 |       } | 
 | 285 |   } | 
 | 286 |  | 
 | 287 |   if (iconv_close (cd) < 0) | 
 | 288 |     { | 
 | 289 |       perror ("iconv_close"); | 
 | 290 |       exit (1); | 
 | 291 |     } | 
 | 292 |  | 
 | 293 |   if (ferror (stdin) || fflush (stdout) || ferror (stdout)) | 
 | 294 |     { | 
 | 295 |       fprintf (stderr, "I/O error\n"); | 
 | 296 |       exit (1); | 
 | 297 |     } | 
 | 298 |  | 
 | 299 |   return 0; | 
 | 300 | } |