lh | 9ed821d | 2023-04-07 01:36:19 -0700 | [diff] [blame^] | 1 | |
| 2 | /* Copyright (C) 2002, 2003, 2004 Manuel Novoa III |
| 3 | * |
| 4 | * This library is free software; you can redistribute it and/or |
| 5 | * modify it under the terms of the GNU Library General Public |
| 6 | * License as published by the Free Software Foundation; either |
| 7 | * version 2 of the License, or (at your option) any later version. |
| 8 | * |
| 9 | * This library is distributed in the hope that it will be useful, |
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 12 | * Library General Public License for more details. |
| 13 | * |
| 14 | * You should have received a copy of the GNU Library General Public |
| 15 | * License along with this library; if not, write to the Free |
| 16 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
| 17 | */ |
| 18 | |
| 19 | /* ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION! |
| 20 | * |
| 21 | * Besides uClibc, I'm using this code in my libc for elks, which is |
| 22 | * a 16-bit environment with a fairly limited compiler. It would make |
| 23 | * things much easier for me if this file isn't modified unnecessarily. |
| 24 | * In particular, please put any new or replacement functions somewhere |
| 25 | * else, and modify the makefile to use your version instead. |
| 26 | * Thanks. Manuel |
| 27 | * |
| 28 | * ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION! */ |
| 29 | |
| 30 | |
| 31 | /* May 23, 2002 Initial Notes: |
| 32 | * |
| 33 | * I'm still tweaking this stuff, but it passes the tests I've thrown |
| 34 | * at it, and Erik needs it for the gcc port. The glibc extension |
| 35 | * __wcsnrtombs() hasn't been tested, as I didn't find a test for it |
| 36 | * in the glibc source. I also need to fix the behavior of |
| 37 | * _wchar_utf8sntowcs() if the max number of wchars to convert is 0. |
| 38 | * |
| 39 | * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt |
| 40 | * file on my platform (x86) show about 5-10% faster conversion speed than |
| 41 | * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with |
| 42 | * individual mbrtowc()/wcrtomb() calls. |
| 43 | * |
| 44 | * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled |
| 45 | * as a fail-safe UTF-8 decoder appropriate for a terminal, etc. which |
| 46 | * needs to deal gracefully with whatever is sent to it. In that mode, |
| 47 | * it passes Markus Kuhn's UTF-8-test.txt stress test. I plan to add |
| 48 | * an arg to force that behavior, so the interface will be changing. |
| 49 | * |
| 50 | * I need to fix the error checking for 16-bit wide chars. This isn't |
| 51 | * an issue for uClibc, but may be for ELKS. I'm currently not sure |
| 52 | * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS. |
| 53 | * |
| 54 | * July 1, 2002 |
| 55 | * |
| 56 | * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case. |
| 57 | * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit |
| 58 | * locales. |
| 59 | * Enabled building of a C/POSIX-locale-only version, so full locale support |
| 60 | * no longer needs to be enabled. |
| 61 | * |
| 62 | * Nov 4, 2002 |
| 63 | * |
| 64 | * Fixed a bug in _wchar_wcsntoutf8s(). Don't store wcs position if dst is NULL. |
| 65 | * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in |
| 66 | * order to support %ls in printf. See comments below for details. |
| 67 | * Change behaviour of wc<->mb functions when in the C locale. Now they do |
| 68 | * a 1-1 map for the range 0x80-UCHAR_MAX. This is for backwards compatibility |
| 69 | * and consistency with the stds requirements that a printf format string be |
| 70 | * a valid multibyte string beginning and ending in its initial shift state. |
| 71 | * |
| 72 | * Nov 5, 2002 |
| 73 | * |
| 74 | * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday. |
| 75 | * |
| 76 | * Nov 7, 2002 |
| 77 | * |
| 78 | * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08. |
| 79 | * Added some size/speed optimizations and integrated it into my locale |
| 80 | * framework. Minimally tested at the moment, but the stub C-locale |
| 81 | * version (which most people would probably be using) should be fine. |
| 82 | * |
| 83 | * Nov 21, 2002 |
| 84 | * |
| 85 | * Revert the wc<->mb changes from earlier this month involving the C-locale. |
| 86 | * Add a couple of ugly hacks to support *wprintf. |
| 87 | * Add a mini iconv() and iconv implementation (requires locale support). |
| 88 | * |
| 89 | * Aug 1, 2003 |
| 90 | * Bug fix for mbrtowc. |
| 91 | * |
| 92 | * Aug 18, 2003 |
| 93 | * Bug fix: _wchar_utf8sntowcs and _wchar_wcsntoutf8s now set errno if EILSEQ. |
| 94 | * |
| 95 | * Feb 11, 2004 |
| 96 | * Bug fix: Fix size check for remaining output space in iconv(). |
| 97 | * |
| 98 | * Manuel |
| 99 | */ |
| 100 | #ifdef _LIBC |
| 101 | #include <errno.h> |
| 102 | #include <stddef.h> |
| 103 | #include <limits.h> |
| 104 | #include <stdint.h> |
| 105 | #include <inttypes.h> |
| 106 | #include <stdlib.h> |
| 107 | #include <stdio.h> |
| 108 | #include <assert.h> |
| 109 | #include <locale.h> |
| 110 | #include <wchar.h> |
| 111 | #include <bits/uClibc_uwchar.h> |
| 112 | |
| 113 | /**********************************************************************/ |
/* Locale-dependent configuration.
 * With __UCLIBC_HAS_LOCALE__ the active encoding is read from the current
 * locale object and the 8-bit conversion table parameters are aliased to
 * shorter names.  Without it the encoding is fixed to 7-bit, any 8-bit or
 * UTF-8 locale support is a compile-time error, and the UTF-8 worker
 * functions are not built. */
| 114 | #ifdef __UCLIBC_HAS_LOCALE__ |
| 115 | #ifdef __UCLIBC_MJN3_ONLY__ |
| 116 | #ifdef L_iswspace |
| 117 | /* generates one warning */ |
| 118 | #warning TODO: Fix Cc2wc* and Cwc2c* defines! |
| 119 | #endif |
| 120 | #endif /* __UCLIBC_MJN3_ONLY__ */ |
| 121 | |
/* Encoding identifier of the current locale. */
| 122 | #define ENCODING (__UCLIBC_CURLOCALE->encoding) |
| 123 | |
/* Short aliases for the locale-data table geometry constants. */
| 124 | #define Cc2wc_IDX_SHIFT __LOCALE_DATA_Cc2wc_IDX_SHIFT |
| 125 | #define Cc2wc_ROW_LEN __LOCALE_DATA_Cc2wc_ROW_LEN |
| 126 | #define Cwc2c_DOMAIN_MAX __LOCALE_DATA_Cwc2c_DOMAIN_MAX |
| 127 | #define Cwc2c_TI_SHIFT __LOCALE_DATA_Cwc2c_TI_SHIFT |
| 128 | #define Cwc2c_TT_SHIFT __LOCALE_DATA_Cwc2c_TT_SHIFT |
| 129 | #define Cwc2c_TI_LEN __LOCALE_DATA_Cwc2c_TI_LEN |
| 130 | |
| 131 | #ifndef __CTYPE_HAS_UTF_8_LOCALES |
| 132 | #warning __CTYPE_HAS_UTF_8_LOCALES not set! |
| 133 | #endif |
| 134 | |
| 135 | #else /* __UCLIBC_HAS_LOCALE__ */ |
| 136 | |
| 137 | #ifdef __UCLIBC_MJN3_ONLY__ |
| 138 | #ifdef L_btowc |
| 139 | /* emit only once */ |
| 140 | #warning fix preprocessor logic testing locale settings |
| 141 | #endif |
| 142 | #endif |
| 143 | |
/* No locale support: the encoding is always plain 7-bit. */
| 144 | #define ENCODING (__ctype_encoding_7_bit) |
| 145 | #ifdef __CTYPE_HAS_8_BIT_LOCALES |
| 146 | #error __CTYPE_HAS_8_BIT_LOCALES is defined! |
| 147 | #endif |
| 148 | #ifdef __CTYPE_HAS_UTF_8_LOCALES |
| 149 | #error __CTYPE_HAS_UTF_8_LOCALES is defined! |
| 150 | #endif |
/* Make sure the UTF-8 worker functions are never compiled in this mode. */
| 151 | #undef L__wchar_utf8sntowcs |
| 152 | #undef L__wchar_wcsntoutf8s |
| 153 | |
| 154 | #endif /* __UCLIBC_HAS_LOCALE__ */ |
| 155 | /**********************************************************************/ |
| 156 | |
/* Longest UTF-8 sequence handled: 6 bytes when wchar_t can hold more than
 * 16 bits, otherwise 3 bytes. */
| 157 | #if WCHAR_MAX > 0xffffUL |
| 158 | #define UTF_8_MAX_LEN 6 |
| 159 | #else |
| 160 | #define UTF_8_MAX_LEN 3 |
| 161 | #endif |
| 162 | |
/* Enables the extra validity checks guarded by #ifdef KUHN in the UTF-8
 * converters (rejecting 0xfffe, 0xffff and the 0xd800-0xdfff range). */
| 163 | #define KUHN 1 |
| 164 | |
| 165 | /* Implementation-specific work functions. */ |
| 166 | |
/* UTF-8 bytes to wide characters: at most wn wide characters are produced
 * from at most n bytes read through *src, using conversion state ps. */
| 167 | extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn, |
| 168 | const char **__restrict src, size_t n, |
| 169 | mbstate_t *ps, int allow_continuation) attribute_hidden; |
| 170 | |
/* Wide characters to UTF-8 bytes: at most n bytes are produced from at most
 * wn wide characters read through *src. */
| 171 | extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n, |
| 172 | const wchar_t **__restrict src, size_t wn) attribute_hidden; |
| 173 | #endif |
| 174 | /**********************************************************************/ |
| 175 | #ifdef L_btowc |
| 176 | |
| 177 | |
/* Convert a single byte to its wide-character equivalent.
 * Returns WEOF for EOF and for any byte that does not form a complete,
 * valid character on its own. */
wint_t btowc(int c)
{
#ifdef __CTYPE_HAS_8_BIT_LOCALES

	mbstate_t state;
	unsigned char byte;
	wchar_t converted;

	if (c == EOF) {
		return WEOF;
	}

	byte = (unsigned char) c;
	state.__mask = 0;			/* Start from the initial state. */
	if (mbrtowc(&converted, (char *) &byte, 1, &state) > 1) {
		return WEOF;			/* Error or incomplete character. */
	}
	return converted;

#else  /* !__CTYPE_HAS_8_BIT_LOCALES */

#ifdef __UCLIBC_HAS_LOCALE__
	assert((ENCODING == __ctype_encoding_7_bit)
		   || (ENCODING == __ctype_encoding_utf8));
#endif

	/* Without 8-bit locale support only 0-0x7f is legal as a single byte
	 * in the C/POSIX and UTF-8 locales. */
	if (((unsigned int) c) >= 0x80) {
		return WEOF;
	}
	return c;

#endif /* !__CTYPE_HAS_8_BIT_LOCALES */
}
| 208 | libc_hidden_def(btowc) |
| 209 | |
| 210 | #endif |
| 211 | /**********************************************************************/ |
| 212 | #ifdef L_wctob |
| 213 | |
| 214 | /* Note: We completely ignore ps in all currently supported conversions. */ |
| 215 | |
| 216 | |
/* Convert a wide character to its single-byte representation.
 * Returns EOF when the character does not map to exactly one byte. */
int wctob(wint_t c)
{
#ifdef __CTYPE_HAS_8_BIT_LOCALES

	unsigned char out[MB_LEN_MAX];
	size_t produced;

	produced = wcrtomb((char *) out, c, NULL);
	if (produced != 1) {
		return EOF;
	}
	return out[0];

#else  /* __CTYPE_HAS_8_BIT_LOCALES */

#ifdef __UCLIBC_HAS_LOCALE__
	assert((ENCODING == __ctype_encoding_7_bit)
		   || (ENCODING == __ctype_encoding_utf8));
#endif /* __UCLIBC_HAS_LOCALE__ */

	/* Without 8-bit locale support only 0-0x7f is legal as a single byte
	 * in the C/POSIX and UTF-8 locales. */

	/* TODO: need unsigned version of wint_t... */
	if ((c < 0) || (c >= 0x80)) {
		return EOF;
	}
	return c;

#endif /* __CTYPE_HAS_8_BIT_LOCALES */
}
| 241 | |
| 242 | #endif |
| 243 | /**********************************************************************/ |
| 244 | #ifdef L_mbsinit |
| 245 | |
| 246 | int mbsinit(const mbstate_t *ps) |
| 247 | { |
| 248 | return !ps || !ps->__mask; |
| 249 | } |
| 250 | libc_hidden_def(mbsinit) |
| 251 | |
| 252 | #endif |
| 253 | /**********************************************************************/ |
| 254 | #ifdef L_mbrlen |
| 255 | |
| 256 | |
/* Length in bytes of the next multibyte character in s (at most n bytes
 * examined).  A private static state is used when ps is NULL. */
size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
{
	static mbstate_t private_state;	/* Rely on bss 0-init. */

	if (ps == NULL) {
		ps = &private_state;
	}
	return mbrtowc(NULL, s, n, ps);
}
| 263 | libc_hidden_def(mbrlen) |
| 264 | |
| 265 | #endif |
| 266 | /**********************************************************************/ |
| 267 | #ifdef L_mbrtowc |
| 268 | |
| 269 | |
/* Convert the next multibyte character in s (at most n bytes) to a wide
 * character stored through pwc (if non-NULL).
 *
 * A NULL s is treated as a call on an empty string with pwc ignored.
 * A private static state is used when ps is NULL.
 * Returns 0 for a nul character, and also when n is 0. */
size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
			   size_t n, mbstate_t *__restrict ps)
{
	static mbstate_t internal_state;	/* Rely on bss 0-init. */
	char nul_input[1];			/* Avoid static to be fPIC friendly. */
	wchar_t scratch[1];
	const char *cursor;
	size_t result;

	if (ps == NULL) {
		ps = &internal_state;
	}

	if (s != NULL) {
		if (*s == '\0') {
			/* According to the ISO C 89 standard this is the expected
			 * behaviour. */
			if (pwc != NULL) {
				*pwc = '\0';
			}
			return 0;
		}
		if (n == 0) {
			/* TODO: change error code? */
			return 0;
		}
	} else {
		/* Behave as if called on "" with no destination. */
		nul_input[0] = 0;
		s = nul_input;
		pwc = NULL;
		n = 1;
	}

	cursor = s;

#ifdef __CTYPE_HAS_UTF_8_LOCALES
	/* Need to do this here since mbsrtowcs doesn't allow incompletes. */
	if (ENCODING == __ctype_encoding_utf8) {
		if (pwc == NULL) {
			pwc = scratch;
		}
		result = _wchar_utf8sntowcs(pwc, 1, &cursor, n, ps, 1);
		if (result == 1) {
			return (size_t) (cursor - s);
		}
		return result;	/* Need to return 0 if nul char. */
	}
#endif

#ifdef __UCLIBC_MJN3_ONLY__
#warning TODO: This adds a trailing nul!
#endif /* __UCLIBC_MJN3_ONLY__ */

	result = mbsnrtowcs(scratch, &cursor, SIZE_MAX, 1, ps);

	if ((pwc != NULL) && (((ssize_t) result) >= 0)) {
		*pwc = scratch[0];
	}
	return (size_t) result;
}
| 329 | libc_hidden_def(mbrtowc) |
| 330 | |
| 331 | #endif |
| 332 | /**********************************************************************/ |
| 333 | #ifdef L_wcrtomb |
| 334 | |
| 335 | |
| 336 | /* Note: We completely ignore ps in all currently supported conversions. */ |
| 337 | /* TODO: Check for valid state anyway? */ |
| 338 | |
/* Convert one wide character to its multibyte form in s.
 * A NULL s is treated as a request to convert L'\0' into a throw-away
 * buffer.  A result of 0 from the worker (nul character) is reported
 * as 1 byte. */
size_t wcrtomb(register char *__restrict s, wchar_t wc,
			   mbstate_t *__restrict ps)
{
#ifdef __UCLIBC_MJN3_ONLY__
#warning TODO: Should wcsnrtombs nul-terminate unconditionally? Check glibc.
#endif /* __UCLIBC_MJN3_ONLY__ */
	char discard[MB_LEN_MAX];
	wchar_t single[1];
	const wchar_t *in;
	size_t written;

	if (s == NULL) {
		wc = 0;
		s = discard;
	}

	single[0] = wc;
	in = single;

	written = wcsnrtombs(s, &in, 1, MB_LEN_MAX, ps);
	if (written == 0) {
		return 1;
	}
	return written;
}
| 361 | libc_hidden_def(wcrtomb) |
| 362 | |
| 363 | #endif |
| 364 | /**********************************************************************/ |
| 365 | #ifdef L_mbsrtowcs |
| 366 | |
| 367 | |
/* Convert a nul-terminated multibyte string to wide characters by
 * delegating to mbsnrtowcs() with no limit on the input byte count.
 * A private static state is used when ps is NULL. */
size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
				 size_t len, mbstate_t *__restrict ps)
{
	static mbstate_t private_state;	/* Rely on bss 0-init. */

	if (ps == NULL) {
		ps = &private_state;
	}
	return mbsnrtowcs(dst, src, SIZE_MAX, len, ps);
}
| 376 | libc_hidden_def(mbsrtowcs) |
| 377 | |
| 378 | #endif |
| 379 | /**********************************************************************/ |
| 380 | #ifdef L_wcsrtombs |
| 381 | |
| 382 | /* Note: We completely ignore ps in all currently supported conversions. |
| 383 | |
| 384 | * TODO: Check for valid state anyway? */ |
| 385 | |
| 386 | |
/* Convert a nul-terminated wide string to multibyte form by delegating
 * to wcsnrtombs() with no limit on the number of input wide characters. */
size_t wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
				 size_t len, mbstate_t *__restrict ps)
{
	size_t produced;

	produced = wcsnrtombs(dst, src, SIZE_MAX, len, ps);
	return produced;
}
| 392 | libc_hidden_def(wcsrtombs) |
| 393 | |
| 394 | #endif |
| 395 | /**********************************************************************/ |
| 396 | #ifdef L__wchar_utf8sntowcs |
| 397 | |
| 398 | /* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's |
| 399 | * UTF-8-test.txt stress test. |
| 400 | */ |
| 401 | /* #define DECODER */ |
| 402 | |
| 403 | #ifdef DECODER |
| 404 | #ifndef KUHN |
| 405 | #define KUHN |
| 406 | #endif |
| 407 | #endif |
| 408 | |
/* Convert a UTF-8 byte sequence to wide characters.
 *
 * pwc: destination buffer.  If it is NULL or equal to (wchar_t *) ps, the
 *      output goes to a one-element scratch buffer and only the count is
 *      computed; a NULL pwc additionally lifts the wn limit to SIZE_MAX.
 * wn:  maximum number of wide characters to produce; 0 returns 0 at once.
 * src: in/out pointer to the input bytes.  At the DONE exit it is updated
 *      only when a real destination was supplied; it is set to NULL once a
 *      nul has been converted (non-DECODER build).  On the (size_t) -2 exit
 *      it is always updated.
 * n:   maximum number of input bytes to consume.
 * ps:  conversion state.  A nonzero __mask marks a pending partial character;
 *      __wc == 0xffffU marks the error state (non-DECODER build).
 * allow_continuation: when nonzero, a truncated trailing character saves its
 *      partial state in ps and yields (size_t) -2.  When zero, 0 is returned
 *      if at least one character was already converted, otherwise the input
 *      pointer is backed up over the partial character before exiting.
 *
 * Returns the number of wide characters converted (wn - count), or
 * (size_t) -1 with errno set to EILSEQ on an invalid sequence (non-DECODER
 * build), or (size_t) -2 for an incomplete character as described above.
 */
| 409 | size_t attribute_hidden _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn, |
| 410 | const char **__restrict src, size_t n, |
| 411 | mbstate_t *ps, int allow_continuation) |
| 412 | { |
| 413 | register const char *s; |
| 414 | __uwchar_t mask; |
| 415 | __uwchar_t wc; |
| 416 | wchar_t wcbuf[1]; |
| 417 | size_t count; |
| 418 | int incr; |
| 419 | |
| 420 | s = *src; |
| 421 | |
| 422 | assert(s != NULL); |
| 423 | assert(ps != NULL); |
| 424 | |
| 425 | incr = 1; |
| 426 | /* NOTE: The following is an AWFUL HACK! In order to support %s in |
| 427 | * wprintf, we need to be able to compute the number of wchars needed |
| 428 | * for the mbs conversion, not to exceed the precision specified. |
| 429 | * But if dst is NULL, the return value is the length assuming a |
| 430 | * sufficiently sized buffer. So, we allow passing of (wchar_t *) ps |
| 431 | * as pwc in order to flag that we really want the length, subject |
| 432 | * to the restricted buffer size and no partial conversions. |
| 433 | * See mbsnrtowcs() as well. */ |
| 434 | if (!pwc || (pwc == ((wchar_t *)ps))) { |
| 435 | if (!pwc) { |
| 436 | wn = SIZE_MAX; |
| 437 | } |
/* Count-only mode: every result overwrites wcbuf[0] because incr is 0. */
| 438 | pwc = wcbuf; |
| 439 | incr = 0; |
| 440 | } |
| 441 | |
| 442 | /* This is really here only to support the glibc extension function |
| 443 | * __mbsnrtowcs which apparently returns 0 if wn == 0 without any |
| 444 | * check on the validity of the mbstate. */ |
| 445 | if (!(count = wn)) { |
| 446 | return 0; |
| 447 | } |
| 448 | |
/* A nonzero saved mask means a previous call stopped inside a character
 * (or, in the non-DECODER build, recorded an error). */
| 449 | if ((mask = (__uwchar_t) ps->__mask) != 0) { /* A continuation... */ |
| 450 | #ifdef DECODER |
| 451 | wc = (__uwchar_t) ps->__wc; |
| 452 | if (n) { |
| 453 | goto CONTINUE; |
| 454 | } |
| 455 | goto DONE; |
| 456 | #else |
| 457 | if ((wc = (__uwchar_t) ps->__wc) != 0xffffU) { |
| 458 | /* TODO: change error code here and below? */ |
| 459 | if (n) { |
| 460 | goto CONTINUE; |
| 461 | } |
| 462 | goto DONE; |
| 463 | } |
| 464 | __set_errno(EILSEQ); |
| 465 | return (size_t) -1; /* We're in an error state. */ |
| 466 | #endif |
| 467 | } |
| 468 | |
/* Main loop: one wide character per iteration. */
| 469 | do { |
| 470 | if (!n) { |
| 471 | goto DONE; |
| 472 | } |
| 473 | --n; |
| 474 | if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */ |
| 475 | mask = 0x40; |
| 476 | #ifdef __UCLIBC_MJN3_ONLY__ |
| 477 | #warning TODO: Fix range for 16 bit wchar_t case. |
| 478 | #endif |
/* Accept lead bytes 0xc2 through 0xfd; 0xc0 and 0xc1 are rejected. */
| 479 | if (( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) && |
| 480 | (((unsigned char)s[-1] != 0xc0 ) && ((unsigned char)s[-1] != 0xc1 ))) { |
| 481 | goto START; |
| 482 | } |
| 483 | BAD: |
| 484 | #ifdef DECODER |
| 485 | wc = 0xfffdU; |
| 486 | goto COMPLETE; |
| 487 | #else |
/* Record the error in ps so that a later call fails as well. */
| 488 | ps->__mask = mask; |
| 489 | ps->__wc = 0xffffU; |
| 490 | __set_errno(EILSEQ); |
| 491 | return (size_t) -1; /* Illegal start byte! */ |
| 492 | #endif |
| 493 | |
| 494 | CONTINUE: |
| 495 | while (n) { |
| 496 | --n; |
/* Every trailing byte must have the form 10xxxxxx. */
| 497 | if ((*s & 0xc0) != 0x80) { |
| 498 | goto BAD; |
| 499 | } |
| 500 | mask <<= 5; |
| 501 | wc <<= 6; |
| 502 | wc += (*s & 0x3f); /* keep separate for bcc (smaller code) */ |
| 503 | ++s; |
| 504 | START: |
| 505 | wc &= ~(mask << 1); |
| 506 | |
| 507 | if ((wc & mask) == 0) { /* Character completed. */ |
| 508 | if ((mask >>= 5) == 0x40) { |
| 509 | mask += mask; |
| 510 | } |
| 511 | /* Check for invalid sequences (longer than necessary) |
| 512 | * and invalid chars. */ |
| 513 | if ( (wc < mask) /* Sequence not minimal length. */ |
| 514 | #ifdef KUHN |
| 515 | #if UTF_8_MAX_LEN == 3 |
| 516 | #error broken since mask can overflow!! |
| 517 | /* For plane 0, these are the only defined values.*/ |
| 518 | || (wc > 0xfffdU) |
| 519 | #else |
| 520 | /* Note that we don't need to worry about exceeding */ |
| 521 | /* 31 bits as that is the most that UTF-8 provides. */ |
| 522 | || ( ((__uwchar_t)(wc - 0xfffeU)) < 2) |
| 523 | #endif |
| 524 | || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) ) |
| 525 | #endif /* KUHN */ |
| 526 | ) { |
| 527 | goto BAD; |
| 528 | } |
| 529 | goto COMPLETE; |
| 530 | } |
| 531 | } |
| 532 | /* Character potentially valid but incomplete. */ |
| 533 | if (!allow_continuation) { |
| 534 | if (count != wn) { |
| 535 | return 0; |
| 536 | } |
| 537 | /* NOTE: The following can fail if you allow and then disallow |
| 538 | * continuation!!! */ |
| 539 | #if UTF_8_MAX_LEN == 3 |
| 540 | #error broken since mask can overflow!! |
| 541 | #endif |
| 542 | /* Need to back up... */ |
| 543 | do { |
| 544 | --s; |
| 545 | } while ((mask >>= 5) >= 0x40); |
| 546 | goto DONE; |
| 547 | } |
/* Save the partial character so a later call can resume at CONTINUE. */
| 548 | ps->__mask = (wchar_t) mask; |
| 549 | ps->__wc = (wchar_t) wc; |
| 550 | *src = s; |
| 551 | return (size_t) -2; |
| 552 | } |
| 553 | COMPLETE: |
| 554 | *pwc = wc; |
| 555 | pwc += incr; |
| 556 | } |
| 557 | #ifdef DECODER |
| 558 | while (--count); |
| 559 | #else |
/* Stop after converting a nul or once wn characters have been produced. */
| 560 | while (wc && --count); |
| 561 | |
| 562 | if (!wc) { |
| 563 | s = NULL; |
| 564 | } |
| 565 | #endif |
| 566 | |
| 567 | DONE: |
| 568 | /* ps->__wc is irrelevant here. */ |
| 569 | ps->__mask = 0; |
/* In count-only mode pwc still equals wcbuf, so *src is left untouched. */
| 570 | if (pwc != wcbuf) { |
| 571 | *src = s; |
| 572 | } |
| 573 | |
| 574 | return wn - count; |
| 575 | } |
| 576 | |
| 577 | #endif |
| 578 | /**********************************************************************/ |
| 579 | #ifdef L__wchar_wcsntoutf8s |
| 580 | |
| 581 | size_t attribute_hidden _wchar_wcsntoutf8s(char *__restrict s, size_t n, |
| 582 | const wchar_t **__restrict src, size_t wn) |
| 583 | { |
| 584 | register char *p; |
| 585 | size_t len, t; |
| 586 | __uwchar_t wc; |
| 587 | const __uwchar_t *swc; |
| 588 | int store; |
| 589 | char buf[MB_LEN_MAX]; |
| 590 | char m; |
| 591 | |
| 592 | store = 1; |
| 593 | /* NOTE: The following is an AWFUL HACK! In order to support %ls in |
| 594 | * printf, we need to be able to compute the number of bytes needed |
| 595 | * for the mbs conversion, not to exceed the precision specified. |
| 596 | * But if dst is NULL, the return value is the length assuming a |
| 597 | * sufficiently sized buffer. So, we allow passing of (char *) src |
| 598 | * as dst in order to flag that we really want the length, subject |
| 599 | * to the restricted buffer size and no partial conversions. |
| 600 | * See wcsnrtombs() as well. */ |
| 601 | if (!s || (s == ((char *) src))) { |
| 602 | if (!s) { |
| 603 | n = SIZE_MAX; |
| 604 | } |
| 605 | s = buf; |
| 606 | store = 0; |
| 607 | } |
| 608 | |
| 609 | t = n; |
| 610 | swc = (const __uwchar_t *) *src; |
| 611 | |
| 612 | assert(swc != NULL); |
| 613 | |
| 614 | while (wn && t) { |
| 615 | wc = *swc; |
| 616 | |
| 617 | *s = wc; |
| 618 | len = 1; |
| 619 | |
| 620 | if (wc >= 0x80) { |
| 621 | #ifdef KUHN |
| 622 | if ( |
| 623 | #if UTF_8_MAX_LEN == 3 |
| 624 | /* For plane 0, these are the only defined values.*/ |
| 625 | /* Note that we don't need to worry about exceeding */ |
| 626 | /* 31 bits as that is the most that UTF-8 provides. */ |
| 627 | (wc > 0xfffdU) |
| 628 | #else |
| 629 | /* UTF_8_MAX_LEN == 6 */ |
| 630 | (wc > 0x7fffffffUL) |
| 631 | || ( ((__uwchar_t)(wc - 0xfffeU)) < 2) |
| 632 | #endif |
| 633 | || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) ) |
| 634 | ) { |
| 635 | __set_errno(EILSEQ); |
| 636 | return (size_t) -1; |
| 637 | } |
| 638 | #else /* KUHN */ |
| 639 | #if UTF_8_MAX_LEN != 3 |
| 640 | if (wc > 0x7fffffffUL) { /* Value too large. */ |
| 641 | __set_errno(EILSEQ); |
| 642 | return (size_t) -1; |
| 643 | } |
| 644 | #endif |
| 645 | #endif /* KUHN */ |
| 646 | |
| 647 | wc >>= 1; |
| 648 | p = s; |
| 649 | do { |
| 650 | ++p; |
| 651 | } while (wc >>= 5); |
| 652 | wc = *swc; |
| 653 | if ((len = p - s) > t) { /* Not enough space. */ |
| 654 | break; |
| 655 | } |
| 656 | |
| 657 | m = 0x80; |
| 658 | while( p>s ) { |
| 659 | m = (m >> 1) | 0x80; |
| 660 | *--p = (wc & 0x3f) | 0x80; |
| 661 | wc >>= 6; |
| 662 | } |
| 663 | *s |= (m << 1); |
| 664 | } else if (wc == 0) { /* End of string. */ |
| 665 | swc = NULL; |
| 666 | break; |
| 667 | } |
| 668 | |
| 669 | ++swc; |
| 670 | --wn; |
| 671 | t -= len; |
| 672 | if (store) { |
| 673 | s += len; |
| 674 | } |
| 675 | } |
| 676 | |
| 677 | if (store) { |
| 678 | *src = (const wchar_t *) swc; |
| 679 | } |
| 680 | |
| 681 | return n - t; |
| 682 | } |
| 683 | |
| 684 | |
| 685 | #endif |
| 686 | /**********************************************************************/ |
| 687 | #ifdef L_mbsnrtowcs |
| 688 | |
| 689 | /* WARNING: We treat len as SIZE_MAX when dst is NULL! */ |
| 690 | |
/* Convert at most NMC bytes from *src into at most len wide characters.
 *
 * A NULL dst, or dst == (wchar_t *) ps, selects count-only mode in which
 * nothing is stored in a caller buffer and *src is left untouched; a NULL
 * dst additionally treats len as SIZE_MAX.  A private static state is
 * used when ps is NULL.
 *
 * Returns the number of wide characters converted (the nul excluded), or
 * (size_t) -1 with errno set to EILSEQ for a byte with no mapping. */
size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
				  size_t NMC, size_t len, mbstate_t *__restrict ps)
{
	static mbstate_t internal_state;	/* Rely on bss 0-init. */
	wchar_t sink[1];
	const char *in;
	size_t left;
	int step;

	if (ps == NULL) {
		ps = &internal_state;
	}

#ifdef __CTYPE_HAS_UTF_8_LOCALES
	if (ENCODING == __ctype_encoding_utf8) {
		size_t r;

		r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1);
		if (r == (size_t) -2) {
			return 0;
		}
		return r;
	}
#endif

	/* NOTE: The following is an AWFUL HACK!  In order to support %s in
	 * wprintf, we need to be able to compute the number of wchars needed
	 * for the mbs conversion, not to exceed the precision specified.
	 * But if dst is NULL, the return value is the length assuming a
	 * sufficiently sized buffer.  So, we allow passing of ((wchar_t *)ps)
	 * as dst in order to flag that we really want the length, subject
	 * to the restricted buffer size and no partial conversions.
	 * See _wchar_utf8sntowcs() as well. */
	step = 1;
	if (dst == NULL) {
		len = SIZE_MAX;
		dst = sink;
		step = 0;
	} else if (dst == ((wchar_t *) ps)) {
		dst = sink;
		step = 0;
	}

	/* Since all the following encodings are single-byte encodings... */
	if (NMC < len) {
		len = NMC;
	}

	left = len;
	in = *src;

#ifdef __CTYPE_HAS_8_BIT_LOCALES
	if (ENCODING == __ctype_encoding_8_bit) {
		wchar_t wc;

		for ( ; left != 0; --left, ++in, dst += step) {
			wc = (unsigned char) *in;
			if (wc >= 0x80) {	/* Non-ASCII: translate via the locale tables. */
				wc -= 0x80;
				wc = __UCLIBC_CURLOCALE->tbl8c2wc[
						  (__UCLIBC_CURLOCALE->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
						   << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
				if (wc == 0) {
					__set_errno(EILSEQ);
					return (size_t) -1;
				}
			}
			*dst = wc;
			if (wc == 0) {		/* Terminating nul reached. */
				in = NULL;
				break;
			}
		}
		if (dst != sink) {
			*src = in;
		}
		return len - left;
	}
#endif

#ifdef __UCLIBC_HAS_LOCALE__
	assert(ENCODING == __ctype_encoding_7_bit);
#endif

	for ( ; left != 0; --left) {
		*dst = (unsigned char) *in;
		if (*dst == 0) {		/* Terminating nul reached. */
			in = NULL;
			break;
		}
		if (*dst >= 0x80) {		/* Not representable in a 7-bit locale. */
			__set_errno(EILSEQ);
			return (size_t) -1;
		}
		++in;
		dst += step;
	}
	if (dst != sink) {
		*src = in;
	}
	return len - left;
}
| 789 | libc_hidden_def(mbsnrtowcs) |
| 790 | |
| 791 | #endif |
| 792 | /**********************************************************************/ |
| 793 | #ifdef L_wcsnrtombs |
| 794 | |
| 795 | /* WARNING: We treat len as SIZE_MAX when dst is NULL! */ |
| 796 | |
| 797 | /* Note: We completely ignore ps in all currently supported conversions. |
| 798 | * TODO: Check for valid state anyway? */ |
| 799 | |
| 800 | size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src, |
| 801 | size_t NWC, size_t len, mbstate_t *__restrict ps) |
| 802 | { |
| 803 | const __uwchar_t *s; |
| 804 | size_t count; |
| 805 | int incr; |
| 806 | char buf[MB_LEN_MAX]; |
| 807 | |
| 808 | #ifdef __CTYPE_HAS_UTF_8_LOCALES |
| 809 | if (ENCODING == __ctype_encoding_utf8) { |
| 810 | return _wchar_wcsntoutf8s(dst, len, src, NWC); |
| 811 | } |
| 812 | #endif /* __CTYPE_HAS_UTF_8_LOCALES */ |
| 813 | |
| 814 | incr = 1; |
| 815 | /* NOTE: The following is an AWFUL HACK! In order to support %ls in |
| 816 | * printf, we need to be able to compute the number of bytes needed |
| 817 | * for the mbs conversion, not to exceed the precision specified. |
| 818 | * But if dst is NULL, the return value is the length assuming a |
| 819 | * sufficiently sized buffer. So, we allow passing of (char *) src |
| 820 | * as dst in order to flag that we really want the length, subject |
| 821 | * to the restricted buffer size and no partial conversions. |
| 822 | * See _wchar_wcsntoutf8s() as well. */ |
| 823 | if (!dst || (dst == ((char *) src))) { |
| 824 | if (!dst) { |
| 825 | len = SIZE_MAX; |
| 826 | } |
| 827 | dst = buf; |
| 828 | incr = 0; |
| 829 | } |
| 830 | |
| 831 | /* Since all the following encodings are single-byte encodings... */ |
| 832 | if (len > NWC) { |
| 833 | len = NWC; |
| 834 | } |
| 835 | |
| 836 | count = len; |
| 837 | s = (const __uwchar_t *) *src; |
| 838 | |
| 839 | #ifdef __CTYPE_HAS_8_BIT_LOCALES |
| 840 | if (ENCODING == __ctype_encoding_8_bit) { |
| 841 | __uwchar_t wc; |
| 842 | __uwchar_t u; |
| 843 | while (count) { |
| 844 | if ((wc = *s) <= 0x7f) { |
| 845 | if (!(*dst = (unsigned char) wc)) { |
| 846 | s = NULL; |
| 847 | break; |
| 848 | } |
| 849 | } else { |
| 850 | u = 0; |
| 851 | if (wc <= Cwc2c_DOMAIN_MAX) { |
| 852 | u = __UCLIBC_CURLOCALE->idx8wc2c[wc >> (Cwc2c_TI_SHIFT |
| 853 | + Cwc2c_TT_SHIFT)]; |
| 854 | u = __UCLIBC_CURLOCALE->tbl8wc2c[(u << Cwc2c_TI_SHIFT) |
| 855 | + ((wc >> Cwc2c_TT_SHIFT) |
| 856 | & ((1 << Cwc2c_TI_SHIFT)-1))]; |
| 857 | u = __UCLIBC_CURLOCALE->tbl8wc2c[Cwc2c_TI_LEN |
| 858 | + (u << Cwc2c_TT_SHIFT) |
| 859 | + (wc & ((1 << Cwc2c_TT_SHIFT)-1))]; |
| 860 | } |
| 861 | |
| 862 | #ifdef __WCHAR_REPLACEMENT_CHAR |
| 863 | *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR ); |
| 864 | #else /* __WCHAR_REPLACEMENT_CHAR */ |
| 865 | if (!u) { |
| 866 | goto BAD; |
| 867 | } |
| 868 | *dst = (unsigned char) u; |
| 869 | #endif /* __WCHAR_REPLACEMENT_CHAR */ |
| 870 | } |
| 871 | ++s; |
| 872 | dst += incr; |
| 873 | --count; |
| 874 | } |
| 875 | if (dst != buf) { |
| 876 | *src = (const wchar_t *) s; |
| 877 | } |
| 878 | return len - count; |
| 879 | } |
| 880 | #endif /* __CTYPE_HAS_8_BIT_LOCALES */ |
| 881 | |
| 882 | #ifdef __UCLIBC_HAS_LOCALE__ |
| 883 | assert(ENCODING == __ctype_encoding_7_bit); |
| 884 | #endif |
| 885 | |
| 886 | while (count) { |
| 887 | if (*s >= 0x80) { |
| 888 | #if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR) |
| 889 | BAD: |
| 890 | #endif |
| 891 | __set_errno(EILSEQ); |
| 892 | return (size_t) -1; |
| 893 | } |
| 894 | if ((*dst = (unsigned char) *s) == 0) { |
| 895 | s = NULL; |
| 896 | break; |
| 897 | } |
| 898 | ++s; |
| 899 | dst += incr; |
| 900 | --count; |
| 901 | } |
| 902 | if (dst != buf) { |
| 903 | *src = (const wchar_t *) s; |
| 904 | } |
| 905 | return len - count; |
| 906 | } |
| 907 | libc_hidden_def(wcsnrtombs) |
| 908 | |
| 909 | #endif |
| 910 | /**********************************************************************/ |
| 911 | #ifdef L_wcswidth |
| 912 | |
| 913 | |
| 914 | #ifdef __UCLIBC_MJN3_ONLY__ |
| 915 | #warning REMINDER: If we start doing translit, wcwidth and wcswidth will need updating. |
| 916 | #warning TODO: Update wcwidth to match latest by Kuhn. |
| 917 | #endif |
| 918 | |
| 919 | #if defined(__UCLIBC_HAS_LOCALE__) && \ |
| 920 | ( defined(__CTYPE_HAS_8_BIT_LOCALES) || defined(__CTYPE_HAS_UTF_8_LOCALES) ) |
| 921 | |
/* 257-entry table of non-decreasing byte values, ending in 255.
 * NOTE(review): presumably entry i gives a starting offset into new_tbl
 * for one slice of the character range used by the width lookup; the
 * code that consumes it is outside this view, so confirm before relying
 * on that. */
| 922 | static const unsigned char new_idx[] = { |
| 923 | 0, 5, 5, 6, 10, 15, 28, 39, |
| 924 | 48, 48, 71, 94, 113, 128, 139, 154, |
| 925 | 175, 186, 188, 188, 188, 188, 188, 188, |
| 926 | 203, 208, 208, 208, 208, 208, 208, 208, |
| 927 | 208, 219, 219, 219, 222, 222, 222, 222, |
| 928 | 222, 222, 222, 222, 222, 222, 222, 224, |
| 929 | 224, 231, 231, 231, 231, 231, 231, 231, |
| 930 | 231, 231, 231, 231, 231, 231, 231, 231, |
| 931 | 231, 231, 231, 231, 231, 231, 231, 231, |
| 932 | 231, 231, 231, 231, 231, 231, 231, 231, |
| 933 | 231, 231, 231, 231, 231, 231, 231, 231, |
| 934 | 231, 231, 231, 231, 231, 231, 231, 231, |
| 935 | 231, 231, 231, 231, 231, 231, 231, 231, |
| 936 | 231, 231, 231, 231, 231, 231, 231, 231, |
| 937 | 231, 231, 231, 231, 231, 231, 231, 231, |
| 938 | 231, 231, 231, 231, 231, 231, 231, 231, |
| 939 | 231, 231, 231, 231, 231, 231, 231, 231, |
| 940 | 231, 231, 231, 231, 231, 231, 231, 231, |
| 941 | 231, 231, 231, 231, 231, 231, 231, 231, |
| 942 | 231, 231, 231, 231, 231, 231, 231, 231, |
| 943 | 231, 231, 231, 231, 231, 233, 233, 233, |
| 944 | 233, 233, 233, 233, 234, 234, 234, 234, |
| 945 | 234, 234, 234, 234, 234, 234, 234, 234, |
| 946 | 234, 234, 234, 234, 234, 234, 234, 234, |
| 947 | 234, 234, 234, 234, 234, 234, 234, 234, |
| 948 | 234, 234, 234, 234, 234, 234, 234, 234, |
| 949 | 234, 234, 234, 234, 234, 234, 234, 234, |
| 950 | 236, 236, 236, 236, 236, 236, 236, 236, |
| 951 | 236, 236, 236, 236, 236, 236, 236, 236, |
| 952 | 236, 236, 236, 236, 236, 236, 236, 236, |
| 953 | 236, 236, 236, 236, 236, 236, 236, 236, |
| 954 | 236, 237, 237, 238, 241, 241, 242, 249, |
| 955 | 255, |
| 956 | }; |
| 957 | |
/* Low-byte keys for the BMP width lookup in wcswidth().
 *
 * For a character with high byte h, wcswidth() binary-searches the slice
 * bounded by new_idx[h] and new_idx[h+1], comparing the character's low
 * byte against these entries, and uses the index it settles on to read the
 * width from new_wtbl.
 * NOTE(review): the entries look like ascending range-start bytes within
 * each page -- confirm against the table generator. */
static const unsigned char new_tbl[] = {
	0x00, 0x01, 0x20, 0x7f, 0xa0, 0x00, 0x00, 0x50,
	0x60, 0x70, 0x00, 0x83, 0x87, 0x88, 0x8a, 0x00,
	0x91, 0xa2, 0xa3, 0xba, 0xbb, 0xbe, 0xbf, 0xc0,
	0xc1, 0xc3, 0xc4, 0xc5, 0x00, 0x4b, 0x56, 0x70,
	0x71, 0xd6, 0xe5, 0xe7, 0xe9, 0xea, 0xee, 0x00,
	0x0f, 0x10, 0x11, 0x12, 0x30, 0x4b, 0xa6, 0xb1,
	0x00, 0x01, 0x03, 0x3c, 0x3d, 0x41, 0x49, 0x4d,
	0x4e, 0x51, 0x55, 0x62, 0x64, 0x81, 0x82, 0xbc,
	0xbd, 0xc1, 0xc5, 0xcd, 0xce, 0xe2, 0xe4, 0x00,
	0x02, 0x03, 0x3c, 0x3d, 0x41, 0x43, 0x47, 0x49,
	0x4b, 0x4e, 0x70, 0x72, 0x81, 0x83, 0xbc, 0xbd,
	0xc1, 0xc6, 0xc7, 0xc9, 0xcd, 0xce, 0x00, 0x01,
	0x02, 0x3c, 0x3d, 0x3f, 0x40, 0x41, 0x44, 0x4d,
	0x4e, 0x56, 0x57, 0x82, 0x83, 0xc0, 0xc1, 0xcd,
	0xce, 0x00, 0x3e, 0x41, 0x46, 0x49, 0x4a, 0x4e,
	0x55, 0x57, 0xbf, 0xc0, 0xc6, 0xc7, 0xcc, 0xce,
	0x00, 0x41, 0x44, 0x4d, 0x4e, 0xca, 0xcb, 0xd2,
	0xd5, 0xd6, 0xd7, 0x00, 0x31, 0x32, 0x34, 0x3b,
	0x47, 0x4f, 0xb1, 0xb2, 0xb4, 0xba, 0xbb, 0xbd,
	0xc8, 0xce, 0x00, 0x18, 0x1a, 0x35, 0x36, 0x37,
	0x38, 0x39, 0x3a, 0x71, 0x7f, 0x80, 0x85, 0x86,
	0x88, 0x90, 0x98, 0x99, 0xbd, 0xc6, 0xc7, 0x00,
	0x2d, 0x31, 0x32, 0x33, 0x36, 0x38, 0x39, 0x3a,
	0x58, 0x5a, 0x00, 0x60, 0x00, 0x12, 0x15, 0x32,
	0x35, 0x52, 0x54, 0x72, 0x74, 0xb7, 0xbe, 0xc6,
	0xc7, 0xc9, 0xd4, 0x00, 0x0b, 0x0f, 0xa9, 0xaa,
	0x00, 0x0b, 0x10, 0x2a, 0x2f, 0x60, 0x64, 0x6a,
	0x70, 0xd0, 0xeb, 0x00, 0x29, 0x2b, 0x00, 0x80,
	0x00, 0x2a, 0x30, 0x3f, 0x40, 0x99, 0x9b, 0x00,
	0xd0, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1e,
	0x1f, 0x00, 0x00, 0x10, 0x20, 0x24, 0x30, 0x70,
	0xff, 0x00, 0x61, 0xe0, 0xe7, 0xf9, 0xfc,
};
| 992 | |
/* Column widths for the BMP width lookup in wcswidth().
 *
 * Parallel to new_tbl: once wcswidth() has located an index in new_tbl for
 * a character, it adds the entry at the same index here to its running
 * column count. */
static const signed char new_wtbl[] = {
	0,   -1,    1,   -1,    1,    1,    0,    1,
	0,    1,    1,    0,    1,    0,    1,    1,
	0,    1,    0,    1,    0,    1,    0,    1,
	0,    1,    0,    1,    1,    0,    1,    0,
	1,    0,    1,    0,    1,    0,    1,    1,
	0,    1,    0,    1,    0,    1,    0,    1,
	1,    0,    1,    0,    1,    0,    1,    0,
	1,    0,    1,    0,    1,    0,    1,    0,
	1,    0,    1,    0,    1,    0,    1,    1,
	0,    1,    0,    1,    0,    1,    0,    1,
	0,    1,    0,    1,    0,    1,    0,    1,
	0,    1,    0,    1,    0,    1,    1,    0,
	1,    0,    1,    0,    1,    0,    1,    0,
	1,    0,    1,    0,    1,    0,    1,    0,
	1,    1,    0,    1,    0,    1,    0,    1,
	0,    1,    0,    1,    0,    1,    0,    1,
	1,    0,    1,    0,    1,    0,    1,    0,
	1,    0,    1,    1,    0,    1,    0,    1,
	0,    1,    0,    1,    0,    1,    0,    1,
	0,    1,    1,    0,    1,    0,    1,    0,
	1,    0,    1,    0,    1,    0,    1,    0,
	1,    0,    1,    0,    1,    0,    1,    1,
	0,    1,    0,    1,    0,    1,    0,    1,
	0,    1,    2,    0,    1,    0,    1,    0,
	1,    0,    1,    0,    1,    0,    1,    0,
	1,    0,    1,    1,    0,    1,    0,    1,
	1,    0,    1,    0,    1,    0,    1,    0,
	1,    0,    1,    1,    2,    1,    1,    2,
	2,    0,    2,    1,    2,    0,    2,    2,
	1,    1,    2,    1,    1,    2,    1,    0,
	1,    1,    0,    1,    0,    1,    2,    1,
	0,    2,    1,    2,    1,    0,    1,
};
| 1027 | |
| 1028 | |
| 1029 | int wcswidth(const wchar_t *pwcs, size_t n) |
| 1030 | { |
| 1031 | int h, l, m, count; |
| 1032 | wchar_t wc; |
| 1033 | unsigned char b; |
| 1034 | |
| 1035 | if (ENCODING == __ctype_encoding_7_bit) { |
| 1036 | size_t i; |
| 1037 | |
| 1038 | for (i = 0 ; (i < n) && pwcs[i] ; i++) { |
| 1039 | if (pwcs[i] != (pwcs[i] & 0x7f)) { |
| 1040 | return -1; |
| 1041 | } |
| 1042 | } |
| 1043 | } |
| 1044 | #ifdef __CTYPE_HAS_8_BIT_LOCALES |
| 1045 | else if (ENCODING == __ctype_encoding_8_bit) { |
| 1046 | mbstate_t mbstate; |
| 1047 | |
| 1048 | mbstate.__mask = 0; /* Initialize the mbstate. */ |
| 1049 | if (wcsnrtombs(NULL, &pwcs, n, SIZE_MAX, &mbstate) == ((size_t) - 1)) { |
| 1050 | return -1; |
| 1051 | } |
| 1052 | } |
| 1053 | #endif /* __CTYPE_HAS_8_BIT_LOCALES */ |
| 1054 | #if defined(__CTYPE_HAS_UTF_8_LOCALES) && defined(KUHN) |
| 1055 | /* For stricter handling of allowed unicode values... see comments above. */ |
| 1056 | else if (ENCODING == __ctype_encoding_utf8) { |
| 1057 | size_t i; |
| 1058 | |
| 1059 | for (i = 0 ; (i < n) && pwcs[i] ; i++) { |
| 1060 | if ( (((__uwchar_t)((pwcs[i]) - 0xfffeU)) < 2) |
| 1061 | || (((__uwchar_t)((pwcs[i]) - 0xd800U)) < (0xe000U - 0xd800U)) |
| 1062 | ) { |
| 1063 | return -1; |
| 1064 | } |
| 1065 | } |
| 1066 | } |
| 1067 | #endif /* __CTYPE_HAS_UTF_8_LOCALES */ |
| 1068 | |
| 1069 | for (count = 0 ; n && (wc = *pwcs++) ; n--) { |
| 1070 | if (wc <= 0xff) { |
| 1071 | /* If we're here, wc != 0. */ |
| 1072 | if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) { |
| 1073 | return -1; |
| 1074 | } |
| 1075 | ++count; |
| 1076 | continue; |
| 1077 | } |
| 1078 | if (((unsigned int) wc) <= 0xffff) { |
| 1079 | b = wc & 0xff; |
| 1080 | h = (wc >> 8); |
| 1081 | l = new_idx[h]; |
| 1082 | h = new_idx[h+1]; |
| 1083 | while ((m = (l+h) >> 1) != l) { |
| 1084 | if (b >= new_tbl[m]) { |
| 1085 | l = m; |
| 1086 | } else { /* wc < tbl[m] */ |
| 1087 | h = m; |
| 1088 | } |
| 1089 | } |
| 1090 | count += new_wtbl[l]; /* none should be -1. */ |
| 1091 | continue; |
| 1092 | } |
| 1093 | |
| 1094 | /* Redo this to minimize average number of compares?*/ |
| 1095 | if (wc >= 0x1d167) { |
| 1096 | if (wc <= 0x1d1ad) { |
| 1097 | if ((wc <= 0x1d169 |
| 1098 | || (wc >= 0x1d173 |
| 1099 | && (wc <= 0x1d182 |
| 1100 | || (wc >= 0x1d185 |
| 1101 | && (wc <= 0x1d18b |
| 1102 | || (wc >= 0x1d1aa)))))) |
| 1103 | ) { |
| 1104 | continue; |
| 1105 | } |
| 1106 | } else if (((wc >= 0xe0020) && (wc <= 0xe007f)) || (wc == 0xe0001)) { |
| 1107 | continue; |
| 1108 | } else if ((wc >= 0x20000) && (wc <= 0x2ffff)) { |
| 1109 | ++count; /* need 2.. add one here */ |
| 1110 | } |
| 1111 | #if (WCHAR_MAX > 0x7fffffffL) |
| 1112 | else if (wc > 0x7fffffffL) { |
| 1113 | return -1; |
| 1114 | } |
| 1115 | #endif /* (WCHAR_MAX > 0x7fffffffL) */ |
| 1116 | } |
| 1117 | |
| 1118 | ++count; |
| 1119 | } |
| 1120 | |
| 1121 | return count; |
| 1122 | } |
| 1123 | |
| 1124 | #else /* __UCLIBC_HAS_LOCALE__ */ |
| 1125 | |
/* Return the number of display columns occupied by the wide string pwcs,
 * looking at no more than n characters and stopping at the first nul.
 *
 * Without locale support only printable 7-bit characters are accepted:
 * each one counts as a single column.  Returns -1 if any examined
 * character is outside 0..0x7f or is a control character (below 32, or
 * 0x7f). */
int wcswidth(const wchar_t *pwcs, size_t n)
{
	int count;
	wchar_t wc;

	for (count = 0 ; n && (wc = *pwcs++) ; n--) {
		/* If we're here, wc != 0. */
		if ((wc != (wc & 0x7f))		/* not 7-bit (also catches negatives) */
			|| (wc < 32)			/* C0 control */
			|| (wc == 0x7f)			/* DEL */
			) {
			return -1;
		}
		++count;
	}

	return count;
}
| 1153 | |
| 1154 | #endif /* __UCLIBC_HAS_LOCALE__ */ |
| 1155 | |
| 1156 | libc_hidden_def(wcswidth) |
| 1157 | |
| 1158 | #endif |
| 1159 | /**********************************************************************/ |
| 1160 | #ifdef L_wcwidth |
| 1161 | |
| 1162 | |
/* Column width of a single wide character: the result of wcswidth() on a
 * one-character string consisting of wc. */
int wcwidth(wchar_t wc)
{
	const wchar_t *single = &wc;

	return wcswidth(single, 1);
}
| 1167 | |
| 1168 | #endif |
| 1169 | /**********************************************************************/ |
| 1170 | |
| 1171 | |
/* Conversion descriptor behind an iconv_t: allocated by iconv_open(),
 * updated by iconv(), released by iconv_close(). */
typedef struct {
	mbstate_t tostate;		/* Output-side state; only ever zeroed in this file. */
	mbstate_t fromstate;	/* Input-side state used when decoding UTF-8. */
	int tocodeset;			/* Target codeset id from find_codeset(). */
	int fromcodeset;		/* Source codeset id; bit 0 is toggled when a byte-swapped BOM is read. */
	int frombom;			/* Non-zero while a BOM may still be consumed from the input. */
	int tobom;				/* Non-zero while a BOM still has to be written to the output. */
	int fromcodeset0;		/* Initial fromcodeset, restored when the state is reset. */
	int frombom0;			/* Initial frombom, restored when the state is reset. */
	int tobom0;				/* Initial tobom, restored when the state is reset. */
	int skip_invalid_input;	/* To support iconv -c option. */
} _UC_iconv_t;
| 1184 | |
/* Bit layout of the codeset ids of the fixed-width multibyte codesets
 * (ids >= 0xe0):
 *   bit 0 (0x01) means swap endian
 *   bit 1 (0x02) means 2 bytes per unit
 *   bit 2 (0x04) means 4 bytes per unit
 *   bit 4 (0x10) means the stream carries a BOM
 */
| 1191 | |
| 1192 | #if defined L_iconv && defined _LIBC |
| 1193 | /* Used externally only by iconv utility */ |
| 1194 | extern const unsigned char __iconv_codesets[]; |
| 1195 | libc_hidden_proto(__iconv_codesets) |
| 1196 | #endif |
| 1197 | |
| 1198 | #if defined L_iconv || defined L_iconv_main |
/* Table of the built-in codeset names, stored as packed records:
 *
 *   byte 0   total length of the record (the step to the next record)
 *   byte 1   codeset id
 *   byte 2.. codeset name, nul terminated
 *
 * The list ends at a record whose length byte is 0; this is supplied by the
 * implicit terminating nul of the string literal, which is why the last
 * entry carries no explicit "\x00". */
const unsigned char __iconv_codesets[] =
	"\x0a\xe0""WCHAR_T\x00"		/* superset of UCS-4 but platform-endian */
#if __BYTE_ORDER == __BIG_ENDIAN
	"\x08\xec""UCS-4\x00"		/* always BE */
	"\x0a\xec""UCS-4BE\x00"
	"\x0a\xed""UCS-4LE\x00"
	"\x09\xe4""UTF-32\x00"		/* platform endian with BOM */
	"\x0b\xe4""UTF-32BE\x00"
	"\x0b\xe5""UTF-32LE\x00"
	"\x08\xe2""UCS-2\x00"		/* always BE */
	"\x0a\xe2""UCS-2BE\x00"
	"\x0a\xe3""UCS-2LE\x00"
	"\x09\xea""UTF-16\x00"		/* platform endian with BOM */
	"\x0b\xea""UTF-16BE\x00"
	"\x0b\xeb""UTF-16LE\x00"
#elif __BYTE_ORDER == __LITTLE_ENDIAN
	"\x08\xed""UCS-4\x00"		/* always BE */
	"\x0a\xed""UCS-4BE\x00"
	"\x0a\xec""UCS-4LE\x00"
	"\x09\xf4""UTF-32\x00"		/* platform endian with BOM */
	"\x0b\xe5""UTF-32BE\x00"
	"\x0b\xe4""UTF-32LE\x00"
	"\x08\xe3""UCS-2\x00"		/* always BE */
	"\x0a\xe3""UCS-2BE\x00"
	"\x0a\xe2""UCS-2LE\x00"
	"\x09\xfa""UTF-16\x00"		/* platform endian with BOM */
	"\x0b\xeb""UTF-16BE\x00"
	"\x0b\xea""UTF-16LE\x00"
#endif
	"\x08\x02""UTF-8\x00"
	"\x0b\x01""US-ASCII\x00"
	"\x07\x01""ASCII";			/* Must be last! (special case to save a nul) */
| 1231 | #endif |
| 1232 | #if defined L_iconv && defined _LIBC |
| 1233 | libc_hidden_data_def(__iconv_codesets) |
| 1234 | #endif |
| 1235 | |
| 1236 | |
| 1237 | #ifdef L_iconv |
| 1238 | |
| 1239 | #include <iconv.h> |
| 1240 | #include <string.h> |
| 1241 | #include <endian.h> |
| 1242 | #include <byteswap.h> |
| 1243 | |
| 1244 | #if (__BYTE_ORDER != __BIG_ENDIAN) && (__BYTE_ORDER != __LITTLE_ENDIAN) |
| 1245 | #error unsupported endianness for iconv |
| 1246 | #endif |
| 1247 | |
| 1248 | #ifndef __CTYPE_HAS_8_BIT_LOCALES |
| 1249 | #error currently iconv requires 8 bit locales |
| 1250 | #endif |
| 1251 | #ifndef __CTYPE_HAS_UTF_8_LOCALES |
| 1252 | #error currently iconv requires UTF-8 locales |
| 1253 | #endif |
| 1254 | |
| 1255 | |
/* Codeset ids and masks used by iconv().
 *
 * Ids at or above IC_MULTIBYTE denote the fixed-width 2/4-byte encodings.
 * The IC_UCS_4 / IC_UTF_32 / IC_UCS_2 / IC_UTF_16 values depend on the
 * platform byte order; iconv() uses IC_UCS_4 and IC_UTF_16 as bit masks
 * ((codeset & IC_x) == IC_x) rather than comparing ids for equality. */
enum {
	IC_WCHAR_T = 0xe0,		/* id of "WCHAR_T" */
	IC_MULTIBYTE = 0xe0,	/* lowest id of the fixed-width multibyte codesets */
#if __BYTE_ORDER == __BIG_ENDIAN
	IC_UCS_4 =	0xec,
	IC_UTF_32 = 0xe4,
	IC_UCS_2 =	0xe2,
	IC_UTF_16 = 0xea,
#else
	IC_UCS_4 =	0xed,
	IC_UTF_32 = 0xe5,
	IC_UCS_2 =	0xe3,
	IC_UTF_16 = 0xeb,
#endif
	IC_UTF_8 = 2,			/* id of "UTF-8" */
	IC_ASCII = 1			/* id of "US-ASCII" / "ASCII" */
};
| 1273 | |
| 1274 | |
| 1275 | static int find_codeset(const char *name) |
| 1276 | { |
| 1277 | const unsigned char *s; |
| 1278 | int codeset; |
| 1279 | |
| 1280 | for (s = __iconv_codesets; *s; s += *s) { |
| 1281 | if (!strcasecmp((char*) (s + 2), name)) { |
| 1282 | return s[1]; |
| 1283 | } |
| 1284 | } |
| 1285 | |
| 1286 | /* The following is ripped from find_locale in locale.c. */ |
| 1287 | |
| 1288 | /* TODO: maybe CODESET_LIST + *s ??? */ |
| 1289 | /* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */ |
| 1290 | codeset = 2; |
| 1291 | s = (const unsigned char *) __LOCALE_DATA_CODESET_LIST; |
| 1292 | do { |
| 1293 | ++codeset; /* Increment codeset first. */ |
| 1294 | if (!strcasecmp(__LOCALE_DATA_CODESET_LIST+*s, name)) { |
| 1295 | return codeset; |
| 1296 | } |
| 1297 | } while (*++s); |
| 1298 | |
| 1299 | return 0; /* No matching codeset! */ |
| 1300 | } |
| 1301 | |
| 1302 | iconv_t weak_function iconv_open(const char *tocode, const char *fromcode) |
| 1303 | { |
| 1304 | register _UC_iconv_t *px; |
| 1305 | int tocodeset, fromcodeset; |
| 1306 | |
| 1307 | if (((tocodeset = find_codeset(tocode)) != 0) |
| 1308 | && ((fromcodeset = find_codeset(fromcode)) != 0)) { |
| 1309 | if ((px = malloc(sizeof(_UC_iconv_t))) != NULL) { |
| 1310 | px->tocodeset = tocodeset; |
| 1311 | px->tobom0 = px->tobom = (tocodeset >= 0xe0) ? (tocodeset & 0x10) >> 4 : 0; |
| 1312 | px->fromcodeset0 = px->fromcodeset = fromcodeset; |
| 1313 | px->frombom0 = px->frombom = (fromcodeset >= 0xe0) ? (fromcodeset & 0x10) >> 4 : 0; |
| 1314 | px->skip_invalid_input = px->tostate.__mask |
| 1315 | = px->fromstate.__mask = 0; |
| 1316 | return (iconv_t) px; |
| 1317 | } |
| 1318 | } else { |
| 1319 | __set_errno(EINVAL); |
| 1320 | } |
| 1321 | return (iconv_t)(-1); |
| 1322 | } |
| 1323 | |
| 1324 | int weak_function iconv_close(iconv_t cd) |
| 1325 | { |
| 1326 | free(cd); |
| 1327 | |
| 1328 | return 0; |
| 1329 | } |
| 1330 | |
| 1331 | size_t weak_function iconv(iconv_t cd, char **__restrict inbuf, |
| 1332 | size_t *__restrict inbytesleft, |
| 1333 | char **__restrict outbuf, |
| 1334 | size_t *__restrict outbytesleft) |
| 1335 | { |
| 1336 | _UC_iconv_t *px = (_UC_iconv_t *) cd; |
| 1337 | size_t nrcount, r; |
| 1338 | wchar_t wc, wc2; |
| 1339 | int inci, inco; |
| 1340 | |
| 1341 | assert(px != (_UC_iconv_t *)(-1)); |
| 1342 | assert(sizeof(wchar_t) == 4); |
| 1343 | |
| 1344 | if (!inbuf || !*inbuf) { /* Need to reinitialze conversion state. */ |
| 1345 | /* Note: For shift-state encodings we possibly need to output the |
| 1346 | * shift sequence to return to initial state! */ |
| 1347 | if ((px->fromcodeset & 0xf0) == 0xe0) { |
| 1348 | } |
| 1349 | px->tostate.__mask = px->fromstate.__mask = 0; |
| 1350 | px->fromcodeset = px->fromcodeset0; |
| 1351 | px->tobom = px->tobom0; |
| 1352 | px->frombom = px->frombom0; |
| 1353 | return 0; |
| 1354 | } |
| 1355 | |
| 1356 | nrcount = 0; |
| 1357 | while (*inbytesleft) { |
| 1358 | if (!*outbytesleft) { |
| 1359 | TOO_BIG: |
| 1360 | __set_errno(E2BIG); |
| 1361 | return (size_t) -1; |
| 1362 | } |
| 1363 | |
| 1364 | inci = inco = 1; |
| 1365 | if (px->fromcodeset >= IC_MULTIBYTE) { |
| 1366 | inci = (px->fromcodeset == IC_WCHAR_T) ? 4: (px->fromcodeset & 6); |
| 1367 | if (*inbytesleft < inci) goto INVALID; |
| 1368 | wc = (((unsigned int)((unsigned char)((*inbuf)[0]))) << 8) |
| 1369 | + ((unsigned char)((*inbuf)[1])); |
| 1370 | if (inci == 4) { |
| 1371 | wc = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8) |
| 1372 | + ((unsigned char)((*inbuf)[3])) + (wc << 16); |
| 1373 | if (!(px->fromcodeset & 1)) wc = bswap_32(wc); |
| 1374 | } else { |
| 1375 | if (!(px->fromcodeset & 1)) wc = bswap_16(wc); |
| 1376 | if (((px->fromcodeset & IC_UTF_16) == IC_UTF_16) |
| 1377 | && (((__uwchar_t)(wc - 0xd800U)) < (0xdc00U - 0xd800U)) |
| 1378 | ) { /* surrogate */ |
| 1379 | wc =- 0xd800U; |
| 1380 | if (*inbytesleft < 4) goto INVALID; |
| 1381 | wc2 = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8) |
| 1382 | + ((unsigned char)((*inbuf)[3])); |
| 1383 | if (!(px->fromcodeset & 1)) wc = bswap_16(wc2); |
| 1384 | if (((__uwchar_t)(wc2 -= 0xdc00U)) < (0xe0000U - 0xdc00U)) { |
| 1385 | goto ILLEGAL; |
| 1386 | } |
| 1387 | inci = 4; /* Change inci here in case skipping illegals. */ |
| 1388 | wc = 0x10000UL + (wc << 10) + wc2; |
| 1389 | } |
| 1390 | } |
| 1391 | |
| 1392 | if (px->frombom) { |
| 1393 | px->frombom = 0; |
| 1394 | if ((wc == 0xfeffU) |
| 1395 | || (wc == ((inci == 4) |
| 1396 | ? (((wchar_t) 0xfffe0000UL)) |
| 1397 | : ((wchar_t)(0xfffeUL)))) |
| 1398 | ) { |
| 1399 | if (wc != 0xfeffU) { |
| 1400 | px->fromcodeset ^= 1; /* toggle endianness */ |
| 1401 | wc = 0xfeffU; |
| 1402 | } |
| 1403 | if (!px->frombom) { |
| 1404 | goto BOM_SKIP_OUTPUT; |
| 1405 | } |
| 1406 | goto GOT_BOM; |
| 1407 | } |
| 1408 | } |
| 1409 | |
| 1410 | if (px->fromcodeset != IC_WCHAR_T) { |
| 1411 | if (((__uwchar_t) wc) > (((px->fromcodeset & IC_UCS_4) == IC_UCS_4) |
| 1412 | ? 0x7fffffffUL : 0x10ffffUL) |
| 1413 | #ifdef KUHN |
| 1414 | || (((__uwchar_t)(wc - 0xfffeU)) < 2) |
| 1415 | || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U)) |
| 1416 | #endif |
| 1417 | ) { |
| 1418 | goto ILLEGAL; |
| 1419 | } |
| 1420 | } |
| 1421 | } else if (px->fromcodeset == IC_UTF_8) { |
| 1422 | const char *p = *inbuf; |
| 1423 | r = _wchar_utf8sntowcs(&wc, 1, &p, *inbytesleft, &px->fromstate, 0); |
| 1424 | if (((ssize_t) r) <= 0) { /* either EILSEQ or incomplete or nul */ |
| 1425 | if (((ssize_t) r) < 0) { /* either EILSEQ or incomplete or nul */ |
| 1426 | assert((r == (size_t)(-1)) || (r == (size_t)(-2))); |
| 1427 | if (r == (size_t)(-2)) { |
| 1428 | INVALID: |
| 1429 | __set_errno(EINVAL); |
| 1430 | } else { |
| 1431 | px->fromstate.__mask = 0; |
| 1432 | inci = 1; |
| 1433 | ILLEGAL: |
| 1434 | if (px->skip_invalid_input) { |
| 1435 | px->skip_invalid_input = 2; /* flag for iconv utility */ |
| 1436 | goto BOM_SKIP_OUTPUT; |
| 1437 | } |
| 1438 | __set_errno(EILSEQ); |
| 1439 | } |
| 1440 | return (size_t)(-1); |
| 1441 | } |
| 1442 | #ifdef __UCLIBC_MJN3_ONLY__ |
| 1443 | #warning TODO: optimize this. |
| 1444 | #endif |
| 1445 | if (p != NULL) { /* incomplete char case */ |
| 1446 | goto INVALID; |
| 1447 | } |
| 1448 | p = *inbuf + 1; /* nul */ |
| 1449 | } |
| 1450 | inci = p - *inbuf; |
| 1451 | } else if ((wc = ((unsigned char)(**inbuf))) >= 0x80) { /* Non-ASCII... */ |
| 1452 | if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */ |
| 1453 | goto ILLEGAL; |
| 1454 | } else { /* some other 8-bit ascii-extension codeset */ |
| 1455 | const __codeset_8_bit_t *c8b |
| 1456 | = __locale_mmap->codeset_8_bit + px->fromcodeset - 3; |
| 1457 | wc -= 0x80; |
| 1458 | wc = __UCLIBC_CURLOCALE->tbl8c2wc[ |
| 1459 | (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT] |
| 1460 | << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))]; |
| 1461 | if (!wc) { |
| 1462 | goto ILLEGAL; |
| 1463 | } |
| 1464 | } |
| 1465 | } |
| 1466 | |
| 1467 | |
| 1468 | if (px->tobom) { |
| 1469 | inci = 0; |
| 1470 | wc = 0xfeffU; |
| 1471 | GOT_BOM: |
| 1472 | px->tobom = 0; |
| 1473 | } |
| 1474 | |
| 1475 | if (px->tocodeset >= IC_MULTIBYTE) { |
| 1476 | inco = (px->tocodeset == IC_WCHAR_T) ? 4: (px->tocodeset & 6); |
| 1477 | if (*outbytesleft < inco) goto TOO_BIG; |
| 1478 | if (px->tocodeset != IC_WCHAR_T) { |
| 1479 | if (((__uwchar_t) wc) > (((px->tocodeset & IC_UCS_4) == IC_UCS_4) |
| 1480 | ? 0x7fffffffUL : 0x10ffffUL) |
| 1481 | #ifdef KUHN |
| 1482 | || (((__uwchar_t)(wc - 0xfffeU)) < 2) |
| 1483 | || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U)) |
| 1484 | #endif |
| 1485 | ) { |
| 1486 | REPLACE_32: |
| 1487 | wc = 0xfffd; |
| 1488 | ++nrcount; |
| 1489 | } |
| 1490 | } |
| 1491 | if (inco == 4) { |
| 1492 | if (px->tocodeset & 1) wc = bswap_32(wc); |
| 1493 | } else { |
| 1494 | if (((__uwchar_t)wc ) > 0xffffU) { |
| 1495 | if ((px->tocodeset & IC_UTF_16) != IC_UTF_16) { |
| 1496 | goto REPLACE_32; |
| 1497 | } |
| 1498 | if (*outbytesleft < (inco = 4)) goto TOO_BIG; |
| 1499 | wc2 = 0xdc00U + (wc & 0x3ff); |
| 1500 | wc = 0xd800U + ((wc >> 10) & 0x3ff); |
| 1501 | if (px->tocodeset & 1) { |
| 1502 | wc = bswap_16(wc); |
| 1503 | wc2 = bswap_16(wc2); |
| 1504 | } |
| 1505 | wc += (wc2 << 16); |
| 1506 | } else if (px->tocodeset & 1) wc = bswap_16(wc); |
| 1507 | } |
| 1508 | (*outbuf)[0] = (char)((unsigned char)(wc)); |
| 1509 | (*outbuf)[1] = (char)((unsigned char)(wc >> 8)); |
| 1510 | if (inco == 4) { |
| 1511 | (*outbuf)[2] = (char)((unsigned char)(wc >> 16)); |
| 1512 | (*outbuf)[3] = (char)((unsigned char)(wc >> 24)); |
| 1513 | } |
| 1514 | } else if (px->tocodeset == IC_UTF_8) { |
| 1515 | const wchar_t *pw = &wc; |
| 1516 | do { |
| 1517 | r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1); |
| 1518 | if (r != (size_t)(-1)) { |
| 1519 | #ifdef __UCLIBC_MJN3_ONLY__ |
| 1520 | #warning TODO: What happens for a nul? |
| 1521 | #endif |
| 1522 | if (r == 0) { |
| 1523 | if (wc != 0) { |
| 1524 | goto TOO_BIG; |
| 1525 | } |
| 1526 | ++r; |
| 1527 | } |
| 1528 | break; |
| 1529 | } |
| 1530 | wc = 0xfffdU; |
| 1531 | ++nrcount; |
| 1532 | } while (1); |
| 1533 | inco = r; |
| 1534 | } else if (((__uwchar_t)(wc)) < 0x80) { |
| 1535 | CHAR_GOOD: |
| 1536 | **outbuf = wc; |
| 1537 | } else { |
| 1538 | if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) { |
| 1539 | const __codeset_8_bit_t *c8b |
| 1540 | = __locale_mmap->codeset_8_bit + px->tocodeset - 3; |
| 1541 | __uwchar_t u; |
| 1542 | u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)]; |
| 1543 | u = __UCLIBC_CURLOCALE->tbl8wc2c[(u << Cwc2c_TI_SHIFT) |
| 1544 | + ((wc >> Cwc2c_TT_SHIFT) |
| 1545 | & ((1 << Cwc2c_TI_SHIFT)-1))]; |
| 1546 | wc = __UCLIBC_CURLOCALE->tbl8wc2c[Cwc2c_TI_LEN |
| 1547 | + (u << Cwc2c_TT_SHIFT) |
| 1548 | + (wc & ((1 << Cwc2c_TT_SHIFT)-1))]; |
| 1549 | if (wc) { |
| 1550 | goto CHAR_GOOD; |
| 1551 | } |
| 1552 | } |
| 1553 | **outbuf = '?'; |
| 1554 | ++nrcount; |
| 1555 | } |
| 1556 | |
| 1557 | *outbuf += inco; |
| 1558 | *outbytesleft -= inco; |
| 1559 | BOM_SKIP_OUTPUT: |
| 1560 | *inbuf += inci; |
| 1561 | *inbytesleft -= inci; |
| 1562 | } |
| 1563 | return nrcount; |
| 1564 | } |
| 1565 | #endif |