blob: 412c557eb2a92d36d30885f85cc5ff7f372d4bbc [file] [log] [blame]
lh9ed821d2023-04-07 01:36:19 -07001
2/* Copyright (C) 2002, 2003, 2004 Manuel Novoa III
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Library General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Library General Public License for more details.
13 *
14 * You should have received a copy of the GNU Library General Public
15 * License along with this library; if not, write to the Free
16 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19/* ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION!
20 *
21 * Besides uClibc, I'm using this code in my libc for elks, which is
22 * a 16-bit environment with a fairly limited compiler. It would make
23 * things much easier for me if this file isn't modified unnecessarily.
24 * In particular, please put any new or replacement functions somewhere
25 * else, and modify the makefile to use your version instead.
26 * Thanks. Manuel
27 *
28 * ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION! */
29
30
31/* May 23, 2002 Initial Notes:
32 *
33 * I'm still tweaking this stuff, but it passes the tests I've thrown
34 * at it, and Erik needs it for the gcc port. The glibc extension
35 * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
36 * in the glibc source. I also need to fix the behavior of
37 * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
38 *
39 * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
40 * file on my platform (x86) show about 5-10% faster conversion speed than
41 * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
42 * individual mbrtowc()/wcrtomb() calls.
43 *
44 * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
45 * as a fail-safe UTF-8 decoder appropriate for a terminal, etc. which
46 * needs to deal gracefully with whatever is sent to it. In that mode,
47 * it passes Markus Kuhn's UTF-8-test.txt stress test. I plan to add
48 * an arg to force that behavior, so the interface will be changing.
49 *
50 * I need to fix the error checking for 16-bit wide chars. This isn't
51 * an issue for uClibc, but may be for ELKS. I'm currently not sure
 * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
53 *
54 * July 1, 2002
55 *
56 * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case.
57 * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit
58 * locales.
59 * Enabled building of a C/POSIX-locale-only version, so full locale support
60 * no longer needs to be enabled.
61 *
62 * Nov 4, 2002
63 *
64 * Fixed a bug in _wchar_wcsntoutf8s(). Don't store wcs position if dst is NULL.
65 * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in
66 * order to support %ls in printf. See comments below for details.
67 * Change behaviour of wc<->mb functions when in the C locale. Now they do
68 * a 1-1 map for the range 0x80-UCHAR_MAX. This is for backwards compatibility
 * and consistency with the standard's requirement that a printf format string be
 * a valid multibyte string beginning and ending in its initial shift state.
71 *
72 * Nov 5, 2002
73 *
74 * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday.
75 *
76 * Nov 7, 2002
77 *
78 * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08.
79 * Added some size/speed optimizations and integrated it into my locale
80 * framework. Minimally tested at the moment, but the stub C-locale
81 * version (which most people would probably be using) should be fine.
82 *
83 * Nov 21, 2002
84 *
85 * Revert the wc<->mb changes from earlier this month involving the C-locale.
86 * Add a couple of ugly hacks to support *wprintf.
87 * Add a mini iconv() and iconv implementation (requires locale support).
88 *
89 * Aug 1, 2003
90 * Bug fix for mbrtowc.
91 *
92 * Aug 18, 2003
93 * Bug fix: _wchar_utf8sntowcs and _wchar_wcsntoutf8s now set errno if EILSEQ.
94 *
95 * Feb 11, 2004
96 * Bug fix: Fix size check for remaining output space in iconv().
97 *
98 * Manuel
99 */
100#ifdef _LIBC
101#include <errno.h>
102#include <stddef.h>
103#include <limits.h>
104#include <stdint.h>
105#include <inttypes.h>
106#include <stdlib.h>
107#include <stdio.h>
108#include <assert.h>
109#include <locale.h>
110#include <wchar.h>
111#include <bits/uClibc_uwchar.h>
112
113/**********************************************************************/
/*
 * Build-time configuration shared by every function in this file.
 *
 * ENCODING names the encoding the conversion routines dispatch on.  With
 * locale support it is read from the current locale object; without it the
 * value is fixed to the 7-bit encoding.
 */
#ifdef __UCLIBC_HAS_LOCALE__
#ifdef __UCLIBC_MJN3_ONLY__
#ifdef L_iswspace
/* generates one warning */
#warning TODO: Fix Cc2wc* and Cwc2c* defines!
#endif
#endif /* __UCLIBC_MJN3_ONLY__ */

#define ENCODING		(__UCLIBC_CURLOCALE->encoding)

/* Short local aliases for the locale-data table geometry constants that the
 * 8-bit branches of mbsnrtowcs() and wcsnrtombs() use for their lookups. */
#define Cc2wc_IDX_SHIFT		__LOCALE_DATA_Cc2wc_IDX_SHIFT
#define Cc2wc_ROW_LEN		__LOCALE_DATA_Cc2wc_ROW_LEN
#define Cwc2c_DOMAIN_MAX	__LOCALE_DATA_Cwc2c_DOMAIN_MAX
#define Cwc2c_TI_SHIFT		__LOCALE_DATA_Cwc2c_TI_SHIFT
#define Cwc2c_TT_SHIFT		__LOCALE_DATA_Cwc2c_TT_SHIFT
#define Cwc2c_TI_LEN		__LOCALE_DATA_Cwc2c_TI_LEN

#ifndef __CTYPE_HAS_UTF_8_LOCALES
#warning __CTYPE_HAS_UTF_8_LOCALES not set!
#endif

#else  /* __UCLIBC_HAS_LOCALE__ */

#ifdef __UCLIBC_MJN3_ONLY__
#ifdef L_btowc
/* emit only once */
#warning fix preprocessor logic testing locale settings
#endif
#endif

/* No locale support: the encoding is always 7-bit.  The 8-bit and UTF-8
 * feature macros must not be set in this configuration, and the two UTF-8
 * worker functions are not built (their L_ selectors are removed). */
#define ENCODING (__ctype_encoding_7_bit)
#ifdef __CTYPE_HAS_8_BIT_LOCALES
#error __CTYPE_HAS_8_BIT_LOCALES is defined!
#endif
#ifdef __CTYPE_HAS_UTF_8_LOCALES
#error __CTYPE_HAS_UTF_8_LOCALES is defined!
#endif
#undef L__wchar_utf8sntowcs
#undef L__wchar_wcsntoutf8s

#endif /* __UCLIBC_HAS_LOCALE__ */
155/**********************************************************************/
156
/* Selected by the width of wchar_t: 6 when wchar_t can hold values above
 * 0xffff, otherwise 3.  The UTF-8 workers below test this macro to choose
 * their range checks. */
#if WCHAR_MAX > 0xffffUL
#define UTF_8_MAX_LEN 6
#else
#define UTF_8_MAX_LEN 3
#endif

/* When defined, the UTF-8 workers and wcswidth() additionally reject the
 * surrogate range 0xd800-0xdfff and the values 0xfffe/0xffff. */
#define KUHN 1

/* Implementation-specific work functions. */

/* UTF-8 -> wchar_t worker.  Converts up to n bytes from *src into at most wn
 * wide characters at pwc, using and updating *ps.  A nonzero
 * allow_continuation lets an incomplete trailing sequence be saved in *ps
 * (result (size_t) -2); an invalid sequence yields (size_t) -1 with errno set
 * to EILSEQ unless the function was built with DECODER defined. */
extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
					const char **__restrict src, size_t n,
					mbstate_t *ps, int allow_continuation) attribute_hidden;

/* wchar_t -> UTF-8 worker.  Converts up to wn wide characters from *src into
 * at most n bytes at s and returns the number of bytes produced, or
 * (size_t) -1 with errno set to EILSEQ for a value it refuses to encode. */
extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
					const wchar_t **__restrict src, size_t wn) attribute_hidden;
173#endif
174/**********************************************************************/
175#ifdef L_btowc
176
177
/* Convert a single byte to the wide character it represents.
 *
 * Returns WEOF if c is EOF or the byte is not a complete, valid character
 * on its own; otherwise returns the corresponding wide character.
 */
wint_t btowc(int c)
{
#ifdef __CTYPE_HAS_8_BIT_LOCALES

	mbstate_t state;
	wchar_t wide;
	unsigned char byte;

	if (c == EOF) {
		return WEOF;
	}

	byte = (unsigned char) c;
	state.__mask = 0;		/* Begin in the initial conversion state. */

	/* With a one-byte input, any result above 1 is a failure code. */
	if (mbrtowc(&wide, (char *) &byte, 1, &state) > 1) {
		return WEOF;
	}
	return wide;

#else  /* !__CTYPE_HAS_8_BIT_LOCALES */

#ifdef __UCLIBC_HAS_LOCALE__
	assert((ENCODING == __ctype_encoding_7_bit)
		   || (ENCODING == __ctype_encoding_utf8));
#endif

	/* Only the C/POSIX and UTF-8 encodings remain in this configuration,
	 * and in both a lone byte is a character only when it is 0..0x7f. */
	if (((unsigned int) c) >= 0x80) {
		return WEOF;
	}
	return c;

#endif /* !__CTYPE_HAS_8_BIT_LOCALES */
}
208libc_hidden_def(btowc)
209
210#endif
211/**********************************************************************/
212#ifdef L_wctob
213
214/* Note: We completely ignore ps in all currently supported conversions. */
215
216
/* Convert a wide character to its single-byte form.
 *
 * Returns the byte value when c maps to exactly one byte, EOF otherwise.
 */
int wctob(wint_t c)
{
#ifdef __CTYPE_HAS_8_BIT_LOCALES

	unsigned char out[MB_LEN_MAX];

	if (wcrtomb((char *) out, c, NULL) != 1) {
		return EOF;
	}
	return out[0];

#else  /*  __CTYPE_HAS_8_BIT_LOCALES */

#ifdef __UCLIBC_HAS_LOCALE__
	assert((ENCODING == __ctype_encoding_7_bit)
		   || (ENCODING == __ctype_encoding_utf8));
#endif /* __UCLIBC_HAS_LOCALE__ */

	/* Only the C/POSIX and UTF-8 encodings remain in this configuration,
	 * and neither has a one-byte form for anything outside 0..0x7f. */

	/* TODO: need unsigned version of wint_t... */
	if ((c >= 0) && (c < 0x80)) {
		return c;
	}
	return EOF;

#endif /*  __CTYPE_HAS_8_BIT_LOCALES */
}
241
242#endif
243/**********************************************************************/
244#ifdef L_mbsinit
245
246int mbsinit(const mbstate_t *ps)
247{
248 return !ps || !ps->__mask;
249}
250libc_hidden_def(mbsinit)
251
252#endif
253/**********************************************************************/
254#ifdef L_mbrlen
255
256
/* Length in bytes of the next multibyte character in s (at most n bytes).
 *
 * Thin wrapper around mbrtowc() that discards the converted character.
 * When ps is NULL a state object private to mbrlen() is used instead.
 */
size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
{
	static mbstate_t private_state;	/* Static storage, so starts zeroed. */
	mbstate_t *state = ps;

	if (state == NULL) {
		state = &private_state;
	}
	return mbrtowc(NULL, s, n, state);
}
263libc_hidden_def(mbrlen)
264
265#endif
266/**********************************************************************/
267#ifdef L_mbrtowc
268
269
270size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
271 size_t n, mbstate_t *__restrict ps)
272{
273 static mbstate_t mbstate; /* Rely on bss 0-init. */
274 wchar_t wcbuf[1];
275 const char *p;
276 size_t r;
277 char empty_string[1]; /* Avoid static to be fPIC friendly. */
278
279 if (!ps) {
280 ps = &mbstate;
281 }
282
283 if (!s) {
284 pwc = (wchar_t *) s; /* NULL */
285 empty_string[0] = 0; /* Init the empty string when necessary. */
286 s = empty_string;
287 n = 1;
288 } else if (*s == '\0') {
289 if (pwc)
290 *pwc = '\0';
291 /* According to the ISO C 89 standard this is the expected behaviour. */
292 return 0;
293 } else if (!n) {
294 /* TODO: change error code? */
295#if 0
296 return (ps->__mask && (ps->__wc == 0xffffU))
297 ? ((size_t) -1) : ((size_t) -2);
298#else
299 return 0;
300#endif
301 }
302
303 p = s;
304
305#ifdef __CTYPE_HAS_UTF_8_LOCALES
306 /* Need to do this here since mbsrtowcs doesn't allow incompletes. */
307 if (ENCODING == __ctype_encoding_utf8) {
308 if (!pwc) {
309 pwc = wcbuf;
310 }
311 r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1);
312 return (r == 1) ? (p-s) : r; /* Need to return 0 if nul char. */
313 }
314#endif
315
316#ifdef __UCLIBC_MJN3_ONLY__
317#warning TODO: This adds a trailing nul!
318#endif /* __UCLIBC_MJN3_ONLY__ */
319
320 r = mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps);
321
322 if (((ssize_t) r) >= 0) {
323 if (pwc) {
324 *pwc = *wcbuf;
325 }
326 }
327 return (size_t) r;
328}
329libc_hidden_def(mbrtowc)
330
331#endif
332/**********************************************************************/
333#ifdef L_wcrtomb
334
335
336/* Note: We completely ignore ps in all currently supported conversions. */
337/* TODO: Check for valid state anyway? */
338
/* Convert one wide character to its multibyte form.
 *
 *   s   destination buffer; when NULL an internal buffer is used and the
 *       nul wide character is converted instead of wc.
 *   wc  wide character to convert.
 *   ps  passed through unchanged to wcsnrtombs().
 *
 * Returns the number of bytes produced, with a nul character counted as one
 * byte, or the error code from wcsnrtombs().
 */
size_t wcrtomb(register char *__restrict s, wchar_t wc,
			   mbstate_t *__restrict ps)
{
#ifdef __UCLIBC_MJN3_ONLY__
#warning TODO: Should wcsnrtombs nul-terminate unconditionally?  Check glibc.
#endif /* __UCLIBC_MJN3_ONLY__ */
	char scratch[MB_LEN_MAX];
	wchar_t single[1];
	const wchar_t *in;
	size_t produced;

	if (s == NULL) {
		s = scratch;
		wc = 0;
	}

	single[0] = wc;
	in = single;

	produced = wcsnrtombs(s, &in, 1, MB_LEN_MAX, ps);

	/* wcsnrtombs() does not count a terminating nul; wcrtomb() must. */
	if (produced == 0) {
		return 1;
	}
	return produced;
}
361libc_hidden_def(wcrtomb)
362
363#endif
364/**********************************************************************/
365#ifdef L_mbsrtowcs
366
367
/* Convert a nul-terminated multibyte string to wide characters.
 *
 * Forwards to mbsnrtowcs() with no limit on the number of input bytes.
 * When ps is NULL a state object private to mbsrtowcs() is used.
 */
size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
				 size_t len, mbstate_t *__restrict ps)
{
	static mbstate_t private_state;	/* Static storage, so starts zeroed. */
	mbstate_t *state = ps;

	if (state == NULL) {
		state = &private_state;
	}
	return mbsnrtowcs(dst, src, SIZE_MAX, len, state);
}
376libc_hidden_def(mbsrtowcs)
377
378#endif
379/**********************************************************************/
380#ifdef L_wcsrtombs
381
382/* Note: We completely ignore ps in all currently supported conversions.
383
384 * TODO: Check for valid state anyway? */
385
386
/* Convert a nul-terminated wide string to multibyte form.
 *
 * Forwards to wcsnrtombs() with no limit on the number of input wide
 * characters; ps is handed on unchanged.
 */
size_t wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
				 size_t len, mbstate_t *__restrict ps)
{
	size_t produced;

	produced = wcsnrtombs(dst, src, SIZE_MAX, len, ps);
	return produced;
}
392libc_hidden_def(wcsrtombs)
393
394#endif
395/**********************************************************************/
396#ifdef L__wchar_utf8sntowcs
397
398/* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's
 * UTF-8-test.txt stress test.
400 */
401/* #define DECODER */
402
403#ifdef DECODER
404#ifndef KUHN
405#define KUHN
406#endif
407#endif
408
409size_t attribute_hidden _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
410 const char **__restrict src, size_t n,
411 mbstate_t *ps, int allow_continuation)
412{
413 register const char *s;
414 __uwchar_t mask;
415 __uwchar_t wc;
416 wchar_t wcbuf[1];
417 size_t count;
418 int incr;
419
420 s = *src;
421
422 assert(s != NULL);
423 assert(ps != NULL);
424
425 incr = 1;
426 /* NOTE: The following is an AWFUL HACK! In order to support %s in
427 * wprintf, we need to be able to compute the number of wchars needed
428 * for the mbs conversion, not to exceed the precision specified.
429 * But if dst is NULL, the return value is the length assuming a
430 * sufficiently sized buffer. So, we allow passing of (wchar_t *) ps
431 * as pwc in order to flag that we really want the length, subject
432 * to the restricted buffer size and no partial conversions.
433 * See mbsnrtowcs() as well. */
434 if (!pwc || (pwc == ((wchar_t *)ps))) {
435 if (!pwc) {
436 wn = SIZE_MAX;
437 }
438 pwc = wcbuf;
439 incr = 0;
440 }
441
442 /* This is really here only to support the glibc extension function
443 * __mbsnrtowcs which apparently returns 0 if wn == 0 without any
444 * check on the validity of the mbstate. */
445 if (!(count = wn)) {
446 return 0;
447 }
448
449 if ((mask = (__uwchar_t) ps->__mask) != 0) { /* A continuation... */
450#ifdef DECODER
451 wc = (__uwchar_t) ps->__wc;
452 if (n) {
453 goto CONTINUE;
454 }
455 goto DONE;
456#else
457 if ((wc = (__uwchar_t) ps->__wc) != 0xffffU) {
458 /* TODO: change error code here and below? */
459 if (n) {
460 goto CONTINUE;
461 }
462 goto DONE;
463 }
464 __set_errno(EILSEQ);
465 return (size_t) -1; /* We're in an error state. */
466#endif
467 }
468
469 do {
470 if (!n) {
471 goto DONE;
472 }
473 --n;
474 if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */
475 mask = 0x40;
476#ifdef __UCLIBC_MJN3_ONLY__
477#warning TODO: Fix range for 16 bit wchar_t case.
478#endif
479 if (( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) &&
480 (((unsigned char)s[-1] != 0xc0 ) && ((unsigned char)s[-1] != 0xc1 ))) {
481 goto START;
482 }
483 BAD:
484#ifdef DECODER
485 wc = 0xfffdU;
486 goto COMPLETE;
487#else
488 ps->__mask = mask;
489 ps->__wc = 0xffffU;
490 __set_errno(EILSEQ);
491 return (size_t) -1; /* Illegal start byte! */
492#endif
493
494 CONTINUE:
495 while (n) {
496 --n;
497 if ((*s & 0xc0) != 0x80) {
498 goto BAD;
499 }
500 mask <<= 5;
501 wc <<= 6;
502 wc += (*s & 0x3f); /* keep seperate for bcc (smaller code) */
503 ++s;
504 START:
505 wc &= ~(mask << 1);
506
507 if ((wc & mask) == 0) { /* Character completed. */
508 if ((mask >>= 5) == 0x40) {
509 mask += mask;
510 }
511 /* Check for invalid sequences (longer than necessary)
512 * and invalid chars. */
513 if ( (wc < mask) /* Sequence not minimal length. */
514#ifdef KUHN
515#if UTF_8_MAX_LEN == 3
516#error broken since mask can overflow!!
517 /* For plane 0, these are the only defined values.*/
518 || (wc > 0xfffdU)
519#else
520 /* Note that we don't need to worry about exceeding */
521 /* 31 bits as that is the most that UTF-8 provides. */
522 || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
523#endif
524 || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
525#endif /* KUHN */
526 ) {
527 goto BAD;
528 }
529 goto COMPLETE;
530 }
531 }
532 /* Character potentially valid but incomplete. */
533 if (!allow_continuation) {
534 if (count != wn) {
535 return 0;
536 }
537 /* NOTE: The following can fail if you allow and then disallow
538 * continuation!!! */
539#if UTF_8_MAX_LEN == 3
540#error broken since mask can overflow!!
541#endif
542 /* Need to back up... */
543 do {
544 --s;
545 } while ((mask >>= 5) >= 0x40);
546 goto DONE;
547 }
548 ps->__mask = (wchar_t) mask;
549 ps->__wc = (wchar_t) wc;
550 *src = s;
551 return (size_t) -2;
552 }
553 COMPLETE:
554 *pwc = wc;
555 pwc += incr;
556 }
557#ifdef DECODER
558 while (--count);
559#else
560 while (wc && --count);
561
562 if (!wc) {
563 s = NULL;
564 }
565#endif
566
567 DONE:
568 /* ps->__wc is irrelavent here. */
569 ps->__mask = 0;
570 if (pwc != wcbuf) {
571 *src = s;
572 }
573
574 return wn - count;
575}
576
577#endif
578/**********************************************************************/
579#ifdef L__wchar_wcsntoutf8s
580
581size_t attribute_hidden _wchar_wcsntoutf8s(char *__restrict s, size_t n,
582 const wchar_t **__restrict src, size_t wn)
583{
584 register char *p;
585 size_t len, t;
586 __uwchar_t wc;
587 const __uwchar_t *swc;
588 int store;
589 char buf[MB_LEN_MAX];
590 char m;
591
592 store = 1;
593 /* NOTE: The following is an AWFUL HACK! In order to support %ls in
594 * printf, we need to be able to compute the number of bytes needed
595 * for the mbs conversion, not to exceed the precision specified.
596 * But if dst is NULL, the return value is the length assuming a
597 * sufficiently sized buffer. So, we allow passing of (char *) src
598 * as dst in order to flag that we really want the length, subject
599 * to the restricted buffer size and no partial conversions.
600 * See wcsnrtombs() as well. */
601 if (!s || (s == ((char *) src))) {
602 if (!s) {
603 n = SIZE_MAX;
604 }
605 s = buf;
606 store = 0;
607 }
608
609 t = n;
610 swc = (const __uwchar_t *) *src;
611
612 assert(swc != NULL);
613
614 while (wn && t) {
615 wc = *swc;
616
617 *s = wc;
618 len = 1;
619
620 if (wc >= 0x80) {
621#ifdef KUHN
622 if (
623#if UTF_8_MAX_LEN == 3
624 /* For plane 0, these are the only defined values.*/
625 /* Note that we don't need to worry about exceeding */
626 /* 31 bits as that is the most that UTF-8 provides. */
627 (wc > 0xfffdU)
628#else
629 /* UTF_8_MAX_LEN == 6 */
630 (wc > 0x7fffffffUL)
631 || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
632#endif
633 || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
634 ) {
635 __set_errno(EILSEQ);
636 return (size_t) -1;
637 }
638#else /* KUHN */
639#if UTF_8_MAX_LEN != 3
640 if (wc > 0x7fffffffUL) { /* Value too large. */
641 __set_errno(EILSEQ);
642 return (size_t) -1;
643 }
644#endif
645#endif /* KUHN */
646
647 wc >>= 1;
648 p = s;
649 do {
650 ++p;
651 } while (wc >>= 5);
652 wc = *swc;
653 if ((len = p - s) > t) { /* Not enough space. */
654 break;
655 }
656
657 m = 0x80;
658 while( p>s ) {
659 m = (m >> 1) | 0x80;
660 *--p = (wc & 0x3f) | 0x80;
661 wc >>= 6;
662 }
663 *s |= (m << 1);
664 } else if (wc == 0) { /* End of string. */
665 swc = NULL;
666 break;
667 }
668
669 ++swc;
670 --wn;
671 t -= len;
672 if (store) {
673 s += len;
674 }
675 }
676
677 if (store) {
678 *src = (const wchar_t *) swc;
679 }
680
681 return n - t;
682}
683
684
685#endif
686/**********************************************************************/
687#ifdef L_mbsnrtowcs
688
689/* WARNING: We treat len as SIZE_MAX when dst is NULL! */
690
/* Convert at most NMC bytes of a multibyte string to at most len wide
 * characters.
 *
 *   dst  output buffer.  NULL means "count only" and len is then treated
 *        as SIZE_MAX.  Passing (wchar_t *) ps means "count only, but
 *        honour len".
 *   src  in/out pointer to the input bytes; set to NULL once the
 *        terminating nul has been converted (only when really storing).
 *   ps   conversion state; NULL selects a state private to this function.
 *
 * Returns the number of wide characters produced, not counting a
 * terminating nul, or (size_t) -1 with errno == EILSEQ.
 */
size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
				  size_t NMC, size_t len, mbstate_t *__restrict ps)
{
	static mbstate_t private_state;	/* Static storage, so starts zeroed. */
	wchar_t sink[1];
	const char *in;
	size_t left;
	int step;

	if (ps == NULL) {
		ps = &private_state;
	}

#ifdef __CTYPE_HAS_UTF_8_LOCALES
	if (ENCODING == __ctype_encoding_utf8) {
		size_t r;

		r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1);
		/* An incomplete trailing sequence is reported as "nothing
		 * converted" rather than as (size_t) -2. */
		if (r == (size_t) -2) {
			return 0;
		}
		return r;
	}
#endif

	step = 1;
	/* NOTE: AWFUL HACK (kept from the original).  To support %s in wprintf
	 * the caller must be able to ask for a length that still respects a
	 * maximum count.  A NULL dst already means "length for an unlimited
	 * buffer", so dst == (wchar_t *) ps is used as the flag for "length
	 * only, limited to len, no partial conversions".
	 * _wchar_utf8sntowcs() uses the same convention. */
	if ((dst == NULL) || (dst == ((wchar_t *) ps))) {
		if (dst == NULL) {
			len = SIZE_MAX;
		}
		dst = sink;			/* Every store lands in one scratch slot. */
		step = 0;
	}

	/* Everything below is a single-byte encoding, so the byte limit is
	 * also a limit on the number of characters. */
	if (len > NMC) {
		len = NMC;
	}

	left = len;
	in = *src;

#ifdef __CTYPE_HAS_8_BIT_LOCALES
	if (ENCODING == __ctype_encoding_8_bit) {
		wchar_t wc;

		while (left) {
			wc = (unsigned char) *in;
			if (wc >= 0x80) {	/* Upper half: look it up in the locale. */
				wc -= 0x80;
				wc = __UCLIBC_CURLOCALE->tbl8c2wc[
						  (__UCLIBC_CURLOCALE->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
						   << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
				if (wc == 0) {	/* No mapping for this byte. */
					goto BAD;
				}
			}
			*dst = wc;
			if (wc == 0) {		/* Terminating nul converted. */
				in = NULL;
				break;
			}
			dst += step;
			++in;
			--left;
		}
		if (dst != sink) {
			*src = in;
		}
		return len - left;
	}
#endif

#ifdef __UCLIBC_HAS_LOCALE__
	assert(ENCODING == __ctype_encoding_7_bit);
#endif

	while (left) {
		*dst = (unsigned char) *in;
		if (*dst == 0) {		/* Terminating nul converted. */
			in = NULL;
			break;
		}
		if (*dst >= 0x80) {		/* Not valid in the 7-bit encoding. */
#ifdef __CTYPE_HAS_8_BIT_LOCALES
		BAD:
#endif
			__set_errno(EILSEQ);
			return (size_t) -1;
		}
		++in;
		dst += step;
		--left;
	}
	if (dst != sink) {
		*src = in;
	}
	return len - left;
}
789libc_hidden_def(mbsnrtowcs)
790
791#endif
792/**********************************************************************/
793#ifdef L_wcsnrtombs
794
795/* WARNING: We treat len as SIZE_MAX when dst is NULL! */
796
797/* Note: We completely ignore ps in all currently supported conversions.
798 * TODO: Check for valid state anyway? */
799
800size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
801 size_t NWC, size_t len, mbstate_t *__restrict ps)
802{
803 const __uwchar_t *s;
804 size_t count;
805 int incr;
806 char buf[MB_LEN_MAX];
807
808#ifdef __CTYPE_HAS_UTF_8_LOCALES
809 if (ENCODING == __ctype_encoding_utf8) {
810 return _wchar_wcsntoutf8s(dst, len, src, NWC);
811 }
812#endif /* __CTYPE_HAS_UTF_8_LOCALES */
813
814 incr = 1;
815 /* NOTE: The following is an AWFUL HACK! In order to support %ls in
816 * printf, we need to be able to compute the number of bytes needed
817 * for the mbs conversion, not to exceed the precision specified.
818 * But if dst is NULL, the return value is the length assuming a
819 * sufficiently sized buffer. So, we allow passing of (char *) src
820 * as dst in order to flag that we really want the length, subject
821 * to the restricted buffer size and no partial conversions.
822 * See _wchar_wcsntoutf8s() as well. */
823 if (!dst || (dst == ((char *) src))) {
824 if (!dst) {
825 len = SIZE_MAX;
826 }
827 dst = buf;
828 incr = 0;
829 }
830
831 /* Since all the following encodings are single-byte encodings... */
832 if (len > NWC) {
833 len = NWC;
834 }
835
836 count = len;
837 s = (const __uwchar_t *) *src;
838
839#ifdef __CTYPE_HAS_8_BIT_LOCALES
840 if (ENCODING == __ctype_encoding_8_bit) {
841 __uwchar_t wc;
842 __uwchar_t u;
843 while (count) {
844 if ((wc = *s) <= 0x7f) {
845 if (!(*dst = (unsigned char) wc)) {
846 s = NULL;
847 break;
848 }
849 } else {
850 u = 0;
851 if (wc <= Cwc2c_DOMAIN_MAX) {
852 u = __UCLIBC_CURLOCALE->idx8wc2c[wc >> (Cwc2c_TI_SHIFT
853 + Cwc2c_TT_SHIFT)];
854 u = __UCLIBC_CURLOCALE->tbl8wc2c[(u << Cwc2c_TI_SHIFT)
855 + ((wc >> Cwc2c_TT_SHIFT)
856 & ((1 << Cwc2c_TI_SHIFT)-1))];
857 u = __UCLIBC_CURLOCALE->tbl8wc2c[Cwc2c_TI_LEN
858 + (u << Cwc2c_TT_SHIFT)
859 + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
860 }
861
862#ifdef __WCHAR_REPLACEMENT_CHAR
863 *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR );
864#else /* __WCHAR_REPLACEMENT_CHAR */
865 if (!u) {
866 goto BAD;
867 }
868 *dst = (unsigned char) u;
869#endif /* __WCHAR_REPLACEMENT_CHAR */
870 }
871 ++s;
872 dst += incr;
873 --count;
874 }
875 if (dst != buf) {
876 *src = (const wchar_t *) s;
877 }
878 return len - count;
879 }
880#endif /* __CTYPE_HAS_8_BIT_LOCALES */
881
882#ifdef __UCLIBC_HAS_LOCALE__
883 assert(ENCODING == __ctype_encoding_7_bit);
884#endif
885
886 while (count) {
887 if (*s >= 0x80) {
888#if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR)
889 BAD:
890#endif
891 __set_errno(EILSEQ);
892 return (size_t) -1;
893 }
894 if ((*dst = (unsigned char) *s) == 0) {
895 s = NULL;
896 break;
897 }
898 ++s;
899 dst += incr;
900 --count;
901 }
902 if (dst != buf) {
903 *src = (const wchar_t *) s;
904 }
905 return len - count;
906}
907libc_hidden_def(wcsnrtombs)
908
909#endif
910/**********************************************************************/
911#ifdef L_wcswidth
912
913
914#ifdef __UCLIBC_MJN3_ONLY__
915#warning REMINDER: If we start doing translit, wcwidth and wcswidth will need updating.
916#warning TODO: Update wcwidth to match latest by Kuhn.
917#endif
918
919#if defined(__UCLIBC_HAS_LOCALE__) && \
920( defined(__CTYPE_HAS_8_BIT_LOCALES) || defined(__CTYPE_HAS_UTF_8_LOCALES) )
921
/* Page index for the width tables used by wcswidth().
 *
 * Indexed by the high byte (wc >> 8) of a 16-bit wide character.  Entry [p]
 * is the first slot of page p in new_tbl[]/new_wtbl[] and entry [p + 1] is
 * the slot just past it, hence 257 entries.  The trailing comment on each
 * row is the index of its first element.
 */
static const unsigned char new_idx[] = {
	0,    5,   5,   6,  10,  15,  28,  39,	/* 0x00 */
	48,  48,  71,  94, 113, 128, 139, 154,	/* 0x08 */
	175, 186, 188, 188, 188, 188, 188, 188,	/* 0x10 */
	203, 208, 208, 208, 208, 208, 208, 208,	/* 0x18 */
	208, 219, 219, 219, 222, 222, 222, 222,	/* 0x20 */
	222, 222, 222, 222, 222, 222, 222, 224,	/* 0x28 */
	224, 231, 231, 231, 231, 231, 231, 231,	/* 0x30 */
	231, 231, 231, 231, 231, 231, 231, 231,	/* 0x38 */
	231, 231, 231, 231, 231, 231, 231, 231,	/* 0x40 */
	231, 231, 231, 231, 231, 231, 231, 231,	/* 0x48 */
	231, 231, 231, 231, 231, 231, 231, 231,	/* 0x50 */
	231, 231, 231, 231, 231, 231, 231, 231,	/* 0x58 */
	231, 231, 231, 231, 231, 231, 231, 231,	/* 0x60 */
	231, 231, 231, 231, 231, 231, 231, 231,	/* 0x68 */
	231, 231, 231, 231, 231, 231, 231, 231,	/* 0x70 */
	231, 231, 231, 231, 231, 231, 231, 231,	/* 0x78 */
	231, 231, 231, 231, 231, 231, 231, 231,	/* 0x80 */
	231, 231, 231, 231, 231, 231, 231, 231,	/* 0x88 */
	231, 231, 231, 231, 231, 231, 231, 231,	/* 0x90 */
	231, 231, 231, 231, 231, 231, 231, 231,	/* 0x98 */
	231, 231, 231, 231, 231, 233, 233, 233,	/* 0xa0 */
	233, 233, 233, 233, 234, 234, 234, 234,	/* 0xa8 */
	234, 234, 234, 234, 234, 234, 234, 234,	/* 0xb0 */
	234, 234, 234, 234, 234, 234, 234, 234,	/* 0xb8 */
	234, 234, 234, 234, 234, 234, 234, 234,	/* 0xc0 */
	234, 234, 234, 234, 234, 234, 234, 234,	/* 0xc8 */
	236, 236, 236, 236, 236, 236, 236, 236,	/* 0xd0 */
	236, 236, 236, 236, 236, 236, 236, 236,	/* 0xd8 */
	236, 236, 236, 236, 236, 236, 236, 236,	/* 0xe0 */
	236, 236, 236, 236, 236, 236, 236, 236,	/* 0xe8 */
	236, 236, 236, 236, 236, 236, 236, 236,	/* 0xf0 */
	236, 237, 237, 238, 241, 241, 242, 249,	/* 0xf8 */
	255,					/* 0x100: end marker */
};
957
/* Range boundaries for the width tables used by wcswidth().
 *
 * Each page delimited by new_idx[] holds low-byte (wc & 0xff) values;
 * wcswidth() binary-searches a page for the last entry that does not exceed
 * the character's low byte and takes the width from the matching slot of
 * new_wtbl[].  The trailing comment on each row is the index of its first
 * element.
 */
static const unsigned char new_tbl[] = {
	0x00, 0x01, 0x20, 0x7f, 0xa0, 0x00, 0x00, 0x50,	/*   0 */
	0x60, 0x70, 0x00, 0x83, 0x87, 0x88, 0x8a, 0x00,	/*   8 */
	0x91, 0xa2, 0xa3, 0xba, 0xbb, 0xbe, 0xbf, 0xc0,	/*  16 */
	0xc1, 0xc3, 0xc4, 0xc5, 0x00, 0x4b, 0x56, 0x70,	/*  24 */
	0x71, 0xd6, 0xe5, 0xe7, 0xe9, 0xea, 0xee, 0x00,	/*  32 */
	0x0f, 0x10, 0x11, 0x12, 0x30, 0x4b, 0xa6, 0xb1,	/*  40 */
	0x00, 0x01, 0x03, 0x3c, 0x3d, 0x41, 0x49, 0x4d,	/*  48 */
	0x4e, 0x51, 0x55, 0x62, 0x64, 0x81, 0x82, 0xbc,	/*  56 */
	0xbd, 0xc1, 0xc5, 0xcd, 0xce, 0xe2, 0xe4, 0x00,	/*  64 */
	0x02, 0x03, 0x3c, 0x3d, 0x41, 0x43, 0x47, 0x49,	/*  72 */
	0x4b, 0x4e, 0x70, 0x72, 0x81, 0x83, 0xbc, 0xbd,	/*  80 */
	0xc1, 0xc6, 0xc7, 0xc9, 0xcd, 0xce, 0x00, 0x01,	/*  88 */
	0x02, 0x3c, 0x3d, 0x3f, 0x40, 0x41, 0x44, 0x4d,	/*  96 */
	0x4e, 0x56, 0x57, 0x82, 0x83, 0xc0, 0xc1, 0xcd,	/* 104 */
	0xce, 0x00, 0x3e, 0x41, 0x46, 0x49, 0x4a, 0x4e,	/* 112 */
	0x55, 0x57, 0xbf, 0xc0, 0xc6, 0xc7, 0xcc, 0xce,	/* 120 */
	0x00, 0x41, 0x44, 0x4d, 0x4e, 0xca, 0xcb, 0xd2,	/* 128 */
	0xd5, 0xd6, 0xd7, 0x00, 0x31, 0x32, 0x34, 0x3b,	/* 136 */
	0x47, 0x4f, 0xb1, 0xb2, 0xb4, 0xba, 0xbb, 0xbd,	/* 144 */
	0xc8, 0xce, 0x00, 0x18, 0x1a, 0x35, 0x36, 0x37,	/* 152 */
	0x38, 0x39, 0x3a, 0x71, 0x7f, 0x80, 0x85, 0x86,	/* 160 */
	0x88, 0x90, 0x98, 0x99, 0xbd, 0xc6, 0xc7, 0x00,	/* 168 */
	0x2d, 0x31, 0x32, 0x33, 0x36, 0x38, 0x39, 0x3a,	/* 176 */
	0x58, 0x5a, 0x00, 0x60, 0x00, 0x12, 0x15, 0x32,	/* 184 */
	0x35, 0x52, 0x54, 0x72, 0x74, 0xb7, 0xbe, 0xc6,	/* 192 */
	0xc7, 0xc9, 0xd4, 0x00, 0x0b, 0x0f, 0xa9, 0xaa,	/* 200 */
	0x00, 0x0b, 0x10, 0x2a, 0x2f, 0x60, 0x64, 0x6a,	/* 208 */
	0x70, 0xd0, 0xeb, 0x00, 0x29, 0x2b, 0x00, 0x80,	/* 216 */
	0x00, 0x2a, 0x30, 0x3f, 0x40, 0x99, 0x9b, 0x00,	/* 224 */
	0xd0, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1e,	/* 232 */
	0x1f, 0x00, 0x00, 0x10, 0x20, 0x24, 0x30, 0x70,	/* 240 */
	0xff, 0x00, 0x61, 0xe0, 0xe7, 0xf9, 0xfc,	/* 248 */
};
992
/* Column widths for the width tables used by wcswidth().
 *
 * Parallel to new_tbl[]: slot k holds the width that wcswidth() adds for a
 * character whose binary search ends on new_tbl[k].  The trailing comment
 * on each row is the index of its first element.
 */
static const signed char new_wtbl[] = {
	0,   -1,    1,   -1,    1,    1,    0,    1,	/*   0 */
	0,    1,    1,    0,    1,    0,    1,    1,	/*   8 */
	0,    1,    0,    1,    0,    1,    0,    1,	/*  16 */
	0,    1,    0,    1,    1,    0,    1,    0,	/*  24 */
	1,    0,    1,    0,    1,    0,    1,    1,	/*  32 */
	0,    1,    0,    1,    0,    1,    0,    1,	/*  40 */
	1,    0,    1,    0,    1,    0,    1,    0,	/*  48 */
	1,    0,    1,    0,    1,    0,    1,    0,	/*  56 */
	1,    0,    1,    0,    1,    0,    1,    1,	/*  64 */
	0,    1,    0,    1,    0,    1,    0,    1,	/*  72 */
	0,    1,    0,    1,    0,    1,    0,    1,	/*  80 */
	0,    1,    0,    1,    0,    1,    1,    0,	/*  88 */
	1,    0,    1,    0,    1,    0,    1,    0,	/*  96 */
	1,    0,    1,    0,    1,    0,    1,    0,	/* 104 */
	1,    1,    0,    1,    0,    1,    0,    1,	/* 112 */
	0,    1,    0,    1,    0,    1,    0,    1,	/* 120 */
	1,    0,    1,    0,    1,    0,    1,    0,	/* 128 */
	1,    0,    1,    1,    0,    1,    0,    1,	/* 136 */
	0,    1,    0,    1,    0,    1,    0,    1,	/* 144 */
	0,    1,    1,    0,    1,    0,    1,    0,	/* 152 */
	1,    0,    1,    0,    1,    0,    1,    0,	/* 160 */
	1,    0,    1,    0,    1,    0,    1,    1,	/* 168 */
	0,    1,    0,    1,    0,    1,    0,    1,	/* 176 */
	0,    1,    2,    0,    1,    0,    1,    0,	/* 184 */
	1,    0,    1,    0,    1,    0,    1,    0,	/* 192 */
	1,    0,    1,    1,    0,    1,    0,    1,	/* 200 */
	1,    0,    1,    0,    1,    0,    1,    0,	/* 208 */
	1,    0,    1,    1,    2,    1,    1,    2,	/* 216 */
	2,    0,    2,    1,    2,    0,    2,    2,	/* 224 */
	1,    1,    2,    1,    1,    2,    1,    0,	/* 232 */
	1,    1,    0,    1,    0,    1,    2,    1,	/* 240 */
	0,    2,    1,    2,    1,    0,    1,		/* 248 */
};
1027
1028
1029int wcswidth(const wchar_t *pwcs, size_t n)
1030{
1031 int h, l, m, count;
1032 wchar_t wc;
1033 unsigned char b;
1034
1035 if (ENCODING == __ctype_encoding_7_bit) {
1036 size_t i;
1037
1038 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1039 if (pwcs[i] != (pwcs[i] & 0x7f)) {
1040 return -1;
1041 }
1042 }
1043 }
1044#ifdef __CTYPE_HAS_8_BIT_LOCALES
1045 else if (ENCODING == __ctype_encoding_8_bit) {
1046 mbstate_t mbstate;
1047
1048 mbstate.__mask = 0; /* Initialize the mbstate. */
1049 if (wcsnrtombs(NULL, &pwcs, n, SIZE_MAX, &mbstate) == ((size_t) - 1)) {
1050 return -1;
1051 }
1052 }
1053#endif /* __CTYPE_HAS_8_BIT_LOCALES */
1054#if defined(__CTYPE_HAS_UTF_8_LOCALES) && defined(KUHN)
1055 /* For stricter handling of allowed unicode values... see comments above. */
1056 else if (ENCODING == __ctype_encoding_utf8) {
1057 size_t i;
1058
1059 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1060 if ( (((__uwchar_t)((pwcs[i]) - 0xfffeU)) < 2)
1061 || (((__uwchar_t)((pwcs[i]) - 0xd800U)) < (0xe000U - 0xd800U))
1062 ) {
1063 return -1;
1064 }
1065 }
1066 }
1067#endif /* __CTYPE_HAS_UTF_8_LOCALES */
1068
1069 for (count = 0 ; n && (wc = *pwcs++) ; n--) {
1070 if (wc <= 0xff) {
1071 /* If we're here, wc != 0. */
1072 if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
1073 return -1;
1074 }
1075 ++count;
1076 continue;
1077 }
1078 if (((unsigned int) wc) <= 0xffff) {
1079 b = wc & 0xff;
1080 h = (wc >> 8);
1081 l = new_idx[h];
1082 h = new_idx[h+1];
1083 while ((m = (l+h) >> 1) != l) {
1084 if (b >= new_tbl[m]) {
1085 l = m;
1086 } else { /* wc < tbl[m] */
1087 h = m;
1088 }
1089 }
1090 count += new_wtbl[l]; /* none should be -1. */
1091 continue;
1092 }
1093
1094 /* Redo this to minimize average number of compares?*/
1095 if (wc >= 0x1d167) {
1096 if (wc <= 0x1d1ad) {
1097 if ((wc <= 0x1d169
1098 || (wc >= 0x1d173
1099 && (wc <= 0x1d182
1100 || (wc >= 0x1d185
1101 && (wc <= 0x1d18b
1102 || (wc >= 0x1d1aa))))))
1103 ) {
1104 continue;
1105 }
1106 } else if (((wc >= 0xe0020) && (wc <= 0xe007f)) || (wc == 0xe0001)) {
1107 continue;
1108 } else if ((wc >= 0x20000) && (wc <= 0x2ffff)) {
1109 ++count; /* need 2.. add one here */
1110 }
1111#if (WCHAR_MAX > 0x7fffffffL)
1112 else if (wc > 0x7fffffffL) {
1113 return -1;
1114 }
1115#endif /* (WCHAR_MAX > 0x7fffffffL) */
1116 }
1117
1118 ++count;
1119 }
1120
1121 return count;
1122}
1123
1124#else /* __UCLIBC_HAS_LOCALE__ */
1125
/* Number of display columns needed for at most n wide characters of pwcs
 * (build without multi-encoding locale support).
 *
 * Scanning stops at the first nul.  Only the printable 7-bit characters
 * 0x20..0x7e are accepted, each one column wide; any other character in the
 * scanned range makes the result -1.
 */
int wcswidth(const wchar_t *pwcs, size_t n)
{
	size_t pos;
	int columns = 0;

	for (pos = 0; pos < n; pos++) {
		const wchar_t ch = pwcs[pos];

		if (ch == 0) {
			break;
		}
		/* Rejects controls, DEL, everything above 0x7f and negatives. */
		if ((ch < 0x20) || (ch > 0x7e)) {
			return -1;
		}
		++columns;
	}

	return columns;
}
1153
1154#endif /* __UCLIBC_HAS_LOCALE__ */
1155
1156libc_hidden_def(wcswidth)
1157
1158#endif
1159/**********************************************************************/
1160#ifdef L_wcwidth
1161
1162
/* Column width of a single wide character.
 *
 * Treats wc as a one-character string and returns whatever wcswidth()
 * reports for it.
 */
int wcwidth(wchar_t wc)
{
	const wchar_t *single = &wc;

	return wcswidth(single, 1);
}
1167
1168#endif
1169/**********************************************************************/
1170
1171
/* Conversion descriptor allocated by iconv_open() and handed back to the
 * caller as an opaque iconv_t. */
typedef struct {
	/* Multibyte conversion states for the output and input side. */
	mbstate_t tostate, fromstate;
	/* Codeset codes currently in effect.  fromcodeset can have its endian
	 * bit toggled by iconv() after a byte-swapped BOM is read. */
	int tocodeset, fromcodeset;
	/* Nonzero while a byte-order mark is still to be looked for on input
	 * (frombom) or written on output (tobom). */
	int frombom, tobom;
	/* Values recorded by iconv_open(); iconv() restores them when asked
	 * to reset the conversion state. */
	int fromcodeset0, frombom0, tobom0;
	int skip_invalid_input;		/* To support iconv -c option. */
} _UC_iconv_t;
1184
/* For the multibyte codesets (codeset codes 0xe0 and above):
 *   bit 0 means swap endian
 *   bit 1 means 2 byte units
 *   bit 2 means 4 byte units
 *   bit 4 means the encoding uses a byte-order mark (BOM)
 */
1191
1192#if defined L_iconv && defined _LIBC
1193/* Used externally only by iconv utility */
1194extern const unsigned char __iconv_codesets[];
1195libc_hidden_proto(__iconv_codesets)
1196#endif
1197
1198#if defined L_iconv || defined L_iconv_main
/* Names of the codesets iconv knows without consulting the locale data.
 *
 * Every entry is laid out as
 *     <entry length> <codeset code> <name> <nul>
 * where the length byte counts the whole entry, so adding it to the
 * address of an entry gives the address of the next one.  A length byte
 * of zero ends the table.
 */
#define ICONV_CS(len, code, name)	len code name "\x00"

const unsigned char __iconv_codesets[] =
	ICONV_CS("\x0a", "\xe0", "WCHAR_T")	/* superset of UCS-4 but platform-endian */
#if __BYTE_ORDER == __BIG_ENDIAN
	ICONV_CS("\x08", "\xec", "UCS-4")	/* always BE */
	ICONV_CS("\x0a", "\xec", "UCS-4BE")
	ICONV_CS("\x0a", "\xed", "UCS-4LE")
	ICONV_CS("\x09", "\xe4", "UTF-32")	/* platform endian with BOM */
	ICONV_CS("\x0b", "\xe4", "UTF-32BE")
	ICONV_CS("\x0b", "\xe5", "UTF-32LE")
	ICONV_CS("\x08", "\xe2", "UCS-2")	/* always BE */
	ICONV_CS("\x0a", "\xe2", "UCS-2BE")
	ICONV_CS("\x0a", "\xe3", "UCS-2LE")
	ICONV_CS("\x09", "\xea", "UTF-16")	/* platform endian with BOM */
	ICONV_CS("\x0b", "\xea", "UTF-16BE")
	ICONV_CS("\x0b", "\xeb", "UTF-16LE")
#elif __BYTE_ORDER == __LITTLE_ENDIAN
	ICONV_CS("\x08", "\xed", "UCS-4")	/* always BE */
	ICONV_CS("\x0a", "\xed", "UCS-4BE")
	ICONV_CS("\x0a", "\xec", "UCS-4LE")
	ICONV_CS("\x09", "\xf4", "UTF-32")	/* platform endian with BOM */
	ICONV_CS("\x0b", "\xe5", "UTF-32BE")
	ICONV_CS("\x0b", "\xe4", "UTF-32LE")
	ICONV_CS("\x08", "\xe3", "UCS-2")	/* always BE */
	ICONV_CS("\x0a", "\xe3", "UCS-2BE")
	ICONV_CS("\x0a", "\xe2", "UCS-2LE")
	ICONV_CS("\x09", "\xfa", "UTF-16")	/* platform endian with BOM */
	ICONV_CS("\x0b", "\xeb", "UTF-16BE")
	ICONV_CS("\x0b", "\xea", "UTF-16LE")
#endif
	ICONV_CS("\x08", "\x02", "UTF-8")
	ICONV_CS("\x0b", "\x01", "US-ASCII")
	/* Must be last! (special case to save a nul): this entry is written
	 * without its own nul, so the string literal's implicit terminator
	 * ends both the name and the table. */
	"\x07" "\x01" "ASCII";

#undef ICONV_CS
1231#endif
1232#if defined L_iconv && defined _LIBC
1233libc_hidden_data_def(__iconv_codesets)
1234#endif
1235
1236
1237#ifdef L_iconv
1238
1239#include <iconv.h>
1240#include <string.h>
1241#include <endian.h>
1242#include <byteswap.h>
1243
1244#if (__BYTE_ORDER != __BIG_ENDIAN) && (__BYTE_ORDER != __LITTLE_ENDIAN)
1245#error unsupported endianness for iconv
1246#endif
1247
1248#ifndef __CTYPE_HAS_8_BIT_LOCALES
1249#error currently iconv requires 8 bit locales
1250#endif
1251#ifndef __CTYPE_HAS_UTF_8_LOCALES
1252#error currently iconv requires UTF-8 locales
1253#endif
1254
1255
/* Codeset codes that iconv() tests for by name.
 *
 * Codes of IC_MULTIBYTE and above denote fixed-unit (2 or 4 byte)
 * encodings.  The byte-order dependent values below are the codes that
 * the codeset name table gives to the big-endian variant of each
 * encoding on this host.
 */
enum {
	IC_ASCII = 1,
	IC_UTF_8 = 2,
	IC_WCHAR_T = 0xe0,
	IC_MULTIBYTE = 0xe0,	/* lowest multibyte code */
#if __BYTE_ORDER == __BIG_ENDIAN
	IC_UCS_2 = 0xe2,
	IC_UTF_32 = 0xe4,
	IC_UTF_16 = 0xea,
	IC_UCS_4 = 0xec
#else
	IC_UCS_2 = 0xe3,
	IC_UTF_32 = 0xe5,
	IC_UTF_16 = 0xeb,
	IC_UCS_4 = 0xed
#endif
};
1273
1274
1275static int find_codeset(const char *name)
1276{
1277 const unsigned char *s;
1278 int codeset;
1279
1280 for (s = __iconv_codesets; *s; s += *s) {
1281 if (!strcasecmp((char*) (s + 2), name)) {
1282 return s[1];
1283 }
1284 }
1285
1286 /* The following is ripped from find_locale in locale.c. */
1287
1288 /* TODO: maybe CODESET_LIST + *s ??? */
1289 /* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */
1290 codeset = 2;
1291 s = (const unsigned char *) __LOCALE_DATA_CODESET_LIST;
1292 do {
1293 ++codeset; /* Increment codeset first. */
1294 if (!strcasecmp(__LOCALE_DATA_CODESET_LIST+*s, name)) {
1295 return codeset;
1296 }
1297 } while (*++s);
1298
1299 return 0; /* No matching codeset! */
1300}
1301
1302iconv_t weak_function iconv_open(const char *tocode, const char *fromcode)
1303{
1304 register _UC_iconv_t *px;
1305 int tocodeset, fromcodeset;
1306
1307 if (((tocodeset = find_codeset(tocode)) != 0)
1308 && ((fromcodeset = find_codeset(fromcode)) != 0)) {
1309 if ((px = malloc(sizeof(_UC_iconv_t))) != NULL) {
1310 px->tocodeset = tocodeset;
1311 px->tobom0 = px->tobom = (tocodeset >= 0xe0) ? (tocodeset & 0x10) >> 4 : 0;
1312 px->fromcodeset0 = px->fromcodeset = fromcodeset;
1313 px->frombom0 = px->frombom = (fromcodeset >= 0xe0) ? (fromcodeset & 0x10) >> 4 : 0;
1314 px->skip_invalid_input = px->tostate.__mask
1315 = px->fromstate.__mask = 0;
1316 return (iconv_t) px;
1317 }
1318 } else {
1319 __set_errno(EINVAL);
1320 }
1321 return (iconv_t)(-1);
1322}
1323
1324int weak_function iconv_close(iconv_t cd)
1325{
1326 free(cd);
1327
1328 return 0;
1329}
1330
1331size_t weak_function iconv(iconv_t cd, char **__restrict inbuf,
1332 size_t *__restrict inbytesleft,
1333 char **__restrict outbuf,
1334 size_t *__restrict outbytesleft)
1335{
1336 _UC_iconv_t *px = (_UC_iconv_t *) cd;
1337 size_t nrcount, r;
1338 wchar_t wc, wc2;
1339 int inci, inco;
1340
1341 assert(px != (_UC_iconv_t *)(-1));
1342 assert(sizeof(wchar_t) == 4);
1343
1344 if (!inbuf || !*inbuf) { /* Need to reinitialze conversion state. */
1345 /* Note: For shift-state encodings we possibly need to output the
1346 * shift sequence to return to initial state! */
1347 if ((px->fromcodeset & 0xf0) == 0xe0) {
1348 }
1349 px->tostate.__mask = px->fromstate.__mask = 0;
1350 px->fromcodeset = px->fromcodeset0;
1351 px->tobom = px->tobom0;
1352 px->frombom = px->frombom0;
1353 return 0;
1354 }
1355
1356 nrcount = 0;
1357 while (*inbytesleft) {
1358 if (!*outbytesleft) {
1359 TOO_BIG:
1360 __set_errno(E2BIG);
1361 return (size_t) -1;
1362 }
1363
1364 inci = inco = 1;
1365 if (px->fromcodeset >= IC_MULTIBYTE) {
1366 inci = (px->fromcodeset == IC_WCHAR_T) ? 4: (px->fromcodeset & 6);
1367 if (*inbytesleft < inci) goto INVALID;
1368 wc = (((unsigned int)((unsigned char)((*inbuf)[0]))) << 8)
1369 + ((unsigned char)((*inbuf)[1]));
1370 if (inci == 4) {
1371 wc = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1372 + ((unsigned char)((*inbuf)[3])) + (wc << 16);
1373 if (!(px->fromcodeset & 1)) wc = bswap_32(wc);
1374 } else {
1375 if (!(px->fromcodeset & 1)) wc = bswap_16(wc);
1376 if (((px->fromcodeset & IC_UTF_16) == IC_UTF_16)
1377 && (((__uwchar_t)(wc - 0xd800U)) < (0xdc00U - 0xd800U))
1378 ) { /* surrogate */
1379 wc =- 0xd800U;
1380 if (*inbytesleft < 4) goto INVALID;
1381 wc2 = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1382 + ((unsigned char)((*inbuf)[3]));
1383 if (!(px->fromcodeset & 1)) wc = bswap_16(wc2);
1384 if (((__uwchar_t)(wc2 -= 0xdc00U)) < (0xe0000U - 0xdc00U)) {
1385 goto ILLEGAL;
1386 }
1387 inci = 4; /* Change inci here in case skipping illegals. */
1388 wc = 0x10000UL + (wc << 10) + wc2;
1389 }
1390 }
1391
1392 if (px->frombom) {
1393 px->frombom = 0;
1394 if ((wc == 0xfeffU)
1395 || (wc == ((inci == 4)
1396 ? (((wchar_t) 0xfffe0000UL))
1397 : ((wchar_t)(0xfffeUL))))
1398 ) {
1399 if (wc != 0xfeffU) {
1400 px->fromcodeset ^= 1; /* toggle endianness */
1401 wc = 0xfeffU;
1402 }
1403 if (!px->frombom) {
1404 goto BOM_SKIP_OUTPUT;
1405 }
1406 goto GOT_BOM;
1407 }
1408 }
1409
1410 if (px->fromcodeset != IC_WCHAR_T) {
1411 if (((__uwchar_t) wc) > (((px->fromcodeset & IC_UCS_4) == IC_UCS_4)
1412 ? 0x7fffffffUL : 0x10ffffUL)
1413#ifdef KUHN
1414 || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1415 || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1416#endif
1417 ) {
1418 goto ILLEGAL;
1419 }
1420 }
1421 } else if (px->fromcodeset == IC_UTF_8) {
1422 const char *p = *inbuf;
1423 r = _wchar_utf8sntowcs(&wc, 1, &p, *inbytesleft, &px->fromstate, 0);
1424 if (((ssize_t) r) <= 0) { /* either EILSEQ or incomplete or nul */
1425 if (((ssize_t) r) < 0) { /* either EILSEQ or incomplete or nul */
1426 assert((r == (size_t)(-1)) || (r == (size_t)(-2)));
1427 if (r == (size_t)(-2)) {
1428 INVALID:
1429 __set_errno(EINVAL);
1430 } else {
1431 px->fromstate.__mask = 0;
1432 inci = 1;
1433 ILLEGAL:
1434 if (px->skip_invalid_input) {
1435 px->skip_invalid_input = 2; /* flag for iconv utility */
1436 goto BOM_SKIP_OUTPUT;
1437 }
1438 __set_errno(EILSEQ);
1439 }
1440 return (size_t)(-1);
1441 }
1442#ifdef __UCLIBC_MJN3_ONLY__
1443#warning TODO: optimize this.
1444#endif
1445 if (p != NULL) { /* incomplete char case */
1446 goto INVALID;
1447 }
1448 p = *inbuf + 1; /* nul */
1449 }
1450 inci = p - *inbuf;
1451 } else if ((wc = ((unsigned char)(**inbuf))) >= 0x80) { /* Non-ASCII... */
1452 if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */
1453 goto ILLEGAL;
1454 } else { /* some other 8-bit ascii-extension codeset */
1455 const __codeset_8_bit_t *c8b
1456 = __locale_mmap->codeset_8_bit + px->fromcodeset - 3;
1457 wc -= 0x80;
1458 wc = __UCLIBC_CURLOCALE->tbl8c2wc[
1459 (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
1460 << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
1461 if (!wc) {
1462 goto ILLEGAL;
1463 }
1464 }
1465 }
1466
1467
1468 if (px->tobom) {
1469 inci = 0;
1470 wc = 0xfeffU;
1471 GOT_BOM:
1472 px->tobom = 0;
1473 }
1474
1475 if (px->tocodeset >= IC_MULTIBYTE) {
1476 inco = (px->tocodeset == IC_WCHAR_T) ? 4: (px->tocodeset & 6);
1477 if (*outbytesleft < inco) goto TOO_BIG;
1478 if (px->tocodeset != IC_WCHAR_T) {
1479 if (((__uwchar_t) wc) > (((px->tocodeset & IC_UCS_4) == IC_UCS_4)
1480 ? 0x7fffffffUL : 0x10ffffUL)
1481#ifdef KUHN
1482 || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1483 || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1484#endif
1485 ) {
1486 REPLACE_32:
1487 wc = 0xfffd;
1488 ++nrcount;
1489 }
1490 }
1491 if (inco == 4) {
1492 if (px->tocodeset & 1) wc = bswap_32(wc);
1493 } else {
1494 if (((__uwchar_t)wc ) > 0xffffU) {
1495 if ((px->tocodeset & IC_UTF_16) != IC_UTF_16) {
1496 goto REPLACE_32;
1497 }
1498 if (*outbytesleft < (inco = 4)) goto TOO_BIG;
1499 wc2 = 0xdc00U + (wc & 0x3ff);
1500 wc = 0xd800U + ((wc >> 10) & 0x3ff);
1501 if (px->tocodeset & 1) {
1502 wc = bswap_16(wc);
1503 wc2 = bswap_16(wc2);
1504 }
1505 wc += (wc2 << 16);
1506 } else if (px->tocodeset & 1) wc = bswap_16(wc);
1507 }
1508 (*outbuf)[0] = (char)((unsigned char)(wc));
1509 (*outbuf)[1] = (char)((unsigned char)(wc >> 8));
1510 if (inco == 4) {
1511 (*outbuf)[2] = (char)((unsigned char)(wc >> 16));
1512 (*outbuf)[3] = (char)((unsigned char)(wc >> 24));
1513 }
1514 } else if (px->tocodeset == IC_UTF_8) {
1515 const wchar_t *pw = &wc;
1516 do {
1517 r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1);
1518 if (r != (size_t)(-1)) {
1519#ifdef __UCLIBC_MJN3_ONLY__
1520#warning TODO: What happens for a nul?
1521#endif
1522 if (r == 0) {
1523 if (wc != 0) {
1524 goto TOO_BIG;
1525 }
1526 ++r;
1527 }
1528 break;
1529 }
1530 wc = 0xfffdU;
1531 ++nrcount;
1532 } while (1);
1533 inco = r;
1534 } else if (((__uwchar_t)(wc)) < 0x80) {
1535 CHAR_GOOD:
1536 **outbuf = wc;
1537 } else {
1538 if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) {
1539 const __codeset_8_bit_t *c8b
1540 = __locale_mmap->codeset_8_bit + px->tocodeset - 3;
1541 __uwchar_t u;
1542 u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)];
1543 u = __UCLIBC_CURLOCALE->tbl8wc2c[(u << Cwc2c_TI_SHIFT)
1544 + ((wc >> Cwc2c_TT_SHIFT)
1545 & ((1 << Cwc2c_TI_SHIFT)-1))];
1546 wc = __UCLIBC_CURLOCALE->tbl8wc2c[Cwc2c_TI_LEN
1547 + (u << Cwc2c_TT_SHIFT)
1548 + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
1549 if (wc) {
1550 goto CHAR_GOOD;
1551 }
1552 }
1553 **outbuf = '?';
1554 ++nrcount;
1555 }
1556
1557 *outbuf += inco;
1558 *outbytesleft -= inco;
1559 BOM_SKIP_OUTPUT:
1560 *inbuf += inci;
1561 *inbytesleft -= inci;
1562 }
1563 return nrcount;
1564}
1565#endif