//
// Created by hitmoon on 15-12-17.
//
4#include "mbtk_utf.h"
5#include <stdio.h>
6#include <wchar.h>
7#include <string.h>
8
9static const int halfShift = 10;
10/* used for shifting by 10 bits */
11
12static const UTF32 halfBase = 0x0010000UL;
13static const UTF32 halfMask = 0x3FFUL;
14
15#define UNI_SUR_HIGH_START (UTF32)0xD800
16#define UNI_SUR_HIGH_END (UTF32)0xDBFF
17#define UNI_SUR_LOW_START (UTF32)0xDC00
18#define UNI_SUR_LOW_END (UTF32)0xDFFF
19#define false 0
20#define true 1
21
22/* --------------------------------------------------------------------- */
23
24ConversionResult ConvertUTF32toUTF16(
25 const UTF32 **sourceStart, const UTF32 *sourceEnd,
26 UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags) {
27 ConversionResult result = conversionOK;
28 const UTF32 *source = *sourceStart;
29 UTF16 *target = *targetStart;
30 while (source < sourceEnd) {
31 UTF32 ch;
32 if (target >= targetEnd) {
33 result = targetExhausted;
34 break;
35 }
36 ch = *source++;
37 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
38 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
39 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
40 if (flags == strictConversion) {
41 --source; /* return to the illegal value itself */
42 result = sourceIllegal;
43 break;
44 } else {
45 *target++ = UNI_REPLACEMENT_CHAR;
46 }
47 } else {
48 *target++ = (UTF16) ch; /* normal case */
49 }
50 } else if (ch > UNI_MAX_LEGAL_UTF32) {
51 if (flags == strictConversion) {
52 result = sourceIllegal;
53 } else {
54 *target++ = UNI_REPLACEMENT_CHAR;
55 }
56 } else {
57 /* target is a character in range 0xFFFF - 0x10FFFF. */
58 if (target + 1 >= targetEnd) {
59 --source; /* Back up source pointer! */
60 result = targetExhausted;
61 break;
62 }
63 ch -= halfBase;
64 *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
65 *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
66 }
67 }
68 *sourceStart = source;
69 *targetStart = target;
70 return result;
71}
72
73/* --------------------------------------------------------------------- */
74
75ConversionResult ConvertUTF16toUTF32(
76 const UTF16 **sourceStart, const UTF16 *sourceEnd,
77 UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags) {
78 ConversionResult result = conversionOK;
79 const UTF16 *source = *sourceStart;
80 UTF32 *target = *targetStart;
81 UTF32 ch, ch2;
82 while (source < sourceEnd) {
83 const UTF16 *oldSource = source; /* In case we have to back up because of target overflow. */
84 ch = *source++;
85 /* If we have a surrogate pair, convert to UTF32 first. */
86 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
87 /* If the 16 bits following the high surrogate are in the source buffer... */
88 if (source < sourceEnd) {
89 ch2 = *source;
90 /* If it's a low surrogate, convert to UTF32. */
91 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
92 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
93 + (ch2 - UNI_SUR_LOW_START) + halfBase;
94 ++source;
95 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
96 --source; /* return to the illegal value itself */
97 result = sourceIllegal;
98 break;
99 }
100 } else { /* We don't have the 16 bits following the high surrogate. */
101 --source; /* return to the high surrogate */
102 result = sourceExhausted;
103 break;
104 }
105 } else if (flags == strictConversion) {
106 /* UTF-16 surrogate values are illegal in UTF-32 */
107 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
108 --source; /* return to the illegal value itself */
109 result = sourceIllegal;
110 break;
111 }
112 }
113 if (target >= targetEnd) {
114 source = oldSource; /* Back up source pointer! */
115 result = targetExhausted;
116 break;
117 }
118 *target++ = ch;
119 }
120 *sourceStart = source;
121 *targetStart = target;
122#ifdef CVTUTF_DEBUG
123 if (result == sourceIllegal) {
124 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x/n", ch, ch2);
125 fflush(stderr);
126}
127#endif
128 return result;
129}
130
131/* --------------------------------------------------------------------- */
132
133/*
134 * Index into the table below with the first byte of a UTF-8 sequence to
135 * get the number of trailing bytes that are supposed to follow it.
136 * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
137 * left as-is for anyone who may want to do such conversion, which was
138 * allowed in earlier algorithms.
139 */
140static const char trailingBytesForUTF8[256] = {
141 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
142 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
143 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
144 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
145 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
146 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
147 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
148 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
149};
150
151/*
152 * Magic values subtracted from a buffer value during UTF8 conversion.
153 * This table contains as many values as there might be trailing bytes
154 * in a UTF-8 sequence.
155 */
156static const UTF32 offsetsFromUTF8[6] = {0x00000000UL, 0x00003080UL, 0x000E2080UL,
157 0x03C82080UL, 0xFA082080UL, 0x82082080UL};
158
159/*
160 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
161 * into the first byte, depending on how many bytes follow. There are
162 * as many entries in this table as there are UTF-8 sequence types.
163 * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
164 * for *legal* UTF-8 will be 4 or fewer bytes total.
165 */
166static const UTF8 firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
167
168/* --------------------------------------------------------------------- */
169
170/* The interface converts a whole buffer to avoid function-call overhead.
171 * Constants have been gathered. Loops & conditionals have been removed as
172 * much as possible for efficiency, in favor of drop-through switches.
173 * (See "Note A" at the bottom of the file for equivalent code.)
174 * If your compiler supports it, the "isLegalUTF8" call can be turned
175 * into an inline function.
176 */
177
178/* --------------------------------------------------------------------- */
179
180ConversionResult ConvertUTF16toUTF8(
181 const UTF16 **sourceStart, const UTF16 *sourceEnd,
182 UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags) {
183 ConversionResult result = conversionOK;
184 const UTF16 *source = *sourceStart;
185 UTF8 *target = *targetStart;
186 while (source < sourceEnd) {
187 UTF32 ch;
b.liu778645e2024-06-21 16:47:42 +0800188 unsigned int bytesToWrite = 0;
liubin281ac462023-07-19 14:22:54 +0800189 const UTF32 byteMask = 0xBF;
190 const UTF32 byteMark = 0x80;
191 const UTF16 *oldSource = source; /* In case we have to back up because of target overflow. */
192 ch = *source++;
193 /* If we have a surrogate pair, convert to UTF32 first. */
194 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
195 /* If the 16 bits following the high surrogate are in the source buffer... */
196 if (source < sourceEnd) {
197 UTF32 ch2 = *source;
198 /* If it's a low surrogate, convert to UTF32. */
199 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
200 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
201 + (ch2 - UNI_SUR_LOW_START) + halfBase;
202 ++source;
203 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
204 --source; /* return to the illegal value itself */
205 result = sourceIllegal;
206 break;
207 }
208 } else { /* We don't have the 16 bits following the high surrogate. */
209 --source; /* return to the high surrogate */
210 result = sourceExhausted;
211 break;
212 }
213 } else if (flags == strictConversion) {
214 /* UTF-16 surrogate values are illegal in UTF-32 */
215 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
216 --source; /* return to the illegal value itself */
217 result = sourceIllegal;
218 break;
219 }
220 }
221 /* Figure out how many bytes the result will require */
222 if (ch < (UTF32) 0x80) {
223 bytesToWrite = 1;
224 } else if (ch < (UTF32) 0x800) {
225 bytesToWrite = 2;
226 } else if (ch < (UTF32) 0x10000) {
227 bytesToWrite = 3;
228 } else if (ch < (UTF32) 0x110000) {
229 bytesToWrite = 4;
230 } else {
231 bytesToWrite = 3;
232 ch = UNI_REPLACEMENT_CHAR;
233 }
234
235 target += bytesToWrite;
236 if (target > targetEnd) {
237 source = oldSource; /* Back up source pointer! */
238 target -= bytesToWrite;
239 result = targetExhausted;
240 break;
241 }
242 switch (bytesToWrite) { /* note: everything falls through. */
243 case 4:
244 *--target = (UTF8) ((ch | byteMark) & byteMask);
245 ch >>= 6;
246 case 3:
247 *--target = (UTF8) ((ch | byteMark) & byteMask);
248 ch >>= 6;
249 case 2:
250 *--target = (UTF8) ((ch | byteMark) & byteMask);
251 ch >>= 6;
252 case 1:
253 *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
254 }
255 target += bytesToWrite;
256 }
257 *sourceStart = source;
258 *targetStart = target;
259 return result;
260}
261
262/* --------------------------------------------------------------------- */
263
264/*
265 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
266 * This must be called with the length pre-determined by the first byte.
267 * If not calling this from ConvertUTF8to*, then the length can be set by:
268 * length = trailingBytesForUTF8[*source]+1;
269 * and the sequence is illegal right away if there aren't that many bytes
270 * available.
271 * If presented with a length > 4, this returns false. The Unicode
272 * definition of UTF-8 goes up to 4-byte sequences.
273 */
274
275static Boolean isLegalUTF8(const UTF8 *source, int length) {
276 UTF8 a;
277 const UTF8 *srcptr = source + length;
278 switch (length) {
279 default:
280 return false;
281 /* Everything else falls through when "true"... */
282 case 4:
283 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
284 case 3:
285 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
286 case 2:
287 if ((a = (*--srcptr)) > 0xBF) return false;
288
289 switch (*source) {
290 /* no fall-through in this inner switch */
291 case 0xE0:
292 if (a < 0xA0) return false;
293 break;
294 case 0xED:
295 if (a > 0x9F) return false;
296 break;
297 case 0xF0:
298 if (a < 0x90) return false;
299 break;
300 case 0xF4:
301 if (a > 0x8F) return false;
302 break;
303 default:
304 if (a < 0x80) return false;
305 }
306
307 case 1:
308 if (*source >= 0x80 && *source < 0xC2) return false;
309 }
310 if (*source > 0xF4) return false;
311 return true;
312}
313
314/* --------------------------------------------------------------------- */
315
316/*
317 * Exported function to return whether a UTF-8 sequence is legal or not.
318 * This is not used here; it's just exported.
319 */
320Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
321 int length = trailingBytesForUTF8[*source] + 1;
322 if (source + length > sourceEnd) {
323 return false;
324 }
325 return isLegalUTF8(source, length);
326}
327
328/* --------------------------------------------------------------------- */
329
330ConversionResult ConvertUTF8toUTF16(
331 const UTF8 **sourceStart, const UTF8 *sourceEnd,
332 UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags) {
333 ConversionResult result = conversionOK;
334 const UTF8 *source = *sourceStart;
335 UTF16 *target = *targetStart;
336 while (source < sourceEnd) {
337 UTF32 ch = 0;
338 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
339 if (source + extraBytesToRead >= sourceEnd) {
340 result = sourceExhausted;
341 break;
342 }
343 /* Do this check whether lenient or strict */
344 if (!isLegalUTF8(source, extraBytesToRead + 1)) {
345 result = sourceIllegal;
346 break;
347 }
348 /*
349 * The cases all fall through. See "Note A" below.
350 */
351 switch (extraBytesToRead) {
352 case 5:
353 ch += *source++;
354 ch <<= 6; /* remember, illegal UTF-8 */
355 case 4:
356 ch += *source++;
357 ch <<= 6; /* remember, illegal UTF-8 */
358 case 3:
359 ch += *source++;
360 ch <<= 6;
361 case 2:
362 ch += *source++;
363 ch <<= 6;
364 case 1:
365 ch += *source++;
366 ch <<= 6;
367 case 0:
368 ch += *source++;
369 }
370 ch -= offsetsFromUTF8[extraBytesToRead];
371
372 if (target >= targetEnd) {
373 source -= (extraBytesToRead + 1); /* Back up source pointer! */
374 result = targetExhausted;
375 break;
376 }
377 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
378 /* UTF-16 surrogate values are illegal in UTF-32 */
379 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
380 if (flags == strictConversion) {
381 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
382 result = sourceIllegal;
383 break;
384 } else {
385 *target++ = UNI_REPLACEMENT_CHAR;
386 }
387 } else {
388 *target++ = (UTF16) ch; /* normal case */
389 }
390 } else if (ch > UNI_MAX_UTF16) {
391 if (flags == strictConversion) {
392 result = sourceIllegal;
393 source -= (extraBytesToRead + 1); /* return to the start */
394 break; /* Bail out; shouldn't continue */
395 } else {
396 *target++ = UNI_REPLACEMENT_CHAR;
397 }
398 } else {
399 /* target is a character in range 0xFFFF - 0x10FFFF. */
400 if (target + 1 >= targetEnd) {
401 source -= (extraBytesToRead + 1); /* Back up source pointer! */
402 result = targetExhausted;
403 break;
404 }
405 ch -= halfBase;
406 *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
407 *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
408 }
409 }
410 *sourceStart = source;
411 *targetStart = target;
412 return result;
413}
414
415/* --------------------------------------------------------------------- */
416
417ConversionResult ConvertUTF32toUTF8(
418 const UTF32 **sourceStart, const UTF32 *sourceEnd,
419 UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags) {
420 ConversionResult result = conversionOK;
421 const UTF32 *source = *sourceStart;
422 UTF8 *target = *targetStart;
423 while (source < sourceEnd) {
424 UTF32 ch;
425 unsigned short bytesToWrite = 0;
426 const UTF32 byteMask = 0xBF;
427 const UTF32 byteMark = 0x80;
428 ch = *source++;
429 if (flags == strictConversion) {
430 /* UTF-16 surrogate values are illegal in UTF-32 */
431 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
432 --source; /* return to the illegal value itself */
433 result = sourceIllegal;
434 break;
435 }
436 }
437 /*
438 * Figure out how many bytes the result will require. Turn any
439 * illegally large UTF32 things (> Plane 17) into replacement chars.
440 */
441 if (ch < (UTF32) 0x80) {
442 bytesToWrite = 1;
443 } else if (ch < (UTF32) 0x800) {
444 bytesToWrite = 2;
445 } else if (ch < (UTF32) 0x10000) {
446 bytesToWrite = 3;
447 } else if (ch <= UNI_MAX_LEGAL_UTF32) {
448 bytesToWrite = 4;
449 } else {
450 bytesToWrite = 3;
451 ch = UNI_REPLACEMENT_CHAR;
452 result = sourceIllegal;
453 }
454
455 target += bytesToWrite;
456 if (target > targetEnd) {
457 --source; /* Back up source pointer! */
458 target -= bytesToWrite;
459 result = targetExhausted;
460 break;
461 }
462 switch (bytesToWrite) { /* note: everything falls through. */
463 case 4:
464 *--target = (UTF8) ((ch | byteMark) & byteMask);
465 ch >>= 6;
466 case 3:
467 *--target = (UTF8) ((ch | byteMark) & byteMask);
468 ch >>= 6;
469 case 2:
470 *--target = (UTF8) ((ch | byteMark) & byteMask);
471 ch >>= 6;
472 case 1:
473 *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
474 }
475 target += bytesToWrite;
476 }
477 *sourceStart = source;
478 *targetStart = target;
479 return result;
480}
481
482/* --------------------------------------------------------------------- */
483
484ConversionResult ConvertUTF8toUTF32(
485 const UTF8 **sourceStart, const UTF8 *sourceEnd,
486 UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags) {
487 ConversionResult result = conversionOK;
488 const UTF8 *source = *sourceStart;
489 UTF32 *target = *targetStart;
490 while (source < sourceEnd) {
491 UTF32 ch = 0;
b.liu778645e2024-06-21 16:47:42 +0800492 int extraBytesToRead = trailingBytesForUTF8[*source];
liubin281ac462023-07-19 14:22:54 +0800493 if (source + extraBytesToRead >= sourceEnd) {
494 result = sourceExhausted;
495 break;
496 }
497 /* Do this check whether lenient or strict */
498 if (!isLegalUTF8(source, extraBytesToRead + 1)) {
499 result = sourceIllegal;
500 break;
501 }
502 /*
503 * The cases all fall through. See "Note A" below.
504 */
505 switch (extraBytesToRead) {
506 case 5:
507 ch += *source++;
508 ch <<= 6;
509 case 4:
510 ch += *source++;
511 ch <<= 6;
512 case 3:
513 ch += *source++;
514 ch <<= 6;
515 case 2:
516 ch += *source++;
517 ch <<= 6;
518 case 1:
519 ch += *source++;
520 ch <<= 6;
521 case 0:
522 ch += *source++;
523 }
524 ch -= offsetsFromUTF8[extraBytesToRead];
525
526 if (target >= targetEnd) {
527 source -= (extraBytesToRead + 1); /* Back up the source pointer! */
528 result = targetExhausted;
529 break;
530 }
531 if (ch <= UNI_MAX_LEGAL_UTF32) {
532 /*
533 * UTF-16 surrogate values are illegal in UTF-32, and anything
534 * over Plane 17 (> 0x10FFFF) is illegal.
535 */
536 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
537 if (flags == strictConversion) {
538 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
539 result = sourceIllegal;
540 break;
541 } else {
542 *target++ = UNI_REPLACEMENT_CHAR;
543 }
544 } else {
545 *target++ = ch;
546 }
547 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
548 result = sourceIllegal;
549 *target++ = UNI_REPLACEMENT_CHAR;
550 }
551 }
552 *sourceStart = source;
553 *targetStart = target;
554 return result;
555}
556
557/* ---------------------------------------------------------------------
558
559 Note A.
560 The fall-through switches in UTF-8 reading code save a
561 temp variable, some decrements & conditionals. The switches
562 are equivalent to the following loop:
563 {
564 int tmpBytesToRead = extraBytesToRead+1;
565 do {
566 ch += *source++;
567 --tmpBytesToRead;
568 if (tmpBytesToRead) ch <<= 6;
569 } while (tmpBytesToRead > 0);
570 }
571 In UTF-8 writing code, the switches on "bytesToWrite" are
572 similarly unrolled loops.
573
574 --------------------------------------------------------------------- */
575
576const unsigned char *utf32toutf8(wchar_t *source, unsigned char *target, size_t size, int *len){
577
578 wchar_t *s_start;
579 unsigned char *t_start;
580
581 s_start = source;
582 t_start = target;
583
584 if (ConvertUTF32toUTF8((const UTF32**) &s_start, (UTF32*)s_start + wcslen(source), (UTF8**)&t_start, (UTF8*)t_start + size, strictConversion) == conversionOK) {
585 *len = t_start - target;
586 }
587 else {
588 *len = 0;
589 }
590 target[*len] = '\0';
591 return (const unsigned char*)target;
592}
593
594
595unsigned char *utf16toutf8(unsigned short *source, unsigned char *target, size_t size, int *len){
596
597 unsigned short *s_start;
598 unsigned char *t_start;
599
600 s_start = source;
601 t_start = target;
602
603 if (ConvertUTF16toUTF8((const UTF16**) &s_start, (UTF16*)s_start + strlen((const char*)source) / 2, (UTF8**)&t_start, (UTF8*)t_start + size, strictConversion) == conversionOK) {
604 *len = t_start - target;
605 }
606 else {
607 *len = 0;
608 }
609 target[*len] = '\0';
610 return target;
611}
612
613unsigned short *utf8toutf16(unsigned char *source, unsigned short *target, size_t size, int *len)
614{
615 unsigned char *s_start;
616 unsigned short *t_start;
617
618 s_start = source;
619 t_start = target;
620
621 if (ConvertUTF8toUTF16((const UTF8 **)&s_start, s_start + strlen((const char*)source), &t_start, t_start + size, strictConversion) == conversionOK) {
622 *len = t_start - target;
623 }
624 else {
625 *len = 0;
626 }
627
628 return target;
629}
630
631u_int32_t next_char(unsigned char **string) {
632
633 int len = strlen((const char*)*string);
634 unsigned char ch[4];
635 int i = 0;
636
637 if (len < 4){
638 for (i = 0; i < len; i++)
639 ch[i] = (*string)[i];
640 }
641 else {
642 ch[0] = (*string)[0];
643 ch[1] = (*string)[1];
644 ch[2] = (*string)[2];
645 ch[3] = (*string)[3];
646 }
647
648 if(ch[0] < 0x80) {
649 *string = (*string + 1);
650 return ch[0];
651 }
652 else if (ch[0] >= 0xc0 && ch[0] <= 0xdf) {
653 *string = (*string + 2);
654 return ch[1] << 8 | ch[0];
655 }
656 else if (ch[0] >= 0xe0 && ch[0] <= 0xef) {
657 *string = (*string + 3);
658 return ch[2] << 16 | ch[1] << 8 | ch[0];
659 }
660 else if (ch[0] >= 0xf0 && ch[0] <= 0xf7) {
661 *string = (*string + 4);
662 return ch[3] << 24 | ch[2] << 16 | ch[1] << 8 | ch[0];
663 }
664
665 return *(u_int32_t*)ch;
666}
667
668
669int utf8len(unsigned char *string)
670{
671 unsigned char *end;
672 int ret = 0;
673
674 end = string + strlen((const char*)string);
675 while(string < end) {
676 next_char(&string);
677 ret++;
678 }
679 return ret;
680}
681
682int is_acsii(unsigned char *string)
683{
684 while(*string) {
685 if (*string >= 0x80)
686 return 0;
687 string++;
688 }
689 return 1;
690}
691
692size_t utf8_get_size(unsigned char *source, size_t num)
693{
694 size_t ret = 0;
695
696 unsigned char *cur = source;
697 while (num-- && *cur) {
698 next_char(&cur);
699 }
700 ret = cur - source;
701
702 return ret;
703}