Blame - mbtk/libmbtk_lib/common/mbtk_utf.c - T108_Public

blob: 771186d8a6f8bf7c7ca69bf9b2ffdff4f9ac444d [file] [log] [blame]

liubin	281ac46	2023-07-19 14:22:54 +0800	[diff] [blame]	1	//
				2	// Created by hitmoon on 15-12-17.
				3	//
				4	#include "mbtk_utf.h"
				5	#include <stdio.h>
				6	#include <wchar.h>
				7	#include <string.h>
				8
				9	static const int halfShift = 10;
				10	/* used for shifting by 10 bits */
				11
				12	static const UTF32 halfBase = 0x0010000UL;
				13	static const UTF32 halfMask = 0x3FFUL;
				14
				15	#define UNI_SUR_HIGH_START (UTF32)0xD800
				16	#define UNI_SUR_HIGH_END (UTF32)0xDBFF
				17	#define UNI_SUR_LOW_START (UTF32)0xDC00
				18	#define UNI_SUR_LOW_END (UTF32)0xDFFF
				19	#define false 0
				20	#define true 1
				21
				22	/* --------------------------------------------------------------------- */
				23
				24	ConversionResult ConvertUTF32toUTF16(
				25	const UTF32 *sourceStart, const UTF32 sourceEnd,
				26	UTF16 *targetStart, UTF16 targetEnd, ConversionFlags flags) {
				27	ConversionResult result = conversionOK;
				28	const UTF32 source = sourceStart;
				29	UTF16 target = targetStart;
				30	while (source < sourceEnd) {
				31	UTF32 ch;
				32	if (target >= targetEnd) {
				33	result = targetExhausted;
				34	break;
				35	}
				36	ch = *source++;
				37	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
				38	/* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
				39	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				40	if (flags == strictConversion) {
				41	--source; /* return to the illegal value itself */
				42	result = sourceIllegal;
				43	break;
				44	} else {
				45	*target++ = UNI_REPLACEMENT_CHAR;
				46	}
				47	} else {
				48	target++ = (UTF16) ch; / normal case */
				49	}
				50	} else if (ch > UNI_MAX_LEGAL_UTF32) {
				51	if (flags == strictConversion) {
				52	result = sourceIllegal;
				53	} else {
				54	*target++ = UNI_REPLACEMENT_CHAR;
				55	}
				56	} else {
				57	/* target is a character in range 0xFFFF - 0x10FFFF. */
				58	if (target + 1 >= targetEnd) {
				59	--source; /* Back up source pointer! */
				60	result = targetExhausted;
				61	break;
				62	}
				63	ch -= halfBase;
				64	*target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
				65	*target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
				66	}
				67	}
				68	*sourceStart = source;
				69	*targetStart = target;
				70	return result;
				71	}
				72
				73	/* --------------------------------------------------------------------- */
				74
				75	ConversionResult ConvertUTF16toUTF32(
				76	const UTF16 *sourceStart, const UTF16 sourceEnd,
				77	UTF32 *targetStart, UTF32 targetEnd, ConversionFlags flags) {
				78	ConversionResult result = conversionOK;
				79	const UTF16 source = sourceStart;
				80	UTF32 target = targetStart;
				81	UTF32 ch, ch2;
				82	while (source < sourceEnd) {
				83	const UTF16 oldSource = source; / In case we have to back up because of target overflow. */
				84	ch = *source++;
				85	/* If we have a surrogate pair, convert to UTF32 first. */
				86	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
				87	/* If the 16 bits following the high surrogate are in the source buffer... */
				88	if (source < sourceEnd) {
				89	ch2 = *source;
				90	/* If it's a low surrogate, convert to UTF32. */
				91	if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
				92	ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
				93	+ (ch2 - UNI_SUR_LOW_START) + halfBase;
				94	++source;
				95	} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
				96	--source; /* return to the illegal value itself */
				97	result = sourceIllegal;
				98	break;
				99	}
				100	} else { /* We don't have the 16 bits following the high surrogate. */
				101	--source; /* return to the high surrogate */
				102	result = sourceExhausted;
				103	break;
				104	}
				105	} else if (flags == strictConversion) {
				106	/* UTF-16 surrogate values are illegal in UTF-32 */
				107	if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
				108	--source; /* return to the illegal value itself */
				109	result = sourceIllegal;
				110	break;
				111	}
				112	}
				113	if (target >= targetEnd) {
				114	source = oldSource; /* Back up source pointer! */
				115	result = targetExhausted;
				116	break;
				117	}
				118	*target++ = ch;
				119	}
				120	*sourceStart = source;
				121	*targetStart = target;
				122	#ifdef CVTUTF_DEBUG
				123	if (result == sourceIllegal) {
				124	fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x/n", ch, ch2);
				125	fflush(stderr);
				126	}
				127	#endif
				128	return result;
				129	}
				130
				131	/* --------------------------------------------------------------------- */
				132
				133	/*
				134	* Index into the table below with the first byte of a UTF-8 sequence to
				135	* get the number of trailing bytes that are supposed to follow it.
				136	* Note that legal UTF-8 values can't have 4 or 5-bytes. The table is
				137	* left as-is for anyone who may want to do such conversion, which was
				138	* allowed in earlier algorithms.
				139	*/
				/*
				 * Number of trailing bytes implied by each possible first byte:
				 * 0x00-0xBF -> 0, 0xC0-0xDF -> 1, 0xE0-0xEF -> 2, 0xF0-0xF7 -> 3,
				 * 0xF8-0xFB -> 4, 0xFC-0xFF -> 5.
				 */
				static const char trailingBytesForUTF8[256] = {
					0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
					0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
					0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
					0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
					0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
					0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
					1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
					2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
				};
				150
				151	/*
				152	* Magic values subtracted from a buffer value during UTF8 conversion.
				153	* This table contains as many values as there might be trailing bytes
				154	* in a UTF-8 sequence.
				155	*/
				156	static const UTF32 offsetsFromUTF8[6] = {0x00000000UL, 0x00003080UL, 0x000E2080UL,
				157	0x03C82080UL, 0xFA082080UL, 0x82082080UL};
				158
				159	/*
				160	* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
				161	* into the first byte, depending on how many bytes follow. There are
				162	* as many entries in this table as there are UTF-8 sequence types.
				163	* (I.e., one byte sequence, two byte... etc.). Remember that sequences
				164	* for legal UTF-8 will be 4 or fewer bytes total.
				165	*/
				166	static const UTF8 firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
				167
				168	/* --------------------------------------------------------------------- */
				169
				170	/* The interface converts a whole buffer to avoid function-call overhead.
				171	* Constants have been gathered. Loops & conditionals have been removed as
				172	* much as possible for efficiency, in favor of drop-through switches.
				173	* (See "Note A" at the bottom of the file for equivalent code.)
				174	* If your compiler supports it, the "isLegalUTF8" call can be turned
				175	* into an inline function.
				176	*/
				177
				178	/* --------------------------------------------------------------------- */
				179
				180	ConversionResult ConvertUTF16toUTF8(
				181	const UTF16 *sourceStart, const UTF16 sourceEnd,
				182	UTF8 *targetStart, UTF8 targetEnd, ConversionFlags flags) {
				183	ConversionResult result = conversionOK;
				184	const UTF16 source = sourceStart;
				185	UTF8 target = targetStart;
				186	while (source < sourceEnd) {
				187	UTF32 ch;
b.liu	778645e	2024-06-21 16:47:42 +0800	[diff] [blame]	188	unsigned int bytesToWrite = 0;
liubin	281ac46	2023-07-19 14:22:54 +0800	[diff] [blame]	189	const UTF32 byteMask = 0xBF;
				190	const UTF32 byteMark = 0x80;
				191	const UTF16 oldSource = source; / In case we have to back up because of target overflow. */
				192	ch = *source++;
				193	/* If we have a surrogate pair, convert to UTF32 first. */
				194	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
				195	/* If the 16 bits following the high surrogate are in the source buffer... */
				196	if (source < sourceEnd) {
				197	UTF32 ch2 = *source;
				198	/* If it's a low surrogate, convert to UTF32. */
				199	if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
				200	ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
				201	+ (ch2 - UNI_SUR_LOW_START) + halfBase;
				202	++source;
				203	} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
				204	--source; /* return to the illegal value itself */
				205	result = sourceIllegal;
				206	break;
				207	}
				208	} else { /* We don't have the 16 bits following the high surrogate. */
				209	--source; /* return to the high surrogate */
				210	result = sourceExhausted;
				211	break;
				212	}
				213	} else if (flags == strictConversion) {
				214	/* UTF-16 surrogate values are illegal in UTF-32 */
				215	if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
				216	--source; /* return to the illegal value itself */
				217	result = sourceIllegal;
				218	break;
				219	}
				220	}
				221	/* Figure out how many bytes the result will require */
				222	if (ch < (UTF32) 0x80) {
				223	bytesToWrite = 1;
				224	} else if (ch < (UTF32) 0x800) {
				225	bytesToWrite = 2;
				226	} else if (ch < (UTF32) 0x10000) {
				227	bytesToWrite = 3;
				228	} else if (ch < (UTF32) 0x110000) {
				229	bytesToWrite = 4;
				230	} else {
				231	bytesToWrite = 3;
				232	ch = UNI_REPLACEMENT_CHAR;
				233	}
				234
				235	target += bytesToWrite;
				236	if (target > targetEnd) {
				237	source = oldSource; /* Back up source pointer! */
				238	target -= bytesToWrite;
				239	result = targetExhausted;
				240	break;
				241	}
				242	switch (bytesToWrite) { /* note: everything falls through. */
				243	case 4:
				244	*--target = (UTF8) ((ch \| byteMark) & byteMask);
				245	ch >>= 6;
				246	case 3:
				247	*--target = (UTF8) ((ch \| byteMark) & byteMask);
				248	ch >>= 6;
				249	case 2:
				250	*--target = (UTF8) ((ch \| byteMark) & byteMask);
				251	ch >>= 6;
				252	case 1:
				253	*--target = (UTF8) (ch \| firstByteMark[bytesToWrite]);
				254	}
				255	target += bytesToWrite;
				256	}
				257	*sourceStart = source;
				258	*targetStart = target;
				259	return result;
				260	}
				261
				262	/* --------------------------------------------------------------------- */
				263
				264	/*
				265	* Utility routine to tell whether a sequence of bytes is legal UTF-8.
				266	* This must be called with the length pre-determined by the first byte.
				267	* If not calling this from ConvertUTF8to*, then the length can be set by:
				268	* length = trailingBytesForUTF8[*source]+1;
				269	* and the sequence is illegal right away if there aren't that many bytes
				270	* available.
				271	* If presented with a length > 4, this returns false. The Unicode
				272	* definition of UTF-8 goes up to 4-byte sequences.
				273	*/
				274
				275	static Boolean isLegalUTF8(const UTF8 *source, int length) {
				276	UTF8 a;
				277	const UTF8 *srcptr = source + length;
				278	switch (length) {
				279	default:
				280	return false;
				281	/* Everything else falls through when "true"... */
				282	case 4:
				283	if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF) return false;
				284	case 3:
				285	if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF) return false;
				286	case 2:
				287	if ((a = (*--srcptr)) > 0xBF) return false;
				288
				289	switch (*source) {
				290	/* no fall-through in this inner switch */
				291	case 0xE0:
				292	if (a < 0xA0) return false;
				293	break;
				294	case 0xED:
				295	if (a > 0x9F) return false;
				296	break;
				297	case 0xF0:
				298	if (a < 0x90) return false;
				299	break;
				300	case 0xF4:
				301	if (a > 0x8F) return false;
				302	break;
				303	default:
				304	if (a < 0x80) return false;
				305	}
				306
				307	case 1:
				308	if (source >= 0x80 && source < 0xC2) return false;
				309	}
				310	if (*source > 0xF4) return false;
				311	return true;
				312	}
				313
				314	/* --------------------------------------------------------------------- */
				315
				316	/*
				317	* Exported function to return whether a UTF-8 sequence is legal or not.
				318	* This is not used here; it's just exported.
				319	*/
				320	Boolean isLegalUTF8Sequence(const UTF8 source, const UTF8 sourceEnd) {
				321	int length = trailingBytesForUTF8[*source] + 1;
				322	if (source + length > sourceEnd) {
				323	return false;
				324	}
				325	return isLegalUTF8(source, length);
				326	}
				327
				328	/* --------------------------------------------------------------------- */
				329
				330	ConversionResult ConvertUTF8toUTF16(
				331	const UTF8 *sourceStart, const UTF8 sourceEnd,
				332	UTF16 *targetStart, UTF16 targetEnd, ConversionFlags flags) {
				333	ConversionResult result = conversionOK;
				334	const UTF8 source = sourceStart;
				335	UTF16 target = targetStart;
				336	while (source < sourceEnd) {
				337	UTF32 ch = 0;
				338	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
				339	if (source + extraBytesToRead >= sourceEnd) {
				340	result = sourceExhausted;
				341	break;
				342	}
				343	/* Do this check whether lenient or strict */
				344	if (!isLegalUTF8(source, extraBytesToRead + 1)) {
				345	result = sourceIllegal;
				346	break;
				347	}
				348	/*
				349	* The cases all fall through. See "Note A" below.
				350	*/
				351	switch (extraBytesToRead) {
				352	case 5:
				353	ch += *source++;
				354	ch <<= 6; /* remember, illegal UTF-8 */
				355	case 4:
				356	ch += *source++;
				357	ch <<= 6; /* remember, illegal UTF-8 */
				358	case 3:
				359	ch += *source++;
				360	ch <<= 6;
				361	case 2:
				362	ch += *source++;
				363	ch <<= 6;
				364	case 1:
				365	ch += *source++;
				366	ch <<= 6;
				367	case 0:
				368	ch += *source++;
				369	}
				370	ch -= offsetsFromUTF8[extraBytesToRead];
				371
				372	if (target >= targetEnd) {
				373	source -= (extraBytesToRead + 1); /* Back up source pointer! */
				374	result = targetExhausted;
				375	break;
				376	}
				377	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
				378	/* UTF-16 surrogate values are illegal in UTF-32 */
				379	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				380	if (flags == strictConversion) {
				381	source -= (extraBytesToRead + 1); /* return to the illegal value itself */
				382	result = sourceIllegal;
				383	break;
				384	} else {
				385	*target++ = UNI_REPLACEMENT_CHAR;
				386	}
				387	} else {
				388	target++ = (UTF16) ch; / normal case */
				389	}
				390	} else if (ch > UNI_MAX_UTF16) {
				391	if (flags == strictConversion) {
				392	result = sourceIllegal;
				393	source -= (extraBytesToRead + 1); /* return to the start */
				394	break; /* Bail out; shouldn't continue */
				395	} else {
				396	*target++ = UNI_REPLACEMENT_CHAR;
				397	}
				398	} else {
				399	/* target is a character in range 0xFFFF - 0x10FFFF. */
				400	if (target + 1 >= targetEnd) {
				401	source -= (extraBytesToRead + 1); /* Back up source pointer! */
				402	result = targetExhausted;
				403	break;
				404	}
				405	ch -= halfBase;
				406	*target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
				407	*target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
				408	}
				409	}
				410	*sourceStart = source;
				411	*targetStart = target;
				412	return result;
				413	}
				414
				415	/* --------------------------------------------------------------------- */
				416
				417	ConversionResult ConvertUTF32toUTF8(
				418	const UTF32 *sourceStart, const UTF32 sourceEnd,
				419	UTF8 *targetStart, UTF8 targetEnd, ConversionFlags flags) {
				420	ConversionResult result = conversionOK;
				421	const UTF32 source = sourceStart;
				422	UTF8 target = targetStart;
				423	while (source < sourceEnd) {
				424	UTF32 ch;
				425	unsigned short bytesToWrite = 0;
				426	const UTF32 byteMask = 0xBF;
				427	const UTF32 byteMark = 0x80;
				428	ch = *source++;
				429	if (flags == strictConversion) {
				430	/* UTF-16 surrogate values are illegal in UTF-32 */
				431	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				432	--source; /* return to the illegal value itself */
				433	result = sourceIllegal;
				434	break;
				435	}
				436	}
				437	/*
				438	* Figure out how many bytes the result will require. Turn any
				439	* illegally large UTF32 things (> Plane 17) into replacement chars.
				440	*/
				441	if (ch < (UTF32) 0x80) {
				442	bytesToWrite = 1;
				443	} else if (ch < (UTF32) 0x800) {
				444	bytesToWrite = 2;
				445	} else if (ch < (UTF32) 0x10000) {
				446	bytesToWrite = 3;
				447	} else if (ch <= UNI_MAX_LEGAL_UTF32) {
				448	bytesToWrite = 4;
				449	} else {
				450	bytesToWrite = 3;
				451	ch = UNI_REPLACEMENT_CHAR;
				452	result = sourceIllegal;
				453	}
				454
				455	target += bytesToWrite;
				456	if (target > targetEnd) {
				457	--source; /* Back up source pointer! */
				458	target -= bytesToWrite;
				459	result = targetExhausted;
				460	break;
				461	}
				462	switch (bytesToWrite) { /* note: everything falls through. */
				463	case 4:
				464	*--target = (UTF8) ((ch \| byteMark) & byteMask);
				465	ch >>= 6;
				466	case 3:
				467	*--target = (UTF8) ((ch \| byteMark) & byteMask);
				468	ch >>= 6;
				469	case 2:
				470	*--target = (UTF8) ((ch \| byteMark) & byteMask);
				471	ch >>= 6;
				472	case 1:
				473	*--target = (UTF8) (ch \| firstByteMark[bytesToWrite]);
				474	}
				475	target += bytesToWrite;
				476	}
				477	*sourceStart = source;
				478	*targetStart = target;
				479	return result;
				480	}
				481
				482	/* --------------------------------------------------------------------- */
				483
				484	ConversionResult ConvertUTF8toUTF32(
				485	const UTF8 *sourceStart, const UTF8 sourceEnd,
				486	UTF32 *targetStart, UTF32 targetEnd, ConversionFlags flags) {
				487	ConversionResult result = conversionOK;
				488	const UTF8 source = sourceStart;
				489	UTF32 target = targetStart;
				490	while (source < sourceEnd) {
				491	UTF32 ch = 0;
b.liu	778645e	2024-06-21 16:47:42 +0800	[diff] [blame]	492	int extraBytesToRead = trailingBytesForUTF8[*source];
liubin	281ac46	2023-07-19 14:22:54 +0800	[diff] [blame]	493	if (source + extraBytesToRead >= sourceEnd) {
				494	result = sourceExhausted;
				495	break;
				496	}
				497	/* Do this check whether lenient or strict */
				498	if (!isLegalUTF8(source, extraBytesToRead + 1)) {
				499	result = sourceIllegal;
				500	break;
				501	}
				502	/*
				503	* The cases all fall through. See "Note A" below.
				504	*/
				505	switch (extraBytesToRead) {
				506	case 5:
				507	ch += *source++;
				508	ch <<= 6;
				509	case 4:
				510	ch += *source++;
				511	ch <<= 6;
				512	case 3:
				513	ch += *source++;
				514	ch <<= 6;
				515	case 2:
				516	ch += *source++;
				517	ch <<= 6;
				518	case 1:
				519	ch += *source++;
				520	ch <<= 6;
				521	case 0:
				522	ch += *source++;
				523	}
				524	ch -= offsetsFromUTF8[extraBytesToRead];
				525
				526	if (target >= targetEnd) {
				527	source -= (extraBytesToRead + 1); /* Back up the source pointer! */
				528	result = targetExhausted;
				529	break;
				530	}
				531	if (ch <= UNI_MAX_LEGAL_UTF32) {
				532	/*
				533	* UTF-16 surrogate values are illegal in UTF-32, and anything
				534	* over Plane 17 (> 0x10FFFF) is illegal.
				535	*/
				536	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				537	if (flags == strictConversion) {
				538	source -= (extraBytesToRead + 1); /* return to the illegal value itself */
				539	result = sourceIllegal;
				540	break;
				541	} else {
				542	*target++ = UNI_REPLACEMENT_CHAR;
				543	}
				544	} else {
				545	*target++ = ch;
				546	}
				547	} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
				548	result = sourceIllegal;
				549	*target++ = UNI_REPLACEMENT_CHAR;
				550	}
				551	}
				552	*sourceStart = source;
				553	*targetStart = target;
				554	return result;
				555	}
				556
				557	/* ---------------------------------------------------------------------
				558
				559	Note A.
				560	The fall-through switches in UTF-8 reading code save a
				561	temp variable, some decrements & conditionals. The switches
				562	are equivalent to the following loop:
				563	{
				564	int tmpBytesToRead = extraBytesToRead+1;
				565	do {
				566	ch += *source++;
				567	--tmpBytesToRead;
				568	if (tmpBytesToRead) ch <<= 6;
				569	} while (tmpBytesToRead > 0);
				570	}
				571	In UTF-8 writing code, the switches on "bytesToWrite" are
				572	similarly unrolled loops.
				573
				574	--------------------------------------------------------------------- */
				575
				576	const unsigned char utf32toutf8(wchar_t source, unsigned char target, size_t size, int len){
				577
				578	wchar_t *s_start;
				579	unsigned char *t_start;
				580
				581	s_start = source;
				582	t_start = target;
				583
				584	if (ConvertUTF32toUTF8((const UTF32*) &s_start, (UTF32)s_start + wcslen(source), (UTF8*)&t_start, (UTF8)t_start + size, strictConversion) == conversionOK) {
				585	*len = t_start - target;
				586	}
				587	else {
				588	*len = 0;
				589	}
				590	target[*len] = '\0';
				591	return (const unsigned char*)target;
				592	}
				593
				594
				595	unsigned char utf16toutf8(unsigned short source, unsigned char target, size_t size, int len){
				596
				597	unsigned short *s_start;
				598	unsigned char *t_start;
				599
				600	s_start = source;
				601	t_start = target;
				602
				603	if (ConvertUTF16toUTF8((const UTF16*) &s_start, (UTF16)s_start + strlen((const char)source) / 2, (UTF8)&t_start, (UTF8)t_start + size, strictConversion) == conversionOK) {
				604	*len = t_start - target;
				605	}
				606	else {
				607	*len = 0;
				608	}
				609	target[*len] = '\0';
				610	return target;
				611	}
				612
				613	unsigned short utf8toutf16(unsigned char source, unsigned short target, size_t size, int len)
				614	{
				615	unsigned char *s_start;
				616	unsigned short *t_start;
				617
				618	s_start = source;
				619	t_start = target;
				620
				621	if (ConvertUTF8toUTF16((const UTF8 *)&s_start, s_start + strlen((const char)source), &t_start, t_start + size, strictConversion) == conversionOK) {
				622	*len = t_start - target;
				623	}
				624	else {
				625	*len = 0;
				626	}
				627
				628	return target;
				629	}
				630
				/*
				 * Consume one UTF-8 character from *string and advance *string past it.
				 *
				 * The return value is the raw bytes of the character packed
				 * little-endian (first byte in bits 0-7, second in bits 8-15, ...);
				 * it is NOT the decoded code point.
				 *
				 * The sequence length is taken from the lead byte alone; continuation
				 * bytes are not validated. Edge cases:
				 *  - empty string: returns 0 and leaves *string unchanged;
				 *  - sequence cut short by the terminator: only the bytes that are
				 *    present are consumed, so *string never moves past the '\0';
				 *  - stray continuation byte or invalid lead byte (0x80-0xBF,
				 *    0xF8-0xFF): consumed as a single byte so callers that loop over
				 *    a string always make progress.
				 */
				u_int32_t next_char(unsigned char **string) {
					unsigned char ch[4] = {0, 0, 0, 0};
					unsigned char *cur = *string;
					u_int32_t packed = 0;
					size_t avail = 0;
					size_t need;
					size_t i;

					/* Buffer at most four bytes without reading past the terminator. */
					while (avail < 4 && cur[avail] != '\0') {
						ch[avail] = cur[avail];
						avail++;
					}
					if (avail == 0) {
						return 0;
					}

					if (ch[0] < 0x80) {
						need = 1;
					}
					else if (ch[0] >= 0xc0 && ch[0] <= 0xdf) {
						need = 2;
					}
					else if (ch[0] >= 0xe0 && ch[0] <= 0xef) {
						need = 3;
					}
					else if (ch[0] >= 0xf0 && ch[0] <= 0xf7) {
						need = 4;
					}
					else {
						need = 1; /* not a valid lead byte: skip it on its own */
					}

					if (need > avail) {
						need = avail; /* truncated sequence */
					}

					/* Widen before shifting so a byte >= 0x80 never reaches a signed int's sign bit. */
					for (i = 0; i < need; i++) {
						packed |= (u_int32_t) ch[i] << (8 * i);
					}

					*string = cur + need;
					return packed;
				}
				667
				668
				/*
				 * Count the characters in the NUL-terminated UTF-8 string `string`
				 * by stepping through it with next_char() until the terminator is
				 * reached or passed.
				 */
				int utf8len(unsigned char *string)
				{
					unsigned char *end;
					int ret = 0;

					end = string + strlen((const char *) string);
					while (string < end) {
						next_char(&string);
						ret++;
					}
					return ret;
				}
				681
				/*
				 * Return 1 when every byte of the NUL-terminated `string` is below
				 * 0x80 (pure 7-bit ASCII, including the empty string), 0 otherwise.
				 */
				int is_acsii(unsigned char *string)
				{
					while (*string) {
						if (*string >= 0x80) {
							return 0;
						}
						string++;
					}
					return 1;
				}
				691
				/*
				 * Return the number of bytes occupied by the first `num` characters
				 * of the NUL-terminated UTF-8 string `source`, or by the whole string
				 * when it holds fewer than `num` characters.
				 */
				size_t utf8_get_size(unsigned char *source, size_t num)
				{
					size_t ret = 0;

					unsigned char *cur = source;
					/* Stop after num characters or at the terminator, whichever comes first. */
					while (num-- && *cur) {
						next_char(&cur);
					}
					ret = cur - source;

					return ret;
				}