Blame - ap/libc/glibc/glibc-2.23/libidn/nfkc.c - T106_DC

blob: f3e41d038b86894694f2a9bda6f76d1876ef336d [file] [log] [blame]

lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame]	1	/* nfkc.c Unicode normalization utilities.
				2	* Copyright (C) 2002, 2003 Simon Josefsson
				3	*
				4	* This file is part of GNU Libidn.
				5	*
				6	* GNU Libidn is free software; you can redistribute it and/or
				7	* modify it under the terms of the GNU Lesser General Public
				8	* License as published by the Free Software Foundation; either
				9	* version 2.1 of the License, or (at your option) any later version.
				10	*
				11	* GNU Libidn is distributed in the hope that it will be useful,
				12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				14	* Lesser General Public License for more details.
				15	*
				16	* You should have received a copy of the GNU Lesser General Public
				17	* License along with GNU Libidn; if not, see <http://www.gnu.org/licenses/>.
				18	*/
				19
				20	#if HAVE_CONFIG_H
				21	# include "config.h"
				22	#endif
				23
				24	#include <stdlib.h>
				25	#include <string.h>
				26	#include <stdint.h>
				27
				28	#include "stringprep.h"
				29
				30	/* This file contains functions from GLIB, including gutf8.c and
				31	* gunidecomp.c, all licensed under LGPL and copyright held by:
				32	*
				33	* Copyright (C) 1999, 2000 Tom Tromey
				34	* Copyright 2000 Red Hat, Inc.
				35	*/
				36
				37	/* Hacks to make syncing with GLIB code easier. */
				38	#define gboolean int
				39	#define gchar char
				40	#define guchar unsigned char
				41	#define glong long
				42	#define gint int
				43	#define guint unsigned int
				44	#define gushort unsigned short
				45	#define gint16 int16_t
				46	#define guint16 uint16_t
				47	#define gunichar uint32_t
				48	#define gsize size_t
				49	#define gssize ssize_t
				50	#define g_malloc malloc
				51	#define g_free free
				52	#define GError void
				53	#define g_set_error(a,b,c,d) ((void) 0)
				54	#define g_new(struct_type, n_structs) \
				55	((struct_type ) g_malloc (((gsize) sizeof (struct_type)) ((gsize) (n_structs))))
				56	# if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
				57	# define G_STMT_START (void)(
				58	# define G_STMT_END )
				59	# else
				60	# if (defined (sun) \|\| defined (__sun__))
				61	# define G_STMT_START if (1)
				62	# define G_STMT_END else (void)0
				63	# else
				64	# define G_STMT_START do
				65	# define G_STMT_END while (0)
				66	# endif
				67	# endif
				68	#define g_return_val_if_fail(expr,val) G_STMT_START{ (void)0; }G_STMT_END
				69	#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
				70	#define TRUE 1
				71	#define FALSE 0
				72
				73	/* Code from GLIB gunicode.h starts here. */
				74
				75	typedef enum
				76	{
				77	G_NORMALIZE_DEFAULT,
				78	G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
				79	G_NORMALIZE_DEFAULT_COMPOSE,
				80	G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
				81	G_NORMALIZE_ALL,
				82	G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
				83	G_NORMALIZE_ALL_COMPOSE,
				84	G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
				85	}
				86	GNormalizeMode;
				87
				88	/* Code from GLIB gutf8.c starts here. */
				89
				90	#define UTF8_COMPUTE(Char, Mask, Len) \
				91	if (Char < 128) \
				92	{ \
				93	Len = 1; \
				94	Mask = 0x7f; \
				95	} \
				96	else if ((Char & 0xe0) == 0xc0) \
				97	{ \
				98	Len = 2; \
				99	Mask = 0x1f; \
				100	} \
				101	else if ((Char & 0xf0) == 0xe0) \
				102	{ \
				103	Len = 3; \
				104	Mask = 0x0f; \
				105	} \
				106	else if ((Char & 0xf8) == 0xf0) \
				107	{ \
				108	Len = 4; \
				109	Mask = 0x07; \
				110	} \
				111	else if ((Char & 0xfc) == 0xf8) \
				112	{ \
				113	Len = 5; \
				114	Mask = 0x03; \
				115	} \
				116	else if ((Char & 0xfe) == 0xfc) \
				117	{ \
				118	Len = 6; \
				119	Mask = 0x01; \
				120	} \
				121	else \
				122	Len = -1;
				123
				124	#define UTF8_LENGTH(Char) \
				125	((Char) < 0x80 ? 1 : \
				126	((Char) < 0x800 ? 2 : \
				127	((Char) < 0x10000 ? 3 : \
				128	((Char) < 0x200000 ? 4 : \
				129	((Char) < 0x4000000 ? 5 : 6)))))
				130
				131
				132	#define UTF8_GET(Result, Chars, Count, Mask, Len) \
				133	(Result) = (Chars)[0] & (Mask); \
				134	for ((Count) = 1; (Count) < (Len); ++(Count)) \
				135	{ \
				136	if (((Chars)[(Count)] & 0xc0) != 0x80) \
				137	{ \
				138	(Result) = -1; \
				139	break; \
				140	} \
				141	(Result) <<= 6; \
				142	(Result) \|= ((Chars)[(Count)] & 0x3f); \
				143	}
				144
				145	#define UNICODE_VALID(Char) \
				146	((Char) < 0x110000 && \
				147	(((Char) & 0xFFFFF800) != 0xD800) && \
				148	((Char) < 0xFDD0 \|\| (Char) > 0xFDEF) && \
				149	((Char) & 0xFFFE) != 0xFFFE)
				150
				151
				152	static const gchar utf8_skip_data[256] = {
				153	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				154	1, 1, 1, 1, 1, 1, 1,
				155	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				156	1, 1, 1, 1, 1, 1, 1,
				157	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				158	1, 1, 1, 1, 1, 1, 1,
				159	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				160	1, 1, 1, 1, 1, 1, 1,
				161	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				162	1, 1, 1, 1, 1, 1, 1,
				163	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				164	1, 1, 1, 1, 1, 1, 1,
				165	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				166	2, 2, 2, 2, 2, 2, 2,
				167	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
				168	5, 5, 5, 6, 6, 1, 1
				169	};
				170
				171	const gchar *const g_utf8_skip = utf8_skip_data;
				172
				173	#define g_utf8_next_char(p) (char )((p) + g_utf8_skip[(guchar *)(p)])
				174
				175	/*
				176	* g_utf8_strlen:
				177	* @p: pointer to the start of a UTF-8 encoded string.
				178	* @max: the maximum number of bytes to examine. If @max
				179	* is less than 0, then the string is assumed to be
				180	* nul-terminated. If @max is 0, @p will not be examined and
				181	* may be %NULL.
				182	*
				183	* Returns the length of the string in characters.
				184	*
				185	* Return value: the length of the string in characters
				186	**/
				187	static glong
				188	g_utf8_strlen (const gchar * p, gssize max)
				189	{
				190	glong len = 0;
				191	const gchar *start = p;
				192	g_return_val_if_fail (p != NULL \|\| max == 0, 0);
				193
				194	if (max < 0)
				195	{
				196	while (*p)
				197	{
				198	p = g_utf8_next_char (p);
				199	++len;
				200	}
				201	}
				202	else
				203	{
				204	if (max == 0 \|\| !*p)
				205	return 0;
				206
				207	p = g_utf8_next_char (p);
				208
				209	while (p - start < max && *p)
				210	{
				211	++len;
				212	p = g_utf8_next_char (p);
				213	}
				214
				215	/* only do the last len increment if we got a complete
				216	* char (don't count partial chars)
				217	*/
				218	if (p - start == max)
				219	++len;
				220	}
				221
				222	return len;
				223	}
				224
				225	/*
				226	* g_utf8_get_char:
				227	* @p: a pointer to Unicode character encoded as UTF-8
				228	*
				229	* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
				230	* If @p does not point to a valid UTF-8 encoded character, results are
				231	* undefined. If you are not sure that the bytes are complete
				232	* valid Unicode characters, you should use g_utf8_get_char_validated()
				233	* instead.
				234	*
				235	* Return value: the resulting character
				236	**/
				237	static gunichar
				238	g_utf8_get_char (const gchar * p)
				239	{
				240	int i, mask = 0, len;
				241	gunichar result;
				242	unsigned char c = (unsigned char) *p;
				243
				244	UTF8_COMPUTE (c, mask, len);
				245	if (len == -1)
				246	return (gunichar) - 1;
				247	UTF8_GET (result, p, i, mask, len);
				248
				249	return result;
				250	}
				251
				252	/*
				253	* g_unichar_to_utf8:
				254	* @c: a ISO10646 character code
				255	* @outbuf: output buffer, must have at least 6 bytes of space.
				256	* If %NULL, the length will be computed and returned
				257	* and nothing will be written to @outbuf.
				258	*
				259	* Converts a single character to UTF-8.
				260	*
				261	* Return value: number of bytes written
				262	**/
				263	static int
				264	g_unichar_to_utf8 (gunichar c, gchar * outbuf)
				265	{
				266	guint len = 0;
				267	int first;
				268	int i;
				269
				270	if (c < 0x80)
				271	{
				272	first = 0;
				273	len = 1;
				274	}
				275	else if (c < 0x800)
				276	{
				277	first = 0xc0;
				278	len = 2;
				279	}
				280	else if (c < 0x10000)
				281	{
				282	first = 0xe0;
				283	len = 3;
				284	}
				285	else if (c < 0x200000)
				286	{
				287	first = 0xf0;
				288	len = 4;
				289	}
				290	else if (c < 0x4000000)
				291	{
				292	first = 0xf8;
				293	len = 5;
				294	}
				295	else
				296	{
				297	first = 0xfc;
				298	len = 6;
				299	}
				300
				301	if (outbuf)
				302	{
				303	for (i = len - 1; i > 0; --i)
				304	{
				305	outbuf[i] = (c & 0x3f) \| 0x80;
				306	c >>= 6;
				307	}
				308	outbuf[0] = c \| first;
				309	}
				310
				311	return len;
				312	}
				313
				314	/*
				315	* g_utf8_to_ucs4_fast:
				316	* @str: a UTF-8 encoded string
				317	* @len: the maximum length of @str to use. If @len < 0, then
				318	* the string is nul-terminated.
				319	* @items_written: location to store the number of characters in the
				320	* result, or %NULL.
				321	*
				322	* Convert a string from UTF-8 to a 32-bit fixed width
				323	* representation as UCS-4, assuming valid UTF-8 input.
				324	* This function is roughly twice as fast as g_utf8_to_ucs4()
				325	* but does no error checking on the input.
				326	*
				327	* Return value: a pointer to a newly allocated UCS-4 string.
				328	* This value must be freed with g_free().
				329	**/
				330	static gunichar *
				331	g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
				332	{
				333	gint j, charlen;
				334	gunichar *result;
				335	gint n_chars, i;
				336	const gchar *p;
				337
				338	g_return_val_if_fail (str != NULL, NULL);
				339
				340	p = str;
				341	n_chars = 0;
				342	if (len < 0)
				343	{
				344	while (*p)
				345	{
				346	p = g_utf8_next_char (p);
				347	++n_chars;
				348	}
				349	}
				350	else
				351	{
				352	while (p < str + len && *p)
				353	{
				354	p = g_utf8_next_char (p);
				355	++n_chars;
				356	}
				357	}
				358
				359	result = g_new (gunichar, n_chars + 1);
				360	if (!result)
				361	return NULL;
				362
				363	p = str;
				364	for (i = 0; i < n_chars; i++)
				365	{
				366	gunichar wc = ((unsigned char *) p)[0];
				367
				368	if (wc < 0x80)
				369	{
				370	result[i] = wc;
				371	p++;
				372	}
				373	else
				374	{
				375	if (wc < 0xe0)
				376	{
				377	charlen = 2;
				378	wc &= 0x1f;
				379	}
				380	else if (wc < 0xf0)
				381	{
				382	charlen = 3;
				383	wc &= 0x0f;
				384	}
				385	else if (wc < 0xf8)
				386	{
				387	charlen = 4;
				388	wc &= 0x07;
				389	}
				390	else if (wc < 0xfc)
				391	{
				392	charlen = 5;
				393	wc &= 0x03;
				394	}
				395	else
				396	{
				397	charlen = 6;
				398	wc &= 0x01;
				399	}
				400
				401	for (j = 1; j < charlen; j++)
				402	{
				403	wc <<= 6;
				404	wc \|= ((unsigned char *) p)[j] & 0x3f;
				405	}
				406
				407	result[i] = wc;
				408	p += charlen;
				409	}
				410	}
				411	result[i] = 0;
				412
				413	if (items_written)
				414	*items_written = i;
				415
				416	return result;
				417	}
				418
				419	/*
				420	* g_ucs4_to_utf8:
				421	* @str: a UCS-4 encoded string
				422	* @len: the maximum length of @str to use. If @len < 0, then
				423	* the string is terminated with a 0 character.
				424	* @items_read: location to store number of characters read read, or %NULL.
				425	* @items_written: location to store number of bytes written or %NULL.
				426	* The value here stored does not include the trailing 0
				427	* byte.
				428	* @error: location to store the error occuring, or %NULL to ignore
				429	* errors. Any of the errors in #GConvertError other than
				430	* %G_CONVERT_ERROR_NO_CONVERSION may occur.
				431	*
				432	* Convert a string from a 32-bit fixed width representation as UCS-4.
				433	* to UTF-8. The result will be terminated with a 0 byte.
				434	*
				435	* Return value: a pointer to a newly allocated UTF-8 string.
				436	* This value must be freed with g_free(). If an
				437	* error occurs, %NULL will be returned and
				438	* @error set.
				439	**/
				440	static gchar *
				441	g_ucs4_to_utf8 (const gunichar * str,
				442	glong len,
				443	glong * items_read, glong * items_written, GError ** error)
				444	{
				445	gint result_length;
				446	gchar *result = NULL;
				447	gchar *p;
				448	gint i;
				449
				450	result_length = 0;
				451	for (i = 0; len < 0 \|\| i < len; i++)
				452	{
				453	if (!str[i])
				454	break;
				455
				456	if (str[i] >= 0x80000000)
				457	{
				458	if (items_read)
				459	*items_read = i;
				460
				461	g_set_error (error, G_CONVERT_ERROR,
				462	G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
				463	_("Character out of range for UTF-8"));
				464	goto err_out;
				465	}
				466
				467	result_length += UTF8_LENGTH (str[i]);
				468	}
				469
				470	result = g_malloc (result_length + 1);
				471	if (!result)
				472	return NULL;
				473	p = result;
				474
				475	i = 0;
				476	while (p < result + result_length)
				477	p += g_unichar_to_utf8 (str[i++], p);
				478
				479	*p = '\0';
				480
				481	if (items_written)
				482	*items_written = p - result;
				483
				484	err_out:
				485	if (items_read)
				486	*items_read = i;
				487
				488	return result;
				489	}
				490
				491	/* Code from GLIB gunidecomp.c starts here. */
				492
				493	#include "gunidecomp.h"
				494	#include "gunicomp.h"
				495
				496	#define CC_PART1(Page, Char) \
				497	((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
				498	? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
				499	: (cclass_data[combining_class_table_part1[Page]][Char]))
				500
				501	#define CC_PART2(Page, Char) \
				502	((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
				503	? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
				504	: (cclass_data[combining_class_table_part2[Page]][Char]))
				505
				506	#define COMBINING_CLASS(Char) \
				507	(((Char) <= G_UNICODE_LAST_CHAR_PART1) \
				508	? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
				509	: (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
				510	? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
				511	: 0))
				512
				513	/* constants for hangul syllable [de]composition */
				514	#define SBase 0xAC00
				515	#define LBase 0x1100
				516	#define VBase 0x1161
				517	#define TBase 0x11A7
				518	#define LCount 19
				519	#define VCount 21
				520	#define TCount 28
				521	#define NCount (VCount * TCount)
				522	#define SCount (LCount * NCount)
				523
				524	/*
				525	* g_unicode_canonical_ordering:
				526	* @string: a UCS-4 encoded string.
				527	* @len: the maximum length of @string to use.
				528	*
				529	* Computes the canonical ordering of a string in-place.
				530	* This rearranges decomposed characters in the string
				531	* according to their combining classes. See the Unicode
				532	* manual for more information.
				533	**/
				534	static void
				535	g_unicode_canonical_ordering (gunichar * string, gsize len)
				536	{
				537	gsize i;
				538	int swap = 1;
				539
				540	while (swap)
				541	{
				542	int last;
				543	swap = 0;
				544	last = COMBINING_CLASS (string[0]);
				545	for (i = 0; i < len - 1; ++i)
				546	{
				547	int next = COMBINING_CLASS (string[i + 1]);
				548	if (next != 0 && last > next)
				549	{
				550	gsize j;
				551	/* Percolate item leftward through string. */
				552	for (j = i + 1; j > 0; --j)
				553	{
				554	gunichar t;
				555	if (COMBINING_CLASS (string[j - 1]) <= next)
				556	break;
				557	t = string[j];
				558	string[j] = string[j - 1];
				559	string[j - 1] = t;
				560	swap = 1;
				561	}
				562	/* We're re-entering the loop looking at the old
				563	character again. */
				564	next = last;
				565	}
				566	last = next;
				567	}
				568	}
				569	}
				570
				571	/* http://www.unicode.org/unicode/reports/tr15/#Hangul
				572	* r should be null or have sufficient space. Calling with r == NULL will
				573	* only calculate the result_len; however, a buffer with space for three
				574	* characters will always be big enough. */
				575	static void
				576	decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
				577	{
				578	gint SIndex = s - SBase;
				579
				580	/* not a hangul syllable */
				581	if (SIndex < 0 \|\| SIndex >= SCount)
				582	{
				583	if (r)
				584	r[0] = s;
				585	*result_len = 1;
				586	}
				587	else
				588	{
				589	gunichar L = LBase + SIndex / NCount;
				590	gunichar V = VBase + (SIndex % NCount) / TCount;
				591	gunichar T = TBase + SIndex % TCount;
				592
				593	if (r)
				594	{
				595	r[0] = L;
				596	r[1] = V;
				597	}
				598
				599	if (T != TBase)
				600	{
				601	if (r)
				602	r[2] = T;
				603	*result_len = 3;
				604	}
				605	else
				606	*result_len = 2;
				607	}
				608	}
				609
				610	/* returns a pointer to a null-terminated UTF-8 string */
				611	static const gchar *
				612	find_decomposition (gunichar ch, gboolean compat)
				613	{
				614	int start = 0;
				615	int end = G_N_ELEMENTS (decomp_table);
				616
				617	if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
				618	{
				619	while (TRUE)
				620	{
				621	int half = (start + end) / 2;
				622	if (ch == decomp_table[half].ch)
				623	{
				624	int offset;
				625
				626	if (compat)
				627	{
				628	offset = decomp_table[half].compat_offset;
				629	if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
				630	offset = decomp_table[half].canon_offset;
				631	}
				632	else
				633	{
				634	offset = decomp_table[half].canon_offset;
				635	if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
				636	return NULL;
				637	}
				638
				639	return &(decomp_expansion_string[offset]);
				640	}
				641	else if (half == start)
				642	break;
				643	else if (ch > decomp_table[half].ch)
				644	start = half;
				645	else
				646	end = half;
				647	}
				648	}
				649
				650	return NULL;
				651	}
				652
				653	/* L,V => LV and LV,T => LVT */
				654	static gboolean
				655	combine_hangul (gunichar a, gunichar b, gunichar * result)
				656	{
				657	gint LIndex = a - LBase;
				658	gint SIndex = a - SBase;
				659
				660	gint VIndex = b - VBase;
				661	gint TIndex = b - TBase;
				662
				663	if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
				664	{
				665	result = SBase + (LIndex VCount + VIndex) * TCount;
				666	return TRUE;
				667	}
				668	else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
				669	&& 0 <= TIndex && TIndex <= TCount)
				670	{
				671	*result = a + TIndex;
				672	return TRUE;
				673	}
				674
				675	return FALSE;
				676	}
				677
				678	#define CI(Page, Char) \
				679	((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
				680	? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
				681	: (compose_data[compose_table[Page]][Char]))
				682
				683	#define COMPOSE_INDEX(Char) \
				684	((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
				685
				686	static gboolean
				687	combine (gunichar a, gunichar b, gunichar * result)
				688	{
				689	gushort index_a, index_b;
				690
				691	if (combine_hangul (a, b, result))
				692	return TRUE;
				693
				694	index_a = COMPOSE_INDEX (a);
				695
				696	if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
				697	{
				698	if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
				699	{
				700	*result =
				701	compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
				702	return TRUE;
				703	}
				704	else
				705	return FALSE;
				706	}
				707
				708	index_b = COMPOSE_INDEX (b);
				709
				710	if (index_b >= COMPOSE_SECOND_SINGLE_START)
				711	{
				712	if (a ==
				713	compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
				714	{
				715	*result =
				716	compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
				717	return TRUE;
				718	}
				719	else
				720	return FALSE;
				721	}
				722
				723	if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
				724	&& index_b >= COMPOSE_SECOND_START
				725	&& index_b < COMPOSE_SECOND_SINGLE_START)
				726	{
				727	gunichar res =
				728	compose_array[index_a - COMPOSE_FIRST_START][index_b -
				729	COMPOSE_SECOND_START];
				730
				731	if (res)
				732	{
				733	*result = res;
				734	return TRUE;
				735	}
				736	}
				737
				738	return FALSE;
				739	}
				740
				741	static gunichar *
				742	_g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
				743	{
				744	gsize n_wc;
				745	gunichar *wc_buffer;
				746	const char *p;
				747	gsize last_start;
				748	gboolean do_compat = (mode == G_NORMALIZE_NFKC \|\| mode == G_NORMALIZE_NFKD);
				749	gboolean do_compose = (mode == G_NORMALIZE_NFC \|\| mode == G_NORMALIZE_NFKC);
				750
				751	n_wc = 0;
				752	p = str;
				753	while ((max_len < 0 \|\| p < str + max_len) && *p)
				754	{
				755	const gchar *decomp;
				756	gunichar wc = g_utf8_get_char (p);
				757
				758	if (wc >= 0xac00 && wc <= 0xd7af)
				759	{
				760	gsize result_len;
				761	decompose_hangul (wc, NULL, &result_len);
				762	n_wc += result_len;
				763	}
				764	else
				765	{
				766	decomp = find_decomposition (wc, do_compat);
				767
				768	if (decomp)
				769	n_wc += g_utf8_strlen (decomp, -1);
				770	else
				771	n_wc++;
				772	}
				773
				774	p = g_utf8_next_char (p);
				775	}
				776
				777	wc_buffer = g_new (gunichar, n_wc + 1);
				778	if (!wc_buffer)
				779	return NULL;
				780
				781	last_start = 0;
				782	n_wc = 0;
				783	p = str;
				784	while ((max_len < 0 \|\| p < str + max_len) && *p)
				785	{
				786	gunichar wc = g_utf8_get_char (p);
				787	const gchar *decomp;
				788	int cc;
				789	gsize old_n_wc = n_wc;
				790
				791	if (wc >= 0xac00 && wc <= 0xd7af)
				792	{
				793	gsize result_len;
				794	decompose_hangul (wc, wc_buffer + n_wc, &result_len);
				795	n_wc += result_len;
				796	}
				797	else
				798	{
				799	decomp = find_decomposition (wc, do_compat);
				800
				801	if (decomp)
				802	{
				803	const char *pd;
				804	for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
				805	wc_buffer[n_wc++] = g_utf8_get_char (pd);
				806	}
				807	else
				808	wc_buffer[n_wc++] = wc;
				809	}
				810
				811	if (n_wc > 0)
				812	{
				813	cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
				814
				815	if (cc == 0)
				816	{
				817	g_unicode_canonical_ordering (wc_buffer + last_start,
				818	n_wc - last_start);
				819	last_start = old_n_wc;
				820	}
				821	}
				822
				823	p = g_utf8_next_char (p);
				824	}
				825
				826	if (n_wc > 0)
				827	{
				828	g_unicode_canonical_ordering (wc_buffer + last_start,
				829	n_wc - last_start);
				830	last_start = n_wc;
				831	}
				832
				833	wc_buffer[n_wc] = 0;
				834
				835	/* All decomposed and reordered */
				836
				837	if (do_compose && n_wc > 0)
				838	{
				839	gsize i, j;
				840	int last_cc = 0;
				841	last_start = 0;
				842
				843	for (i = 0; i < n_wc; i++)
				844	{
				845	int cc = COMBINING_CLASS (wc_buffer[i]);
				846
				847	if (i > 0 &&
				848	(last_cc == 0 \|\| last_cc != cc) &&
				849	combine (wc_buffer[last_start], wc_buffer[i],
				850	&wc_buffer[last_start]))
				851	{
				852	for (j = i + 1; j < n_wc; j++)
				853	wc_buffer[j - 1] = wc_buffer[j];
				854	n_wc--;
				855	i--;
				856
				857	if (i == last_start)
				858	last_cc = 0;
				859	else
				860	last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
				861
				862	continue;
				863	}
				864
				865	if (cc == 0)
				866	last_start = i;
				867
				868	last_cc = cc;
				869	}
				870	}
				871
				872	wc_buffer[n_wc] = 0;
				873
				874	return wc_buffer;
				875	}
				876
				877	/*
				878	* g_utf8_normalize:
				879	* @str: a UTF-8 encoded string.
				880	* @len: length of @str, in bytes, or -1 if @str is nul-terminated.
				881	* @mode: the type of normalization to perform.
				882	*
				883	* Converts a string into canonical form, standardizing
				884	* such issues as whether a character with an accent
				885	* is represented as a base character and combining
				886	* accent or as a single precomposed character. You
				887	* should generally call g_utf8_normalize() before
				888	* comparing two Unicode strings.
				889	*
				890	* The normalization mode %G_NORMALIZE_DEFAULT only
				891	* standardizes differences that do not affect the
				892	* text content, such as the above-mentioned accent
				893	* representation. %G_NORMALIZE_ALL also standardizes
				894	* the "compatibility" characters in Unicode, such
				895	* as SUPERSCRIPT THREE to the standard forms
				896	* (in this case DIGIT THREE). Formatting information
				897	* may be lost but for most text operations such
				898	* characters should be considered the same.
				899	* For example, g_utf8_collate() normalizes
				900	* with %G_NORMALIZE_ALL as its first step.
				901	*
				902	* %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
				903	* are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
				904	* but returned a result with composed forms rather
				905	* than a maximally decomposed form. This is often
				906	* useful if you intend to convert the string to
				907	* a legacy encoding or pass it to a system with
				908	* less capable Unicode handling.
				909	*
				910	* Return value: a newly allocated string, that is the
				911	* normalized form of @str.
				912	**/
				913	static gchar *
				914	g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
				915	{
				916	gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
				917	gchar *result;
				918
				919	result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
				920	g_free (result_wc);
				921
				922	return result;
				923	}
				924
				925	/* Public Libidn API starts here. */
				926
				927	/**
				928	* stringprep_utf8_to_unichar:
				929	* @p: a pointer to Unicode character encoded as UTF-8
				930	*
				931	* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
				932	* If @p does not point to a valid UTF-8 encoded character, results are
				933	* undefined.
				934	*
				935	* Return value: the resulting character.
				936	**/
				937	uint32_t
				938	stringprep_utf8_to_unichar (const char *p)
				939	{
				940	return g_utf8_get_char (p);
				941	}
				942
				943	/**
				944	* stringprep_unichar_to_utf8:
				945	* @c: a ISO10646 character code
				946	* @outbuf: output buffer, must have at least 6 bytes of space.
				947	* If %NULL, the length will be computed and returned
				948	* and nothing will be written to @outbuf.
				949	*
				950	* Converts a single character to UTF-8.
				951	*
				952	* Return value: number of bytes written.
				953	**/
				954	int
				955	stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
				956	{
				957	return g_unichar_to_utf8 (c, outbuf);
				958	}
				959
				960	/**
				961	* stringprep_utf8_to_ucs4:
				962	* @str: a UTF-8 encoded string
				963	* @len: the maximum length of @str to use. If @len < 0, then
				964	* the string is nul-terminated.
				965	* @items_written: location to store the number of characters in the
				966	* result, or %NULL.
				967	*
				968	* Convert a string from UTF-8 to a 32-bit fixed width
				969	* representation as UCS-4, assuming valid UTF-8 input.
				970	* This function does no error checking on the input.
				971	*
				972	* Return value: a pointer to a newly allocated UCS-4 string.
				973	* This value must be freed with free().
				974	**/
				975	uint32_t *
				976	stringprep_utf8_to_ucs4 (const char str, ssize_t len, size_t items_written)
				977	{
				978	return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
				979	}
				980
				981	/**
				982	* stringprep_ucs4_to_utf8:
				983	* @str: a UCS-4 encoded string
				984	* @len: the maximum length of @str to use. If @len < 0, then
				985	* the string is terminated with a 0 character.
				986	* @items_read: location to store number of characters read read, or %NULL.
				987	* @items_written: location to store number of bytes written or %NULL.
				988	* The value here stored does not include the trailing 0
				989	* byte.
				990	*
				991	* Convert a string from a 32-bit fixed width representation as UCS-4.
				992	* to UTF-8. The result will be terminated with a 0 byte.
				993	*
				994	* Return value: a pointer to a newly allocated UTF-8 string.
				995	* This value must be freed with free(). If an
				996	* error occurs, %NULL will be returned and
				997	* @error set.
				998	**/
				999	char *
				1000	stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
				1001	size_t * items_read, size_t * items_written)
				1002	{
				1003	return g_ucs4_to_utf8 (str, len, (glong *) items_read,
				1004	(glong *) items_written, NULL);
				1005	}
				1006
				1007	/**
				1008	* stringprep_utf8_nfkc_normalize:
				1009	* @str: a UTF-8 encoded string.
				1010	* @len: length of @str, in bytes, or -1 if @str is nul-terminated.
				1011	*
				1012	* Converts a string into canonical form, standardizing
				1013	* such issues as whether a character with an accent
				1014	* is represented as a base character and combining
				1015	* accent or as a single precomposed character.
				1016	*
				1017	* The normalization mode is NFKC (ALL COMPOSE). It standardizes
				1018	* differences that do not affect the text content, such as the
				1019	* above-mentioned accent representation. It standardizes the
				1020	* "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
				1021	* the standard forms (in this case DIGIT THREE). Formatting
				1022	* information may be lost but for most text operations such
				1023	* characters should be considered the same. It returns a result with
				1024	* composed forms rather than a maximally decomposed form.
				1025	*
				1026	* Return value: a newly allocated string, that is the
				1027	* NFKC normalized form of @str.
				1028	**/
				1029	char *
				1030	stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
				1031	{
				1032	return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
				1033	}
				1034
				1035	/**
				1036	* stringprep_ucs4_nfkc_normalize:
				1037	* @str: a Unicode string.
				1038	* @len: length of @str array, or -1 if @str is nul-terminated.
				1039	*
				1040	* Converts UCS4 string into UTF-8 and runs
				1041	* stringprep_utf8_nfkc_normalize().
				1042	*
				1043	* Return value: a newly allocated Unicode string, that is the NFKC
				1044	* normalized form of @str.
				1045	**/
				1046	uint32_t *
				1047	stringprep_ucs4_nfkc_normalize (uint32_t * str, ssize_t len)
				1048	{
				1049	char *p;
				1050	uint32_t *result_wc;
				1051
				1052	p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
				1053	result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
				1054	free (p);
				1055
				1056	return result_wc;
				1057	}