| /* | 
 |  * Copyright (c) 2014 SGI. | 
 |  * All rights reserved. | 
 |  * | 
 |  * This program is free software; you can redistribute it and/or | 
 |  * modify it under the terms of the GNU General Public License as | 
 |  * published by the Free Software Foundation. | 
 |  * | 
 |  * This program is distributed in the hope that it would be useful, | 
 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of | 
 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
 |  * GNU General Public License for more details. | 
 |  * | 
 |  * You should have received a copy of the GNU General Public License | 
 |  * along with this program; if not, write the Free Software Foundation, | 
 |  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA | 
 |  */ | 
 |  | 
 | /* Generator for a compact trie for unicode normalization */ | 
 |  | 
 | #include <sys/types.h> | 
 | #include <stddef.h> | 
 | #include <stdlib.h> | 
 | #include <stdio.h> | 
 | #include <assert.h> | 
 | #include <string.h> | 
 | #include <unistd.h> | 
 | #include <errno.h> | 
 |  | 
 | /* Default names of the in- and output files. */ | 
 |  | 
 | #define AGE_NAME	"DerivedAge.txt" | 
 | #define CCC_NAME	"DerivedCombiningClass.txt" | 
 | #define PROP_NAME	"DerivedCoreProperties.txt" | 
 | #define DATA_NAME	"UnicodeData.txt" | 
 | #define FOLD_NAME	"CaseFolding.txt" | 
 | #define NORM_NAME	"NormalizationCorrections.txt" | 
 | #define TEST_NAME	"NormalizationTest.txt" | 
 | #define UTF8_NAME	"utf8data.h" | 
 |  | 
 | const char	*age_name  = AGE_NAME; | 
 | const char	*ccc_name  = CCC_NAME; | 
 | const char	*prop_name = PROP_NAME; | 
 | const char	*data_name = DATA_NAME; | 
 | const char	*fold_name = FOLD_NAME; | 
 | const char	*norm_name = NORM_NAME; | 
 | const char	*test_name = TEST_NAME; | 
 | const char	*utf8_name = UTF8_NAME; | 
 |  | 
 | int verbose = 0; | 
 |  | 
 | /* An arbitrary line size limit on input lines. */ | 
 |  | 
 | #define LINESIZE	1024 | 
 | char line[LINESIZE]; | 
 | char buf0[LINESIZE]; | 
 | char buf1[LINESIZE]; | 
 | char buf2[LINESIZE]; | 
 | char buf3[LINESIZE]; | 
 |  | 
 | const char *argv0; | 
 |  | 
 | #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) | 
 |  | 
 | /* ------------------------------------------------------------------ */ | 
 |  | 
 | /* | 
 |  * Unicode version numbers consist of three parts: major, minor, and a | 
 |  * revision.  These numbers are packed into an unsigned int to obtain | 
 |  * a single version number. | 
 |  * | 
 |  * To save space in the generated trie, the unicode version is not | 
 |  * stored directly, instead we calculate a generation number from the | 
 |  * unicode versions seen in the DerivedAge file, and use that as an | 
 |  * index into a table of unicode versions. | 
 |  */ | 
 | #define UNICODE_MAJ_SHIFT		(16) | 
 | #define UNICODE_MIN_SHIFT		(8) | 
 |  | 
 | #define UNICODE_MAJ_MAX			((unsigned short)-1) | 
 | #define UNICODE_MIN_MAX			((unsigned char)-1) | 
 | #define UNICODE_REV_MAX			((unsigned char)-1) | 
 |  | 
 | #define UNICODE_AGE(MAJ,MIN,REV)			\ | 
 | 	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\ | 
 | 	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\ | 
 | 	 ((unsigned int)(REV))) | 
 |  | 
 | unsigned int *ages; | 
 | int ages_count; | 
 |  | 
 | unsigned int unicode_maxage; | 
 |  | 
 | static int age_valid(unsigned int major, unsigned int minor, | 
 | 		     unsigned int revision) | 
 | { | 
 | 	if (major > UNICODE_MAJ_MAX) | 
 | 		return 0; | 
 | 	if (minor > UNICODE_MIN_MAX) | 
 | 		return 0; | 
 | 	if (revision > UNICODE_REV_MAX) | 
 | 		return 0; | 
 | 	return 1; | 
 | } | 
 |  | 
 | /* ------------------------------------------------------------------ */ | 
 |  | 
 | /* | 
 |  * utf8trie_t | 
 |  * | 
 |  * A compact binary tree, used to decode UTF-8 characters. | 
 |  * | 
 |  * Internal nodes are one byte for the node itself, and up to three | 
 |  * bytes for an offset into the tree.  The first byte contains the | 
 |  * following information: | 
 |  *  NEXTBYTE  - flag        - advance to next byte if set | 
 |  *  BITNUM    - 3 bit field - the bit number to tested | 
 |  *  OFFLEN    - 2 bit field - number of bytes in the offset | 
 |  * if offlen == 0 (non-branching node) | 
 |  *  RIGHTPATH - 1 bit field - set if the following node is for the | 
 |  *                            right-hand path (tested bit is set) | 
 |  *  TRIENODE  - 1 bit field - set if the following node is an internal | 
 |  *                            node, otherwise it is a leaf node | 
 |  * if offlen != 0 (branching node) | 
 |  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal | 
 |  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal | 
 |  * | 
 |  * Due to the way utf8 works, there cannot be branching nodes with | 
 |  * NEXTBYTE set, and moreover those nodes always have a righthand | 
 |  * descendant. | 
 |  */ | 
 | typedef unsigned char utf8trie_t; | 
 | #define BITNUM		0x07 | 
 | #define NEXTBYTE	0x08 | 
 | #define OFFLEN		0x30 | 
 | #define OFFLEN_SHIFT	4 | 
 | #define RIGHTPATH	0x40 | 
 | #define TRIENODE	0x80 | 
 | #define RIGHTNODE	0x40 | 
 | #define LEFTNODE	0x80 | 
 |  | 
 | /* | 
 |  * utf8leaf_t | 
 |  * | 
 |  * The leaves of the trie are embedded in the trie, and so the same | 
 |  * underlying datatype, unsigned char. | 
 |  * | 
 |  * leaf[0]: The unicode version, stored as a generation number that is | 
 |  *          an index into utf8agetab[].  With this we can filter code | 
 |  *          points based on the unicode version in which they were | 
 |  *          defined.  The CCC of a non-defined code point is 0. | 
 |  * leaf[1]: Canonical Combining Class. During normalization, we need | 
 |  *          to do a stable sort into ascending order of all characters | 
 |  *          with a non-zero CCC that occur between two characters with | 
 |  *          a CCC of 0, or at the begin or end of a string. | 
 |  *          The unicode standard guarantees that all CCC values are | 
 |  *          between 0 and 254 inclusive, which leaves 255 available as | 
 |  *          a special value. | 
 |  *          Code points with CCC 0 are known as stoppers. | 
 |  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the | 
 |  *          start of a NUL-terminated string that is the decomposition | 
 |  *          of the character. | 
 |  *          The CCC of a decomposable character is the same as the CCC | 
 |  *          of the first character of its decomposition. | 
 |  *          Some characters decompose as the empty string: these are | 
 |  *          characters with the Default_Ignorable_Code_Point property. | 
 |  *          These do affect normalization, as they all have CCC 0. | 
 |  * | 
 |  * The decompositions in the trie have been fully expanded. | 
 |  * | 
 |  * Casefolding, if applicable, is also done using decompositions. | 
 |  */ | 
 | typedef unsigned char utf8leaf_t; | 
 |  | 
 | #define LEAF_GEN(LEAF)	((LEAF)[0]) | 
 | #define LEAF_CCC(LEAF)	((LEAF)[1]) | 
 | #define LEAF_STR(LEAF)	((const char*)((LEAF) + 2)) | 
 |  | 
 | #define MAXGEN		(255) | 
 |  | 
 | #define MINCCC		(0) | 
 | #define MAXCCC		(254) | 
 | #define STOPPER		(0) | 
 | #define DECOMPOSE	(255) | 
 | #define HANGUL		((char)(255)) | 
 |  | 
 | #define UTF8HANGULLEAF	(12) | 
 |  | 
 | struct tree; | 
 | static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *, | 
 | 			       const char *, size_t); | 
 | static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *); | 
 |  | 
 | unsigned char *utf8data; | 
 | size_t utf8data_size; | 
 |  | 
 | utf8trie_t *nfdi; | 
 | utf8trie_t *nfdicf; | 
 |  | 
 | /* ------------------------------------------------------------------ */ | 
 |  | 
 | /* | 
 |  * UTF8 valid ranges. | 
 |  * | 
 |  * The UTF-8 encoding spreads the bits of a 32bit word over several | 
 |  * bytes. This table gives the ranges that can be held and how they'd | 
 |  * be represented. | 
 |  * | 
 |  * 0x00000000 0x0000007F: 0xxxxxxx | 
 |  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx | 
 |  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx | 
 |  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | 
 |  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx | 
 |  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx | 
 |  * | 
 |  * There is an additional requirement on UTF-8, in that only the | 
 |  * shortest representation of a 32bit value is to be used.  A decoder | 
 |  * must not decode sequences that do not satisfy this requirement. | 
 |  * Thus the allowed ranges have a lower bound. | 
 |  * | 
 |  * 0x00000000 0x0000007F: 0xxxxxxx | 
 |  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx | 
 |  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx | 
 |  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | 
 |  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx | 
 |  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx | 
 |  * | 
 |  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, | 
 |  * 17 planes of 65536 values.  This limits the sequences actually seen | 
 |  * even more, to just the following. | 
 |  * | 
 |  *          0 -     0x7f: 0                     0x7f | 
 |  *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf | 
 |  *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf | 
 |  *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf | 
 |  * | 
 |  * Even within those ranges not all values are allowed: the surrogates | 
 |  * 0xd800 - 0xdfff should never be seen. | 
 |  * | 
 |  * Note that the longest sequence seen with valid usage is 4 bytes, | 
 |  * the same a single UTF-32 character.  This makes the UTF-8 | 
 |  * representation of Unicode strictly smaller than UTF-32. | 
 |  * | 
 |  * The shortest sequence requirement was introduced by: | 
 |  *    Corrigendum #1: UTF-8 Shortest Form | 
 |  * It can be found here: | 
 |  *    http://www.unicode.org/versions/corrigendum1.html | 
 |  * | 
 |  */ | 
 |  | 
 | #define UTF8_2_BITS     0xC0 | 
 | #define UTF8_3_BITS     0xE0 | 
 | #define UTF8_4_BITS     0xF0 | 
 | #define UTF8_N_BITS     0x80 | 
 | #define UTF8_2_MASK     0xE0 | 
 | #define UTF8_3_MASK     0xF0 | 
 | #define UTF8_4_MASK     0xF8 | 
 | #define UTF8_N_MASK     0xC0 | 
 | #define UTF8_V_MASK     0x3F | 
 | #define UTF8_V_SHIFT    6 | 
 |  | 
 | static int utf8encode(char *str, unsigned int val) | 
 | { | 
 | 	int len; | 
 |  | 
 | 	if (val < 0x80) { | 
 | 		str[0] = val; | 
 | 		len = 1; | 
 | 	} else if (val < 0x800) { | 
 | 		str[1] = val & UTF8_V_MASK; | 
 | 		str[1] |= UTF8_N_BITS; | 
 | 		val >>= UTF8_V_SHIFT; | 
 | 		str[0] = val; | 
 | 		str[0] |= UTF8_2_BITS; | 
 | 		len = 2; | 
 | 	} else if (val < 0x10000) { | 
 | 		str[2] = val & UTF8_V_MASK; | 
 | 		str[2] |= UTF8_N_BITS; | 
 | 		val >>= UTF8_V_SHIFT; | 
 | 		str[1] = val & UTF8_V_MASK; | 
 | 		str[1] |= UTF8_N_BITS; | 
 | 		val >>= UTF8_V_SHIFT; | 
 | 		str[0] = val; | 
 | 		str[0] |= UTF8_3_BITS; | 
 | 		len = 3; | 
 | 	} else if (val < 0x110000) { | 
 | 		str[3] = val & UTF8_V_MASK; | 
 | 		str[3] |= UTF8_N_BITS; | 
 | 		val >>= UTF8_V_SHIFT; | 
 | 		str[2] = val & UTF8_V_MASK; | 
 | 		str[2] |= UTF8_N_BITS; | 
 | 		val >>= UTF8_V_SHIFT; | 
 | 		str[1] = val & UTF8_V_MASK; | 
 | 		str[1] |= UTF8_N_BITS; | 
 | 		val >>= UTF8_V_SHIFT; | 
 | 		str[0] = val; | 
 | 		str[0] |= UTF8_4_BITS; | 
 | 		len = 4; | 
 | 	} else { | 
 | 		printf("%#x: illegal val\n", val); | 
 | 		len = 0; | 
 | 	} | 
 | 	return len; | 
 | } | 
 |  | 
 | static unsigned int utf8decode(const char *str) | 
 | { | 
 | 	const unsigned char *s = (const unsigned char*)str; | 
 | 	unsigned int unichar = 0; | 
 |  | 
 | 	if (*s < 0x80) { | 
 | 		unichar = *s; | 
 | 	} else if (*s < UTF8_3_BITS) { | 
 | 		unichar = *s++ & 0x1F; | 
 | 		unichar <<= UTF8_V_SHIFT; | 
 | 		unichar |= *s & 0x3F; | 
 | 	} else if (*s < UTF8_4_BITS) { | 
 | 		unichar = *s++ & 0x0F; | 
 | 		unichar <<= UTF8_V_SHIFT; | 
 | 		unichar |= *s++ & 0x3F; | 
 | 		unichar <<= UTF8_V_SHIFT; | 
 | 		unichar |= *s & 0x3F; | 
 | 	} else { | 
 | 		unichar = *s++ & 0x0F; | 
 | 		unichar <<= UTF8_V_SHIFT; | 
 | 		unichar |= *s++ & 0x3F; | 
 | 		unichar <<= UTF8_V_SHIFT; | 
 | 		unichar |= *s++ & 0x3F; | 
 | 		unichar <<= UTF8_V_SHIFT; | 
 | 		unichar |= *s & 0x3F; | 
 | 	} | 
 | 	return unichar; | 
 | } | 
 |  | 
 | static int utf32valid(unsigned int unichar) | 
 | { | 
 | 	return unichar < 0x110000; | 
 | } | 
 |  | 
 | #define HANGUL_SYLLABLE(U)	((U) >= 0xAC00 && (U) <= 0xD7A3) | 
 |  | 
 | #define NODE 1 | 
 | #define LEAF 0 | 
 |  | 
 | struct tree { | 
 | 	void *root; | 
 | 	int childnode; | 
 | 	const char *type; | 
 | 	unsigned int maxage; | 
 | 	struct tree *next; | 
 | 	int (*leaf_equal)(void *, void *); | 
 | 	void (*leaf_print)(void *, int); | 
 | 	int (*leaf_mark)(void *); | 
 | 	int (*leaf_size)(void *); | 
 | 	int *(*leaf_index)(struct tree *, void *); | 
 | 	unsigned char *(*leaf_emit)(void *, unsigned char *); | 
 | 	int leafindex[0x110000]; | 
 | 	int index; | 
 | }; | 
 |  | 
 | struct node { | 
 | 	int index; | 
 | 	int offset; | 
 | 	int mark; | 
 | 	int size; | 
 | 	struct node *parent; | 
 | 	void *left; | 
 | 	void *right; | 
 | 	unsigned char bitnum; | 
 | 	unsigned char nextbyte; | 
 | 	unsigned char leftnode; | 
 | 	unsigned char rightnode; | 
 | 	unsigned int keybits; | 
 | 	unsigned int keymask; | 
 | }; | 
 |  | 
 | /* | 
 |  * Example lookup function for a tree. | 
 |  */ | 
 | static void *lookup(struct tree *tree, const char *key) | 
 | { | 
 | 	struct node *node; | 
 | 	void *leaf = NULL; | 
 |  | 
 | 	node = tree->root; | 
 | 	while (!leaf && node) { | 
 | 		if (node->nextbyte) | 
 | 			key++; | 
 | 		if (*key & (1 << (node->bitnum & 7))) { | 
 | 			/* Right leg */ | 
 | 			if (node->rightnode == NODE) { | 
 | 				node = node->right; | 
 | 			} else if (node->rightnode == LEAF) { | 
 | 				leaf = node->right; | 
 | 			} else { | 
 | 				node = NULL; | 
 | 			} | 
 | 		} else { | 
 | 			/* Left leg */ | 
 | 			if (node->leftnode == NODE) { | 
 | 				node = node->left; | 
 | 			} else if (node->leftnode == LEAF) { | 
 | 				leaf = node->left; | 
 | 			} else { | 
 | 				node = NULL; | 
 | 			} | 
 | 		} | 
 | 	} | 
 |  | 
 | 	return leaf; | 
 | } | 
 |  | 
 | /* | 
 |  * A simple non-recursive tree walker: keep track of visits to the | 
 |  * left and right branches in the leftmask and rightmask. | 
 |  */ | 
 | static void tree_walk(struct tree *tree) | 
 | { | 
 | 	struct node *node; | 
 | 	unsigned int leftmask; | 
 | 	unsigned int rightmask; | 
 | 	unsigned int bitmask; | 
 | 	int indent = 1; | 
 | 	int nodes, singletons, leaves; | 
 |  | 
 | 	nodes = singletons = leaves = 0; | 
 |  | 
 | 	printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root); | 
 | 	if (tree->childnode == LEAF) { | 
 | 		assert(tree->root); | 
 | 		tree->leaf_print(tree->root, indent); | 
 | 		leaves = 1; | 
 | 	} else { | 
 | 		assert(tree->childnode == NODE); | 
 | 		node = tree->root; | 
 | 		leftmask = rightmask = 0; | 
 | 		while (node) { | 
 | 			printf("%*snode @ %p bitnum %d nextbyte %d" | 
 | 			       " left %p right %p mask %x bits %x\n", | 
 | 				indent, "", node, | 
 | 				node->bitnum, node->nextbyte, | 
 | 				node->left, node->right, | 
 | 				node->keymask, node->keybits); | 
 | 			nodes += 1; | 
 | 			if (!(node->left && node->right)) | 
 | 				singletons += 1; | 
 |  | 
 | 			while (node) { | 
 | 				bitmask = 1 << node->bitnum; | 
 | 				if ((leftmask & bitmask) == 0) { | 
 | 					leftmask |= bitmask; | 
 | 					if (node->leftnode == LEAF) { | 
 | 						assert(node->left); | 
 | 						tree->leaf_print(node->left, | 
 | 								 indent+1); | 
 | 						leaves += 1; | 
 | 					} else if (node->left) { | 
 | 						assert(node->leftnode == NODE); | 
 | 						indent += 1; | 
 | 						node = node->left; | 
 | 						break; | 
 | 					} | 
 | 				} | 
 | 				if ((rightmask & bitmask) == 0) { | 
 | 					rightmask |= bitmask; | 
 | 					if (node->rightnode == LEAF) { | 
 | 						assert(node->right); | 
 | 						tree->leaf_print(node->right, | 
 | 								 indent+1); | 
 | 						leaves += 1; | 
 | 					} else if (node->right) { | 
 | 						assert(node->rightnode == NODE); | 
 | 						indent += 1; | 
 | 						node = node->right; | 
 | 						break; | 
 | 					} | 
 | 				} | 
 | 				leftmask &= ~bitmask; | 
 | 				rightmask &= ~bitmask; | 
 | 				node = node->parent; | 
 | 				indent -= 1; | 
 | 			} | 
 | 		} | 
 | 	} | 
 | 	printf("nodes %d leaves %d singletons %d\n", | 
 | 	       nodes, leaves, singletons); | 
 | } | 
 |  | 
 | /* | 
 |  * Allocate an initialize a new internal node. | 
 |  */ | 
 | static struct node *alloc_node(struct node *parent) | 
 | { | 
 | 	struct node *node; | 
 | 	int bitnum; | 
 |  | 
 | 	node = malloc(sizeof(*node)); | 
 | 	node->left = node->right = NULL; | 
 | 	node->parent = parent; | 
 | 	node->leftnode = NODE; | 
 | 	node->rightnode = NODE; | 
 | 	node->keybits = 0; | 
 | 	node->keymask = 0; | 
 | 	node->mark = 0; | 
 | 	node->index = 0; | 
 | 	node->offset = -1; | 
 | 	node->size = 4; | 
 |  | 
 | 	if (node->parent) { | 
 | 		bitnum = parent->bitnum; | 
 | 		if ((bitnum & 7) == 0) { | 
 | 			node->bitnum = bitnum + 7 + 8; | 
 | 			node->nextbyte = 1; | 
 | 		} else { | 
 | 			node->bitnum = bitnum - 1; | 
 | 			node->nextbyte = 0; | 
 | 		} | 
 | 	} else { | 
 | 		node->bitnum = 7; | 
 | 		node->nextbyte = 0; | 
 | 	} | 
 |  | 
 | 	return node; | 
 | } | 
 |  | 
 | /* | 
 |  * Insert a new leaf into the tree, and collapse any subtrees that are | 
 |  * fully populated and end in identical leaves. A nextbyte tagged | 
 |  * internal node will not be removed to preserve the tree's integrity. | 
 |  * Note that due to the structure of utf8, no nextbyte tagged node | 
 |  * will be a candidate for removal. | 
 |  */ | 
 | static int insert(struct tree *tree, char *key, int keylen, void *leaf) | 
 | { | 
 | 	struct node *node; | 
 | 	struct node *parent; | 
 | 	void **cursor; | 
 | 	int keybits; | 
 |  | 
 | 	assert(keylen >= 1 && keylen <= 4); | 
 |  | 
 | 	node = NULL; | 
 | 	cursor = &tree->root; | 
 | 	keybits = 8 * keylen; | 
 |  | 
 | 	/* Insert, creating path along the way. */ | 
 | 	while (keybits) { | 
 | 		if (!*cursor) | 
 | 			*cursor = alloc_node(node); | 
 | 		node = *cursor; | 
 | 		if (node->nextbyte) | 
 | 			key++; | 
 | 		if (*key & (1 << (node->bitnum & 7))) | 
 | 			cursor = &node->right; | 
 | 		else | 
 | 			cursor = &node->left; | 
 | 		keybits--; | 
 | 	} | 
 | 	*cursor = leaf; | 
 |  | 
 | 	/* Merge subtrees if possible. */ | 
 | 	while (node) { | 
 | 		if (*key & (1 << (node->bitnum & 7))) | 
 | 			node->rightnode = LEAF; | 
 | 		else | 
 | 			node->leftnode = LEAF; | 
 | 		if (node->nextbyte) | 
 | 			break; | 
 | 		if (node->leftnode == NODE || node->rightnode == NODE) | 
 | 			break; | 
 | 		assert(node->left); | 
 | 		assert(node->right); | 
 | 		/* Compare */ | 
 | 		if (! tree->leaf_equal(node->left, node->right)) | 
 | 			break; | 
 | 		/* Keep left, drop right leaf. */ | 
 | 		leaf = node->left; | 
 | 		/* Check in parent */ | 
 | 		parent = node->parent; | 
 | 		if (!parent) { | 
 | 			/* root of tree! */ | 
 | 			tree->root = leaf; | 
 | 			tree->childnode = LEAF; | 
 | 		} else if (parent->left == node) { | 
 | 			parent->left = leaf; | 
 | 			parent->leftnode = LEAF; | 
 | 			if (parent->right) { | 
 | 				parent->keymask = 0; | 
 | 				parent->keybits = 0; | 
 | 			} else { | 
 | 				parent->keymask |= (1 << node->bitnum); | 
 | 			} | 
 | 		} else if (parent->right == node) { | 
 | 			parent->right = leaf; | 
 | 			parent->rightnode = LEAF; | 
 | 			if (parent->left) { | 
 | 				parent->keymask = 0; | 
 | 				parent->keybits = 0; | 
 | 			} else { | 
 | 				parent->keymask |= (1 << node->bitnum); | 
 | 				parent->keybits |= (1 << node->bitnum); | 
 | 			} | 
 | 		} else { | 
 | 			/* internal tree error */ | 
 | 			assert(0); | 
 | 		} | 
 | 		free(node); | 
 | 		node = parent; | 
 | 	} | 
 |  | 
 | 	/* Propagate keymasks up along singleton chains. */ | 
 | 	while (node) { | 
 | 		parent = node->parent; | 
 | 		if (!parent) | 
 | 			break; | 
 | 		/* Nix the mask for parents with two children. */ | 
 | 		if (node->keymask == 0) { | 
 | 			parent->keymask = 0; | 
 | 			parent->keybits = 0; | 
 | 		} else if (parent->left && parent->right) { | 
 | 			parent->keymask = 0; | 
 | 			parent->keybits = 0; | 
 | 		} else { | 
 | 			assert((parent->keymask & node->keymask) == 0); | 
 | 			parent->keymask |= node->keymask; | 
 | 			parent->keymask |= (1 << parent->bitnum); | 
 | 			parent->keybits |= node->keybits; | 
 | 			if (parent->right) | 
 | 				parent->keybits |= (1 << parent->bitnum); | 
 | 		} | 
 | 		node = parent; | 
 | 	} | 
 |  | 
 | 	return 0; | 
 | } | 
 |  | 
 | /* | 
 |  * Prune internal nodes. | 
 |  * | 
 |  * Fully populated subtrees that end at the same leaf have already | 
 |  * been collapsed.  There are still internal nodes that have for both | 
 |  * their left and right branches a sequence of singletons that make | 
 |  * identical choices and end in identical leaves.  The keymask and | 
 |  * keybits collected in the nodes describe the choices made in these | 
 |  * singleton chains.  When they are identical for the left and right | 
 |  * branch of a node, and the two leaves comare identical, the node in | 
 |  * question can be removed. | 
 |  * | 
 |  * Note that nodes with the nextbyte tag set will not be removed by | 
 |  * this to ensure tree integrity.  Note as well that the structure of | 
 |  * utf8 ensures that these nodes would not have been candidates for | 
 |  * removal in any case. | 
 |  */ | 
 | static void prune(struct tree *tree) | 
 | { | 
 | 	struct node *node; | 
 | 	struct node *left; | 
 | 	struct node *right; | 
 | 	struct node *parent; | 
 | 	void *leftleaf; | 
 | 	void *rightleaf; | 
 | 	unsigned int leftmask; | 
 | 	unsigned int rightmask; | 
 | 	unsigned int bitmask; | 
 | 	int count; | 
 |  | 
 | 	if (verbose > 0) | 
 | 		printf("Pruning %s_%x\n", tree->type, tree->maxage); | 
 |  | 
 | 	count = 0; | 
 | 	if (tree->childnode == LEAF) | 
 | 		return; | 
 | 	if (!tree->root) | 
 | 		return; | 
 |  | 
 | 	leftmask = rightmask = 0; | 
 | 	node = tree->root; | 
 | 	while (node) { | 
 | 		if (node->nextbyte) | 
 | 			goto advance; | 
 | 		if (node->leftnode == LEAF) | 
 | 			goto advance; | 
 | 		if (node->rightnode == LEAF) | 
 | 			goto advance; | 
 | 		if (!node->left) | 
 | 			goto advance; | 
 | 		if (!node->right) | 
 | 			goto advance; | 
 | 		left = node->left; | 
 | 		right = node->right; | 
 | 		if (left->keymask == 0) | 
 | 			goto advance; | 
 | 		if (right->keymask == 0) | 
 | 			goto advance; | 
 | 		if (left->keymask != right->keymask) | 
 | 			goto advance; | 
 | 		if (left->keybits != right->keybits) | 
 | 			goto advance; | 
 | 		leftleaf = NULL; | 
 | 		while (!leftleaf) { | 
 | 			assert(left->left || left->right); | 
 | 			if (left->leftnode == LEAF) | 
 | 				leftleaf = left->left; | 
 | 			else if (left->rightnode == LEAF) | 
 | 				leftleaf = left->right; | 
 | 			else if (left->left) | 
 | 				left = left->left; | 
 | 			else if (left->right) | 
 | 				left = left->right; | 
 | 			else | 
 | 				assert(0); | 
 | 		} | 
 | 		rightleaf = NULL; | 
 | 		while (!rightleaf) { | 
 | 			assert(right->left || right->right); | 
 | 			if (right->leftnode == LEAF) | 
 | 				rightleaf = right->left; | 
 | 			else if (right->rightnode == LEAF) | 
 | 				rightleaf = right->right; | 
 | 			else if (right->left) | 
 | 				right = right->left; | 
 | 			else if (right->right) | 
 | 				right = right->right; | 
 | 			else | 
 | 				assert(0); | 
 | 		} | 
 | 		if (! tree->leaf_equal(leftleaf, rightleaf)) | 
 | 			goto advance; | 
 | 		/* | 
 | 		 * This node has identical singleton-only subtrees. | 
 | 		 * Remove it. | 
 | 		 */ | 
 | 		parent = node->parent; | 
 | 		left = node->left; | 
 | 		right = node->right; | 
 | 		if (parent->left == node) | 
 | 			parent->left = left; | 
 | 		else if (parent->right == node) | 
 | 			parent->right = left; | 
 | 		else | 
 | 			assert(0); | 
 | 		left->parent = parent; | 
 | 		left->keymask |= (1 << node->bitnum); | 
 | 		node->left = NULL; | 
 | 		while (node) { | 
 | 			bitmask = 1 << node->bitnum; | 
 | 			leftmask &= ~bitmask; | 
 | 			rightmask &= ~bitmask; | 
 | 			if (node->leftnode == NODE && node->left) { | 
 | 				left = node->left; | 
 | 				free(node); | 
 | 				count++; | 
 | 				node = left; | 
 | 			} else if (node->rightnode == NODE && node->right) { | 
 | 				right = node->right; | 
 | 				free(node); | 
 | 				count++; | 
 | 				node = right; | 
 | 			} else { | 
 | 				node = NULL; | 
 | 			} | 
 | 		} | 
 | 		/* Propagate keymasks up along singleton chains. */ | 
 | 		node = parent; | 
 | 		/* Force re-check */ | 
 | 		bitmask = 1 << node->bitnum; | 
 | 		leftmask &= ~bitmask; | 
 | 		rightmask &= ~bitmask; | 
 | 		for (;;) { | 
 | 			if (node->left && node->right) | 
 | 				break; | 
 | 			if (node->left) { | 
 | 				left = node->left; | 
 | 				node->keymask |= left->keymask; | 
 | 				node->keybits |= left->keybits; | 
 | 			} | 
 | 			if (node->right) { | 
 | 				right = node->right; | 
 | 				node->keymask |= right->keymask; | 
 | 				node->keybits |= right->keybits; | 
 | 			} | 
 | 			node->keymask |= (1 << node->bitnum); | 
 | 			node = node->parent; | 
 | 			/* Force re-check */ | 
 | 			bitmask = 1 << node->bitnum; | 
 | 			leftmask &= ~bitmask; | 
 | 			rightmask &= ~bitmask; | 
 | 		} | 
 | 	advance: | 
 | 		bitmask = 1 << node->bitnum; | 
 | 		if ((leftmask & bitmask) == 0 && | 
 | 		    node->leftnode == NODE && | 
 | 		    node->left) { | 
 | 			leftmask |= bitmask; | 
 | 			node = node->left; | 
 | 		} else if ((rightmask & bitmask) == 0 && | 
 | 			   node->rightnode == NODE && | 
 | 			   node->right) { | 
 | 			rightmask |= bitmask; | 
 | 			node = node->right; | 
 | 		} else { | 
 | 			leftmask &= ~bitmask; | 
 | 			rightmask &= ~bitmask; | 
 | 			node = node->parent; | 
 | 		} | 
 | 	} | 
 | 	if (verbose > 0) | 
 | 		printf("Pruned %d nodes\n", count); | 
 | } | 
 |  | 
 | /* | 
 |  * Mark the nodes in the tree that lead to leaves that must be | 
 |  * emitted. | 
 |  */ | 
 | static void mark_nodes(struct tree *tree) | 
 | { | 
 | 	struct node *node; | 
 | 	struct node *n; | 
 | 	unsigned int leftmask; | 
 | 	unsigned int rightmask; | 
 | 	unsigned int bitmask; | 
 | 	int marked; | 
 |  | 
 | 	marked = 0; | 
 | 	if (verbose > 0) | 
 | 		printf("Marking %s_%x\n", tree->type, tree->maxage); | 
 | 	if (tree->childnode == LEAF) | 
 | 		goto done; | 
 |  | 
 | 	assert(tree->childnode == NODE); | 
 | 	node = tree->root; | 
 | 	leftmask = rightmask = 0; | 
 | 	while (node) { | 
 | 		bitmask = 1 << node->bitnum; | 
 | 		if ((leftmask & bitmask) == 0) { | 
 | 			leftmask |= bitmask; | 
 | 			if (node->leftnode == LEAF) { | 
 | 				assert(node->left); | 
 | 				if (tree->leaf_mark(node->left)) { | 
 | 					n = node; | 
 | 					while (n && !n->mark) { | 
 | 						marked++; | 
 | 						n->mark = 1; | 
 | 						n = n->parent; | 
 | 					} | 
 | 				} | 
 | 			} else if (node->left) { | 
 | 				assert(node->leftnode == NODE); | 
 | 				node = node->left; | 
 | 				continue; | 
 | 			} | 
 | 		} | 
 | 		if ((rightmask & bitmask) == 0) { | 
 | 			rightmask |= bitmask; | 
 | 			if (node->rightnode == LEAF) { | 
 | 				assert(node->right); | 
 | 				if (tree->leaf_mark(node->right)) { | 
 | 					n = node; | 
 | 					while (n && !n->mark) { | 
 | 						marked++; | 
 | 						n->mark = 1; | 
 | 						n = n->parent; | 
 | 					} | 
 | 				} | 
 | 			} else if (node->right) { | 
 | 				assert(node->rightnode == NODE); | 
 | 				node = node->right; | 
 | 				continue; | 
 | 			} | 
 | 		} | 
 | 		leftmask &= ~bitmask; | 
 | 		rightmask &= ~bitmask; | 
 | 		node = node->parent; | 
 | 	} | 
 |  | 
 | 	/* second pass: left siblings and singletons */ | 
 |  | 
 | 	assert(tree->childnode == NODE); | 
 | 	node = tree->root; | 
 | 	leftmask = rightmask = 0; | 
 | 	while (node) { | 
 | 		bitmask = 1 << node->bitnum; | 
 | 		if ((leftmask & bitmask) == 0) { | 
 | 			leftmask |= bitmask; | 
 | 			if (node->leftnode == LEAF) { | 
 | 				assert(node->left); | 
 | 				if (tree->leaf_mark(node->left)) { | 
 | 					n = node; | 
 | 					while (n && !n->mark) { | 
 | 						marked++; | 
 | 						n->mark = 1; | 
 | 						n = n->parent; | 
 | 					} | 
 | 				} | 
 | 			} else if (node->left) { | 
 | 				assert(node->leftnode == NODE); | 
 | 				node = node->left; | 
 | 				if (!node->mark && node->parent->mark) { | 
 | 					marked++; | 
 | 					node->mark = 1; | 
 | 				} | 
 | 				continue; | 
 | 			} | 
 | 		} | 
 | 		if ((rightmask & bitmask) == 0) { | 
 | 			rightmask |= bitmask; | 
 | 			if (node->rightnode == LEAF) { | 
 | 				assert(node->right); | 
 | 				if (tree->leaf_mark(node->right)) { | 
 | 					n = node; | 
 | 					while (n && !n->mark) { | 
 | 						marked++; | 
 | 						n->mark = 1; | 
 | 						n = n->parent; | 
 | 					} | 
 | 				} | 
 | 			} else if (node->right) { | 
 | 				assert(node->rightnode == NODE); | 
 | 				node = node->right; | 
 | 				if (!node->mark && node->parent->mark && | 
 | 				    !node->parent->left) { | 
 | 					marked++; | 
 | 					node->mark = 1; | 
 | 				} | 
 | 				continue; | 
 | 			} | 
 | 		} | 
 | 		leftmask &= ~bitmask; | 
 | 		rightmask &= ~bitmask; | 
 | 		node = node->parent; | 
 | 	} | 
 | done: | 
 | 	if (verbose > 0) | 
 | 		printf("Marked %d nodes\n", marked); | 
 | } | 
 |  | 
 | /* | 
 |  * Compute the index of each node and leaf, which is the offset in the | 
 |  * emitted trie.  These values must be pre-computed because relative | 
 |  * offsets between nodes are used to navigate the tree. | 
 |  */ | 
 | static int index_nodes(struct tree *tree, int index) | 
 | { | 
 | 	struct node *node; | 
 | 	unsigned int leftmask; | 
 | 	unsigned int rightmask; | 
 | 	unsigned int bitmask; | 
 | 	int count; | 
 | 	int indent; | 
 |  | 
 | 	/* Align to a cache line (or half a cache line?). */ | 
 | 	while (index % 64) | 
 | 		index++; | 
 | 	tree->index = index; | 
 | 	indent = 1; | 
 | 	count = 0; | 
 |  | 
 | 	if (verbose > 0) | 
 | 		printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index); | 
 | 	if (tree->childnode == LEAF) { | 
 | 		index += tree->leaf_size(tree->root); | 
 | 		goto done; | 
 | 	} | 
 |  | 
 | 	assert(tree->childnode == NODE); | 
 | 	node = tree->root; | 
 | 	leftmask = rightmask = 0; | 
 | 	while (node) { | 
 | 		if (!node->mark) | 
 | 			goto skip; | 
 | 		count++; | 
 | 		if (node->index != index) | 
 | 			node->index = index; | 
 | 		index += node->size; | 
 | skip: | 
 | 		while (node) { | 
 | 			bitmask = 1 << node->bitnum; | 
 | 			if (node->mark && (leftmask & bitmask) == 0) { | 
 | 				leftmask |= bitmask; | 
 | 				if (node->leftnode == LEAF) { | 
 | 					assert(node->left); | 
 | 					*tree->leaf_index(tree, node->left) = | 
 | 									index; | 
 | 					index += tree->leaf_size(node->left); | 
 | 					count++; | 
 | 				} else if (node->left) { | 
 | 					assert(node->leftnode == NODE); | 
 | 					indent += 1; | 
 | 					node = node->left; | 
 | 					break; | 
 | 				} | 
 | 			} | 
 | 			if (node->mark && (rightmask & bitmask) == 0) { | 
 | 				rightmask |= bitmask; | 
 | 				if (node->rightnode == LEAF) { | 
 | 					assert(node->right); | 
 | 					*tree->leaf_index(tree, node->right) = index; | 
 | 					index += tree->leaf_size(node->right); | 
 | 					count++; | 
 | 				} else if (node->right) { | 
 | 					assert(node->rightnode == NODE); | 
 | 					indent += 1; | 
 | 					node = node->right; | 
 | 					break; | 
 | 				} | 
 | 			} | 
 | 			leftmask &= ~bitmask; | 
 | 			rightmask &= ~bitmask; | 
 | 			node = node->parent; | 
 | 			indent -= 1; | 
 | 		} | 
 | 	} | 
 | done: | 
 | 	/* Round up to a multiple of 16 */ | 
 | 	while (index % 16) | 
 | 		index++; | 
 | 	if (verbose > 0) | 
 | 		printf("Final index %d\n", index); | 
 | 	return index; | 
 | } | 
 |  | 
 | /* | 
 |  * Mark the nodes in a subtree, helper for size_nodes(). | 
 |  */ | 
 | static int mark_subtree(struct node *node) | 
 | { | 
 | 	int changed; | 
 |  | 
 | 	if (!node || node->mark) | 
 | 		return 0; | 
 | 	node->mark = 1; | 
 | 	node->index = node->parent->index; | 
 | 	changed = 1; | 
 | 	if (node->leftnode == NODE) | 
 | 		changed += mark_subtree(node->left); | 
 | 	if (node->rightnode == NODE) | 
 | 		changed += mark_subtree(node->right); | 
 | 	return changed; | 
 | } | 
 |  | 
 | /* | 
 |  * Compute the size of nodes and leaves. We start by assuming that | 
 |  * each node needs to store a three-byte offset. The indexes of the | 
 |  * nodes are calculated based on that, and then this function is | 
 |  * called to see if the sizes of some nodes can be reduced.  This is | 
 |  * repeated until no more changes are seen. | 
 |  */ | 
 | static int size_nodes(struct tree *tree) | 
 | { | 
 | 	struct tree *next; | 
 | 	struct node *node; | 
 | 	struct node *right; | 
 | 	struct node *n; | 
 | 	unsigned int leftmask; | 
 | 	unsigned int rightmask; | 
 | 	unsigned int bitmask; | 
 | 	unsigned int pathbits; | 
 | 	unsigned int pathmask; | 
 | 	unsigned int nbit; | 
 | 	int changed; | 
 | 	int offset; | 
 | 	int size; | 
 | 	int indent; | 
 |  | 
 | 	indent = 1; | 
 | 	changed = 0; | 
 | 	size = 0; | 
 |  | 
 | 	if (verbose > 0) | 
 | 		printf("Sizing %s_%x\n", tree->type, tree->maxage); | 
 | 	if (tree->childnode == LEAF) | 
 | 		goto done; | 
 |  | 
 | 	assert(tree->childnode == NODE); | 
 | 	pathbits = 0; | 
 | 	pathmask = 0; | 
 | 	node = tree->root; | 
 | 	leftmask = rightmask = 0; | 
 | 	while (node) { | 
 | 		if (!node->mark) | 
 | 			goto skip; | 
 | 		offset = 0; | 
 | 		if (!node->left || !node->right) { | 
 | 			size = 1; | 
 | 		} else { | 
 | 			if (node->rightnode == NODE) { | 
 | 				/* | 
 | 				 * If the right node is not marked, | 
 | 				 * look for a corresponding node in | 
 | 				 * the next tree.  Such a node need | 
 | 				 * not exist. | 
 | 				 */ | 
 | 				right = node->right; | 
 | 				next = tree->next; | 
 | 				while (!right->mark) { | 
 | 					assert(next); | 
 | 					n = next->root; | 
 | 					while (n->bitnum != node->bitnum) { | 
 | 						nbit = 1 << n->bitnum; | 
 | 						if (!(pathmask & nbit)) | 
 | 							break; | 
 | 						if (pathbits & nbit) { | 
 | 							if (n->rightnode == LEAF) | 
 | 								break; | 
 | 							n = n->right; | 
 | 						} else { | 
 | 							if (n->leftnode == LEAF) | 
 | 								break; | 
 | 							n = n->left; | 
 | 						} | 
 | 					} | 
 | 					if (n->bitnum != node->bitnum) | 
 | 						break; | 
 | 					n = n->right; | 
 | 					right = n; | 
 | 					next = next->next; | 
 | 				} | 
 | 				/* Make sure the right node is marked. */ | 
 | 				if (!right->mark) | 
 | 					changed += mark_subtree(right); | 
 | 				offset = right->index - node->index; | 
 | 			} else { | 
 | 				offset = *tree->leaf_index(tree, node->right); | 
 | 				offset -= node->index; | 
 | 			} | 
 | 			assert(offset >= 0); | 
 | 			assert(offset <= 0xffffff); | 
 | 			if (offset <= 0xff) { | 
 | 				size = 2; | 
 | 			} else if (offset <= 0xffff) { | 
 | 				size = 3; | 
 | 			} else { /* offset <= 0xffffff */ | 
 | 				size = 4; | 
 | 			} | 
 | 		} | 
 | 		if (node->size != size || node->offset != offset) { | 
 | 			node->size = size; | 
 | 			node->offset = offset; | 
 | 			changed++; | 
 | 		} | 
 | skip: | 
 | 		while (node) { | 
 | 			bitmask = 1 << node->bitnum; | 
 | 			pathmask |= bitmask; | 
 | 			if (node->mark && (leftmask & bitmask) == 0) { | 
 | 				leftmask |= bitmask; | 
 | 				if (node->leftnode == LEAF) { | 
 | 					assert(node->left); | 
 | 				} else if (node->left) { | 
 | 					assert(node->leftnode == NODE); | 
 | 					indent += 1; | 
 | 					node = node->left; | 
 | 					break; | 
 | 				} | 
 | 			} | 
 | 			if (node->mark && (rightmask & bitmask) == 0) { | 
 | 				rightmask |= bitmask; | 
 | 				pathbits |= bitmask; | 
 | 				if (node->rightnode == LEAF) { | 
 | 					assert(node->right); | 
 | 				} else if (node->right) { | 
 | 					assert(node->rightnode == NODE); | 
 | 					indent += 1; | 
 | 					node = node->right; | 
 | 					break; | 
 | 				} | 
 | 			} | 
 | 			leftmask &= ~bitmask; | 
 | 			rightmask &= ~bitmask; | 
 | 			pathmask &= ~bitmask; | 
 | 			pathbits &= ~bitmask; | 
 | 			node = node->parent; | 
 | 			indent -= 1; | 
 | 		} | 
 | 	} | 
 | done: | 
 | 	if (verbose > 0) | 
 | 		printf("Found %d changes\n", changed); | 
 | 	return changed; | 
 | } | 
 |  | 
 | /* | 
 |  * Emit a trie for the given tree into the data array. | 
 |  */ | 
 | static void emit(struct tree *tree, unsigned char *data) | 
 | { | 
 | 	struct node *node; | 
 | 	unsigned int leftmask; | 
 | 	unsigned int rightmask; | 
 | 	unsigned int bitmask; | 
 | 	int offlen; | 
 | 	int offset; | 
 | 	int index; | 
 | 	int indent; | 
 | 	int size; | 
 | 	int bytes; | 
 | 	int leaves; | 
 | 	int nodes[4]; | 
 | 	unsigned char byte; | 
 |  | 
 | 	nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0; | 
 | 	leaves = 0; | 
 | 	bytes = 0; | 
 | 	index = tree->index; | 
 | 	data += index; | 
 | 	indent = 1; | 
 | 	if (verbose > 0) | 
 | 		printf("Emitting %s_%x\n", tree->type, tree->maxage); | 
 | 	if (tree->childnode == LEAF) { | 
 | 		assert(tree->root); | 
 | 		tree->leaf_emit(tree->root, data); | 
 | 		size = tree->leaf_size(tree->root); | 
 | 		index += size; | 
 | 		leaves++; | 
 | 		goto done; | 
 | 	} | 
 |  | 
 | 	assert(tree->childnode == NODE); | 
 | 	node = tree->root; | 
 | 	leftmask = rightmask = 0; | 
 | 	while (node) { | 
 | 		if (!node->mark) | 
 | 			goto skip; | 
 | 		assert(node->offset != -1); | 
 | 		assert(node->index == index); | 
 |  | 
 | 		byte = 0; | 
 | 		if (node->nextbyte) | 
 | 			byte |= NEXTBYTE; | 
 | 		byte |= (node->bitnum & BITNUM); | 
 | 		if (node->left && node->right) { | 
 | 			if (node->leftnode == NODE) | 
 | 				byte |= LEFTNODE; | 
 | 			if (node->rightnode == NODE) | 
 | 				byte |= RIGHTNODE; | 
 | 			if (node->offset <= 0xff) | 
 | 				offlen = 1; | 
 | 			else if (node->offset <= 0xffff) | 
 | 				offlen = 2; | 
 | 			else | 
 | 				offlen = 3; | 
 | 			nodes[offlen]++; | 
 | 			offset = node->offset; | 
 | 			byte |= offlen << OFFLEN_SHIFT; | 
 | 			*data++ = byte; | 
 | 			index++; | 
 | 			while (offlen--) { | 
 | 				*data++ = offset & 0xff; | 
 | 				index++; | 
 | 				offset >>= 8; | 
 | 			} | 
 | 		} else if (node->left) { | 
 | 			if (node->leftnode == NODE) | 
 | 				byte |= TRIENODE; | 
 | 			nodes[0]++; | 
 | 			*data++ = byte; | 
 | 			index++; | 
 | 		} else if (node->right) { | 
 | 			byte |= RIGHTNODE; | 
 | 			if (node->rightnode == NODE) | 
 | 				byte |= TRIENODE; | 
 | 			nodes[0]++; | 
 | 			*data++ = byte; | 
 | 			index++; | 
 | 		} else { | 
 | 			assert(0); | 
 | 		} | 
 | skip: | 
 | 		while (node) { | 
 | 			bitmask = 1 << node->bitnum; | 
 | 			if (node->mark && (leftmask & bitmask) == 0) { | 
 | 				leftmask |= bitmask; | 
 | 				if (node->leftnode == LEAF) { | 
 | 					assert(node->left); | 
 | 					data = tree->leaf_emit(node->left, | 
 | 							       data); | 
 | 					size = tree->leaf_size(node->left); | 
 | 					index += size; | 
 | 					bytes += size; | 
 | 					leaves++; | 
 | 				} else if (node->left) { | 
 | 					assert(node->leftnode == NODE); | 
 | 					indent += 1; | 
 | 					node = node->left; | 
 | 					break; | 
 | 				} | 
 | 			} | 
 | 			if (node->mark && (rightmask & bitmask) == 0) { | 
 | 				rightmask |= bitmask; | 
 | 				if (node->rightnode == LEAF) { | 
 | 					assert(node->right); | 
 | 					data = tree->leaf_emit(node->right, | 
 | 							       data); | 
 | 					size = tree->leaf_size(node->right); | 
 | 					index += size; | 
 | 					bytes += size; | 
 | 					leaves++; | 
 | 				} else if (node->right) { | 
 | 					assert(node->rightnode == NODE); | 
 | 					indent += 1; | 
 | 					node = node->right; | 
 | 					break; | 
 | 				} | 
 | 			} | 
 | 			leftmask &= ~bitmask; | 
 | 			rightmask &= ~bitmask; | 
 | 			node = node->parent; | 
 | 			indent -= 1; | 
 | 		} | 
 | 	} | 
 | done: | 
 | 	if (verbose > 0) { | 
 | 		printf("Emitted %d (%d) leaves", | 
 | 			leaves, bytes); | 
 | 		printf(" %d (%d+%d+%d+%d) nodes", | 
 | 			nodes[0] + nodes[1] + nodes[2] + nodes[3], | 
 | 			nodes[0], nodes[1], nodes[2], nodes[3]); | 
 | 		printf(" %d total\n", index - tree->index); | 
 | 	} | 
 | } | 
 |  | 
 | /* ------------------------------------------------------------------ */ | 
 |  | 
 | /* | 
 |  * Unicode data. | 
 |  * | 
 |  * We need to keep track of the Canonical Combining Class, the Age, | 
 |  * and decompositions for a code point. | 
 |  * | 
 |  * For the Age, we store the index into the ages table.  Effectively | 
 |  * this is a generation number that the table maps to a unicode | 
 |  * version. | 
 |  * | 
 |  * The correction field is used to indicate that this entry is in the | 
 |  * corrections array, which contains decompositions that were | 
 |  * corrected in later revisions.  The value of the correction field is | 
 |  * the Unicode version in which the mapping was corrected. | 
 |  */ | 
 | struct unicode_data { | 
 | 	unsigned int code; | 
 | 	int ccc; | 
 | 	int gen; | 
 | 	int correction; | 
 | 	unsigned int *utf32nfdi; | 
 | 	unsigned int *utf32nfdicf; | 
 | 	char *utf8nfdi; | 
 | 	char *utf8nfdicf; | 
 | }; | 
 |  | 
 | struct unicode_data unicode_data[0x110000]; | 
 | struct unicode_data *corrections; | 
 | int    corrections_count; | 
 |  | 
 | struct tree *nfdi_tree; | 
 | struct tree *nfdicf_tree; | 
 |  | 
 | struct tree *trees; | 
 | int          trees_count; | 
 |  | 
 | /* | 
 |  * Check the corrections array to see if this entry was corrected at | 
 |  * some point. | 
 |  */ | 
 | static struct unicode_data *corrections_lookup(struct unicode_data *u) | 
 | { | 
 | 	int i; | 
 |  | 
 | 	for (i = 0; i != corrections_count; i++) | 
 | 		if (u->code == corrections[i].code) | 
 | 			return &corrections[i]; | 
 | 	return u; | 
 | } | 
 |  | 
 | static int nfdi_equal(void *l, void *r) | 
 | { | 
 | 	struct unicode_data *left = l; | 
 | 	struct unicode_data *right = r; | 
 |  | 
 | 	if (left->gen != right->gen) | 
 | 		return 0; | 
 | 	if (left->ccc != right->ccc) | 
 | 		return 0; | 
 | 	if (left->utf8nfdi && right->utf8nfdi && | 
 | 	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0) | 
 | 		return 1; | 
 | 	if (left->utf8nfdi || right->utf8nfdi) | 
 | 		return 0; | 
 | 	return 1; | 
 | } | 
 |  | 
 | static int nfdicf_equal(void *l, void *r) | 
 | { | 
 | 	struct unicode_data *left = l; | 
 | 	struct unicode_data *right = r; | 
 |  | 
 | 	if (left->gen != right->gen) | 
 | 		return 0; | 
 | 	if (left->ccc != right->ccc) | 
 | 		return 0; | 
 | 	if (left->utf8nfdicf && right->utf8nfdicf && | 
 | 	    strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0) | 
 | 		return 1; | 
 | 	if (left->utf8nfdicf && right->utf8nfdicf) | 
 | 		return 0; | 
 | 	if (left->utf8nfdicf || right->utf8nfdicf) | 
 | 		return 0; | 
 | 	if (left->utf8nfdi && right->utf8nfdi && | 
 | 	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0) | 
 | 		return 1; | 
 | 	if (left->utf8nfdi || right->utf8nfdi) | 
 | 		return 0; | 
 | 	return 1; | 
 | } | 
 |  | 
 | static void nfdi_print(void *l, int indent) | 
 | { | 
 | 	struct unicode_data *leaf = l; | 
 |  | 
 | 	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, | 
 | 		leaf->code, leaf->ccc, leaf->gen); | 
 |  | 
 | 	if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) | 
 | 		printf(" nfdi \"%s\"", "HANGUL SYLLABLE"); | 
 | 	else if (leaf->utf8nfdi) | 
 | 		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); | 
 |  | 
 | 	printf("\n"); | 
 | } | 
 |  | 
 | static void nfdicf_print(void *l, int indent) | 
 | { | 
 | 	struct unicode_data *leaf = l; | 
 |  | 
 | 	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, | 
 | 		leaf->code, leaf->ccc, leaf->gen); | 
 |  | 
 | 	if (leaf->utf8nfdicf) | 
 | 		printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf); | 
 | 	else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) | 
 | 		printf(" nfdi \"%s\"", "HANGUL SYLLABLE"); | 
 | 	else if (leaf->utf8nfdi) | 
 | 		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); | 
 | 	printf("\n"); | 
 | } | 
 |  | 
 | static int nfdi_mark(void *l) | 
 | { | 
 | 	return 1; | 
 | } | 
 |  | 
 | static int nfdicf_mark(void *l) | 
 | { | 
 | 	struct unicode_data *leaf = l; | 
 |  | 
 | 	if (leaf->utf8nfdicf) | 
 | 		return 1; | 
 | 	return 0; | 
 | } | 
 |  | 
 | static int correction_mark(void *l) | 
 | { | 
 | 	struct unicode_data *leaf = l; | 
 |  | 
 | 	return leaf->correction; | 
 | } | 
 |  | 
 | static int nfdi_size(void *l) | 
 | { | 
 | 	struct unicode_data *leaf = l; | 
 | 	int size = 2; | 
 |  | 
 | 	if (HANGUL_SYLLABLE(leaf->code)) | 
 | 		size += 1; | 
 | 	else if (leaf->utf8nfdi) | 
 | 		size += strlen(leaf->utf8nfdi) + 1; | 
 | 	return size; | 
 | } | 
 |  | 
 | static int nfdicf_size(void *l) | 
 | { | 
 | 	struct unicode_data *leaf = l; | 
 | 	int size = 2; | 
 |  | 
 | 	if (HANGUL_SYLLABLE(leaf->code)) | 
 | 		size += 1; | 
 | 	else if (leaf->utf8nfdicf) | 
 | 		size += strlen(leaf->utf8nfdicf) + 1; | 
 | 	else if (leaf->utf8nfdi) | 
 | 		size += strlen(leaf->utf8nfdi) + 1; | 
 | 	return size; | 
 | } | 
 |  | 
 | static int *nfdi_index(struct tree *tree, void *l) | 
 | { | 
 | 	struct unicode_data *leaf = l; | 
 |  | 
 | 	return &tree->leafindex[leaf->code]; | 
 | } | 
 |  | 
 | static int *nfdicf_index(struct tree *tree, void *l) | 
 | { | 
 | 	struct unicode_data *leaf = l; | 
 |  | 
 | 	return &tree->leafindex[leaf->code]; | 
 | } | 
 |  | 
 | static unsigned char *nfdi_emit(void *l, unsigned char *data) | 
 | { | 
 | 	struct unicode_data *leaf = l; | 
 | 	unsigned char *s; | 
 |  | 
 | 	*data++ = leaf->gen; | 
 |  | 
 | 	if (HANGUL_SYLLABLE(leaf->code)) { | 
 | 		*data++ = DECOMPOSE; | 
 | 		*data++ = HANGUL; | 
 | 	} else if (leaf->utf8nfdi) { | 
 | 		*data++ = DECOMPOSE; | 
 | 		s = (unsigned char*)leaf->utf8nfdi; | 
 | 		while ((*data++ = *s++) != 0) | 
 | 			; | 
 | 	} else { | 
 | 		*data++ = leaf->ccc; | 
 | 	} | 
 | 	return data; | 
 | } | 
 |  | 
 | static unsigned char *nfdicf_emit(void *l, unsigned char *data) | 
 | { | 
 | 	struct unicode_data *leaf = l; | 
 | 	unsigned char *s; | 
 |  | 
 | 	*data++ = leaf->gen; | 
 |  | 
 | 	if (HANGUL_SYLLABLE(leaf->code)) { | 
 | 		*data++ = DECOMPOSE; | 
 | 		*data++ = HANGUL; | 
 | 	} else if (leaf->utf8nfdicf) { | 
 | 		*data++ = DECOMPOSE; | 
 | 		s = (unsigned char*)leaf->utf8nfdicf; | 
 | 		while ((*data++ = *s++) != 0) | 
 | 			; | 
 | 	} else if (leaf->utf8nfdi) { | 
 | 		*data++ = DECOMPOSE; | 
 | 		s = (unsigned char*)leaf->utf8nfdi; | 
 | 		while ((*data++ = *s++) != 0) | 
 | 			; | 
 | 	} else { | 
 | 		*data++ = leaf->ccc; | 
 | 	} | 
 | 	return data; | 
 | } | 
 |  | 
 | static void utf8_create(struct unicode_data *data) | 
 | { | 
 | 	char utf[18*4+1]; | 
 | 	char *u; | 
 | 	unsigned int *um; | 
 | 	int i; | 
 |  | 
 | 	if (data->utf8nfdi) { | 
 | 		assert(data->utf8nfdi[0] == HANGUL); | 
 | 		return; | 
 | 	} | 
 |  | 
 | 	u = utf; | 
 | 	um = data->utf32nfdi; | 
 | 	if (um) { | 
 | 		for (i = 0; um[i]; i++) | 
 | 			u += utf8encode(u, um[i]); | 
 | 		*u = '\0'; | 
 | 		data->utf8nfdi = strdup(utf); | 
 | 	} | 
 | 	u = utf; | 
 | 	um = data->utf32nfdicf; | 
 | 	if (um) { | 
 | 		for (i = 0; um[i]; i++) | 
 | 			u += utf8encode(u, um[i]); | 
 | 		*u = '\0'; | 
 | 		if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf)) | 
 | 			data->utf8nfdicf = strdup(utf); | 
 | 	} | 
 | } | 
 |  | 
 | static void utf8_init(void) | 
 | { | 
 | 	unsigned int unichar; | 
 | 	int i; | 
 |  | 
 | 	for (unichar = 0; unichar != 0x110000; unichar++) | 
 | 		utf8_create(&unicode_data[unichar]); | 
 |  | 
 | 	for (i = 0; i != corrections_count; i++) | 
 | 		utf8_create(&corrections[i]); | 
 | } | 
 |  | 
 | static void trees_init(void) | 
 | { | 
 | 	struct unicode_data *data; | 
 | 	unsigned int maxage; | 
 | 	unsigned int nextage; | 
 | 	int count; | 
 | 	int i; | 
 | 	int j; | 
 |  | 
 | 	/* Count the number of different ages. */ | 
 | 	count = 0; | 
 | 	nextage = (unsigned int)-1; | 
 | 	do { | 
 | 		maxage = nextage; | 
 | 		nextage = 0; | 
 | 		for (i = 0; i <= corrections_count; i++) { | 
 | 			data = &corrections[i]; | 
 | 			if (nextage < data->correction && | 
 | 			    data->correction < maxage) | 
 | 				nextage = data->correction; | 
 | 		} | 
 | 		count++; | 
 | 	} while (nextage); | 
 |  | 
 | 	/* Two trees per age: nfdi and nfdicf */ | 
 | 	trees_count = count * 2; | 
 | 	trees = calloc(trees_count, sizeof(struct tree)); | 
 |  | 
 | 	/* Assign ages to the trees. */ | 
 | 	count = trees_count; | 
 | 	nextage = (unsigned int)-1; | 
 | 	do { | 
 | 		maxage = nextage; | 
 | 		trees[--count].maxage = maxage; | 
 | 		trees[--count].maxage = maxage; | 
 | 		nextage = 0; | 
 | 		for (i = 0; i <= corrections_count; i++) { | 
 | 			data = &corrections[i]; | 
 | 			if (nextage < data->correction && | 
 | 			    data->correction < maxage) | 
 | 				nextage = data->correction; | 
 | 		} | 
 | 	} while (nextage); | 
 |  | 
 | 	/* The ages assigned above are off by one. */ | 
 | 	for (i = 0; i != trees_count; i++) { | 
 | 		j = 0; | 
 | 		while (ages[j] < trees[i].maxage) | 
 | 			j++; | 
 | 		trees[i].maxage = ages[j-1]; | 
 | 	} | 
 |  | 
 | 	/* Set up the forwarding between trees. */ | 
 | 	trees[trees_count-2].next = &trees[trees_count-1]; | 
 | 	trees[trees_count-1].leaf_mark = nfdi_mark; | 
 | 	trees[trees_count-2].leaf_mark = nfdicf_mark; | 
 | 	for (i = 0; i != trees_count-2; i += 2) { | 
 | 		trees[i].next = &trees[trees_count-2]; | 
 | 		trees[i].leaf_mark = correction_mark; | 
 | 		trees[i+1].next = &trees[trees_count-1]; | 
 | 		trees[i+1].leaf_mark = correction_mark; | 
 | 	} | 
 |  | 
 | 	/* Assign the callouts. */ | 
 | 	for (i = 0; i != trees_count; i += 2) { | 
 | 		trees[i].type = "nfdicf"; | 
 | 		trees[i].leaf_equal = nfdicf_equal; | 
 | 		trees[i].leaf_print = nfdicf_print; | 
 | 		trees[i].leaf_size = nfdicf_size; | 
 | 		trees[i].leaf_index = nfdicf_index; | 
 | 		trees[i].leaf_emit = nfdicf_emit; | 
 |  | 
 | 		trees[i+1].type = "nfdi"; | 
 | 		trees[i+1].leaf_equal = nfdi_equal; | 
 | 		trees[i+1].leaf_print = nfdi_print; | 
 | 		trees[i+1].leaf_size = nfdi_size; | 
 | 		trees[i+1].leaf_index = nfdi_index; | 
 | 		trees[i+1].leaf_emit = nfdi_emit; | 
 | 	} | 
 |  | 
 | 	/* Finish init. */ | 
 | 	for (i = 0; i != trees_count; i++) | 
 | 		trees[i].childnode = NODE; | 
 | } | 
 |  | 
 | static void trees_populate(void) | 
 | { | 
 | 	struct unicode_data *data; | 
 | 	unsigned int unichar; | 
 | 	char keyval[4]; | 
 | 	int keylen; | 
 | 	int i; | 
 |  | 
 | 	for (i = 0; i != trees_count; i++) { | 
 | 		if (verbose > 0) { | 
 | 			printf("Populating %s_%x\n", | 
 | 				trees[i].type, trees[i].maxage); | 
 | 		} | 
 | 		for (unichar = 0; unichar != 0x110000; unichar++) { | 
 | 			if (unicode_data[unichar].gen < 0) | 
 | 				continue; | 
 | 			keylen = utf8encode(keyval, unichar); | 
 | 			data = corrections_lookup(&unicode_data[unichar]); | 
 | 			if (data->correction <= trees[i].maxage) | 
 | 				data = &unicode_data[unichar]; | 
 | 			insert(&trees[i], keyval, keylen, data); | 
 | 		} | 
 | 	} | 
 | } | 
 |  | 
 | static void trees_reduce(void) | 
 | { | 
 | 	int i; | 
 | 	int size; | 
 | 	int changed; | 
 |  | 
 | 	for (i = 0; i != trees_count; i++) | 
 | 		prune(&trees[i]); | 
 | 	for (i = 0; i != trees_count; i++) | 
 | 		mark_nodes(&trees[i]); | 
 | 	do { | 
 | 		size = 0; | 
 | 		for (i = 0; i != trees_count; i++) | 
 | 			size = index_nodes(&trees[i], size); | 
 | 		changed = 0; | 
 | 		for (i = 0; i != trees_count; i++) | 
 | 			changed += size_nodes(&trees[i]); | 
 | 	} while (changed); | 
 |  | 
 | 	utf8data = calloc(size, 1); | 
 | 	utf8data_size = size; | 
 | 	for (i = 0; i != trees_count; i++) | 
 | 		emit(&trees[i], utf8data); | 
 |  | 
 | 	if (verbose > 0) { | 
 | 		for (i = 0; i != trees_count; i++) { | 
 | 			printf("%s_%x idx %d\n", | 
 | 				trees[i].type, trees[i].maxage, trees[i].index); | 
 | 		} | 
 | 	} | 
 |  | 
 | 	nfdi = utf8data + trees[trees_count-1].index; | 
 | 	nfdicf = utf8data + trees[trees_count-2].index; | 
 |  | 
 | 	nfdi_tree = &trees[trees_count-1]; | 
 | 	nfdicf_tree = &trees[trees_count-2]; | 
 | } | 
 |  | 
 | static void verify(struct tree *tree) | 
 | { | 
 | 	struct unicode_data *data; | 
 | 	utf8leaf_t	*leaf; | 
 | 	unsigned int	unichar; | 
 | 	char		key[4]; | 
 | 	unsigned char	hangul[UTF8HANGULLEAF]; | 
 | 	int		report; | 
 | 	int		nocf; | 
 |  | 
 | 	if (verbose > 0) | 
 | 		printf("Verifying %s_%x\n", tree->type, tree->maxage); | 
 | 	nocf = strcmp(tree->type, "nfdicf"); | 
 |  | 
 | 	for (unichar = 0; unichar != 0x110000; unichar++) { | 
 | 		report = 0; | 
 | 		data = corrections_lookup(&unicode_data[unichar]); | 
 | 		if (data->correction <= tree->maxage) | 
 | 			data = &unicode_data[unichar]; | 
 | 		utf8encode(key,unichar); | 
 | 		leaf = utf8lookup(tree, hangul, key); | 
 |  | 
 | 		if (!leaf) { | 
 | 			if (data->gen != -1) | 
 | 				report++; | 
 | 			if (unichar < 0xd800 || unichar > 0xdfff) | 
 | 				report++; | 
 | 		} else { | 
 | 			if (unichar >= 0xd800 && unichar <= 0xdfff) | 
 | 				report++; | 
 | 			if (data->gen == -1) | 
 | 				report++; | 
 | 			if (data->gen != LEAF_GEN(leaf)) | 
 | 				report++; | 
 | 			if (LEAF_CCC(leaf) == DECOMPOSE) { | 
 | 				if (HANGUL_SYLLABLE(data->code)) { | 
 | 					if (data->utf8nfdi[0] != HANGUL) | 
 | 						report++; | 
 | 				} else if (nocf) { | 
 | 					if (!data->utf8nfdi) { | 
 | 						report++; | 
 | 					} else if (strcmp(data->utf8nfdi, | 
 | 							  LEAF_STR(leaf))) { | 
 | 						report++; | 
 | 					} | 
 | 				} else { | 
 | 					if (!data->utf8nfdicf && | 
 | 					    !data->utf8nfdi) { | 
 | 						report++; | 
 | 					} else if (data->utf8nfdicf) { | 
 | 						if (strcmp(data->utf8nfdicf, | 
 | 							   LEAF_STR(leaf))) | 
 | 							report++; | 
 | 					} else if (strcmp(data->utf8nfdi, | 
 | 							  LEAF_STR(leaf))) { | 
 | 						report++; | 
 | 					} | 
 | 				} | 
 | 			} else if (data->ccc != LEAF_CCC(leaf)) { | 
 | 				report++; | 
 | 			} | 
 | 		} | 
 | 		if (report) { | 
 | 			printf("%X code %X gen %d ccc %d" | 
 | 				" nfdi -> \"%s\"", | 
 | 				unichar, data->code, data->gen, | 
 | 				data->ccc, | 
 | 				data->utf8nfdi); | 
 | 			if (leaf) { | 
 | 				printf(" gen %d ccc %d" | 
 | 					" nfdi -> \"%s\"", | 
 | 					LEAF_GEN(leaf), | 
 | 					LEAF_CCC(leaf), | 
 | 					LEAF_CCC(leaf) == DECOMPOSE ? | 
 | 						LEAF_STR(leaf) : ""); | 
 | 			} | 
 | 			printf("\n"); | 
 | 		} | 
 | 	} | 
 | } | 
 |  | 
 | static void trees_verify(void) | 
 | { | 
 | 	int i; | 
 |  | 
 | 	for (i = 0; i != trees_count; i++) | 
 | 		verify(&trees[i]); | 
 | } | 
 |  | 
 | /* ------------------------------------------------------------------ */ | 
 |  | 
 | static void help(void) | 
 | { | 
 | 	printf("Usage: %s [options]\n", argv0); | 
 | 	printf("\n"); | 
 | 	printf("This program creates an a data trie used for parsing and\n"); | 
 | 	printf("normalization of UTF-8 strings. The trie is derived from\n"); | 
 | 	printf("a set of input files from the Unicode character database\n"); | 
 | 	printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n"); | 
 | 	printf("\n"); | 
 | 	printf("The generated tree supports two normalization forms:\n"); | 
 | 	printf("\n"); | 
 | 	printf("\tnfdi:\n"); | 
 | 	printf("\t- Apply unicode normalization form NFD.\n"); | 
 | 	printf("\t- Remove any Default_Ignorable_Code_Point.\n"); | 
 | 	printf("\n"); | 
 | 	printf("\tnfdicf:\n"); | 
 | 	printf("\t- Apply unicode normalization form NFD.\n"); | 
 | 	printf("\t- Remove any Default_Ignorable_Code_Point.\n"); | 
 | 	printf("\t- Apply a full casefold (C + F).\n"); | 
 | 	printf("\n"); | 
 | 	printf("These forms were chosen as being most useful when dealing\n"); | 
 | 	printf("with file names: NFD catches most cases where characters\n"); | 
 | 	printf("should be considered equivalent. The ignorables are mostly\n"); | 
 | 	printf("invisible, making names hard to type.\n"); | 
 | 	printf("\n"); | 
 | 	printf("The options to specify the files to be used are listed\n"); | 
 | 	printf("below with their default values, which are the names used\n"); | 
 | 	printf("by version 11.0.0 of the Unicode Character Database.\n"); | 
 | 	printf("\n"); | 
 | 	printf("The input files:\n"); | 
 | 	printf("\t-a %s\n", AGE_NAME); | 
 | 	printf("\t-c %s\n", CCC_NAME); | 
 | 	printf("\t-p %s\n", PROP_NAME); | 
 | 	printf("\t-d %s\n", DATA_NAME); | 
 | 	printf("\t-f %s\n", FOLD_NAME); | 
 | 	printf("\t-n %s\n", NORM_NAME); | 
 | 	printf("\n"); | 
 | 	printf("Additionally, the generated tables are tested using:\n"); | 
 | 	printf("\t-t %s\n", TEST_NAME); | 
 | 	printf("\n"); | 
 | 	printf("Finally, the output file:\n"); | 
 | 	printf("\t-o %s\n", UTF8_NAME); | 
 | 	printf("\n"); | 
 | } | 
 |  | 
 | static void usage(void) | 
 | { | 
 | 	help(); | 
 | 	exit(1); | 
 | } | 
 |  | 
 | static void open_fail(const char *name, int error) | 
 | { | 
 | 	printf("Error %d opening %s: %s\n", error, name, strerror(error)); | 
 | 	exit(1); | 
 | } | 
 |  | 
 | static void file_fail(const char *filename) | 
 | { | 
 | 	printf("Error parsing %s\n", filename); | 
 | 	exit(1); | 
 | } | 
 |  | 
 | static void line_fail(const char *filename, const char *line) | 
 | { | 
 | 	printf("Error parsing %s:%s\n", filename, line); | 
 | 	exit(1); | 
 | } | 
 |  | 
 | /* ------------------------------------------------------------------ */ | 
 |  | 
 | static void print_utf32(unsigned int *utf32str) | 
 | { | 
 | 	int	i; | 
 |  | 
 | 	for (i = 0; utf32str[i]; i++) | 
 | 		printf(" %X", utf32str[i]); | 
 | } | 
 |  | 
 | static void print_utf32nfdi(unsigned int unichar) | 
 | { | 
 | 	printf(" %X ->", unichar); | 
 | 	print_utf32(unicode_data[unichar].utf32nfdi); | 
 | 	printf("\n"); | 
 | } | 
 |  | 
 | static void print_utf32nfdicf(unsigned int unichar) | 
 | { | 
 | 	printf(" %X ->", unichar); | 
 | 	print_utf32(unicode_data[unichar].utf32nfdicf); | 
 | 	printf("\n"); | 
 | } | 
 |  | 
 | /* ------------------------------------------------------------------ */ | 
 |  | 
 | static void age_init(void) | 
 | { | 
 | 	FILE *file; | 
 | 	unsigned int first; | 
 | 	unsigned int last; | 
 | 	unsigned int unichar; | 
 | 	unsigned int major; | 
 | 	unsigned int minor; | 
 | 	unsigned int revision; | 
 | 	int gen; | 
 | 	int count; | 
 | 	int ret; | 
 |  | 
 | 	if (verbose > 0) | 
 | 		printf("Parsing %s\n", age_name); | 
 |  | 
 | 	file = fopen(age_name, "r"); | 
 | 	if (!file) | 
 | 		open_fail(age_name, errno); | 
 | 	count = 0; | 
 |  | 
 | 	gen = 0; | 
 | 	while (fgets(line, LINESIZE, file)) { | 
 | 		ret = sscanf(line, "# Age=V%d_%d_%d", | 
 | 				&major, &minor, &revision); | 
 | 		if (ret == 3) { | 
 | 			ages_count++; | 
 | 			if (verbose > 1) | 
 | 				printf(" Age V%d_%d_%d\n", | 
 | 					major, minor, revision); | 
 | 			if (!age_valid(major, minor, revision)) | 
 | 				line_fail(age_name, line); | 
 | 			continue; | 
 | 		} | 
 | 		ret = sscanf(line, "# Age=V%d_%d", &major, &minor); | 
 | 		if (ret == 2) { | 
 | 			ages_count++; | 
 | 			if (verbose > 1) | 
 | 				printf(" Age V%d_%d\n", major, minor); | 
 | 			if (!age_valid(major, minor, 0)) | 
 | 				line_fail(age_name, line); | 
 | 			continue; | 
 | 		} | 
 | 	} | 
 |  | 
 | 	/* We must have found something above. */ | 
 | 	if (verbose > 1) | 
 | 		printf("%d age entries\n", ages_count); | 
 | 	if (ages_count == 0 || ages_count > MAXGEN) | 
 | 		file_fail(age_name); | 
 |  | 
 | 	/* There is a 0 entry. */ | 
 | 	ages_count++; | 
 | 	ages = calloc(ages_count + 1, sizeof(*ages)); | 
 | 	/* And a guard entry. */ | 
 | 	ages[ages_count] = (unsigned int)-1; | 
 |  | 
 | 	rewind(file); | 
 | 	count = 0; | 
 | 	gen = 0; | 
 | 	while (fgets(line, LINESIZE, file)) { | 
 | 		ret = sscanf(line, "# Age=V%d_%d_%d", | 
 | 				&major, &minor, &revision); | 
 | 		if (ret == 3) { | 
 | 			ages[++gen] = | 
 | 				UNICODE_AGE(major, minor, revision); | 
 | 			if (verbose > 1) | 
 | 				printf(" Age V%d_%d_%d = gen %d\n", | 
 | 					major, minor, revision, gen); | 
 | 			if (!age_valid(major, minor, revision)) | 
 | 				line_fail(age_name, line); | 
 | 			continue; | 
 | 		} | 
 | 		ret = sscanf(line, "# Age=V%d_%d", &major, &minor); | 
 | 		if (ret == 2) { | 
 | 			ages[++gen] = UNICODE_AGE(major, minor, 0); | 
 | 			if (verbose > 1) | 
 | 				printf(" Age V%d_%d = %d\n", | 
 | 					major, minor, gen); | 
 | 			if (!age_valid(major, minor, 0)) | 
 | 				line_fail(age_name, line); | 
 | 			continue; | 
 | 		} | 
 | 		ret = sscanf(line, "%X..%X ; %d.%d #", | 
 | 			     &first, &last, &major, &minor); | 
 | 		if (ret == 4) { | 
 | 			for (unichar = first; unichar <= last; unichar++) | 
 | 				unicode_data[unichar].gen = gen; | 
 | 			count += 1 + last - first; | 
 | 			if (verbose > 1) | 
 | 				printf("  %X..%X gen %d\n", first, last, gen); | 
 | 			if (!utf32valid(first) || !utf32valid(last)) | 
 | 				line_fail(age_name, line); | 
 | 			continue; | 
 | 		} | 
 | 		ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor); | 
 | 		if (ret == 3) { | 
 | 			unicode_data[unichar].gen = gen; | 
 | 			count++; | 
 | 			if (verbose > 1) | 
 | 				printf("  %X gen %d\n", unichar, gen); | 
 | 			if (!utf32valid(unichar)) | 
 | 				line_fail(age_name, line); | 
 | 			continue; | 
 | 		} | 
 | 	} | 
 | 	unicode_maxage = ages[gen]; | 
 | 	fclose(file); | 
 |  | 
 | 	/* Nix surrogate block */ | 
 | 	if (verbose > 1) | 
 | 		printf(" Removing surrogate block D800..DFFF\n"); | 
 | 	for (unichar = 0xd800; unichar <= 0xdfff; unichar++) | 
 | 		unicode_data[unichar].gen = -1; | 
 |  | 
 | 	if (verbose > 0) | 
 | 	        printf("Found %d entries\n", count); | 
 | 	if (count == 0) | 
 | 		file_fail(age_name); | 
 | } | 
 |  | 
 | static void ccc_init(void) | 
 | { | 
 | 	FILE *file; | 
 | 	unsigned int first; | 
 | 	unsigned int last; | 
 | 	unsigned int unichar; | 
 | 	unsigned int value; | 
 | 	int count; | 
 | 	int ret; | 
 |  | 
 | 	if (verbose > 0) | 
 | 		printf("Parsing %s\n", ccc_name); | 
 |  | 
 | 	file = fopen(ccc_name, "r"); | 
 | 	if (!file) | 
 | 		open_fail(ccc_name, errno); | 
 |  | 
 | 	count = 0; | 
 | 	while (fgets(line, LINESIZE, file)) { | 
 | 		ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value); | 
 | 		if (ret == 3) { | 
 | 			for (unichar = first; unichar <= last; unichar++) { | 
 | 				unicode_data[unichar].ccc = value; | 
 |                                 count++; | 
 | 			} | 
 | 			if (verbose > 1) | 
 | 				printf(" %X..%X ccc %d\n", first, last, value); | 
 | 			if (!utf32valid(first) || !utf32valid(last)) | 
 | 				line_fail(ccc_name, line); | 
 | 			continue; | 
 | 		} | 
 | 		ret = sscanf(line, "%X ; %d #", &unichar, &value); | 
 | 		if (ret == 2) { | 
 | 			unicode_data[unichar].ccc = value; | 
 |                         count++; | 
 | 			if (verbose > 1) | 
 | 				printf(" %X ccc %d\n", unichar, value); | 
 | 			if (!utf32valid(unichar)) | 
 | 				line_fail(ccc_name, line); | 
 | 			continue; | 
 | 		} | 
 | 	} | 
 | 	fclose(file); | 
 |  | 
 | 	if (verbose > 0) | 
 | 		printf("Found %d entries\n", count); | 
 | 	if (count == 0) | 
 | 		file_fail(ccc_name); | 
 | } | 
 |  | 
 | static int ignore_compatibility_form(char *type) | 
 | { | 
 | 	int i; | 
 | 	char *ignored_types[] = {"font", "noBreak", "initial", "medial", | 
 | 				 "final", "isolated", "circle", "super", | 
 | 				 "sub", "vertical", "wide", "narrow", | 
 | 				 "small", "square", "fraction", "compat"}; | 
 |  | 
 | 	for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++) | 
 | 		if (strcmp(type, ignored_types[i]) == 0) | 
 | 			return 1; | 
 | 	return 0; | 
 | } | 
 |  | 
 | static void nfdi_init(void) | 
 | { | 
 | 	FILE *file; | 
 | 	unsigned int unichar; | 
 | 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ | 
 | 	char *s; | 
 | 	char *type; | 
 | 	unsigned int *um; | 
 | 	int count; | 
 | 	int i; | 
 | 	int ret; | 
 |  | 
 | 	if (verbose > 0) | 
 | 		printf("Parsing %s\n", data_name); | 
 | 	file = fopen(data_name, "r"); | 
 | 	if (!file) | 
 | 		open_fail(data_name, errno); | 
 |  | 
 | 	count = 0; | 
 | 	while (fgets(line, LINESIZE, file)) { | 
 | 		ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];", | 
 | 			     &unichar, buf0); | 
 | 		if (ret != 2) | 
 | 			continue; | 
 | 		if (!utf32valid(unichar)) | 
 | 			line_fail(data_name, line); | 
 |  | 
 | 		s = buf0; | 
 | 		/* skip over <tag> */ | 
 | 		if (*s == '<') { | 
 | 			type = ++s; | 
 | 			while (*++s != '>'); | 
 | 			*s++ = '\0'; | 
 | 			if(ignore_compatibility_form(type)) | 
 | 				continue; | 
 | 		} | 
 | 		/* decode the decomposition into UTF-32 */ | 
 | 		i = 0; | 
 | 		while (*s) { | 
 | 			mapping[i] = strtoul(s, &s, 16); | 
 | 			if (!utf32valid(mapping[i])) | 
 | 				line_fail(data_name, line); | 
 | 			i++; | 
 | 		} | 
 | 		mapping[i++] = 0; | 
 |  | 
 | 		um = malloc(i * sizeof(unsigned int)); | 
 | 		memcpy(um, mapping, i * sizeof(unsigned int)); | 
 | 		unicode_data[unichar].utf32nfdi = um; | 
 |  | 
 | 		if (verbose > 1) | 
 | 			print_utf32nfdi(unichar); | 
 | 		count++; | 
 | 	} | 
 | 	fclose(file); | 
 | 	if (verbose > 0) | 
 | 		printf("Found %d entries\n", count); | 
 | 	if (count == 0) | 
 | 		file_fail(data_name); | 
 | } | 
 |  | 
 | static void nfdicf_init(void) | 
 | { | 
 | 	FILE *file; | 
 | 	unsigned int unichar; | 
 | 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ | 
 | 	char status; | 
 | 	char *s; | 
 | 	unsigned int *um; | 
 | 	int i; | 
 | 	int count; | 
 | 	int ret; | 
 |  | 
 | 	if (verbose > 0) | 
 | 		printf("Parsing %s\n", fold_name); | 
 | 	file = fopen(fold_name, "r"); | 
 | 	if (!file) | 
 | 		open_fail(fold_name, errno); | 
 |  | 
 | 	count = 0; | 
 | 	while (fgets(line, LINESIZE, file)) { | 
 | 		ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0); | 
 | 		if (ret != 3) | 
 | 			continue; | 
 | 		if (!utf32valid(unichar)) | 
 | 			line_fail(fold_name, line); | 
 | 		/* Use the C+F casefold. */ | 
 | 		if (status != 'C' && status != 'F') | 
 | 			continue; | 
 | 		s = buf0; | 
 | 		if (*s == '<') | 
 | 			while (*s++ != ' ') | 
 | 				; | 
 | 		i = 0; | 
 | 		while (*s) { | 
 | 			mapping[i] = strtoul(s, &s, 16); | 
 | 			if (!utf32valid(mapping[i])) | 
 | 				line_fail(fold_name, line); | 
 | 			i++; | 
 | 		} | 
 | 		mapping[i++] = 0; | 
 |  | 
 | 		um = malloc(i * sizeof(unsigned int)); | 
 | 		memcpy(um, mapping, i * sizeof(unsigned int)); | 
 | 		unicode_data[unichar].utf32nfdicf = um; | 
 |  | 
 | 		if (verbose > 1) | 
 | 			print_utf32nfdicf(unichar); | 
 | 		count++; | 
 | 	} | 
 | 	fclose(file); | 
 | 	if (verbose > 0) | 
 | 		printf("Found %d entries\n", count); | 
 | 	if (count == 0) | 
 | 		file_fail(fold_name); | 
 | } | 
 |  | 
 | static void ignore_init(void) | 
 | { | 
 | 	FILE *file; | 
 | 	unsigned int unichar; | 
 | 	unsigned int first; | 
 | 	unsigned int last; | 
 | 	unsigned int *um; | 
 | 	int count; | 
 | 	int ret; | 
 |  | 
 | 	if (verbose > 0) | 
 | 		printf("Parsing %s\n", prop_name); | 
 | 	file = fopen(prop_name, "r"); | 
 | 	if (!file) | 
 | 		open_fail(prop_name, errno); | 
 | 	assert(file); | 
 | 	count = 0; | 
 | 	while (fgets(line, LINESIZE, file)) { | 
 | 		ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0); | 
 | 		if (ret == 3) { | 
 | 			if (strcmp(buf0, "Default_Ignorable_Code_Point")) | 
 | 				continue; | 
 | 			if (!utf32valid(first) || !utf32valid(last)) | 
 | 				line_fail(prop_name, line); | 
 | 			for (unichar = first; unichar <= last; unichar++) { | 
 | 				free(unicode_data[unichar].utf32nfdi); | 
 | 				um = malloc(sizeof(unsigned int)); | 
 | 				*um = 0; | 
 | 				unicode_data[unichar].utf32nfdi = um; | 
 | 				free(unicode_data[unichar].utf32nfdicf); | 
 | 				um = malloc(sizeof(unsigned int)); | 
 | 				*um = 0; | 
 | 				unicode_data[unichar].utf32nfdicf = um; | 
 | 				count++; | 
 | 			} | 
 | 			if (verbose > 1) | 
 | 				printf(" %X..%X Default_Ignorable_Code_Point\n", | 
 | 					first, last); | 
 | 			continue; | 
 | 		} | 
 | 		ret = sscanf(line, "%X ; %s # ", &unichar, buf0); | 
 | 		if (ret == 2) { | 
 | 			if (strcmp(buf0, "Default_Ignorable_Code_Point")) | 
 | 				continue; | 
 | 			if (!utf32valid(unichar)) | 
 | 				line_fail(prop_name, line); | 
 | 			free(unicode_data[unichar].utf32nfdi); | 
 | 			um = malloc(sizeof(unsigned int)); | 
 | 			*um = 0; | 
 | 			unicode_data[unichar].utf32nfdi = um; | 
 | 			free(unicode_data[unichar].utf32nfdicf); | 
 | 			um = malloc(sizeof(unsigned int)); | 
 | 			*um = 0; | 
 | 			unicode_data[unichar].utf32nfdicf = um; | 
 | 			if (verbose > 1) | 
 | 				printf(" %X Default_Ignorable_Code_Point\n", | 
 | 					unichar); | 
 | 			count++; | 
 | 			continue; | 
 | 		} | 
 | 	} | 
 | 	fclose(file); | 
 |  | 
 | 	if (verbose > 0) | 
 | 		printf("Found %d entries\n", count); | 
 | 	if (count == 0) | 
 | 		file_fail(prop_name); | 
 | } | 
 |  | 
 | static void corrections_init(void) | 
 | { | 
 | 	FILE *file; | 
 | 	unsigned int unichar; | 
 | 	unsigned int major; | 
 | 	unsigned int minor; | 
 | 	unsigned int revision; | 
 | 	unsigned int age; | 
 | 	unsigned int *um; | 
 | 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ | 
 | 	char *s; | 
 | 	int i; | 
 | 	int count; | 
 | 	int ret; | 
 |  | 
 | 	if (verbose > 0) | 
 | 		printf("Parsing %s\n", norm_name); | 
 | 	file = fopen(norm_name, "r"); | 
 | 	if (!file) | 
 | 		open_fail(norm_name, errno); | 
 |  | 
 | 	count = 0; | 
 | 	while (fgets(line, LINESIZE, file)) { | 
 | 		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #", | 
 | 				&unichar, buf0, buf1, | 
 | 				&major, &minor, &revision); | 
 | 		if (ret != 6) | 
 | 			continue; | 
 | 		if (!utf32valid(unichar) || !age_valid(major, minor, revision)) | 
 | 			line_fail(norm_name, line); | 
 | 		count++; | 
 | 	} | 
 | 	corrections = calloc(count, sizeof(struct unicode_data)); | 
 | 	corrections_count = count; | 
 | 	rewind(file); | 
 |  | 
 | 	count = 0; | 
 | 	while (fgets(line, LINESIZE, file)) { | 
 | 		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #", | 
 | 				&unichar, buf0, buf1, | 
 | 				&major, &minor, &revision); | 
 | 		if (ret != 6) | 
 | 			continue; | 
 | 		if (!utf32valid(unichar) || !age_valid(major, minor, revision)) | 
 | 			line_fail(norm_name, line); | 
 | 		corrections[count] = unicode_data[unichar]; | 
 | 		assert(corrections[count].code == unichar); | 
 | 		age = UNICODE_AGE(major, minor, revision); | 
 | 		corrections[count].correction = age; | 
 |  | 
 | 		i = 0; | 
 | 		s = buf0; | 
 | 		while (*s) { | 
 | 			mapping[i] = strtoul(s, &s, 16); | 
 | 			if (!utf32valid(mapping[i])) | 
 | 				line_fail(norm_name, line); | 
 | 			i++; | 
 | 		} | 
 | 		mapping[i++] = 0; | 
 |  | 
 | 		um = malloc(i * sizeof(unsigned int)); | 
 | 		memcpy(um, mapping, i * sizeof(unsigned int)); | 
 | 		corrections[count].utf32nfdi = um; | 
 |  | 
 | 		if (verbose > 1) | 
 | 			printf(" %X -> %s -> %s V%d_%d_%d\n", | 
 | 				unichar, buf0, buf1, major, minor, revision); | 
 | 		count++; | 
 | 	} | 
 | 	fclose(file); | 
 |  | 
 | 	if (verbose > 0) | 
 | 	        printf("Found %d entries\n", count); | 
 | 	if (count == 0) | 
 | 		file_fail(norm_name); | 
 | } | 
 |  | 
 | /* ------------------------------------------------------------------ */ | 
 |  | 
 | /* | 
 |  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) | 
 |  * | 
 |  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; | 
 |  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; | 
 |  * | 
 |  * SBase = 0xAC00 | 
 |  * LBase = 0x1100 | 
 |  * VBase = 0x1161 | 
 |  * TBase = 0x11A7 | 
 |  * LCount = 19 | 
 |  * VCount = 21 | 
 |  * TCount = 28 | 
 |  * NCount = 588 (VCount * TCount) | 
 |  * SCount = 11172 (LCount * NCount) | 
 |  * | 
 |  * Decomposition: | 
 |  *   SIndex = s - SBase | 
 |  * | 
 |  * LV (Canonical/Full) | 
 |  *   LIndex = SIndex / NCount | 
 |  *   VIndex = (Sindex % NCount) / TCount | 
 |  *   LPart = LBase + LIndex | 
 |  *   VPart = VBase + VIndex | 
 |  * | 
 |  * LVT (Canonical) | 
 |  *   LVIndex = (SIndex / TCount) * TCount | 
 |  *   TIndex = (Sindex % TCount) | 
 |  *   LVPart = SBase + LVIndex | 
 |  *   TPart = TBase + TIndex | 
 |  * | 
 |  * LVT (Full) | 
 |  *   LIndex = SIndex / NCount | 
 |  *   VIndex = (Sindex % NCount) / TCount | 
 |  *   TIndex = (Sindex % TCount) | 
 |  *   LPart = LBase + LIndex | 
 |  *   VPart = VBase + VIndex | 
 |  *   if (TIndex == 0) { | 
 |  *          d = <LPart, VPart> | 
 |  *   } else { | 
 |  *          TPart = TBase + TIndex | 
 |  *          d = <LPart, VPart, TPart> | 
 |  *   } | 
 |  * | 
 |  */ | 
 |  | 
 | static void hangul_decompose(void) | 
 | { | 
 | 	unsigned int sb = 0xAC00; | 
 | 	unsigned int lb = 0x1100; | 
 | 	unsigned int vb = 0x1161; | 
 | 	unsigned int tb = 0x11a7; | 
 | 	/* unsigned int lc = 19; */ | 
 | 	unsigned int vc = 21; | 
 | 	unsigned int tc = 28; | 
 | 	unsigned int nc = (vc * tc); | 
 | 	/* unsigned int sc = (lc * nc); */ | 
 | 	unsigned int unichar; | 
 | 	unsigned int mapping[4]; | 
 | 	unsigned int *um; | 
 |         int count; | 
 | 	int i; | 
 |  | 
 | 	if (verbose > 0) | 
 | 		printf("Decomposing hangul\n"); | 
 | 	/* Hangul */ | 
 | 	count = 0; | 
 | 	for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) { | 
 | 		unsigned int si = unichar - sb; | 
 | 		unsigned int li = si / nc; | 
 | 		unsigned int vi = (si % nc) / tc; | 
 | 		unsigned int ti = si % tc; | 
 |  | 
 | 		i = 0; | 
 | 		mapping[i++] = lb + li; | 
 | 		mapping[i++] = vb + vi; | 
 | 		if (ti) | 
 | 			mapping[i++] = tb + ti; | 
 | 		mapping[i++] = 0; | 
 |  | 
 | 		assert(!unicode_data[unichar].utf32nfdi); | 
 | 		um = malloc(i * sizeof(unsigned int)); | 
 | 		memcpy(um, mapping, i * sizeof(unsigned int)); | 
 | 		unicode_data[unichar].utf32nfdi = um; | 
 |  | 
 | 		assert(!unicode_data[unichar].utf32nfdicf); | 
 | 		um = malloc(i * sizeof(unsigned int)); | 
 | 		memcpy(um, mapping, i * sizeof(unsigned int)); | 
 | 		unicode_data[unichar].utf32nfdicf = um; | 
 |  | 
 | 		/* | 
 | 		 * Add a cookie as a reminder that the hangul syllable | 
 | 		 * decompositions must not be stored in the generated | 
 | 		 * trie. | 
 | 		 */ | 
 | 		unicode_data[unichar].utf8nfdi = malloc(2); | 
 | 		unicode_data[unichar].utf8nfdi[0] = HANGUL; | 
 | 		unicode_data[unichar].utf8nfdi[1] = '\0'; | 
 |  | 
 | 		if (verbose > 1) | 
 | 			print_utf32nfdi(unichar); | 
 |  | 
 | 		count++; | 
 | 	} | 
 | 	if (verbose > 0) | 
 | 		printf("Created %d entries\n", count); | 
 | } | 
 |  | 
 | static void nfdi_decompose(void) | 
 | { | 
 | 	unsigned int unichar; | 
 | 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ | 
 | 	unsigned int *um; | 
 | 	unsigned int *dc; | 
 | 	int count; | 
 | 	int i; | 
 | 	int j; | 
 | 	int ret; | 
 |  | 
 | 	if (verbose > 0) | 
 | 		printf("Decomposing nfdi\n"); | 
 |  | 
 | 	count = 0; | 
 | 	for (unichar = 0; unichar != 0x110000; unichar++) { | 
 | 		if (!unicode_data[unichar].utf32nfdi) | 
 | 			continue; | 
 | 		for (;;) { | 
 | 			ret = 1; | 
 | 			i = 0; | 
 | 			um = unicode_data[unichar].utf32nfdi; | 
 | 			while (*um) { | 
 | 				dc = unicode_data[*um].utf32nfdi; | 
 | 				if (dc) { | 
 | 					for (j = 0; dc[j]; j++) | 
 | 						mapping[i++] = dc[j]; | 
 | 					ret = 0; | 
 | 				} else { | 
 | 					mapping[i++] = *um; | 
 | 				} | 
 | 				um++; | 
 | 			} | 
 | 			mapping[i++] = 0; | 
 | 			if (ret) | 
 | 				break; | 
 | 			free(unicode_data[unichar].utf32nfdi); | 
 | 			um = malloc(i * sizeof(unsigned int)); | 
 | 			memcpy(um, mapping, i * sizeof(unsigned int)); | 
 | 			unicode_data[unichar].utf32nfdi = um; | 
 | 		} | 
 | 		/* Add this decomposition to nfdicf if there is no entry. */ | 
 | 		if (!unicode_data[unichar].utf32nfdicf) { | 
 | 			um = malloc(i * sizeof(unsigned int)); | 
 | 			memcpy(um, mapping, i * sizeof(unsigned int)); | 
 | 			unicode_data[unichar].utf32nfdicf = um; | 
 | 		} | 
 | 		if (verbose > 1) | 
 | 			print_utf32nfdi(unichar); | 
 | 		count++; | 
 | 	} | 
 | 	if (verbose > 0) | 
 | 		printf("Processed %d entries\n", count); | 
 | } | 
 |  | 
 | static void nfdicf_decompose(void) | 
 | { | 
 | 	unsigned int unichar; | 
 | 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ | 
 | 	unsigned int *um; | 
 | 	unsigned int *dc; | 
 | 	int count; | 
 | 	int i; | 
 | 	int j; | 
 | 	int ret; | 
 |  | 
 | 	if (verbose > 0) | 
 | 		printf("Decomposing nfdicf\n"); | 
 | 	count = 0; | 
 | 	for (unichar = 0; unichar != 0x110000; unichar++) { | 
 | 		if (!unicode_data[unichar].utf32nfdicf) | 
 | 			continue; | 
 | 		for (;;) { | 
 | 			ret = 1; | 
 | 			i = 0; | 
 | 			um = unicode_data[unichar].utf32nfdicf; | 
 | 			while (*um) { | 
 | 				dc = unicode_data[*um].utf32nfdicf; | 
 | 				if (dc) { | 
 | 					for (j = 0; dc[j]; j++) | 
 | 						mapping[i++] = dc[j]; | 
 | 					ret = 0; | 
 | 				} else { | 
 | 					mapping[i++] = *um; | 
 | 				} | 
 | 				um++; | 
 | 			} | 
 | 			mapping[i++] = 0; | 
 | 			if (ret) | 
 | 				break; | 
 | 			free(unicode_data[unichar].utf32nfdicf); | 
 | 			um = malloc(i * sizeof(unsigned int)); | 
 | 			memcpy(um, mapping, i * sizeof(unsigned int)); | 
 | 			unicode_data[unichar].utf32nfdicf = um; | 
 | 		} | 
 | 		if (verbose > 1) | 
 | 			print_utf32nfdicf(unichar); | 
 | 		count++; | 
 | 	} | 
 | 	if (verbose > 0) | 
 | 		printf("Processed %d entries\n", count); | 
 | } | 
 |  | 
 | /* ------------------------------------------------------------------ */ | 
 |  | 
 | int utf8agemax(struct tree *, const char *); | 
 | int utf8nagemax(struct tree *, const char *, size_t); | 
 | int utf8agemin(struct tree *, const char *); | 
 | int utf8nagemin(struct tree *, const char *, size_t); | 
 | ssize_t utf8len(struct tree *, const char *); | 
 | ssize_t utf8nlen(struct tree *, const char *, size_t); | 
 | struct utf8cursor; | 
 | int utf8cursor(struct utf8cursor *, struct tree *, const char *); | 
 | int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t); | 
 | int utf8byte(struct utf8cursor *); | 
 |  | 
 | /* | 
 |  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) | 
 |  * | 
 |  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; | 
 |  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; | 
 |  * | 
 |  * SBase = 0xAC00 | 
 |  * LBase = 0x1100 | 
 |  * VBase = 0x1161 | 
 |  * TBase = 0x11A7 | 
 |  * LCount = 19 | 
 |  * VCount = 21 | 
 |  * TCount = 28 | 
 |  * NCount = 588 (VCount * TCount) | 
 |  * SCount = 11172 (LCount * NCount) | 
 |  * | 
 |  * Decomposition: | 
 |  *   SIndex = s - SBase | 
 |  * | 
 |  * LV (Canonical/Full) | 
 |  *   LIndex = SIndex / NCount | 
 |  *   VIndex = (Sindex % NCount) / TCount | 
 |  *   LPart = LBase + LIndex | 
 |  *   VPart = VBase + VIndex | 
 |  * | 
 |  * LVT (Canonical) | 
 |  *   LVIndex = (SIndex / TCount) * TCount | 
 |  *   TIndex = (Sindex % TCount) | 
 |  *   LVPart = SBase + LVIndex | 
 |  *   TPart = TBase + TIndex | 
 |  * | 
 |  * LVT (Full) | 
 |  *   LIndex = SIndex / NCount | 
 |  *   VIndex = (Sindex % NCount) / TCount | 
 |  *   TIndex = (Sindex % TCount) | 
 |  *   LPart = LBase + LIndex | 
 |  *   VPart = VBase + VIndex | 
 |  *   if (TIndex == 0) { | 
 |  *          d = <LPart, VPart> | 
 |  *   } else { | 
 |  *          TPart = TBase + TIndex | 
 |  *          d = <LPart, VPart, TPart> | 
 |  *   } | 
 |  */ | 
 |  | 
 | /* Constants */ | 
 | #define SB	(0xAC00) | 
 | #define LB	(0x1100) | 
 | #define VB	(0x1161) | 
 | #define TB	(0x11A7) | 
 | #define LC	(19) | 
 | #define VC	(21) | 
 | #define TC	(28) | 
 | #define NC	(VC * TC) | 
 | #define SC	(LC * NC) | 
 |  | 
 | /* Algorithmic decomposition of hangul syllable. */ | 
 | static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul) | 
 | { | 
 | 	unsigned int	si; | 
 | 	unsigned int	li; | 
 | 	unsigned int	vi; | 
 | 	unsigned int	ti; | 
 | 	unsigned char	*h; | 
 |  | 
 | 	/* Calculate the SI, LI, VI, and TI values. */ | 
 | 	si = utf8decode(str) - SB; | 
 | 	li = si / NC; | 
 | 	vi = (si % NC) / TC; | 
 | 	ti = si % TC; | 
 |  | 
 | 	/* Fill in base of leaf. */ | 
 | 	h = hangul; | 
 | 	LEAF_GEN(h) = 2; | 
 | 	LEAF_CCC(h) = DECOMPOSE; | 
 | 	h += 2; | 
 |  | 
 | 	/* Add LPart, a 3-byte UTF-8 sequence. */ | 
 | 	h += utf8encode((char *)h, li + LB); | 
 |  | 
 | 	/* Add VPart, a 3-byte UTF-8 sequence. */ | 
 | 	h += utf8encode((char *)h, vi + VB); | 
 |  | 
 | 	/* Add TPart if required, also a 3-byte UTF-8 sequence. */ | 
 | 	if (ti) | 
 | 		h += utf8encode((char *)h, ti + TB); | 
 |  | 
 | 	/* Terminate string. */ | 
 | 	h[0] = '\0'; | 
 |  | 
 | 	return hangul; | 
 | } | 
 |  | 
 | /* | 
 |  * Use trie to scan s, touching at most len bytes. | 
 |  * Returns the leaf if one exists, NULL otherwise. | 
 |  * | 
 |  * A non-NULL return guarantees that the UTF-8 sequence starting at s | 
 |  * is well-formed and corresponds to a known unicode code point.  The | 
 |  * shorthand for this will be "is valid UTF-8 unicode". | 
 |  */ | 
 | static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul, | 
 | 			       const char *s, size_t len) | 
 | { | 
 | 	utf8trie_t	*trie; | 
 | 	int		offlen; | 
 | 	int		offset; | 
 | 	int		mask; | 
 | 	int		node; | 
 |  | 
 | 	if (!tree) | 
 | 		return NULL; | 
 | 	if (len == 0) | 
 | 		return NULL; | 
 | 	node = 1; | 
 | 	trie = utf8data + tree->index; | 
 | 	while (node) { | 
 | 		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; | 
 | 		if (*trie & NEXTBYTE) { | 
 | 			if (--len == 0) | 
 | 				return NULL; | 
 | 			s++; | 
 | 		} | 
 | 		mask = 1 << (*trie & BITNUM); | 
 | 		if (*s & mask) { | 
 | 			/* Right leg */ | 
 | 			if (offlen) { | 
 | 				/* Right node at offset of trie */ | 
 | 				node = (*trie & RIGHTNODE); | 
 | 				offset = trie[offlen]; | 
 | 				while (--offlen) { | 
 | 					offset <<= 8; | 
 | 					offset |= trie[offlen]; | 
 | 				} | 
 | 				trie += offset; | 
 | 			} else if (*trie & RIGHTPATH) { | 
 | 				/* Right node after this node */ | 
 | 				node = (*trie & TRIENODE); | 
 | 				trie++; | 
 | 			} else { | 
 | 				/* No right node. */ | 
 | 				return NULL; | 
 | 			} | 
 | 		} else { | 
 | 			/* Left leg */ | 
 | 			if (offlen) { | 
 | 				/* Left node after this node. */ | 
 | 				node = (*trie & LEFTNODE); | 
 | 				trie += offlen + 1; | 
 | 			} else if (*trie & RIGHTPATH) { | 
 | 				/* No left node. */ | 
 | 				return NULL; | 
 | 			} else { | 
 | 				/* Left node after this node */ | 
 | 				node = (*trie & TRIENODE); | 
 | 				trie++; | 
 | 			} | 
 | 		} | 
 | 	} | 
 | 	/* | 
 | 	 * Hangul decomposition is done algorithmically. These are the | 
 | 	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is | 
 | 	 * always 3 bytes long, so s has been advanced twice, and the | 
 | 	 * start of the sequence is at s-2. | 
 | 	 */ | 
 | 	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) | 
 | 		trie = utf8hangul(s - 2, hangul); | 
 | 	return trie; | 
 | } | 
 |  | 
 | /* | 
 |  * Use trie to scan s. | 
 |  * Returns the leaf if one exists, NULL otherwise. | 
 |  * | 
 |  * Forwards to trie_nlookup(). | 
 |  */ | 
 | static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul, | 
 | 			      const char *s) | 
 | { | 
 | 	return utf8nlookup(tree, hangul, s, (size_t)-1); | 
 | } | 
 |  | 
 | /* | 
 |  * Return the number of bytes used by the current UTF-8 sequence. | 
 |  * Assumes the input points to the first byte of a valid UTF-8 | 
 |  * sequence. | 
 |  */ | 
 | static inline int utf8clen(const char *s) | 
 | { | 
 | 	unsigned char c = *s; | 
 | 	return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); | 
 | } | 
 |  | 
 | /* | 
 |  * Maximum age of any character in s. | 
 |  * Return -1 if s is not valid UTF-8 unicode. | 
 |  * Return 0 if only non-assigned code points are used. | 
 |  */ | 
 | int utf8agemax(struct tree *tree, const char *s) | 
 | { | 
 | 	utf8leaf_t	*leaf; | 
 | 	int		age = 0; | 
 | 	int		leaf_age; | 
 | 	unsigned char	hangul[UTF8HANGULLEAF]; | 
 |  | 
 | 	if (!tree) | 
 | 		return -1; | 
 |  | 
 | 	while (*s) { | 
 | 		leaf = utf8lookup(tree, hangul, s); | 
 | 		if (!leaf) | 
 | 			return -1; | 
 | 		leaf_age = ages[LEAF_GEN(leaf)]; | 
 | 		if (leaf_age <= tree->maxage && leaf_age > age) | 
 | 			age = leaf_age; | 
 | 		s += utf8clen(s); | 
 | 	} | 
 | 	return age; | 
 | } | 
 |  | 
 | /* | 
 |  * Minimum age of any character in s. | 
 |  * Return -1 if s is not valid UTF-8 unicode. | 
 |  * Return 0 if non-assigned code points are used. | 
 |  */ | 
 | int utf8agemin(struct tree *tree, const char *s) | 
 | { | 
 | 	utf8leaf_t	*leaf; | 
 | 	int		age; | 
 | 	int		leaf_age; | 
 | 	unsigned char	hangul[UTF8HANGULLEAF]; | 
 |  | 
 | 	if (!tree) | 
 | 		return -1; | 
 | 	age = tree->maxage; | 
 | 	while (*s) { | 
 | 		leaf = utf8lookup(tree, hangul, s); | 
 | 		if (!leaf) | 
 | 			return -1; | 
 | 		leaf_age = ages[LEAF_GEN(leaf)]; | 
 | 		if (leaf_age <= tree->maxage && leaf_age < age) | 
 | 			age = leaf_age; | 
 | 		s += utf8clen(s); | 
 | 	} | 
 | 	return age; | 
 | } | 
 |  | 
 | /* | 
 |  * Maximum age of any character in s, touch at most len bytes. | 
 |  * Return -1 if s is not valid UTF-8 unicode. | 
 |  */ | 
 | int utf8nagemax(struct tree *tree, const char *s, size_t len) | 
 | { | 
 | 	utf8leaf_t	*leaf; | 
 | 	int		age = 0; | 
 | 	int		leaf_age; | 
 | 	unsigned char	hangul[UTF8HANGULLEAF]; | 
 |  | 
 | 	if (!tree) | 
 | 		return -1; | 
 |  | 
 |         while (len && *s) { | 
 | 		leaf = utf8nlookup(tree, hangul, s, len); | 
 | 		if (!leaf) | 
 | 			return -1; | 
 | 		leaf_age = ages[LEAF_GEN(leaf)]; | 
 | 		if (leaf_age <= tree->maxage && leaf_age > age) | 
 | 			age = leaf_age; | 
 | 		len -= utf8clen(s); | 
 | 		s += utf8clen(s); | 
 | 	} | 
 | 	return age; | 
 | } | 
 |  | 
 | /* | 
 |  * Maximum age of any character in s, touch at most len bytes. | 
 |  * Return -1 if s is not valid UTF-8 unicode. | 
 |  */ | 
 | int utf8nagemin(struct tree *tree, const char *s, size_t len) | 
 | { | 
 | 	utf8leaf_t	*leaf; | 
 | 	int		leaf_age; | 
 | 	int		age; | 
 | 	unsigned char	hangul[UTF8HANGULLEAF]; | 
 |  | 
 | 	if (!tree) | 
 | 		return -1; | 
 | 	age = tree->maxage; | 
 |         while (len && *s) { | 
 | 		leaf = utf8nlookup(tree, hangul, s, len); | 
 | 		if (!leaf) | 
 | 			return -1; | 
 | 		leaf_age = ages[LEAF_GEN(leaf)]; | 
 | 		if (leaf_age <= tree->maxage && leaf_age < age) | 
 | 			age = leaf_age; | 
 | 		len -= utf8clen(s); | 
 | 		s += utf8clen(s); | 
 | 	} | 
 | 	return age; | 
 | } | 
 |  | 
 | /* | 
 |  * Length of the normalization of s. | 
 |  * Return -1 if s is not valid UTF-8 unicode. | 
 |  * | 
 |  * A string of Default_Ignorable_Code_Point has length 0. | 
 |  */ | 
 | ssize_t utf8len(struct tree *tree, const char *s) | 
 | { | 
 | 	utf8leaf_t	*leaf; | 
 | 	size_t		ret = 0; | 
 | 	unsigned char	hangul[UTF8HANGULLEAF]; | 
 |  | 
 | 	if (!tree) | 
 | 		return -1; | 
 | 	while (*s) { | 
 | 		leaf = utf8lookup(tree, hangul, s); | 
 | 		if (!leaf) | 
 | 			return -1; | 
 | 		if (ages[LEAF_GEN(leaf)] > tree->maxage) | 
 | 			ret += utf8clen(s); | 
 | 		else if (LEAF_CCC(leaf) == DECOMPOSE) | 
 | 			ret += strlen(LEAF_STR(leaf)); | 
 | 		else | 
 | 			ret += utf8clen(s); | 
 | 		s += utf8clen(s); | 
 | 	} | 
 | 	return ret; | 
 | } | 
 |  | 
 | /* | 
 |  * Length of the normalization of s, touch at most len bytes. | 
 |  * Return -1 if s is not valid UTF-8 unicode. | 
 |  */ | 
 | ssize_t utf8nlen(struct tree *tree, const char *s, size_t len) | 
 | { | 
 | 	utf8leaf_t	*leaf; | 
 | 	size_t		ret = 0; | 
 | 	unsigned char	hangul[UTF8HANGULLEAF]; | 
 |  | 
 | 	if (!tree) | 
 | 		return -1; | 
 | 	while (len && *s) { | 
 | 		leaf = utf8nlookup(tree, hangul, s, len); | 
 | 		if (!leaf) | 
 | 			return -1; | 
 | 		if (ages[LEAF_GEN(leaf)] > tree->maxage) | 
 | 			ret += utf8clen(s); | 
 | 		else if (LEAF_CCC(leaf) == DECOMPOSE) | 
 | 			ret += strlen(LEAF_STR(leaf)); | 
 | 		else | 
 | 			ret += utf8clen(s); | 
 | 		len -= utf8clen(s); | 
 | 		s += utf8clen(s); | 
 | 	} | 
 | 	return ret; | 
 | } | 
 |  | 
 | /* | 
 |  * Cursor structure used by the normalizer. | 
 |  */ | 
 | struct utf8cursor { | 
 | 	struct tree	*tree; | 
 | 	const char	*s; | 
 | 	const char	*p; | 
 | 	const char	*ss; | 
 | 	const char	*sp; | 
 | 	unsigned int	len; | 
 | 	unsigned int	slen; | 
 | 	short int	ccc; | 
 | 	short int	nccc; | 
 | 	unsigned int	unichar; | 
 | 	unsigned char	hangul[UTF8HANGULLEAF]; | 
 | }; | 
 |  | 
 | /* | 
 |  * Set up an utf8cursor for use by utf8byte(). | 
 |  * | 
 |  *   s      : string. | 
 |  *   len    : length of s. | 
 |  *   u8c    : pointer to cursor. | 
 |  *   trie   : utf8trie_t to use for normalization. | 
 |  * | 
 |  * Returns -1 on error, 0 on success. | 
 |  */ | 
 | int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s, | 
 | 		size_t len) | 
 | { | 
 | 	if (!tree) | 
 | 		return -1; | 
 | 	if (!s) | 
 | 		return -1; | 
 | 	u8c->tree = tree; | 
 | 	u8c->s = s; | 
 | 	u8c->p = NULL; | 
 | 	u8c->ss = NULL; | 
 | 	u8c->sp = NULL; | 
 | 	u8c->len = len; | 
 | 	u8c->slen = 0; | 
 | 	u8c->ccc = STOPPER; | 
 | 	u8c->nccc = STOPPER; | 
 | 	u8c->unichar = 0; | 
 | 	/* Check we didn't clobber the maximum length. */ | 
 | 	if (u8c->len != len) | 
 | 		return -1; | 
 | 	/* The first byte of s may not be an utf8 continuation. */ | 
 | 	if (len > 0 && (*s & 0xC0) == 0x80) | 
 | 		return -1; | 
 | 	return 0; | 
 | } | 
 |  | 
 | /* | 
 |  * Set up an utf8cursor for use by utf8byte(). | 
 |  * | 
 |  *   s      : NUL-terminated string. | 
 |  *   u8c    : pointer to cursor. | 
 |  *   trie   : utf8trie_t to use for normalization. | 
 |  * | 
 |  * Returns -1 on error, 0 on success. | 
 |  */ | 
 | int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s) | 
 | { | 
 | 	return utf8ncursor(u8c, tree, s, (unsigned int)-1); | 
 | } | 
 |  | 
 | /* | 
 |  * Get one byte from the normalized form of the string described by u8c. | 
 |  * | 
 |  * Returns the byte cast to an unsigned char on succes, and -1 on failure. | 
 |  * | 
 |  * The cursor keeps track of the location in the string in u8c->s. | 
 |  * When a character is decomposed, the current location is stored in | 
 |  * u8c->p, and u8c->s is set to the start of the decomposition. Note | 
 |  * that bytes from a decomposition do not count against u8c->len. | 
 |  * | 
 |  * Characters are emitted if they match the current CCC in u8c->ccc. | 
 |  * Hitting end-of-string while u8c->ccc == STOPPER means we're done, | 
 |  * and the function returns 0 in that case. | 
 |  * | 
 |  * Sorting by CCC is done by repeatedly scanning the string.  The | 
 |  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at | 
 |  * the start of the scan.  The first pass finds the lowest CCC to be | 
 |  * emitted and stores it in u8c->nccc, the second pass emits the | 
 |  * characters with this CCC and finds the next lowest CCC. This limits | 
 |  * the number of passes to 1 + the number of different CCCs in the | 
 |  * sequence being scanned. | 
 |  * | 
 |  * Therefore: | 
 |  *  u8c->p  != NULL -> a decomposition is being scanned. | 
 |  *  u8c->ss != NULL -> this is a repeating scan. | 
 |  *  u8c->ccc == -1  -> this is the first scan of a repeating scan. | 
 |  */ | 
 | int utf8byte(struct utf8cursor *u8c) | 
 | { | 
 | 	utf8leaf_t *leaf; | 
 | 	int ccc; | 
 |  | 
 | 	for (;;) { | 
 | 		/* Check for the end of a decomposed character. */ | 
 | 		if (u8c->p && *u8c->s == '\0') { | 
 | 			u8c->s = u8c->p; | 
 | 			u8c->p = NULL; | 
 | 		} | 
 |  | 
 | 		/* Check for end-of-string. */ | 
 | 		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { | 
 | 			/* There is no next byte. */ | 
 | 			if (u8c->ccc == STOPPER) | 
 | 				return 0; | 
 | 			/* End-of-string during a scan counts as a stopper. */ | 
 | 			ccc = STOPPER; | 
 | 			goto ccc_mismatch; | 
 | 		} else if ((*u8c->s & 0xC0) == 0x80) { | 
 | 			/* This is a continuation of the current character. */ | 
 | 			if (!u8c->p) | 
 | 				u8c->len--; | 
 | 			return (unsigned char)*u8c->s++; | 
 | 		} | 
 |  | 
 | 		/* Look up the data for the current character. */ | 
 | 		if (u8c->p) { | 
 | 			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); | 
 | 		} else { | 
 | 			leaf = utf8nlookup(u8c->tree, u8c->hangul, | 
 | 					   u8c->s, u8c->len); | 
 | 		} | 
 |  | 
 | 		/* No leaf found implies that the input is a binary blob. */ | 
 | 		if (!leaf) | 
 | 			return -1; | 
 |  | 
 | 		/* Characters that are too new have CCC 0. */ | 
 | 		if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) { | 
 | 			ccc = STOPPER; | 
 | 		} else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) { | 
 | 			u8c->len -= utf8clen(u8c->s); | 
 | 			u8c->p = u8c->s + utf8clen(u8c->s); | 
 | 			u8c->s = LEAF_STR(leaf); | 
 | 			/* Empty decomposition implies CCC 0. */ | 
 | 			if (*u8c->s == '\0') { | 
 | 				if (u8c->ccc == STOPPER) | 
 | 					continue; | 
 | 				ccc = STOPPER; | 
 | 				goto ccc_mismatch; | 
 | 			} | 
 | 			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); | 
 | 			ccc = LEAF_CCC(leaf); | 
 | 		} | 
 | 		u8c->unichar = utf8decode(u8c->s); | 
 |  | 
 | 		/* | 
 | 		 * If this is not a stopper, then see if it updates | 
 | 		 * the next canonical class to be emitted. | 
 | 		 */ | 
 | 		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) | 
 | 			u8c->nccc = ccc; | 
 |  | 
 | 		/* | 
 | 		 * Return the current byte if this is the current | 
 | 		 * combining class. | 
 | 		 */ | 
 | 		if (ccc == u8c->ccc) { | 
 | 			if (!u8c->p) | 
 | 				u8c->len--; | 
 | 			return (unsigned char)*u8c->s++; | 
 | 		} | 
 |  | 
 | 		/* Current combining class mismatch. */ | 
 | 	ccc_mismatch: | 
 | 		if (u8c->nccc == STOPPER) { | 
 | 			/* | 
 | 			 * Scan forward for the first canonical class | 
 | 			 * to be emitted.  Save the position from | 
 | 			 * which to restart. | 
 | 			 */ | 
 | 			assert(u8c->ccc == STOPPER); | 
 | 			u8c->ccc = MINCCC - 1; | 
 | 			u8c->nccc = ccc; | 
 | 			u8c->sp = u8c->p; | 
 | 			u8c->ss = u8c->s; | 
 | 			u8c->slen = u8c->len; | 
 | 			if (!u8c->p) | 
 | 				u8c->len -= utf8clen(u8c->s); | 
 | 			u8c->s += utf8clen(u8c->s); | 
 | 		} else if (ccc != STOPPER) { | 
 | 			/* Not a stopper, and not the ccc we're emitting. */ | 
 | 			if (!u8c->p) | 
 | 				u8c->len -= utf8clen(u8c->s); | 
 | 			u8c->s += utf8clen(u8c->s); | 
 | 		} else if (u8c->nccc != MAXCCC + 1) { | 
 | 			/* At a stopper, restart for next ccc. */ | 
 | 			u8c->ccc = u8c->nccc; | 
 | 			u8c->nccc = MAXCCC + 1; | 
 | 			u8c->s = u8c->ss; | 
 | 			u8c->p = u8c->sp; | 
 | 			u8c->len = u8c->slen; | 
 | 		} else { | 
 | 			/* All done, proceed from here. */ | 
 | 			u8c->ccc = STOPPER; | 
 | 			u8c->nccc = STOPPER; | 
 | 			u8c->sp = NULL; | 
 | 			u8c->ss = NULL; | 
 | 			u8c->slen = 0; | 
 | 		} | 
 | 	} | 
 | } | 
 |  | 
 | /* ------------------------------------------------------------------ */ | 
 |  | 
 | static int normalize_line(struct tree *tree) | 
 | { | 
 | 	char *s; | 
 | 	char *t; | 
 | 	int c; | 
 | 	struct utf8cursor u8c; | 
 |  | 
 | 	/* First test: null-terminated string. */ | 
 | 	s = buf2; | 
 | 	t = buf3; | 
 | 	if (utf8cursor(&u8c, tree, s)) | 
 | 		return -1; | 
 | 	while ((c = utf8byte(&u8c)) > 0) | 
 | 		if (c != (unsigned char)*t++) | 
 | 			return -1; | 
 | 	if (c < 0) | 
 | 		return -1; | 
 | 	if (*t != 0) | 
 | 		return -1; | 
 |  | 
 | 	/* Second test: length-limited string. */ | 
 | 	s = buf2; | 
 | 	/* Replace NUL with a value that will cause an error if seen. */ | 
 | 	s[strlen(s) + 1] = -1; | 
 | 	t = buf3; | 
 | 	if (utf8cursor(&u8c, tree, s)) | 
 | 		return -1; | 
 | 	while ((c = utf8byte(&u8c)) > 0) | 
 | 		if (c != (unsigned char)*t++) | 
 | 			return -1; | 
 | 	if (c < 0) | 
 | 		return -1; | 
 | 	if (*t != 0) | 
 | 		return -1; | 
 |  | 
 | 	return 0; | 
 | } | 
 |  | 
 | static void normalization_test(void) | 
 | { | 
 | 	FILE *file; | 
 | 	unsigned int unichar; | 
 | 	struct unicode_data *data; | 
 | 	char *s; | 
 | 	char *t; | 
 | 	int ret; | 
 | 	int ignorables; | 
 | 	int tests = 0; | 
 | 	int failures = 0; | 
 |  | 
 | 	if (verbose > 0) | 
 | 		printf("Parsing %s\n", test_name); | 
 | 	/* Step one, read data from file. */ | 
 | 	file = fopen(test_name, "r"); | 
 | 	if (!file) | 
 | 		open_fail(test_name, errno); | 
 |  | 
 | 	while (fgets(line, LINESIZE, file)) { | 
 | 		ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];", | 
 | 			     buf0, buf1); | 
 | 		if (ret != 2 || *line == '#') | 
 | 			continue; | 
 | 		s = buf0; | 
 | 		t = buf2; | 
 | 		while (*s) { | 
 | 			unichar = strtoul(s, &s, 16); | 
 | 			t += utf8encode(t, unichar); | 
 | 		} | 
 | 		*t = '\0'; | 
 |  | 
 | 		ignorables = 0; | 
 | 		s = buf1; | 
 | 		t = buf3; | 
 | 		while (*s) { | 
 | 			unichar = strtoul(s, &s, 16); | 
 | 			data = &unicode_data[unichar]; | 
 | 			if (data->utf8nfdi && !*data->utf8nfdi) | 
 | 				ignorables = 1; | 
 | 			else | 
 | 				t += utf8encode(t, unichar); | 
 | 		} | 
 | 		*t = '\0'; | 
 |  | 
 | 		tests++; | 
 | 		if (normalize_line(nfdi_tree) < 0) { | 
 | 			printf("Line %s -> %s", buf0, buf1); | 
 | 			if (ignorables) | 
 | 				printf(" (ignorables removed)"); | 
 | 			printf(" failure\n"); | 
 | 			failures++; | 
 | 		} | 
 | 	} | 
 | 	fclose(file); | 
 | 	if (verbose > 0) | 
 | 		printf("Ran %d tests with %d failures\n", tests, failures); | 
 | 	if (failures) | 
 | 		file_fail(test_name); | 
 | } | 
 |  | 
 | /* ------------------------------------------------------------------ */ | 
 |  | 
 | static void write_file(void) | 
 | { | 
 | 	FILE *file; | 
 | 	int i; | 
 | 	int j; | 
 | 	int t; | 
 | 	int gen; | 
 |  | 
 | 	if (verbose > 0) | 
 | 		printf("Writing %s\n", utf8_name); | 
 | 	file = fopen(utf8_name, "w"); | 
 | 	if (!file) | 
 | 		open_fail(utf8_name, errno); | 
 |  | 
 | 	fprintf(file, "/* This file is generated code, do not edit. */\n"); | 
 | 	fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n"); | 
 | 	fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n"); | 
 | 	fprintf(file, "#endif\n"); | 
 | 	fprintf(file, "\n"); | 
 | 	fprintf(file, "static const unsigned int utf8vers = %#x;\n", | 
 | 		unicode_maxage); | 
 | 	fprintf(file, "\n"); | 
 | 	fprintf(file, "static const unsigned int utf8agetab[] = {\n"); | 
 | 	for (i = 0; i != ages_count; i++) | 
 | 		fprintf(file, "\t%#x%s\n", ages[i], | 
 | 			ages[i] == unicode_maxage ? "" : ","); | 
 | 	fprintf(file, "};\n"); | 
 | 	fprintf(file, "\n"); | 
 | 	fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n"); | 
 | 	t = 0; | 
 | 	for (gen = 0; gen < ages_count; gen++) { | 
 | 		fprintf(file, "\t{ %#x, %d }%s\n", | 
 | 			ages[gen], trees[t].index, | 
 | 			ages[gen] == unicode_maxage ? "" : ","); | 
 | 		if (trees[t].maxage == ages[gen]) | 
 | 			t += 2; | 
 | 	} | 
 | 	fprintf(file, "};\n"); | 
 | 	fprintf(file, "\n"); | 
 | 	fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n"); | 
 | 	t = 1; | 
 | 	for (gen = 0; gen < ages_count; gen++) { | 
 | 		fprintf(file, "\t{ %#x, %d }%s\n", | 
 | 			ages[gen], trees[t].index, | 
 | 			ages[gen] == unicode_maxage ? "" : ","); | 
 | 		if (trees[t].maxage == ages[gen]) | 
 | 			t += 2; | 
 | 	} | 
 | 	fprintf(file, "};\n"); | 
 | 	fprintf(file, "\n"); | 
 | 	fprintf(file, "static const unsigned char utf8data[%zd] = {\n", | 
 | 		utf8data_size); | 
 | 	t = 0; | 
 | 	for (i = 0; i != utf8data_size; i += 16) { | 
 | 		if (i == trees[t].index) { | 
 | 			fprintf(file, "\t/* %s_%x */\n", | 
 | 				trees[t].type, trees[t].maxage); | 
 | 			if (t < trees_count-1) | 
 | 				t++; | 
 | 		} | 
 | 		fprintf(file, "\t"); | 
 | 		for (j = i; j != i + 16; j++) | 
 | 			fprintf(file, "0x%.2x%s", utf8data[j], | 
 | 				(j < utf8data_size -1 ? "," : "")); | 
 | 		fprintf(file, "\n"); | 
 | 	} | 
 | 	fprintf(file, "};\n"); | 
 | 	fclose(file); | 
 | } | 
 |  | 
 | /* ------------------------------------------------------------------ */ | 
 |  | 
 | int main(int argc, char *argv[]) | 
 | { | 
 | 	unsigned int unichar; | 
 | 	int opt; | 
 |  | 
 | 	argv0 = argv[0]; | 
 |  | 
 | 	while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) { | 
 | 		switch (opt) { | 
 | 		case 'a': | 
 | 			age_name = optarg; | 
 | 			break; | 
 | 		case 'c': | 
 | 			ccc_name = optarg; | 
 | 			break; | 
 | 		case 'd': | 
 | 			data_name = optarg; | 
 | 			break; | 
 | 		case 'f': | 
 | 			fold_name = optarg; | 
 | 			break; | 
 | 		case 'n': | 
 | 			norm_name = optarg; | 
 | 			break; | 
 | 		case 'o': | 
 | 			utf8_name = optarg; | 
 | 			break; | 
 | 		case 'p': | 
 | 			prop_name = optarg; | 
 | 			break; | 
 | 		case 't': | 
 | 			test_name = optarg; | 
 | 			break; | 
 | 		case 'v': | 
 | 			verbose++; | 
 | 			break; | 
 | 		case 'h': | 
 | 			help(); | 
 | 			exit(0); | 
 | 		default: | 
 | 			usage(); | 
 | 		} | 
 | 	} | 
 |  | 
 | 	if (verbose > 1) | 
 | 		help(); | 
 | 	for (unichar = 0; unichar != 0x110000; unichar++) | 
 | 		unicode_data[unichar].code = unichar; | 
 | 	age_init(); | 
 | 	ccc_init(); | 
 | 	nfdi_init(); | 
 | 	nfdicf_init(); | 
 | 	ignore_init(); | 
 | 	corrections_init(); | 
 | 	hangul_decompose(); | 
 | 	nfdi_decompose(); | 
 | 	nfdicf_decompose(); | 
 | 	utf8_init(); | 
 | 	trees_init(); | 
 | 	trees_populate(); | 
 | 	trees_reduce(); | 
 | 	trees_verify(); | 
 | 	/* Prevent "unused function" warning. */ | 
 | 	(void)lookup(nfdi_tree, " "); | 
 | 	if (verbose > 2) | 
 | 		tree_walk(nfdi_tree); | 
 | 	if (verbose > 2) | 
 | 		tree_walk(nfdicf_tree); | 
 | 	normalization_test(); | 
 | 	write_file(); | 
 |  | 
 | 	return 0; | 
 | } |