Blame - ap/libc/glibc/glibc-2.23/locale/programs/charmap.c - T106_DC

blob: d7580e05c486696c480e7484cc9b9791025ebf41 [file] [log] [blame]

xf.li	bdd93d5	2023-05-12 07:10:14 -0700	[diff] [blame]	1	/* Copyright (C) 1996-2016 Free Software Foundation, Inc.
				2	This file is part of the GNU C Library.
				3	Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
				4
				5	This program is free software; you can redistribute it and/or modify
				6	it under the terms of the GNU General Public License as published
				7	by the Free Software Foundation; version 2 of the License, or
				8	(at your option) any later version.
				9
				10	This program is distributed in the hope that it will be useful,
				11	but WITHOUT ANY WARRANTY; without even the implied warranty of
				12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				13	GNU General Public License for more details.
				14
				15	You should have received a copy of the GNU General Public License
				16	along with this program; if not, see <http://www.gnu.org/licenses/>. */
				17
				18	#ifdef HAVE_CONFIG_H
				19	# include <config.h>
				20	#endif
				21
				22	#include <ctype.h>
				23	#include <errno.h>
				24	#include <libintl.h>
				25	#include <limits.h>
				26	#include <stdio.h>
				27	#include <stdlib.h>
				28	#include <string.h>
				29	#include <error.h>
				30	#include <stdint.h>
				31
				32	#include "localedef.h"
				33	#include "linereader.h"
				34	#include "charmap.h"
				35	#include "charmap-dir.h"
				36
				37	#include <assert.h>
				38
				39
				40	/* Define the lookup function. */
				41	#include "charmap-kw.h"
				42
				43
				44	/* Prototypes for local functions. */
				45	static struct charmap_t parse_charmap (struct linereader cmfile,
				46	int verbose, int be_quiet);
				47	static void new_width (struct linereader cmfile, struct charmap_t result,
				48	const char from, const char to,
				49	unsigned long int width);
				50	static void charmap_new_char (struct linereader lr, struct charmap_t cm,
				51	size_t nbytes, unsigned char *bytes,
				52	const char from, const char to,
				53	int decimal_ellipsis, int step);
				54
				55
				56	bool enc_not_ascii_compatible;
				57
				58
				59	#ifdef NEED_NULL_POINTER
				60	static const char *null_pointer;
				61	#endif
				62
				63	static struct linereader *
				64	cmlr_open (const char directory, const char name, kw_hash_fct_t hf)
				65	{
				66	FILE *fp;
				67
				68	fp = charmap_open (directory, name);
				69	if (fp == NULL)
				70	return NULL;
				71	else
				72	{
				73	size_t dlen = strlen (directory);
				74	int add_slash = (dlen == 0 \|\| directory[dlen - 1] != '/');
				75	size_t nlen = strlen (name);
				76	char *pathname;
				77	char *p;
				78
				79	pathname = alloca (dlen + add_slash + nlen + 1);
				80	p = stpcpy (pathname, directory);
				81	if (add_slash)
				82	*p++ = '/';
				83	stpcpy (p, name);
				84
				85	return lr_create (fp, pathname, hf);
				86	}
				87	}
				88
				89	struct charmap_t *
				90	charmap_read (const char *filename, int verbose, int error_not_found,
				91	int be_quiet, int use_default)
				92	{
				93	struct charmap_t *result = NULL;
				94
				95	if (filename != NULL)
				96	{
				97	struct linereader *cmfile;
				98
				99	/* First try the name as found in the parameter. */
				100	cmfile = lr_open (filename, charmap_hash);
				101	if (cmfile == NULL)
				102	{
				103	/* No successful. So start looking through the directories
				104	in the I18NPATH if this is a simple name. */
				105	if (strchr (filename, '/') == NULL)
				106	{
				107	char *i18npath = getenv ("I18NPATH");
				108	if (i18npath != NULL && *i18npath != '\0')
				109	{
				110	const size_t pathlen = strlen (i18npath);
				111	char i18npathbuf[pathlen + 1];
				112	char path[pathlen + sizeof ("/charmaps")];
				113	char *next;
				114	i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
				115
				116	while (cmfile == NULL
				117	&& (next = strsep (&i18npath, ":")) != NULL)
				118	{
				119	stpcpy (stpcpy (path, next), "/charmaps");
				120	cmfile = cmlr_open (path, filename, charmap_hash);
				121
				122	if (cmfile == NULL)
				123	/* Try without the "/charmaps" part. */
				124	cmfile = cmlr_open (next, filename, charmap_hash);
				125	}
				126	}
				127
				128	if (cmfile == NULL)
				129	/* Try the default directory. */
				130	cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
				131	}
				132	}
				133
				134	if (cmfile != NULL)
				135	result = parse_charmap (cmfile, verbose, be_quiet);
				136
				137	if (result == NULL && error_not_found)
				138	WITH_CUR_LOCALE (error (0, errno, _("\
				139	character map file `%s' not found"), filename));
				140	}
				141
				142	if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
				143	{
				144	/* OK, one more try. We also accept the names given to the
				145	character sets in the files. Sometimes they differ from the
				146	file name. */
				147	CHARMAP_DIR *dir;
				148
				149	dir = charmap_opendir (CHARMAP_PATH);
				150	if (dir != NULL)
				151	{
				152	const char *dirent;
				153
				154	while ((dirent = charmap_readdir (dir)) != NULL)
				155	{
				156	char **aliases;
				157	char **p;
				158	int found;
				159
				160	aliases = charmap_aliases (CHARMAP_PATH, dirent);
				161	found = 0;
				162	for (p = aliases; *p; p++)
				163	if (strcasecmp (*p, filename) == 0)
				164	{
				165	found = 1;
				166	break;
				167	}
				168	charmap_free_aliases (aliases);
				169
				170	if (found)
				171	{
				172	struct linereader *cmfile;
				173
				174	cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
				175	if (cmfile != NULL)
				176	result = parse_charmap (cmfile, verbose, be_quiet);
				177
				178	break;
				179	}
				180	}
				181
				182	charmap_closedir (dir);
				183	}
				184	}
				185
				186	if (result == NULL && DEFAULT_CHARMAP != NULL)
				187	{
				188	struct linereader *cmfile;
				189
				190	cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
				191	if (cmfile != NULL)
				192	result = parse_charmap (cmfile, verbose, be_quiet);
				193
				194	if (result == NULL)
				195	WITH_CUR_LOCALE (error (4, errno, _("\
				196	default character map file `%s' not found"), DEFAULT_CHARMAP));
				197	}
				198
				199	if (result != NULL && result->code_set_name == NULL)
				200	/* The input file does not specify a code set name. This
				201	shouldn't happen but we should cope with it. */
				202	result->code_set_name = basename (filename);
				203
				204	/* Test of ASCII compatibility of locale encoding.
				205
				206	Verify that the encoding to be used in a locale is ASCII compatible,
				207	at least for the graphic characters, excluding the control characters,
				208	'$' and '@'. This constraint comes from an ISO C 99 restriction.
				209
				210	ISO C 99 section 7.17.(2) (about wchar_t):
				211	the null character shall have the code value zero and each member of
				212	the basic character set shall have a code value equal to its value
				213	when used as the lone character in an integer character constant.
				214	ISO C 99 section 5.2.1.(3):
				215	Both the basic source and basic execution character sets shall have
				216	the following members: the 26 uppercase letters of the Latin alphabet
				217	A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
				218	the 26 lowercase letters of the Latin alphabet
				219	a b c d e f g h i j k l m n o p q r s t u v w x y z
				220	the 10 decimal digits
				221	0 1 2 3 4 5 6 7 8 9
				222	the following 29 graphic characters
				223	! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { \| } ~
				224	the space character, and control characters representing horizontal
				225	tab, vertical tab, and form feed.
				226
				227	Therefore, for all members of the "basic character set", the 'char' code
				228	must have the same value as the 'wchar_t' code, which in glibc is the
				229	same as the Unicode code, which for all of the enumerated characters
				230	is identical to the ASCII code. */
				231	if (result != NULL && use_default)
				232	{
				233	static const char basic_charset[] =
				234	{
				235	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
				236	'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
				237	'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
				238	'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
				239	'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
				240	'!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
				241	'.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
				242	'_', '{', '\|', '}', '~', ' ', '\t', '\v', '\f', '\0'
				243	};
				244	int failed = 0;
				245	const char *p = basic_charset;
				246
				247	do
				248	{
				249	struct charseq *seq = charmap_find_symbol (result, p, 1);
				250
				251	if (seq == NULL \|\| seq->ucs4 != (uint32_t) *p)
				252	failed = 1;
				253	}
				254	while (*p++ != '\0');
				255
				256	if (failed)
				257	{
				258	WITH_CUR_LOCALE (fprintf (stderr, _("\
				259	character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
				260	result->code_set_name));
				261	enc_not_ascii_compatible = true;
				262	}
				263	}
				264
				265	return result;
				266	}
				267
				268
				269	static struct charmap_t *
				270	parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
				271	{
				272	struct charmap_t *result;
				273	int state;
				274	enum token_t expected_tok = tok_error;
				275	const char *expected_str = NULL;
				276	char *from_name = NULL;
				277	char *to_name = NULL;
				278	enum token_t ellipsis = 0;
				279	int step = 1;
				280
				281	/* We don't want symbolic names in string to be translated. */
				282	cmfile->translate_strings = 0;
				283
				284	/* Allocate room for result. */
				285	result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
				286	memset (result, '\0', sizeof (struct charmap_t));
				287	/* The default DEFAULT_WIDTH is 1. */
				288	result->width_default = 1;
				289
				290	#define obstack_chunk_alloc malloc
				291	#define obstack_chunk_free free
				292	obstack_init (&result->mem_pool);
				293
				294	if (init_hash (&result->char_table, 256)
				295	\|\| init_hash (&result->byte_table, 256))
				296	{
				297	free (result);
				298	return NULL;
				299	}
				300
				301	/* We use a state machine to describe the charmap description file
				302	format. */
				303	state = 1;
				304	while (1)
				305	{
				306	/* What's on? */
				307	struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
				308	enum token_t nowtok = now->tok;
				309	struct token *arg;
				310
				311	if (nowtok == tok_eof)
				312	break;
				313
				314	switch (state)
				315	{
				316	case 1:
				317	/* The beginning. We expect the special declarations, EOL or
				318	`CHARMAP'. */
				319	if (nowtok == tok_eol)
				320	/* Ignore empty lines. */
				321	continue;
				322
				323	if (nowtok == tok_charmap)
				324	{
				325	from_name = NULL;
				326	to_name = NULL;
				327
				328	/* We have to set up the real work. Fill in some
				329	default values. */
				330	if (result->mb_cur_max == 0)
				331	result->mb_cur_max = 1;
				332	if (result->mb_cur_min == 0)
				333	result->mb_cur_min = result->mb_cur_max;
				334	if (result->mb_cur_min > result->mb_cur_max)
				335	{
				336	if (!be_quiet)
				337	WITH_CUR_LOCALE (error (0, 0, _("\
				338	%s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
				339	cmfile->fname));
				340
				341	result->mb_cur_min = result->mb_cur_max;
				342	}
				343
				344	lr_ignore_rest (cmfile, 1);
				345
				346	state = 2;
				347	continue;
				348	}
				349
				350	if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
				351	&& nowtok != tok_mb_cur_min && nowtok != tok_escape_char
				352	&& nowtok != tok_comment_char && nowtok != tok_g0esc
				353	&& nowtok != tok_g1esc && nowtok != tok_g2esc
				354	&& nowtok != tok_g3esc && nowtok != tok_repertoiremap
				355	&& nowtok != tok_include)
				356	{
				357	lr_error (cmfile, _("syntax error in prolog: %s"),
				358	_("invalid definition"));
				359
				360	lr_ignore_rest (cmfile, 0);
				361	continue;
				362	}
				363
				364	/* We know that we need an argument. */
				365	arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
				366
				367	switch (nowtok)
				368	{
				369	case tok_code_set_name:
				370	case tok_repertoiremap:
				371	if (arg->tok != tok_ident && arg->tok != tok_string)
				372	{
				373	badarg:
				374	lr_error (cmfile, _("syntax error in prolog: %s"),
				375	_("bad argument"));
				376
				377	lr_ignore_rest (cmfile, 0);
				378	continue;
				379	}
				380
				381	if (nowtok == tok_code_set_name)
				382	result->code_set_name = obstack_copy0 (&result->mem_pool,
				383	arg->val.str.startmb,
				384	arg->val.str.lenmb);
				385	else
				386	result->repertoiremap = obstack_copy0 (&result->mem_pool,
				387	arg->val.str.startmb,
				388	arg->val.str.lenmb);
				389
				390	lr_ignore_rest (cmfile, 1);
				391	continue;
				392
				393	case tok_mb_cur_max:
				394	case tok_mb_cur_min:
				395	if (arg->tok != tok_number)
				396	goto badarg;
				397
				398	if (verbose
				399	&& ((nowtok == tok_mb_cur_max
				400	&& result->mb_cur_max != 0)
				401	\|\| (nowtok == tok_mb_cur_max
				402	&& result->mb_cur_max != 0)))
				403	lr_error (cmfile, _("duplicate definition of <%s>"),
				404	nowtok == tok_mb_cur_min
				405	? "mb_cur_min" : "mb_cur_max");
				406
				407	if (arg->val.num < 1)
				408	{
				409	lr_error (cmfile,
				410	_("value for <%s> must be 1 or greater"),
				411	nowtok == tok_mb_cur_min
				412	? "mb_cur_min" : "mb_cur_max");
				413
				414	lr_ignore_rest (cmfile, 0);
				415	continue;
				416	}
				417	if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
				418	&& (int) arg->val.num < result->mb_cur_min)
				419	\|\| (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
				420	&& (int) arg->val.num > result->mb_cur_max))
				421	{
				422	lr_error (cmfile, _("\
				423	value of <%s> must be greater or equal than the value of <%s>"),
				424	"mb_cur_max", "mb_cur_min");
				425
				426	lr_ignore_rest (cmfile, 0);
				427	continue;
				428	}
				429
				430	if (nowtok == tok_mb_cur_max)
				431	result->mb_cur_max = arg->val.num;
				432	else
				433	result->mb_cur_min = arg->val.num;
				434
				435	lr_ignore_rest (cmfile, 1);
				436	continue;
				437
				438	case tok_escape_char:
				439	case tok_comment_char:
				440	if (arg->tok != tok_ident)
				441	goto badarg;
				442
				443	if (arg->val.str.lenmb != 1)
				444	{
				445	lr_error (cmfile, _("\
				446	argument to <%s> must be a single character"),
				447	nowtok == tok_escape_char ? "escape_char"
				448	: "comment_char");
				449
				450	lr_ignore_rest (cmfile, 0);
				451	continue;
				452	}
				453
				454	if (nowtok == tok_escape_char)
				455	cmfile->escape_char = *arg->val.str.startmb;
				456	else
				457	cmfile->comment_char = *arg->val.str.startmb;
				458
				459	lr_ignore_rest (cmfile, 1);
				460	continue;
				461
				462	case tok_g0esc:
				463	case tok_g1esc:
				464	case tok_g2esc:
				465	case tok_g3esc:
				466	case tok_escseq:
				467	lr_ignore_rest (cmfile, 0); /* XXX */
				468	continue;
				469
				470	case tok_include:
				471	lr_error (cmfile, _("\
				472	character sets with locking states are not supported"));
				473	exit (4);
				474
				475	default:
				476	/* Cannot happen. */
				477	assert (! "Should not happen");
				478	}
				479	break;
				480
				481	case 2:
				482	/* We have seen `CHARMAP' and now are in the body. Each line
				483	must have the format "%s %s %s\n" or "%s...%s %s %s\n". */
				484	if (nowtok == tok_eol)
				485	/* Ignore empty lines. */
				486	continue;
				487
				488	if (nowtok == tok_end)
				489	{
				490	expected_tok = tok_charmap;
				491	expected_str = "CHARMAP";
				492	state = 90;
				493	continue;
				494	}
				495
				496	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
				497	{
				498	lr_error (cmfile, _("syntax error in %s definition: %s"),
				499	"CHARMAP", _("no symbolic name given"));
				500
				501	lr_ignore_rest (cmfile, 0);
				502	continue;
				503	}
				504
				505	/* If the previous line was not completely correct free the
				506	used memory. */
				507	if (from_name != NULL)
				508	obstack_free (&result->mem_pool, from_name);
				509
				510	if (nowtok == tok_bsymbol)
				511	from_name = (char *) obstack_copy0 (&result->mem_pool,
				512	now->val.str.startmb,
				513	now->val.str.lenmb);
				514	else
				515	{
				516	obstack_printf (&result->mem_pool, "U%08X",
				517	cmfile->token.val.ucs4);
				518	obstack_1grow (&result->mem_pool, '\0');
				519	from_name = (char *) obstack_finish (&result->mem_pool);
				520	}
				521	to_name = NULL;
				522
				523	state = 3;
				524	continue;
				525
				526	case 3:
				527	/* We have two possibilities: We can see an ellipsis or an
				528	encoding value. */
				529	if (nowtok == tok_ellipsis3 \|\| nowtok == tok_ellipsis4
				530	\|\| nowtok == tok_ellipsis2 \|\| nowtok == tok_ellipsis4_2
				531	\|\| nowtok == tok_ellipsis2_2)
				532	{
				533	ellipsis = nowtok;
				534	if (nowtok == tok_ellipsis4_2)
				535	{
				536	step = 2;
				537	nowtok = tok_ellipsis4;
				538	}
				539	else if (nowtok == tok_ellipsis2_2)
				540	{
				541	step = 2;
				542	nowtok = tok_ellipsis2;
				543	}
				544	state = 4;
				545	continue;
				546	}
				547	/* FALLTHROUGH */
				548
				549	case 5:
				550	if (nowtok != tok_charcode)
				551	{
				552	lr_error (cmfile, _("syntax error in %s definition: %s"),
				553	"CHARMAP", _("invalid encoding given"));
				554
				555	lr_ignore_rest (cmfile, 0);
				556
				557	state = 2;
				558	continue;
				559	}
				560
				561	if (now->val.charcode.nbytes < result->mb_cur_min)
				562	lr_error (cmfile, _("too few bytes in character encoding"));
				563	else if (now->val.charcode.nbytes > result->mb_cur_max)
				564	lr_error (cmfile, _("too many bytes in character encoding"));
				565	else
				566	charmap_new_char (cmfile, result, now->val.charcode.nbytes,
				567	now->val.charcode.bytes, from_name, to_name,
				568	ellipsis != tok_ellipsis2, step);
				569
				570	/* Ignore trailing comment silently. */
				571	lr_ignore_rest (cmfile, 0);
				572
				573	from_name = NULL;
				574	to_name = NULL;
				575	ellipsis = tok_none;
				576	step = 1;
				577
				578	state = 2;
				579	continue;
				580
				581	case 4:
				582	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
				583	{
				584	lr_error (cmfile, _("syntax error in %s definition: %s"),
				585	"CHARMAP",
				586	_("no symbolic name given for end of range"));
				587
				588	lr_ignore_rest (cmfile, 0);
				589	continue;
				590	}
				591
				592	/* Copy the to-name in a safe place. */
				593	if (nowtok == tok_bsymbol)
				594	to_name = (char *) obstack_copy0 (&result->mem_pool,
				595	cmfile->token.val.str.startmb,
				596	cmfile->token.val.str.lenmb);
				597	else
				598	{
				599	obstack_printf (&result->mem_pool, "U%08X",
				600	cmfile->token.val.ucs4);
				601	obstack_1grow (&result->mem_pool, '\0');
				602	to_name = (char *) obstack_finish (&result->mem_pool);
				603	}
				604
				605	state = 5;
				606	continue;
				607
				608	case 90:
				609	if (nowtok != expected_tok)
				610	lr_error (cmfile, _("\
				611	%1$s: definition does not end with `END %1$s'"), expected_str);
				612
				613	lr_ignore_rest (cmfile, nowtok == expected_tok);
				614	state = 91;
				615	continue;
				616
				617	case 91:
				618	/* Waiting for WIDTH... */
				619	if (nowtok == tok_eol)
				620	/* Ignore empty lines. */
				621	continue;
				622
				623	if (nowtok == tok_width_default)
				624	{
				625	state = 92;
				626	continue;
				627	}
				628
				629	if (nowtok == tok_width)
				630	{
				631	lr_ignore_rest (cmfile, 1);
				632	state = 93;
				633	continue;
				634	}
				635
				636	if (nowtok == tok_width_variable)
				637	{
				638	lr_ignore_rest (cmfile, 1);
				639	state = 98;
				640	continue;
				641	}
				642
				643	lr_error (cmfile, _("\
				644	only WIDTH definitions are allowed to follow the CHARMAP definition"));
				645
				646	lr_ignore_rest (cmfile, 0);
				647	continue;
				648
				649	case 92:
				650	if (nowtok != tok_number)
				651	lr_error (cmfile, _("value for %s must be an integer"),
				652	"WIDTH_DEFAULT");
				653	else
				654	result->width_default = now->val.num;
				655
				656	lr_ignore_rest (cmfile, nowtok == tok_number);
				657
				658	state = 91;
				659	continue;
				660
				661	case 93:
				662	/* We now expect `END WIDTH' or lines of the format "%s %d\n" or
				663	"%s...%s %d\n". */
				664	if (nowtok == tok_eol)
				665	/* ignore empty lines. */
				666	continue;
				667
				668	if (nowtok == tok_end)
				669	{
				670	expected_tok = tok_width;
				671	expected_str = "WIDTH";
				672	state = 90;
				673	continue;
				674	}
				675
				676	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
				677	{
				678	lr_error (cmfile, _("syntax error in %s definition: %s"),
				679	"WIDTH", _("no symbolic name given"));
				680
				681	lr_ignore_rest (cmfile, 0);
				682	continue;
				683	}
				684
				685	if (from_name != NULL)
				686	obstack_free (&result->mem_pool, from_name);
				687
				688	if (nowtok == tok_bsymbol)
				689	from_name = (char *) obstack_copy0 (&result->mem_pool,
				690	now->val.str.startmb,
				691	now->val.str.lenmb);
				692	else
				693	{
				694	obstack_printf (&result->mem_pool, "U%08X",
				695	cmfile->token.val.ucs4);
				696	obstack_1grow (&result->mem_pool, '\0');
				697	from_name = (char *) obstack_finish (&result->mem_pool);
				698	}
				699
				700	to_name = NULL;
				701
				702	state = 94;
				703	continue;
				704
				705	case 94:
				706	if (nowtok == tok_ellipsis3)
				707	{
				708	state = 95;
				709	continue;
				710	}
				711
				712	case 96:
				713	if (nowtok != tok_number)
				714	lr_error (cmfile, _("value for %s must be an integer"),
				715	"WIDTH");
				716	else
				717	{
				718	/* Store width for chars. */
				719	new_width (cmfile, result, from_name, to_name, now->val.num);
				720
				721	from_name = NULL;
				722	to_name = NULL;
				723	}
				724
				725	lr_ignore_rest (cmfile, nowtok == tok_number);
				726
				727	state = 93;
				728	continue;
				729
				730	case 95:
				731	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
				732	{
				733	lr_error (cmfile, _("syntax error in %s definition: %s"),
				734	"WIDTH", _("no symbolic name given for end of range"));
				735
				736	lr_ignore_rest (cmfile, 0);
				737
				738	state = 93;
				739	continue;
				740	}
				741
				742	if (nowtok == tok_bsymbol)
				743	to_name = (char *) obstack_copy0 (&result->mem_pool,
				744	now->val.str.startmb,
				745	now->val.str.lenmb);
				746	else
				747	{
				748	obstack_printf (&result->mem_pool, "U%08X",
				749	cmfile->token.val.ucs4);
				750	obstack_1grow (&result->mem_pool, '\0');
				751	to_name = (char *) obstack_finish (&result->mem_pool);
				752	}
				753
				754	state = 96;
				755	continue;
				756
				757	case 98:
				758	/* We now expect `END WIDTH_VARIABLE' or lines of the format
				759	"%s\n" or "%s...%s\n". */
				760	if (nowtok == tok_eol)
				761	/* ignore empty lines. */
				762	continue;
				763
				764	if (nowtok == tok_end)
				765	{
				766	expected_tok = tok_width_variable;
				767	expected_str = "WIDTH_VARIABLE";
				768	state = 90;
				769	continue;
				770	}
				771
				772	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
				773	{
				774	lr_error (cmfile, _("syntax error in %s definition: %s"),
				775	"WIDTH_VARIABLE", _("no symbolic name given"));
				776
				777	lr_ignore_rest (cmfile, 0);
				778
				779	continue;
				780	}
				781
				782	if (from_name != NULL)
				783	obstack_free (&result->mem_pool, from_name);
				784
				785	if (nowtok == tok_bsymbol)
				786	from_name = (char *) obstack_copy0 (&result->mem_pool,
				787	now->val.str.startmb,
				788	now->val.str.lenmb);
				789	else
				790	{
				791	obstack_printf (&result->mem_pool, "U%08X",
				792	cmfile->token.val.ucs4);
				793	obstack_1grow (&result->mem_pool, '\0');
				794	from_name = (char *) obstack_finish (&result->mem_pool);
				795	}
				796	to_name = NULL;
				797
				798	state = 99;
				799	continue;
				800
				801	case 99:
				802	if (nowtok == tok_ellipsis3)
				803	state = 100;
				804
				805	/* Store info. */
				806	from_name = NULL;
				807
				808	/* Warn */
				809	state = 98;
				810	continue;
				811
				812	case 100:
				813	if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
				814	{
				815	lr_error (cmfile, _("syntax error in %s definition: %s"),
				816	"WIDTH_VARIABLE",
				817	_("no symbolic name given for end of range"));
				818	lr_ignore_rest (cmfile, 0);
				819	continue;
				820	}
				821
				822	if (nowtok == tok_bsymbol)
				823	to_name = (char *) obstack_copy0 (&result->mem_pool,
				824	now->val.str.startmb,
				825	now->val.str.lenmb);
				826	else
				827	{
				828	obstack_printf (&result->mem_pool, "U%08X",
				829	cmfile->token.val.ucs4);
				830	obstack_1grow (&result->mem_pool, '\0');
				831	to_name = (char *) obstack_finish (&result->mem_pool);
				832	}
				833
				834	/* XXX Enter value into table. */
				835
				836	lr_ignore_rest (cmfile, 1);
				837
				838	state = 98;
				839	continue;
				840
				841	default:
				842	WITH_CUR_LOCALE (error (5, 0, _("%s: error in state machine"),
				843	__FILE__));
				844	/* NOTREACHED */
				845	}
				846	break;
				847	}
				848
				849	if (state != 91 && !be_quiet)
				850	WITH_CUR_LOCALE (error (0, 0, _("%s: premature end of file"),
				851	cmfile->fname));
				852
				853	lr_close (cmfile);
				854
				855	return result;
				856	}
				857
				858
				859	static void
				860	new_width (struct linereader cmfile, struct charmap_t result,
				861	const char from, const char to, unsigned long int width)
				862	{
				863	struct charseq *from_val;
				864	struct charseq *to_val;
				865
				866	from_val = charmap_find_value (result, from, strlen (from));
				867	if (from_val == NULL)
				868	{
				869	lr_error (cmfile, _("unknown character `%s'"), from);
				870	return;
				871	}
				872
				873	if (to == NULL)
				874	to_val = from_val;
				875	else
				876	{
				877	to_val = charmap_find_value (result, to, strlen (to));
				878	if (to_val == NULL)
				879	{
				880	lr_error (cmfile, _("unknown character `%s'"), to);
				881	return;
				882	}
				883
				884	/* Make sure the number of bytes for the end points of the range
				885	is correct. */
				886	if (from_val->nbytes != to_val->nbytes)
				887	{
				888	lr_error (cmfile, _("\
				889	number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
				890	from_val->nbytes, to_val->nbytes);
				891	return;
				892	}
				893	}
				894
				895	if (result->nwidth_rules >= result->nwidth_rules_max)
				896	{
				897	size_t new_size = result->nwidth_rules + 32;
				898	struct width_rule *new_rules =
				899	(struct width_rule *) obstack_alloc (&result->mem_pool,
				900	(new_size
				901	* sizeof (struct width_rule)));
				902
				903	memcpy (new_rules, result->width_rules,
				904	result->nwidth_rules_max * sizeof (struct width_rule));
				905
				906	result->width_rules = new_rules;
				907	result->nwidth_rules_max = new_size;
				908	}
				909
				910	result->width_rules[result->nwidth_rules].from = from_val;
				911	result->width_rules[result->nwidth_rules].to = to_val;
				912	result->width_rules[result->nwidth_rules].width = (unsigned int) width;
				913	++result->nwidth_rules;
				914	}
				915
				916
				917	struct charseq *
				918	charmap_find_value (const struct charmap_t cm, const char name, size_t len)
				919	{
				920	void *result;
				921
				922	return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
				923	< 0 ? NULL : (struct charseq *) result);
				924	}
				925
				926
				927	static void
				928	charmap_new_char (struct linereader lr, struct charmap_t cm,
				929	size_t nbytes, unsigned char *bytes,
				930	const char from, const char to,
				931	int decimal_ellipsis, int step)
				932	{
				933	hash_table *ht = &cm->char_table;
				934	hash_table *bt = &cm->byte_table;
				935	struct obstack *ob = &cm->mem_pool;
				936	char *from_end;
				937	char *to_end;
				938	const char *cp;
				939	int prefix_len, len1, len2;
				940	unsigned int from_nr, to_nr, cnt;
				941	struct charseq *newp;
				942
				943	len1 = strlen (from);
				944
				945	if (to == NULL)
				946	{
				947	newp = (struct charseq ) obstack_alloc (ob, sizeof (newp) + nbytes);
				948	newp->nbytes = nbytes;
				949	memcpy (newp->bytes, bytes, nbytes);
				950	newp->name = from;
				951
				952	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
				953	if ((from[0] == 'U' \|\| from[0] == 'P') && (len1 == 5 \|\| len1 == 9))
				954	{
				955	/* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
				956	xxxx and xxxxxxxx are hexadecimal numbers. In this case
				957	we use the value of xxxx or xxxxxxxx as the UCS4 value of
				958	this character and we don't have to consult the repertoire
				959	map.
				960
				961	If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
				962	and xxxxxxxx also give the code point in UCS4 but this must
				963	be in the private, i.e., unassigned, area. This should be
				964	used for characters which do not (yet) have an equivalent
				965	in ISO 10646 and Unicode. */
				966	char *endp;
				967
				968	errno = 0;
				969	newp->ucs4 = strtoul (from + 1, &endp, 16);
				970	if (endp - from != len1
				971	\|\| (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
				972	\|\| newp->ucs4 >= 0x80000000)
				973	/* This wasn't successful. Signal this name cannot be a
				974	correct UCS value. */
				975	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
				976	}
				977
				978	insert_entry (ht, from, len1, newp);
				979	insert_entry (bt, newp->bytes, nbytes, newp);
				980	/* Please note that it isn't a bug if a symbol is defined more
				981	than once. All later definitions are simply discarded. */
				982	return;
				983	}
				984
				985	/* We have a range: the names must have names with equal prefixes
				986	and an equal number of digits, where the second number is greater
				987	or equal than the first. */
				988	len2 = strlen (to);
				989
				990	if (len1 != len2)
				991	{
				992	illegal_range:
				993	lr_error (lr, _("invalid names for character range"));
				994	return;
				995	}
				996
				997	cp = &from[len1 - 1];
				998	if (decimal_ellipsis)
				999	while (isdigit (*cp) && cp >= from)
				1000	--cp;
				1001	else
				1002	while (isxdigit (*cp) && cp >= from)
				1003	{
				1004	if (!isdigit (cp) && !isupper (cp))
				1005	lr_error (lr, _("\
				1006	hexadecimal range format should use only capital characters"));
				1007	--cp;
				1008	}
				1009
				1010	prefix_len = (cp - from) + 1;
				1011
				1012	if (cp == &from[len1 - 1] \|\| strncmp (from, to, prefix_len) != 0)
				1013	goto illegal_range;
				1014
				1015	errno = 0;
				1016	from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
				1017	if (*from_end != '\0' \|\| (from_nr == UINT_MAX && errno == ERANGE)
				1018	\|\| ((to_nr = strtoul (&to[prefix_len], &to_end,
				1019	decimal_ellipsis ? 10 : 16)) == UINT_MAX
				1020	&& errno == ERANGE)
				1021	\|\| *to_end != '\0')
				1022	{
				1023	lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
				1024	return;
				1025	}
				1026
				1027	if (from_nr > to_nr)
				1028	{
				1029	lr_error (lr, _("upper limit in range is smaller than lower limit"));
				1030	return;
				1031	}
				1032
				1033	for (cnt = from_nr; cnt <= to_nr; cnt += step)
				1034	{
				1035	char *name_end;
				1036	obstack_printf (ob, decimal_ellipsis ? "%.s%0d" : "%.s%0X",
				1037	prefix_len, from, len1 - prefix_len, cnt);
				1038	obstack_1grow (ob, '\0');
				1039	name_end = obstack_finish (ob);
				1040
				1041	newp = (struct charseq ) obstack_alloc (ob, sizeof (newp) + nbytes);
				1042	newp->nbytes = nbytes;
				1043	memcpy (newp->bytes, bytes, nbytes);
				1044	newp->name = name_end;
				1045
				1046	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
				1047	if ((name_end[0] == 'U' \|\| name_end[0] == 'P')
				1048	&& (len1 == 5 \|\| len1 == 9))
				1049	{
				1050	/* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
				1051	xxxx and xxxxxxxx are hexadecimal numbers. In this case
				1052	we use the value of xxxx or xxxxxxxx as the UCS4 value of
				1053	this character and we don't have to consult the repertoire
				1054	map.
				1055
				1056	If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
				1057	and xxxxxxxx also give the code point in UCS4 but this must
				1058	be in the private, i.e., unassigned, area. This should be
				1059	used for characters which do not (yet) have an equivalent
				1060	in ISO 10646 and Unicode. */
				1061	char *endp;
				1062
				1063	errno = 0;
				1064	newp->ucs4 = strtoul (name_end + 1, &endp, 16);
				1065	if (endp - name_end != len1
				1066	\|\| (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
				1067	\|\| newp->ucs4 >= 0x80000000)
				1068	/* This wasn't successful. Signal this name cannot be a
				1069	correct UCS value. */
				1070	newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
				1071	}
				1072
				1073	insert_entry (ht, name_end, len1, newp);
				1074	insert_entry (bt, newp->bytes, nbytes, newp);
				1075	/* Please note we don't examine the return value since it is no error
				1076	if we have two definitions for a symbol. */
				1077
				1078	/* Increment the value in the byte sequence. */
				1079	if (++bytes[nbytes - 1] == '\0')
				1080	{
				1081	int b = nbytes - 2;
				1082
				1083	do
				1084	if (b < 0)
				1085	{
				1086	lr_error (lr,
				1087	_("resulting bytes for range not representable."));
				1088	return;
				1089	}
				1090	while (++bytes[b--] == 0);
				1091	}
				1092	}
				1093	}
				1094
				1095
				1096	struct charseq *
				1097	charmap_find_symbol (const struct charmap_t cm, const char bytes,
				1098	size_t nbytes)
				1099	{
				1100	void *result;
				1101
				1102	return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
				1103	< 0 ? NULL : (struct charseq *) result);
				1104	}