Blame - ap/libc/glibc/glibc-2.23/localedata/unicode-gen/unicode_utils.py - 7520V3V

blob: 8cc5f2ba2ae1593866d721cdc1db700418be8c88 [file] [log] [blame]

xf.li	bfc6e71	2025-02-07 01:54:34 -0800	[diff] [blame^]	1	# Utilities to generate Unicode data for glibc from upstream Unicode data.
				2	#
				3	# Copyright (C) 2014-2016 Free Software Foundation, Inc.
				4	# This file is part of the GNU C Library.
				5	#
				6	# The GNU C Library is free software; you can redistribute it and/or
				7	# modify it under the terms of the GNU Lesser General Public
				8	# License as published by the Free Software Foundation; either
				9	# version 2.1 of the License, or (at your option) any later version.
				10	#
				11	# The GNU C Library is distributed in the hope that it will be useful,
				12	# but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				14	# Lesser General Public License for more details.
				15	#
				16	# You should have received a copy of the GNU Lesser General Public
				17	# License along with the GNU C Library; if not, see
				18	# <http://www.gnu.org/licenses/>.
				19
				20	'''
				21	This module contains utilities used by the scripts to generate
				22	Unicode data for glibc from upstream Unicode data files.
				23	'''
				24
				25	import sys
				26	import re
				27
# Dictionary holding the entire contents of the UnicodeData.txt file,
# keyed by integer code point.  Each value is a dictionary of the
# attributes of that code point; the case mappings ('upper', 'lower',
# 'title') are integers or None, everything else is kept as a string.
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#  …
# }
UNICODE_ATTRIBUTES = {}

# Dictionary holding the entire contents of the DerivedCoreProperties.txt
# file, keyed by integer code point.  Each value is the list of property
# names recorded for that code point.
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}

# Dictionary holding the entire contents of the EastAsianWidths.txt file,
# keyed by integer code point.  Each value is the width property string
# recorded for that code point.
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}
				66
				67	def fill_attribute(code_point, fields):
				68	'''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.
				69
				70	One entry in the UNICODE_ATTRIBUTES dictionary represents one line
				71	in the UnicodeData.txt file.
				72
				73	'''
				74	UNICODE_ATTRIBUTES[code_point] = {
				75	'name': fields[1], # Character name
				76	'category': fields[2], # General category
				77	'combining': fields[3], # Canonical combining classes
				78	'bidi': fields[4], # Bidirectional category
				79	'decomposition': fields[5], # Character decomposition mapping
				80	'decdigit': fields[6], # Decimal digit value
				81	'digit': fields[7], # Digit value
				82	'numeric': fields[8], # Numeric value
				83	'mirrored': fields[9], # mirrored
				84	'oldname': fields[10], # Old Unicode 1.0 name
				85	'comment': fields[11], # comment
				86	# Uppercase mapping
				87	'upper': int(fields[12], 16) if fields[12] else None,
				88	# Lowercase mapping
				89	'lower': int(fields[13], 16) if fields[13] else None,
				90	# Titlecase mapping
				91	'title': int(fields[14], 16) if fields[14] else None,
				92	}
				93
				94	def fill_attributes(filename):
				95	'''Stores the entire contents of the UnicodeData.txt file
				96	in the UNICODE_ATTRIBUTES dictionary.
				97
				98	A typical line for a single code point in UnicodeData.txt looks
				99	like this:
				100
				101	0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
				102
				103	Code point ranges are indicated by pairs of lines like this:
				104
				105	4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
				106	9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
				107	'''
				108	with open(filename, mode='r') as unicode_data_file:
				109	fields_start = []
				110	for line in unicode_data_file:
				111	fields = line.strip().split(';')
				112	if len(fields) != 15:
				113	sys.stderr.write(
				114	'short line in file "%(f)s": %(l)s\n' %{
				115	'f': filename, 'l': line})
				116	exit(1)
				117	if fields[2] == 'Cs':
				118	# Surrogates are UTF-16 artefacts,
				119	# not real characters. Ignore them.
				120	fields_start = []
				121	continue
				122	if fields[1].endswith(', First>'):
				123	fields_start = fields
				124	fields_start[1] = fields_start[1].split(',')[0][1:]
				125	continue
				126	if fields[1].endswith(', Last>'):
				127	fields[1] = fields[1].split(',')[0][1:]
				128	if fields[1:] != fields_start[1:]:
				129	sys.stderr.write(
				130	'broken code point range in file "%(f)s": %(l)s\n' %{
				131	'f': filename, 'l': line})
				132	exit(1)
				133	for code_point in range(
				134	int(fields_start[0], 16),
				135	int(fields[0], 16)+1):
				136	fill_attribute(code_point, fields)
				137	fields_start = []
				138	continue
				139	fill_attribute(int(fields[0], 16), fields)
				140	fields_start = []
				141
				142	def fill_derived_core_properties(filename):
				143	'''Stores the entire contents of the DerivedCoreProperties.txt file
				144	in the DERIVED_CORE_PROPERTIES dictionary.
				145
				146	Lines in DerivedCoreProperties.txt are either a code point range like
				147	this:
				148
				149	0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
				150
				151	or a single code point like this:
				152
				153	00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR
				154
				155	'''
				156	with open(filename, mode='r') as derived_core_properties_file:
				157	for line in derived_core_properties_file:
				158	match = re.match(
				159	r'^(?P<codepoint1>[0-9A-F]{4,6})'
				160	+ r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
				161	+ r'\s;\s(?P<property>[a-zA-Z_]+)',
				162	line)
				163	if not match:
				164	continue
				165	start = match.group('codepoint1')
				166	end = match.group('codepoint2')
				167	if not end:
				168	end = start
				169	for code_point in range(int(start, 16), int(end, 16)+1):
				170	prop = match.group('property')
				171	if code_point in DERIVED_CORE_PROPERTIES:
				172	DERIVED_CORE_PROPERTIES[code_point].append(prop)
				173	else:
				174	DERIVED_CORE_PROPERTIES[code_point] = [prop]
				175
				176	def fill_east_asian_widths(filename):
				177	'''Stores the entire contents of the EastAsianWidths.txt file
				178	in the EAST_ASIAN_WIDTHS dictionary.
				179
				180	Lines in EastAsianWidths.txt are either a code point range like
				181	this:
				182
				183	9FCD..9FFF;W # Cn [51] <reserved-9FCD>..<reserved-9FFF>
				184
				185	or a single code point like this:
				186
				187	A015;W # Lm YI SYLLABLE WU
				188	'''
				189	with open(filename, mode='r') as east_asian_widths_file:
				190	for line in east_asian_widths_file:
				191	match = re.match(
				192	r'^(?P<codepoint1>[0-9A-F]{4,6})'
				193	+r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
				194	+r'\s;\s(?P<property>[a-zA-Z]+)',
				195	line)
				196	if not match:
				197	continue
				198	start = match.group('codepoint1')
				199	end = match.group('codepoint2')
				200	if not end:
				201	end = start
				202	for code_point in range(int(start, 16), int(end, 16)+1):
				203	EAST_ASIAN_WIDTHS[code_point] = match.group('property')
				204
				205	def to_upper(code_point):
				206	'''Returns the code point of the uppercase version
				207	of the given code point'''
				208	if (UNICODE_ATTRIBUTES[code_point]['name']
				209	and UNICODE_ATTRIBUTES[code_point]['upper']):
				210	return UNICODE_ATTRIBUTES[code_point]['upper']
				211	else:
				212	return code_point
				213
				214	def to_lower(code_point):
				215	'''Returns the code point of the lowercase version
				216	of the given code point'''
				217	if (UNICODE_ATTRIBUTES[code_point]['name']
				218	and UNICODE_ATTRIBUTES[code_point]['lower']):
				219	return UNICODE_ATTRIBUTES[code_point]['lower']
				220	else:
				221	return code_point
				222
				223	def to_upper_turkish(code_point):
				224	'''Returns the code point of the Turkish uppercase version
				225	of the given code point'''
				226	if code_point == 0x0069:
				227	return 0x0130
				228	return to_upper(code_point)
				229
				230	def to_lower_turkish(code_point):
				231	'''Returns the code point of the Turkish lowercase version
				232	of the given code point'''
				233	if code_point == 0x0049:
				234	return 0x0131
				235	return to_lower(code_point)
				236
				237	def to_title(code_point):
				238	'''Returns the code point of the titlecase version
				239	of the given code point'''
				240	if (UNICODE_ATTRIBUTES[code_point]['name']
				241	and UNICODE_ATTRIBUTES[code_point]['title']):
				242	return UNICODE_ATTRIBUTES[code_point]['title']
				243	else:
				244	return code_point
				245
				246	def is_upper(code_point):
				247	'''Checks whether the character with this code point is uppercase'''
				248	return (to_lower(code_point) != code_point
				249	or (code_point in DERIVED_CORE_PROPERTIES
				250	and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]))
				251
				252	def is_lower(code_point):
				253	'''Checks whether the character with this code point is lowercase'''
				254	# Some characters are defined as “Lowercase” in
				255	# DerivedCoreProperties.txt but do not have a mapping to upper
				256	# case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
				257	# one of these.
				258	return (to_upper(code_point) != code_point
				259	# <U00DF> is lowercase, but without simple to_upper mapping.
				260	or code_point == 0x00DF
				261	or (code_point in DERIVED_CORE_PROPERTIES
				262	and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]))
				263
				264	def is_alpha(code_point):
				265	'''Checks whether the character with this code point is alphabetic'''
				266	return ((code_point in DERIVED_CORE_PROPERTIES
				267	and
				268	'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
				269	or
				270	# Consider all the non-ASCII digits as alphabetic.
				271	# ISO C 99 forbids us to have them in category “digit”,
				272	# but we want iswalnum to return true on them.
				273	(UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
				274	and not (code_point >= 0x0030 and code_point <= 0x0039)))
				275
				276	def is_digit(code_point):
				277	'''Checks whether the character with this code point is a digit'''
				278	if False:
				279	return (UNICODE_ATTRIBUTES[code_point]['name']
				280	and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd')
				281	# Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
				282	# a zero. Must add <0> in front of them by hand.
				283	else:
				284	# SUSV2 gives us some freedom for the "digit" category, but ISO C 99
				285	# takes it away:
				286	# 7.25.2.1.5:
				287	# The iswdigit function tests for any wide character that
				288	# corresponds to a decimal-digit character (as defined in 5.2.1).
				289	# 5.2.1:
				290	# the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
				291	return (code_point >= 0x0030 and code_point <= 0x0039)
				292
				293	def is_outdigit(code_point):
				294	'''Checks whether the character with this code point is outdigit'''
				295	return (code_point >= 0x0030 and code_point <= 0x0039)
				296
				297	def is_blank(code_point):
				298	'''Checks whether the character with this code point is blank'''
				299	return (code_point == 0x0009 # '\t'
				300	# Category Zs without mention of '<noBreak>'
				301	or (UNICODE_ATTRIBUTES[code_point]['name']
				302	and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs'
				303	and '<noBreak>' not in
				304	UNICODE_ATTRIBUTES[code_point]['decomposition']))
				305
				306	def is_space(code_point):
				307	'''Checks whether the character with this code point is a space'''
				308	# Don’t make U+00A0 a space. Non-breaking space means that all programs
				309	# should treat it like a punctuation character, not like a space.
				310	return (code_point == 0x0020 # ' '
				311	or code_point == 0x000C # '\f'
				312	or code_point == 0x000A # '\n'
				313	or code_point == 0x000D # '\r'
				314	or code_point == 0x0009 # '\t'
				315	or code_point == 0x000B # '\v'
				316	# Categories Zl, Zp, and Zs without mention of "<noBreak>"
				317	or (UNICODE_ATTRIBUTES[code_point]['name']
				318	and
				319	(UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']
				320	or
				321	(UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs']
				322	and
				323	'<noBreak>' not in
				324	UNICODE_ATTRIBUTES[code_point]['decomposition']))))
				325
				326	def is_cntrl(code_point):
				327	'''Checks whether the character with this code point is
				328	a control character'''
				329	return (UNICODE_ATTRIBUTES[code_point]['name']
				330	and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>'
				331	or
				332	UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']))
				333
				334	def is_xdigit(code_point):
				335	'''Checks whether the character with this code point is
				336	a hexadecimal digit'''
				337	if False:
				338	return (is_digit(code_point)
				339	or (code_point >= 0x0041 and code_point <= 0x0046)
				340	or (code_point >= 0x0061 and code_point <= 0x0066))
				341	else:
				342	# SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
				343	# takes it away:
				344	# 7.25.2.1.12:
				345	# The iswxdigit function tests for any wide character that
				346	# corresponds to a hexadecimal-digit character (as defined
				347	# in 6.4.4.1).
				348	# 6.4.4.1:
				349	# hexadecimal-digit: one of
				350	# 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
				351	return ((code_point >= 0x0030 and code_point <= 0x0039)
				352	or (code_point >= 0x0041 and code_point <= 0x0046)
				353	or (code_point >= 0x0061 and code_point <= 0x0066))
				354
				355	def is_graph(code_point):
				356	'''Checks whether the character with this code point is
				357	a graphical character'''
				358	return (UNICODE_ATTRIBUTES[code_point]['name']
				359	and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
				360	and not is_space(code_point))
				361
				362	def is_print(code_point):
				363	'''Checks whether the character with this code point is printable'''
				364	return (UNICODE_ATTRIBUTES[code_point]['name']
				365	and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
				366	and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp'])
				367
				368	def is_punct(code_point):
				369	'''Checks whether the character with this code point is punctuation'''
				370	if False:
				371	return (UNICODE_ATTRIBUTES[code_point]['name']
				372	and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P'))
				373	else:
				374	# The traditional POSIX definition of punctuation is every graphic,
				375	# non-alphanumeric character.
				376	return (is_graph(code_point)
				377	and not is_alpha(code_point)
				378	and not is_digit(code_point))
				379
				380	def is_combining(code_point):
				381	'''Checks whether the character with this code point is
				382	a combining character'''
				383	# Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
				384	# file. In 3.0.1 it was identical to the union of the general categories
				385	# "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
				386	# PropList.txt file, so we take the latter definition.
				387	return (UNICODE_ATTRIBUTES[code_point]['name']
				388	and
				389	UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me'])
				390
				391	def is_combining_level3(code_point):
				392	'''Checks whether the character with this code point is
				393	a combining level3 character'''
				394	return (is_combining(code_point)
				395	and
				396	int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200))
				397
				398	def ucs_symbol(code_point):
				399	'''Return the UCS symbol string for a Unicode character.'''
				400	if code_point < 0x10000:
				401	return '<U{:04X}>'.format(code_point)
				402	else:
				403	return '<U{:08X}>'.format(code_point)
				404
				405	def ucs_symbol_range(code_point_low, code_point_high):
				406	'''Returns a string UCS symbol string for a code point range.
				407
				408	Example:
				409
				410	<U0041>..<U005A>
				411	'''
				412	return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high)
				413
				414	def verifications():
				415	'''Tests whether the is_* functions observe the known restrictions'''
				416	for code_point in sorted(UNICODE_ATTRIBUTES):
				417	# toupper restriction: "Only characters specified for the keywords
				418	# lower and upper shall be specified.
				419	if (to_upper(code_point) != code_point
				420	and not (is_lower(code_point) or is_upper(code_point))):
				421	sys.stderr.write(
				422	('%(sym)s is not upper\|lower '
				423	+ 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
				424	'sym': ucs_symbol(code_point),
				425	'c': code_point,
				426	'uc': to_upper(code_point)})
				427	# tolower restriction: "Only characters specified for the keywords
				428	# lower and upper shall be specified.
				429	if (to_lower(code_point) != code_point
				430	and not (is_lower(code_point) or is_upper(code_point))):
				431	sys.stderr.write(
				432	('%(sym)s is not upper\|lower '
				433	+ 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
				434	'sym': ucs_symbol(code_point),
				435	'c': code_point,
				436	'uc': to_lower(code_point)})
				437	# alpha restriction: "Characters classified as either upper or lower
				438	# shall automatically belong to this class.
				439	if ((is_lower(code_point) or is_upper(code_point))
				440	and not is_alpha(code_point)):
				441	sys.stderr.write('%(sym)s is upper\|lower but not alpha\n' %{
				442	'sym': ucs_symbol(code_point)})
				443	# alpha restriction: “No character specified for the keywords cntrl,
				444	# digit, punct or space shall be specified.”
				445	if (is_alpha(code_point) and is_cntrl(code_point)):
				446	sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
				447	'sym': ucs_symbol(code_point)})
				448	if (is_alpha(code_point) and is_digit(code_point)):
				449	sys.stderr.write('%(sym)s is alpha and digit\n' %{
				450	'sym': ucs_symbol(code_point)})
				451	if (is_alpha(code_point) and is_punct(code_point)):
				452	sys.stderr.write('%(sym)s is alpha and punct\n' %{
				453	'sym': ucs_symbol(code_point)})
				454	if (is_alpha(code_point) and is_space(code_point)):
				455	sys.stderr.write('%(sym)s is alpha and space\n' %{
				456	'sym': ucs_symbol(code_point)})
				457	# space restriction: “No character specified for the keywords upper,
				458	# lower, alpha, digit, graph or xdigit shall be specified.”
				459	# upper, lower, alpha already checked above.
				460	if (is_space(code_point) and is_digit(code_point)):
				461	sys.stderr.write('%(sym)s is space and digit\n' %{
				462	'sym': ucs_symbol(code_point)})
				463	if (is_space(code_point) and is_graph(code_point)):
				464	sys.stderr.write('%(sym)s is space and graph\n' %{
				465	'sym': ucs_symbol(code_point)})
				466	if (is_space(code_point) and is_xdigit(code_point)):
				467	sys.stderr.write('%(sym)s is space and xdigit\n' %{
				468	'sym': ucs_symbol(code_point)})
				469	# cntrl restriction: “No character specified for the keywords upper,
				470	# lower, alpha, digit, punct, graph, print or xdigit shall be
				471	# specified.” upper, lower, alpha already checked above.
				472	if (is_cntrl(code_point) and is_digit(code_point)):
				473	sys.stderr.write('%(sym)s is cntrl and digit\n' %{
				474	'sym': ucs_symbol(code_point)})
				475	if (is_cntrl(code_point) and is_punct(code_point)):
				476	sys.stderr.write('%(sym)s is cntrl and punct\n' %{
				477	'sym': ucs_symbol(code_point)})
				478	if (is_cntrl(code_point) and is_graph(code_point)):
				479	sys.stderr.write('%(sym)s is cntrl and graph\n' %{
				480	'sym': ucs_symbol(code_point)})
				481	if (is_cntrl(code_point) and is_print(code_point)):
				482	sys.stderr.write('%(sym)s is cntrl and print\n' %{
				483	'sym': ucs_symbol(code_point)})
				484	if (is_cntrl(code_point) and is_xdigit(code_point)):
				485	sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
				486	'sym': ucs_symbol(code_point)})
				487	# punct restriction: “No character specified for the keywords upper,
				488	# lower, alpha, digit, cntrl, xdigit or as the <space> character shall
				489	# be specified.” upper, lower, alpha, cntrl already checked above.
				490	if (is_punct(code_point) and is_digit(code_point)):
				491	sys.stderr.write('%(sym)s is punct and digit\n' %{
				492	'sym': ucs_symbol(code_point)})
				493	if (is_punct(code_point) and is_xdigit(code_point)):
				494	sys.stderr.write('%(sym)s is punct and xdigit\n' %{
				495	'sym': ucs_symbol(code_point)})
				496	if (is_punct(code_point) and code_point == 0x0020):
				497	sys.stderr.write('%(sym)s is punct\n' %{
				498	'sym': ucs_symbol(code_point)})
				499	# graph restriction: “No character specified for the keyword cntrl
				500	# shall be specified.” Already checked above.
				501
				502	# print restriction: “No character specified for the keyword cntrl
				503	# shall be specified.” Already checked above.
				504
				505	# graph - print relation: differ only in the <space> character.
				506	# How is this possible if there are more than one space character?!
				507	# I think susv2/xbd/locale.html should speak of “space characters”,
				508	# not “space character”.
				509	if (is_print(code_point)
				510	and not (is_graph(code_point) or is_space(code_point))):
				511	sys.stderr.write('%(sym)s is print but not graph\|<space>\n' %{
				512	'sym': unicode_utils.ucs_symbol(code_point)})
				513	if (not is_print(code_point)
				514	and (is_graph(code_point) or code_point == 0x0020)):
				515	sys.stderr.write('%(sym)s is graph\|<space> but not print\n' %{
				516	'sym': unicode_utils.ucs_symbol(code_point)})