# Utilities to generate Unicode data for glibc from upstream Unicode data.
#
# Copyright (C) 2014-2016 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
| 19 | |
| 20 | ''' |
| 21 | This module contains utilities used by the scripts to generate |
| 22 | Unicode data for glibc from upstream Unicode data files. |
| 23 | ''' |
| 24 | |
| 25 | import sys |
| 26 | import re |
| 27 | |
# UNICODE_ATTRIBUTES mirrors the UnicodeData.txt file.  It is keyed by
# the integer code point; every value is a dictionary of the
# attributes of that code point, as stored by fill_attribute().
#
# Example of the contents:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#  …
# }
UNICODE_ATTRIBUTES = {}

# DERIVED_CORE_PROPERTIES mirrors the DerivedCoreProperties.txt file.
# It is keyed by the integer code point; every value is the list of
# property names given for that code point.
#
# Example of the contents:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}

# EAST_ASIAN_WIDTHS mirrors the EastAsianWidths.txt file.  It is keyed
# by the integer code point; every value is the width property string.
#
# Example of the contents:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}
| 66 | |
def fill_attribute(code_point, fields):
    '''Records the fields of one UnicodeData.txt line for code_point.

    code_point is the key used in UNICODE_ATTRIBUTES; fields is the
    sequence of columns of the line.  Columns 1 to 11 are stored
    unchanged, columns 12 to 14 (the case mappings) are parsed as
    hexadecimal integers, with an empty column stored as None.
    '''
    # (key, column) pairs for the columns which are stored verbatim.
    verbatim_columns = (
        ('name', 1),            # Character name
        ('category', 2),        # General category
        ('combining', 3),       # Canonical combining classes
        ('bidi', 4),            # Bidirectional category
        ('decomposition', 5),   # Character decomposition mapping
        ('decdigit', 6),        # Decimal digit value
        ('digit', 7),           # Digit value
        ('numeric', 8),         # Numeric value
        ('mirrored', 9),        # mirrored
        ('oldname', 10),        # Old Unicode 1.0 name
        ('comment', 11),        # comment
    )
    # (key, column) pairs for the case mappings.
    mapping_columns = (
        ('upper', 12),          # Uppercase mapping
        ('lower', 13),          # Lowercase mapping
        ('title', 14),          # Titlecase mapping
    )
    entry = {}
    for key, column in verbatim_columns:
        entry[key] = fields[column]
    for key, column in mapping_columns:
        text = fields[column]
        entry[key] = int(text, 16) if text else None
    UNICODE_ATTRIBUTES[code_point] = entry
| 93 | |
def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    Lines of general category 'Cs' (surrogates) are skipped.  If a
    line does not consist of exactly 15 fields, or if a “Last” line
    does not match the preceding “First” line, a message is written
    to stderr and the program exits with status 1.
    '''
    with open(filename, mode='r') as unicode_data_file:
        # Fields of a pending “…, First>” line; empty when no range
        # is open.
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                        'f': filename, 'l': line})
                # sys.exit instead of the builtin exit(), which is
                # only available when the site module has been loaded.
                sys.exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                # Remember the start of the range and reduce the name
                # from '<Foo, First>' to 'Foo'.
                fields_start = fields
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                # Apart from the code point itself, both lines of a
                # range have to agree.
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    sys.exit(1)
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []
| 141 | |
def fill_derived_core_properties(filename):
    '''Reads DerivedCoreProperties.txt into DERIVED_CORE_PROPERTIES.

    Every line of the file which assigns a property is either a code
    point range like this:

    0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or a single code point like this:

    00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR

    The property name is appended to the list of every code point
    covered by the line.  Lines not matching this format are skipped.
    '''
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        + r'\s*;\s*(?P<property>[a-zA-Z_]+)')
    with open(filename, mode='r') as derived_core_properties_file:
        for line in derived_core_properties_file:
            match = line_pattern.match(line)
            if not match:
                continue
            first = int(match.group('codepoint1'), 16)
            last_text = match.group('codepoint2')
            # A single code point is a range with identical bounds.
            last = int(last_text, 16) if last_text else first
            prop = match.group('property')
            for code_point in range(first, last + 1):
                DERIVED_CORE_PROPERTIES.setdefault(
                    code_point, []).append(prop)
| 175 | |
def fill_east_asian_widths(filename):
    '''Reads EastAsianWidths.txt into EAST_ASIAN_WIDTHS.

    Every line of the file which assigns a width is either a code
    point range like this:

    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

    or a single code point like this:

    A015;W           # Lm         YI SYLLABLE WU

    The width is stored for every code point covered by the line;
    a later line overwrites the value of an earlier one.  Lines not
    matching this format are skipped.
    '''
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        +r'\s*;\s*(?P<property>[a-zA-Z]+)')
    with open(filename, mode='r') as east_asian_widths_file:
        for line in east_asian_widths_file:
            match = line_pattern.match(line)
            if not match:
                continue
            first = int(match.group('codepoint1'), 16)
            last_text = match.group('codepoint2')
            # A single code point is a range with identical bounds.
            last = int(last_text, 16) if last_text else first
            width = match.group('property')
            for code_point in range(first, last + 1):
                EAST_ASIAN_WIDTHS[code_point] = width
| 204 | |
def to_upper(code_point):
    '''Returns the uppercase mapping of the given code point.

    The code point itself is returned if its entry in
    UNICODE_ATTRIBUTES has no name or no uppercase mapping.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['upper']:
        return attributes['upper']
    return code_point
| 213 | |
def to_lower(code_point):
    '''Returns the lowercase mapping of the given code point.

    The code point itself is returned if its entry in
    UNICODE_ATTRIBUTES has no name or no lowercase mapping.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['lower']:
        return attributes['lower']
    return code_point
| 222 | |
def to_upper_turkish(code_point):
    '''Returns the Turkish uppercase mapping of the given code point.

    U+0069 maps to U+0130; every other code point is mapped
    by to_upper().
    '''
    return 0x0130 if code_point == 0x0069 else to_upper(code_point)
| 229 | |
def to_lower_turkish(code_point):
    '''Returns the Turkish lowercase mapping of the given code point.

    U+0049 maps to U+0131; every other code point is mapped
    by to_lower().
    '''
    return 0x0131 if code_point == 0x0049 else to_lower(code_point)
| 236 | |
def to_title(code_point):
    '''Returns the titlecase mapping of the given code point.

    The code point itself is returned if its entry in
    UNICODE_ATTRIBUTES has no name or no titlecase mapping.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['title']:
        return attributes['title']
    return code_point
| 245 | |
def is_upper(code_point):
    '''Tells whether the character with this code point is uppercase.

    That is the case if it has a lowercase mapping different from
    itself or if it is listed as “Uppercase” in
    DERIVED_CORE_PROPERTIES.
    '''
    if to_lower(code_point) != code_point:
        return True
    return 'Uppercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
| 251 | |
def is_lower(code_point):
    '''Tells whether the character with this code point is lowercase.'''
    # A character with an uppercase mapping different from itself is
    # lowercase.  <U00DF> is lowercase as well, but has no simple
    # to_upper mapping.
    if to_upper(code_point) != code_point or code_point == 0x00DF:
        return True
    # Some characters are defined as “Lowercase” in
    # DerivedCoreProperties.txt but do not have a mapping to upper
    # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
    # one of these.
    return 'Lowercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
| 263 | |
def is_alpha(code_point):
    '''Tells whether the character with this code point is alphabetic.'''
    if 'Alphabetic' in DERIVED_CORE_PROPERTIES.get(code_point, ()):
        return True
    # Consider all the non-ASCII digits as alphabetic.
    # ISO C 99 forbids us to have them in category “digit”,
    # but we want iswalnum to return true on them.
    return (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
            and not 0x0030 <= code_point <= 0x0039)
| 275 | |
def is_digit(code_point):
    '''Checks whether the character with this code point is a digit

    Only the ASCII digits U+0030..U+0039 are digits.
    '''
    # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.5:
    #    The iswdigit function tests for any wide character that
    #    corresponds to a decimal-digit character (as defined in 5.2.1).
    # 5.2.1:
    #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
    #
    # The alternative of accepting every named character of general
    # category 'Nd' is therefore not used.  Note on that alternative:
    # U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
    # a zero. Must add <0> in front of them by hand.
    return 0x0030 <= code_point <= 0x0039
| 292 | |
def is_outdigit(code_point):
    '''Tells whether the character with this code point is an outdigit,
    i.e. one of U+0030..U+0039.'''
    return 0x0030 <= code_point <= 0x0039
| 296 | |
def is_blank(code_point):
    '''Tells whether the character with this code point is blank.

    The result is truthy for '\\t' and for the named characters of
    category Zs whose decomposition does not mention '<noBreak>'.
    '''
    if code_point == 0x0009: # '\t'
        return True
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Category Zs without mention of '<noBreak>'
    return (attributes['name']
            and attributes['category'] == 'Zs'
            and '<noBreak>' not in attributes['decomposition'])
| 305 | |
def is_space(code_point):
    '''Tells whether the character with this code point is a space.

    The result is truthy for the six ASCII white space characters and
    for the named characters of the categories Zl, Zp, and Zs, except
    for those whose decomposition mentions '<noBreak>'.
    '''
    # Don’t make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    ascii_spaces = (
        0x0020, # ' '
        0x000C, # '\f'
        0x000A, # '\n'
        0x000D, # '\r'
        0x0009, # '\t'
        0x000B, # '\v'
    )
    if code_point in ascii_spaces:
        return True
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    if not name:
        # Unnamed entries are never spaces; hand back the falsy name.
        return name
    category = attributes['category']
    if category in ['Zl', 'Zp']:
        return True
    # Category Zs without mention of "<noBreak>"
    return (category in ['Zs']
            and '<noBreak>' not in attributes['decomposition'])
| 325 | |
def is_cntrl(code_point):
    '''Tells whether the character with this code point is
    a control character.

    The result is truthy for entries named '<control>' and for named
    entries of the categories Zl and Zp.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    if not name:
        # Unnamed entries are never control characters; hand back the
        # falsy name.
        return name
    return name == '<control>' or attributes['category'] in ['Zl', 'Zp']
| 333 | |
def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit

    Only the ASCII characters 0-9, A-F and a-f are hexadecimal digits.
    '''
    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.12:
    #    The iswxdigit function tests for any wide character that
    #    corresponds to a hexadecimal-digit character (as defined
    #    in 6.4.4.1).
    # 6.4.4.1:
    #    hexadecimal-digit: one of
    #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
    return (0x0030 <= code_point <= 0x0039
            or 0x0041 <= code_point <= 0x0046
            or 0x0061 <= code_point <= 0x0066)
| 354 | |
def is_graph(code_point):
    '''Tells whether the character with this code point is
    a graphical character.

    The result is truthy for named entries which are neither named
    '<control>' nor spaces according to is_space().
    '''
    name = UNICODE_ATTRIBUTES[code_point]['name']
    if not name:
        # Unnamed entries are never graphical; hand back the falsy name.
        return name
    if name == '<control>':
        return False
    return not is_space(code_point)
| 361 | |
def is_print(code_point):
    '''Tells whether the character with this code point is printable.

    The result is truthy for named entries which are neither named
    '<control>' nor of the categories Zl and Zp.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    if not name:
        # Unnamed entries are never printable; hand back the falsy name.
        return name
    if name == '<control>':
        return False
    return attributes['category'] not in ['Zl', 'Zp']
| 367 | |
def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation

    The result is truthy for characters which are graphical according
    to is_graph() but neither alphabetic according to is_alpha() nor
    digits according to is_digit().
    '''
    # The traditional POSIX definition of punctuation is every graphic,
    # non-alphanumeric character.  The alternative of accepting the
    # named characters whose general category starts with 'P' is
    # not used.
    return (is_graph(code_point)
            and not is_alpha(code_point)
            and not is_digit(code_point))
| 379 | |
def is_combining(code_point):
    '''Tells whether the character with this code point is
    a combining character.

    The result is truthy for named entries of the general categories
    "Mn", "Mc", and "Me".
    '''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    attributes = UNICODE_ATTRIBUTES[code_point]
    return (attributes['name']
            and attributes['category'] in ['Mn', 'Mc', 'Me'])
| 390 | |
def is_combining_level3(code_point):
    '''Tells whether the character with this code point is
    a combining level3 character.

    That is a combining character according to is_combining() whose
    combining class is at least 0 and less than 200.
    '''
    combining = is_combining(code_point)
    if not combining:
        # Hand back the falsy result of is_combining() unchanged.
        return combining
    return 0 <= int(UNICODE_ATTRIBUTES[code_point]['combining']) < 200
| 397 | |
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points below 0x10000 are written with 4 hexadecimal digits,
    all others with 8, for example <U0041> or <U0001F600>.
    '''
    template = '<U{:04X}>' if code_point < 0x10000 else '<U{:08X}>'
    return template.format(code_point)
| 404 | |
def ucs_symbol_range(code_point_low, code_point_high):
    '''Returns the UCS symbol string for a code point range.

    The UCS symbols of both ends are joined with '..'.  Example:

    <U0041>..<U005A>
    '''
    low_symbol = ucs_symbol(code_point_low)
    high_symbol = ucs_symbol(code_point_high)
    return '..'.join((low_symbol, high_symbol))
| 413 | |
def verifications():
    '''Tests whether the is_* functions observe the known restrictions

    All code points in UNICODE_ATTRIBUTES are checked in ascending
    order.  For every violated restriction one line naming the
    character is written to stderr.  Nothing is returned.
    '''
    for code_point in sorted(UNICODE_ATTRIBUTES):
        sym = ucs_symbol(code_point)
        # Evaluate every classification once per code point.
        upper_mapping = to_upper(code_point)
        lower_mapping = to_lower(code_point)
        cased = is_lower(code_point) or is_upper(code_point)
        alpha = is_alpha(code_point)
        cntrl = is_cntrl(code_point)
        digit = is_digit(code_point)
        punct = is_punct(code_point)
        space = is_space(code_point)
        graph = is_graph(code_point)
        printable = is_print(code_point)
        xdigit = is_xdigit(code_point)

        # toupper restriction: "Only characters specified for the keywords
        # lower and upper shall be specified.
        if upper_mapping != code_point and not cased:
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
                    'sym': sym,
                    'c': code_point,
                    'uc': upper_mapping})
        # tolower restriction: "Only characters specified for the keywords
        # lower and upper shall be specified.
        if lower_mapping != code_point and not cased:
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
                    'sym': sym,
                    'c': code_point,
                    'uc': lower_mapping})

        # Pairs of (restriction violated?, message template), reported
        # in this order.
        checks = (
            # alpha restriction: "Characters classified as either upper or
            # lower shall automatically belong to this class.
            (cased and not alpha,
             '%(sym)s is upper|lower but not alpha\n'),
            # alpha restriction: “No character specified for the keywords
            # cntrl, digit, punct or space shall be specified.”
            (alpha and cntrl, '%(sym)s is alpha and cntrl\n'),
            (alpha and digit, '%(sym)s is alpha and digit\n'),
            (alpha and punct, '%(sym)s is alpha and punct\n'),
            (alpha and space, '%(sym)s is alpha and space\n'),
            # space restriction: “No character specified for the keywords
            # upper, lower, alpha, digit, graph or xdigit shall be
            # specified.”  upper, lower, alpha already checked above.
            (space and digit, '%(sym)s is space and digit\n'),
            (space and graph, '%(sym)s is space and graph\n'),
            (space and xdigit, '%(sym)s is space and xdigit\n'),
            # cntrl restriction: “No character specified for the keywords
            # upper, lower, alpha, digit, punct, graph, print or xdigit
            # shall be specified.”  upper, lower, alpha already checked
            # above.
            (cntrl and digit, '%(sym)s is cntrl and digit\n'),
            (cntrl and punct, '%(sym)s is cntrl and punct\n'),
            (cntrl and graph, '%(sym)s is cntrl and graph\n'),
            (cntrl and printable, '%(sym)s is cntrl and print\n'),
            (cntrl and xdigit, '%(sym)s is cntrl and xdigit\n'),
            # punct restriction: “No character specified for the keywords
            # upper, lower, alpha, digit, cntrl, xdigit or as the <space>
            # character shall be specified.”  upper, lower, alpha, cntrl
            # already checked above.
            (punct and digit, '%(sym)s is punct and digit\n'),
            (punct and xdigit, '%(sym)s is punct and xdigit\n'),
            (punct and code_point == 0x0020, '%(sym)s is punct\n'),
            # graph restriction: “No character specified for the keyword
            # cntrl shall be specified.”  Already checked above.
            #
            # print restriction: “No character specified for the keyword
            # cntrl shall be specified.”  Already checked above.
            #
            # graph - print relation: differ only in the <space> character.
            # How is this possible if there are more than one space
            # character?!  I think susv2/xbd/locale.html should speak of
            # “space characters”, not “space character”.
            (printable and not (graph or space),
             '%(sym)s is print but not graph|<space>\n'),
            (not printable and (graph or code_point == 0x0020),
             '%(sym)s is graph|<space> but not print\n'),
        )
        for violated, template in checks:
            if violated:
                sys.stderr.write(template %{'sym': sym})