/*
 * unicode.c
 *
 * PURPOSE
 *	Routines for converting between UTF-8 and OSTA Compressed Unicode.
 *	Also handles filename mangling.
 *
 * DESCRIPTION
 *	OSTA Compressed Unicode is explained in the OSTA UDF specification.
 *		http://www.osta.org/
 *	UTF-8 is explained in IETF RFC 3629.
 *		https://www.rfc-editor.org/rfc/rfc3629.txt
 *
 * COPYRIGHT
 *	This file is distributed under the terms of the GNU General Public
 *	License (GPL). Copies of the GPL can be obtained from:
 *		ftp://prep.ai.mit.edu/pub/gnu/GPL
 *	Each contributing author retains all rights to their own work.
 */
|  | 20 |  | 
|  | 21 | #include "udfdecl.h" | 
|  | 22 |  | 
|  | 23 | #include <linux/kernel.h> | 
|  | 24 | #include <linux/string.h>	/* for memset */ | 
|  | 25 | #include <linux/nls.h> | 
|  | 26 | #include <linux/crc-itu-t.h> | 
|  | 27 | #include <linux/slab.h> | 
|  | 28 |  | 
|  | 29 | #include "udf_sb.h" | 
|  | 30 |  | 
/*
 * Masking off the low 11 bits of a UTF-16 surrogate half (0xd800-0xdfff)
 * always leaves exactly 0xd800.
 */
#define SURROGATE_MASK 0xfffff800
#define SURROGATE_PAIR 0x0000d800

/*
 * Encode a single Unicode character as UTF-8.
 *
 * @uni:      character to encode
 * @out:      destination buffer
 * @boundlen: number of bytes available in @out
 *
 * Returns the number of bytes stored (1 to 3), -ENAMETOOLONG when @out is
 * too small for the encoding (or @boundlen <= 0), or -EINVAL when @uni is a
 * surrogate half.  Nothing is written to @out on failure.
 */
static int udf_uni2char_utf8(wchar_t uni,
			     unsigned char *out,
			     int boundlen)
{
	if (boundlen <= 0)
		return -ENAMETOOLONG;

	if ((uni & SURROGATE_MASK) == SURROGATE_PAIR)
		return -EINVAL;

	/* One byte: 0xxxxxxx */
	if (uni < 0x80) {
		out[0] = (unsigned char)uni;
		return 1;
	}

	/* Two bytes: 110xxxxx 10xxxxxx */
	if (uni < 0x800) {
		if (boundlen < 2)
			return -ENAMETOOLONG;
		out[0] = (unsigned char)(0xc0 | (uni >> 6));
		out[1] = (unsigned char)(0x80 | (uni & 0x3f));
		return 2;
	}

	/* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
	if (boundlen < 3)
		return -ENAMETOOLONG;
	out[0] = (unsigned char)(0xe0 | (uni >> 12));
	out[1] = (unsigned char)(0x80 | ((uni >> 6) & 0x3f));
	out[2] = (unsigned char)(0x80 | (uni & 0x3f));
	return 3;
}
|  | 62 |  | 
/*
 * Decode one UTF-8 sequence into a Unicode character.
 *
 * @in:       input bytes
 * @boundlen: number of bytes available in @in
 * @uni:      where the decoded character is stored
 *
 * Returns the number of input bytes consumed on success.  Returns 0 (and
 * leaves *@uni untouched) when @boundlen <= 0.  Returns -EINVAL and stores
 * '?' in *@uni when the sequence is malformed: an invalid lead byte, a
 * sequence cut short by @boundlen, or a byte inside a multi-byte sequence
 * that is not a continuation byte (10xxxxxx).
 *
 * Lead bytes announcing up to 5 continuation bytes are accepted; the decoded
 * value is assigned to *@uni without a range check.
 */
static int udf_char2uni_utf8(const unsigned char *in,
			     int boundlen,
			     wchar_t *uni)
{
	unsigned int utf_char;
	unsigned char c;
	int utf_cnt, u_len;

	utf_char = 0;
	utf_cnt = 0;	/* continuation bytes still expected; -1 flags an error */
	for (u_len = 0; u_len < boundlen;) {
		c = in[u_len++];

		/* Complete a multi-byte UTF-8 character */
		if (utf_cnt) {
			/*
			 * Only 10xxxxxx may follow a lead byte.  Without this
			 * check an ASCII byte after a stray lead byte would be
			 * swallowed and folded into a bogus character.
			 */
			if ((c & 0xc0) != 0x80) {
				utf_cnt = -1;
				break;
			}
			utf_char = (utf_char << 6) | (c & 0x3f);
			if (--utf_cnt)
				continue;
		} else {
			/* Check for a multi-byte UTF-8 character */
			if (c & 0x80) {
				/* Start a multi-byte UTF-8 character */
				if ((c & 0xe0) == 0xc0) {
					utf_char = c & 0x1f;
					utf_cnt = 1;
				} else if ((c & 0xf0) == 0xe0) {
					utf_char = c & 0x0f;
					utf_cnt = 2;
				} else if ((c & 0xf8) == 0xf0) {
					utf_char = c & 0x07;
					utf_cnt = 3;
				} else if ((c & 0xfc) == 0xf8) {
					utf_char = c & 0x03;
					utf_cnt = 4;
				} else if ((c & 0xfe) == 0xfc) {
					utf_char = c & 0x01;
					utf_cnt = 5;
				} else {
					/* Stray continuation byte or 0xfe/0xff */
					utf_cnt = -1;
					break;
				}
				continue;
			} else {
				/* Single byte UTF-8 character (most common) */
				utf_char = c;
			}
		}
		*uni = utf_char;
		break;
	}
	/* Non-zero here means either an error (-1) or a truncated sequence */
	if (utf_cnt) {
		*uni = '?';
		return -EINVAL;
	}
	return u_len;
}
|  | 119 |  | 
/* Character substituted for a run of illegal input characters */
#define ILLEGAL_CHAR_MARK	'_'
/* Separator between base name and extension */
#define EXT_MARK		'.'
/* First character of the generated CRC suffix */
#define CRC_MARK		'#'
/* Maximum number of characters scanned when looking for an extension */
#define EXT_SIZE		5
/* Number of chars we need to store generated CRC to make filename unique */
#define CRC_LEN			5

/*
 * Convert the next character of a CS0 name and append it to the output.
 *
 * @str_o, @str_o_max_len: output buffer and its size
 * @str_o_idx:             in/out write position in @str_o
 * @str_i, @str_i_max_len: input characters (compression ID already skipped)
 * @str_i_idx:             in/out read position in @str_i
 * @u_ch:                  bytes per input character (1 or 2)
 * @needsCRC:              set to 1 whenever the output no longer matches the
 *                         input (illegal or unconvertible character, or no
 *                         room left)
 * @conv_f:                converts one Unicode character to output bytes
 * @translate:             when non-zero, '/' and NUL are treated as illegal
 *
 * With @translate set, a run of consecutive illegal characters is consumed
 * in one call and emitted as a single ILLEGAL_CHAR_MARK.
 *
 * Returns 1 if a character was appended to @str_o, 0 if the input is
 * exhausted or the output has no room for the character.
 */
static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
			      int *str_o_idx,
			      const uint8_t *str_i, int str_i_max_len,
			      int *str_i_idx,
			      int u_ch, int *needsCRC,
			      int (*conv_f)(wchar_t, unsigned char *, int),
			      int translate)
{
	uint32_t c;
	int saw_illegal = 0;
	int have_char = 0;
	int written;

	while (!have_char && *str_i_idx < str_i_max_len) {
		/* No room left for even one more output byte */
		if (*str_o_idx >= str_o_max_len) {
			*needsCRC = 1;
			return 0;
		}

		/* Expand OSTA compressed Unicode to Unicode */
		c = str_i[*str_i_idx];
		if (u_ch > 1)
			c = (c << 8) | str_i[*str_i_idx + 1];

		if (translate && (c == '/' || c == 0)) {
			saw_illegal = 1;
		} else if (saw_illegal) {
			/*
			 * A legal character ends the illegal run; leave it
			 * unconsumed so that the next call picks it up.
			 */
			break;
		} else {
			have_char = 1;
		}
		*str_i_idx += u_ch;
	}

	if (saw_illegal) {
		*needsCRC = 1;
		c = ILLEGAL_CHAR_MARK;
		have_char = 1;
	}
	if (!have_char)
		return 0;

	written = conv_f(c, &str_o[*str_o_idx], str_o_max_len - *str_o_idx);
	if (written >= 0) {
		/* Valid character */
		*str_o_idx += written;
		return 1;
	}

	*needsCRC = 1;
	if (written == -ENAMETOOLONG)
		return 0;

	/* Character cannot be represented in the output charset */
	str_o[(*str_o_idx)++] = '?';
	return 1;
}
|  | 177 |  | 
|  | 178 | static int udf_name_from_CS0(uint8_t *str_o, int str_max_len, | 
|  | 179 | const uint8_t *ocu, int ocu_len, | 
|  | 180 | int (*conv_f)(wchar_t, unsigned char *, int), | 
|  | 181 | int translate) | 
|  | 182 | { | 
|  | 183 | uint32_t c; | 
|  | 184 | uint8_t cmp_id; | 
|  | 185 | int idx, len; | 
|  | 186 | int u_ch; | 
|  | 187 | int needsCRC = 0; | 
|  | 188 | int ext_i_len, ext_max_len; | 
|  | 189 | int str_o_len = 0;	/* Length of resulting output */ | 
|  | 190 | int ext_o_len = 0;	/* Extension output length */ | 
|  | 191 | int ext_crc_len = 0;	/* Extension output length if used with CRC */ | 
|  | 192 | int i_ext = -1;		/* Extension position in input buffer */ | 
|  | 193 | int o_crc = 0;		/* Rightmost possible output pos for CRC+ext */ | 
|  | 194 | unsigned short valueCRC; | 
|  | 195 | uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1]; | 
|  | 196 | uint8_t crc[CRC_LEN]; | 
|  | 197 |  | 
|  | 198 | if (str_max_len <= 0) | 
|  | 199 | return 0; | 
|  | 200 |  | 
|  | 201 | if (ocu_len == 0) { | 
|  | 202 | memset(str_o, 0, str_max_len); | 
|  | 203 | return 0; | 
|  | 204 | } | 
|  | 205 |  | 
|  | 206 | cmp_id = ocu[0]; | 
|  | 207 | if (cmp_id != 8 && cmp_id != 16) { | 
|  | 208 | memset(str_o, 0, str_max_len); | 
|  | 209 | pr_err("unknown compression code (%d)\n", cmp_id); | 
|  | 210 | return -EINVAL; | 
|  | 211 | } | 
|  | 212 | u_ch = cmp_id >> 3; | 
|  | 213 |  | 
|  | 214 | ocu++; | 
|  | 215 | ocu_len--; | 
|  | 216 |  | 
|  | 217 | if (ocu_len % u_ch) { | 
|  | 218 | pr_err("incorrect filename length (%d)\n", ocu_len + 1); | 
|  | 219 | return -EINVAL; | 
|  | 220 | } | 
|  | 221 |  | 
|  | 222 | if (translate) { | 
|  | 223 | /* Look for extension */ | 
|  | 224 | for (idx = ocu_len - u_ch, ext_i_len = 0; | 
|  | 225 | (idx >= 0) && (ext_i_len < EXT_SIZE); | 
|  | 226 | idx -= u_ch, ext_i_len++) { | 
|  | 227 | c = ocu[idx]; | 
|  | 228 | if (u_ch > 1) | 
|  | 229 | c = (c << 8) | ocu[idx + 1]; | 
|  | 230 |  | 
|  | 231 | if (c == EXT_MARK) { | 
|  | 232 | if (ext_i_len) | 
|  | 233 | i_ext = idx; | 
|  | 234 | break; | 
|  | 235 | } | 
|  | 236 | } | 
|  | 237 | if (i_ext >= 0) { | 
|  | 238 | /* Convert extension */ | 
|  | 239 | ext_max_len = min_t(int, sizeof(ext), str_max_len); | 
|  | 240 | ext[ext_o_len++] = EXT_MARK; | 
|  | 241 | idx = i_ext + u_ch; | 
|  | 242 | while (udf_name_conv_char(ext, ext_max_len, &ext_o_len, | 
|  | 243 | ocu, ocu_len, &idx, | 
|  | 244 | u_ch, &needsCRC, | 
|  | 245 | conv_f, translate)) { | 
|  | 246 | if ((ext_o_len + CRC_LEN) < str_max_len) | 
|  | 247 | ext_crc_len = ext_o_len; | 
|  | 248 | } | 
|  | 249 | } | 
|  | 250 | } | 
|  | 251 |  | 
|  | 252 | idx = 0; | 
|  | 253 | while (1) { | 
|  | 254 | if (translate && (idx == i_ext)) { | 
|  | 255 | if (str_o_len > (str_max_len - ext_o_len)) | 
|  | 256 | needsCRC = 1; | 
|  | 257 | break; | 
|  | 258 | } | 
|  | 259 |  | 
|  | 260 | if (!udf_name_conv_char(str_o, str_max_len, &str_o_len, | 
|  | 261 | ocu, ocu_len, &idx, | 
|  | 262 | u_ch, &needsCRC, conv_f, translate)) | 
|  | 263 | break; | 
|  | 264 |  | 
|  | 265 | if (translate && | 
|  | 266 | (str_o_len <= (str_max_len - ext_o_len - CRC_LEN))) | 
|  | 267 | o_crc = str_o_len; | 
|  | 268 | } | 
|  | 269 |  | 
|  | 270 | if (translate) { | 
|  | 271 | if (str_o_len <= 2 && str_o[0] == '.' && | 
|  | 272 | (str_o_len == 1 || str_o[1] == '.')) | 
|  | 273 | needsCRC = 1; | 
|  | 274 | if (needsCRC) { | 
|  | 275 | str_o_len = o_crc; | 
|  | 276 | valueCRC = crc_itu_t(0, ocu, ocu_len); | 
|  | 277 | crc[0] = CRC_MARK; | 
|  | 278 | crc[1] = hex_asc_upper_hi(valueCRC >> 8); | 
|  | 279 | crc[2] = hex_asc_upper_lo(valueCRC >> 8); | 
|  | 280 | crc[3] = hex_asc_upper_hi(valueCRC); | 
|  | 281 | crc[4] = hex_asc_upper_lo(valueCRC); | 
|  | 282 | len = min_t(int, CRC_LEN, str_max_len - str_o_len); | 
|  | 283 | memcpy(&str_o[str_o_len], crc, len); | 
|  | 284 | str_o_len += len; | 
|  | 285 | ext_o_len = ext_crc_len; | 
|  | 286 | } | 
|  | 287 | if (ext_o_len > 0) { | 
|  | 288 | memcpy(&str_o[str_o_len], ext, ext_o_len); | 
|  | 289 | str_o_len += ext_o_len; | 
|  | 290 | } | 
|  | 291 | } | 
|  | 292 |  | 
|  | 293 | return str_o_len; | 
|  | 294 | } | 
|  | 295 |  | 
/*
 * Convert a name in the input charset to OSTA CS0 (compressed Unicode).
 *
 * @ocu:         output buffer; zeroed, then filled with the compression ID
 *               followed by the characters
 * @ocu_max_len: size of @ocu in bytes
 * @str_i:       input name
 * @str_len:     length of @str_i in bytes
 * @conv_f:      decodes one character from the input charset
 *
 * The name is first written with 8-bit characters (compression ID 8).  As
 * soon as a character above 0xff shows up the conversion starts over with
 * 16-bit big-endian characters (compression ID 16).  Input that @conv_f
 * rejects is stored as '?', one input byte at a time.
 *
 * Returns the number of bytes used in @ocu (compression ID included), or 0
 * when @ocu_max_len <= 0 or the name does not fit.
 */
static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len,
			   const uint8_t *str_i, int str_len,
			   int (*conv_f)(const unsigned char *, int, wchar_t *))
{
	unsigned int limit = 0xff;	/* largest value storable at this width */
	int width = 1;			/* bytes per output character */
	int out_pos, in_pos, consumed;
	int widen;
	wchar_t ch;

	if (ocu_max_len <= 0)
		return 0;

	memset(ocu, 0, ocu_max_len);
	ocu[0] = 8;

	do {
		widen = 0;
		out_pos = 1;
		in_pos = 0;
		while (in_pos < str_len) {
			/* Name didn't fit? */
			if (out_pos + width > ocu_max_len)
				return 0;

			consumed = conv_f(&str_i[in_pos], str_len - in_pos,
					  &ch);
			if (!consumed) {
				in_pos++;
				continue;
			}
			/* Invalid character, deal with it */
			if (consumed < 0) {
				consumed = 1;
				ch = '?';
			}

			/* Does not fit the current width: restart wider */
			if (ch > limit) {
				limit = 0xffff;
				ocu[0] = 0x10;
				width = 2;
				widen = 1;
				break;
			}

			if (limit == 0xffff)
				ocu[out_pos++] = (uint8_t)(ch >> 8);
			ocu[out_pos++] = (uint8_t)(ch & 0xff);
			in_pos += consumed;
		}
	} while (widen);

	return out_pos;
}
|  | 343 |  | 
|  | 344 | /* | 
|  | 345 | * Convert CS0 dstring to output charset. Warning: This function may truncate | 
|  | 346 | * input string if it is too long as it is used for informational strings only | 
|  | 347 | * and it is better to truncate the string than to refuse mounting a media. | 
|  | 348 | */ | 
|  | 349 | int udf_dstrCS0toUTF8(uint8_t *utf_o, int o_len, | 
|  | 350 | const uint8_t *ocu_i, int i_len) | 
|  | 351 | { | 
|  | 352 | int s_len = 0; | 
|  | 353 |  | 
|  | 354 | if (i_len > 0) { | 
|  | 355 | s_len = ocu_i[i_len - 1]; | 
|  | 356 | if (s_len >= i_len) { | 
|  | 357 | pr_warn("incorrect dstring lengths (%d/%d)," | 
|  | 358 | " truncating\n", s_len, i_len); | 
|  | 359 | s_len = i_len - 1; | 
|  | 360 | /* 2-byte encoding? Need to round properly... */ | 
|  | 361 | if (ocu_i[0] == 16) | 
|  | 362 | s_len -= (s_len - 1) & 2; | 
|  | 363 | } | 
|  | 364 | } | 
|  | 365 |  | 
|  | 366 | return udf_name_from_CS0(utf_o, o_len, ocu_i, s_len, | 
|  | 367 | udf_uni2char_utf8, 0); | 
|  | 368 | } | 
|  | 369 |  | 
|  | 370 | int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen, | 
|  | 371 | uint8_t *dname, int dlen) | 
|  | 372 | { | 
|  | 373 | int (*conv_f)(wchar_t, unsigned char *, int); | 
|  | 374 | int ret; | 
|  | 375 |  | 
|  | 376 | if (!slen) | 
|  | 377 | return -EIO; | 
|  | 378 |  | 
|  | 379 | if (dlen <= 0) | 
|  | 380 | return 0; | 
|  | 381 |  | 
|  | 382 | if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { | 
|  | 383 | conv_f = udf_uni2char_utf8; | 
|  | 384 | } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { | 
|  | 385 | conv_f = UDF_SB(sb)->s_nls_map->uni2char; | 
|  | 386 | } else | 
|  | 387 | BUG(); | 
|  | 388 |  | 
|  | 389 | ret = udf_name_from_CS0(dname, dlen, sname, slen, conv_f, 1); | 
|  | 390 | /* Zero length filename isn't valid... */ | 
|  | 391 | if (ret == 0) | 
|  | 392 | ret = -EINVAL; | 
|  | 393 | return ret; | 
|  | 394 | } | 
|  | 395 |  | 
|  | 396 | int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen, | 
|  | 397 | uint8_t *dname, int dlen) | 
|  | 398 | { | 
|  | 399 | int (*conv_f)(const unsigned char *, int, wchar_t *); | 
|  | 400 |  | 
|  | 401 | if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { | 
|  | 402 | conv_f = udf_char2uni_utf8; | 
|  | 403 | } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { | 
|  | 404 | conv_f = UDF_SB(sb)->s_nls_map->char2uni; | 
|  | 405 | } else | 
|  | 406 | BUG(); | 
|  | 407 |  | 
|  | 408 | return udf_name_to_CS0(dname, dlen, sname, slen, conv_f); | 
|  | 409 | } | 
|  | 410 |  |