Blame - src/kernel/linux/v4.14/fs/udf/unicode.c - T103

blob: 61a1738895b7a733e1c84897beaa5b467de2c52c [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame]	1	/*
				2	* unicode.c
				3	*
				4	* PURPOSE
				5	* Routines for converting between UTF-8 and OSTA Compressed Unicode.
				6	* Also handles filename mangling
				7	*
				8	* DESCRIPTION
				9	* OSTA Compressed Unicode is explained in the OSTA UDF specification.
				10	* http://www.osta.org/
				11	* UTF-8 is explained in IETF RFC 3629.
				12	* https://www.rfc-editor.org/rfc/rfc3629.txt
				13	*
				14	* COPYRIGHT
				15	* This file is distributed under the terms of the GNU General Public
				16	* License (GPL). Copies of the GPL can be obtained from:
				17	* ftp://prep.ai.mit.edu/pub/gnu/GPL
				18	* Each contributing author retains all rights to their own work.
				19	*/
				20
				21	#include "udfdecl.h"
				22
				23	#include <linux/kernel.h>
				24	#include <linux/string.h> /* for memset */
				25	#include <linux/nls.h>
				26	#include <linux/crc-itu-t.h>
				27	#include <linux/slab.h>
				28
				29	#include "udf_sb.h"
				30
				/* A value is a UTF-16 surrogate when its bits above the low 11 equal 0xd800. */
				#define SURROGATE_MASK 0xfffff800
				#define SURROGATE_PAIR 0x0000d800

				/*
				 * udf_uni2char_utf8 - encode a single Unicode value as UTF-8
				 * @uni:	character to encode
				 * @out:	destination buffer
				 * @boundlen:	number of bytes available in @out
				 *
				 * Values below 0x80 are emitted as one byte, values below 0x800 as two
				 * bytes and everything else as three bytes; no 4-byte form is produced.
				 *
				 * Returns the number of bytes written, -ENAMETOOLONG when @out is too
				 * small for the encoding, or -EINVAL when @uni is in the surrogate range
				 * 0xd800-0xdfff.
				 */
				static int udf_uni2char_utf8(wchar_t uni,
							     unsigned char *out,
							     int boundlen)
				{
					int u_len = 0;

					if (boundlen <= 0)
						return -ENAMETOOLONG;

					/* Lone surrogates have no valid UTF-8 representation. */
					if ((uni & SURROGATE_MASK) == SURROGATE_PAIR)
						return -EINVAL;

					if (uni < 0x80) {
						out[u_len++] = (unsigned char)uni;
					} else if (uni < 0x800) {
						if (boundlen < 2)
							return -ENAMETOOLONG;
						out[u_len++] = (unsigned char)(0xc0 | (uni >> 6));
						out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f));
					} else {
						if (boundlen < 3)
							return -ENAMETOOLONG;
						out[u_len++] = (unsigned char)(0xe0 | (uni >> 12));
						out[u_len++] = (unsigned char)(0x80 | ((uni >> 6) & 0x3f));
						out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f));
					}
					return u_len;
				}
				62
				/*
				 * udf_char2uni_utf8 - decode one UTF-8 sequence into a Unicode value
				 * @in:		input bytes
				 * @boundlen:	number of bytes available in @in
				 * @uni:	where the decoded character is stored
				 *
				 * Returns the number of input bytes consumed (0 when @boundlen <= 0, in
				 * which case @uni is left untouched).  On a malformed or truncated
				 * sequence, or when the decoded value does not fit in 16 bits, '?' is
				 * stored in @uni and -EINVAL is returned.
				 */
				static int udf_char2uni_utf8(const unsigned char *in,
							     int boundlen,
							     wchar_t *uni)
				{
					unsigned int utf_char = 0;
					unsigned char c;
					int utf_cnt = 0;	/* continuation bytes still expected */
					int u_len;

					for (u_len = 0; u_len < boundlen;) {
						c = in[u_len++];

						if (utf_cnt) {
							/*
							 * Complete a multi-byte UTF-8 character.  Every
							 * trailing byte must have the form 10xxxxxx;
							 * anything else means the sequence is broken.
							 */
							if ((c & 0xc0) != 0x80)
								goto invalid;
							utf_char = (utf_char << 6) | (c & 0x3f);
							if (--utf_cnt)
								continue;
						} else if (c & 0x80) {
							/* Start a multi-byte UTF-8 character */
							if ((c & 0xe0) == 0xc0) {
								utf_char = c & 0x1f;
								utf_cnt = 1;
							} else if ((c & 0xf0) == 0xe0) {
								utf_char = c & 0x0f;
								utf_cnt = 2;
							} else if ((c & 0xf8) == 0xf0) {
								utf_char = c & 0x07;
								utf_cnt = 3;
							} else if ((c & 0xfc) == 0xf8) {
								utf_char = c & 0x03;
								utf_cnt = 4;
							} else if ((c & 0xfe) == 0xfc) {
								utf_char = c & 0x01;
								utf_cnt = 5;
							} else {
								/* Stray continuation byte or 0xfe/0xff */
								goto invalid;
							}
							continue;
						} else {
							/* Single byte UTF-8 character (most common) */
							utf_char = c;
						}

						/*
						 * The CS0 encoder in this file stores at most two bytes
						 * per character, so larger values would be silently
						 * truncated.  Report them as invalid instead.
						 */
						if (utf_char > 0xffff)
							goto invalid;

						*uni = utf_char;
						return u_len;
					}

					/* Loop ended without a character: empty input or cut-off sequence */
					if (!utf_cnt)
						return u_len;

				invalid:
					*uni = '?';
					return -EINVAL;
				}
				119
				#define ILLEGAL_CHAR_MARK '_'
				#define EXT_MARK '.'
				#define CRC_MARK '#'
				#define EXT_SIZE 5
				/* Number of chars we need to store generated CRC to make filename unique */
				#define CRC_LEN 5

				/*
				 * udf_name_conv_char - convert the next input character of a CS0 name
				 * @str_o:		output buffer
				 * @str_o_max_len:	size of @str_o in bytes
				 * @str_o_idx:		in/out: current write position in @str_o
				 * @str_i:		input characters (without the compression ID byte)
				 * @str_i_max_len:	length of @str_i in bytes
				 * @str_i_idx:		in/out: current read position in @str_i
				 * @u_ch:		bytes per input character (1 or 2)
				 * @needsCRC:		set to 1 when the name had to be altered or cut
				 * @conv_f:		converts one Unicode value into the output charset
				 * @translate:		when non-zero, '/' and NUL are treated as illegal
				 *
				 * With @translate set, a run of consecutive illegal characters is
				 * collapsed into a single ILLEGAL_CHAR_MARK.
				 *
				 * Returns 1 when a character was produced and 0 when nothing was written
				 * (input exhausted or no room left in @str_o).
				 */
				static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
							      int *str_o_idx,
							      const uint8_t *str_i, int str_i_max_len,
							      int *str_i_idx,
							      int u_ch, int *needsCRC,
							      int (*conv_f)(wchar_t, unsigned char *, int),
							      int translate)
				{
					uint32_t c = 0;
					int illChar = 0;
					int len, gotch = 0;

					for (; (!gotch) && (*str_i_idx < str_i_max_len); *str_i_idx += u_ch) {
						if (*str_o_idx >= str_o_max_len) {
							*needsCRC = 1;
							return gotch;
						}

						/* Expand OSTA compressed Unicode to Unicode */
						c = str_i[*str_i_idx];
						if (u_ch > 1)
							c = (c << 8) | str_i[*str_i_idx + 1];

						if (translate && (c == '/' || c == 0))
							illChar = 1;
						else if (illChar)
							/*
							 * First legal character after an illegal run:
							 * leave it unconsumed for the next call.
							 */
							break;
						else
							gotch = 1;
					}
					if (illChar) {
						*needsCRC = 1;
						c = ILLEGAL_CHAR_MARK;
						gotch = 1;
					}
					if (gotch) {
						len = conv_f(c, &str_o[*str_o_idx],
							     str_o_max_len - *str_o_idx);
						/* Valid character? */
						if (len >= 0)
							*str_o_idx += len;
						else if (len == -ENAMETOOLONG) {
							*needsCRC = 1;
							gotch = 0;
						} else {
							/* Unconvertible character: substitute '?' */
							str_o[(*str_o_idx)++] = '?';
							*needsCRC = 1;
						}
					}
					return gotch;
				}
				177
				178	static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
				179	const uint8_t *ocu, int ocu_len,
				180	int (conv_f)(wchar_t, unsigned char , int),
				181	int translate)
				182	{
				183	uint32_t c;
				184	uint8_t cmp_id;
				185	int idx, len;
				186	int u_ch;
				187	int needsCRC = 0;
				188	int ext_i_len, ext_max_len;
				189	int str_o_len = 0; /* Length of resulting output */
				190	int ext_o_len = 0; /* Extension output length */
				191	int ext_crc_len = 0; /* Extension output length if used with CRC */
				192	int i_ext = -1; /* Extension position in input buffer */
				193	int o_crc = 0; /* Rightmost possible output pos for CRC+ext */
				194	unsigned short valueCRC;
				195	uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1];
				196	uint8_t crc[CRC_LEN];
				197
				198	if (str_max_len <= 0)
				199	return 0;
				200
				201	if (ocu_len == 0) {
				202	memset(str_o, 0, str_max_len);
				203	return 0;
				204	}
				205
				206	cmp_id = ocu[0];
				207	if (cmp_id != 8 && cmp_id != 16) {
				208	memset(str_o, 0, str_max_len);
				209	pr_err("unknown compression code (%d)\n", cmp_id);
				210	return -EINVAL;
				211	}
				212	u_ch = cmp_id >> 3;
				213
				214	ocu++;
				215	ocu_len--;
				216
				217	if (ocu_len % u_ch) {
				218	pr_err("incorrect filename length (%d)\n", ocu_len + 1);
				219	return -EINVAL;
				220	}
				221
				222	if (translate) {
				223	/* Look for extension */
				224	for (idx = ocu_len - u_ch, ext_i_len = 0;
				225	(idx >= 0) && (ext_i_len < EXT_SIZE);
				226	idx -= u_ch, ext_i_len++) {
				227	c = ocu[idx];
				228	if (u_ch > 1)
				229	c = (c << 8) \| ocu[idx + 1];
				230
				231	if (c == EXT_MARK) {
				232	if (ext_i_len)
				233	i_ext = idx;
				234	break;
				235	}
				236	}
				237	if (i_ext >= 0) {
				238	/* Convert extension */
				239	ext_max_len = min_t(int, sizeof(ext), str_max_len);
				240	ext[ext_o_len++] = EXT_MARK;
				241	idx = i_ext + u_ch;
				242	while (udf_name_conv_char(ext, ext_max_len, &ext_o_len,
				243	ocu, ocu_len, &idx,
				244	u_ch, &needsCRC,
				245	conv_f, translate)) {
				246	if ((ext_o_len + CRC_LEN) < str_max_len)
				247	ext_crc_len = ext_o_len;
				248	}
				249	}
				250	}
				251
				252	idx = 0;
				253	while (1) {
				254	if (translate && (idx == i_ext)) {
				255	if (str_o_len > (str_max_len - ext_o_len))
				256	needsCRC = 1;
				257	break;
				258	}
				259
				260	if (!udf_name_conv_char(str_o, str_max_len, &str_o_len,
				261	ocu, ocu_len, &idx,
				262	u_ch, &needsCRC, conv_f, translate))
				263	break;
				264
				265	if (translate &&
				266	(str_o_len <= (str_max_len - ext_o_len - CRC_LEN)))
				267	o_crc = str_o_len;
				268	}
				269
				270	if (translate) {
				271	if (str_o_len <= 2 && str_o[0] == '.' &&
				272	(str_o_len == 1 \|\| str_o[1] == '.'))
				273	needsCRC = 1;
				274	if (needsCRC) {
				275	str_o_len = o_crc;
				276	valueCRC = crc_itu_t(0, ocu, ocu_len);
				277	crc[0] = CRC_MARK;
				278	crc[1] = hex_asc_upper_hi(valueCRC >> 8);
				279	crc[2] = hex_asc_upper_lo(valueCRC >> 8);
				280	crc[3] = hex_asc_upper_hi(valueCRC);
				281	crc[4] = hex_asc_upper_lo(valueCRC);
				282	len = min_t(int, CRC_LEN, str_max_len - str_o_len);
				283	memcpy(&str_o[str_o_len], crc, len);
				284	str_o_len += len;
				285	ext_o_len = ext_crc_len;
				286	}
				287	if (ext_o_len > 0) {
				288	memcpy(&str_o[str_o_len], ext, ext_o_len);
				289	str_o_len += ext_o_len;
				290	}
				291	}
				292
				293	return str_o_len;
				294	}
				295
				/*
				 * udf_name_to_CS0 - convert a name to OSTA CS0 (compressed Unicode)
				 * @ocu:	output buffer; byte 0 receives the compression ID
				 * @ocu_max_len: size of @ocu in bytes
				 * @str_i:	input name
				 * @str_len:	length of @str_i in bytes
				 * @conv_f:	decodes one input character, returning bytes consumed
				 *
				 * Encoding starts with one byte per character (ID 8) and restarts with
				 * two bytes per character (ID 16) as soon as a value above 0xff is met.
				 * Characters that @conv_f rejects are stored as '?'.
				 *
				 * Returns the number of bytes written including the compression ID, or
				 * 0 when @ocu_max_len <= 0 or the name does not fit.
				 */
				static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len,
							   const uint8_t *str_i, int str_len,
							   int (*conv_f)(const unsigned char *, int, wchar_t *))
				{
					int i, len;
					unsigned int max_val;
					wchar_t uni_char;
					int u_len, u_ch;

					if (ocu_max_len <= 0)
						return 0;

					memset(ocu, 0, ocu_max_len);
					ocu[0] = 8;
					max_val = 0xff;
					u_ch = 1;

				try_again:
					u_len = 1;
					for (i = 0; i < str_len; i++) {
						/* Name didn't fit? */
						if (u_len + u_ch > ocu_max_len)
							return 0;
						len = conv_f(&str_i[i], str_len - i, &uni_char);
						if (!len)
							continue;
						/* Invalid character, deal with it */
						if (len < 0) {
							len = 1;
							uni_char = '?';
						}

						if (uni_char > max_val) {
							/* Does not fit in 8 bits: redo everything as 16-bit */
							max_val = 0xffff;
							ocu[0] = 0x10;
							u_ch = 2;
							goto try_again;
						}

						if (max_val == 0xffff)
							ocu[u_len++] = (uint8_t)(uni_char >> 8);
						ocu[u_len++] = (uint8_t)(uni_char & 0xff);
						/* The loop increment accounts for the remaining byte */
						i += len - 1;
					}

					return u_len;
				}
				343
				344	/*
				345	* Convert CS0 dstring to output charset. Warning: This function may truncate
				346	* input string if it is too long as it is used for informational strings only
				347	* and it is better to truncate the string than to refuse mounting a media.
				348	*/
				349	int udf_dstrCS0toUTF8(uint8_t *utf_o, int o_len,
				350	const uint8_t *ocu_i, int i_len)
				351	{
				352	int s_len = 0;
				353
				354	if (i_len > 0) {
				355	s_len = ocu_i[i_len - 1];
				356	if (s_len >= i_len) {
				357	pr_warn("incorrect dstring lengths (%d/%d),"
				358	" truncating\n", s_len, i_len);
				359	s_len = i_len - 1;
				360	/* 2-byte encoding? Need to round properly... */
				361	if (ocu_i[0] == 16)
				362	s_len -= (s_len - 1) & 2;
				363	}
				364	}
				365
				366	return udf_name_from_CS0(utf_o, o_len, ocu_i, s_len,
				367	udf_uni2char_utf8, 0);
				368	}
				369
				370	int udf_get_filename(struct super_block sb, const uint8_t sname, int slen,
				371	uint8_t *dname, int dlen)
				372	{
				373	int (conv_f)(wchar_t, unsigned char , int);
				374	int ret;
				375
				376	if (!slen)
				377	return -EIO;
				378
				379	if (dlen <= 0)
				380	return 0;
				381
				382	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
				383	conv_f = udf_uni2char_utf8;
				384	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
				385	conv_f = UDF_SB(sb)->s_nls_map->uni2char;
				386	} else
				387	BUG();
				388
				389	ret = udf_name_from_CS0(dname, dlen, sname, slen, conv_f, 1);
				390	/* Zero length filename isn't valid... */
				391	if (ret == 0)
				392	ret = -EINVAL;
				393	return ret;
				394	}
				395
				396	int udf_put_filename(struct super_block sb, const uint8_t sname, int slen,
				397	uint8_t *dname, int dlen)
				398	{
				399	int (conv_f)(const unsigned char , int, wchar_t *);
				400
				401	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
				402	conv_f = udf_char2uni_utf8;
				403	} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
				404	conv_f = UDF_SB(sb)->s_nls_map->char2uni;
				405	} else
				406	BUG();
				407
				408	return udf_name_to_CS0(dname, dlen, sname, slen, conv_f);
				409	}
				410