1 // SPDX-License-Identifier: GPL-2.0-only
6 * Routines for converting between UTF-8 and OSTA Compressed Unicode.
7 * Also handles filename mangling
10 * OSTA Compressed Unicode is explained in the OSTA UDF specification.
11 * http://www.osta.org/
12 * UTF-8 is explained in IETF RFC 3629.
13 * https://www.rfc-editor.org/rfc/rfc3629.txt
19 #include <linux/kernel.h>
20 #include <linux/string.h> /* for memset */
21 #include <linux/nls.h>
22 #include <linux/crc-itu-t.h>
23 #include <linux/slab.h>
27 #define PLANE_SIZE 0x10000
28 #define UNICODE_MAX 0x10ffff
29 #define SURROGATE_MASK 0xfffff800
30 #define SURROGATE_PAIR 0x0000d800
31 #define SURROGATE_LOW 0x00000400
32 #define SURROGATE_CHAR_BITS 10
33 #define SURROGATE_CHAR_MASK ((1 << SURROGATE_CHAR_BITS) - 1)
35 #define ILLEGAL_CHAR_MARK '_'
39 /* Number of chars we need to store generated CRC to make filename unique */
42 static unicode_t
get_utf16_char(const uint8_t *str_i
, int str_i_max_len
,
43 int str_i_idx
, int u_ch
, unicode_t
*ret
)
46 int start_idx
= str_i_idx
;
48 /* Expand OSTA compressed Unicode to Unicode */
49 c
= str_i
[str_i_idx
++];
51 c
= (c
<< 8) | str_i
[str_i_idx
++];
52 if ((c
& SURROGATE_MASK
) == SURROGATE_PAIR
) {
55 /* Trailing surrogate char */
56 if (str_i_idx
>= str_i_max_len
) {
61 /* Low surrogate must follow the high one... */
62 if (c
& SURROGATE_LOW
) {
67 WARN_ON_ONCE(u_ch
!= 2);
68 next
= str_i
[str_i_idx
++] << 8;
69 next
|= str_i
[str_i_idx
++];
70 if ((next
& SURROGATE_MASK
) != SURROGATE_PAIR
||
71 !(next
& SURROGATE_LOW
)) {
77 ((c
& SURROGATE_CHAR_MASK
) << SURROGATE_CHAR_BITS
) +
78 (next
& SURROGATE_CHAR_MASK
);
82 return str_i_idx
- start_idx
;
86 static int udf_name_conv_char(uint8_t *str_o
, int str_o_max_len
,
88 const uint8_t *str_i
, int str_i_max_len
,
90 int u_ch
, int *needsCRC
,
91 int (*conv_f
)(wchar_t, unsigned char *, int),
98 while (!gotch
&& *str_i_idx
< str_i_max_len
) {
99 if (*str_o_idx
>= str_o_max_len
) {
104 len
= get_utf16_char(str_i
, str_i_max_len
, *str_i_idx
, u_ch
,
106 /* These chars cannot be converted. Replace them. */
107 if (c
== 0 || c
> UNICODE_MAX
|| (conv_f
&& c
> MAX_WCHAR_T
) ||
108 (translate
&& c
== '/')) {
120 c
= ILLEGAL_CHAR_MARK
;
125 len
= conv_f(c
, &str_o
[*str_o_idx
],
126 str_o_max_len
- *str_o_idx
);
128 len
= utf32_to_utf8(c
, &str_o
[*str_o_idx
],
129 str_o_max_len
- *str_o_idx
);
133 /* Valid character? */
136 else if (len
== -ENAMETOOLONG
) {
140 str_o
[(*str_o_idx
)++] = ILLEGAL_CHAR_MARK
;
147 static int udf_name_from_CS0(struct super_block
*sb
,
148 uint8_t *str_o
, int str_max_len
,
149 const uint8_t *ocu
, int ocu_len
,
157 int ext_i_len
, ext_max_len
;
158 int str_o_len
= 0; /* Length of resulting output */
159 int ext_o_len
= 0; /* Extension output length */
160 int ext_crc_len
= 0; /* Extension output length if used with CRC */
161 int i_ext
= -1; /* Extension position in input buffer */
162 int o_crc
= 0; /* Rightmost possible output pos for CRC+ext */
163 unsigned short valueCRC
;
164 uint8_t ext
[EXT_SIZE
* NLS_MAX_CHARSET_SIZE
+ 1];
165 uint8_t crc
[CRC_LEN
];
166 int (*conv_f
)(wchar_t, unsigned char *, int);
168 if (str_max_len
<= 0)
172 memset(str_o
, 0, str_max_len
);
176 if (UDF_SB(sb
)->s_nls_map
)
177 conv_f
= UDF_SB(sb
)->s_nls_map
->uni2char
;
182 if (cmp_id
!= 8 && cmp_id
!= 16) {
183 memset(str_o
, 0, str_max_len
);
184 pr_err("unknown compression code (%u)\n", cmp_id
);
192 if (ocu_len
% u_ch
) {
193 pr_err("incorrect filename length (%d)\n", ocu_len
+ 1);
198 /* Look for extension */
199 for (idx
= ocu_len
- u_ch
, ext_i_len
= 0;
200 (idx
>= 0) && (ext_i_len
< EXT_SIZE
);
201 idx
-= u_ch
, ext_i_len
++) {
204 c
= (c
<< 8) | ocu
[idx
+ 1];
213 /* Convert extension */
214 ext_max_len
= min_t(int, sizeof(ext
), str_max_len
);
215 ext
[ext_o_len
++] = EXT_MARK
;
217 while (udf_name_conv_char(ext
, ext_max_len
, &ext_o_len
,
220 conv_f
, translate
)) {
221 if ((ext_o_len
+ CRC_LEN
) < str_max_len
)
222 ext_crc_len
= ext_o_len
;
229 if (translate
&& (idx
== i_ext
)) {
230 if (str_o_len
> (str_max_len
- ext_o_len
))
235 if (!udf_name_conv_char(str_o
, str_max_len
, &str_o_len
,
237 u_ch
, &needsCRC
, conv_f
, translate
))
241 (str_o_len
<= (str_max_len
- ext_o_len
- CRC_LEN
)))
246 if (str_o_len
> 0 && str_o_len
<= 2 && str_o
[0] == '.' &&
247 (str_o_len
== 1 || str_o
[1] == '.'))
251 valueCRC
= crc_itu_t(0, ocu
, ocu_len
);
253 crc
[1] = hex_asc_upper_hi(valueCRC
>> 8);
254 crc
[2] = hex_asc_upper_lo(valueCRC
>> 8);
255 crc
[3] = hex_asc_upper_hi(valueCRC
);
256 crc
[4] = hex_asc_upper_lo(valueCRC
);
257 len
= min_t(int, CRC_LEN
, str_max_len
- str_o_len
);
258 memcpy(&str_o
[str_o_len
], crc
, len
);
260 ext_o_len
= ext_crc_len
;
263 memcpy(&str_o
[str_o_len
], ext
, ext_o_len
);
264 str_o_len
+= ext_o_len
;
271 static int udf_name_to_CS0(struct super_block
*sb
,
272 uint8_t *ocu
, int ocu_max_len
,
273 const uint8_t *str_i
, int str_len
)
276 unsigned int max_val
;
279 int (*conv_f
)(const unsigned char *, int, wchar_t *);
281 if (ocu_max_len
<= 0)
284 if (UDF_SB(sb
)->s_nls_map
)
285 conv_f
= UDF_SB(sb
)->s_nls_map
->char2uni
;
289 memset(ocu
, 0, ocu_max_len
);
296 for (i
= 0; i
< str_len
; i
+= len
) {
297 /* Name didn't fit? */
298 if (u_len
+ u_ch
> ocu_max_len
)
303 len
= conv_f(&str_i
[i
], str_len
- i
, &wchar
);
307 len
= utf8_to_utf32(&str_i
[i
], str_len
- i
,
310 /* Invalid character, deal with it */
311 if (len
<= 0 || uni_char
> UNICODE_MAX
) {
316 if (uni_char
> max_val
) {
319 if (max_val
== 0xff) {
326 * Use a UTF-16 surrogate pair for chars that we
327 * cannot encode directly.
329 if (u_len
+ 2 * u_ch
> ocu_max_len
)
332 uni_char
-= PLANE_SIZE
;
334 ((uni_char
>> SURROGATE_CHAR_BITS
) &
335 SURROGATE_CHAR_MASK
);
336 ocu
[u_len
++] = (uint8_t)(c
>> 8);
337 ocu
[u_len
++] = (uint8_t)(c
& 0xff);
338 uni_char
= SURROGATE_PAIR
| SURROGATE_LOW
|
339 (uni_char
& SURROGATE_CHAR_MASK
);
342 if (max_val
== 0xffff)
343 ocu
[u_len
++] = (uint8_t)(uni_char
>> 8);
344 ocu
[u_len
++] = (uint8_t)(uni_char
& 0xff);
351 * Convert CS0 dstring to output charset. Warning: This function may truncate
352 * the input string if it is too long, as it is used for informational strings
353 * only and it is better to truncate the string than to refuse to mount a medium.
355 int udf_dstrCS0toChar(struct super_block
*sb
, uint8_t *utf_o
, int o_len
,
356 const uint8_t *ocu_i
, int i_len
)
361 s_len
= ocu_i
[i_len
- 1];
362 if (s_len
>= i_len
) {
363 pr_warn("incorrect dstring lengths (%d/%d),"
364 " truncating\n", s_len
, i_len
);
366 /* 2-byte encoding? Need to round properly... */
368 s_len
-= (s_len
- 1) & 2;
372 return udf_name_from_CS0(sb
, utf_o
, o_len
, ocu_i
, s_len
, 0);
375 int udf_get_filename(struct super_block
*sb
, const uint8_t *sname
, int slen
,
376 uint8_t *dname
, int dlen
)
386 ret
= udf_name_from_CS0(sb
, dname
, dlen
, sname
, slen
, 1);
387 /* Zero length filename isn't valid... */
393 int udf_put_filename(struct super_block
*sb
, const uint8_t *sname
, int slen
,
394 uint8_t *dname
, int dlen
)
396 return udf_name_to_CS0(sb
, dname
, dlen
, sname
, slen
);