5 * Routines for converting between UTF-8 and OSTA Compressed Unicode.
6 * Also handles filename mangling
9 * OSTA Compressed Unicode is explained in the OSTA UDF specification.
10 * http://www.osta.org/
11 * UTF-8 is explained in IETF RFC 3629.
12 * https://www.rfc-editor.org/rfc/rfc3629.txt
15 * This file is distributed under the terms of the GNU General Public
16 * License (GPL). Copies of the GPL can be obtained from:
17 * ftp://prep.ai.mit.edu/pub/gnu/GPL
18 * Each contributing author retains all rights to their own work.
23 #include <linux/kernel.h>
24 #include <linux/string.h> /* for memset */
25 #include <linux/nls.h>
26 #include <linux/crc-itu-t.h>
27 #include <linux/slab.h>
/* Constants for Unicode range checks and UTF-16 surrogate handling. */
/* Subtracted from a code point before it is split into a surrogate pair. */
31 #define PLANE_SIZE 0x10000
/* Largest code point accepted by the conversion routines. */
32 #define UNICODE_MAX 0x10ffff
/*
 * (c & SURROGATE_MASK) == SURROGATE_PAIR holds exactly for 0xd800..0xdfff,
 * i.e. when c is a UTF-16 surrogate code unit.
 */
33 #define SURROGATE_MASK 0xfffff800
34 #define SURROGATE_PAIR 0x0000d800
/* Bit that is set in a low (second) surrogate and clear in a high one. */
35 #define SURROGATE_LOW 0x00000400
/* Each surrogate carries 10 payload bits. */
36 #define SURROGATE_CHAR_BITS 10
37 #define SURROGATE_CHAR_MASK ((1 << SURROGATE_CHAR_BITS) - 1)
/* Replacement written in place of characters that cannot be converted. */
39 #define ILLEGAL_CHAR_MARK '_'
43 /* Number of chars we need to store generated CRC to make filename unique */
/*
 * Decode one character from the OSTA compressed Unicode buffer str_i,
 * starting at byte offset str_i_idx, and return the number of input bytes
 * consumed (str_i_idx - start_idx).
 *
 * A unit in the surrogate range is combined with the following 16-bit unit
 * into one value built from the 10 payload bits of each half.
 *
 * NOTE(review): this listing is missing lines (local declarations, braces,
 * the bodies of the error branches and the store of the result). Presumably
 * the decoded character is delivered through *ret - confirm against the
 * complete file. Also note the declared return type is unicode_t although
 * the visible return statement yields a byte count.
 */
46 static unicode_t
get_utf16_char(const uint8_t *str_i
, int str_i_max_len
,
47 int str_i_idx
, int u_ch
, unicode_t
*ret
)
/* Remember where we started so the consumed length can be returned. */
50 int start_idx
= str_i_idx
;
52 /* Expand OSTA compressed Unicode to Unicode */
53 c
= str_i
[str_i_idx
++];
/*
 * Second byte of a unit: the first byte read becomes the high byte.
 * NOTE(review): the guard on this statement is not visible here; presumably
 * it only runs for 2-byte units - confirm.
 */
55 c
= (c
<< 8) | str_i
[str_i_idx
++];
/* True exactly when c lies in 0xd800..0xdfff (a surrogate code unit). */
56 if ((c
& SURROGATE_MASK
) == SURROGATE_PAIR
) {
59 /* Trailing surrogate char */
/* No input left for the second half of the pair (branch body not shown). */
60 if (str_i_idx
>= str_i_max_len
) {
65 /* Low surrogate must follow the high one... */
/* The first unit already has the low-surrogate bit set (body not shown). */
66 if (c
& SURROGATE_LOW
) {
/* Surrogates are only expected with 2-byte units. */
71 WARN_ON_ONCE(u_ch
!= 2);
/* Read the next 16-bit unit, high byte first. */
72 next
= str_i
[str_i_idx
++] << 8;
73 next
|= str_i
[str_i_idx
++];
/* The second unit must be a surrogate with the low bit set. */
74 if ((next
& SURROGATE_MASK
) != SURROGATE_PAIR
||
75 !(next
& SURROGATE_LOW
)) {
/*
 * Combine the payload bits: high surrogate supplies the upper 10 bits,
 * low surrogate the lower 10 bits. The start of this expression is on a
 * line not present in this listing.
 */
81 ((c
& SURROGATE_CHAR_MASK
) << SURROGATE_CHAR_BITS
) +
82 (next
& SURROGATE_CHAR_MASK
);
/* Number of input bytes consumed. */
86 return str_i_idx
- start_idx
;
/*
 * Convert characters from the CS0 input str_i into the output buffer str_o.
 * The loop runs until one character has been produced (gotch) or the input
 * offset *str_i_idx reaches str_i_max_len; *str_o_idx is advanced as output
 * bytes are written.
 *
 * Characters that are 0, above UNICODE_MAX, above MAX_WCHAR_T while conv_f
 * is set, or '/' while translate is set are treated as unconvertible.
 * The replacement used is ILLEGAL_CHAR_MARK.
 *
 * NOTE(review): the listing omits some parameters (str_o_idx, str_i_idx and
 * translate are used but their declarations are not shown), the local
 * declarations, the return statements and the handling of needsCRC.
 */
90 static int udf_name_conv_char(uint8_t *str_o
, int str_o_max_len
,
92 const uint8_t *str_i
, int str_i_max_len
,
94 int u_ch
, int *needsCRC
,
95 int (*conv_f
)(wchar_t, unsigned char *, int),
102 while (!gotch
&& *str_i_idx
< str_i_max_len
) {
/* Output buffer already full (branch body not shown). */
103 if (*str_o_idx
>= str_o_max_len
) {
/* Decode the next input character; len is the number of bytes it used. */
108 len
= get_utf16_char(str_i
, str_i_max_len
, *str_i_idx
, u_ch
,
110 /* These chars cannot be converted. Replace them. */
111 if (c
== 0 || c
> UNICODE_MAX
|| (conv_f
&& c
> MAX_WCHAR_T
) ||
112 (translate
&& c
== '/')) {
124 c
= ILLEGAL_CHAR_MARK
;
/*
 * Encode c into the remaining space of str_o. Two encoders are visible:
 * the conv_f callback and utf32_to_utf8().
 * NOTE(review): the if/else choosing between them is not shown; presumably
 * conv_f is used when it is non-NULL - confirm.
 */
129 len
= conv_f(c
, &str_o
[*str_o_idx
],
130 str_o_max_len
- *str_o_idx
);
132 len
= utf32_to_utf8(c
, &str_o
[*str_o_idx
],
133 str_o_max_len
- *str_o_idx
);
137 /* Valid character? */
/* Encoder reported -ENAMETOOLONG (branch body not shown). */
140 else if (len
== -ENAMETOOLONG
) {
/* Emit the one-byte replacement mark instead of the character. */
144 str_o
[(*str_o_idx
)++] = ILLEGAL_CHAR_MARK
;
/*
 * Convert a CS0 (OSTA compressed Unicode) name ocu of ocu_len bytes into the
 * output buffer str_o of at most str_max_len bytes.
 *
 * Visible steps:
 *  - choose the NLS uni2char callback when UDF_FLAG_NLS_MAP is set;
 *  - reject compression IDs other than 8 and 16;
 *  - reject an input length that is not a multiple of the unit size u_ch;
 *  - scan backwards over at most EXT_SIZE characters looking for an
 *    extension and convert it into the local buffer ext;
 *  - convert the name itself via udf_name_conv_char();
 *  - build a CRC of the input with crc_itu_t(), render it as hex digits,
 *    and copy CRC and extension behind the converted name.
 *
 * NOTE(review): this listing is missing lines, including the final
 * parameter (translate is used but not declared here), the declarations of
 * cmp_id, u_ch, idx, c, len and needsCRC, all return statements and several
 * conditions. Comments below describe only what is visible.
 */
151 static int udf_name_from_CS0(struct super_block
*sb
,
152 uint8_t *str_o
, int str_max_len
,
153 const uint8_t *ocu
, int ocu_len
,
161 int ext_i_len
, ext_max_len
;
162 int str_o_len
= 0; /* Length of resulting output */
163 int ext_o_len
= 0; /* Extension output length */
164 int ext_crc_len
= 0; /* Extension output length if used with CRC */
165 int i_ext
= -1; /* Extension position in input buffer */
166 int o_crc
= 0; /* Rightmost possible output pos for CRC+ext */
167 unsigned short valueCRC
;
168 uint8_t ext
[EXT_SIZE
* NLS_MAX_CHARSET_SIZE
+ 1];
169 uint8_t crc
[CRC_LEN
];
170 int (*conv_f
)(wchar_t, unsigned char *, int);
/* Nothing can be stored in a non-positive sized buffer (action not shown). */
172 if (str_max_len
<= 0)
/* NOTE(review): the condition guarding this clear is not visible here. */
176 memset(str_o
, 0, str_max_len
);
/* Use the mount's NLS table for output when the NLS map flag is set. */
180 if (UDF_QUERY_FLAG(sb
, UDF_FLAG_NLS_MAP
))
181 conv_f
= UDF_SB(sb
)->s_nls_map
->uni2char
;
/* Only compression IDs 8 and 16 are accepted. */
186 if (cmp_id
!= 8 && cmp_id
!= 16) {
187 memset(str_o
, 0, str_max_len
);
188 pr_err("unknown compression code (%u)\n", cmp_id
);
/* The input must consist of whole u_ch-byte units. */
196 if (ocu_len
% u_ch
) {
197 pr_err("incorrect filename length (%d)\n", ocu_len
+ 1);
202 /* Look for extension */
/* Walk backwards from the last unit, at most EXT_SIZE units. */
203 for (idx
= ocu_len
- u_ch
, ext_i_len
= 0;
204 (idx
>= 0) && (ext_i_len
< EXT_SIZE
);
205 idx
-= u_ch
, ext_i_len
++) {
/* Second byte of a unit; the byte read first is the high byte. */
208 c
= (c
<< 8) | ocu
[idx
+ 1];
217 /* Convert extension */
/* The extension may use neither more than ext[] nor more than the output. */
218 ext_max_len
= min_t(int, sizeof(ext
), str_max_len
);
/* The converted extension starts with the extension marker. */
219 ext
[ext_o_len
++] = EXT_MARK
;
221 while (udf_name_conv_char(ext
, ext_max_len
, &ext_o_len
,
224 conv_f
, translate
)) {
/* Track the extension length that still leaves room for the CRC. */
225 if ((ext_o_len
+ CRC_LEN
) < str_max_len
)
226 ext_crc_len
= ext_o_len
;
/* Reached the extension position while translating. */
233 if (translate
&& (idx
== i_ext
)) {
/* Converted name leaves no room for the extension (action not shown). */
234 if (str_o_len
> (str_max_len
- ext_o_len
))
239 if (!udf_name_conv_char(str_o
, str_max_len
, &str_o_len
,
241 u_ch
, &needsCRC
, conv_f
, translate
))
/*
 * Tail of a condition checking that CRC and extension still fit behind the
 * current output position; the statement it guards is not shown.
 */
245 (str_o_len
<= (str_max_len
- ext_o_len
- CRC_LEN
)))
/* Result is exactly "." or ".." (action not shown). */
250 if (str_o_len
<= 2 && str_o
[0] == '.' &&
251 (str_o_len
== 1 || str_o
[1] == '.'))
/* CRC over the input name bytes. */
255 valueCRC
= crc_itu_t(0, ocu
, ocu_len
);
/*
 * crc[1..4] hold the 16-bit CRC as four hex digits, high byte first.
 * The assignment of crc[0] is on a line not present in this listing.
 */
257 crc
[1] = hex_asc_upper_hi(valueCRC
>> 8);
258 crc
[2] = hex_asc_upper_lo(valueCRC
>> 8);
259 crc
[3] = hex_asc_upper_hi(valueCRC
);
260 crc
[4] = hex_asc_upper_lo(valueCRC
);
/* Copy as much of the CRC as fits into the remaining output space. */
261 len
= min_t(int, CRC_LEN
, str_max_len
- str_o_len
);
262 memcpy(&str_o
[str_o_len
], crc
, len
);
/* Switch to the extension length recorded for use together with a CRC. */
264 ext_o_len
= ext_crc_len
;
/* Append the converted extension to the output. */
267 memcpy(&str_o
[str_o_len
], ext
, ext_o_len
);
268 str_o_len
+= ext_o_len
;
/*
 * Encode the name str_i (str_len bytes) as CS0 into ocu, which can hold
 * ocu_max_len bytes.
 *
 * Each input character is decoded with the NLS char2uni callback or with
 * utf8_to_utf32(), then written as one byte (max_val == 0xff) or as two
 * bytes, high byte first (max_val == 0xffff). A character above max_val in
 * 16-bit mode is written as a UTF-16 surrogate pair.
 *
 * NOTE(review): this listing is missing lines, including the declarations
 * of u_ch, u_len, i, len, c, wchar and uni_char, the initialisation of
 * max_val, every return statement and the bodies of several branches.
 */
275 static int udf_name_to_CS0(struct super_block
*sb
,
276 uint8_t *ocu
, int ocu_max_len
,
277 const uint8_t *str_i
, int str_len
)
280 unsigned int max_val
;
283 int (*conv_f
)(const unsigned char *, int, wchar_t *);
/* No room for any output (action not shown). */
285 if (ocu_max_len
<= 0)
/* Use the mount's NLS table for input when the NLS map flag is set. */
288 if (UDF_QUERY_FLAG(sb
, UDF_FLAG_NLS_MAP
))
289 conv_f
= UDF_SB(sb
)->s_nls_map
->char2uni
;
/* Start from an all-zero output buffer. */
293 memset(ocu
, 0, ocu_max_len
);
/* len is the number of input bytes the current character occupied. */
300 for (i
= 0; i
< str_len
; i
+= len
) {
301 /* Name didn't fit? */
302 if (u_len
+ u_ch
> ocu_max_len
)
/*
 * Two decoders are visible; the if/else selecting between them is not
 * shown (presumably conv_f is used when it is non-NULL - confirm).
 */
307 len
= conv_f(&str_i
[i
], str_len
- i
, &wchar
);
311 len
= utf8_to_utf32(&str_i
[i
], str_len
- i
,
314 /* Invalid character, deal with it */
315 if (len
<= 0 || uni_char
> UNICODE_MAX
) {
/* Character does not fit into a single unit of the current width. */
320 if (uni_char
> max_val
) {
/* Still in 8-bit mode (branch body not shown). */
323 if (max_val
== 0xff) {
330 * Use UTF-16 encoding for chars outside we
331 * cannot encode directly.
/* A surrogate pair needs two units of output space. */
333 if (u_len
+ 2 * u_ch
> ocu_max_len
)
336 uni_char
-= PLANE_SIZE
;
/*
 * High surrogate: upper 10 bits of the reduced value. The start of this
 * expression is on a line not present in this listing.
 */
338 ((uni_char
>> SURROGATE_CHAR_BITS
) &
339 SURROGATE_CHAR_MASK
);
/* Store the high surrogate, high byte first. */
340 ocu
[u_len
++] = (uint8_t)(c
>> 8);
341 ocu
[u_len
++] = (uint8_t)(c
& 0xff);
/* Low surrogate: lower 10 bits; it is stored by the common code below. */
342 uni_char
= SURROGATE_PAIR
| SURROGATE_LOW
|
343 (uni_char
& SURROGATE_CHAR_MASK
);
/* In 16-bit mode the high byte is written first; 8-bit mode has only one. */
346 if (max_val
== 0xffff)
347 ocu
[u_len
++] = (uint8_t)(uni_char
>> 8);
348 ocu
[u_len
++] = (uint8_t)(uni_char
& 0xff);
355 * Convert a CS0 dstring to the output charset. Warning: this function may
356 * truncate the input string if it is too long, because it is used for
357 * informational strings only and truncating is better than refusing to mount the medium.
/*
 * Convert the dstring ocu_i (field size i_len bytes) into utf_o (o_len
 * bytes) by calling udf_name_from_CS0() with a final argument of 0.
 *
 * The string length s_len is read from the last byte of the field. A length
 * that is not smaller than the field size is reported with pr_warn().
 *
 * NOTE(review): this listing is missing lines (braces, the declaration of
 * s_len, the assignment that shortens s_len after the warning and the
 * condition guarding the rounding step).
 */
359 int udf_dstrCS0toChar(struct super_block
*sb
, uint8_t *utf_o
, int o_len
,
360 const uint8_t *ocu_i
, int i_len
)
/* The last byte of the field holds the string length. */
365 s_len
= ocu_i
[i_len
- 1];
366 if (s_len
>= i_len
) {
367 pr_warn("incorrect dstring lengths (%d/%d),"
368 " truncating\n", s_len
, i_len
);
370 /* 2-byte encoding? Need to round properly... */
/*
 * NOTE(review): the mask is 2, so this subtracts 2 when bit 1 of
 * (s_len - 1) is set. If the aim is an even number of payload bytes, a mask
 * of 1 would be expected - confirm the intent.
 */
372 s_len
-= (s_len
- 1) & 2;
376 return udf_name_from_CS0(sb
, utf_o
, o_len
, ocu_i
, s_len
, 0);
/*
 * Convert the CS0 name sname (slen bytes) into dname (dlen bytes) by calling
 * udf_name_from_CS0() with a final argument of 1.
 *
 * NOTE(review): this listing is missing lines (braces, the declaration of
 * ret, any argument checks, the handling of a zero-length result and the
 * return statement).
 */
379 int udf_get_filename(struct super_block
*sb
, const uint8_t *sname
, int slen
,
380 uint8_t *dname
, int dlen
)
390 ret
= udf_name_from_CS0(sb
, dname
, dlen
, sname
, slen
, 1);
391 /* Zero length filename isn't valid... */
/*
 * Encode the name sname (slen bytes) as CS0 into dname (dlen bytes).
 * Returns whatever udf_name_to_CS0() returns. Note the argument order: the
 * destination buffer and its size are passed before the source.
 * The function braces are not present in this listing.
 */
397 int udf_put_filename(struct super_block
*sb
, const uint8_t *sname
, int slen
,
398 uint8_t *dname
, int dlen
)
400 return udf_name_to_CS0(sb
, dname
, dlen
, sname
, slen
);