1/* 2 * unicode.c 3 * 4 * PURPOSE 5 * Routines for converting between UTF-8 and OSTA Compressed Unicode. 6 * Also handles filename mangling 7 * 8 * DESCRIPTION 9 * OSTA Compressed Unicode is explained in the OSTA UDF specification. 10 * http://www.osta.org/ 11 * UTF-8 is explained in the IETF RFC XXXX. 12 * ftp://ftp.internic.net/rfc/rfcxxxx.txt 13 * 14 * COPYRIGHT 15 * This file is distributed under the terms of the GNU General Public 16 * License (GPL). Copies of the GPL can be obtained from: 17 * ftp://prep.ai.mit.edu/pub/gnu/GPL 18 * Each contributing author retains all rights to their own work. 19 */ 20 21#include "udfdecl.h" 22 23#include <linux/kernel.h> 24#include <linux/string.h> /* for memset */ 25#include <linux/nls.h> 26#include <linux/crc-itu-t.h> 27#include <linux/slab.h> 28 29#include "udf_sb.h" 30 31static int udf_translate_to_linux(uint8_t *, int, uint8_t *, int, uint8_t *, 32 int); 33 34static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen) 35{ 36 if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2)) 37 return 0; 38 39 memset(dest, 0, sizeof(struct ustr)); 40 memcpy(dest->u_name, src, strlen); 41 dest->u_cmpID = 0x08; 42 dest->u_len = strlen; 43 44 return strlen; 45} 46 47/* 48 * udf_build_ustr 49 */ 50int udf_build_ustr(struct ustr *dest, dstring *ptr, int size) 51{ 52 int usesize; 53 54 if (!dest || !ptr || !size) 55 return -1; 56 BUG_ON(size < 2); 57 58 usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name)); 59 usesize = min(usesize, size - 2); 60 dest->u_cmpID = ptr[0]; 61 dest->u_len = usesize; 62 memcpy(dest->u_name, ptr + 1, usesize); 63 memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize); 64 65 return 0; 66} 67 68/* 69 * udf_build_ustr_exact 70 */ 71static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize) 72{ 73 if ((!dest) || (!ptr) || (!exactsize)) 74 return -1; 75 76 memset(dest, 0, sizeof(struct ustr)); 77 dest->u_cmpID = ptr[0]; 78 dest->u_len = exactsize - 1; 79 memcpy(dest->u_name, ptr + 1, exactsize - 1); 80 81 return 0; 82} 83 84/* 85 * udf_ocu_to_utf8 86 * 87 * PURPOSE 88 * Convert OSTA Compressed Unicode to the UTF-8 equivalent. 89 * 90 * PRE-CONDITIONS 91 * utf Pointer to UTF-8 output buffer. 92 * ocu Pointer to OSTA Compressed Unicode input buffer 93 * of size UDF_NAME_LEN bytes. 94 * both of type "struct ustr *" 95 * 96 * POST-CONDITIONS 97 * <return> Zero on success. 98 * 99 * HISTORY 100 * November 12, 1997 - Andrew E. Mileski 101 * Written, tested, and released. 102 */ 103int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) 104{ 105 const uint8_t *ocu; 106 uint8_t cmp_id, ocu_len; 107 int i; 108 109 ocu_len = ocu_i->u_len; 110 if (ocu_len == 0) { 111 memset(utf_o, 0, sizeof(struct ustr)); 112 return 0; 113 } 114 115 cmp_id = ocu_i->u_cmpID; 116 if (cmp_id != 8 && cmp_id != 16) { 117 memset(utf_o, 0, sizeof(struct ustr)); 118 pr_err("unknown compression code (%d) stri=%s\n", 119 cmp_id, ocu_i->u_name); 120 return 0; 121 } 122 123 ocu = ocu_i->u_name; 124 utf_o->u_len = 0; 125 for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) { 126 127 /* Expand OSTA compressed Unicode to Unicode */ 128 uint32_t c = ocu[i++]; 129 if (cmp_id == 16) 130 c = (c << 8) | ocu[i++]; 131 132 /* Compress Unicode to UTF-8 */ 133 if (c < 0x80U) 134 utf_o->u_name[utf_o->u_len++] = (uint8_t)c; 135 else if (c < 0x800U) { 136 if (utf_o->u_len > (UDF_NAME_LEN - 4)) 137 break; 138 utf_o->u_name[utf_o->u_len++] = 139 (uint8_t)(0xc0 | (c >> 6)); 140 utf_o->u_name[utf_o->u_len++] = 141 (uint8_t)(0x80 | (c & 0x3f)); 142 } else { 143 if (utf_o->u_len > (UDF_NAME_LEN - 5)) 144 break; 145 utf_o->u_name[utf_o->u_len++] = 146 (uint8_t)(0xe0 | (c >> 12)); 147 utf_o->u_name[utf_o->u_len++] = 148 (uint8_t)(0x80 | 149 ((c >> 6) & 0x3f)); 150 utf_o->u_name[utf_o->u_len++] = 151 (uint8_t)(0x80 | (c & 0x3f)); 152 } 153 } 154 utf_o->u_cmpID = 8; 155 156 return utf_o->u_len; 157} 158 159/* 160 * 161 * udf_utf8_to_ocu 162 * 163 * PURPOSE 164 * Convert UTF-8 to the OSTA Compressed Unicode equivalent. 165 * 166 * DESCRIPTION 167 * This routine is only called by udf_lookup(). 168 * 169 * PRE-CONDITIONS 170 * ocu Pointer to OSTA Compressed Unicode output 171 * buffer of size UDF_NAME_LEN bytes. 172 * utf Pointer to UTF-8 input buffer. 173 * utf_len Length of UTF-8 input buffer in bytes. 174 * 175 * POST-CONDITIONS 176 * <return> Zero on success. 177 * 178 * HISTORY 179 * November 12, 1997 - Andrew E. Mileski 180 * Written, tested, and released. 181 */ 182static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length) 183{ 184 unsigned c, i, max_val, utf_char; 185 int utf_cnt, u_len, u_ch; 186 187 memset(ocu, 0, sizeof(dstring) * length); 188 ocu[0] = 8; 189 max_val = 0xffU; 190 u_ch = 1; 191 192try_again: 193 u_len = 0U; 194 utf_char = 0U; 195 utf_cnt = 0U; 196 for (i = 0U; i < utf->u_len; i++) { 197 /* Name didn't fit? */ 198 if (u_len + 1 + u_ch >= length) 199 return 0; 200 201 c = (uint8_t)utf->u_name[i]; 202 203 /* Complete a multi-byte UTF-8 character */ 204 if (utf_cnt) { 205 utf_char = (utf_char << 6) | (c & 0x3fU); 206 if (--utf_cnt) 207 continue; 208 } else { 209 /* Check for a multi-byte UTF-8 character */ 210 if (c & 0x80U) { 211 /* Start a multi-byte UTF-8 character */ 212 if ((c & 0xe0U) == 0xc0U) { 213 utf_char = c & 0x1fU; 214 utf_cnt = 1; 215 } else if ((c & 0xf0U) == 0xe0U) { 216 utf_char = c & 0x0fU; 217 utf_cnt = 2; 218 } else if ((c & 0xf8U) == 0xf0U) { 219 utf_char = c & 0x07U; 220 utf_cnt = 3; 221 } else if ((c & 0xfcU) == 0xf8U) { 222 utf_char = c & 0x03U; 223 utf_cnt = 4; 224 } else if ((c & 0xfeU) == 0xfcU) { 225 utf_char = c & 0x01U; 226 utf_cnt = 5; 227 } else { 228 goto error_out; 229 } 230 continue; 231 } else { 232 /* Single byte UTF-8 character (most common) */ 233 utf_char = c; 234 } 235 } 236 237 /* Choose no compression if necessary */ 238 if (utf_char > max_val) { 239 if (max_val == 0xffU) { 240 max_val = 0xffffU; 241 ocu[0] = (uint8_t)0x10U; 242 u_ch = 2; 243 goto try_again; 244 } 245 goto error_out; 246 } 247 248 if (max_val == 0xffffU) 249 ocu[++u_len] = (uint8_t)(utf_char >> 8); 250 ocu[++u_len] = (uint8_t)(utf_char & 0xffU); 251 } 252 253 if (utf_cnt) { 254error_out: 255 ocu[++u_len] = '?'; 256 printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n")); 257 } 258 259 ocu[length - 1] = (uint8_t)u_len + 1; 260 261 return u_len + 1; 262} 263 264static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, 265 const struct ustr *ocu_i) 266{ 267 const uint8_t *ocu; 268 uint8_t cmp_id, ocu_len; 269 int i, len; 270 271 272 ocu_len = ocu_i->u_len; 273 if (ocu_len == 0) { 274 memset(utf_o, 0, sizeof(struct ustr)); 275 return 0; 276 } 277 278 cmp_id = ocu_i->u_cmpID; 279 if (cmp_id != 8 && cmp_id != 16) { 280 memset(utf_o, 0, sizeof(struct ustr)); 281 pr_err("unknown compression code (%d) stri=%s\n", 282 cmp_id, ocu_i->u_name); 283 return 0; 284 } 285 286 ocu = ocu_i->u_name; 287 utf_o->u_len = 0; 288 for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) { 289 /* Expand OSTA compressed Unicode to Unicode */ 290 uint32_t c = ocu[i++]; 291 if (cmp_id == 16) 292 c = (c << 8) | ocu[i++]; 293 294 len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len], 295 UDF_NAME_LEN - 2 - utf_o->u_len); 296 /* Valid character? */ 297 if (len >= 0) 298 utf_o->u_len += len; 299 else 300 utf_o->u_name[utf_o->u_len++] = '?'; 301 } 302 utf_o->u_cmpID = 8; 303 304 return utf_o->u_len; 305} 306 307static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, 308 int length) 309{ 310 int len; 311 unsigned i, max_val; 312 uint16_t uni_char; 313 int u_len, u_ch; 314 315 memset(ocu, 0, sizeof(dstring) * length); 316 ocu[0] = 8; 317 max_val = 0xffU; 318 u_ch = 1; 319 320try_again: 321 u_len = 0U; 322 for (i = 0U; i < uni->u_len; i++) { 323 /* Name didn't fit? */ 324 if (u_len + 1 + u_ch >= length) 325 return 0; 326 len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char); 327 if (!len) 328 continue; 329 /* Invalid character, deal with it */ 330 if (len < 0) { 331 len = 1; 332 uni_char = '?'; 333 } 334 335 if (uni_char > max_val) { 336 max_val = 0xffffU; 337 ocu[0] = (uint8_t)0x10U; 338 u_ch = 2; 339 goto try_again; 340 } 341 342 if (max_val == 0xffffU) 343 ocu[++u_len] = (uint8_t)(uni_char >> 8); 344 ocu[++u_len] = (uint8_t)(uni_char & 0xffU); 345 i += len - 1; 346 } 347 348 ocu[length - 1] = (uint8_t)u_len + 1; 349 return u_len + 1; 350} 351 352int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen, 353 uint8_t *dname, int dlen) 354{ 355 struct ustr *filename, *unifilename; 356 int len = 0; 357 358 filename = kmalloc(sizeof(struct ustr), GFP_NOFS); 359 if (!filename) 360 return 0; 361 362 unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS); 363 if (!unifilename) 364 goto out1; 365 366 if (udf_build_ustr_exact(unifilename, sname, slen)) 367 goto out2; 368 369 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { 370 if (!udf_CS0toUTF8(filename, unifilename)) { 371 udf_debug("Failed in udf_get_filename: sname = %s\n", 372 sname); 373 goto out2; 374 } 375 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { 376 if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename, 377 unifilename)) { 378 udf_debug("Failed in udf_get_filename: sname = %s\n", 379 sname); 380 goto out2; 381 } 382 } else 383 goto out2; 384 385 len = udf_translate_to_linux(dname, dlen, 386 filename->u_name, filename->u_len, 387 unifilename->u_name, unifilename->u_len); 388out2: 389 kfree(unifilename); 390out1: 391 kfree(filename); 392 return len; 393} 394 395int udf_put_filename(struct super_block *sb, const uint8_t *sname, 396 uint8_t *dname, int flen) 397{ 398 struct ustr unifilename; 399 int namelen; 400 401 if (!udf_char_to_ustr(&unifilename, sname, flen)) 402 return 0; 403 404 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { 405 namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN); 406 if (!namelen) 407 return 0; 408 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { 409 namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname, 410 &unifilename, UDF_NAME_LEN); 411 if (!namelen) 412 return 0; 413 } else 414 return 0; 415 416 return namelen; 417} 418 419#define ILLEGAL_CHAR_MARK '_' 420#define EXT_MARK '.' 421#define CRC_MARK '#' 422#define EXT_SIZE 5 423/* Number of chars we need to store generated CRC to make filename unique */ 424#define CRC_LEN 5 425 426static int udf_translate_to_linux(uint8_t *newName, int newLen, 427 uint8_t *udfName, int udfLen, 428 uint8_t *fidName, int fidNameLen) 429{ 430 int index, newIndex = 0, needsCRC = 0; 431 int extIndex = 0, newExtIndex = 0, hasExt = 0; 432 unsigned short valueCRC; 433 uint8_t curr; 434 435 if (udfName[0] == '.' && 436 (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) { 437 needsCRC = 1; 438 newIndex = udfLen; 439 memcpy(newName, udfName, udfLen); 440 } else { 441 for (index = 0; index < udfLen; index++) { 442 curr = udfName[index]; 443 if (curr == '/' || curr == 0) { 444 needsCRC = 1; 445 curr = ILLEGAL_CHAR_MARK; 446 while (index + 1 < udfLen && 447 (udfName[index + 1] == '/' || 448 udfName[index + 1] == 0)) 449 index++; 450 } 451 if (curr == EXT_MARK && 452 (udfLen - index - 1) <= EXT_SIZE) { 453 if (udfLen == index + 1) 454 hasExt = 0; 455 else { 456 hasExt = 1; 457 extIndex = index; 458 newExtIndex = newIndex; 459 } 460 } 461 if (newIndex < newLen) 462 newName[newIndex++] = curr; 463 else 464 needsCRC = 1; 465 } 466 } 467 if (needsCRC) { 468 uint8_t ext[EXT_SIZE]; 469 int localExtIndex = 0; 470 471 if (hasExt) { 472 int maxFilenameLen; 473 for (index = 0; 474 index < EXT_SIZE && extIndex + index + 1 < udfLen; 475 index++) { 476 curr = udfName[extIndex + index + 1]; 477 478 if (curr == '/' || curr == 0) { 479 needsCRC = 1; 480 curr = ILLEGAL_CHAR_MARK; 481 while (extIndex + index + 2 < udfLen && 482 (index + 1 < EXT_SIZE && 483 (udfName[extIndex + index + 2] == '/' || 484 udfName[extIndex + index + 2] == 0))) 485 index++; 486 } 487 ext[localExtIndex++] = curr; 488 } 489 maxFilenameLen = newLen - CRC_LEN - localExtIndex; 490 if (newIndex > maxFilenameLen) 491 newIndex = maxFilenameLen; 492 else 493 newIndex = newExtIndex; 494 } else if (newIndex > newLen - CRC_LEN) 495 newIndex = newLen - CRC_LEN; 496 newName[newIndex++] = CRC_MARK; 497 valueCRC = crc_itu_t(0, fidName, fidNameLen); 498 newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8); 499 newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8); 500 newName[newIndex++] = hex_asc_upper_hi(valueCRC); 501 newName[newIndex++] = hex_asc_upper_lo(valueCRC); 502 503 if (hasExt) { 504 newName[newIndex++] = EXT_MARK; 505 for (index = 0; index < localExtIndex; index++) 506 newName[newIndex++] = ext[index]; 507 } 508 } 509 510 return newIndex; 511} 512