/*
 * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project.
 *
 * Copyright (c) 2001-2006 Anton Altaparmakov
 *
 * This program/include file is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program/include file is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program (in the main directory of the Linux-NTFS
 * distribution in the file COPYING); if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <linux/slab.h>

#include "types.h"
#include "debug.h"
#include "ntfs.h"

/*
 * IMPORTANT
 * =========
 *
 * All these routines assume that the Unicode characters are in little endian
 * encoding inside the strings!!!
 */

/*
 * This is used by the name collation functions to quickly determine what
 * characters are (in)valid.
39 */ 40static const u8 legal_ansi_char_array[0x40] = { 41 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 42 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 43 44 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 45 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 46 47 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17, 48 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00, 49 50 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 51 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18, 52}; 53 54/** 55 * ntfs_are_names_equal - compare two Unicode names for equality 56 * @s1: name to compare to @s2 57 * @s1_len: length in Unicode characters of @s1 58 * @s2: name to compare to @s1 59 * @s2_len: length in Unicode characters of @s2 60 * @ic: ignore case bool 61 * @upcase: upcase table (only if @ic == IGNORE_CASE) 62 * @upcase_size: length in Unicode characters of @upcase (if present) 63 * 64 * Compare the names @s1 and @s2 and return 'true' (1) if the names are 65 * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE, 66 * the @upcase table is used to performa a case insensitive comparison. 
67 */ 68bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, 69 const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic, 70 const ntfschar *upcase, const u32 upcase_size) 71{ 72 if (s1_len != s2_len) 73 return false; 74 if (ic == CASE_SENSITIVE) 75 return !ntfs_ucsncmp(s1, s2, s1_len); 76 return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size); 77} 78 79/** 80 * ntfs_collate_names - collate two Unicode names 81 * @name1: first Unicode name to compare 82 * @name2: second Unicode name to compare 83 * @err_val: if @name1 contains an invalid character return this value 84 * @ic: either CASE_SENSITIVE or IGNORE_CASE 85 * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE) 86 * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE) 87 * 88 * ntfs_collate_names collates two Unicode names and returns: 89 * 90 * -1 if the first name collates before the second one, 91 * 0 if the names match, 92 * 1 if the second name collates before the first one, or 93 * @err_val if an invalid character is found in @name1 during the comparison. 94 * 95 * The following characters are considered invalid: '"', '*', '<', '>' and '?'. 
96 */ 97int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, 98 const ntfschar *name2, const u32 name2_len, 99 const int err_val, const IGNORE_CASE_BOOL ic, 100 const ntfschar *upcase, const u32 upcase_len) 101{ 102 u32 cnt, min_len; 103 u16 c1, c2; 104 105 min_len = name1_len; 106 if (name1_len > name2_len) 107 min_len = name2_len; 108 for (cnt = 0; cnt < min_len; ++cnt) { 109 c1 = le16_to_cpu(*name1++); 110 c2 = le16_to_cpu(*name2++); 111 if (ic) { 112 if (c1 < upcase_len) 113 c1 = le16_to_cpu(upcase[c1]); 114 if (c2 < upcase_len) 115 c2 = le16_to_cpu(upcase[c2]); 116 } 117 if (c1 < 64 && legal_ansi_char_array[c1] & 8) 118 return err_val; 119 if (c1 < c2) 120 return -1; 121 if (c1 > c2) 122 return 1; 123 } 124 if (name1_len < name2_len) 125 return -1; 126 if (name1_len == name2_len) 127 return 0; 128 /* name1_len > name2_len */ 129 c1 = le16_to_cpu(*name1); 130 if (c1 < 64 && legal_ansi_char_array[c1] & 8) 131 return err_val; 132 return 1; 133} 134 135/** 136 * ntfs_ucsncmp - compare two little endian Unicode strings 137 * @s1: first string 138 * @s2: second string 139 * @n: maximum unicode characters to compare 140 * 141 * Compare the first @n characters of the Unicode strings @s1 and @s2, 142 * The strings in little endian format and appropriate le16_to_cpu() 143 * conversion is performed on non-little endian machines. 144 * 145 * The function returns an integer less than, equal to, or greater than zero 146 * if @s1 (or the first @n Unicode characters thereof) is found, respectively, 147 * to be less than, to match, or be greater than @s2. 
148 */ 149int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n) 150{ 151 u16 c1, c2; 152 size_t i; 153 154 for (i = 0; i < n; ++i) { 155 c1 = le16_to_cpu(s1[i]); 156 c2 = le16_to_cpu(s2[i]); 157 if (c1 < c2) 158 return -1; 159 if (c1 > c2) 160 return 1; 161 if (!c1) 162 break; 163 } 164 return 0; 165} 166 167/** 168 * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case 169 * @s1: first string 170 * @s2: second string 171 * @n: maximum unicode characters to compare 172 * @upcase: upcase table 173 * @upcase_size: upcase table size in Unicode characters 174 * 175 * Compare the first @n characters of the Unicode strings @s1 and @s2, 176 * ignoring case. The strings in little endian format and appropriate 177 * le16_to_cpu() conversion is performed on non-little endian machines. 178 * 179 * Each character is uppercased using the @upcase table before the comparison. 180 * 181 * The function returns an integer less than, equal to, or greater than zero 182 * if @s1 (or the first @n Unicode characters thereof) is found, respectively, 183 * to be less than, to match, or be greater than @s2. 
184 */ 185int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, 186 const ntfschar *upcase, const u32 upcase_size) 187{ 188 size_t i; 189 u16 c1, c2; 190 191 for (i = 0; i < n; ++i) { 192 if ((c1 = le16_to_cpu(s1[i])) < upcase_size) 193 c1 = le16_to_cpu(upcase[c1]); 194 if ((c2 = le16_to_cpu(s2[i])) < upcase_size) 195 c2 = le16_to_cpu(upcase[c2]); 196 if (c1 < c2) 197 return -1; 198 if (c1 > c2) 199 return 1; 200 if (!c1) 201 break; 202 } 203 return 0; 204} 205 206void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase, 207 const u32 upcase_len) 208{ 209 u32 i; 210 u16 u; 211 212 for (i = 0; i < name_len; i++) 213 if ((u = le16_to_cpu(name[i])) < upcase_len) 214 name[i] = upcase[u]; 215} 216 217void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, 218 const ntfschar *upcase, const u32 upcase_len) 219{ 220 ntfs_upcase_name((ntfschar*)&file_name_attr->file_name, 221 file_name_attr->file_name_length, upcase, upcase_len); 222} 223 224int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, 225 FILE_NAME_ATTR *file_name_attr2, 226 const int err_val, const IGNORE_CASE_BOOL ic, 227 const ntfschar *upcase, const u32 upcase_len) 228{ 229 return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name, 230 file_name_attr1->file_name_length, 231 (ntfschar*)&file_name_attr2->file_name, 232 file_name_attr2->file_name_length, 233 err_val, ic, upcase, upcase_len); 234} 235 236/** 237 * ntfs_nlstoucs - convert NLS string to little endian Unicode string 238 * @vol: ntfs volume which we are working with 239 * @ins: input NLS string buffer 240 * @ins_len: length of input string in bytes 241 * @outs: on return contains the allocated output Unicode string buffer 242 * 243 * Convert the input string @ins, which is in whatever format the loaded NLS 244 * map dictates, into a little endian, 2-byte Unicode string. 
245 * 246 * This function allocates the string and the caller is responsible for 247 * calling kmem_cache_free(ntfs_name_cache, *@outs); when finished with it. 248 * 249 * On success the function returns the number of Unicode characters written to 250 * the output string *@outs (>= 0), not counting the terminating Unicode NULL 251 * character. *@outs is set to the allocated output string buffer. 252 * 253 * On error, a negative number corresponding to the error code is returned. In 254 * that case the output string is not allocated. Both *@outs and *@outs_len 255 * are then undefined. 256 * 257 * This might look a bit odd due to fast path optimization... 258 */ 259int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, 260 const int ins_len, ntfschar **outs) 261{ 262 struct nls_table *nls = vol->nls_map; 263 ntfschar *ucs; 264 wchar_t wc; 265 int i, o, wc_len; 266 267 /* We do not trust outside sources. */ 268 if (likely(ins)) { 269 ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS); 270 if (likely(ucs)) { 271 for (i = o = 0; i < ins_len; i += wc_len) { 272 wc_len = nls->char2uni(ins + i, ins_len - i, 273 &wc); 274 if (likely(wc_len >= 0 && 275 o < NTFS_MAX_NAME_LEN)) { 276 if (likely(wc)) { 277 ucs[o++] = cpu_to_le16(wc); 278 continue; 279 } /* else if (!wc) */ 280 break; 281 } /* else if (wc_len < 0 || 282 o >= NTFS_MAX_NAME_LEN) */ 283 goto name_err; 284 } 285 ucs[o] = 0; 286 *outs = ucs; 287 return o; 288 } /* else if (!ucs) */ 289 ntfs_error(vol->sb, "Failed to allocate buffer for converted " 290 "name from ntfs_name_cache."); 291 return -ENOMEM; 292 } /* else if (!ins) */ 293 ntfs_error(vol->sb, "Received NULL pointer."); 294 return -EINVAL; 295name_err: 296 kmem_cache_free(ntfs_name_cache, ucs); 297 if (wc_len < 0) { 298 ntfs_error(vol->sb, "Name using character set %s contains " 299 "characters that cannot be converted to " 300 "Unicode.", nls->charset); 301 i = -EILSEQ; 302 } else /* if (o >= NTFS_MAX_NAME_LEN) */ { 303 ntfs_error(vol->sb, "Name is too 
long (maximum length for a " 304 "name on NTFS is %d Unicode characters.", 305 NTFS_MAX_NAME_LEN); 306 i = -ENAMETOOLONG; 307 } 308 return i; 309} 310 311/** 312 * ntfs_ucstonls - convert little endian Unicode string to NLS string 313 * @vol: ntfs volume which we are working with 314 * @ins: input Unicode string buffer 315 * @ins_len: length of input string in Unicode characters 316 * @outs: on return contains the (allocated) output NLS string buffer 317 * @outs_len: length of output string buffer in bytes 318 * 319 * Convert the input little endian, 2-byte Unicode string @ins, of length 320 * @ins_len into the string format dictated by the loaded NLS. 321 * 322 * If *@outs is NULL, this function allocates the string and the caller is 323 * responsible for calling kfree(*@outs); when finished with it. In this case 324 * @outs_len is ignored and can be 0. 325 * 326 * On success the function returns the number of bytes written to the output 327 * string *@outs (>= 0), not counting the terminating NULL byte. If the output 328 * string buffer was allocated, *@outs is set to it. 329 * 330 * On error, a negative number corresponding to the error code is returned. In 331 * that case the output string is not allocated. The contents of *@outs are 332 * then undefined. 333 * 334 * This might look a bit odd due to fast path optimization... 335 */ 336int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, 337 const int ins_len, unsigned char **outs, int outs_len) 338{ 339 struct nls_table *nls = vol->nls_map; 340 unsigned char *ns; 341 int i, o, ns_len, wc; 342 343 /* We don't trust outside sources. 
*/ 344 if (ins) { 345 ns = *outs; 346 ns_len = outs_len; 347 if (ns && !ns_len) { 348 wc = -ENAMETOOLONG; 349 goto conversion_err; 350 } 351 if (!ns) { 352 ns_len = ins_len * NLS_MAX_CHARSET_SIZE; 353 ns = kmalloc(ns_len + 1, GFP_NOFS); 354 if (!ns) 355 goto mem_err_out; 356 } 357 for (i = o = 0; i < ins_len; i++) { 358retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o, 359 ns_len - o); 360 if (wc > 0) { 361 o += wc; 362 continue; 363 } else if (!wc) 364 break; 365 else if (wc == -ENAMETOOLONG && ns != *outs) { 366 unsigned char *tc; 367 /* Grow in multiples of 64 bytes. */ 368 tc = kmalloc((ns_len + 64) & 369 ~63, GFP_NOFS); 370 if (tc) { 371 memcpy(tc, ns, ns_len); 372 ns_len = ((ns_len + 64) & ~63) - 1; 373 kfree(ns); 374 ns = tc; 375 goto retry; 376 } /* No memory so goto conversion_error; */ 377 } /* wc < 0, real error. */ 378 goto conversion_err; 379 } 380 ns[o] = 0; 381 *outs = ns; 382 return o; 383 } /* else (!ins) */ 384 ntfs_error(vol->sb, "Received NULL pointer."); 385 return -EINVAL; 386conversion_err: 387 ntfs_error(vol->sb, "Unicode name contains characters that cannot be " 388 "converted to character set %s. You might want to " 389 "try to use the mount option nls=utf8.", nls->charset); 390 if (ns != *outs) 391 kfree(ns); 392 if (wc != -ENAMETOOLONG) 393 wc = -EILSEQ; 394 return wc; 395mem_err_out: 396 ntfs_error(vol->sb, "Failed to allocate name!"); 397 return -ENOMEM; 398} 399