1/* 2 * Branch/Call/Jump (BCJ) filter decoders 3 * 4 * Authors: Lasse Collin <lasse.collin@tukaani.org> 5 * Igor Pavlov <http://7-zip.org/> 6 * 7 * This file has been put into the public domain. 8 * You can do whatever you want with this file. 9 */ 10 11#include "xz_private.h" 12 13/* 14 * The rest of the file is inside this ifdef. It makes things a little more 15 * convenient when building without support for any BCJ filters. 16 */ 17#ifdef XZ_DEC_BCJ 18 19struct xz_dec_bcj { 20 /* Type of the BCJ filter being used */ 21 enum { 22 BCJ_X86 = 4, /* x86 or x86-64 */ 23 BCJ_POWERPC = 5, /* Big endian only */ 24 BCJ_IA64 = 6, /* Big or little endian */ 25 BCJ_ARM = 7, /* Little endian only */ 26 BCJ_ARMTHUMB = 8, /* Little endian only */ 27 BCJ_SPARC = 9 /* Big or little endian */ 28 } type; 29 30 /* 31 * Return value of the next filter in the chain. We need to preserve 32 * this information across calls, because we must not call the next 33 * filter anymore once it has returned XZ_STREAM_END. 34 */ 35 enum xz_ret ret; 36 37 /* True if we are operating in single-call mode. */ 38 bool single_call; 39 40 /* 41 * Absolute position relative to the beginning of the uncompressed 42 * data (in a single .xz Block). We care only about the lowest 32 43 * bits so this doesn't need to be uint64_t even with big files. 44 */ 45 uint32_t pos; 46 47 /* x86 filter state */ 48 uint32_t x86_prev_mask; 49 50 /* Temporary space to hold the variables from struct xz_buf */ 51 uint8_t *out; 52 size_t out_pos; 53 size_t out_size; 54 55 struct { 56 /* Amount of already filtered data in the beginning of buf */ 57 size_t filtered; 58 59 /* Total amount of data currently stored in buf */ 60 size_t size; 61 62 /* 63 * Buffer to hold a mix of filtered and unfiltered data. This 64 * needs to be big enough to hold Alignment + 2 * Look-ahead: 65 * 66 * Type Alignment Look-ahead 67 * x86 1 4 68 * PowerPC 4 0 69 * IA-64 16 0 70 * ARM 4 0 71 * ARM-Thumb 2 2 72 * SPARC 4 0 73 */ 74 uint8_t buf[16]; 75 } temp; 76}; 77 78#ifdef XZ_DEC_X86 79/* 80 * This is used to test the most significant byte of a memory address 81 * in an x86 instruction. 82 */ 83static inline int bcj_x86_test_msbyte(uint8_t b) 84{ 85 return b == 0x00 || b == 0xFF; 86} 87 88static size_t bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size) 89{ 90 static const bool mask_to_allowed_status[8] 91 = { true, true, true, false, true, false, false, false }; 92 93 static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 }; 94 95 size_t i; 96 size_t prev_pos = (size_t)-1; 97 uint32_t prev_mask = s->x86_prev_mask; 98 uint32_t src; 99 uint32_t dest; 100 uint32_t j; 101 uint8_t b; 102 103 if (size <= 4) 104 return 0; 105 106 size -= 4; 107 for (i = 0; i < size; ++i) { 108 if ((buf[i] & 0xFE) != 0xE8) 109 continue; 110 111 prev_pos = i - prev_pos; 112 if (prev_pos > 3) { 113 prev_mask = 0; 114 } else { 115 prev_mask = (prev_mask << (prev_pos - 1)) & 7; 116 if (prev_mask != 0) { 117 b = buf[i + 4 - mask_to_bit_num[prev_mask]]; 118 if (!mask_to_allowed_status[prev_mask] 119 || bcj_x86_test_msbyte(b)) { 120 prev_pos = i; 121 prev_mask = (prev_mask << 1) | 1; 122 continue; 123 } 124 } 125 } 126 127 prev_pos = i; 128 129 if (bcj_x86_test_msbyte(buf[i + 4])) { 130 src = get_unaligned_le32(buf + i + 1); 131 while (true) { 132 dest = src - (s->pos + (uint32_t)i + 5); 133 if (prev_mask == 0) 134 break; 135 136 j = mask_to_bit_num[prev_mask] * 8; 137 b = (uint8_t)(dest >> (24 - j)); 138 if (!bcj_x86_test_msbyte(b)) 139 break; 140 141 src = dest ^ (((uint32_t)1 << (32 - j)) - 1); 142 } 143 144 dest &= 0x01FFFFFF; 145 dest |= (uint32_t)0 - (dest & 0x01000000); 146 put_unaligned_le32(dest, buf + i + 1); 147 i += 4; 148 } else { 149 prev_mask = (prev_mask << 1) | 1; 150 } 151 } 152 153 prev_pos = i - prev_pos; 154 s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1); 155 return i; 156} 157#endif 158 159#ifdef XZ_DEC_POWERPC 160static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) 161{ 162 size_t i; 163 uint32_t instr; 164 165 for (i = 0; i + 4 <= size; i += 4) { 166 instr = get_unaligned_be32(buf + i); 167 if ((instr & 0xFC000003) == 0x48000001) { 168 instr &= 0x03FFFFFC; 169 instr -= s->pos + (uint32_t)i; 170 instr &= 0x03FFFFFC; 171 instr |= 0x48000001; 172 put_unaligned_be32(instr, buf + i); 173 } 174 } 175 176 return i; 177} 178#endif 179 180#ifdef XZ_DEC_IA64 181static size_t bcj_ia64(struct xz_dec_bcj *s, uint8_t *buf, size_t size) 182{ 183 static const uint8_t branch_table[32] = { 184 0, 0, 0, 0, 0, 0, 0, 0, 185 0, 0, 0, 0, 0, 0, 0, 0, 186 4, 4, 6, 6, 0, 0, 7, 7, 187 4, 4, 0, 0, 4, 4, 0, 0 188 }; 189 190 /* 191 * The local variables take a little bit stack space, but it's less 192 * than what LZMA2 decoder takes, so it doesn't make sense to reduce 193 * stack usage here without doing that for the LZMA2 decoder too. 194 */ 195 196 /* Loop counters */ 197 size_t i; 198 size_t j; 199 200 /* Instruction slot (0, 1, or 2) in the 128-bit instruction word */ 201 uint32_t slot; 202 203 /* Bitwise offset of the instruction indicated by slot */ 204 uint32_t bit_pos; 205 206 /* bit_pos split into byte and bit parts */ 207 uint32_t byte_pos; 208 uint32_t bit_res; 209 210 /* Address part of an instruction */ 211 uint32_t addr; 212 213 /* Mask used to detect which instructions to convert */ 214 uint32_t mask; 215 216 /* 41-bit instruction stored somewhere in the lowest 48 bits */ 217 uint64_t instr; 218 219 /* Instruction normalized with bit_res for easier manipulation */ 220 uint64_t norm; 221 222 for (i = 0; i + 16 <= size; i += 16) { 223 mask = branch_table[buf[i] & 0x1F]; 224 for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) { 225 if (((mask >> slot) & 1) == 0) 226 continue; 227 228 byte_pos = bit_pos >> 3; 229 bit_res = bit_pos & 7; 230 instr = 0; 231 for (j = 0; j < 6; ++j) 232 instr |= (uint64_t)(buf[i + j + byte_pos]) 233 << (8 * j); 234 235 norm = instr >> bit_res; 236 237 if (((norm >> 37) & 0x0F) == 0x05 238 && ((norm >> 9) & 0x07) == 0) { 239 addr = (norm >> 13) & 0x0FFFFF; 240 addr |= ((uint32_t)(norm >> 36) & 1) << 20; 241 addr <<= 4; 242 addr -= s->pos + (uint32_t)i; 243 addr >>= 4; 244 245 norm &= ~((uint64_t)0x8FFFFF << 13); 246 norm |= (uint64_t)(addr & 0x0FFFFF) << 13; 247 norm |= (uint64_t)(addr & 0x100000) 248 << (36 - 20); 249 250 instr &= (1 << bit_res) - 1; 251 instr |= norm << bit_res; 252 253 for (j = 0; j < 6; j++) 254 buf[i + j + byte_pos] 255 = (uint8_t)(instr >> (8 * j)); 256 } 257 } 258 } 259 260 return i; 261} 262#endif 263 264#ifdef XZ_DEC_ARM 265static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size) 266{ 267 size_t i; 268 uint32_t addr; 269 270 for (i = 0; i + 4 <= size; i += 4) { 271 if (buf[i + 3] == 0xEB) { 272 addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8) 273 | ((uint32_t)buf[i + 2] << 16); 274 addr <<= 2; 275 addr -= s->pos + (uint32_t)i + 8; 276 addr >>= 2; 277 buf[i] = (uint8_t)addr; 278 buf[i + 1] = (uint8_t)(addr >> 8); 279 buf[i + 2] = (uint8_t)(addr >> 16); 280 } 281 } 282 283 return i; 284} 285#endif 286 287#ifdef XZ_DEC_ARMTHUMB 288static size_t bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size) 289{ 290 size_t i; 291 uint32_t addr; 292 293 for (i = 0; i + 4 <= size; i += 2) { 294 if ((buf[i + 1] & 0xF8) == 0xF0 295 && (buf[i + 3] & 0xF8) == 0xF8) { 296 addr = (((uint32_t)buf[i + 1] & 0x07) << 19) 297 | ((uint32_t)buf[i] << 11) 298 | (((uint32_t)buf[i + 3] & 0x07) << 8) 299 | (uint32_t)buf[i + 2]; 300 addr <<= 1; 301 addr -= s->pos + (uint32_t)i + 4; 302 addr >>= 1; 303 buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07)); 304 buf[i] = (uint8_t)(addr >> 11); 305 buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07)); 306 buf[i + 2] = (uint8_t)addr; 307 i += 2; 308 } 309 } 310 311 return i; 312} 313#endif 314 315#ifdef XZ_DEC_SPARC 316static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) 317{ 318 size_t i; 319 uint32_t instr; 320 321 for (i = 0; i + 4 <= size; i += 4) { 322 instr = get_unaligned_be32(buf + i); 323 if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) { 324 instr <<= 2; 325 instr -= s->pos + (uint32_t)i; 326 instr >>= 2; 327 instr = ((uint32_t)0x40000000 - (instr & 0x400000)) 328 | 0x40000000 | (instr & 0x3FFFFF); 329 put_unaligned_be32(instr, buf + i); 330 } 331 } 332 333 return i; 334} 335#endif 336 337/* 338 * Apply the selected BCJ filter. Update *pos and s->pos to match the amount 339 * of data that got filtered. 340 * 341 * NOTE: This is implemented as a switch statement to avoid using function 342 * pointers, which could be problematic in the kernel boot code, which must 343 * avoid pointers to static data (at least on x86). 344 */ 345static void bcj_apply(struct xz_dec_bcj *s, 346 uint8_t *buf, size_t *pos, size_t size) 347{ 348 size_t filtered; 349 350 buf += *pos; 351 size -= *pos; 352 353 switch (s->type) { 354#ifdef XZ_DEC_X86 355 case BCJ_X86: 356 filtered = bcj_x86(s, buf, size); 357 break; 358#endif 359#ifdef XZ_DEC_POWERPC 360 case BCJ_POWERPC: 361 filtered = bcj_powerpc(s, buf, size); 362 break; 363#endif 364#ifdef XZ_DEC_IA64 365 case BCJ_IA64: 366 filtered = bcj_ia64(s, buf, size); 367 break; 368#endif 369#ifdef XZ_DEC_ARM 370 case BCJ_ARM: 371 filtered = bcj_arm(s, buf, size); 372 break; 373#endif 374#ifdef XZ_DEC_ARMTHUMB 375 case BCJ_ARMTHUMB: 376 filtered = bcj_armthumb(s, buf, size); 377 break; 378#endif 379#ifdef XZ_DEC_SPARC 380 case BCJ_SPARC: 381 filtered = bcj_sparc(s, buf, size); 382 break; 383#endif 384 default: 385 /* Never reached but silence compiler warnings. */ 386 filtered = 0; 387 break; 388 } 389 390 *pos += filtered; 391 s->pos += filtered; 392} 393 394/* 395 * Flush pending filtered data from temp to the output buffer. 396 * Move the remaining mixture of possibly filtered and unfiltered 397 * data to the beginning of temp. 398 */ 399static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b) 400{ 401 size_t copy_size; 402 403 copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos); 404 memcpy(b->out + b->out_pos, s->temp.buf, copy_size); 405 b->out_pos += copy_size; 406 407 s->temp.filtered -= copy_size; 408 s->temp.size -= copy_size; 409 memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size); 410} 411 412/* 413 * The BCJ filter functions are primitive in sense that they process the 414 * data in chunks of 1-16 bytes. To hide this issue, this function does 415 * some buffering. 416 */ 417XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, 418 struct xz_dec_lzma2 *lzma2, 419 struct xz_buf *b) 420{ 421 size_t out_start; 422 423 /* 424 * Flush pending already filtered data to the output buffer. Return 425 * immediatelly if we couldn't flush everything, or if the next 426 * filter in the chain had already returned XZ_STREAM_END. 427 */ 428 if (s->temp.filtered > 0) { 429 bcj_flush(s, b); 430 if (s->temp.filtered > 0) 431 return XZ_OK; 432 433 if (s->ret == XZ_STREAM_END) 434 return XZ_STREAM_END; 435 } 436 437 /* 438 * If we have more output space than what is currently pending in 439 * temp, copy the unfiltered data from temp to the output buffer 440 * and try to fill the output buffer by decoding more data from the 441 * next filter in the chain. Apply the BCJ filter on the new data 442 * in the output buffer. If everything cannot be filtered, copy it 443 * to temp and rewind the output buffer position accordingly. 444 * 445 * This needs to be always run when temp.size == 0 to handle a special 446 * case where the output buffer is full and the next filter has no 447 * more output coming but hasn't returned XZ_STREAM_END yet. 448 */ 449 if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) { 450 out_start = b->out_pos; 451 memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size); 452 b->out_pos += s->temp.size; 453 454 s->ret = xz_dec_lzma2_run(lzma2, b); 455 if (s->ret != XZ_STREAM_END 456 && (s->ret != XZ_OK || s->single_call)) 457 return s->ret; 458 459 bcj_apply(s, b->out, &out_start, b->out_pos); 460 461 /* 462 * As an exception, if the next filter returned XZ_STREAM_END, 463 * we can do that too, since the last few bytes that remain 464 * unfiltered are meant to remain unfiltered. 465 */ 466 if (s->ret == XZ_STREAM_END) 467 return XZ_STREAM_END; 468 469 s->temp.size = b->out_pos - out_start; 470 b->out_pos -= s->temp.size; 471 memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size); 472 473 /* 474 * If there wasn't enough input to the next filter to fill 475 * the output buffer with unfiltered data, there's no point 476 * to try decoding more data to temp. 477 */ 478 if (b->out_pos + s->temp.size < b->out_size) 479 return XZ_OK; 480 } 481 482 /* 483 * We have unfiltered data in temp. If the output buffer isn't full 484 * yet, try to fill the temp buffer by decoding more data from the 485 * next filter. Apply the BCJ filter on temp. Then we hopefully can 486 * fill the actual output buffer by copying filtered data from temp. 487 * A mix of filtered and unfiltered data may be left in temp; it will 488 * be taken care on the next call to this function. 489 */ 490 if (b->out_pos < b->out_size) { 491 /* Make b->out{,_pos,_size} temporarily point to s->temp. */ 492 s->out = b->out; 493 s->out_pos = b->out_pos; 494 s->out_size = b->out_size; 495 b->out = s->temp.buf; 496 b->out_pos = s->temp.size; 497 b->out_size = sizeof(s->temp.buf); 498 499 s->ret = xz_dec_lzma2_run(lzma2, b); 500 501 s->temp.size = b->out_pos; 502 b->out = s->out; 503 b->out_pos = s->out_pos; 504 b->out_size = s->out_size; 505 506 if (s->ret != XZ_OK && s->ret != XZ_STREAM_END) 507 return s->ret; 508 509 bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size); 510 511 /* 512 * If the next filter returned XZ_STREAM_END, we mark that 513 * everything is filtered, since the last unfiltered bytes 514 * of the stream are meant to be left as is. 515 */ 516 if (s->ret == XZ_STREAM_END) 517 s->temp.filtered = s->temp.size; 518 519 bcj_flush(s, b); 520 if (s->temp.filtered > 0) 521 return XZ_OK; 522 } 523 524 return s->ret; 525} 526 527XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call) 528{ 529 struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL); 530 if (s != NULL) 531 s->single_call = single_call; 532 533 return s; 534} 535 536XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id) 537{ 538 switch (id) { 539#ifdef XZ_DEC_X86 540 case BCJ_X86: 541#endif 542#ifdef XZ_DEC_POWERPC 543 case BCJ_POWERPC: 544#endif 545#ifdef XZ_DEC_IA64 546 case BCJ_IA64: 547#endif 548#ifdef XZ_DEC_ARM 549 case BCJ_ARM: 550#endif 551#ifdef XZ_DEC_ARMTHUMB 552 case BCJ_ARMTHUMB: 553#endif 554#ifdef XZ_DEC_SPARC 555 case BCJ_SPARC: 556#endif 557 break; 558 559 default: 560 /* Unsupported Filter ID */ 561 return XZ_OPTIONS_ERROR; 562 } 563 564 s->type = id; 565 s->ret = XZ_OK; 566 s->pos = 0; 567 s->x86_prev_mask = 0; 568 s->temp.filtered = 0; 569 s->temp.size = 0; 570 571 return XZ_OK; 572} 573 574#endif 575