// SPDX-License-Identifier: 0BSD

/*
 * Branch/Call/Jump (BCJ) filter decoders
 *
 * Authors: Lasse Collin <lasse.collin@tukaani.org>
 *          Igor Pavlov <https://7-zip.org/>
 */

#include "xz_private.h"
/*
 * The rest of the file is inside this ifdef. It makes things a little more
 * convenient when building without support for any BCJ filters.
 */
#ifdef XZ_DEC_BCJ

struct xz_dec_bcj {
	/* Type of the BCJ filter being used */
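	/*
	 * The values match the raw .xz Filter IDs (0x04-0x0B), which is
	 * why this enum starts at 4 instead of 0.
	 */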
	enum {
		BCJ_X86 = 4,        /* x86 or x86-64 */
		BCJ_POWERPC = 5,    /* Big endian only */
		BCJ_IA64 = 6,       /* Big or little endian */
		BCJ_ARM = 7,        /* Little endian only */
		BCJ_ARMTHUMB = 8,   /* Little endian only */
		BCJ_SPARC = 9,      /* Big or little endian */
		BCJ_ARM64 = 10,     /* AArch64 */
		BCJ_RISCV = 11      /* RV32GQC_Zfh, RV64GQC_Zfh */
	} type;
	/*
	 * Return value of the next filter in the chain. We need to preserve
	 * this information across calls, because we must not call the next
	 * filter anymore once it has returned XZ_STREAM_END.
	 */
	enum xz_ret ret;

	/* True if we are operating in single-call mode. */
	bool single_call;

	/*
	 * Absolute position relative to the beginning of the uncompressed
	 * data (in a single .xz Block). We care only about the lowest 32
	 * bits so this doesn't need to be uint64_t even with big files.
	 */
	uint32_t pos;
	/* x86 filter state */
	uint32_t x86_prev_mask;

	/* Temporary space to hold the variables from struct xz_buf */
	uint8_t *out;
	size_t out_pos;
	size_t out_size;

	struct {
		/* Amount of already filtered data in the beginning of buf */
		size_t filtered;

		/* Total amount of data currently stored in buf */
		size_t size;

		/*
		 * Buffer to hold a mix of filtered and unfiltered data. This
		 * needs to be big enough to hold Alignment + 2 * Look-ahead:
		 *
		 * Type         Alignment   Look-ahead
		 * x86              1           4
		 * PowerPC          4           0
		 * IA-64           16           0
		 * ARM              4           0
		 * ARM-Thumb        2           2
		 * SPARC            4           0
		 */
		uint8_t buf[16];
	} temp;
};
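/*
 * Worked example of the sizing rule above: x86 needs at most
 * 1 + 2 * 4 = 9 bytes and ARM-Thumb 2 + 2 * 2 = 6 bytes, so IA-64's
 * 16-byte alignment dominates, hence the 16-byte temp.buf.
 */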
#ifdef XZ_DEC_X86
/*
 * This is used to test the most significant byte of a memory address
 * in an x86 instruction.
 */
static inline int bcj_x86_test_msbyte(uint8_t b)
{
	return b == 0x00 || b == 0xFF;
}
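/*
 * Convert the relative addresses of CALL/JMP rel32 instructions to
 * absolute ones: 0xE8 is the CALL rel32 opcode and 0xE9 is JMP rel32,
 * which is why the loop below masks the opcode byte with 0xFE.
 */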
static size_t bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	static const bool mask_to_allowed_status[8]
		= { true, true, true, false, true, false, false, false };

	static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 };

	size_t i;
	size_t prev_pos = (size_t)-1;
	uint32_t prev_mask = s->x86_prev_mask;
	uint32_t src;
	uint32_t dest;
	uint32_t j;
	uint8_t b;

	if (size <= 4)
		return 0;

	size -= 4;
	for (i = 0; i < size; ++i) {
		if ((buf[i] & 0xFE) != 0xE8)
			continue;

		prev_pos = i - prev_pos;
		if (prev_pos > 3) {
			prev_mask = 0;
		} else {
			prev_mask = (prev_mask << (prev_pos - 1)) & 7;
			if (prev_mask != 0) {
				b = buf[i + 4 - mask_to_bit_num[prev_mask]];
				if (!mask_to_allowed_status[prev_mask]
						|| bcj_x86_test_msbyte(b)) {
					prev_pos = i;
					prev_mask = (prev_mask << 1) | 1;
					continue;
				}
			}
		}

		prev_pos = i;

		if (bcj_x86_test_msbyte(buf[i + 4])) {
			src = get_unaligned_le32(buf + i + 1);
			while (true) {
				dest = src - (s->pos + (uint32_t)i + 5);
				if (prev_mask == 0)
					break;

				j = mask_to_bit_num[prev_mask] * 8;
				b = (uint8_t)(dest >> (24 - j));
				if (!bcj_x86_test_msbyte(b))
					break;

				src = dest ^ (((uint32_t)1 << (32 - j)) - 1);
			}
			dest &= 0x01FFFFFF;
			dest |= (uint32_t)0 - (dest & 0x01000000);
			put_unaligned_le32(dest, buf + i + 1);
			i += 4;
		} else {
			prev_mask = (prev_mask << 1) | 1;
		}
	}

	prev_pos = i - prev_pos;
	s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1);
	return i;
}
#endif
#ifdef XZ_DEC_POWERPC
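/*
 * Convert the 24-bit branch offsets of "bl" (branch and link). The
 * instruction has primary opcode 18 with AA = 0 and LK = 1, which is
 * exactly what the mask 0xFC000003 against 0x48000001 matches.
 */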
static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t instr;

	size &= ~(size_t)3;

	for (i = 0; i < size; i += 4) {
		instr = get_unaligned_be32(buf + i);
		if ((instr & 0xFC000003) == 0x48000001) {
			instr &= 0x03FFFFFC;
			instr -= s->pos + (uint32_t)i;
			instr &= 0x03FFFFFC;
			instr |= 0x48000001;
			put_unaligned_be32(instr, buf + i);
		}
	}

	return i;
}
#endif
#ifdef XZ_DEC_IA64
static size_t bcj_ia64(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	static const uint8_t branch_table[32] = {
		0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0,
		4, 4, 6, 6, 0, 0, 7, 7,
		4, 4, 0, 0, 4, 4, 0, 0
	};

	/*
	 * The local variables take a little bit of stack space, but it's
	 * less than what the LZMA2 decoder takes, so it doesn't make sense
	 * to reduce stack usage here without doing that for the LZMA2
	 * decoder too.
	 */

	/* Loop counters */
	size_t i;
	size_t j;

	/* Instruction slot (0, 1, or 2) in the 128-bit instruction word */
	uint32_t slot;

	/* Bitwise offset of the instruction indicated by slot */
	uint32_t bit_pos;

	/* bit_pos split into byte and bit parts */
	uint32_t byte_pos;
	uint32_t bit_res;

	/* Address part of an instruction */
	uint32_t addr;

	/* Mask used to detect which instructions to convert */
	uint32_t mask;

	/* 41-bit instruction stored somewhere in the lowest 48 bits */
	uint64_t instr;

	/* Instruction normalized with bit_res for easier manipulation */
	uint64_t norm;

	size &= ~(size_t)15;
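	/*
	 * Each 16-byte IA-64 bundle starts with a 5-bit template that is
	 * followed by three 41-bit instruction slots; branch_table maps
	 * the template to a bitmask of the slots that may hold a branch.
	 */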
	for (i = 0; i < size; i += 16) {
		mask = branch_table[buf[i] & 0x1F];
		for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) {
			if (((mask >> slot) & 1) == 0)
				continue;

			byte_pos = bit_pos >> 3;
			bit_res = bit_pos & 7;
			instr = 0;
			for (j = 0; j < 6; ++j)
				instr |= (uint64_t)(buf[i + j + byte_pos])
						<< (8 * j);

			norm = instr >> bit_res;

			if (((norm >> 37) & 0x0F) == 0x05
					&& ((norm >> 9) & 0x07) == 0) {
				addr = (norm >> 13) & 0x0FFFFF;
				addr |= ((uint32_t)(norm >> 36) & 1) << 20;
				addr <<= 4;
				addr -= s->pos + (uint32_t)i;
				addr >>= 4;

				norm &= ~((uint64_t)0x8FFFFF << 13);
				norm |= (uint64_t)(addr & 0x0FFFFF) << 13;
				norm |= (uint64_t)(addr & 0x100000)
						<< (36 - 20);

				instr &= (1 << bit_res) - 1;
				instr |= norm << bit_res;

				for (j = 0; j < 6; j++)
					buf[i + j + byte_pos]
						= (uint8_t)(instr >> (8 * j));
			}
		}
	}

	return i;
}
#endif
#ifdef XZ_DEC_ARM
static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t addr;

	size &= ~(size_t)3;
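	/*
	 * 0xEB in the most significant byte of a little endian ARM word
	 * is the BL instruction with the AL (always) condition. Its
	 * 24-bit operand is a word offset relative to PC + 8, hence the
	 * extra 8 below.
	 */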
	for (i = 0; i < size; i += 4) {
		if (buf[i + 3] == 0xEB) {
			addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8)
					| ((uint32_t)buf[i + 2] << 16);
			addr <<= 2;
			addr -= s->pos + (uint32_t)i + 8;
			addr >>= 2;
			buf[i] = (uint8_t)addr;
			buf[i + 1] = (uint8_t)(addr >> 8);
			buf[i + 2] = (uint8_t)(addr >> 16);
		}
	}

	return i;
}
#endif
#ifdef XZ_DEC_ARMTHUMB
static size_t bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t addr;

	if (size < 4)
		return 0;

	size -= 4;
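	/*
	 * Thumb BL is a pair of 16-bit instructions: the first halfword
	 * has its top five bits equal to 11110 (0xF0 after masking) and
	 * the second to 11111 (0xF8), together carrying a 22-bit halfword
	 * offset.
	 */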
	for (i = 0; i <= size; i += 2) {
		if ((buf[i + 1] & 0xF8) == 0xF0
				&& (buf[i + 3] & 0xF8) == 0xF8) {
			addr = (((uint32_t)buf[i + 1] & 0x07) << 19)
					| ((uint32_t)buf[i] << 11)
					| (((uint32_t)buf[i + 3] & 0x07) << 8)
					| (uint32_t)buf[i + 2];
			addr <<= 1;
			addr -= s->pos + (uint32_t)i + 4;
			addr >>= 1;
			buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07));
			buf[i] = (uint8_t)(addr >> 11);
			buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07));
			buf[i + 2] = (uint8_t)addr;
			i += 2;
		}
	}

	return i;
}
#endif
#ifdef XZ_DEC_SPARC
static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t instr;

	size &= ~(size_t)3;
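	/*
	 * SPARC CALL is 01 in the two highest bits followed by a 30-bit
	 * word displacement; only displacements whose high bits are all
	 * equal ((instr >> 22) is 0x100 or 0x1FF) are converted.
	 */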
	for (i = 0; i < size; i += 4) {
		instr = get_unaligned_be32(buf + i);
		if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) {
			instr <<= 2;
			instr -= s->pos + (uint32_t)i;
			instr >>= 2;
			instr = ((uint32_t)0x40000000 - (instr & 0x400000))
					| 0x40000000 | (instr & 0x3FFFFF);
			put_unaligned_be32(instr, buf + i);
		}
	}

	return i;
}
#endif
#ifdef XZ_DEC_ARM64
static size_t bcj_arm64(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t instr;
	uint32_t addr;

	size &= ~(size_t)3;
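	/*
	 * Two forms are handled: BL (0x25 in the six highest bits, with a
	 * 26-bit word offset) and ADRP (a 21-bit page offset split across
	 * the instruction word).
	 */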
	for (i = 0; i < size; i += 4) {
		instr = get_unaligned_le32(buf + i);

		if ((instr >> 26) == 0x25) {
			/* BL instruction */
			addr = instr - ((s->pos + (uint32_t)i) >> 2);
			instr = 0x94000000 | (addr & 0x03FFFFFF);
			put_unaligned_le32(instr, buf + i);

		} else if ((instr & 0x9F000000) == 0x90000000) {
			/* ADRP instruction */
			addr = ((instr >> 29) & 3) | ((instr >> 3) & 0x1FFFFC);

			/* Only convert values in the range +/-512 MiB. */
			if ((addr + 0x020000) & 0x1C0000)
				continue;

			addr -= (s->pos + (uint32_t)i) >> 12;

			instr &= 0x9000001F;
			instr |= (addr & 3) << 29;
			instr |= (addr & 0x03FFFC) << 3;
			instr |= (0U - (addr & 0x020000)) & 0xE00000;

			put_unaligned_le32(instr, buf + i);
		}
	}

	return i;
}
#endif
#ifdef XZ_DEC_RISCV
static size_t bcj_riscv(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t b1;
	uint32_t b2;
	uint32_t b3;
	uint32_t instr;
	uint32_t instr2;
	uint32_t instr2_rs1;
	uint32_t addr;

	if (size < 8)
		return 0;

	size -= 8;
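	/*
	 * Two patterns are handled: JAL with rd = x1 (ra) or x5 (t0),
	 * whose lowest byte is 0xEF, and AUIPC instruction pairs, where
	 * AUIPC has 0x17 in its lowest seven bits.
	 */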
	for (i = 0; i <= size; i += 2) {
		instr = buf[i];

		if (instr == 0xEF) {
			/* JAL */
			b1 = buf[i + 1];

			/* Only convert rd = x1 (ra) or x5 (t0). */
			if ((b1 & 0x0D) != 0)
				continue;

			b2 = buf[i + 2];
			b3 = buf[i + 3];

			addr = ((b1 & 0xF0) << 13) | (b2 << 9) | (b3 << 1);
			addr -= s->pos + (uint32_t)i;

			buf[i + 1] = (uint8_t)((b1 & 0x0F)
					| ((addr >> 8) & 0xF0));

			buf[i + 2] = (uint8_t)(((addr >> 16) & 0x0F)
					| ((addr >> 7) & 0x10)
					| ((addr << 4) & 0xE0));

			buf[i + 3] = (uint8_t)(((addr >> 4) & 0x7F)
					| ((addr >> 13) & 0x80));

			i += 4 - 2;
		} else if ((instr & 0x7F) == 0x17) {
			/* AUIPC */
			instr |= (uint32_t)buf[i + 1] << 8;
			instr |= (uint32_t)buf[i + 2] << 16;
			instr |= (uint32_t)buf[i + 3] << 24;

			if (instr & 0xE80) {
				/* AUIPC's rd doesn't equal x0 or x2. */
				instr2 = get_unaligned_le32(buf + i + 4);

				if (((instr << 8) ^ (instr2 - 3)) & 0xF8003) {
					i += 6 - 2;
					continue;
				}

				addr = (instr & 0xFFFFF000) + (instr2 >> 20);

				instr = 0x17 | (2 << 7) | (instr2 << 12);
				instr2 = addr;
			} else {
				/* AUIPC's rd equals x0 or x2. */
				instr2_rs1 = instr >> 27;

				if ((uint32_t)((instr - 0x3117) << 18)
						>= (instr2_rs1 & 0x1D)) {
					i += 4 - 2;
					continue;
				}

				addr = get_unaligned_be32(buf + i + 4);
				addr -= s->pos + (uint32_t)i;

				instr2 = (instr >> 12) | (addr << 20);

				instr = 0x17 | (instr2_rs1 << 7)
						| ((addr + 0x800) & 0xFFFFF000);
			}

			put_unaligned_le32(instr, buf + i);
			put_unaligned_le32(instr2, buf + i + 4);

			i += 8 - 2;
		}
	}

	return i;
}
#endif
/*
 * Apply the selected BCJ filter. Update *pos and s->pos to match the amount
 * of data that got filtered.
 *
 * NOTE: This is implemented as a switch statement to avoid using function
 * pointers, which could be problematic in the kernel boot code, which must
 * avoid pointers to static data (at least on x86).
 */
static void bcj_apply(struct xz_dec_bcj *s,
		      uint8_t *buf, size_t *pos, size_t size)
{
	size_t filtered;

	buf += *pos;
	size -= *pos;

	switch (s->type) {
#ifdef XZ_DEC_X86
	case BCJ_X86:
		filtered = bcj_x86(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_POWERPC
	case BCJ_POWERPC:
		filtered = bcj_powerpc(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_IA64
	case BCJ_IA64:
		filtered = bcj_ia64(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_ARM
	case BCJ_ARM:
		filtered = bcj_arm(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_ARMTHUMB
	case BCJ_ARMTHUMB:
		filtered = bcj_armthumb(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_SPARC
	case BCJ_SPARC:
		filtered = bcj_sparc(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_ARM64
	case BCJ_ARM64:
		filtered = bcj_arm64(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_RISCV
	case BCJ_RISCV:
		filtered = bcj_riscv(s, buf, size);
		break;
#endif
	default:
		/* Never reached but silence compiler warnings. */
		filtered = 0;
		break;
	}

	*pos += filtered;
	s->pos += filtered;
}
/*
 * Flush pending filtered data from temp to the output buffer.
 * Move the remaining mixture of possibly filtered and unfiltered
 * data to the beginning of temp.
 */
static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b)
{
	size_t copy_size;
	copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos);
	memcpy(b->out + b->out_pos, s->temp.buf, copy_size);
	b->out_pos += copy_size;

	s->temp.filtered -= copy_size;
	s->temp.size -= copy_size;
	memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size);
}
/*
 * The BCJ filter functions are primitive in the sense that they process the
 * data in chunks of 1-16 bytes. To hide this issue, this function does
 * some buffering.
 */
enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, struct xz_dec_lzma2 *lzma2,
			   struct xz_buf *b)
{
	size_t out_start;
	/*
	 * Flush pending already filtered data to the output buffer. Return
	 * immediately if we couldn't flush everything, or if the next
	 * filter in the chain had already returned XZ_STREAM_END.
	 */
	if (s->temp.filtered > 0) {
		bcj_flush(s, b);
		if (s->temp.filtered > 0)
			return XZ_OK;

		if (s->ret == XZ_STREAM_END)
			return XZ_STREAM_END;
	}
	/*
	 * If we have more output space than what is currently pending in
	 * temp, copy the unfiltered data from temp to the output buffer
	 * and try to fill the output buffer by decoding more data from the
	 * next filter in the chain. Apply the BCJ filter on the new data
	 * in the output buffer. If everything cannot be filtered, copy it
	 * to temp and rewind the output buffer position accordingly.
	 *
	 * This needs to be always run when temp.size == 0 to handle a special
	 * case where the output buffer is full and the next filter has no
	 * more output coming but hasn't returned XZ_STREAM_END yet.
	 */
	if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) {
		out_start = b->out_pos;
		memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size);
		b->out_pos += s->temp.size;

		s->ret = xz_dec_lzma2_run(lzma2, b);
		if (s->ret != XZ_STREAM_END
				&& (s->ret != XZ_OK || s->single_call))
			return s->ret;

		bcj_apply(s, b->out, &out_start, b->out_pos);

		/*
		 * As an exception, if the next filter returned XZ_STREAM_END,
		 * we can return XZ_STREAM_END too, since the last few bytes
		 * that remain unfiltered are meant to remain unfiltered.
		 */
		if (s->ret == XZ_STREAM_END)
			return XZ_STREAM_END;

		s->temp.size = b->out_pos - out_start;
		b->out_pos -= s->temp.size;
		memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size);

		/*
		 * If there wasn't enough input to the next filter to fill
		 * the output buffer with unfiltered data, there's no point
		 * in trying to decode more data to temp.
		 */
		if (b->out_pos + s->temp.size < b->out_size)
			return XZ_OK;
	}
	/*
	 * We have unfiltered data in temp. If the output buffer isn't full
	 * yet, try to fill the temp buffer by decoding more data from the
	 * next filter. Apply the BCJ filter on temp. Then we hopefully can
	 * fill the actual output buffer by copying filtered data from temp.
	 * A mix of filtered and unfiltered data may be left in temp; it will
	 * be taken care of on the next call to this function.
	 */
	if (b->out_pos < b->out_size) {
		/* Make b->out{,_pos,_size} temporarily point to s->temp. */
		s->out = b->out;
		s->out_pos = b->out_pos;
		s->out_size = b->out_size;
		b->out = s->temp.buf;
		b->out_pos = s->temp.size;
		b->out_size = sizeof(s->temp.buf);

		s->ret = xz_dec_lzma2_run(lzma2, b);

		s->temp.size = b->out_pos;
		b->out = s->out;
		b->out_pos = s->out_pos;
		b->out_size = s->out_size;

		if (s->ret != XZ_OK && s->ret != XZ_STREAM_END)
			return s->ret;

		bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size);

		/*
		 * If the next filter returned XZ_STREAM_END, we mark that
		 * everything is filtered, since the last unfiltered bytes
		 * of the stream are meant to be left as is.
		 */
		if (s->ret == XZ_STREAM_END)
			s->temp.filtered = s->temp.size;

		bcj_flush(s, b);
		if (s->temp.filtered > 0)
			return XZ_OK;
	}

	return s->ret;
}
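/*
 * Illustrative call sequence (a sketch, not code from this file; in the
 * Linux kernel these entry points are driven by xz_dec_stream.c):
 *
 *	struct xz_dec_bcj *bcj = xz_dec_bcj_create(false);
 *	enum xz_ret ret;
 *
 *	if (bcj == NULL)
 *		return XZ_MEM_ERROR;
 *
 *	ret = xz_dec_bcj_reset(bcj, 0x04);        (0x04 = x86 Filter ID)
 *	while (ret == XZ_OK)
 *		ret = xz_dec_bcj_run(bcj, lzma2, &b);  (refill b between calls)
 */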
struct xz_dec_bcj *xz_dec_bcj_create(bool single_call)
{
	struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (s != NULL)
		s->single_call = single_call;

	return s;
}
enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id)
{
	switch (id) {
#ifdef XZ_DEC_X86
	case BCJ_X86:
#endif
#ifdef XZ_DEC_POWERPC
	case BCJ_POWERPC:
#endif
#ifdef XZ_DEC_IA64
	case BCJ_IA64:
#endif
#ifdef XZ_DEC_ARM
	case BCJ_ARM:
#endif
#ifdef XZ_DEC_ARMTHUMB
	case BCJ_ARMTHUMB:
#endif
#ifdef XZ_DEC_SPARC
	case BCJ_SPARC:
#endif
#ifdef XZ_DEC_ARM64
	case BCJ_ARM64:
#endif
#ifdef XZ_DEC_RISCV
	case BCJ_RISCV:
#endif
		break;

	default:
		/* Unsupported Filter ID */
		return XZ_OPTIONS_ERROR;
	}

	s->type = id;
	s->ret = XZ_OK;
	s->pos = 0;
	s->x86_prev_mask = 0;
	s->temp.filtered = 0;
	s->temp.size = 0;

	return XZ_OK;
}

#endif