///////////////////////////////////////////////////////////////////////////////
//
/// \brief      Filter for ARM64 binaries
///
/// This converts ARM64 relative addresses in the BL and ADRP immediates
/// to absolute values to increase redundancy of ARM64 code.
///
/// Converting B or ADR instructions was also tested but it's not useful.
/// A majority of the jumps for the B instruction are very small (+/- 0xFF).
/// These are typical for loops and if-statements. Encoding them to their
/// absolute address reduces redundancy since many of the small relative
/// jump values are repeated, but very few of the absolute addresses are.
//
//  Authors:    Lasse Collin
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////
#include "simple_private.h"
28 arm64_code(void *simple
lzma_attribute((__unused__
)),
29 uint32_t now_pos
, bool is_encoder
,
30 uint8_t *buffer
, size_t size
)
34 // Clang 14.0.6 on x86-64 makes this four times bigger and 40 % slower
35 // with auto-vectorization that is enabled by default with -O2.
36 // Such vectorization bloat happens with -O2 when targeting ARM64 too
37 // but performance hasn't been tested.
39 # pragma clang loop vectorize(disable)
41 for (i
= 0; i
+ 4 <= size
; i
+= 4) {
42 uint32_t pc
= (uint32_t)(now_pos
+ i
);
43 uint32_t instr
= read32le(buffer
+ i
);
45 if ((instr
>> 26) == 0x25) {
47 // The full 26-bit immediate is converted.
48 // The range is +/-128 MiB.
50 // Using the full range is helps quite a lot with
51 // big executables. Smaller range would reduce false
52 // positives in non-code sections of the input though
53 // so this is a compromise that slightly favors big
54 // files. With the full range only six bits of the 32
55 // need to match to trigger a conversion.
56 const uint32_t src
= instr
;
63 instr
|= (src
+ pc
) & 0x03FFFFFF;
64 write32le(buffer
+ i
, instr
);
66 } else if ((instr
& 0x9F000000) == 0x90000000) {
68 // Only values in the range +/-512 MiB are converted.
70 // Using less than the full +/-4 GiB range reduces
71 // false positives on non-code sections of the input
72 // while being excellent for executables up to 512 MiB.
73 // The positive effect of ADRP conversion is smaller
74 // than that of BL but it also doesn't hurt so much in
75 // non-code sections of input because, with +/-512 MiB
76 // range, nine bits of 32 need to match to trigger a
77 // conversion (two 10-bit match choices = 9 bits).
78 const uint32_t src
= ((instr
>> 29) & 3)
79 | ((instr
>> 3) & 0x001FFFFC);
81 // With the addition only one branch is needed to
82 // check the +/- range. This is usually false when
83 // processing ARM64 code so branch prediction will
84 // handle it well in terms of performance.
86 //if ((src & 0x001E0000) != 0
87 // && (src & 0x001E0000) != 0x001E0000)
88 if ((src
+ 0x00020000) & 0x001C0000)
97 const uint32_t dest
= src
+ pc
;
98 instr
|= (dest
& 3) << 29;
99 instr
|= (dest
& 0x0003FFFC) << 3;
100 instr
|= (0U - (dest
& 0x00020000)) & 0x00E00000;
101 write32le(buffer
+ i
, instr
);
110 arm64_coder_init(lzma_next_coder
*next
, const lzma_allocator
*allocator
,
111 const lzma_filter_info
*filters
, bool is_encoder
)
113 return lzma_simple_coder_init(next
, allocator
, filters
,
114 &arm64_code
, 0, 4, 4, is_encoder
);
#ifdef HAVE_ENCODER_ARM64
/// \brief      Initialize the ARM64 filter in the encoding direction
///
/// Only compiled when HAVE_ENCODER_ARM64 is defined.
extern lzma_ret
lzma_simple_arm64_encoder_init(lzma_next_coder *next,
		const lzma_allocator *allocator,
		const lzma_filter_info *filters)
{
	return arm64_coder_init(next, allocator, filters, true);
}
#endif
#ifdef HAVE_DECODER_ARM64
/// \brief      Initialize the ARM64 filter in the decoding direction
///
/// Only compiled when HAVE_DECODER_ARM64 is defined.
extern lzma_ret
lzma_simple_arm64_decoder_init(lzma_next_coder *next,
		const lzma_allocator *allocator,
		const lzma_filter_info *filters)
{
	return arm64_coder_init(next, allocator, filters, false);
}
#endif