src/liblzma/simple/arm64.c

   1 ///////////////////////////////////////////////////////////////////////////////
   2 //
   3 /// \file       arm64.c
   4 /// \brief      Filter for ARM64 binaries
   5 ///
   6 /// This converts ARM64 relative addresses in the BL and ADRP immediates
   7 /// to absolute values to increase redundancy of ARM64 code.
   8 ///
   9 /// Converting B or ADR instructions was also tested but it's not useful.
  10 /// A majority of the jumps for the B instruction are very small (+/- 0xFF).
  11 /// These are typical for loops and if-statements. Encoding them to their
  12 /// absolute address reduces redundancy since many of the small relative
  13 /// jump values are repeated, but very few of the absolute addresses are.
  14 //
  15 //  Authors:    Lasse Collin
  16 //              Jia Tan
  17 //              Igor Pavlov
  18 //
  19 //  This file has been put into the public domain.
  20 //  You can do whatever you want with this file.
  21 //
  22 ///////////////////////////////////////////////////////////////////////////////
  23
  24 #include "simple_private.h"
  25
  26
  27 static size_t
  28 arm64_code(void *simple lzma_attribute((__unused__)),
  29                 uint32_t now_pos, bool is_encoder,
  30                 uint8_t *buffer, size_t size)
  31 {
  32         size_t i;
  33
  34         // Clang 14.0.6 on x86-64 makes this four times bigger and 40 % slower
  35         // with auto-vectorization that is enabled by default with -O2.
  36         // Such vectorization bloat happens with -O2 when targeting ARM64 too
  37         // but performance hasn't been tested.
  38 #ifdef __clang__
  39 #       pragma clang loop vectorize(disable)
  40 #endif
  41         for (i = 0; i + 4 <= size; i += 4) {
  42                 uint32_t pc = (uint32_t)(now_pos + i);
  43                 uint32_t instr = read32le(buffer + i);
  44
  45                 if ((instr >> 26) == 0x25) {
  46                         // BL instruction:
  47                         // The full 26-bit immediate is converted.
  48                         // The range is +/-128 MiB.
  49                         //
  50                         // Using the full range is helps quite a lot with
  51                         // big executables. Smaller range would reduce false
  52                         // positives in non-code sections of the input though
  53                         // so this is a compromise that slightly favors big
  54                         // files. With the full range only six bits of the 32
  55                         // need to match to trigger a conversion.
  56                         const uint32_t src = instr;
  57                         instr = 0x94000000;
  58
  59                         pc >>= 2;
  60                         if (!is_encoder)
  61                                 pc = 0U - pc;
  62
  63                         instr |= (src + pc) & 0x03FFFFFF;
  64                         write32le(buffer + i, instr);
  65
  66                 } else if ((instr & 0x9F000000) == 0x90000000) {
  67                         // ADRP instruction:
  68                         // Only values in the range +/-512 MiB are converted.
  69                         //
  70                         // Using less than the full +/-4 GiB range reduces
  71                         // false positives on non-code sections of the input
  72                         // while being excellent for executables up to 512 MiB.
  73                         // The positive effect of ADRP conversion is smaller
  74                         // than that of BL but it also doesn't hurt so much in
  75                         // non-code sections of input because, with +/-512 MiB
  76                         // range, nine bits of 32 need to match to trigger a
  77                         // conversion (two 10-bit match choices = 9 bits).
  78                         const uint32_t src = ((instr >> 29) & 3)
  79                                         | ((instr >> 3) & 0x001FFFFC);
  80
  81                         // With the addition only one branch is needed to
  82                         // check the +/- range. This is usually false when
  83                         // processing ARM64 code so branch prediction will
  84                         // handle it well in terms of performance.
  85                         //
  86                         //if ((src & 0x001E0000) != 0
  87                         // && (src & 0x001E0000) != 0x001E0000)
  88                         if ((src + 0x00020000) & 0x001C0000)
  89                                 continue;
  90
  91                         instr &= 0x9000001F;
  92
  93                         pc >>= 12;
  94                         if (!is_encoder)
  95                                 pc = 0U - pc;
  96
  97                         const uint32_t dest = src + pc;
  98                         instr |= (dest & 3) << 29;
  99                         instr |= (dest & 0x0003FFFC) << 3;
 100                         instr |= (0U - (dest & 0x00020000)) & 0x00E00000;
 101                         write32le(buffer + i, instr);
 102                 }
 103         }
 104
 105         return i;
 106 }
 107
 108
 109 static lzma_ret
 110 arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator,
 111                 const lzma_filter_info *filters, bool is_encoder)
 112 {
 113         return lzma_simple_coder_init(next, allocator, filters,
 114                         &arm64_code, 0, 4, 4, is_encoder);
 115 }
 116
 117
 118 #ifdef HAVE_ENCODER_ARM64
 119 extern lzma_ret
 120 lzma_simple_arm64_encoder_init(lzma_next_coder *next,
 121                 const lzma_allocator *allocator,
 122                 const lzma_filter_info *filters)
 123 {
 124         return arm64_coder_init(next, allocator, filters, true);
 125 }
 126 #endif
 127
 128
 129 #ifdef HAVE_DECODER_ARM64
 130 extern lzma_ret
 131 lzma_simple_arm64_decoder_init(lzma_next_coder *next,
 132                 const lzma_allocator *allocator,
 133                 const lzma_filter_info *filters)
 134 {
 135         return arm64_coder_init(next, allocator, filters, false);
 136 }
 137 #endif