src/liblzma/simple/riscv.c

   1 // SPDX-License-Identifier: 0BSD
   2
   3 ///////////////////////////////////////////////////////////////////////////////
   4 //
   5 /// \file       riscv.c
   6 /// \brief      Filter for 32-bit/64-bit little/big endian RISC-V binaries
   7 ///
   8 /// This converts program counter relative addresses in function calls
   9 /// (JAL, AUIPC+JALR), address calculation of functions and global
  10 /// variables (AUIPC+ADDI), loads (AUIPC+load), and stores (AUIPC+store).
  11 ///
  12 /// For AUIPC+inst2 pairs, the paired instruction checking is fairly relaxed.
/// The only requirement for the paired instruction's opcode is that its
/// lowest two bits are set, so the filter converts any paired instruction
/// that is not a 16-bit compressed instruction. This was shown to be
/// enough to keep the number of false matches low while improving code
/// size and speed.
  17 //
  18 //  Authors:    Lasse Collin
  19 //              Jia Tan
  20 //
  21 //  Special thanks:
  22 //
  23 //    - Chien Wong <m@xv97.com> provided a few early versions of RISC-V
  24 //      filter variants along with test files and benchmark results.
  25 //
  26 //    - Igor Pavlov helped a lot in the filter design, getting it both
  27 //      faster and smaller. The implementation here is still independently
  28 //      written, not based on LZMA SDK.
  29 //
  30 ///////////////////////////////////////////////////////////////////////////////
  31
  32 /*
  33
  34 RISC-V filtering
  35 ================
  36
  37     RV32I and RV64I, possibly combined with extensions C, Zfh, F, D,
  38     and Q, are identical enough that the same filter works for both.
  39
  40     The instruction encoding is always little endian, even on systems
  41     with big endian data access. Thus the same filter works for both
  42     endiannesses.
  43
  44     The following instructions have program counter relative
  45     (pc-relative) behavior:
  46
  47 JAL
  48 ---
  49
  50     JAL is used for function calls (including tail calls) and
  51     unconditional jumps within functions. Jumps within functions
  52     aren't useful to filter because the absolute addresses often
  53     appear only once or at most a few times. Tail calls and jumps
  54     within functions look the same to a simple filter so neither
  55     are filtered, that is, JAL x0 is ignored (the ABI name of the
  56     register x0 is "zero").
  57
  58     Almost all calls store the return address to register x1 (ra)
  59     or x5 (t0). To reduce false matches when the filter is applied
  60     to non-code data, only the JAL instructions that use x1 or x5
  61     are converted. JAL has pc-relative range of +/-1 MiB so longer
  62     calls and jumps need another method (AUIPC+JALR).
  63
  64 C.J and C.JAL
  65 -------------
  66
  67     C.J and C.JAL have pc-relative range of +/-2 KiB.
  68
  69     C.J is for tail calls and jumps within functions and isn't
  70     filtered for the reasons mentioned for JAL x0.
  71
  72     C.JAL is an RV32C-only instruction. Its encoding overlaps with
  73     RV64C-only C.ADDIW which is a common instruction. So if filtering
  74     C.JAL was useful (it wasn't tested) then a separate filter would
  75     be needed for RV32 and RV64. Also, false positives would be a
  76     significant problem when the filter is applied to non-code data
  77     because C.JAL needs only five bits to match. Thus, this filter
  78     doesn't modify C.JAL instructions.
  79
  80 BEQ, BNE, BLT, BGE, BLTU, BGEU, C.BEQZ, and C.BNEZ
  81 --------------------------------------------------
  82
  83     These are conditional branches with pc-relative range
  84     of +/-4 KiB (+/-256 B for C.*). The absolute addresses often
  85     appear only once and very short distances are the most common,
  86     so filtering these instructions would make compression worse.
  87
  88 AUIPC with rd != x0
  89 -------------------
  90
  91     AUIPC is paired with a second instruction (inst2) to do
  92     pc-relative jumps, calls, loads, stores, and for taking
  93     an address of a symbol. AUIPC has a 20-bit immediate and
  94     the possible inst2 choices have a 12-bit immediate.
  95
  96     AUIPC stores pc + 20-bit signed immediate to a register.
  97     The immediate encodes a multiple of 4 KiB so AUIPC itself
  98     has a pc-relative range of +/-2 GiB. AUIPC does *NOT* set
  99     the lowest 12 bits of the result to zero! This means that
 100     the 12-bit immediate in inst2 cannot just include the lowest
 101     12 bits of the absolute address as is; the immediate has to
 102     compensate for the lowest 12 bits that AUIPC copies from the
 103     program counter. This means that a good filter has to convert
 104     not only AUIPC but also the paired inst2.
 105
 106     A strict filter would focus on filtering the following
 107     AUIPC+inst2 pairs:
 108
 109       - AUIPC+JALR: Function calls, including tail calls.
 110
 111       - AUIPC+ADDI: Calculating the address of a function
 112         or a global variable.
 113
 114       - AUIPC+load/store from the base instruction sets
 115         (RV32I, RV64I) or from the floating point extensions
 116         Zfh, F, D, and Q:
 117           * RV32I: LB, LH, LW, LBU, LHU, SB, SH, SW
 118           * RV64I has also: LD, LWU, SD
 119           * Zfh: FLH, FSH
 120           * F: FLW, FSW
 121           * D: FLD, FSD
 122           * Q: FLQ, FSQ
 123
 124     NOTE: AUIPC+inst2 can only be a pair if AUIPC's rd specifies
 125     the same register as inst2's rs1.
 126
 127     Instead of strictly accepting only the above instructions as inst2,
 128     this filter uses a much simpler condition: the lowest two bits of
 129     inst2 must be set, that is, inst2 must not be a 16-bit compressed
 130     instruction. So this will accept all 32-bit and possible future
 131     extended instructions as a pair to AUIPC if the bits in AUIPC's
 132     rd [11:7] match the bits [19:15] in inst2 (the bits that I-type and
 133     S-type instructions use for rs1). Testing showed that this relaxed
 134     condition for inst2 did not consistently or significantly affect
 135     compression ratio but it reduced code size and improved speed.
 136
 137     Additionally, the paired instruction is always treated as an I-type
 138     instruction. The S-type instructions used by stores (SB, SH, SW,
 139     etc.) place the lowest 5 bits of the immediate in a different
 140     location than I-type instructions. AUIPC+store pairs are less
 141     common than other pairs, and testing showed that the extra
 142     code required to handle S-type instructions was not worth the
 143     compression ratio gained.
 144
 145     AUIPC+inst2 don't necessarily appear sequentially next to each
 146     other although very often they do. Especially AUIPC+JALR are
 147     sequential as that may allow instruction fusion in processors
 148     (and perhaps help branch prediction as a fused AUIPC+JALR is
 149     a direct branch while JALR alone is an indirect branch).
 150
 151     Clang 16 can generate code where AUIPC+inst2 is split:
 152
 153       - AUIPC is outside a loop and inst2 (load/store) is inside
 154         the loop. This way the AUIPC instruction needs to be
 155         executed only once.
 156
 157       - Load-modify-store may have AUIPC for the load and the same
 158         AUIPC-result is used for the store too. This may get combined
 159         with AUIPC being outside the loop.
 160
 161       - AUIPC is before a conditional branch and inst2 is hundreds
 162         of bytes away at the branch target.
 163
 164       - Inner and outer pair:
 165
 166             auipc   a1,0x2f
 167             auipc   a2,0x3d
 168             ld      a2,-500(a2)
 169             addi    a1,a1,-233
 170
 171       - Many split pairs with an untaken conditional branch between:
 172
 173             auipc   s9,0x1613   # Pair 1
 174             auipc   s4,0x1613   # Pair 2
 175             auipc   s6,0x1613   # Pair 3
 176             auipc   s10,0x1613  # Pair 4
 177             beqz    a5,a3baae
 178             ld      a0,0(a6)
 179             ld      a6,246(s9)  # Pair 1
 180             ld      a1,250(s4)  # Pair 2
 181             ld      a3,254(s6)  # Pair 3
 182             ld      a4,258(s10) # Pair 4
 183
 184     It's not possible to find all split pairs in a filter like this.
 185     At least in 2024, simple sequential pairs are 99 % of AUIPC uses
 186     so filtering only such pairs gives good results and makes the
 187     filter simpler. However, it's possible that future compilers will
 188     produce different code where sequential pairs aren't as common.
 189
 190     This filter doesn't convert AUIPC instructions alone because:
 191
 192     (1) The conversion would be off-by-one (or off-by-4096) half the
 193         time because the lowest 12 bits from inst2 (inst2_imm12)
 194         aren't known. We only know that the absolute address is
 195         pc + AUIPC_imm20 + [-2048, +2047] but there is no way to
 196         know the exact 4096-byte multiple (or 4096 * n + 2048):
 197         there are always two possibilities because AUIPC copies
 198         the 12 lowest bits from pc instead of zeroing them.
 199
 200         NOTE: The sign-extension of inst2_imm12 adds a tiny bit
 201         of extra complexity to AUIPC math in general but it's not
 202         the reason for this problem. The sign-extension only changes
 203         the relative position of the pc-relative 4096-byte window.
 204
 205     (2) Matching AUIPC instruction alone requires only seven bits.
 206         When the filter is applied to non-code data, that leads
 207         to many false positives which make compression worse.
 208         As long as most AUIPC+inst2 pairs appear as two consecutive
 209         instructions, converting only such pairs gives better results.
 210
 211     In assembly, AUIPC+inst2 tend to look like this:
 212
 213         # Call:
 214         auipc   ra, 0x12345
 215         jalr    ra, -42(ra)
 216
 217         # Tail call:
 218         auipc   t1, 0x12345
 219         jalr    zero, -42(t1)
 220
 221         # Getting the absolute address:
 222         auipc   a0, 0x12345
 223         addi    a0, a0, -42
 224
 225         # rd of inst2 isn't necessarily the same as rs1 even
 226         # in cases where there is no reason to preserve rs1.
 227         auipc   a0, 0x12345
 228         addi    a1, a0, -42
 229
 230     As of 2024, 16-bit instructions from the C extension don't
 231     appear as inst2. The RISC-V psABI doesn't list AUIPC+C.* as
 232     a linker relaxation type explicitly but it's not disallowed
 233     either. Usefulness is limited as most of the time the lowest
 234     12 bits won't fit in a C instruction. This filter doesn't
 235     support AUIPC+C.* combinations because this makes the filter
 236     simpler, there are no test files, and it hopefully will never
 237     be needed anyway.
 238
 239     (Compare AUIPC to ARM64 where ADRP does set the lowest 12 bits
 240     to zero. The paired instruction has the lowest 12 bits of the
 241     absolute address as is in a zero-extended immediate. Thus the
 242     ARM64 filter doesn't need to care about the instructions that
 243     are paired with ADRP. An off-by-4096 issue can still occur if
 244     the code section isn't aligned with the filter's start offset.
 245     It's not a problem with standalone ELF files but Windows PE
 246     files need start_offset=3072 for best results. Also, a .tar
 247     stores files with 512-byte alignment so most of the time it
 248     won't be the best for ARM64.)
 249
 250 AUIPC with rd == x0
 251 -------------------
 252
 253     AUIPC instructions with rd=x0 are reserved for HINTs in the base
 254     instruction set. Such AUIPC instructions are never filtered.
 255
 256     As of January 2024, it seems likely that AUIPC with rd=x0 will
 257     be used for landing pads (pseudoinstruction LPAD). LPAD is used
 258     to mark valid targets for indirect jumps (for JALR), for example,
 259     beginnings of functions. The 20-bit immediate in LPAD instruction
 260     is a label, not a pc-relative address. Thus it would be
 261     counterproductive to convert AUIPC instructions with rd=x0.
 262
 263     Often the next instruction after LPAD won't have rs1=x0 and thus
 264     the filtering would be skipped for that reason alone. However,
 265     it's not good to rely on this. For example, consider a function
 266     that begins like this:
 267
 268         int foo(int i)
 269         {
 270             if (i <= 234) {
 271                 ...
 272             }
 273
 274     A compiler may generate something like this:
 275
 276         lpad    0x54321
 277         li      a5, 234
 278         bgt     a0, a5, .L2
 279
 280     Converting the pseudoinstructions to raw instructions:
 281
 282         auipc   x0, 0x54321
 283         addi    x15, x0, 234
 284         blt     x15, x10, .L2
 285
 286     In this case the filter would undesirably convert the AUIPC+ADDI
 287     pair if the filter didn't explicitly skip AUIPC instructions
 288     that have rd=x0.
 289
 290 */
 291
 292
 293 #include "simple_private.h"
 294
 295
 296 // This checks two conditions at once:
 297 //    - AUIPC rd == inst2 rs1.
 298 //    - inst2 opcode has the lowest two bits set.
 299 //
 300 // The 8 bit left shift aligns the rd of AUIPC with the rs1 of inst2.
 301 // By XORing the registers, any non-zero value in those bits indicates the
 302 // registers are not equal and thus not an AUIPC pair. Subtracting 3 from
 303 // inst2 will zero out the first two opcode bits only when they are set.
 304 // The mask tests if any of the register or opcode bits are set (and thus
 305 // not an AUIPC pair).
 306 //
 307 // Alternative expression: (((((auipc) << 8) ^ (inst2)) & 0xF8003) != 3)
 308 #define NOT_AUIPC_PAIR(auipc, inst2) \
 309         ((((auipc) << 8) ^ ((inst2) - 3)) & 0xF8003)
 310
 311 // This macro checks multiple conditions:
 312 //   (1) AUIPC rd [11:7] == x2 (special rd value).
 313 //   (2) AUIPC bits 12 and 13 set (the lowest two opcode bits of packed inst2).
 314 //   (3) inst2_rs1 doesn't equal x0 or x2 because the opposite
 315 //       conversion is only done when
 316 //       auipc_rd != x0 &&
 317 //       auipc_rd != x2 &&
 318 //       auipc_rd == inst2_rs1.
 319 //
 320 // The left-hand side takes care of (1) and (2).
 321 //   (a) The lowest 7 bits are already known to be AUIPC so subtracting 0x17
 322 //       makes those bits zeros.
 323 //   (b) If AUIPC rd equals x2, subtracting 0x100 makes bits [11:7] zeros.
 324 //       If rd doesn't equal x2, then there will be at least one non-zero bit
 325 //       and the next step (c) is irrelevant.
 326 //   (c) If the lowest two opcode bits of the packed inst2 are set in [13:12],
 327 //       then subtracting 0x3000 will make those bits zeros. Otherwise there
 328 //       will be at least one non-zero bit.
 329 //
 330 // The shift by 18 removes the high bits from the final '>=' comparison and
 331 // ensures that any non-zero result will be larger than any possible result
 332 // from the right-hand side of the comparison. The cast ensures that the
 333 // left-hand side didn't get promoted to a larger type than uint32_t.
 334 //
 335 // On the right-hand side, inst2_rs1 & 0x1D will be non-zero as long as
 336 // inst2_rs1 is not x0 or x2.
 337 //
 338 // The final '>=' comparison will make the expression true if:
 339 //   - The subtraction caused any bits to be set (special AUIPC rd value not
 340 //     used or inst2 opcode bits not set). (non-zero >= non-zero or 0)
 341 //   - The subtraction did not cause any bits to be set but inst2_rs1 was
 342 //     x0 or x2. (0 >= 0)
 343 #define NOT_SPECIAL_AUIPC(auipc, inst2_rs1) \
 344         ((uint32_t)(((auipc) - 0x3117) << 18) >= ((inst2_rs1) & 0x1D))
 345
 346
 347 // The encode and decode functions are split for this filter because of the
 348 // AUIPC+inst2 filtering. This filter design allows a decoder-only
 349 // implementation to be smaller than alternative designs.
 350
 351 #ifdef HAVE_ENCODER_RISCV
 352 static size_t
 353 riscv_encode(void *simple lzma_attribute((__unused__)),
 354                 uint32_t now_pos,
 355                 bool is_encoder lzma_attribute((__unused__)),
 356                 uint8_t *buffer, size_t size)
 357 {
 358         // Avoid using i + 8 <= size in the loop condition.
 359         //
 360         // NOTE: If there is a JAL in the last six bytes of the stream, it
 361         // won't be converted. This is intentional to keep the code simpler.
 362         if (size < 8)
 363                 return 0;
 364
 365         size -= 8;
 366
 367         size_t i;
 368
 369         // The loop is advanced by 2 bytes every iteration since the
 370         // instruction stream may include 16-bit instructions (C extension).
 371         for (i = 0; i <= size; i += 2) {
 372                 uint32_t inst = buffer[i];
 373
 374                 if (inst == 0xEF) {
 375                         // JAL
 376                         const uint32_t b1 = buffer[i + 1];
 377
 378                         // Only filter rd=x1(ra) and rd=x5(t0).
 379                         if ((b1 & 0x0D) != 0)
 380                                 continue;
 381
 382                         // The 20-bit immediate is in four pieces.
 383                         // The encoder stores it in big endian form
 384                         // since it improves compression slightly.
 385                         const uint32_t b2 = buffer[i + 2];
 386                         const uint32_t b3 = buffer[i + 3];
 387                         const uint32_t pc = now_pos + (uint32_t)i;
 388
 389 // The following chart shows the highest three bytes of JAL, focusing on
 390 // the 20-bit immediate field [31:12]. The first row of numbers is the
 391 // bit position in a 32-bit little endian instruction. The second row of
 392 // numbers shows the order of the immediate field in a J-type instruction.
 393 // The last row is the bit number in each byte.
 394 //
 395 // To determine the amount to shift each bit, subtract the value in
 396 // the last row from the value in the second last row. If the number
 397 // is positive, shift left. If negative, shift right.
 398 //
 399 // For example, at the rightmost side of the chart, the bit 4 in b1 is
 400 // the bit 12 of the address. Thus that bit needs to be shifted left
 401 // by 12 - 4 = 8 bits to put it in the right place in the addr variable.
 402 //
 403 // NOTE: The immediate of a J-type instruction holds bits [20:1] of
 404 // the address. The bit [0] is always 0 and not part of the immediate.
 405 //
 406 // |          b3             |          b2             |          b1         |
 407 // | 31 30 29 28 27 26 25 24 | 23 22 21 20 19 18 17 16 | 15 14 13 12 x x x x |
 408 // | 20 10  9  8  7  6  5  4 |  3  2  1 11 19 18 17 16 | 15 14 13 12 x x x x |
 409 // |  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |  7  6  5  4 x x x x |
 410
 411                         uint32_t addr = ((b1 & 0xF0) << 8)
 412                                         | ((b2 & 0x0F) << 16)
 413                                         | ((b2 & 0x10) << 7)
 414                                         | ((b2 & 0xE0) >> 4)
 415                                         | ((b3 & 0x7F) << 4)
 416                                         | ((b3 & 0x80) << 13);
 417
 418                         addr += pc;
 419
 420                         buffer[i + 1] = (uint8_t)((b1 & 0x0F)
 421                                         | ((addr >> 13) & 0xF0));
 422
 423                         buffer[i + 2] = (uint8_t)(addr >> 9);
 424                         buffer[i + 3] = (uint8_t)(addr >> 1);
 425
 426                         // The "-2" is included because the for-loop will
 427                         // always increment by 2. In this case, we want to
 428                         // skip an extra 2 bytes since we used 4 bytes
 429                         // of input.
 430                         i += 4 - 2;
 431
 432                 } else if ((inst & 0x7F) == 0x17) {
 433                         // AUIPC
 434                         inst |= (uint32_t)buffer[i + 1] << 8;
 435                         inst |= (uint32_t)buffer[i + 2] << 16;
 436                         inst |= (uint32_t)buffer[i + 3] << 24;
 437
 438                         // Branch based on AUIPC's rd. The bitmask test does
 439                         // the same thing as this:
 440                         //
 441                         //     const uint32_t auipc_rd = (inst >> 7) & 0x1F;
 442                         //     if (auipc_rd != 0 && auipc_rd != 2) {
 443                         if (inst & 0xE80) {
 444                                 // AUIPC's rd doesn't equal x0 or x2.
 445
 446                                 // Check if AUIPC+inst2 are a pair.
 447                                 uint32_t inst2 = read32le(buffer + i + 4);
 448
 449                                 if (NOT_AUIPC_PAIR(inst, inst2)) {
 450                                         // The NOT_AUIPC_PAIR macro allows
 451                                         // a false AUIPC+AUIPC pair if the
 452                                         // bits [19:15] (where rs1 would be)
 453                                         // in the second AUIPC match the rd
 454                                         // of the first AUIPC.
 455                                         //
 456                                         // We must skip enough forward so
 457                                         // that the first two bytes of the
 458                                         // second AUIPC cannot get converted.
 459                                         // Such a conversion could make the
 460                                         // current pair become a valid pair
 461                                         // which would desync the decoder.
 462                                         //
 463                                         // Skipping six bytes is enough even
 464                                         // though the above condition looks
 465                                         // at the lowest four bits of the
 466                                         // buffer[i + 6] too. This is safe
 467                                         // because this filter never changes
 468                                         // those bits if a conversion at
 469                                         // that position is done.
 470                                         i += 6 - 2;
 471                                         continue;
 472                                 }
 473
 474                                 // Convert AUIPC+inst2 to a special format:
 475                                 //
 476                                 //   - The lowest 7 bits [6:0] retain the
 477                                 //     AUIPC opcode.
 478                                 //
 479                                 //   - The rd [11:7] is set to x2(sp). x2 is
 480                                 //     used as the stack pointer so AUIPC with
 481                                 //     rd=x2 should be very rare in real-world
 482                                 //     executables.
 483                                 //
 484                                 //   - The remaining 20 bits [31:12] (that
 485                                 //     normally hold the pc-relative immediate)
 486                                 //     are used to store the lowest 20 bits of
 487                                 //     inst2. That is, the 12-bit immediate of
 488                                 //     inst2 is not included.
 489                                 //
 490                                 //   - The location of the original inst2 is
 491                                 //     used to store the 32-bit absolute
 492                                 //     address in big endian format. Compared
 493                                 //     to the 20+12-bit split encoding, this
 494                                 //     results in a longer uninterrupted
 495                                 //     sequence of identical common bytes
 496                                 //     when the same address is referred
 497                                 //     with different instruction pairs
 498                                 //     (like AUIPC+LD vs. AUIPC+ADDI) or
 499                                 //     when the occurrences of the same
 500                                 //     pair use different registers. When
 501                                 //     referring to adjacent memory locations
 502                                 //     (like function calls that go via the
 503                                 //     ELF PLT), in big endian order only the
 504                                 //     last 1-2 bytes differ; in little endian
 505                                 //     the differing 1-2 bytes would be in the
 506                                 //     middle of the 8-byte sequence.
 507                                 //
 508                                 // When reversing the transformation, the
 509                                 // original rd of AUIPC can be restored
 510                                 // from inst2's rs1 as they are required to
 511                                 // be the same.
 512
 513                                 // Arithmetic right shift makes sign extension
 514                                 // trivial but (1) it's implementation-defined
 515                                 // behavior (C99/C11/C23 6.5.7-p5) and so is
 516                                 // (2) casting unsigned to signed (6.3.1.3-p3).
 517                                 //
 518                                 // One can check for (1) with
 519                                 //
 520                                 //     if ((-1 >> 1) == -1) ...
 521                                 //
 522                                 // but (2) has to be checked from the
 523                                 // compiler docs. GCC promises that (1)
 524                                 // and (2) behave in the common expected
 525                                 // way and thus
 526                                 //
 527                                 //     addr += (uint32_t)(
 528                                 //             (int32_t)inst2 >> 20);
 529                                 //
 530                                 // does the same as the code below. But since
 531                                 // the 100 % portable way is only a few bytes
 532                                 // bigger code and there is no real speed
 533                                 // difference, let's just use that, especially
 534                                 // since the decoder doesn't need this at all.
 535                                 uint32_t addr = inst & 0xFFFFF000;
 536                                 addr += (inst2 >> 20)
 537                                                 - ((inst2 >> 19) & 0x1000);
 538
 539                                 addr += now_pos + (uint32_t)i;
 540
 541                                 // Construct the first 32 bits:
 542                                 //   [6:0]    AUIPC opcode
 543                                 //   [11:7]   Special AUIPC rd = x2
 544                                 //   [31:12]  The lowest 20 bits of inst2
 545                                 inst = 0x17 | (2 << 7) | (inst2 << 12);
 546
 547                                 write32le(buffer + i, inst);
 548
 549                                 // The second 32 bits store the absolute
 550                                 // address in big endian order.
 551                                 write32be(buffer + i + 4, addr);
 552                         } else {
 553                                 // AUIPC's rd equals x0 or x2.
 554                                 //
 555                                 // x0 indicates a landing pad (LPAD).
 556                                 // It's always skipped.
 557                                 //
 558                                 // AUIPC with rd == x2 is used for the special
 559                                 // format as explained above. When the input
 560                                 // contains a byte sequence that matches the
 561                                 // special format, "fake" decoding must be
 562                                 // done to keep the filter bijective (that
 563                                 // is, safe to apply on arbitrary data).
 564                                 //
 565                                 // See the "x0 or x2" section in riscv_decode()
 566                                 // for how the "real" decoding is done. The
 567                                 // "fake" decoding is a simplified version
 568                                 // of "real" decoding with the following
 569                                 // differences (these reduce code size of
 570                                 // the decoder):
 571                                 // (1) The lowest 12 bits aren't sign-extended.
 572                                 // (2) No address conversion is done.
 573                                 // (3) Big endian format isn't used (the fake
 574                                 //     address is in little endian order).
 575
 576                                 // Check if inst matches the special format.
 577                                 const uint32_t fake_rs1 = inst >> 27;
 578
 579                                 if (NOT_SPECIAL_AUIPC(inst, fake_rs1)) {
 580                                         i += 4 - 2;
 581                                         continue;
 582                                 }
 583
 584                                 const uint32_t fake_addr =
 585                                                 read32le(buffer + i + 4);
 586
 587                                 // Construct the second 32 bits:
 588                                 //   [19:0]   Upper 20 bits from AUIPC
 589                                 //   [31:20]  The lowest 12 bits of fake_addr
 590                                 const uint32_t fake_inst2 = (inst >> 12)
 591                                                 | (fake_addr << 20);
 592
 593                                 // Construct new first 32 bits from:
 594                                 //   [6:0]   AUIPC opcode
 595                                 //   [11:7]  Fake AUIPC rd = fake_rs1
 596                                 //   [31:12] The highest 20 bits of fake_addr
 597                                 inst = 0x17 | (fake_rs1 << 7)
 598                                         | (fake_addr & 0xFFFFF000);
 599
 600                                 write32le(buffer + i, inst);
 601                                 write32le(buffer + i + 4, fake_inst2);
 602                         }
 603
 604                         i += 8 - 2;
 605                 }
 606         }
 607
 608         return i;
 609 }
 610
 611
 612 extern lzma_ret
 613 lzma_simple_riscv_encoder_init(lzma_next_coder *next,
 614                 const lzma_allocator *allocator,
 615                 const lzma_filter_info *filters)
 616 {
 617         return lzma_simple_coder_init(next, allocator, filters,
 618                         &riscv_encode, 0, 8, 2, true);
 619 }
 620 #endif
 621
 622
 623 #ifdef HAVE_DECODER_RISCV
 624 static size_t
 625 riscv_decode(void *simple lzma_attribute((__unused__)),
 626                 uint32_t now_pos,
 627                 bool is_encoder lzma_attribute((__unused__)),
 628                 uint8_t *buffer, size_t size)
 629 {
             // Decoding direction of the RISC-V filter. Scans buffer[] in
             // 2-byte steps and rewrites, in place, the JAL instructions and
             // AUIPC pairs that it recognizes.
             //
             //   now_pos  Added to the buffer offset to form the position
             //            that is subtracted from the stored addresses.
             //   buffer   Data to convert in place.
             //   size     Number of bytes available in buffer.
             //
             // Returns the final value of the scan offset i, or 0 when fewer
             // than 8 bytes are available. simple and is_encoder are unused.
             //
             // NOTE(review): the caller presumably treats the return value
             // as the number of bytes that are fully processed -- confirm
             // against lzma_simple_coder_init()'s contract.
 630         if (size < 8)
 631                 return 0;
 632
             // The AUIPC path accesses 32-bit words at buffer + i and
             // buffer + i + 4, so every visited offset needs 8 bytes.
             // After this, size is the last offset the loop may start at.
 633         size -= 8;
 634
 635         size_t i;
 636         for (i = 0; i <= size; i += 2) {
 637                 uint32_t inst = buffer[i];
 638
 639                 if (inst == 0xEF) {
 640                         // JAL
                             // (0xEF = the seven opcode bits 0x6F plus bit 7,
                             // the lowest bit of rd, set.)
 641                         const uint32_t b1 = buffer[i + 1];
 642
 643                         // Only filter rd=x1(ra) and rd=x5(t0).
                             // The low nibble of b1 holds the upper four
                             // bits of rd; 0x0D leaves only rd bit 2 free.
                             // On a mismatch only the loop's 2-byte step
                             // is taken.
 644                         if ((b1 & 0x0D) != 0)
 645                                 continue;
 646
 647                         const uint32_t b2 = buffer[i + 2];
 648                         const uint32_t b3 = buffer[i + 3];
 649                         const uint32_t pc = now_pos + (uint32_t)i;
 650
 651 // |          b3             |          b2             |          b1         |
 652 // | 31 30 29 28 27 26 25 24 | 23 22 21 20 19 18 17 16 | 15 14 13 12 x x x x |
 653 // | 20 10  9  8  7  6  5  4 |  3  2  1 11 19 18 17 16 | 15 14 13 12 x x x x |
 654 // |  7  6  5  4  3  2  1  0 |  7  6  5  4  3  2  1  0 |  7  6  5  4 x x x x |
 655
                             // In the filtered data the address is read in
                             // linear order: the high nibble of b1 is
                             // addr[20:17], b2 is addr[16:9], and b3 is
                             // addr[8:1].
 656                         uint32_t addr = ((b1 & 0xF0) << 13)
 657                                         | (b2 << 9) | (b3 << 1);
 658
                             // Turn the stored address back into
                             // a pc-relative offset.
 659                         addr -= pc;
 660
                             // Scatter the offset into the JAL immediate
                             // layout shown in the table above. The low
                             // nibble of b1 (rd bits) is kept as is.
 661                         buffer[i + 1] = (uint8_t)((b1 & 0x0F)
 662                                         | ((addr >> 8) & 0xF0));
 663
 664                         buffer[i + 2] = (uint8_t)(((addr >> 16) & 0x0F)
 665                                         | ((addr >> 7) & 0x10)
 666                                         | ((addr << 4) & 0xE0));
 667
 668                         buffer[i + 3] = (uint8_t)(((addr >> 4) & 0x7F)
 669                                         | ((addr >> 13) & 0x80));
 670
                             // Skip the 4-byte instruction; the loop
                             // itself adds the remaining 2.
 671                         i += 4 - 2;
 672
 673                 } else if ((inst & 0x7F) == 0x17) {
 674                         // AUIPC
 675                         uint32_t inst2;
 676
                             // Load the remaining three bytes of the
                             // 32-bit instruction in little endian order.
 677                         inst |= (uint32_t)buffer[i + 1] << 8;
 678                         inst |= (uint32_t)buffer[i + 2] << 16;
 679                         inst |= (uint32_t)buffer[i + 3] << 24;
 680
                             // 0xE80 selects instruction bits 7 and 9-11,
                             // that is, rd bits 0, 2, 3, and 4.
 681                         if (inst & 0xE80) {
 682                                 // AUIPC's rd doesn't equal x0 or x2.
 683
 684                                 // Check if it is a "fake" AUIPC+inst2 pair.
 685                                 inst2 = read32le(buffer + i + 4);
 686
 687                                 if (NOT_AUIPC_PAIR(inst, inst2)) {
                                             // Resume scanning 6 bytes later
                                             // (4 here + 2 from the loop).
 688                                         i += 6 - 2;
 689                                         continue;
 690                                 }
 691
 692                                 // Decode (or more like re-encode) the "fake"
 693                                 // pair. The "fake" format doesn't do
 694                                 // sign-extension, address conversion, or
 695                                 // use big endian. (The use of little endian
 696                                 // allows sharing the write32le() calls in
 697                                 // the decoder to reduce code size when
 698                                 // unaligned access isn't supported.)
                                     //
                                     // addr is rebuilt from the upper 20 bits
                                     // of inst plus the upper 12 bits of inst2.
 699                                 uint32_t addr = inst & 0xFFFFF000;
 700                                 addr += inst2 >> 20;
 701
                                     // The first word becomes an AUIPC with
                                     // rd = x2 whose upper 20 bits are the
                                     // lowest 20 bits of inst2; the second
                                     // word becomes addr. This mirrors the
                                     // fake pair construction at the end of
                                     // riscv_encode().
 702                                 inst = 0x17 | (2 << 7) | (inst2 << 12);
 703                                 inst2 = addr;
 704                         } else {
 705                                 // AUIPC's rd equals x0 or x2.
 706
 707                                 // Check if inst matches the special format
 708                                 // used by the encoder.
 709                                 const uint32_t inst2_rs1 = inst >> 27;
 710
 711                                 if (NOT_SPECIAL_AUIPC(inst, inst2_rs1)) {
                                             // Skip only this 4-byte
                                             // instruction (2 here + 2 from
                                             // the loop).
 712                                         i += 4 - 2;
 713                                         continue;
 714                                 }
 715
 716                                 // Decode the "real" pair.
                                     // The address is stored in big endian
                                     // in the second 32-bit word.
 717                                 uint32_t addr = read32be(buffer + i + 4);
 718
                                     // Convert back to a pc-relative offset.
 719                                 addr -= now_pos + (uint32_t)i;
 720
 721                                 // The second instruction:
 722                                 //   - Its lowest 20 bits are inst >> 12.
 723                                 //   - Add the lowest 12 bits of the address
 724                                 //     as the immediate field.
 725                                 inst2 = (inst >> 12) | (addr << 20);
 726
 727                                 // AUIPC:
 728                                 //   - rd is the same as inst2_rs1.
 729                                 //   - The sign extension of the lowest 12 bits
 730                                 //     must be taken into account.
                                     //     Adding 0x800 carries into bit 12
                                     //     when bit 11 of addr is set.
 731                                 inst = 0x17 | (inst2_rs1 << 7)
 732                                         | ((addr + 0x800) & 0xFFFFF000);
 733                         }
 734
 735                         // Both decoder branches write in little endian order.
 736                         write32le(buffer + i, inst);
 737                         write32le(buffer + i + 4, inst2);
 738
                             // Skip both 4-byte instructions; the loop
                             // itself adds the remaining 2.
 739                         i += 8 - 2;
 740                 }
 741         }
 742
             // i is the offset at which scanning stopped.
 743         return i;
 744 }
 745
 746
 747 extern lzma_ret
 748 lzma_simple_riscv_decoder_init(lzma_next_coder *next,
 749                 const lzma_allocator *allocator,
 750                 const lzma_filter_info *filters)
 751 {
             // Initialize the decoding direction of the RISC-V filter by
             // handing riscv_decode() to the shared lzma_simple_coder_init().
             // next, allocator, and filters are passed through unchanged and
             // the lzma_ret from that call is returned as is.
             //
             // The 8 and 2 agree with riscv_decode(), which needs at least
             // 8 bytes and advances in 2-byte steps.
             //
             // NOTE(review): the trailing arguments 0, 8, 2, false are
             // presumably the size of filter-specific state, the number of
             // bytes the filter needs to see at once, the alignment, and
             // the is_encoder flag -- confirm against the declaration of
             // lzma_simple_coder_init().
 752         return lzma_simple_coder_init(next, allocator, filters,
 753                         &riscv_decode, 0, 8, 2, false);
 754 }
 755 #endif