lib/Target/ARM/README-Thumb.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the ARM backend (Thumb specific).
   3 //===---------------------------------------------------------------------===//
   4
   5 * Add support for compiling functions in both ARM and Thumb mode, then taking
   6   the smallest.
   7
   8 * Add support for compiling individual basic blocks in thumb mode, when in a
   9   larger ARM function.  This can be used for presumed cold code, like paths
  10   to abort (failure path of asserts), EH handling code, etc.
  11
  12 * Thumb doesn't have normal pre/post increment addressing modes, but you can
  13   load/store 32-bit integers with pre/postinc by using load/store multiple
  14   instrs with a single register.
  15
  16 * Make better use of high registers r8, r10, r11, r12 (ip). Some variants of add
  17   and cmp instructions can use high registers. Also, we can use them as
  18   temporaries to spill values into.
  19
  20 * In thumb mode, short, byte, and bool preferred alignments are currently set
  21   to 4 to accommodate an ISA restriction (i.e. in add sp, #imm, imm must be a
  22   multiple of 4).
  23
  24 //===---------------------------------------------------------------------===//
  25
  26 Potential jumptable improvements:
  27
  28 * If we know function size is less than (1 << 16) * 2 bytes, we can use 16-bit
  29   jumptable entries (e.g. (L1 - L2) >> 1). Or even smaller entries if the
  30   function is even smaller. This also applies to ARM.
  31
  32 * Thumb jumptable codegen can improve given some help from the assembler. This
  33   is what we generate right now:
  34
  35         .set PCRELV0, (LJTI1_0_0-(LPCRELL0+4))
  36 LPCRELL0:
  37         mov r1, #PCRELV0
  38         add r1, pc
  39         ldr r0, [r0, r1]
  40         cpy pc, r0
  41         .align  2
  42 LJTI1_0_0:
  43         .long    LBB1_3
  44         ...
  45
  46 Note there is another pc relative add that we can take advantage of.
  47      add r1, pc, #imm_8 * 4
  48
  49 We should be able to generate:
  50
  51 LPCRELL0:
  52         add r1, LJTI1_0_0
  53         ldr r0, [r0, r1]
  54         cpy pc, r0
  55         .align  2
  56 LJTI1_0_0:
  57         .long    LBB1_3
  58
  59 if the assembler can translate the add to:
  60        add r1, pc, #((LJTI1_0_0-(LPCRELL0+4))&0xfffffffc)
  61
  62 Note the assembler also does something similar for constpool loads:
  63 LPCRELL0:
  64      ldr r0, LCPI1_0
  65 =>
  66      ldr r0, pc, #((LCPI1_0-(LPCRELL0+4))&0xfffffffc)
  67
  68
  69 //===---------------------------------------------------------------------===//
  70
  71 We compile the following:
  72
  73 define i16 @func_entry_2E_ce(i32 %i) {
  74         switch i32 %i, label %bb12.exitStub [
  75                  i32 0, label %bb4.exitStub
  76                  i32 1, label %bb9.exitStub
  77                  i32 2, label %bb4.exitStub
  78                  i32 3, label %bb4.exitStub
  79                  i32 7, label %bb9.exitStub
  80                  i32 8, label %bb.exitStub
  81                  i32 9, label %bb9.exitStub
  82         ]
  83
  84 bb12.exitStub:
  85         ret i16 0
  86
  87 bb4.exitStub:
  88         ret i16 1
  89
  90 bb9.exitStub:
  91         ret i16 2
  92
  93 bb.exitStub:
  94         ret i16 3
  95 }
  96
  97 into:
  98
  99 _func_entry_2E_ce:
 100         mov r2, #1
 101         lsl r2, r0
 102         cmp r0, #9
 103         bhi LBB1_4      @bb12.exitStub
 104 LBB1_1: @newFuncRoot
 105         mov r1, #13
 106         tst r2, r1
 107         bne LBB1_5      @bb4.exitStub
 108 LBB1_2: @newFuncRoot
 109         ldr r1, LCPI1_0
 110         tst r2, r1
 111         bne LBB1_6      @bb9.exitStub
 112 LBB1_3: @newFuncRoot
 113         mov r1, #1
 114         lsl r1, r1, #8
 115         tst r2, r1
 116         bne LBB1_7      @bb.exitStub
 117 LBB1_4: @bb12.exitStub
 118         mov r0, #0
 119         bx lr
 120 LBB1_5: @bb4.exitStub
 121         mov r0, #1
 122         bx lr
 123 LBB1_6: @bb9.exitStub
 124         mov r0, #2
 125         bx lr
 126 LBB1_7: @bb.exitStub
 127         mov r0, #3
 128         bx lr
 129 LBB1_8:
 130         .align  2
 131 LCPI1_0:
 132         .long   642
 133
 134
 135 gcc compiles to:
 136
 137         cmp     r0, #9
 138         @ lr needed for prologue
 139         bhi     L2
 140         ldr     r3, L11
 141         mov     r2, #1
 142         mov     r1, r2, asl r0
 143         ands    r0, r3, r2, asl r0
 144         movne   r0, #2
 145         bxne    lr
 146         tst     r1, #13
 147         beq     L9
 148 L3:
 149         mov     r0, r2
 150         bx      lr
 151 L9:
 152         tst     r1, #256
 153         movne   r0, #3
 154         bxne    lr
 155 L2:
 156         mov     r0, #0
 157         bx      lr
 158 L12:
 159         .align 2
 160 L11:
 161         .long   642
 162
 163
 164 GCC is doing a couple of clever things here:
 165   1. It is predicating one of the returns.  This isn't a clear win though: in
 166      cases where that return isn't taken, it is replacing one condbranch with
 167      two 'ne' predicated instructions.
 168   2. It is sinking the shift of "1 << i" into the tst, and using ands instead of
 169      tst.  This will probably require whole function isel.
 170   3. GCC emits:
 171         tst     r1, #256
 172      we emit:
 173         mov r1, #1
 174         lsl r1, r1, #8
 175         tst r2, r1
 176
 177
 178 //===---------------------------------------------------------------------===//
 179
 180 When spilling in thumb mode and the sp offset is too large to fit in the ldr /
 181 str offset field, we load the offset from a constpool entry and add it to sp:
 182
 183 ldr r2, LCPI
 184 add r2, sp
 185 ldr r2, [r2]
 186
 187 These instructions preserve the condition code, which is important if the spill
 188 is between a cmp and a bcc instruction. However, we can use a (potentially)
 189 cheaper sequence if we know it's ok to clobber the condition register.
 190
 191 add r2, sp, #255 * 4
 192 add r2, #132
 193 ldr r2, [r2, #7 * 4]
 194
 195 This is especially bad when dynamic alloca is used. All the fixed size stack
 196 objects are referenced off the frame pointer with negative offsets. See
 197 oggenc for an example.
 198
 199 //===---------------------------------------------------------------------===//
 200
 201 We are reserving R3 as a scratch register under thumb mode. So if it is live in
 202 to the function, we save / restore R3 to / from R12. Until register scavenging
 203 is done, we should save R3 to a high callee saved reg at emitPrologue time
 204 (when hasFP is true or stack size is large) and restore R3 from that register
 205 instead. This allows us to at least get rid of the save to r12 every time it is
 206 used.
 207
 208 //===---------------------------------------------------------------------===//
 209
 210 Poor codegen for test/CodeGen/ARM/select.ll f7:
 211
 212         ldr r5, LCPI1_0
 213 LPC0:
 214         add r5, pc
 215         ldr r6, LCPI1_1
 216         ldr r2, LCPI1_2
 217         cpy r3, r6
 218         cpy lr, pc
 219         bx r5
 220
 221 //===---------------------------------------------------------------------===//
 222
 223 Make register allocator / spiller smarter so we can re-materialize "mov r, imm",
 224 etc. Almost all Thumb instructions clobber condition code.
 225
 226 //===---------------------------------------------------------------------===//
 227
 228 Add ldmia, stmia support.