llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
   2 ; RUN: llc < %s -mtriple thumbv8.1m.main-arm-none-eabihf -mattr=+mve | FileCheck %s
   3
   4 declare void @use_int32x4_t(<4 x i32>)
   5
   6 ; A 256-bit addition, with each half of the result passed to a function call,
   7 ; so that the carry bit has to be spilled out of FPSCR across the first call.
   8 define void @add_256(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x i32> %b_high) {
   9 ; CHECK-LABEL: add_256:
  10 ; CHECK:       @ %bb.0: @ %entry
  11 ; CHECK-NEXT:    .save {r7, lr}
  12 ; CHECK-NEXT:    push {r7, lr}
  13 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
  14 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
  15 ; CHECK-NEXT:    .pad #8
  16 ; CHECK-NEXT:    sub sp, #8
  17 ; CHECK-NEXT:    vadci.i32 q0, q0, q2
  18 ; CHECK-NEXT:    vmov q4, q3
  19 ; CHECK-NEXT:    vmov q5, q1
  20 ; CHECK-NEXT:    vstr fpscr_nzcvqc, [sp, #4] @ 4-byte Spill
  21 ; CHECK-NEXT:    bl use_int32x4_t
  22 ; CHECK-NEXT:    vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
  23 ; CHECK-NEXT:    vadc.i32 q0, q5, q4
  24 ; CHECK-NEXT:    add sp, #8
  25 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
  26 ; CHECK-NEXT:    pop.w {r7, lr}
  27 ; CHECK-NEXT:    b use_int32x4_t
  28 entry: ; low and high halves are added by two chained vadc calls, with a call to @use_int32x4_t in between
  29   %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0) ; carry-in is the constant 0; the assertions above expect vadci.i32 for this call
  30   %carry = extractvalue { <4 x i32>, i32 } %adc_low, 1 ; element 1 of the result is the i32 carry-out, passed to the high-half vadc below
  31   %result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0 ; element 0 is the <4 x i32> sum of the low halves
  32   tail call void @use_int32x4_t(<4 x i32> %result_low) ; sits between the two vadc calls: the assertions above expect fpscr_nzcvqc to be stored before this call and reloaded after it
  33   %adc_high = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_high, <4 x i32> %b_high, i32 %carry) ; carry-in is %carry from the low half; expected as vadc.i32
  34   %result_high = extractvalue { <4 x i32>, i32 } %adc_high, 0 ; only the sum is used; the carry-out of the high half is ignored
  35   tail call void @use_int32x4_t(<4 x i32> %result_high) ; expected to be emitted as a tail-call branch (b use_int32x4_t)
  36   ret void
  37 }
  38
  39 ; A 256-bit subtraction, with each half of the result passed to a function call,
  40 ; so that the carry bit has to be spilled out of FPSCR across the first call.
  41 define void @sub_256(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x i32> %b_high) {
  42 ; CHECK-LABEL: sub_256:
  43 ; CHECK:       @ %bb.0: @ %entry
  44 ; CHECK-NEXT:    .save {r7, lr}
  45 ; CHECK-NEXT:    push {r7, lr}
  46 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
  47 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
  48 ; CHECK-NEXT:    .pad #8
  49 ; CHECK-NEXT:    sub sp, #8
  50 ; CHECK-NEXT:    vsbci.i32 q0, q0, q2
  51 ; CHECK-NEXT:    vmov q4, q3
  52 ; CHECK-NEXT:    vmov q5, q1
  53 ; CHECK-NEXT:    vstr fpscr_nzcvqc, [sp, #4] @ 4-byte Spill
  54 ; CHECK-NEXT:    bl use_int32x4_t
  55 ; CHECK-NEXT:    vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
  56 ; CHECK-NEXT:    vsbc.i32 q0, q5, q4
  57 ; CHECK-NEXT:    add sp, #8
  58 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
  59 ; CHECK-NEXT:    pop.w {r7, lr}
  60 ; CHECK-NEXT:    b use_int32x4_t
  61 entry: ; low and high halves are subtracted by two chained vsbc calls, with a call to @use_int32x4_t in between; note the values keep %adc_* names although the intrinsic is vsbc
  62   %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 536870912) ; carry-in 536870912 == 1 << 29 (0x20000000), presumably the FPSCR carry-flag position (confirm against the Arm ARM); the assertions above expect vsbci.i32 for this call
  63   %carry = extractvalue { <4 x i32>, i32 } %adc_low, 1 ; element 1 of the result is the i32 carry-out, passed to the high-half vsbc below
  64   %result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0 ; element 0 is the <4 x i32> difference of the low halves
  65   tail call void @use_int32x4_t(<4 x i32> %result_low) ; sits between the two vsbc calls: the assertions above expect fpscr_nzcvqc to be stored before this call and reloaded after it
  66   %adc_high = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_high, <4 x i32> %b_high, i32 %carry) ; carry-in is %carry from the low half; expected as vsbc.i32
  67   %result_high = extractvalue { <4 x i32>, i32 } %adc_high, 0 ; only the difference is used; the carry-out of the high half is ignored
  68   tail call void @use_int32x4_t(<4 x i32> %result_high) ; expected to be emitted as a tail-call branch (b use_int32x4_t)
  69   ret void
  70 }
  71
  72 ; The carry-out of the first VADC intrinsic call is used by two other VADCs,
  73 ; both of which will modify FPSCR, so it must be spilled and reloaded.
  74 ; Missed optimisation: the first VLDR isn't needed, because the carry bit is
  75 ; already in FPSCR.
  76 define <4 x i32> @multiple_uses_of_carry_bit(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x i32> %b_high, <4 x i32> %a_high_2, <4 x i32> %b_high_2) {
  77 ; CHECK-LABEL: multiple_uses_of_carry_bit:
  78 ; CHECK:       @ %bb.0: @ %entry
  79 ; CHECK-NEXT:    .pad #8
  80 ; CHECK-NEXT:    sub sp, #8
  81 ; CHECK-NEXT:    vadci.i32 q0, q0, q2
  82 ; CHECK-NEXT:    add r0, sp, #24
  83 ; CHECK-NEXT:    vstr fpscr_nzcvqc, [sp, #4] @ 4-byte Spill
  84 ; CHECK-NEXT:    vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
  85 ; CHECK-NEXT:    vadc.i32 q1, q1, q3
  86 ; CHECK-NEXT:    veor q0, q0, q1
  87 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
  88 ; CHECK-NEXT:    add r0, sp, #8
  89 ; CHECK-NEXT:    vldr fpscr_nzcvqc, [sp, #4] @ 4-byte Reload
  90 ; CHECK-NEXT:    vldrw.u32 q2, [r0]
  91 ; CHECK-NEXT:    vadc.i32 q1, q2, q1
  92 ; CHECK-NEXT:    veor q0, q0, q1
  93 ; CHECK-NEXT:    add sp, #8
  94 ; CHECK-NEXT:    bx lr
  95 entry: ; one vadc produces %carry, which is then consumed by two further vadc calls; this function contains no calls to other functions
  96   %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0) ; carry-in is the constant 0; the assertions above expect vadci.i32 for this call
  97   %carry = extractvalue { <4 x i32>, i32 } %adc_low, 1 ; i32 carry-out, used twice below (by %adc_high and by %adc_high_2)
  98   %result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0 ; element 0 is the <4 x i32> sum of the low halves
  99   %adc_high = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_high, <4 x i32> %b_high, i32 %carry) ; first use of %carry; the assertions above expect a store of fpscr_nzcvqc immediately followed by a reload before this vadc
 100   %result_high = extractvalue { <4 x i32>, i32 } %adc_high, 0 ; only the sum is used; this call's own carry-out is ignored
 101   %checksum_1 = xor <4 x i32> %result_low, %result_high ; combines the low result with the first high result
 102   %adc_high_2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a_high_2, <4 x i32> %b_high_2, i32 %carry) ; second use of the same %carry; the assertions above expect a second reload of fpscr_nzcvqc before this vadc
 103   %result_high_2 = extractvalue { <4 x i32>, i32 } %adc_high_2, 0 ; only the sum is used; this call's own carry-out is ignored
 104   %checksum_2 = xor <4 x i32> %checksum_1, %result_high_2 ; folds the second high result in, so all three vadc sums feed the return value
 105   ret <4 x i32> %checksum_2
 106 }