llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc-multiple.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: opt -instcombine -mtriple=thumbv8.1m.main -S %s | FileCheck --check-prefix=IR %s
   3 ; RUN: opt -instcombine -mtriple=thumbv8.1m.main    %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -O3 -o - | FileCheck --check-prefix=ASM %s
   4
   5 %struct.foo = type { [2 x <4 x i32>] } ; one field: an array of two <4 x i32> vectors; the tests extract elements 0 and 1 separately
   6
   7 define arm_aapcs_vfpcc i32 @test_vadciq_multiple(%struct.foo %a, %struct.foo %b, i32 %carry) {
     ; Chains two unpredicated llvm.arm.mve.vadc calls. The first call takes
     ; the element-0 vectors of %a and %b with %carry (shifted to bit 29) as
     ; its i32 flag operand; the second takes the element-1 vectors and the
     ; carry bit recovered from the first call's i32 result. The function
     ; returns bit 29 of the second call's i32 result, moved down to bit 0.
   8 entry:
       ; Split each struct's [2 x <4 x i32>] array into its two vectors.
   9   %a.0 = extractvalue %struct.foo %a, 0, 0
  10   %a.1 = extractvalue %struct.foo %a, 0, 1
  11   %b.0 = extractvalue %struct.foo %b, 0, 0
  12   %b.1 = extractvalue %struct.foo %b, 0, 1
  13
       ; Shift the incoming carry up to bit 29, the position this test uses
       ; for the carry inside the "fpscr-formatted" i32 flag value.
  14   %fpscr.in.0 = shl i32 %carry, 29
  15   %outpair.0 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a.0, <4 x i32> %b.0, i32 %fpscr.in.0)
       ; Element 1 of the returned pair is the i32 flag word.
  16   %fpscr.out.0 = extractvalue { <4 x i32>, i32 } %outpair.0, 1
       ; Reduce the flag word to a single carry bit at bit 0 ...
  17   %shifted.out.0 = lshr i32 %fpscr.out.0, 29
  18   %carry.out.0 = and i32 1, %shifted.out.0
       ; ... then shift it straight back to bit 29 for the second call. The
       ; checks later in this file expect InstCombine to delete this
       ; lshr/and/shl round trip and pass %fpscr.out.0 through directly.
  19   %fpscr.in.1 = shl i32 %carry.out.0, 29
  20   %outpair.1 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a.1, <4 x i32> %b.1, i32 %fpscr.in.1)
       ; Only the flag word of the second call is used; its vector result is
       ; never read. Bit 29 is returned as a 0/1 value.
  21   %fpscr.out.1 = extractvalue { <4 x i32>, i32 } %outpair.1, 1
  22   %shifted.out.1 = lshr i32 %fpscr.out.1, 29
  23   %carry.out.1 = and i32 1, %shifted.out.1
  24   ret i32 %carry.out.1
  25 }
  26
  27 define arm_aapcs_vfpcc i32 @test_vadciq_pred_multiple(%struct.foo %a, %struct.foo %b, i32 %ipred, i32 %carry) {
     ; Predicated variant of the chained-carry test: two calls to
     ; llvm.arm.mve.vadc.predicated share one <4 x i1> predicate built from
     ; %ipred. The first call's flag operand is %carry shifted to bit 29; the
     ; second call's flag operand is rebuilt from bit 29 of the first call's
     ; i32 result. The function returns bit 29 of the second call's i32
     ; result, moved down to bit 0.
  28 entry:
       ; Split each struct's [2 x <4 x i32>] array into its two vectors.
  29   %a.0 = extractvalue %struct.foo %a, 0, 0
  30   %a.1 = extractvalue %struct.foo %a, 0, 1
  31   %b.0 = extractvalue %struct.foo %b, 0, 0
  32   %b.1 = extractvalue %struct.foo %b, 0, 1
  33
       ; Convert the i32 predicate argument into the <4 x i1> form that both
       ; predicated calls below receive as their last operand.
  34   %vpred = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %ipred)
       ; Shift the incoming carry up to bit 29, the position this test uses
       ; for the carry inside the "fpscr-formatted" i32 flag value.
  35   %fpscr.in.0 = shl i32 %carry, 29
       ; The leading vector operand is undef in both calls.
  36   %outpair.0 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.predicated.v4i32.v4i1(<4 x i32> undef, <4 x i32> %a.0, <4 x i32> %b.0, i32 %fpscr.in.0, <4 x i1> %vpred)
       ; Element 1 of the returned pair is the i32 flag word.
  37   %fpscr.out.0 = extractvalue { <4 x i32>, i32 } %outpair.0, 1
       ; Reduce the flag word to a single carry bit at bit 0 ...
  38   %shifted.out.0 = lshr i32 %fpscr.out.0, 29
  39   %carry.out.0 = and i32 1, %shifted.out.0
       ; ... then shift it straight back to bit 29 for the second call. The
       ; checks later in this file expect InstCombine to delete this
       ; lshr/and/shl round trip and pass %fpscr.out.0 through directly.
  40   %fpscr.in.1 = shl i32 %carry.out.0, 29
  41   %outpair.1 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.predicated.v4i32.v4i1(<4 x i32> undef, <4 x i32> %a.1, <4 x i32> %b.1, i32 %fpscr.in.1, <4 x i1> %vpred)
       ; Only the flag word of the second call is used; its vector result is
       ; never read. Bit 29 is returned as a 0/1 value.
  42   %fpscr.out.1 = extractvalue { <4 x i32>, i32 } %outpair.1, 1
  43   %shifted.out.1 = lshr i32 %fpscr.out.1, 29
  44   %carry.out.1 = and i32 1, %shifted.out.1
  45   ret i32 %carry.out.1
  46 }
  47
  48 declare { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32>, <4 x i32>, i32) ; as called above: two vector inputs, then the i32 flag word with the carry at bit 29; returns { vector, i32 flag word }
  49 declare { <4 x i32>, i32 } @llvm.arm.mve.vadc.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i32>, i32, <4 x i1>) ; as called above: a leading vector (passed as undef), two vector inputs, the i32 flag word, then the <4 x i1> predicate
  50 declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) ; turns an i32 predicate value into a <4 x i1> vector predicate
  51
  52 ; Expect the transformation in between the two intrinsics, where the
  53 ; fpscr-formatted output value is turned back into just the carry bit
  54 ; at bit 0 and then back again for the next call, to be optimized away
  55 ; completely in InstCombine, so that the FPSCR output from one
  56 ; intrinsic is passed straight on to the next:
  57
  58 ; IR: %outpair.0 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a.0, <4 x i32> %b.0, i32 %fpscr.in.0)
  59 ; IR: %fpscr.out.0 = extractvalue { <4 x i32>, i32 } %outpair.0, 1
  60 ; IR: %outpair.1 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> %a.1, <4 x i32> %b.1, i32 %fpscr.out.0)
  61
  62 ; IR: %outpair.0 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.predicated.v4i32.v4i1(<4 x i32> undef, <4 x i32> %a.0, <4 x i32> %b.0, i32 %fpscr.in.0, <4 x i1> %vpred)
  63 ; IR: %fpscr.out.0 = extractvalue { <4 x i32>, i32 } %outpair.0, 1
  64 ; IR: %outpair.1 = call { <4 x i32>, i32 } @llvm.arm.mve.vadc.predicated.v4i32.v4i1(<4 x i32> undef, <4 x i32> %a.1, <4 x i32> %b.1, i32 %fpscr.out.0, <4 x i1> %vpred)
  65
  66 ; And this is the assembly language we expect at the end of it, with
  67 ; the two vadc.i32 instructions right next to each other, and the
  68 ; second one implicitly reusing the FPSCR written by the first.
  69
  70 ; ASM: test_vadciq_multiple:
  71 ; ASM:      lsls r0, r0, #29
  72 ; ASM-NEXT: vmsr fpscr_nzcvqc, r0
  73 ; ASM-NEXT: vadc.i32 q0, q0, q2
  74 ; ASM-NEXT: vadc.i32 q0, q1, q3
  75 ; ASM-NEXT: vmrs r0, fpscr_nzcvqc
  76 ; ASM-NEXT: ubfx r0, r0, #29, #1
  77 ; ASM-NEXT: bx lr
  78
  79 ; ASM: test_vadciq_pred_multiple:
  80 ; ASM: lsls r1, r1, #29
  81 ; ASM-NEXT: vmsr p0, r0
  82 ; ASM-NEXT: vmsr fpscr_nzcvqc, r1
  83 ; ASM-NEXT: vpstt
  84 ; ASM-NEXT: vadct.i32 q0, q0, q2
  85 ; ASM-NEXT: vadct.i32 q0, q1, q3
  86 ; ASM-NEXT: vmrs r0, fpscr_nzcvqc
  87 ; ASM-NEXT: ubfx r0, r0, #29, #1
  88 ; ASM-NEXT: bx lr