llvm/test/CodeGen/AArch64/vldn_shuffle.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=aarch64-none-eabif | FileCheck %s
   3
   4 define void @vld2(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
     ; Factor-2 case: each iteration loads an <8 x float> from %pSrc, squares it,
     ; adds the even lanes (0,2,4,6) to the odd lanes (1,3,5,7) and stores the
     ; resulting <4 x float> to %pDst.
     ; The trip count is fixed by the compare against 1024; %numSamples is not
     ; read anywhere in the body.
     ; The autogenerated assertions below expect the loop body to be exactly a
     ; post-incremented ld2, an fmul/fmla pair, a str and the loop control.
   5 ; CHECK-LABEL: vld2:
   6 ; CHECK:       // %bb.0: // %entry
   7 ; CHECK-NEXT:    mov x8, xzr
   8 ; CHECK-NEXT:  .LBB0_1: // %vector.body
   9 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
  10 ; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x0], #32
  11 ; CHECK-NEXT:    fmul v2.4s, v0.4s, v0.4s
  12 ; CHECK-NEXT:    fmla v2.4s, v1.4s, v1.4s
  13 ; CHECK-NEXT:    str q2, [x1, x8]
  14 ; CHECK-NEXT:    add x8, x8, #16
  15 ; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
  16 ; CHECK-NEXT:    b.ne .LBB0_1
  17 ; CHECK-NEXT:  // %bb.2: // %while.end
  18 ; CHECK-NEXT:    ret
  19 entry:
  20   br label %vector.body
  21
  22 vector.body:                                      ; preds = %vector.body, %entry
  23   %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
     ; Source offset is 2 * %index floats; destination offset is %index floats.
  24   %0 = shl i64 %index, 1
  25   %next.gep = getelementptr float, float* %pSrc, i64 %0
  26   %next.gep19 = getelementptr float, float* %pDst, i64 %index
  27   %1 = bitcast float* %next.gep to <8 x float>*
  28   %wide.vec = load <8 x float>, <8 x float>* %1, align 4
     ; The same wide square is written out twice (%2 and %4); each copy feeds
     ; one strided shuffle: %3 takes the even lanes, %5 takes the odd lanes.
  29   %2 = fmul fast <8 x float> %wide.vec, %wide.vec
  30   %3 = shufflevector <8 x float> %2, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  31   %4 = fmul fast <8 x float> %wide.vec, %wide.vec
  32   %5 = shufflevector <8 x float> %4, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  33   %6 = fadd fast <4 x float> %5, %3
  34   %7 = bitcast float* %next.gep19 to <4 x float>*
  35   store <4 x float> %6, <4 x float>* %7, align 4
     ; Four output floats per iteration; the loop exits once %index.next reaches 1024.
  36   %index.next = add i64 %index, 4
  37   %8 = icmp eq i64 %index.next, 1024
  38   br i1 %8, label %while.end, label %vector.body
  39
  40 while.end:                                        ; preds = %vector.body
  41   ret void
  42 }
  43
  44 define void @vld3(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
     ; Factor-3 case: each iteration loads a <12 x float> from %pSrc, squares it,
     ; and sums the three stride-3 lane groups (0,3,6,9), (1,4,7,10) and
     ; (2,5,8,11) into one <4 x float> that is stored to %pDst.
     ; The trip count is fixed by the compare against 1024; %numSamples is not
     ; read anywhere in the body.
     ; The autogenerated assertions below expect the loop body to be exactly a
     ; post-incremented ld3, one fmul plus two fmla, a str and the loop control.
  45 ; CHECK-LABEL: vld3:
  46 ; CHECK:       // %bb.0: // %entry
  47 ; CHECK-NEXT:    mov x8, xzr
  48 ; CHECK-NEXT:  .LBB1_1: // %vector.body
  49 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
  50 ; CHECK-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
  51 ; CHECK-NEXT:    fmul v3.4s, v0.4s, v0.4s
  52 ; CHECK-NEXT:    fmla v3.4s, v1.4s, v1.4s
  53 ; CHECK-NEXT:    fmla v3.4s, v2.4s, v2.4s
  54 ; CHECK-NEXT:    str q3, [x1, x8]
  55 ; CHECK-NEXT:    add x8, x8, #16
  56 ; CHECK-NEXT:    cmp x8, #1, lsl #12 // =4096
  57 ; CHECK-NEXT:    b.ne .LBB1_1
  58 ; CHECK-NEXT:  // %bb.2: // %while.end
  59 ; CHECK-NEXT:    ret
  60 entry:
  61   br label %vector.body
  62
  63 vector.body:                                      ; preds = %vector.body, %entry
  64   %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
     ; Source offset is 3 * %index floats; destination offset is %index floats.
  65   %0 = mul i64 %index, 3
  66   %next.gep = getelementptr float, float* %pSrc, i64 %0
  67   %next.gep23 = getelementptr float, float* %pDst, i64 %index
  68   %1 = bitcast float* %next.gep to <12 x float>*
  69   %wide.vec = load <12 x float>, <12 x float>* %1, align 4
     ; The same wide square is written out three times (%2, %4, %7); each copy
     ; feeds one stride-3 shuffle starting at lane 0, 1 and 2 respectively.
  70   %2 = fmul fast <12 x float> %wide.vec, %wide.vec
  71   %3 = shufflevector <12 x float> %2, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  72   %4 = fmul fast <12 x float> %wide.vec, %wide.vec
  73   %5 = shufflevector <12 x float> %4, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  74   %6 = fadd fast <4 x float> %5, %3
  75   %7 = fmul fast <12 x float> %wide.vec, %wide.vec
  76   %8 = shufflevector <12 x float> %7, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  77   %9 = fadd fast <4 x float> %6, %8
  78   %10 = bitcast float* %next.gep23 to <4 x float>*
  79   store <4 x float> %9, <4 x float>* %10, align 4
     ; Four output floats per iteration; the loop exits once %index.next reaches 1024.
  80   %index.next = add i64 %index, 4
  81   %11 = icmp eq i64 %index.next, 1024
  82   br i1 %11, label %while.end, label %vector.body
  83
  84 while.end:                                        ; preds = %vector.body
  85   ret void
  86 }
  87
  88 define void @vld4(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
     ; Factor-4 case: each iteration loads a <16 x float> from %pSrc and squares
     ; it. The stride-4 lane groups starting at lanes 0 and 1 are summed into
     ; %7, those starting at lanes 2 and 3 into %12, and the two sums are
     ; interleaved element by element into an <8 x float> stored to %pDst.
     ; The trip count is fixed by the compare against 1024; %numSamples is not
     ; read anywhere in the body.
     ; The autogenerated assertions below expect a post-incremented ld4, two
     ; fmul/fmla pairs and a single st2 for the interleaved store.
  89 ; CHECK-LABEL: vld4:
  90 ; CHECK:       // %bb.0: // %entry
  91 ; CHECK-NEXT:    mov x8, xzr
  92 ; CHECK-NEXT:  .LBB2_1: // %vector.body
  93 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
  94 ; CHECK-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
  95 ; CHECK-NEXT:    add x9, x1, x8
  96 ; CHECK-NEXT:    add x8, x8, #32
  97 ; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
  98 ; CHECK-NEXT:    fmul v4.4s, v0.4s, v0.4s
  99 ; CHECK-NEXT:    fmla v4.4s, v1.4s, v1.4s
 100 ; CHECK-NEXT:    fmul v5.4s, v2.4s, v2.4s
 101 ; CHECK-NEXT:    fmla v5.4s, v3.4s, v3.4s
 102 ; CHECK-NEXT:    st2 { v4.4s, v5.4s }, [x9]
 103 ; CHECK-NEXT:    b.ne .LBB2_1
 104 ; CHECK-NEXT:  // %bb.2: // %while.end
 105 ; CHECK-NEXT:    ret
 106 entry:
 107   br label %vector.body
 108
 109 vector.body:                                      ; preds = %vector.body, %entry
 110   %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
     ; Source offset is 4 * %index floats (%0); destination offset is
     ; 2 * %index floats (%1, consumed by the getelementptr at %13).
 111   %0 = shl i64 %index, 2
 112   %next.gep = getelementptr float, float* %pSrc, i64 %0
 113   %1 = shl i64 %index, 1
 114   %2 = bitcast float* %next.gep to <16 x float>*
 115   %wide.vec = load <16 x float>, <16 x float>* %2, align 4
     ; The same wide square is written out four times (%3, %5, %8, %10); each
     ; copy feeds one stride-4 shuffle starting at lane 0, 1, 2 and 3.
 116   %3 = fmul fast <16 x float> %wide.vec, %wide.vec
 117   %4 = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 118   %5 = fmul fast <16 x float> %wide.vec, %wide.vec
 119   %6 = shufflevector <16 x float> %5, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
 120   %7 = fadd fast <4 x float> %6, %4
 121   %8 = fmul fast <16 x float> %wide.vec, %wide.vec
 122   %9 = shufflevector <16 x float> %8, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
 123   %10 = fmul fast <16 x float> %wide.vec, %wide.vec
 124   %11 = shufflevector <16 x float> %10, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
 125   %12 = fadd fast <4 x float> %11, %9
 126   %13 = getelementptr inbounds float, float* %pDst, i64 %1
 127   %14 = bitcast float* %13 to <8 x float>*
     ; Mask <0,4,1,5,2,6,3,7> alternates elements of %7 and %12.
 128   %interleaved.vec = shufflevector <4 x float> %7, <4 x float> %12, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 129   store <8 x float> %interleaved.vec, <8 x float>* %14, align 4
     ; %index advances by 4 per iteration (8 floats stored each time); the loop
     ; exits once %index.next reaches 1024.
 130   %index.next = add i64 %index, 4
 131   %15 = icmp eq i64 %index.next, 1024
 132   br i1 %15, label %while.end, label %vector.body
 133
 134 while.end:                                        ; preds = %vector.body
 135   ret void
 136 }
 137
 138 define void @twosrc(float* nocapture readonly %pSrc, float* nocapture readonly %pSrc2, float* noalias nocapture %pDst, i32 %numSamples) {
     ; Two-source factor-2 case: each iteration loads an <8 x float> from %pSrc
     ; and another from %pSrc2, multiplies them elementwise, then adds the even
     ; lanes (0,2,4,6) of the product to its odd lanes (1,3,5,7) and stores the
     ; resulting <4 x float> to %pDst.
     ; The trip count is fixed by the compare against 1024; %numSamples is not
     ; read anywhere in the body.
     ; The autogenerated assertions below expect two ld2 instructions (addresses
     ; formed with explicit adds), an fmul/fmla pair and a post-incremented str.
 139 ; CHECK-LABEL: twosrc:
 140 ; CHECK:       // %bb.0: // %entry
 141 ; CHECK-NEXT:    mov x8, xzr
 142 ; CHECK-NEXT:  .LBB3_1: // %vector.body
 143 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 144 ; CHECK-NEXT:    add x9, x0, x8
 145 ; CHECK-NEXT:    add x10, x1, x8
 146 ; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x9]
 147 ; CHECK-NEXT:    ld2 { v2.4s, v3.4s }, [x10]
 148 ; CHECK-NEXT:    add x8, x8, #32
 149 ; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
 150 ; CHECK-NEXT:    fmul v4.4s, v2.4s, v0.4s
 151 ; CHECK-NEXT:    fmla v4.4s, v1.4s, v3.4s
 152 ; CHECK-NEXT:    str q4, [x2], #16
 153 ; CHECK-NEXT:    b.ne .LBB3_1
 154 ; CHECK-NEXT:  // %bb.2: // %while.end
 155 ; CHECK-NEXT:    ret
 156 entry:
 157   br label %vector.body
 158
 159 vector.body:                                      ; preds = %vector.body, %entry
 160   %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
     ; Both sources are addressed at 2 * %index floats (the shift is written
     ; out once per source as %0 and %1); the destination is at %index floats.
 161   %0 = shl i64 %index, 1
 162   %next.gep = getelementptr float, float* %pSrc, i64 %0
 163   %1 = shl i64 %index, 1
 164   %next.gep23 = getelementptr float, float* %pSrc2, i64 %1
 165   %next.gep24 = getelementptr float, float* %pDst, i64 %index
 166   %2 = bitcast float* %next.gep to <8 x float>*
 167   %wide.vec = load <8 x float>, <8 x float>* %2, align 4
 168   %3 = bitcast float* %next.gep23 to <8 x float>*
 169   %wide.vec26 = load <8 x float>, <8 x float>* %3, align 4
     ; The same wide product is written out twice (%4 and %6); each copy feeds
     ; one strided shuffle: %5 takes the even lanes, %7 takes the odd lanes.
 170   %4 = fmul fast <8 x float> %wide.vec26, %wide.vec
 171   %5 = shufflevector <8 x float> %4, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 172   %6 = fmul fast <8 x float> %wide.vec26, %wide.vec
 173   %7 = shufflevector <8 x float> %6, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 174   %8 = fadd fast <4 x float> %7, %5
 175   %9 = bitcast float* %next.gep24 to <4 x float>*
 176   store <4 x float> %8, <4 x float>* %9, align 4
     ; Four output floats per iteration; the loop exits once %index.next reaches 1024.
 177   %index.next = add i64 %index, 4
 178   %10 = icmp eq i64 %index.next, 1024
 179   br i1 %10, label %while.end, label %vector.body
 180
 181 while.end:                                        ; preds = %vector.body
 182   ret void
 183 }