llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s --mattr=+complxnum,+neon -o - | FileCheck %s
   3
   4 target triple = "aarch64"
   5 ; Expected to transform: both complex multiplies are selected as fcmla #0/#90 pairs, and the a * b result feeds both the store and the second multiply.
   6 ;   *p = (a * b);
   7 ;   return (a * b) * a;
   8 define <4 x float> @mul_triangle(<4 x float> %a, <4 x float> %b, ptr %p) {
   9 ; CHECK-LABEL: mul_triangle:
  10 ; CHECK:       // %bb.0: // %entry
  11 ; CHECK-NEXT:    movi v3.2d, #0000000000000000
  12 ; CHECK-NEXT:    movi v2.2d, #0000000000000000
  13 ; CHECK-NEXT:    fcmla v3.4s, v0.4s, v1.4s, #0
  14 ; CHECK-NEXT:    fcmla v3.4s, v0.4s, v1.4s, #90
  15 ; CHECK-NEXT:    fcmla v2.4s, v3.4s, v0.4s, #0
  16 ; CHECK-NEXT:    str q3, [x0]
  17 ; CHECK-NEXT:    fcmla v2.4s, v3.4s, v0.4s, #90
  18 ; CHECK-NEXT:    mov v0.16b, v2.16b
  19 ; CHECK-NEXT:    ret
  20 entry:
  21   %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>  ; a.re (even lanes)
  22   %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>  ; a.im (odd lanes)
  23   %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>  ; b.re
  24   %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>  ; b.im
  25   %0 = fmul fast <2 x float> %strided.vec37, %strided.vec  ; b.re * a.re
  26   %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35  ; b.im * a.im
  27   %2 = fsub fast <2 x float> %0, %1  ; re(a * b)
  28   %3 = fmul fast <2 x float> %2, %strided.vec35  ; re(a * b) * a.im
  29   %4 = fmul fast <2 x float> %strided.vec38, %strided.vec  ; b.im * a.re
  30   %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37  ; a.im * b.re
  31   %6 = fadd fast <2 x float> %4, %5  ; im(a * b)
  32   %otheruse = shufflevector <2 x float> %2, <2 x float> %6, <4 x i32> <i32 0, i32 2, i32 1, i32 3>  ; re-interleave a * b as {re0, im0, re1, im1}
  33   store <4 x float> %otheruse, ptr %p  ; *p = a * b (second use of the product, as a full interleaved vector)
  34   %7 = fmul fast <2 x float> %6, %strided.vec  ; im(a * b) * a.re
  35   %8 = fadd fast <2 x float> %3, %7  ; im((a * b) * a)
  36   %9 = fmul fast <2 x float> %2, %strided.vec  ; re(a * b) * a.re
  37   %10 = fmul fast <2 x float> %6, %strided.vec35  ; im(a * b) * a.im
  38   %11 = fsub fast <2 x float> %9, %10  ; re((a * b) * a)
  39   %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>  ; re-interleave (a * b) * a
  40   ret <4 x float> %interleaved.vec
  41 }
  42
  43 ; Expected to not transform. Shows that external use prevents deinterleaving.
  44 ;   *p = (a * b).imag();
  45 ;   return (a * b) * a;
  46 define <4 x float> @mul_triangle_external_use(<4 x float> %a, <4 x float> %b, ptr %p) {
  47 ; CHECK-LABEL: mul_triangle_external_use:
  48 ; CHECK:       // %bb.0: // %entry
  49 ; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
  50 ; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
  51 ; CHECK-NEXT:    zip2 v4.2s, v0.2s, v2.2s
  52 ; CHECK-NEXT:    zip1 v5.2s, v1.2s, v3.2s
  53 ; CHECK-NEXT:    zip1 v0.2s, v0.2s, v2.2s
  54 ; CHECK-NEXT:    zip2 v1.2s, v1.2s, v3.2s
  55 ; CHECK-NEXT:    fmul v2.2s, v4.2s, v5.2s
  56 ; CHECK-NEXT:    fmul v3.2s, v1.2s, v4.2s
  57 ; CHECK-NEXT:    fmla v2.2s, v0.2s, v1.2s
  58 ; CHECK-NEXT:    fneg v1.2s, v3.2s
  59 ; CHECK-NEXT:    fmul v3.2s, v2.2s, v4.2s
  60 ; CHECK-NEXT:    str d2, [x0]
  61 ; CHECK-NEXT:    fmla v1.2s, v0.2s, v5.2s
  62 ; CHECK-NEXT:    fmul v5.2s, v2.2s, v0.2s
  63 ; CHECK-NEXT:    fneg v3.2s, v3.2s
  64 ; CHECK-NEXT:    fmla v5.2s, v4.2s, v1.2s
  65 ; CHECK-NEXT:    fmla v3.2s, v0.2s, v1.2s
  66 ; CHECK-NEXT:    zip1 v0.4s, v3.4s, v5.4s
  67 ; CHECK-NEXT:    ret
  68 entry:
  69   %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>  ; a.re (even lanes)
  70   %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>  ; a.im (odd lanes)
  71   %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>  ; b.re
  72   %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>  ; b.im
  73   %0 = fmul fast <2 x float> %strided.vec37, %strided.vec  ; b.re * a.re
  74   %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35  ; b.im * a.im
  75   %2 = fsub fast <2 x float> %0, %1  ; re(a * b)
  76   %3 = fmul fast <2 x float> %2, %strided.vec35  ; re(a * b) * a.im
  77   %4 = fmul fast <2 x float> %strided.vec38, %strided.vec  ; b.im * a.re
  78   %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37  ; a.im * b.re
  79   %6 = fadd fast <2 x float> %4, %5  ; im(a * b)
  80   store <2 x float> %6, ptr %p  ; external use: only the imaginary half of a * b is stored, with no interleaving shuffle
  81   %7 = fmul fast <2 x float> %6, %strided.vec  ; im(a * b) * a.re
  82   %8 = fadd fast <2 x float> %3, %7  ; im((a * b) * a)
  83   %9 = fmul fast <2 x float> %2, %strided.vec  ; re(a * b) * a.re
  84   %10 = fmul fast <2 x float> %6, %strided.vec35  ; im(a * b) * a.im
  85   %11 = fsub fast <2 x float> %9, %10  ; re((a * b) * a)
  86   %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>  ; re-interleave (a * b) * a as {re0, im0, re1, im1}
  87   ret <4 x float> %interleaved.vec
  88 }
  89
  90 ; Expected to transform partially (only d * c). Shows that external use of shufflevector does not prevent deinterleaving.
  91 ;   *p1 = (a * b).real();
  92 ;   *p2 = (a * b) * c;
  93 ;   return d * c;
  94 define <4 x float> @multiple_muls_shuffle_external(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, ptr %p1, ptr %p2) {
  95 ; CHECK-LABEL: multiple_muls_shuffle_external:
  96 ; CHECK:       // %bb.0: // %entry
  97 ; CHECK-NEXT:    ext v5.16b, v0.16b, v0.16b, #8
  98 ; CHECK-NEXT:    ext v6.16b, v1.16b, v1.16b, #8
  99 ; CHECK-NEXT:    ext v4.16b, v2.16b, v2.16b, #8
 100 ; CHECK-NEXT:    zip2 v7.2s, v0.2s, v5.2s
 101 ; CHECK-NEXT:    zip1 v16.2s, v1.2s, v6.2s
 102 ; CHECK-NEXT:    zip2 v1.2s, v1.2s, v6.2s
 103 ; CHECK-NEXT:    zip1 v0.2s, v0.2s, v5.2s
 104 ; CHECK-NEXT:    fmul v5.2s, v16.2s, v7.2s
 105 ; CHECK-NEXT:    fmul v6.2s, v1.2s, v7.2s
 106 ; CHECK-NEXT:    fmla v5.2s, v0.2s, v1.2s
 107 ; CHECK-NEXT:    fneg v1.2s, v6.2s
 108 ; CHECK-NEXT:    zip1 v6.2s, v2.2s, v4.2s
 109 ; CHECK-NEXT:    zip2 v4.2s, v2.2s, v4.2s
 110 ; CHECK-NEXT:    fmla v1.2s, v0.2s, v16.2s
 111 ; CHECK-NEXT:    fmul v17.2s, v6.2s, v5.2s
 112 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
 113 ; CHECK-NEXT:    fmul v5.2s, v4.2s, v5.2s
 114 ; CHECK-NEXT:    fmla v17.2s, v1.2s, v4.2s
 115 ; CHECK-NEXT:    fcmla v0.4s, v2.4s, v3.4s, #0
 116 ; CHECK-NEXT:    str d1, [x0]
 117 ; CHECK-NEXT:    fneg v16.2s, v5.2s
 118 ; CHECK-NEXT:    fcmla v0.4s, v2.4s, v3.4s, #90
 119 ; CHECK-NEXT:    fmla v16.2s, v1.2s, v6.2s
 120 ; CHECK-NEXT:    st2 { v16.2s, v17.2s }, [x1]
 121 ; CHECK-NEXT:    ret
 122 entry:
 123   %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>  ; a.re (even lanes)
 124   %strided.vec88 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>  ; a.im (odd lanes)
 125   %strided.vec90 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>  ; b.re
 126   %strided.vec91 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>  ; b.im
 127   %0 = fmul fast <2 x float> %strided.vec91, %strided.vec  ; b.im * a.re
 128   %1 = fmul fast <2 x float> %strided.vec90, %strided.vec88  ; b.re * a.im
 129   %2 = fadd fast <2 x float> %0, %1  ; im(a * b)
 130   %3 = fmul fast <2 x float> %strided.vec90, %strided.vec  ; b.re * a.re
 131   %4 = fmul fast <2 x float> %strided.vec91, %strided.vec88  ; b.im * a.im
 132   %5 = fsub fast <2 x float> %3, %4  ; re(a * b)
 133   store <2 x float> %5, ptr %p1  ; *p1 = re(a * b): external use of one half of the product
 134   %strided.vec93 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>  ; c.re, used by both (a * b) * c and d * c
 135   %strided.vec94 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>  ; c.im, used by both (a * b) * c and d * c
 136   %6 = fmul fast <2 x float> %strided.vec94, %5  ; c.im * re(a * b)
 137   %7 = fmul fast <2 x float> %strided.vec93, %2  ; c.re * im(a * b)
 138   %8 = fadd fast <2 x float> %6, %7  ; im((a * b) * c)
 139   %9 = fmul fast <2 x float> %strided.vec93, %5  ; c.re * re(a * b)
 140   %10 = fmul fast <2 x float> %strided.vec94, %2  ; c.im * im(a * b)
 141   %11 = fsub fast <2 x float> %9, %10  ; re((a * b) * c)
 142   %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>  ; re-interleave (a * b) * c as {re0, im0, re1, im1}
 143   store <4 x float> %interleaved.vec, ptr %p2  ; *p2 = (a * b) * c
 144   %strided.vec96 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 0, i32 2>  ; d.re
 145   %strided.vec97 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 1, i32 3>  ; d.im
 146   %12 = fmul fast <2 x float> %strided.vec96, %strided.vec94  ; d.re * c.im
 147   %13 = fmul fast <2 x float> %strided.vec97, %strided.vec93  ; d.im * c.re
 148   %14 = fadd fast <2 x float> %13, %12  ; im(d * c)
 149   %15 = fmul fast <2 x float> %strided.vec96, %strided.vec93  ; d.re * c.re
 150   %16 = fmul fast <2 x float> %strided.vec97, %strided.vec94  ; d.im * c.im
 151   %17 = fsub fast <2 x float> %15, %16  ; re(d * c)
 152   %interleaved.vec98 = shufflevector <2 x float> %17, <2 x float> %14, <4 x i32> <i32 0, i32 2, i32 1, i32 3>  ; re-interleave d * c; this is the multiply selected as fcmla
 153   ret <4 x float> %interleaved.vec98
 154 }
 155
 156 ; Same as above but data are loaded from memory instead of being passed as arguments.
 157 ; Expected to transform partially (only d * c).
 158 ; Shows that ld2 is not generated for `c` although it is used by both complex `d * c` and non-complex `(a * b) * c` instruction chains.
 159 define <4 x float> @multiple_muls_shuffle_external_with_loads(ptr %ptr_a, ptr %ptr_b, ptr %ptr_c, ptr %ptr_d, ptr %p1, ptr %p2) {
 160 ; CHECK-LABEL: multiple_muls_shuffle_external_with_loads:
 161 ; CHECK:       // %bb.0: // %entry
 162 ; CHECK-NEXT:    ld2 { v0.2s, v1.2s }, [x0]
 163 ; CHECK-NEXT:    ld2 { v2.2s, v3.2s }, [x1]
 164 ; CHECK-NEXT:    fmul v4.2s, v3.2s, v1.2s
 165 ; CHECK-NEXT:    fmul v6.2s, v2.2s, v1.2s
 166 ; CHECK-NEXT:    fneg v4.2s, v4.2s
 167 ; CHECK-NEXT:    fmla v6.2s, v0.2s, v3.2s
 168 ; CHECK-NEXT:    fmla v4.2s, v0.2s, v2.2s
 169 ; CHECK-NEXT:    str d4, [x4]
 170 ; CHECK-NEXT:    ldr q5, [x2]
 171 ; CHECK-NEXT:    ext v7.16b, v5.16b, v5.16b, #8
 172 ; CHECK-NEXT:    zip1 v0.2s, v5.2s, v7.2s
 173 ; CHECK-NEXT:    zip2 v1.2s, v5.2s, v7.2s
 174 ; CHECK-NEXT:    fmul v3.2s, v0.2s, v6.2s
 175 ; CHECK-NEXT:    fmul v6.2s, v1.2s, v6.2s
 176 ; CHECK-NEXT:    fmla v3.2s, v4.2s, v1.2s
 177 ; CHECK-NEXT:    fneg v2.2s, v6.2s
 178 ; CHECK-NEXT:    fmla v2.2s, v4.2s, v0.2s
 179 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
 180 ; CHECK-NEXT:    st2 { v2.2s, v3.2s }, [x5]
 181 ; CHECK-NEXT:    ldr q1, [x3]
 182 ; CHECK-NEXT:    fcmla v0.4s, v5.4s, v1.4s, #0
 183 ; CHECK-NEXT:    fcmla v0.4s, v5.4s, v1.4s, #90
 184 ; CHECK-NEXT:    ret
 185 entry:
 186   %a = load <4 x float>, ptr %ptr_a  ; only used through the two strided shuffles below
 187   %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>  ; a.re (even lanes)
 188   %strided.vec88 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>  ; a.im (odd lanes)
 189   %b = load <4 x float>, ptr %ptr_b  ; only used through the two strided shuffles below
 190   %strided.vec90 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>  ; b.re
 191   %strided.vec91 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>  ; b.im
 192   %0 = fmul fast <2 x float> %strided.vec91, %strided.vec  ; b.im * a.re
 193   %1 = fmul fast <2 x float> %strided.vec90, %strided.vec88  ; b.re * a.im
 194   %2 = fadd fast <2 x float> %0, %1  ; im(a * b)
 195   %3 = fmul fast <2 x float> %strided.vec90, %strided.vec  ; b.re * a.re
 196   %4 = fmul fast <2 x float> %strided.vec91, %strided.vec88  ; b.im * a.im
 197   %5 = fsub fast <2 x float> %3, %4  ; re(a * b)
 198   store <2 x float> %5, ptr %p1  ; *p1 = re(a * b): external use of one half of the product
 199   %c = load <4 x float>, ptr %ptr_c  ; the strided halves of c feed both (a * b) * c and d * c
 200   %strided.vec93 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>  ; c.re
 201   %strided.vec94 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>  ; c.im
 202   %6 = fmul fast <2 x float> %strided.vec94, %5  ; c.im * re(a * b)
 203   %7 = fmul fast <2 x float> %strided.vec93, %2  ; c.re * im(a * b)
 204   %8 = fadd fast <2 x float> %6, %7  ; im((a * b) * c)
 205   %9 = fmul fast <2 x float> %strided.vec93, %5  ; c.re * re(a * b)
 206   %10 = fmul fast <2 x float> %strided.vec94, %2  ; c.im * im(a * b)
 207   %11 = fsub fast <2 x float> %9, %10  ; re((a * b) * c)
 208   %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>  ; re-interleave (a * b) * c as {re0, im0, re1, im1}
 209   store <4 x float> %interleaved.vec, ptr %p2  ; *p2 = (a * b) * c
 210   %d = load <4 x float>, ptr %ptr_d
 211   %strided.vec96 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 0, i32 2>  ; d.re
 212   %strided.vec97 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 1, i32 3>  ; d.im
 213   %12 = fmul fast <2 x float> %strided.vec96, %strided.vec94  ; d.re * c.im
 214   %13 = fmul fast <2 x float> %strided.vec97, %strided.vec93  ; d.im * c.re
 215   %14 = fadd fast <2 x float> %13, %12  ; im(d * c)
 216   %15 = fmul fast <2 x float> %strided.vec96, %strided.vec93  ; d.re * c.re
 217   %16 = fmul fast <2 x float> %strided.vec97, %strided.vec94  ; d.im * c.im
 218   %17 = fsub fast <2 x float> %15, %16  ; re(d * c)
 219   %interleaved.vec98 = shufflevector <2 x float> %17, <2 x float> %14, <4 x i32> <i32 0, i32 2, i32 1, i32 3>  ; re-interleave d * c; this is the multiply selected as fcmla
 220   ret <4 x float> %interleaved.vec98
 221 }
 222
 223 ; Expected to not transform. Shows that external use prevents deinterleaving the whole chain.
 224 ;   *p1 = (a * b).real();
 225 ;   *p2 = (a * b) * (d * c);
 226 ;   return d * c;
 227 define <4 x float> @multiple_muls_mul_external(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, ptr %p1, ptr %p2) {
 228 ; CHECK-LABEL: multiple_muls_mul_external:
 229 ; CHECK:       // %bb.0: // %entry
 230 ; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #8
 231 ; CHECK-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
 232 ; CHECK-NEXT:    ext v16.16b, v2.16b, v2.16b, #8
 233 ; CHECK-NEXT:    ext v17.16b, v3.16b, v3.16b, #8
 234 ; CHECK-NEXT:    zip2 v6.2s, v0.2s, v4.2s
 235 ; CHECK-NEXT:    zip2 v7.2s, v1.2s, v5.2s
 236 ; CHECK-NEXT:    zip1 v19.2s, v2.2s, v16.2s
 237 ; CHECK-NEXT:    zip2 v2.2s, v2.2s, v16.2s
 238 ; CHECK-NEXT:    zip2 v16.2s, v3.2s, v17.2s
 239 ; CHECK-NEXT:    zip1 v0.2s, v0.2s, v4.2s
 240 ; CHECK-NEXT:    zip1 v1.2s, v1.2s, v5.2s
 241 ; CHECK-NEXT:    zip1 v3.2s, v3.2s, v17.2s
 242 ; CHECK-NEXT:    fmul v18.2s, v6.2s, v7.2s
 243 ; CHECK-NEXT:    fmul v5.2s, v19.2s, v16.2s
 244 ; CHECK-NEXT:    fmul v16.2s, v2.2s, v16.2s
 245 ; CHECK-NEXT:    fmul v7.2s, v0.2s, v7.2s
 246 ; CHECK-NEXT:    fneg v4.2s, v18.2s
 247 ; CHECK-NEXT:    fmla v5.2s, v3.2s, v2.2s
 248 ; CHECK-NEXT:    fneg v2.2s, v16.2s
 249 ; CHECK-NEXT:    fmla v7.2s, v1.2s, v6.2s
 250 ; CHECK-NEXT:    fmla v4.2s, v1.2s, v0.2s
 251 ; CHECK-NEXT:    fmla v2.2s, v3.2s, v19.2s
 252 ; CHECK-NEXT:    fmul v0.2s, v7.2s, v5.2s
 253 ; CHECK-NEXT:    fmul v17.2s, v4.2s, v5.2s
 254 ; CHECK-NEXT:    str d4, [x0]
 255 ; CHECK-NEXT:    fmla v17.2s, v2.2s, v7.2s
 256 ; CHECK-NEXT:    fneg v16.2s, v0.2s
 257 ; CHECK-NEXT:    zip1 v0.4s, v2.4s, v5.4s
 258 ; CHECK-NEXT:    fmla v16.2s, v2.2s, v4.2s
 259 ; CHECK-NEXT:    st2 { v16.2s, v17.2s }, [x1]
 260 ; CHECK-NEXT:    ret
 261 entry:
 262   %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>  ; a.re (even lanes)
 263   %strided.vec126 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>  ; a.im (odd lanes)
 264   %strided.vec128 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>  ; b.re
 265   %strided.vec129 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>  ; b.im
 266   %0 = fmul nnan ninf contract <2 x float> %strided.vec, %strided.vec129  ; a.re * b.im
 267   %1 = fmul nnan ninf contract <2 x float> %strided.vec126, %strided.vec128  ; a.im * b.re
 268   %2 = fadd nnan ninf contract <2 x float> %1, %0  ; im(a * b)
 269   %3 = fmul nnan ninf contract <2 x float> %strided.vec, %strided.vec128  ; a.re * b.re
 270   %4 = fmul nnan ninf contract <2 x float> %strided.vec126, %strided.vec129  ; a.im * b.im
 271   %5 = fsub nnan ninf contract <2 x float> %3, %4  ; re(a * b)
 272   store <2 x float> %5, ptr %p1  ; *p1 = re(a * b): external use of one half of the product
 273   %strided.vec131 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>  ; c.re
 274   %strided.vec132 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>  ; c.im
 275   %strided.vec134 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 0, i32 2>  ; d.re
 276   %strided.vec135 = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 1, i32 3>  ; d.im
 277   %6 = fmul nnan ninf contract <2 x float> %strided.vec131, %strided.vec135  ; c.re * d.im
 278   %7 = fmul nnan ninf contract <2 x float> %strided.vec132, %strided.vec134  ; c.im * d.re
 279   %8 = fadd nnan ninf contract <2 x float> %7, %6  ; im(d * c)
 280   %9 = fmul nnan ninf contract <2 x float> %strided.vec131, %strided.vec134  ; c.re * d.re
 281   %10 = fmul nnan ninf contract <2 x float> %strided.vec132, %strided.vec135  ; c.im * d.im
 282   %11 = fsub nnan ninf contract <2 x float> %9, %10  ; re(d * c)
 283   %12 = fmul nnan ninf contract <2 x float> %5, %8  ; re(a * b) * im(d * c)
 284   %13 = fmul nnan ninf contract <2 x float> %2, %11  ; im(a * b) * re(d * c)
 285   %14 = fadd nnan ninf contract <2 x float> %13, %12  ; im((a * b) * (d * c))
 286   %15 = fmul nnan ninf contract <2 x float> %5, %11  ; re(a * b) * re(d * c)
 287   %16 = fmul nnan ninf contract <2 x float> %2, %8  ; im(a * b) * im(d * c)
 288   %17 = fsub nnan ninf contract <2 x float> %15, %16  ; re((a * b) * (d * c))
 289   %interleaved.vec = shufflevector <2 x float> %17, <2 x float> %14, <4 x i32> <i32 0, i32 2, i32 1, i32 3>  ; re-interleave (a * b) * (d * c) as {re0, im0, re1, im1}
 290   store <4 x float> %interleaved.vec, ptr %p2  ; *p2 = (a * b) * (d * c)
 291   %interleaved.vec136 = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>  ; re-interleave d * c, whose halves also feed the multiply above
 292   ret <4 x float> %interleaved.vec136
 293 }
 294
 295 ; Expected to transform. Shows that the composite common subexpression (c * d + g * h) is not generated twice.
 296 ;  u[i] = a[i] * b[i] - (c[i] * d[i] + g[i] * h[i]);
 297 ;  v[i] = e[i] * f[i] + (c[i] * d[i] + g[i] * h[i]);
 298 define void @mul_add_common_mul_add_mul(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d, <4 x double> %e, <4 x double> %f, <4 x double> %g, <4 x double> %h, ptr %p1, ptr %p2) {
 299 ; CHECK-LABEL: mul_add_common_mul_add_mul:
 300 ; CHECK:       // %bb.0: // %entry
 301 ; CHECK-NEXT:    movi v16.2d, #0000000000000000
 302 ; CHECK-NEXT:    movi v17.2d, #0000000000000000
 303 ; CHECK-NEXT:    ldr q19, [sp, #112]
 304 ; CHECK-NEXT:    ldp q18, q20, [sp, #80]
 305 ; CHECK-NEXT:    ldr q21, [sp, #64]
 306 ; CHECK-NEXT:    movi v22.2d, #0000000000000000
 307 ; CHECK-NEXT:    fcmla v16.2d, v18.2d, v19.2d, #0
 308 ; CHECK-NEXT:    fcmla v17.2d, v21.2d, v20.2d, #0
 309 ; CHECK-NEXT:    fcmla v22.2d, v1.2d, v3.2d, #0
 310 ; CHECK-NEXT:    fcmla v16.2d, v18.2d, v19.2d, #90
 311 ; CHECK-NEXT:    movi v18.2d, #0000000000000000
 312 ; CHECK-NEXT:    fcmla v17.2d, v21.2d, v20.2d, #90
 313 ; CHECK-NEXT:    fcmla v22.2d, v1.2d, v3.2d, #90
 314 ; CHECK-NEXT:    fcmla v16.2d, v5.2d, v7.2d, #0
 315 ; CHECK-NEXT:    fcmla v18.2d, v0.2d, v2.2d, #0
 316 ; CHECK-NEXT:    fcmla v17.2d, v4.2d, v6.2d, #0
 317 ; CHECK-NEXT:    fcmla v16.2d, v5.2d, v7.2d, #90
 318 ; CHECK-NEXT:    fcmla v18.2d, v0.2d, v2.2d, #90
 319 ; CHECK-NEXT:    fcmla v17.2d, v4.2d, v6.2d, #90
 320 ; CHECK-NEXT:    ldp q3, q0, [sp, #32]
 321 ; CHECK-NEXT:    ldp q2, q1, [sp]
 322 ; CHECK-NEXT:    fsub v4.2d, v22.2d, v16.2d
 323 ; CHECK-NEXT:    fsub v5.2d, v18.2d, v17.2d
 324 ; CHECK-NEXT:    fcmla v16.2d, v0.2d, v1.2d, #0
 325 ; CHECK-NEXT:    fcmla v17.2d, v3.2d, v2.2d, #0
 326 ; CHECK-NEXT:    stp q5, q4, [x0]
 327 ; CHECK-NEXT:    fcmla v16.2d, v0.2d, v1.2d, #90
 328 ; CHECK-NEXT:    fcmla v17.2d, v3.2d, v2.2d, #90
 329 ; CHECK-NEXT:    stp q17, q16, [x1]
 330 ; CHECK-NEXT:    ret
 331 entry:
 332   %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>  ; a.re (even lanes)
 333   %strided.vec123 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>  ; a.im (odd lanes)
 334   %strided.vec125 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>  ; b.re
 335   %strided.vec126 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>  ; b.im
 336   %0 = fmul fast <2 x double> %strided.vec125, %strided.vec  ; b.re * a.re
 337   %1 = fmul fast <2 x double> %strided.vec126, %strided.vec  ; b.im * a.re
 338   %2 = fmul fast <2 x double> %strided.vec125, %strided.vec123  ; b.re * a.im
 339   %3 = fadd fast <2 x double> %1, %2  ; im(a * b)
 340   %strided.vec128 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 0, i32 2>  ; c.re
 341   %strided.vec129 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> <i32 1, i32 3>  ; c.im
 342   %strided.vec131 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 0, i32 2>  ; d.re
 343   %strided.vec132 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> <i32 1, i32 3>  ; d.im
 344   %4 = fmul fast <2 x double> %strided.vec131, %strided.vec128  ; d.re * c.re
 345   %5 = fmul fast <2 x double> %strided.vec132, %strided.vec129  ; d.im * c.im
 346   %6 = fmul fast <2 x double> %strided.vec132, %strided.vec128  ; d.im * c.re
 347   %7 = fmul fast <2 x double> %strided.vec131, %strided.vec129  ; d.re * c.im
 348   %8 = fsub fast <2 x double> %4, %5  ; re(c * d)
 349   %strided.vec134 = shufflevector <4 x double> %g, <4 x double> poison, <2 x i32> <i32 0, i32 2>  ; g.re
 350   %strided.vec135 = shufflevector <4 x double> %g, <4 x double> poison, <2 x i32> <i32 1, i32 3>  ; g.im
 351   %strided.vec137 = shufflevector <4 x double> %h, <4 x double> poison, <2 x i32> <i32 0, i32 2>  ; h.re
 352   %strided.vec138 = shufflevector <4 x double> %h, <4 x double> poison, <2 x i32> <i32 1, i32 3>  ; h.im
 353   %9 = fmul fast <2 x double> %strided.vec138, %strided.vec134  ; h.im * g.re
 354   %10 = fmul fast <2 x double> %strided.vec137, %strided.vec135  ; h.re * g.im
 355   %11 = fmul fast <2 x double> %strided.vec137, %strided.vec134  ; h.re * g.re
 356   %12 = fmul fast <2 x double> %strided.vec135, %strided.vec138  ; g.im * h.im
 357   %13 = fsub fast <2 x double> %11, %12  ; re(g * h)
 358   %14 = fadd fast <2 x double> %13, %8  ; re(c * d + g * h): real half of the common subexpression, used by both u and v
 359   %15 = fadd fast <2 x double> %6, %7  ; im(c * d)
 360   %16 = fadd fast <2 x double> %15, %9  ; im(c * d) + h.im * g.re
 361   %17 = fadd fast <2 x double> %16, %10  ; im(c * d + g * h): imaginary half of the common subexpression, used by both u and v
 362   %18 = fmul fast <2 x double> %strided.vec126, %strided.vec123  ; b.im * a.im
 363   %19 = fadd fast <2 x double> %18, %14  ; b.im * a.im + re(c * d + g * h)
 364   %20 = fsub fast <2 x double> %0, %19  ; re(u) = re(a * b) - re(c * d + g * h)
 365   %21 = fsub fast <2 x double> %3, %17  ; im(u) = im(a * b) - im(c * d + g * h)
 366   %interleaved.vec = shufflevector <2 x double> %20, <2 x double> %21, <4 x i32> <i32 0, i32 2, i32 1, i32 3>  ; re-interleave u as {re0, im0, re1, im1}
 367   store <4 x double> %interleaved.vec, ptr %p1, align 8  ; *p1 = u
 368   %strided.vec140 = shufflevector <4 x double> %e, <4 x double> poison, <2 x i32> <i32 0, i32 2>  ; e.re
 369   %strided.vec141 = shufflevector <4 x double> %e, <4 x double> poison, <2 x i32> <i32 1, i32 3>  ; e.im
 370   %strided.vec143 = shufflevector <4 x double> %f, <4 x double> poison, <2 x i32> <i32 0, i32 2>  ; f.re
 371   %strided.vec144 = shufflevector <4 x double> %f, <4 x double> poison, <2 x i32> <i32 1, i32 3>  ; f.im
 372   %22 = fmul fast <2 x double> %strided.vec143, %strided.vec140  ; f.re * e.re
 373   %23 = fmul fast <2 x double> %strided.vec144, %strided.vec140  ; f.im * e.re
 374   %24 = fmul fast <2 x double> %strided.vec143, %strided.vec141  ; f.re * e.im
 375   %25 = fadd fast <2 x double> %22, %14  ; f.re * e.re + re(c * d + g * h)
 376   %26 = fmul fast <2 x double> %strided.vec144, %strided.vec141  ; f.im * e.im
 377   %27 = fsub fast <2 x double> %25, %26  ; re(v) = re(e * f) + re(c * d + g * h)
 378   %28 = fadd fast <2 x double> %24, %17  ; f.re * e.im + im(c * d + g * h)
 379   %29 = fadd fast <2 x double> %28, %23  ; im(v) = im(e * f) + im(c * d + g * h)
 380   %interleaved.vec145 = shufflevector <2 x double> %27, <2 x double> %29, <4 x i32> <i32 0, i32 2, i32 1, i32 3>  ; re-interleave v as {re0, im0, re1, im1}
 381   store <4 x double> %interleaved.vec145, ptr %p2, align 8  ; *p2 = v
 382   ret void
 383 }