; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.
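;
; A minimal sketch of the pattern every test below instantiates (the operand
; order, the masking, and the expected vfmadd*/vfmsub*/vfnmadd*/vfnmsub*
; mnemonic vary per test; the clobber list runs over xmm3-xmm31, or xmm2-xmm31
; for the masked variants, which need an extra scratch register):
;
;   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},...,~{xmm31},~{flags}"()
;   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
;   ret <32 x half> %2
;
; With nearly every vector register clobbered across the asm block, one FMA
; operand cannot stay live in a register, so codegen must emit a 64-byte spill
; before the asm and is then expected to fold the reload into the FMA's memory
; operand instead of emitting a separate reload instruction.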

define <32 x half> @stack_fold_fmadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
  ret <32 x half> %2
}
declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>)

define <32 x half> @stack_fold_fmadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmadd123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmadd213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmadd231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmadd321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmadd132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmadd312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsub123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsub213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsub231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsub321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsub132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsub312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %a2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fnmadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %a2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fnmadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %a0)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fnmadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %a0)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fnmadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %a1)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fnmadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %a1)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fnmadd123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmadd123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a2)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmadd213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmadd213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a2)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmadd231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmadd231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a0)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmadd321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmadd321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a0)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmadd132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmadd132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a1)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmadd312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmadd312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a1)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a2)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a2)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a0)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a0)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a1)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a1)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = fneg <32 x half> %a2
  %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %3)
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = fneg <32 x half> %a2
  %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %3)
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = fneg <32 x half> %a0
  %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %3)
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = fneg <32 x half> %a0
  %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %3)
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = fneg <32 x half> %a1
  %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %3)
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = fneg <32 x half> %a1
  %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %3)
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmsub123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %neg1 = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmsub213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %neg1 = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmsub231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %neg1 = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmsub321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %neg1 = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
1169 define <32 x half> @stack_fold_fnmsub132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
1170 ; CHECK-LABEL: stack_fold_fnmsub132ph_mask:
1172 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1175 ; CHECK-NEXT: #NO_APP
1176 ; CHECK-NEXT: vmovaps (%rdi), %zmm2
1177 ; CHECK-NEXT: kmovd %esi, %k1
1178 ; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
1179 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1181 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1182 %a0 = load <32 x half>, ptr %p
1183 %neg = fneg <32 x half> %a1
1184 %neg1 = fneg <32 x half> %a0
1185 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg)
1186 %3 = bitcast i32 %mask to <32 x i1>
1187 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
1191 define <32 x half> @stack_fold_fnmsub312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
1192 ; CHECK-LABEL: stack_fold_fnmsub312ph_mask:
1194 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1197 ; CHECK-NEXT: #NO_APP
1198 ; CHECK-NEXT: vmovaps (%rdi), %zmm2
1199 ; CHECK-NEXT: kmovd %esi, %k1
1200 ; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
1201 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1203 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1204 %a0 = load <32 x half>, ptr %p
1205 %neg = fneg <32 x half> %a1
1206 %neg1 = fneg <32 x half> %a2
1207 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg)
1208 %3 = bitcast i32 %mask to <32 x i1>
1209 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
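; Zero-masked packed variants. The mask is loaded from memory and masked-off lanes are zeroed via {%k1} {z}.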
1213 define <32 x half> @stack_fold_fnmsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
1214 ; CHECK-LABEL: stack_fold_fnmsub123ph_maskz:
1216 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1219 ; CHECK-NEXT: #NO_APP
1220 ; CHECK-NEXT: kmovd (%rdi), %k1
1221 ; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
1223 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1224 %neg = fneg <32 x half> %a2
1225 %neg1 = fneg <32 x half> %a0
1226 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg)
1227 %3 = load i32, ptr %mask
1228 %4 = bitcast i32 %3 to <32 x i1>
1229 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
1233 define <32 x half> @stack_fold_fnmsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
1234 ; CHECK-LABEL: stack_fold_fnmsub213ph_maskz:
1236 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1239 ; CHECK-NEXT: #NO_APP
1240 ; CHECK-NEXT: kmovd (%rdi), %k1
1241 ; CHECK-NEXT: vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
1243 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1244 %neg = fneg <32 x half> %a2
1245 %neg1 = fneg <32 x half> %a1
1246 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg)
1247 %3 = load i32, ptr %mask
1248 %4 = bitcast i32 %3 to <32 x i1>
1249 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
1253 define <32 x half> @stack_fold_fnmsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
1254 ; CHECK-LABEL: stack_fold_fnmsub231ph_maskz:
1256 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1259 ; CHECK-NEXT: #NO_APP
1260 ; CHECK-NEXT: kmovd (%rdi), %k1
1261 ; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
1263 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1264 %neg = fneg <32 x half> %a0
1265 %neg1 = fneg <32 x half> %a1
1266 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg)
1267 %3 = load i32, ptr %mask
1268 %4 = bitcast i32 %3 to <32 x i1>
1269 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
1273 define <32 x half> @stack_fold_fnmsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
1274 ; CHECK-LABEL: stack_fold_fnmsub321ph_maskz:
1276 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1279 ; CHECK-NEXT: #NO_APP
1280 ; CHECK-NEXT: kmovd (%rdi), %k1
1281 ; CHECK-NEXT: vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
1283 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1284 %neg = fneg <32 x half> %a0
1285 %neg1 = fneg <32 x half> %a2
1286 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg)
1287 %3 = load i32, ptr %mask
1288 %4 = bitcast i32 %3 to <32 x i1>
1289 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
1293 define <32 x half> @stack_fold_fnmsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
1294 ; CHECK-LABEL: stack_fold_fnmsub132ph_maskz:
1296 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1299 ; CHECK-NEXT: #NO_APP
1300 ; CHECK-NEXT: kmovd (%rdi), %k1
1301 ; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
1303 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1304 %neg = fneg <32 x half> %a1
1305 %neg1 = fneg <32 x half> %a0
1306 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg)
1307 %3 = load i32, ptr %mask
1308 %4 = bitcast i32 %3 to <32 x i1>
1309 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
1313 define <32 x half> @stack_fold_fnmsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
1314 ; CHECK-LABEL: stack_fold_fnmsub312ph_maskz:
1316 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1319 ; CHECK-NEXT: #NO_APP
1320 ; CHECK-NEXT: kmovd (%rdi), %k1
1321 ; CHECK-NEXT: vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
1323 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1324 %neg = fneg <32 x half> %a1
1325 %neg1 = fneg <32 x half> %a2
1326 %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg)
1327 %3 = load i32, ptr %mask
1328 %4 = bitcast i32 %3 to <32 x i1>
1329 %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
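; Scalar (sh) variants of the same FMA patterns; the spilled operand shrinks to a 4-byte slot.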
1333 define half @stack_fold_fmadd123sh(half %a0, half %a1, half %a2) {
1334 ; CHECK-LABEL: stack_fold_fmadd123sh:
1336 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1339 ; CHECK-NEXT: #NO_APP
1340 ; CHECK-NEXT: vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1342 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1343 %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
1346 declare half @llvm.fma.f16(half, half, half)
1348 define half @stack_fold_fmadd213sh(half %a0, half %a1, half %a2) {
1349 ; CHECK-LABEL: stack_fold_fmadd213sh:
1351 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1354 ; CHECK-NEXT: #NO_APP
1355 ; CHECK-NEXT: vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1357 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1358 %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
1362 define half @stack_fold_fmadd231sh(half %a0, half %a1, half %a2) {
1363 ; CHECK-LABEL: stack_fold_fmadd231sh:
1365 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1368 ; CHECK-NEXT: #NO_APP
1369 ; CHECK-NEXT: vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1371 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1372 %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
1376 define half @stack_fold_fmadd321sh(half %a0, half %a1, half %a2) {
1377 ; CHECK-LABEL: stack_fold_fmadd321sh:
1379 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1382 ; CHECK-NEXT: #NO_APP
1383 ; CHECK-NEXT: vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1385 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1386 %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
1390 define half @stack_fold_fmadd132sh(half %a0, half %a1, half %a2) {
1391 ; CHECK-LABEL: stack_fold_fmadd132sh:
1393 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1396 ; CHECK-NEXT: #NO_APP
1397 ; CHECK-NEXT: vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1399 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1400 %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
1404 define half @stack_fold_fmadd312sh(half %a0, half %a1, half %a2) {
1405 ; CHECK-LABEL: stack_fold_fmadd312sh:
1407 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1410 ; CHECK-NEXT: #NO_APP
1411 ; CHECK-NEXT: vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1413 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1414 %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
1418 define half @stack_fold_fmsub123sh(half %a0, half %a1, half %a2) {
1419 ; CHECK-LABEL: stack_fold_fmsub123sh:
1421 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1424 ; CHECK-NEXT: #NO_APP
1425 ; CHECK-NEXT: vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1427 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1428 %2 = fneg half %a2
1429 %3 = call half @llvm.fma.f16(half %a0, half %a1, half %2)
1433 define half @stack_fold_fmsub213sh(half %a0, half %a1, half %a2) {
1434 ; CHECK-LABEL: stack_fold_fmsub213sh:
1436 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1439 ; CHECK-NEXT: #NO_APP
1440 ; CHECK-NEXT: vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1442 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1443 %2 = fneg half %a2
1444 %3 = call half @llvm.fma.f16(half %a1, half %a0, half %2)
1448 define half @stack_fold_fmsub231sh(half %a0, half %a1, half %a2) {
1449 ; CHECK-LABEL: stack_fold_fmsub231sh:
1451 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1454 ; CHECK-NEXT: #NO_APP
1455 ; CHECK-NEXT: vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1457 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1458 %2 = fneg half %a0
1459 %3 = call half @llvm.fma.f16(half %a1, half %a2, half %2)
1463 define half @stack_fold_fmsub321sh(half %a0, half %a1, half %a2) {
1464 ; CHECK-LABEL: stack_fold_fmsub321sh:
1466 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1469 ; CHECK-NEXT: #NO_APP
1470 ; CHECK-NEXT: vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1472 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1473 %2 = fneg half %a0
1474 %3 = call half @llvm.fma.f16(half %a2, half %a1, half %2)
1478 define half @stack_fold_fmsub132sh(half %a0, half %a1, half %a2) {
1479 ; CHECK-LABEL: stack_fold_fmsub132sh:
1481 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1484 ; CHECK-NEXT: #NO_APP
1485 ; CHECK-NEXT: vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1487 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1488 %2 = fneg half %a1
1489 %3 = call half @llvm.fma.f16(half %a0, half %a2, half %2)
1493 define half @stack_fold_fmsub312sh(half %a0, half %a1, half %a2) {
1494 ; CHECK-LABEL: stack_fold_fmsub312sh:
1496 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1499 ; CHECK-NEXT: #NO_APP
1500 ; CHECK-NEXT: vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1502 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1503 %2 = fneg half %a1
1504 %3 = call half @llvm.fma.f16(half %a2, half %a0, half %2)
1508 define half @stack_fold_fnmadd123sh(half %a0, half %a1, half %a2) {
1509 ; CHECK-LABEL: stack_fold_fnmadd123sh:
1511 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1514 ; CHECK-NEXT: #NO_APP
1515 ; CHECK-NEXT: vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1517 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1518 %2 = fneg half %a0
1519 %3 = call half @llvm.fma.f16(half %2, half %a1, half %a2)
1523 define half @stack_fold_fnmadd213sh(half %a0, half %a1, half %a2) {
1524 ; CHECK-LABEL: stack_fold_fnmadd213sh:
1526 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1529 ; CHECK-NEXT: #NO_APP
1530 ; CHECK-NEXT: vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1532 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1533 %2 = fneg half %a1
1534 %3 = call half @llvm.fma.f16(half %2, half %a0, half %a2)
1538 define half @stack_fold_fnmadd231sh(half %a0, half %a1, half %a2) {
1539 ; CHECK-LABEL: stack_fold_fnmadd231sh:
1541 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1544 ; CHECK-NEXT: #NO_APP
1545 ; CHECK-NEXT: vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1547 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1548 %2 = fneg half %a1
1549 %3 = call half @llvm.fma.f16(half %2, half %a2, half %a0)
1553 define half @stack_fold_fnmadd321sh(half %a0, half %a1, half %a2) {
1554 ; CHECK-LABEL: stack_fold_fnmadd321sh:
1556 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1559 ; CHECK-NEXT: #NO_APP
1560 ; CHECK-NEXT: vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1562 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1563 %2 = fneg half %a2
1564 %3 = call half @llvm.fma.f16(half %2, half %a1, half %a0)
1568 define half @stack_fold_fnmadd132sh(half %a0, half %a1, half %a2) {
1569 ; CHECK-LABEL: stack_fold_fnmadd132sh:
1571 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1574 ; CHECK-NEXT: #NO_APP
1575 ; CHECK-NEXT: vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1577 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1578 %2 = fneg half %a0
1579 %3 = call half @llvm.fma.f16(half %2, half %a2, half %a1)
1583 define half @stack_fold_fnmadd312sh(half %a0, half %a1, half %a2) {
1584 ; CHECK-LABEL: stack_fold_fnmadd312sh:
1586 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1589 ; CHECK-NEXT: #NO_APP
1590 ; CHECK-NEXT: vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1592 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1593 %2 = fneg half %a2
1594 %3 = call half @llvm.fma.f16(half %2, half %a0, half %a1)
1598 define half @stack_fold_fnmsub123sh(half %a0, half %a1, half %a2) {
1599 ; CHECK-LABEL: stack_fold_fnmsub123sh:
1601 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1604 ; CHECK-NEXT: #NO_APP
1605 ; CHECK-NEXT: vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1607 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1608 %2 = fneg half %a0
1609 %3 = fneg half %a2
1610 %4 = call half @llvm.fma.f16(half %2, half %a1, half %3)
1614 define half @stack_fold_fnmsub213sh(half %a0, half %a1, half %a2) {
1615 ; CHECK-LABEL: stack_fold_fnmsub213sh:
1617 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1620 ; CHECK-NEXT: #NO_APP
1621 ; CHECK-NEXT: vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1623 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1624 %2 = fneg half %a1
1625 %3 = fneg half %a2
1626 %4 = call half @llvm.fma.f16(half %2, half %a0, half %3)
1630 define half @stack_fold_fnmsub231sh(half %a0, half %a1, half %a2) {
1631 ; CHECK-LABEL: stack_fold_fnmsub231sh:
1633 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1636 ; CHECK-NEXT: #NO_APP
1637 ; CHECK-NEXT: vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1639 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1640 %2 = fneg half %a1
1641 %3 = fneg half %a0
1642 %4 = call half @llvm.fma.f16(half %2, half %a2, half %3)
1646 define half @stack_fold_fnmsub321sh(half %a0, half %a1, half %a2) {
1647 ; CHECK-LABEL: stack_fold_fnmsub321sh:
1649 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1652 ; CHECK-NEXT: #NO_APP
1653 ; CHECK-NEXT: vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1655 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1656 %2 = fneg half %a2
1657 %3 = fneg half %a0
1658 %4 = call half @llvm.fma.f16(half %2, half %a1, half %3)
1662 define half @stack_fold_fnmsub132sh(half %a0, half %a1, half %a2) {
1663 ; CHECK-LABEL: stack_fold_fnmsub132sh:
1665 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1668 ; CHECK-NEXT: #NO_APP
1669 ; CHECK-NEXT: vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1671 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1672 %2 = fneg half %a0
1673 %3 = fneg half %a1
1674 %4 = call half @llvm.fma.f16(half %2, half %a2, half %3)
1678 define half @stack_fold_fnmsub312sh(half %a0, half %a1, half %a2) {
1679 ; CHECK-LABEL: stack_fold_fnmsub312sh:
1681 ; CHECK-NEXT: vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1684 ; CHECK-NEXT: #NO_APP
1685 ; CHECK-NEXT: vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1687 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1688 %2 = fneg half %a2
1689 %3 = fneg half %a1
1690 %4 = call half @llvm.fma.f16(half %2, half %a0, half %3)
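; Intrinsic-style scalar variants: element 0 is extracted from the <8 x half> arguments, combined, and reinserted, so the fold reloads from a 16-byte XMM spill slot.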
1694 define <8 x half> @stack_fold_fmadd123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
1695 ; CHECK-LABEL: stack_fold_fmadd123sh_int:
1697 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1700 ; CHECK-NEXT: #NO_APP
1701 ; CHECK-NEXT: vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1703 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1704 %a0 = extractelement <8 x half> %a0v, i64 0
1705 %a1 = extractelement <8 x half> %a1v, i64 0
1706 %a2 = extractelement <8 x half> %a2v, i64 0
1707 %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
1708 %res = insertelement <8 x half> %a0v, half %2, i64 0
1712 define <8 x half> @stack_fold_fmadd213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
1713 ; CHECK-LABEL: stack_fold_fmadd213sh_int:
1715 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1718 ; CHECK-NEXT: #NO_APP
1719 ; CHECK-NEXT: vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1721 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1722 %a0 = extractelement <8 x half> %a0v, i64 0
1723 %a1 = extractelement <8 x half> %a1v, i64 0
1724 %a2 = extractelement <8 x half> %a2v, i64 0
1725 %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
1726 %res = insertelement <8 x half> %a0v, half %2, i64 0
1730 define <8 x half> @stack_fold_fmadd231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
1731 ; CHECK-LABEL: stack_fold_fmadd231sh_int:
1733 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1736 ; CHECK-NEXT: #NO_APP
1737 ; CHECK-NEXT: vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1739 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1740 %a0 = extractelement <8 x half> %a0v, i64 0
1741 %a1 = extractelement <8 x half> %a1v, i64 0
1742 %a2 = extractelement <8 x half> %a2v, i64 0
1743 %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
1744 %res = insertelement <8 x half> %a0v, half %2, i64 0
1748 define <8 x half> @stack_fold_fmadd321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
1749 ; CHECK-LABEL: stack_fold_fmadd321sh_int:
1751 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1754 ; CHECK-NEXT: #NO_APP
1755 ; CHECK-NEXT: vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1757 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1758 %a0 = extractelement <8 x half> %a0v, i64 0
1759 %a1 = extractelement <8 x half> %a1v, i64 0
1760 %a2 = extractelement <8 x half> %a2v, i64 0
1761 %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
1762 %res = insertelement <8 x half> %a0v, half %2, i64 0
1766 define <8 x half> @stack_fold_fmadd132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
1767 ; CHECK-LABEL: stack_fold_fmadd132sh_int:
1769 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1772 ; CHECK-NEXT: #NO_APP
1773 ; CHECK-NEXT: vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1775 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1776 %a0 = extractelement <8 x half> %a0v, i64 0
1777 %a1 = extractelement <8 x half> %a1v, i64 0
1778 %a2 = extractelement <8 x half> %a2v, i64 0
1779 %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
1780 %res = insertelement <8 x half> %a0v, half %2, i64 0
1784 define <8 x half> @stack_fold_fmadd312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
1785 ; CHECK-LABEL: stack_fold_fmadd312sh_int:
1787 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1790 ; CHECK-NEXT: #NO_APP
1791 ; CHECK-NEXT: vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1793 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1794 %a0 = extractelement <8 x half> %a0v, i64 0
1795 %a1 = extractelement <8 x half> %a1v, i64 0
1796 %a2 = extractelement <8 x half> %a2v, i64 0
1797 %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
1798 %res = insertelement <8 x half> %a0v, half %2, i64 0
1802 define <8 x half> @stack_fold_fmsub123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
1803 ; CHECK-LABEL: stack_fold_fmsub123sh_int:
1805 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1808 ; CHECK-NEXT: #NO_APP
1809 ; CHECK-NEXT: vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1811 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1812 %a0 = extractelement <8 x half> %a0v, i64 0
1813 %a1 = extractelement <8 x half> %a1v, i64 0
1814 %a2 = extractelement <8 x half> %a2v, i64 0
1815 %neg = fneg half %a2
1816 %2 = call half @llvm.fma.f16(half %a0, half %a1, half %neg)
1817 %res = insertelement <8 x half> %a0v, half %2, i64 0
1821 define <8 x half> @stack_fold_fmsub213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
1822 ; CHECK-LABEL: stack_fold_fmsub213sh_int:
1824 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1827 ; CHECK-NEXT: #NO_APP
1828 ; CHECK-NEXT: vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1830 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1831 %a0 = extractelement <8 x half> %a0v, i64 0
1832 %a1 = extractelement <8 x half> %a1v, i64 0
1833 %a2 = extractelement <8 x half> %a2v, i64 0
1834 %neg = fneg half %a2
1835 %2 = call half @llvm.fma.f16(half %a1, half %a0, half %neg)
1836 %res = insertelement <8 x half> %a0v, half %2, i64 0
1840 define <8 x half> @stack_fold_fmsub231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
1841 ; CHECK-LABEL: stack_fold_fmsub231sh_int:
1843 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1846 ; CHECK-NEXT: #NO_APP
1847 ; CHECK-NEXT: vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1849 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1850 %a0 = extractelement <8 x half> %a0v, i64 0
1851 %a1 = extractelement <8 x half> %a1v, i64 0
1852 %a2 = extractelement <8 x half> %a2v, i64 0
1853 %neg = fneg half %a0
1854 %2 = call half @llvm.fma.f16(half %a1, half %a2, half %neg)
1855 %res = insertelement <8 x half> %a0v, half %2, i64 0
1859 define <8 x half> @stack_fold_fmsub321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
1860 ; CHECK-LABEL: stack_fold_fmsub321sh_int:
1862 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1865 ; CHECK-NEXT: #NO_APP
1866 ; CHECK-NEXT: vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1868 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1869 %a0 = extractelement <8 x half> %a0v, i64 0
1870 %a1 = extractelement <8 x half> %a1v, i64 0
1871 %a2 = extractelement <8 x half> %a2v, i64 0
1872 %neg = fneg half %a0
1873 %2 = call half @llvm.fma.f16(half %a2, half %a1, half %neg)
1874 %res = insertelement <8 x half> %a0v, half %2, i64 0
1878 define <8 x half> @stack_fold_fmsub132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
1879 ; CHECK-LABEL: stack_fold_fmsub132sh_int:
1881 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1884 ; CHECK-NEXT: #NO_APP
1885 ; CHECK-NEXT: vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1887 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1888 %a0 = extractelement <8 x half> %a0v, i64 0
1889 %a1 = extractelement <8 x half> %a1v, i64 0
1890 %a2 = extractelement <8 x half> %a2v, i64 0
1891 %neg = fneg half %a1
1892 %2 = call half @llvm.fma.f16(half %a0, half %a2, half %neg)
1893 %res = insertelement <8 x half> %a0v, half %2, i64 0
1897 define <8 x half> @stack_fold_fmsub312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
1898 ; CHECK-LABEL: stack_fold_fmsub312sh_int:
1900 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1903 ; CHECK-NEXT: #NO_APP
1904 ; CHECK-NEXT: vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1906 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1907 %a0 = extractelement <8 x half> %a0v, i64 0
1908 %a1 = extractelement <8 x half> %a1v, i64 0
1909 %a2 = extractelement <8 x half> %a2v, i64 0
1910 %neg = fneg half %a1
1911 %2 = call half @llvm.fma.f16(half %a2, half %a0, half %neg)
1912 %res = insertelement <8 x half> %a0v, half %2, i64 0
1916 define <8 x half> @stack_fold_fnmadd123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
1917 ; CHECK-LABEL: stack_fold_fnmadd123sh_int:
1919 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1922 ; CHECK-NEXT: #NO_APP
1923 ; CHECK-NEXT: vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1925 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1926 %a0 = extractelement <8 x half> %a0v, i64 0
1927 %a1 = extractelement <8 x half> %a1v, i64 0
1928 %a2 = extractelement <8 x half> %a2v, i64 0
1929 %neg1 = fneg half %a0
1930 %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a2)
1931 %res = insertelement <8 x half> %a0v, half %2, i64 0
1935 define <8 x half> @stack_fold_fnmadd213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
1936 ; CHECK-LABEL: stack_fold_fnmadd213sh_int:
1938 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1941 ; CHECK-NEXT: #NO_APP
1942 ; CHECK-NEXT: vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1944 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1945 %a0 = extractelement <8 x half> %a0v, i64 0
1946 %a1 = extractelement <8 x half> %a1v, i64 0
1947 %a2 = extractelement <8 x half> %a2v, i64 0
1948 %neg1 = fneg half %a1
1949 %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a2)
1950 %res = insertelement <8 x half> %a0v, half %2, i64 0
1954 define <8 x half> @stack_fold_fnmadd231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
1955 ; CHECK-LABEL: stack_fold_fnmadd231sh_int:
1957 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1960 ; CHECK-NEXT: #NO_APP
1961 ; CHECK-NEXT: vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1963 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1964 %a0 = extractelement <8 x half> %a0v, i64 0
1965 %a1 = extractelement <8 x half> %a1v, i64 0
1966 %a2 = extractelement <8 x half> %a2v, i64 0
1967 %neg1 = fneg half %a1
1968 %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a0)
1969 %res = insertelement <8 x half> %a0v, half %2, i64 0
1973 define <8 x half> @stack_fold_fnmadd321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
1974 ; CHECK-LABEL: stack_fold_fnmadd321sh_int:
1976 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1979 ; CHECK-NEXT: #NO_APP
1980 ; CHECK-NEXT: vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
1982 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1983 %a0 = extractelement <8 x half> %a0v, i64 0
1984 %a1 = extractelement <8 x half> %a1v, i64 0
1985 %a2 = extractelement <8 x half> %a2v, i64 0
1986 %neg1 = fneg half %a2
1987 %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a0)
1988 %res = insertelement <8 x half> %a0v, half %2, i64 0
1992 define <8 x half> @stack_fold_fnmadd132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
1993 ; CHECK-LABEL: stack_fold_fnmadd132sh_int:
1995 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1998 ; CHECK-NEXT: #NO_APP
1999 ; CHECK-NEXT: vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
2001 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2002 %a0 = extractelement <8 x half> %a0v, i64 0
2003 %a1 = extractelement <8 x half> %a1v, i64 0
2004 %a2 = extractelement <8 x half> %a2v, i64 0
2005 %neg1 = fneg half %a0
2006 %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a1)
2007 %res = insertelement <8 x half> %a0v, half %2, i64 0
2011 define <8 x half> @stack_fold_fnmadd312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
2012 ; CHECK-LABEL: stack_fold_fnmadd312sh_int:
2014 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2017 ; CHECK-NEXT: #NO_APP
2018 ; CHECK-NEXT: vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
2020 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2021 %a0 = extractelement <8 x half> %a0v, i64 0
2022 %a1 = extractelement <8 x half> %a1v, i64 0
2023 %a2 = extractelement <8 x half> %a2v, i64 0
2024 %neg1 = fneg half %a2
2025 %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a1)
2026 %res = insertelement <8 x half> %a0v, half %2, i64 0
2030 define <8 x half> @stack_fold_fnmsub123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
2031 ; CHECK-LABEL: stack_fold_fnmsub123sh_int:
2033 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2036 ; CHECK-NEXT: #NO_APP
2037 ; CHECK-NEXT: vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
2039 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2040 %a0 = extractelement <8 x half> %a0v, i64 0
2041 %a1 = extractelement <8 x half> %a1v, i64 0
2042 %a2 = extractelement <8 x half> %a2v, i64 0
2043 %neg = fneg half %a2
2044 %neg1 = fneg half %a0
2045 %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
2046 %res = insertelement <8 x half> %a0v, half %2, i64 0
2050 define <8 x half> @stack_fold_fnmsub213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
2051 ; CHECK-LABEL: stack_fold_fnmsub213sh_int:
2053 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2056 ; CHECK-NEXT: #NO_APP
2057 ; CHECK-NEXT: vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
2059 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2060 %a0 = extractelement <8 x half> %a0v, i64 0
2061 %a1 = extractelement <8 x half> %a1v, i64 0
2062 %a2 = extractelement <8 x half> %a2v, i64 0
2063 %neg = fneg half %a2
2064 %neg1 = fneg half %a1
2065 %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
2066 %res = insertelement <8 x half> %a0v, half %2, i64 0
2070 define <8 x half> @stack_fold_fnmsub231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
2071 ; CHECK-LABEL: stack_fold_fnmsub231sh_int:
2073 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2076 ; CHECK-NEXT: #NO_APP
2077 ; CHECK-NEXT: vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
2079 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2080 %a0 = extractelement <8 x half> %a0v, i64 0
2081 %a1 = extractelement <8 x half> %a1v, i64 0
2082 %a2 = extractelement <8 x half> %a2v, i64 0
2083 %neg = fneg half %a0
2084 %neg1 = fneg half %a1
2085 %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
2086 %res = insertelement <8 x half> %a0v, half %2, i64 0
2090 define <8 x half> @stack_fold_fnmsub321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
2091 ; CHECK-LABEL: stack_fold_fnmsub321sh_int:
2093 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2096 ; CHECK-NEXT: #NO_APP
2097 ; CHECK-NEXT: vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
2099 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2100 %a0 = extractelement <8 x half> %a0v, i64 0
2101 %a1 = extractelement <8 x half> %a1v, i64 0
2102 %a2 = extractelement <8 x half> %a2v, i64 0
2103 %neg = fneg half %a0
2104 %neg1 = fneg half %a2
2105 %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
2106 %res = insertelement <8 x half> %a0v, half %2, i64 0
2110 define <8 x half> @stack_fold_fnmsub132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
2111 ; CHECK-LABEL: stack_fold_fnmsub132sh_int:
2113 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2116 ; CHECK-NEXT: #NO_APP
2117 ; CHECK-NEXT: vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
2119 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2120 %a0 = extractelement <8 x half> %a0v, i64 0
2121 %a1 = extractelement <8 x half> %a1v, i64 0
2122 %a2 = extractelement <8 x half> %a2v, i64 0
2123 %neg = fneg half %a1
2124 %neg1 = fneg half %a0
2125 %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
2126 %res = insertelement <8 x half> %a0v, half %2, i64 0
2130 define <8 x half> @stack_fold_fnmsub312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
2131 ; CHECK-LABEL: stack_fold_fnmsub312sh_int:
2133 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2136 ; CHECK-NEXT: #NO_APP
2137 ; CHECK-NEXT: vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
2139 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2140 %a0 = extractelement <8 x half> %a0v, i64 0
2141 %a1 = extractelement <8 x half> %a1v, i64 0
2142 %a2 = extractelement <8 x half> %a2v, i64 0
2143 %neg = fneg half %a1
2144 %neg1 = fneg half %a2
2145 %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
2146 %res = insertelement <8 x half> %a0v, half %2, i64 0
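; Merge-masked intrinsic-style scalar variants: an i8 mask is loaded with kmovb and the select keeps element 0 of the passthrough when the mask bit is clear.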
2150 define <8 x half> @stack_fold_fmadd123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
2151 ; CHECK-LABEL: stack_fold_fmadd123sh_intk:
2153 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2156 ; CHECK-NEXT: #NO_APP
2157 ; CHECK-NEXT: kmovb (%rdi), %k1
2158 ; CHECK-NEXT: vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
2160 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2161 %a0 = extractelement <8 x half> %a0v, i64 0
2162 %a1 = extractelement <8 x half> %a1v, i64 0
2163 %a2 = extractelement <8 x half> %a2v, i64 0
2164 %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
2165 %3 = load i8, ptr %mask
2166 %4 = bitcast i8 %3 to <8 x i1>
2167 %5 = extractelement <8 x i1> %4, i64 0
2168 %6 = select i1 %5, half %2, half %a0
2169 %res = insertelement <8 x half> %a0v, half %6, i64 0
2173 define <8 x half> @stack_fold_fmadd213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
2174 ; CHECK-LABEL: stack_fold_fmadd213sh_intk:
2176 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2179 ; CHECK-NEXT: #NO_APP
2180 ; CHECK-NEXT: kmovb (%rdi), %k1
2181 ; CHECK-NEXT: vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
2183 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2184 %a0 = extractelement <8 x half> %a0v, i64 0
2185 %a1 = extractelement <8 x half> %a1v, i64 0
2186 %a2 = extractelement <8 x half> %a2v, i64 0
2187 %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
2188 %3 = load i8, ptr %mask
2189 %4 = bitcast i8 %3 to <8 x i1>
2190 %5 = extractelement <8 x i1> %4, i64 0
2191 %6 = select i1 %5, half %2, half %a0
2192 %res = insertelement <8 x half> %a0v, half %6, i64 0
2196 define <8 x half> @stack_fold_fmadd231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
2197 ; CHECK-LABEL: stack_fold_fmadd231sh_intk:
2199 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2202 ; CHECK-NEXT: #NO_APP
2203 ; CHECK-NEXT: kmovb (%rdi), %k1
2204 ; CHECK-NEXT: vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
2206 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2207 %a0 = extractelement <8 x half> %a0v, i64 0
2208 %a1 = extractelement <8 x half> %a1v, i64 0
2209 %a2 = extractelement <8 x half> %a2v, i64 0
2210 %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
2211 %3 = load i8, ptr %mask
2212 %4 = bitcast i8 %3 to <8 x i1>
2213 %5 = extractelement <8 x i1> %4, i64 0
2214 %6 = select i1 %5, half %2, half %a0
2215 %res = insertelement <8 x half> %a0v, half %6, i64 0
2219 define <8 x half> @stack_fold_fmadd321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
2220 ; CHECK-LABEL: stack_fold_fmadd321sh_intk:
2222 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2225 ; CHECK-NEXT: #NO_APP
2226 ; CHECK-NEXT: kmovb (%rdi), %k1
2227 ; CHECK-NEXT: vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
2229 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2230 %a0 = extractelement <8 x half> %a0v, i64 0
2231 %a1 = extractelement <8 x half> %a1v, i64 0
2232 %a2 = extractelement <8 x half> %a2v, i64 0
2233 %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
2234 %3 = load i8, ptr %mask
2235 %4 = bitcast i8 %3 to <8 x i1>
2236 %5 = extractelement <8 x i1> %4, i64 0
2237 %6 = select i1 %5, half %2, half %a0
2238 %res = insertelement <8 x half> %a0v, half %6, i64 0
define <8 x half> @stack_fold_fmadd132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd132sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd312sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub123sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a2
%2 = call half @llvm.fma.f16(half %a0, half %a1, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub213sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a2
%2 = call half @llvm.fma.f16(half %a1, half %a0, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub231sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a0
%2 = call half @llvm.fma.f16(half %a1, half %a2, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub321sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a0
%2 = call half @llvm.fma.f16(half %a2, half %a1, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub132sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a1
%2 = call half @llvm.fma.f16(half %a0, half %a2, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub312sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a1
%2 = call half @llvm.fma.f16(half %a2, half %a0, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd123sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg1 = fneg half %a0
%2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a2)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd213sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg1 = fneg half %a1
%2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a2)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd231sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg1 = fneg half %a1
%2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a0)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd321sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg1 = fneg half %a2
%2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a0)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd132sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg1 = fneg half %a0
%2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a1)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd312sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg1 = fneg half %a2
%2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a1)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub123sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a2
%neg1 = fneg half %a0
%2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub213sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a2
%neg1 = fneg half %a1
%2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub231sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a0
%neg1 = fneg half %a1
%2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub321sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a0
%neg1 = fneg half %a2
%2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub132sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a1
%neg1 = fneg half %a0
%2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub312sh_intk:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a1
%neg1 = fneg half %a2
%2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half %a0
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

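; Zero-masking (maskz) variants of the scalar FMA tests above: the mask bit now
; selects between the FMA result and zero instead of the passthrough element.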
define <8 x half> @stack_fold_fmadd123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd123sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd213sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd231sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd321sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd132sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd312sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub123sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a2
%2 = call half @llvm.fma.f16(half %a0, half %a1, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub213sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a2
%2 = call half @llvm.fma.f16(half %a1, half %a0, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub231sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a0
%2 = call half @llvm.fma.f16(half %a1, half %a2, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub321sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a0
%2 = call half @llvm.fma.f16(half %a2, half %a1, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub132sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a1
%2 = call half @llvm.fma.f16(half %a0, half %a2, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub312sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a1
%2 = call half @llvm.fma.f16(half %a2, half %a0, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd123sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg1 = fneg half %a0
%2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a2)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd213sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg1 = fneg half %a1
%2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a2)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd231sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg1 = fneg half %a1
%2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a0)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd321sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg1 = fneg half %a2
%2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a0)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd132sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg1 = fneg half %a0
%2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a1)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd312sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg1 = fneg half %a2
%2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a1)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub123sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a2
%neg1 = fneg half %a0
%2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub213sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a2
%neg1 = fneg half %a1
%2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub231sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a0
%neg1 = fneg half %a1
%2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub321sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a0
%neg1 = fneg half %a2
%2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub132sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a1
%neg1 = fneg half %a0
%2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub312sh_intkz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovb (%rdi), %k1
; CHECK-NEXT: vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = extractelement <8 x half> %a0v, i64 0
%a1 = extractelement <8 x half> %a1v, i64 0
%a2 = extractelement <8 x half> %a2v, i64 0
%neg = fneg half %a1
%neg1 = fneg half %a2
%2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
%3 = load i8, ptr %mask
%4 = bitcast i8 %3 to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, half %2, half zeroinitializer
%res = insertelement <8 x half> %a0v, half %6, i64 0
ret <8 x half> %res
}

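; Packed FMADDSUB tests, exercising the @llvm.x86.avx512fp16.vfmaddsub.ph.512
; intrinsic with all six operand orderings.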
define <32 x half> @stack_fold_fmaddsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmaddsub123ph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4)
ret <32 x half> %2
}
declare <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half>, <32 x half>, <32 x half>, i32)

define <32 x half> @stack_fold_fmaddsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmaddsub213ph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4)
ret <32 x half> %2
}

define <32 x half> @stack_fold_fmaddsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmaddsub231ph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4)
ret <32 x half> %2
}

define <32 x half> @stack_fold_fmaddsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmaddsub321ph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4)
ret <32 x half> %2
}

define <32 x half> @stack_fold_fmaddsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmaddsub132ph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4)
ret <32 x half> %2
}

define <32 x half> @stack_fold_fmaddsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmaddsub312ph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4)
ret <32 x half> %2
}

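; Merge-masked FMADDSUB tests: %a0 is loaded from a pointer and also serves as
; the masked passthrough value.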
define <32 x half> @stack_fold_fmaddsub123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmaddsub123ph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = load <32 x half>, ptr %p
%2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4)
%3 = bitcast i32 %mask to <32 x i1>
%4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
ret <32 x half> %4
}

define <32 x half> @stack_fold_fmaddsub213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmaddsub213ph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = load <32 x half>, ptr %p
%2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4)
%3 = bitcast i32 %mask to <32 x i1>
%4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
ret <32 x half> %4
}

3427 define <32 x half> @stack_fold_fmaddsub231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
3428 ; CHECK-LABEL: stack_fold_fmaddsub231ph_mask:
3430 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3433 ; CHECK-NEXT: #NO_APP
3434 ; CHECK-NEXT: vmovaps (%rdi), %zmm2
3435 ; CHECK-NEXT: kmovd %esi, %k1
3436 ; CHECK-NEXT: vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3437 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
3439 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3440 %a0 = load <32 x half>, ptr %p
3441 %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4)
3442 %3 = bitcast i32 %mask to <32 x i1>
3443 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
3447 define <32 x half> @stack_fold_fmaddsub321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
3448 ; CHECK-LABEL: stack_fold_fmaddsub321ph_mask:
3450 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3453 ; CHECK-NEXT: #NO_APP
3454 ; CHECK-NEXT: vmovaps (%rdi), %zmm2
3455 ; CHECK-NEXT: kmovd %esi, %k1
3456 ; CHECK-NEXT: vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3457 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
3459 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3460 %a0 = load <32 x half>, ptr %p
3461 %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4)
3462 %3 = bitcast i32 %mask to <32 x i1>
3463 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
3467 define <32 x half> @stack_fold_fmaddsub132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
3468 ; CHECK-LABEL: stack_fold_fmaddsub132ph_mask:
3470 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3473 ; CHECK-NEXT: #NO_APP
3474 ; CHECK-NEXT: vmovaps (%rdi), %zmm2
3475 ; CHECK-NEXT: kmovd %esi, %k1
3476 ; CHECK-NEXT: vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3477 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
3479 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3480 %a0 = load <32 x half>, ptr %p
3481 %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4)
3482 %3 = bitcast i32 %mask to <32 x i1>
3483 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
3487 define <32 x half> @stack_fold_fmaddsub312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
3488 ; CHECK-LABEL: stack_fold_fmaddsub312ph_mask:
3490 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3493 ; CHECK-NEXT: #NO_APP
3494 ; CHECK-NEXT: vmovaps (%rdi), %zmm2
3495 ; CHECK-NEXT: kmovd %esi, %k1
3496 ; CHECK-NEXT: vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3497 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
3499 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3500 %a0 = load <32 x half>, ptr %p
3501 %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4)
3502 %3 = bitcast i32 %mask to <32 x i1>
3503 %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
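
; Zero-masked variants: the mask is loaded from memory and the select falls
; back to zeroinitializer, which should lower to {%k1} {z} zeroing semantics.
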
define <32 x half> @stack_fold_fmaddsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub123ph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmaddsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub213ph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmaddsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub231ph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmaddsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub321ph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmaddsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub132ph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmaddsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub312ph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
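
; FMSUBADD tests: there is no separate fmsubadd intrinsic here; each test
; negates the addend with fneg and reuses the vfmaddsub intrinsic, which
; should still select the vfmsubadd* forms and fold the spilled operand.
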
define <32 x half> @stack_fold_fmsubadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd123ph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %2, i32 4)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsubadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd213ph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %2, i32 4)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsubadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd231ph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %2, i32 4)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsubadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd321ph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %2, i32 4)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsubadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd132ph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %2, i32 4)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsubadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd312ph:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %2, i32 4)
  ret <32 x half> %3
}
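
; Masked (merge) variants of the FMSUBADD tests, again with %a0 reloaded from
; memory as both operand and passthru.
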
define <32 x half> @stack_fold_fmsubadd123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd123ph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsubadd213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd213ph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsubadd231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd231ph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsubadd321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd321ph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsubadd132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd132ph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsubadd312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd312ph_mask:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovaps (%rdi), %zmm2
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}
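
; Zero-masked variants of the FMSUBADD tests.
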
define <32 x half> @stack_fold_fmsubadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd123ph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsubadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd213ph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsubadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd231ph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsubadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd321ph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsubadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd132ph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsubadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd312ph_maskz:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: kmovd (%rdi), %k1
; CHECK-NEXT: vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}