1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE
3 ; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE
5 define <4 x i64> @permq_ri_256(<4 x i64> %a0) {
6 ; ENABLE-LABEL: permq_ri_256:
10 ; ENABLE-NEXT: #NO_APP
11 ; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
12 ; ENABLE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,2,1,0]
13 ; ENABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
16 ; DISABLE-LABEL: permq_ri_256:
20 ; DISABLE-NEXT: #NO_APP
21 ; DISABLE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,2,1,0]
22 ; DISABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
24 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
25 %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
26 %res = add <4 x i64> %2, %a0
30 define <4 x i64> @permq_rr_256(<4 x i64> %a0, <4 x i64> %idx) {
31 ; ENABLE-LABEL: permq_rr_256:
33 ; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
36 ; ENABLE-NEXT: #NO_APP
37 ; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
38 ; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
39 ; ENABLE-NEXT: vpermq %ymm0, %ymm2, %ymm1
40 ; ENABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0
41 ; ENABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
44 ; DISABLE-LABEL: permq_rr_256:
46 ; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
49 ; DISABLE-NEXT: #NO_APP
50 ; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
51 ; DISABLE-NEXT: vpermq %ymm0, %ymm2, %ymm1
52 ; DISABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0
53 ; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
55 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
56 %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx)
57 %t = add <4 x i64> %a0, %idx
58 %res = add <4 x i64> %t, %2
62 define <4 x i64> @permq_rm_256(ptr %p0, <4 x i64> %idx) {
63 ; ENABLE-LABEL: permq_rm_256:
67 ; ENABLE-NEXT: #NO_APP
68 ; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
69 ; ENABLE-NEXT: vpermq (%rdi), %ymm0, %ymm1
70 ; ENABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
73 ; DISABLE-LABEL: permq_rm_256:
77 ; DISABLE-NEXT: #NO_APP
78 ; DISABLE-NEXT: vpermq (%rdi), %ymm0, %ymm1
79 ; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
81 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
82 %a0 = load <4 x i64>, ptr %p0, align 64
83 %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx)
84 %res = add <4 x i64> %idx, %2
88 define <4 x i64> @permq_mi_256(ptr %p0) {
89 ; ENABLE-LABEL: permq_mi_256:
93 ; ENABLE-NEXT: #NO_APP
94 ; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
95 ; ENABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,2,0]
98 ; DISABLE-LABEL: permq_mi_256:
102 ; DISABLE-NEXT: #NO_APP
103 ; DISABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,2,0]
105 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
106 %a0 = load <4 x i64>, ptr %p0, align 64
107 %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
111 define <4 x i64> @permq_broadcast_256(ptr %p0, <4 x i64> %idx) {
112 ; ENABLE-LABEL: permq_broadcast_256:
114 ; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
117 ; ENABLE-NEXT: #NO_APP
118 ; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
119 ; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
120 ; ENABLE-NEXT: vpermq (%rdi){1to4}, %ymm1, %ymm0
121 ; ENABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
124 ; DISABLE-LABEL: permq_broadcast_256:
126 ; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
129 ; DISABLE-NEXT: #NO_APP
130 ; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
131 ; DISABLE-NEXT: vpermq (%rdi){1to4}, %ymm1, %ymm0
132 ; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
134 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
135 %v0 = load i64, ptr %p0, align 4
136 %t0 = insertelement <4 x i64> undef, i64 %v0, i64 0
137 %a0 = shufflevector <4 x i64> %t0, <4 x i64> undef, <4 x i32> zeroinitializer
138 %2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx)
139 %res = add <4 x i64> %2, %idx
143 define <4 x i64> @permq_maskz_256(<4 x i64> %a0, <4 x i64> %idx, ptr %mask) {
144 ; ENABLE-LABEL: permq_maskz_256:
148 ; ENABLE-NEXT: #NO_APP
149 ; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
150 ; ENABLE-NEXT: vpermq %ymm0, %ymm1, %ymm2
151 ; ENABLE-NEXT: kmovb (%rdi), %k1
152 ; ENABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
153 ; ENABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0 {%k1}
156 ; DISABLE-LABEL: permq_maskz_256:
160 ; DISABLE-NEXT: #NO_APP
161 ; DISABLE-NEXT: vpermq %ymm0, %ymm1, %ymm2
162 ; DISABLE-NEXT: kmovb (%rdi), %k1
163 ; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
164 ; DISABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0 {%k1}
166 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
167 %2 = load i8, ptr %mask
168 %3 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %a0, <4 x i64> %idx, <4 x i64> zeroinitializer, i8 %2)
169 %t = add <4 x i64> %a0, %idx
170 %res = add <4 x i64> %3, %t
174 declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>)
175 declare <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
177 define <8 x i64> @permq_rr_512(<8 x i64> %a0, <8 x i64> %idx) {
178 ; ENABLE-LABEL: permq_rr_512:
180 ; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
183 ; ENABLE-NEXT: #NO_APP
184 ; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
185 ; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
186 ; ENABLE-NEXT: vpermq %zmm0, %zmm2, %zmm1
187 ; ENABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0
188 ; ENABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
191 ; DISABLE-LABEL: permq_rr_512:
193 ; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
196 ; DISABLE-NEXT: #NO_APP
197 ; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
198 ; DISABLE-NEXT: vpermq %zmm0, %zmm2, %zmm1
199 ; DISABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0
200 ; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
202 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
203 %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx)
204 %t = add <8 x i64> %a0, %idx
205 %res = add <8 x i64> %t, %2
209 define <8 x i64> @permq_rm_512(ptr %p0, <8 x i64> %idx) {
210 ; ENABLE-LABEL: permq_rm_512:
214 ; ENABLE-NEXT: #NO_APP
215 ; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
216 ; ENABLE-NEXT: vpermq (%rdi), %zmm0, %zmm1
217 ; ENABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
220 ; DISABLE-LABEL: permq_rm_512:
224 ; DISABLE-NEXT: #NO_APP
225 ; DISABLE-NEXT: vpermq (%rdi), %zmm0, %zmm1
226 ; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
228 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
229 %a0 = load <8 x i64>, ptr %p0, align 64
230 %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx)
231 %res = add <8 x i64> %idx, %2
235 define <8 x i64> @permq_broadcast_512(ptr %p0, <8 x i64> %idx) {
236 ; ENABLE-LABEL: permq_broadcast_512:
238 ; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
241 ; ENABLE-NEXT: #NO_APP
242 ; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
243 ; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
244 ; ENABLE-NEXT: vpermq (%rdi){1to8}, %zmm1, %zmm0
245 ; ENABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
248 ; DISABLE-LABEL: permq_broadcast_512:
250 ; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
253 ; DISABLE-NEXT: #NO_APP
254 ; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
255 ; DISABLE-NEXT: vpermq (%rdi){1to8}, %zmm1, %zmm0
256 ; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
258 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
259 %v0 = load i64, ptr %p0, align 4
260 %t0 = insertelement <8 x i64> undef, i64 %v0, i64 0
261 %a0 = shufflevector <8 x i64> %t0, <8 x i64> undef, <8 x i32> zeroinitializer
262 %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx)
263 %res = add <8 x i64> %2, %idx
267 define <8 x i64> @permq_maskz_512(<8 x i64> %a0, <8 x i64> %idx, ptr %mask) {
268 ; ENABLE-LABEL: permq_maskz_512:
272 ; ENABLE-NEXT: #NO_APP
273 ; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2
274 ; ENABLE-NEXT: vpermq %zmm0, %zmm1, %zmm2
275 ; ENABLE-NEXT: kmovb (%rdi), %k1
276 ; ENABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
277 ; ENABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0 {%k1}
280 ; DISABLE-LABEL: permq_maskz_512:
284 ; DISABLE-NEXT: #NO_APP
285 ; DISABLE-NEXT: vpermq %zmm0, %zmm1, %zmm2
286 ; DISABLE-NEXT: kmovb (%rdi), %k1
287 ; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
288 ; DISABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0 {%k1}
290 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
291 %2 = load i8, ptr %mask
292 %3 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a0, <8 x i64> %idx, <8 x i64> zeroinitializer, i8 %2)
293 %t = add <8 x i64> %a0, %idx
294 %res = add <8 x i64> %3, %t
298 declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
299 declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
301 define <8 x i32> @permd_rr_256(<8 x i32> %a0, <8 x i32> %idx) {
302 ; ENABLE-LABEL: permd_rr_256:
304 ; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
307 ; ENABLE-NEXT: #NO_APP
308 ; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
309 ; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
310 ; ENABLE-NEXT: vpermd %ymm0, %ymm2, %ymm1
311 ; ENABLE-NEXT: vpaddd %ymm2, %ymm0, %ymm0
312 ; ENABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0
315 ; DISABLE-LABEL: permd_rr_256:
317 ; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
320 ; DISABLE-NEXT: #NO_APP
321 ; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
322 ; DISABLE-NEXT: vpermd %ymm0, %ymm2, %ymm1
323 ; DISABLE-NEXT: vpaddd %ymm2, %ymm0, %ymm0
324 ; DISABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0
326 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
327 %2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> undef, i8 -1)
328 %t = add <8 x i32> %a0, %idx
329 %res = add <8 x i32> %t, %2
333 define <8 x i32> @permd_rm_256(ptr %p0, <8 x i32> %idx) {
334 ; ENABLE-LABEL: permd_rm_256:
338 ; ENABLE-NEXT: #NO_APP
339 ; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
340 ; ENABLE-NEXT: vpermd (%rdi), %ymm0, %ymm1
341 ; ENABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0
344 ; DISABLE-LABEL: permd_rm_256:
348 ; DISABLE-NEXT: #NO_APP
349 ; DISABLE-NEXT: vpermd (%rdi), %ymm0, %ymm1
350 ; DISABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0
352 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
353 %a0 = load <8 x i32>, ptr %p0, align 64
354 %2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> undef, i8 -1)
355 %res = add <8 x i32> %idx, %2
359 define <8 x i32> @permd_broadcast_256(ptr %p0, <8 x i32> %idx) {
360 ; ENABLE-LABEL: permd_broadcast_256:
362 ; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
365 ; ENABLE-NEXT: #NO_APP
366 ; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
367 ; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
368 ; ENABLE-NEXT: vpermd (%rdi){1to8}, %ymm1, %ymm0
369 ; ENABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0
372 ; DISABLE-LABEL: permd_broadcast_256:
374 ; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
377 ; DISABLE-NEXT: #NO_APP
378 ; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
379 ; DISABLE-NEXT: vpermd (%rdi){1to8}, %ymm1, %ymm0
380 ; DISABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0
382 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
383 %v0 = load i32, ptr %p0, align 4
384 %t0 = insertelement <8 x i32> undef, i32 %v0, i32 0
385 %a0 = shufflevector <8 x i32> %t0, <8 x i32> undef, <8 x i32> zeroinitializer
386 %2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> zeroinitializer, i8 -1)
387 %res = add <8 x i32> %2, %idx
391 define <8 x i32> @permd_maskz_256(<8 x i32> %a0, <8 x i32> %idx, ptr %mask) {
392 ; ENABLE-LABEL: permd_maskz_256:
396 ; ENABLE-NEXT: #NO_APP
397 ; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
398 ; ENABLE-NEXT: vpermd %ymm0, %ymm1, %ymm2
399 ; ENABLE-NEXT: kmovb (%rdi), %k1
400 ; ENABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0
401 ; ENABLE-NEXT: vpaddd %ymm2, %ymm0, %ymm0 {%k1}
404 ; DISABLE-LABEL: permd_maskz_256:
408 ; DISABLE-NEXT: #NO_APP
409 ; DISABLE-NEXT: vpermd %ymm0, %ymm1, %ymm2
410 ; DISABLE-NEXT: kmovb (%rdi), %k1
411 ; DISABLE-NEXT: vpaddd %ymm1, %ymm0, %ymm0
412 ; DISABLE-NEXT: vpaddd %ymm2, %ymm0, %ymm0 {%k1}
414 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
415 %2 = load i8, ptr %mask
416 %3 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %a0, <8 x i32> %idx, <8 x i32> zeroinitializer, i8 %2)
417 %t = add <8 x i32> %a0, %idx
418 %res = add <8 x i32> %3, %t
422 declare <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
424 define <16 x i32> @permd_rr_512(<16 x i32> %a0, <16 x i32> %idx) {
425 ; ENABLE-LABEL: permd_rr_512:
427 ; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
430 ; ENABLE-NEXT: #NO_APP
431 ; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
432 ; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
433 ; ENABLE-NEXT: vpermd %zmm0, %zmm2, %zmm1
434 ; ENABLE-NEXT: vpaddd %zmm2, %zmm0, %zmm0
435 ; ENABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0
438 ; DISABLE-LABEL: permd_rr_512:
440 ; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
443 ; DISABLE-NEXT: #NO_APP
444 ; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
445 ; DISABLE-NEXT: vpermd %zmm0, %zmm2, %zmm1
446 ; DISABLE-NEXT: vpaddd %zmm2, %zmm0, %zmm0
447 ; DISABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0
449 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
450 %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> undef, i16 -1)
451 %t = add <16 x i32> %a0, %idx
452 %res = add <16 x i32> %t, %2
456 define <16 x i32> @permd_rm_512(ptr %p0, <16 x i32> %idx) {
457 ; ENABLE-LABEL: permd_rm_512:
461 ; ENABLE-NEXT: #NO_APP
462 ; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
463 ; ENABLE-NEXT: vpermd (%rdi), %zmm0, %zmm1
464 ; ENABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0
467 ; DISABLE-LABEL: permd_rm_512:
471 ; DISABLE-NEXT: #NO_APP
472 ; DISABLE-NEXT: vpermd (%rdi), %zmm0, %zmm1
473 ; DISABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0
475 %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
476 %a0 = load <16 x i32>, ptr %p0, align 64
477 %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> undef, i16 -1)
478 %res = add <16 x i32> %idx, %2
482 define <16 x i32> @permd_broadcast_512(ptr %p0, <16 x i32> %idx) {
483 ; ENABLE-LABEL: permd_broadcast_512:
485 ; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
488 ; ENABLE-NEXT: #NO_APP
489 ; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
490 ; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
491 ; ENABLE-NEXT: vpermd (%rdi){1to16}, %zmm1, %zmm0
492 ; ENABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0
495 ; DISABLE-LABEL: permd_broadcast_512:
497 ; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
500 ; DISABLE-NEXT: #NO_APP
501 ; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
502 ; DISABLE-NEXT: vpermd (%rdi){1to16}, %zmm1, %zmm0
503 ; DISABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0
505 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
506 %v0 = load i32, ptr %p0, align 4
507 %t0 = insertelement <16 x i32> undef, i32 %v0, i32 0
508 %a0 = shufflevector <16 x i32> %t0, <16 x i32> undef, <16 x i32> zeroinitializer
509 %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> undef, i16 -1)
510 %res = add <16 x i32> %2, %idx
514 define <16 x i32> @permd_maskz_512(<16 x i32> %a0, <16 x i32> %idx, ptr %mask) {
515 ; ENABLE-LABEL: permd_maskz_512:
519 ; ENABLE-NEXT: #NO_APP
520 ; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2
521 ; ENABLE-NEXT: vpermd %zmm0, %zmm1, %zmm2
522 ; ENABLE-NEXT: kmovw (%rdi), %k1
523 ; ENABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0
524 ; ENABLE-NEXT: vpaddd %zmm2, %zmm0, %zmm0 {%k1}
527 ; DISABLE-LABEL: permd_maskz_512:
531 ; DISABLE-NEXT: #NO_APP
532 ; DISABLE-NEXT: vpermd %zmm0, %zmm1, %zmm2
533 ; DISABLE-NEXT: kmovw (%rdi), %k1
534 ; DISABLE-NEXT: vpaddd %zmm1, %zmm0, %zmm0
535 ; DISABLE-NEXT: vpaddd %zmm2, %zmm0, %zmm0 {%k1}
537 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
538 %2 = load i16, ptr %mask
539 %3 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a0, <16 x i32> %idx, <16 x i32> zeroinitializer, i16 %2)
540 %t = add <16 x i32> %a0, %idx
541 %res = add <16 x i32> %3, %t
545 declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
547 define <4 x double> @permpd_ri_256(<4 x double> %a0) {
548 ; ENABLE-LABEL: permpd_ri_256:
552 ; ENABLE-NEXT: #NO_APP
553 ; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
554 ; ENABLE-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,2,1,0]
555 ; ENABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0
558 ; DISABLE-LABEL: permpd_ri_256:
562 ; DISABLE-NEXT: #NO_APP
563 ; DISABLE-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,2,1,0]
564 ; DISABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0
566 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
567 %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
568 %res = fadd <4 x double> %2, %a0
569 ret <4 x double> %res
572 define <4 x double> @permpd_rr_256(<4 x double> %a0, <4 x i64> %idx) {
573 ; ENABLE-LABEL: permpd_rr_256:
575 ; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
578 ; ENABLE-NEXT: #NO_APP
579 ; ENABLE-NEXT: vmovapd %ymm0, %ymm2
580 ; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
581 ; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
582 ; ENABLE-NEXT: vpermpd %ymm2, %ymm0, %ymm1
583 ; ENABLE-NEXT: vcvtqq2pd %ymm0, %ymm0
584 ; ENABLE-NEXT: vaddpd %ymm0, %ymm2, %ymm0
585 ; ENABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0
588 ; DISABLE-LABEL: permpd_rr_256:
590 ; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
593 ; DISABLE-NEXT: #NO_APP
594 ; DISABLE-NEXT: vmovapd %ymm0, %ymm2
595 ; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
596 ; DISABLE-NEXT: vpermpd %ymm2, %ymm0, %ymm1
597 ; DISABLE-NEXT: vcvtqq2pd %ymm0, %ymm0
598 ; DISABLE-NEXT: vaddpd %ymm0, %ymm2, %ymm0
599 ; DISABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0
601 %1 = tail call <4 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
602 %2 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %1, <4 x i64> %idx)
603 %a1 = sitofp <4 x i64> %idx to <4 x double>
604 %t = fadd <4 x double> %1, %a1
605 %res = fadd <4 x double> %2, %t
606 ret <4 x double> %res
609 define <4 x double> @permpd_rm_256(ptr %p0, <4 x i64> %idx) {
610 ; ENABLE-LABEL: permpd_rm_256:
612 ; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
615 ; ENABLE-NEXT: #NO_APP
616 ; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
617 ; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
618 ; ENABLE-NEXT: vpermpd (%rdi), %ymm1, %ymm0
619 ; ENABLE-NEXT: vcvtqq2pd %ymm1, %ymm1
620 ; ENABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0
623 ; DISABLE-LABEL: permpd_rm_256:
625 ; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
628 ; DISABLE-NEXT: #NO_APP
629 ; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
630 ; DISABLE-NEXT: vpermpd (%rdi), %ymm1, %ymm0
631 ; DISABLE-NEXT: vcvtqq2pd %ymm1, %ymm1
632 ; DISABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0
634 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
635 %a0 = load <4 x double>, ptr %p0, align 64
636 %2 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> %idx)
637 %a1 = sitofp <4 x i64> %idx to <4 x double>
638 %res = fadd <4 x double> %2, %a1
639 ret <4 x double> %res
642 define <4 x double> @permpd_mi_256(ptr %p0) {
643 ; ENABLE-LABEL: permpd_mi_256:
647 ; ENABLE-NEXT: #NO_APP
648 ; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
649 ; ENABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,2,0]
652 ; DISABLE-LABEL: permpd_mi_256:
656 ; DISABLE-NEXT: #NO_APP
657 ; DISABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,2,0]
659 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
660 %a0 = load <4 x double>, ptr %p0, align 64
661 %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
665 define <4 x double> @permpd_broadcast_256(ptr %p0, <4 x i64> %idx) {
666 ; ENABLE-LABEL: permpd_broadcast_256:
668 ; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
671 ; ENABLE-NEXT: #NO_APP
672 ; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
673 ; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
674 ; ENABLE-NEXT: vpermpd (%rdi){1to4}, %ymm1, %ymm0
675 ; ENABLE-NEXT: vcvtqq2pd %ymm1, %ymm1
676 ; ENABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0
679 ; DISABLE-LABEL: permpd_broadcast_256:
681 ; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
684 ; DISABLE-NEXT: #NO_APP
685 ; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
686 ; DISABLE-NEXT: vpermpd (%rdi){1to4}, %ymm1, %ymm0
687 ; DISABLE-NEXT: vcvtqq2pd %ymm1, %ymm1
688 ; DISABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0
690 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
691 %v0 = load double, ptr %p0, align 4
692 %t0 = insertelement <4 x double> undef, double %v0, i64 0
693 %a0 = shufflevector <4 x double> %t0, <4 x double> undef, <4 x i32> zeroinitializer
694 %2 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %a0, <4 x i64> %idx)
695 %a1 = sitofp <4 x i64> %idx to <4 x double>
696 %res = fadd <4 x double> %2, %a1
697 ret <4 x double> %res
700 define <4 x double> @permpd_maskz_256(<4 x double> %a0, <4 x i64> %idx, ptr %mask) {
701 ; ENABLE-LABEL: permpd_maskz_256:
705 ; ENABLE-NEXT: #NO_APP
706 ; ENABLE-NEXT: kmovb (%rdi), %k1
707 ; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
708 ; ENABLE-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} {z}
709 ; ENABLE-NEXT: vcvtqq2pd %ymm1, %ymm1
710 ; ENABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0
711 ; ENABLE-NEXT: vaddpd %ymm0, %ymm2, %ymm0
714 ; DISABLE-LABEL: permpd_maskz_256:
718 ; DISABLE-NEXT: #NO_APP
719 ; DISABLE-NEXT: kmovb (%rdi), %k1
720 ; DISABLE-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} {z}
721 ; DISABLE-NEXT: vcvtqq2pd %ymm1, %ymm1
722 ; DISABLE-NEXT: vaddpd %ymm1, %ymm0, %ymm0
723 ; DISABLE-NEXT: vaddpd %ymm0, %ymm2, %ymm0
725 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
726 %2 = load i8, ptr %mask
727 %3 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %a0, <4 x i64> %idx, <4 x double> zeroinitializer, i8 %2)
728 %a1 = sitofp <4 x i64> %idx to <4 x double>
729 %t = fadd <4 x double> %a0, %a1
730 %res = fadd <4 x double> %3, %t
731 ret <4 x double> %res
734 declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>)
735 declare <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double>, <4 x i64>, <4 x double>, i8)
737 define <8 x double> @permpd_rr_512(<8 x double> %a0, <8 x i64> %idx) {
738 ; ENABLE-LABEL: permpd_rr_512:
740 ; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
743 ; ENABLE-NEXT: #NO_APP
744 ; ENABLE-NEXT: vmovapd %zmm0, %zmm2
745 ; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
746 ; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
747 ; ENABLE-NEXT: vpermpd %zmm2, %zmm0, %zmm1
748 ; ENABLE-NEXT: vcvtqq2pd %zmm0, %zmm0
749 ; ENABLE-NEXT: vaddpd %zmm0, %zmm2, %zmm0
750 ; ENABLE-NEXT: vaddpd %zmm0, %zmm1, %zmm0
753 ; DISABLE-LABEL: permpd_rr_512:
755 ; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
758 ; DISABLE-NEXT: #NO_APP
759 ; DISABLE-NEXT: vmovapd %zmm0, %zmm2
760 ; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
761 ; DISABLE-NEXT: vpermpd %zmm2, %zmm0, %zmm1
762 ; DISABLE-NEXT: vcvtqq2pd %zmm0, %zmm0
763 ; DISABLE-NEXT: vaddpd %zmm0, %zmm2, %zmm0
764 ; DISABLE-NEXT: vaddpd %zmm0, %zmm1, %zmm0
766 %1 = tail call <8 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
767 %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %1, <8 x i64> %idx)
768 %a1 = sitofp <8 x i64> %idx to <8 x double>
769 %t = fadd <8 x double> %1, %a1
770 %res = fadd <8 x double> %2, %t
771 ret <8 x double> %res
; permpd_rm_512: 512-bit variable double permute with a memory source operand
; ((%rdi)). The index vector survives an inline asm clobbering xmm1-xmm31, so
; it is spilled/reloaded. With +false-deps-perm the destination is zeroed
; (vpxor) before vpermpd to break the false dependence on its stale contents;
; with -false-deps-perm no zeroing idiom is emitted.
; NOTE(review): check lines are autogenerated — regenerate, do not hand-edit.
774 define <8 x double> @permpd_rm_512(ptr %p0, <8 x i64> %idx) {
775 ; ENABLE-LABEL: permpd_rm_512:
777 ; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
780 ; ENABLE-NEXT: #NO_APP
781 ; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
782 ; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
783 ; ENABLE-NEXT: vpermpd (%rdi), %zmm1, %zmm0
784 ; ENABLE-NEXT: vcvtqq2pd %zmm1, %zmm1
785 ; ENABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0
788 ; DISABLE-LABEL: permpd_rm_512:
790 ; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
793 ; DISABLE-NEXT: #NO_APP
794 ; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
795 ; DISABLE-NEXT: vpermpd (%rdi), %zmm1, %zmm0
796 ; DISABLE-NEXT: vcvtqq2pd %zmm1, %zmm1
797 ; DISABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0
799 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
800 %a0 = load <8 x double>, ptr %p0, align 64
801 %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> %idx)
802 %a1 = sitofp <8 x i64> %idx to <8 x double>
803 %res = fadd <8 x double> %2, %a1
804 ret <8 x double> %res
; permpd_broadcast_512: same as the rm case but the permute source is an
; embedded broadcast load ((%rdi){1to8}). With +false-deps-perm the
; destination is zeroed with vpxor before vpermpd; with -false-deps-perm
; the permute writes the undefined destination directly.
807 define <8 x double> @permpd_broadcast_512(ptr %p0, <8 x i64> %idx) {
808 ; ENABLE-LABEL: permpd_broadcast_512:
810 ; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
813 ; ENABLE-NEXT: #NO_APP
814 ; ENABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
815 ; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
816 ; ENABLE-NEXT: vpermpd (%rdi){1to8}, %zmm1, %zmm0
817 ; ENABLE-NEXT: vcvtqq2pd %zmm1, %zmm1
818 ; ENABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0
821 ; DISABLE-LABEL: permpd_broadcast_512:
823 ; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
826 ; DISABLE-NEXT: #NO_APP
827 ; DISABLE-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
828 ; DISABLE-NEXT: vpermpd (%rdi){1to8}, %zmm1, %zmm0
829 ; DISABLE-NEXT: vcvtqq2pd %zmm1, %zmm1
830 ; DISABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0
832 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
833 %v0 = load double, ptr %p0, align 4
834 %t0 = insertelement <8 x double> undef, double %v0, i64 0
835 %a0 = shufflevector <8 x double> %t0, <8 x double> undef, <8 x i32> zeroinitializer
836 %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %a0, <8 x i64> %idx)
837 %a1 = sitofp <8 x i64> %idx to <8 x double>
838 %res = fadd <8 x double> %2, %a1
839 ret <8 x double> %res
; permpd_maskz_512: zero-masked vpermpd ({%k1} {z}) with the mask byte loaded
; via kmovb. Even though zero-masking defines all lanes, the +false-deps-perm
; run still inserts a vpxor of the destination first; the -false-deps-perm
; run does not.
842 define <8 x double> @permpd_maskz_512(<8 x double> %a0, <8 x i64> %idx, ptr %mask) {
843 ; ENABLE-LABEL: permpd_maskz_512:
847 ; ENABLE-NEXT: #NO_APP
848 ; ENABLE-NEXT: kmovb (%rdi), %k1
849 ; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2
850 ; ENABLE-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1} {z}
851 ; ENABLE-NEXT: vcvtqq2pd %zmm1, %zmm1
852 ; ENABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0
853 ; ENABLE-NEXT: vaddpd %zmm0, %zmm2, %zmm0
856 ; DISABLE-LABEL: permpd_maskz_512:
860 ; DISABLE-NEXT: #NO_APP
861 ; DISABLE-NEXT: kmovb (%rdi), %k1
862 ; DISABLE-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1} {z}
863 ; DISABLE-NEXT: vcvtqq2pd %zmm1, %zmm1
864 ; DISABLE-NEXT: vaddpd %zmm1, %zmm0, %zmm0
865 ; DISABLE-NEXT: vaddpd %zmm0, %zmm2, %zmm0
867 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
868 %2 = load i8, ptr %mask
869 %3 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %a0, <8 x i64> %idx, <8 x double> zeroinitializer, i8 %2)
870 %a1 = sitofp <8 x i64> %idx to <8 x double>
871 %t = fadd <8 x double> %a0, %a1
872 %res = fadd <8 x double> %3, %t
873 ret <8 x double> %res
876 declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
877 declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8)
; permps_rr_256: 256-bit register-register vpermps. The permute input is
; produced by an inline asm clobbering xmm1-xmm31, and the index vector is
; spilled/reloaded around it. With +false-deps-perm the destination is zeroed
; (vxorps) before vpermps; with -false-deps-perm it is not.
880 define <8 x float> @permps_rr_256(<8 x float> %a0, <8 x i32> %idx) {
881 ; ENABLE-LABEL: permps_rr_256:
883 ; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
886 ; ENABLE-NEXT: #NO_APP
887 ; ENABLE-NEXT: vmovaps %ymm0, %ymm2
888 ; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
889 ; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
890 ; ENABLE-NEXT: vpermps %ymm2, %ymm0, %ymm1
891 ; ENABLE-NEXT: vcvtdq2ps %ymm0, %ymm0
892 ; ENABLE-NEXT: vaddps %ymm0, %ymm2, %ymm0
893 ; ENABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0
896 ; DISABLE-LABEL: permps_rr_256:
898 ; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
901 ; DISABLE-NEXT: #NO_APP
902 ; DISABLE-NEXT: vmovaps %ymm0, %ymm2
903 ; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
904 ; DISABLE-NEXT: vpermps %ymm2, %ymm0, %ymm1
905 ; DISABLE-NEXT: vcvtdq2ps %ymm0, %ymm0
906 ; DISABLE-NEXT: vaddps %ymm0, %ymm2, %ymm0
907 ; DISABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0
909 %1 = tail call <8 x float> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
910 %2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %1, <8 x i32> %idx, <8 x float> zeroinitializer, i8 -1)
911 %a1 = sitofp <8 x i32> %idx to <8 x float>
912 %t = fadd <8 x float> %1, %a1
913 %res = fadd <8 x float> %2, %t
; permps_rm_256: 256-bit vpermps with a memory source ((%rdi)). With
; +false-deps-perm the destination is zeroed (vxorps) before the permute;
; with -false-deps-perm no zeroing is emitted.
917 define <8 x float> @permps_rm_256(ptr %p0, <8 x i32> %idx) {
918 ; ENABLE-LABEL: permps_rm_256:
920 ; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
923 ; ENABLE-NEXT: #NO_APP
924 ; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
925 ; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
926 ; ENABLE-NEXT: vpermps (%rdi), %ymm1, %ymm0
927 ; ENABLE-NEXT: vcvtdq2ps %ymm1, %ymm1
928 ; ENABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0
931 ; DISABLE-LABEL: permps_rm_256:
933 ; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
936 ; DISABLE-NEXT: #NO_APP
937 ; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
938 ; DISABLE-NEXT: vpermps (%rdi), %ymm1, %ymm0
939 ; DISABLE-NEXT: vcvtdq2ps %ymm1, %ymm1
940 ; DISABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0
942 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
943 %a0 = load <8 x float>, ptr %p0, align 64
944 %2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> %idx, <8 x float> zeroinitializer, i8 -1)
945 %a1 = sitofp <8 x i32> %idx to <8 x float>
946 %res = fadd <8 x float> %2, %a1
; permps_broadcast_256: 256-bit vpermps with an embedded broadcast source
; ((%rdi){1to8}). With +false-deps-perm the destination is zeroed (vxorps)
; before the permute; with -false-deps-perm it is not.
950 define <8 x float> @permps_broadcast_256(ptr %p0, <8 x i32> %idx) {
951 ; ENABLE-LABEL: permps_broadcast_256:
953 ; ENABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
956 ; ENABLE-NEXT: #NO_APP
957 ; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
958 ; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
959 ; ENABLE-NEXT: vpermps (%rdi){1to8}, %ymm1, %ymm0
960 ; ENABLE-NEXT: vcvtdq2ps %ymm1, %ymm1
961 ; ENABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0
964 ; DISABLE-LABEL: permps_broadcast_256:
966 ; DISABLE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
969 ; DISABLE-NEXT: #NO_APP
970 ; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
971 ; DISABLE-NEXT: vpermps (%rdi){1to8}, %ymm1, %ymm0
972 ; DISABLE-NEXT: vcvtdq2ps %ymm1, %ymm1
973 ; DISABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0
975 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
976 %v0 = load float, ptr %p0, align 4
977 %t0 = insertelement <8 x float> undef, float %v0, i32 0
978 %a0 = shufflevector <8 x float> %t0, <8 x float> undef, <8 x i32> zeroinitializer
979 %2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> %idx, <8 x float> zeroinitializer, i8 -1)
980 %a1 = sitofp <8 x i32> %idx to <8 x float>
981 %res = fadd <8 x float> %2, %a1
; permps_maskz_256: zero-masked 256-bit vpermps ({%k1} {z}) with the mask
; byte loaded via kmovb. The +false-deps-perm run zeroes the destination
; (vxorps) before the masked permute; the -false-deps-perm run does not.
985 define <8 x float> @permps_maskz_256(<8 x float> %a0, <8 x i32> %idx, ptr %mask) {
986 ; ENABLE-LABEL: permps_maskz_256:
990 ; ENABLE-NEXT: #NO_APP
991 ; ENABLE-NEXT: kmovb (%rdi), %k1
992 ; ENABLE-NEXT: vxorps %xmm2, %xmm2, %xmm2
993 ; ENABLE-NEXT: vpermps %ymm0, %ymm1, %ymm2 {%k1} {z}
994 ; ENABLE-NEXT: vcvtdq2ps %ymm1, %ymm1
995 ; ENABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0
996 ; ENABLE-NEXT: vaddps %ymm0, %ymm2, %ymm0
999 ; DISABLE-LABEL: permps_maskz_256:
1001 ; DISABLE-NEXT: #APP
1003 ; DISABLE-NEXT: #NO_APP
1004 ; DISABLE-NEXT: kmovb (%rdi), %k1
1005 ; DISABLE-NEXT: vpermps %ymm0, %ymm1, %ymm2 {%k1} {z}
1006 ; DISABLE-NEXT: vcvtdq2ps %ymm1, %ymm1
1007 ; DISABLE-NEXT: vaddps %ymm1, %ymm0, %ymm0
1008 ; DISABLE-NEXT: vaddps %ymm0, %ymm2, %ymm0
1009 ; DISABLE-NEXT: retq
1010 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1011 %2 = load i8, ptr %mask
1012 %3 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %a0, <8 x i32> %idx, <8 x float> zeroinitializer, i8 %2)
1013 %a1 = sitofp <8 x i32> %idx to <8 x float>
1014 %t = fadd <8 x float> %a0, %a1
1015 %res = fadd <8 x float> %3, %t
1016 ret <8 x float> %res
1019 declare <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float>, <8 x i32>, <8 x float>, i8)
; permps_rr_512: 512-bit register-register vpermps; the permute input comes
; from an inline asm clobbering xmm1-xmm31 and the index vector is
; spilled/reloaded. With +false-deps-perm the destination is zeroed (vpxor)
; before vpermps; with -false-deps-perm it is not.
1021 define <16 x float> @permps_rr_512(<16 x float> %a0, <16 x i32> %idx) {
1022 ; ENABLE-LABEL: permps_rr_512:
1024 ; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1027 ; ENABLE-NEXT: #NO_APP
1028 ; ENABLE-NEXT: vmovaps %zmm0, %zmm2
1029 ; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1030 ; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
1031 ; ENABLE-NEXT: vpermps %zmm2, %zmm0, %zmm1
1032 ; ENABLE-NEXT: vcvtdq2ps %zmm0, %zmm0
1033 ; ENABLE-NEXT: vaddps %zmm0, %zmm2, %zmm0
1034 ; ENABLE-NEXT: vaddps %zmm0, %zmm1, %zmm0
1037 ; DISABLE-LABEL: permps_rr_512:
1039 ; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1040 ; DISABLE-NEXT: #APP
1042 ; DISABLE-NEXT: #NO_APP
1043 ; DISABLE-NEXT: vmovaps %zmm0, %zmm2
1044 ; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1045 ; DISABLE-NEXT: vpermps %zmm2, %zmm0, %zmm1
1046 ; DISABLE-NEXT: vcvtdq2ps %zmm0, %zmm0
1047 ; DISABLE-NEXT: vaddps %zmm0, %zmm2, %zmm0
1048 ; DISABLE-NEXT: vaddps %zmm0, %zmm1, %zmm0
1049 ; DISABLE-NEXT: retq
1050 %1 = tail call <16 x float> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1051 %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %1, <16 x i32> %idx)
1052 %a1 = sitofp <16 x i32> %idx to <16 x float>
1053 %t = fadd <16 x float> %1, %a1
1054 %res = fadd <16 x float> %2, %t
1055 ret <16 x float> %res
; permps_rm_512: 512-bit vpermps with a memory source ((%rdi)). With
; +false-deps-perm the destination is zeroed (vpxor) before the permute;
; with -false-deps-perm no zeroing is emitted.
1058 define <16 x float> @permps_rm_512(ptr %p0, <16 x i32> %idx) {
1059 ; ENABLE-LABEL: permps_rm_512:
1061 ; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1064 ; ENABLE-NEXT: #NO_APP
1065 ; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
1066 ; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
1067 ; ENABLE-NEXT: vpermps (%rdi), %zmm1, %zmm0
1068 ; ENABLE-NEXT: vcvtdq2ps %zmm1, %zmm1
1069 ; ENABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0
1072 ; DISABLE-LABEL: permps_rm_512:
1074 ; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1075 ; DISABLE-NEXT: #APP
1077 ; DISABLE-NEXT: #NO_APP
1078 ; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
1079 ; DISABLE-NEXT: vpermps (%rdi), %zmm1, %zmm0
1080 ; DISABLE-NEXT: vcvtdq2ps %zmm1, %zmm1
1081 ; DISABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0
1082 ; DISABLE-NEXT: retq
1083 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1084 %a0 = load <16 x float>, ptr %p0, align 64
1085 %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> %idx)
1086 %a1 = sitofp <16 x i32> %idx to <16 x float>
1087 %res = fadd <16 x float> %2, %a1
1088 ret <16 x float> %res
; permps_broadcast_512: 512-bit vpermps with an embedded broadcast source
; ((%rdi){1to16}). With +false-deps-perm the destination is zeroed (vpxor)
; before the permute; with -false-deps-perm it is not.
1091 define <16 x float> @permps_broadcast_512(ptr %p0, <16 x i32> %idx) {
1092 ; ENABLE-LABEL: permps_broadcast_512:
1094 ; ENABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1097 ; ENABLE-NEXT: #NO_APP
1098 ; ENABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
1099 ; ENABLE-NEXT: vpxor %xmm0, %xmm0, %xmm0
1100 ; ENABLE-NEXT: vpermps (%rdi){1to16}, %zmm1, %zmm0
1101 ; ENABLE-NEXT: vcvtdq2ps %zmm1, %zmm1
1102 ; ENABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0
1105 ; DISABLE-LABEL: permps_broadcast_512:
1107 ; DISABLE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1108 ; DISABLE-NEXT: #APP
1110 ; DISABLE-NEXT: #NO_APP
1111 ; DISABLE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
1112 ; DISABLE-NEXT: vpermps (%rdi){1to16}, %zmm1, %zmm0
1113 ; DISABLE-NEXT: vcvtdq2ps %zmm1, %zmm1
1114 ; DISABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0
1115 ; DISABLE-NEXT: retq
1116 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1117 %v0 = load float, ptr %p0, align 4
1118 %t0 = insertelement <16 x float> undef, float %v0, i32 0
1119 %a0 = shufflevector <16 x float> %t0, <16 x float> undef, <16 x i32> zeroinitializer
1120 %2 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %a0, <16 x i32> %idx)
1121 %a1 = sitofp <16 x i32> %idx to <16 x float>
1122 %res = fadd <16 x float> %2, %a1
1123 ret <16 x float> %res
; permps_maskz_512: zero-masked 512-bit vpermps ({%k1} {z}) with the 16-bit
; mask loaded via kmovw. The +false-deps-perm run zeroes the destination
; (vpxor) before the masked permute; the -false-deps-perm run does not.
1126 define <16 x float> @permps_maskz_512(<16 x float> %a0, <16 x i32> %idx, ptr %mask) {
1127 ; ENABLE-LABEL: permps_maskz_512:
1131 ; ENABLE-NEXT: #NO_APP
1132 ; ENABLE-NEXT: kmovw (%rdi), %k1
1133 ; ENABLE-NEXT: vpxor %xmm2, %xmm2, %xmm2
1134 ; ENABLE-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1} {z}
1135 ; ENABLE-NEXT: vcvtdq2ps %zmm1, %zmm1
1136 ; ENABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0
1137 ; ENABLE-NEXT: vaddps %zmm0, %zmm2, %zmm0
1140 ; DISABLE-LABEL: permps_maskz_512:
1142 ; DISABLE-NEXT: #APP
1144 ; DISABLE-NEXT: #NO_APP
1145 ; DISABLE-NEXT: kmovw (%rdi), %k1
1146 ; DISABLE-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1} {z}
1147 ; DISABLE-NEXT: vcvtdq2ps %zmm1, %zmm1
1148 ; DISABLE-NEXT: vaddps %zmm1, %zmm0, %zmm0
1149 ; DISABLE-NEXT: vaddps %zmm0, %zmm2, %zmm0
1150 ; DISABLE-NEXT: retq
1151 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1152 %2 = load i16, ptr %mask
1153 %3 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %a0, <16 x i32> %idx, <16 x float> zeroinitializer, i16 %2)
1154 %a1 = sitofp <16 x i32> %idx to <16 x float>
1155 %t = fadd <16 x float> %a0, %a1
1156 %res = fadd <16 x float> %3, %t
1157 ret <16 x float> %res
1160 declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>)
1161 declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16)