; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mcpu=alderlake -mattr=+false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE,ENABLE-ADL
; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE,ENABLE-SPR
; RUN: llc -verify-machineinstrs -mcpu=alderlake -mattr=-false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE,DISABLE-ADL
; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE,DISABLE-SPR
; vpermd: with +false-deps-perm (ENABLE) a zeroing vxorps is inserted to break
; the false dependency on the destination register; with -false-deps-perm
; (DISABLE) it is not. The inline asm clobbers all xmm regs to force the
; permute's destination to be reused.
define <8 x i32> @permd(<8 x i32> %a0, <8 x i32> %a1) {
; ENABLE-ADL-LABEL: permd:
; ENABLE-ADL: # %bb.0:
; ENABLE-ADL-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-ADL-NEXT: #APP
; ENABLE-ADL-NEXT: nop
; ENABLE-ADL-NEXT: #NO_APP
; ENABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; ENABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; ENABLE-ADL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; ENABLE-ADL-NEXT: vpermd %ymm2, %ymm1, %ymm0
; ENABLE-ADL-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; ENABLE-ADL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; ENABLE-ADL-NEXT: retq
;
; ENABLE-SPR-LABEL: permd:
; ENABLE-SPR: # %bb.0:
; ENABLE-SPR-NEXT: vmovdqa64 %ymm1, %ymm16
; ENABLE-SPR-NEXT: vmovdqa64 %ymm0, %ymm17
; ENABLE-SPR-NEXT: #APP
; ENABLE-SPR-NEXT: nop
; ENABLE-SPR-NEXT: #NO_APP
; ENABLE-SPR-NEXT: vxorps %xmm0, %xmm0, %xmm0
; ENABLE-SPR-NEXT: vpermd %ymm17, %ymm16, %ymm0
; ENABLE-SPR-NEXT: vpaddd %ymm16, %ymm17, %ymm1
; ENABLE-SPR-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; ENABLE-SPR-NEXT: retq
;
; DISABLE-ADL-LABEL: permd:
; DISABLE-ADL: # %bb.0:
; DISABLE-ADL-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-ADL-NEXT: #APP
; DISABLE-ADL-NEXT: nop
; DISABLE-ADL-NEXT: #NO_APP
; DISABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; DISABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; DISABLE-ADL-NEXT: vpermd %ymm2, %ymm1, %ymm0
; DISABLE-ADL-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; DISABLE-ADL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; DISABLE-ADL-NEXT: retq
;
; DISABLE-SPR-LABEL: permd:
; DISABLE-SPR: # %bb.0:
; DISABLE-SPR-NEXT: vmovdqa64 %ymm1, %ymm16
; DISABLE-SPR-NEXT: vmovdqa64 %ymm0, %ymm17
; DISABLE-SPR-NEXT: #APP
; DISABLE-SPR-NEXT: nop
; DISABLE-SPR-NEXT: #NO_APP
; DISABLE-SPR-NEXT: vpermd %ymm17, %ymm16, %ymm0
; DISABLE-SPR-NEXT: vpaddd %ymm16, %ymm17, %ymm1
; DISABLE-SPR-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; DISABLE-SPR-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1)
  %3 = add <8 x i32> %a0, %a1
  %res = add <8 x i32> %2, %3
  ret <8 x i32> %res
}
; vpermd with a memory operand: the false dependency is only on the index and
; destination registers, so ENABLE still inserts the dependency-breaking vxorps.
define <8 x i32> @permd_mem(ptr %p0, <8 x i32> %a1) {
; ENABLE-ADL-LABEL: permd_mem:
; ENABLE-ADL: # %bb.0:
; ENABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-ADL-NEXT: #APP
; ENABLE-ADL-NEXT: nop
; ENABLE-ADL-NEXT: #NO_APP
; ENABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; ENABLE-ADL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; ENABLE-ADL-NEXT: vpermd (%rdi), %ymm1, %ymm0
; ENABLE-ADL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; ENABLE-ADL-NEXT: retq
;
; ENABLE-SPR-LABEL: permd_mem:
; ENABLE-SPR: # %bb.0:
; ENABLE-SPR-NEXT: vmovdqa64 %ymm0, %ymm16
; ENABLE-SPR-NEXT: #APP
; ENABLE-SPR-NEXT: nop
; ENABLE-SPR-NEXT: #NO_APP
; ENABLE-SPR-NEXT: vxorps %xmm0, %xmm0, %xmm0
; ENABLE-SPR-NEXT: vpermd (%rdi), %ymm16, %ymm0
; ENABLE-SPR-NEXT: vpaddd %ymm16, %ymm0, %ymm0
; ENABLE-SPR-NEXT: retq
;
; DISABLE-ADL-LABEL: permd_mem:
; DISABLE-ADL: # %bb.0:
; DISABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-ADL-NEXT: #APP
; DISABLE-ADL-NEXT: nop
; DISABLE-ADL-NEXT: #NO_APP
; DISABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; DISABLE-ADL-NEXT: vpermd (%rdi), %ymm1, %ymm0
; DISABLE-ADL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; DISABLE-ADL-NEXT: retq
;
; DISABLE-SPR-LABEL: permd_mem:
; DISABLE-SPR: # %bb.0:
; DISABLE-SPR-NEXT: vmovdqa64 %ymm0, %ymm16
; DISABLE-SPR-NEXT: #APP
; DISABLE-SPR-NEXT: nop
; DISABLE-SPR-NEXT: #NO_APP
; DISABLE-SPR-NEXT: vpermd (%rdi), %ymm16, %ymm0
; DISABLE-SPR-NEXT: vpaddd %ymm16, %ymm0, %ymm0
; DISABLE-SPR-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %a0 = load <8 x i32>, ptr %p0, align 64
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1)
  %res = add <8 x i32> %2, %a1
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
; Immediate-form vpermq: ENABLE breaks the destination false dependency with a
; vxorps; DISABLE does not. Missing autogenerated check lines (# %bb.0:, #APP,
; nop, retq) restored to keep the CHECK-NEXT chains intact.
define <4 x i64> @permq(<4 x i64> %a0) {
; ENABLE-LABEL: permq:
; ENABLE: # %bb.0:
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,2,1,0]
; ENABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: permq:
; DISABLE: # %bb.0:
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,2,1,0]
; DISABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  %res = add <4 x i64> %2, %a0
  ret <4 x i64> %res
}
; vpermq from memory (selected as vpermpd here). Missing autogenerated check
; lines and the function's ret/closing brace restored.
define <4 x i64> @permq_mem(ptr %p0) {
; ENABLE-LABEL: permq_mem:
; ENABLE: # %bb.0:
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
; ENABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[1,2,1,0]
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: permq_mem:
; DISABLE: # %bb.0:
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[1,2,1,0]
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %a0 = load <4 x i64>, ptr %p0, align 64
  %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  ret <4 x i64> %2
}
; vpermps with a register index produced by inline asm: ENABLE inserts the
; dependency-breaking vxorps before the permute; DISABLE does not.
define <8 x float> @permps(<8 x float> %a0, <8 x i32> %a1) {
; ENABLE-ADL-LABEL: permps:
; ENABLE-ADL: # %bb.0:
; ENABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-ADL-NEXT: #APP
; ENABLE-ADL-NEXT: nop
; ENABLE-ADL-NEXT: #NO_APP
; ENABLE-ADL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; ENABLE-ADL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ENABLE-ADL-NEXT: vpermps %ymm2, %ymm0, %ymm1
; ENABLE-ADL-NEXT: vcvtdq2ps %ymm0, %ymm0
; ENABLE-ADL-NEXT: vaddps %ymm2, %ymm0, %ymm0
; ENABLE-ADL-NEXT: vaddps %ymm0, %ymm1, %ymm0
; ENABLE-ADL-NEXT: retq
;
; ENABLE-SPR-LABEL: permps:
; ENABLE-SPR: # %bb.0:
; ENABLE-SPR-NEXT: vmovaps %ymm0, %ymm16
; ENABLE-SPR-NEXT: #APP
; ENABLE-SPR-NEXT: nop
; ENABLE-SPR-NEXT: #NO_APP
; ENABLE-SPR-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ENABLE-SPR-NEXT: vpermps %ymm16, %ymm0, %ymm1
; ENABLE-SPR-NEXT: vcvtdq2ps %ymm0, %ymm0
; ENABLE-SPR-NEXT: vaddps %ymm16, %ymm0, %ymm0
; ENABLE-SPR-NEXT: vaddps %ymm0, %ymm1, %ymm0
; ENABLE-SPR-NEXT: retq
;
; DISABLE-ADL-LABEL: permps:
; DISABLE-ADL: # %bb.0:
; DISABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-ADL-NEXT: #APP
; DISABLE-ADL-NEXT: nop
; DISABLE-ADL-NEXT: #NO_APP
; DISABLE-ADL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; DISABLE-ADL-NEXT: vpermps %ymm2, %ymm0, %ymm1
; DISABLE-ADL-NEXT: vcvtdq2ps %ymm0, %ymm0
; DISABLE-ADL-NEXT: vaddps %ymm2, %ymm0, %ymm0
; DISABLE-ADL-NEXT: vaddps %ymm0, %ymm1, %ymm0
; DISABLE-ADL-NEXT: retq
;
; DISABLE-SPR-LABEL: permps:
; DISABLE-SPR: # %bb.0:
; DISABLE-SPR-NEXT: vmovaps %ymm0, %ymm16
; DISABLE-SPR-NEXT: #APP
; DISABLE-SPR-NEXT: nop
; DISABLE-SPR-NEXT: #NO_APP
; DISABLE-SPR-NEXT: vpermps %ymm16, %ymm0, %ymm1
; DISABLE-SPR-NEXT: vcvtdq2ps %ymm0, %ymm0
; DISABLE-SPR-NEXT: vaddps %ymm16, %ymm0, %ymm0
; DISABLE-SPR-NEXT: vaddps %ymm0, %ymm1, %ymm0
; DISABLE-SPR-NEXT: retq
  %1 = tail call <8 x i32> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %1)
  %t = sitofp <8 x i32> %1 to <8 x float>
  %3 = fadd <8 x float> %t, %a0
  %res = fadd <8 x float> %2, %3
  ret <8 x float> %res
}
; vpermps from memory. Missing autogenerated check lines (# %bb.0:, #APP, nop,
; retq) and the function's ret/closing brace restored.
define <8 x float> @permps_mem(ptr %p0, <8 x i32> %a1) {
; ENABLE-LABEL: permps_mem:
; ENABLE: # %bb.0:
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT: vpermps (%rdi), %ymm0, %ymm1
; ENABLE-NEXT: vcvtdq2ps %ymm0, %ymm0
; ENABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: permps_mem:
; DISABLE: # %bb.0:
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vpermps (%rdi), %ymm0, %ymm1
; DISABLE-NEXT: vcvtdq2ps %ymm0, %ymm0
; DISABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %a0 = load <8 x float>, ptr %p0, align 64
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1)
  %t = sitofp <8 x i32> %a1 to <8 x float>
  %res = fadd <8 x float> %2, %t
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
; Immediate-form vpermpd: ENABLE breaks the destination false dependency with
; a vxorps; DISABLE does not. Missing autogenerated check lines and the
; closing brace restored.
define <4 x double> @permpd(<4 x double> %a0) {
; ENABLE-LABEL: permpd:
; ENABLE: # %bb.0:
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,2,1,0]
; ENABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: permpd:
; DISABLE: # %bb.0:
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,2,1,0]
; DISABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  %res = fadd <4 x double> %2, %a0
  ret <4 x double> %res
}
; vpermpd from memory. Missing autogenerated check lines and the function's
; ret/closing brace restored.
define <4 x double> @permpd_mem(ptr %p0) {
; ENABLE-LABEL: permpd_mem:
; ENABLE: # %bb.0:
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
; ENABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[1,2,1,0]
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: permpd_mem:
; DISABLE: # %bb.0:
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[1,2,1,0]
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %a0 = load <4 x double>, ptr %p0, align 64
  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  ret <4 x double> %2
}