1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86
3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64
; Demanded-elements test for @llvm.x86.sse3.hadd.ps (128-bit, float).
; The second operand is a splat of element 0 of %a1; only element 0 of the
; result is extracted and stored to %a2.
; The X86/X64 checks expect a single vhaddps that reads only %xmm0, followed
; by a vmovss store; no instruction for the splat appears in the checks.
; NOTE(review): presumably element 0 of the result depends only on the first
; operand, so the splat of %a1 is dead - confirm against the intrinsic docs.
4 define void @test_demanded_haddps_128(<4 x float> %a0, <4 x float> %a1, ptr%a2) nounwind {
10 ; X86-LABEL: test_demanded_haddps_128:
12 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
13 ; X86-NEXT: vhaddps %xmm0, %xmm0, %xmm0
14 ; X86-NEXT: vmovss %xmm0, (%eax)
17 ; X64-LABEL: test_demanded_haddps_128:
19 ; X64-NEXT: vhaddps %xmm0, %xmm0, %xmm0
20 ; X64-NEXT: vmovss %xmm0, (%rdi)
22 %1 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
23 %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %1)
24 %3 = extractelement <4 x float> %2, i32 0
25 store float %3, ptr%a2
; Demanded-elements test for @llvm.x86.sse3.hsub.ps (128-bit, float).
; The first operand is a splat of element 0 of %a0; only element 2 of the
; result is extracted and stored to %a2.
; The X86/X64 checks expect vhsubps on %xmm1 and %xmm0 followed by a
; vextractps $2 store; no instruction for the splat appears in the checks.
; NOTE(review): presumably element 2 of the result depends only on the second
; operand, so the splat of %a0 is dead - confirm against the intrinsic docs.
29 define void @test_demanded_hsubps_128(<4 x float> %a0, <4 x float> %a1, ptr%a2) nounwind {
30 ; X86-LABEL: test_demanded_hsubps_128:
32 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
33 ; X86-NEXT: vhsubps %xmm1, %xmm0, %xmm0
34 ; X86-NEXT: vextractps $2, %xmm0, (%eax)
37 ; X64-LABEL: test_demanded_hsubps_128:
39 ; X64-NEXT: vhsubps %xmm1, %xmm0, %xmm0
40 ; X64-NEXT: vextractps $2, %xmm0, (%rdi)
42 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
43 %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %1, <4 x float> %a1)
44 %3 = extractelement <4 x float> %2, i32 2
45 store float %3, ptr%a2
; Demanded-elements test for @llvm.x86.sse3.hadd.pd (128-bit, double).
; The second operand is a splat of element 0 of %a1; only element 0 of the
; result is extracted and stored to %a2.
; The X86/X64 checks expect a single vhaddpd that reads only %xmm0, followed
; by a vmovlpd store; no instruction for the splat appears in the checks.
; NOTE(review): presumably element 0 of the result depends only on the first
; operand - confirm against the intrinsic docs.
49 define void @test_demanded_haddpd_128(<2 x double> %a0, <2 x double> %a1, ptr%a2) nounwind {
50 ; X86-LABEL: test_demanded_haddpd_128:
52 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
53 ; X86-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
54 ; X86-NEXT: vmovlpd %xmm0, (%eax)
57 ; X64-LABEL: test_demanded_haddpd_128:
59 ; X64-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
60 ; X64-NEXT: vmovlpd %xmm0, (%rdi)
62 %1 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
63 %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %1)
64 %3 = extractelement <2 x double> %2, i32 0
65 store double %3, ptr%a2
; Demanded-elements test for @llvm.x86.sse3.hsub.pd (128-bit, double).
; The second operand is a splat of element 0 of %a1; only element 0 of the
; result is extracted and stored to %a2.
; The X86/X64 checks expect a single vhsubpd that reads only %xmm0, followed
; by a vmovlpd store; no instruction for the splat appears in the checks.
; NOTE(review): presumably element 0 of the result depends only on the first
; operand - confirm against the intrinsic docs.
69 define void @test_demanded_hsubpd_128(<2 x double> %a0, <2 x double> %a1, ptr%a2) nounwind {
70 ; X86-LABEL: test_demanded_hsubpd_128:
72 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
73 ; X86-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
74 ; X86-NEXT: vmovlpd %xmm0, (%eax)
77 ; X64-LABEL: test_demanded_hsubpd_128:
79 ; X64-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
80 ; X64-NEXT: vmovlpd %xmm0, (%rdi)
82 %1 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
83 %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %1)
84 %3 = extractelement <2 x double> %2, i32 0
85 store double %3, ptr%a2
; Demanded-elements test for @llvm.x86.ssse3.phadd.d.128 (128-bit, i32).
; The second operand is a splat of element 0 of %a1; only element 0 of the
; result is extracted.
; The X86/X64 checks expect a single vphaddd that reads only %xmm0, followed
; by a vmovd to memory; no instruction for the splat appears in the checks.
; NOTE(review): presumably element 0 of the result depends only on the first
; operand - confirm against the intrinsic docs.
89 define void @test_demanded_phaddd_128(<4 x i32> %a0, <4 x i32> %a1, ptr%a2) nounwind {
90 ; X86-LABEL: test_demanded_phaddd_128:
92 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
93 ; X86-NEXT: vphaddd %xmm0, %xmm0, %xmm0
94 ; X86-NEXT: vmovd %xmm0, (%eax)
97 ; X64-LABEL: test_demanded_phaddd_128:
99 ; X64-NEXT: vphaddd %xmm0, %xmm0, %xmm0
100 ; X64-NEXT: vmovd %xmm0, (%rdi)
102 %1 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> zeroinitializer
103 %2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %1)
104 %3 = extractelement <4 x i32> %2, i32 0
; Demanded-elements test for @llvm.x86.ssse3.phsub.d.128 (128-bit, i32).
; The second operand is a splat of element 0 of %a1; only element 1 of the
; result is extracted.
; The X86/X64 checks expect a single vphsubd that reads only %xmm0, followed
; by a vpextrd $1 to memory; no instruction for the splat appears in the checks.
; NOTE(review): presumably element 1 of the result depends only on the first
; operand - confirm against the intrinsic docs.
109 define void @test_demanded_phsubd_128(<4 x i32> %a0, <4 x i32> %a1, ptr%a2) nounwind {
110 ; X86-LABEL: test_demanded_phsubd_128:
112 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
113 ; X86-NEXT: vphsubd %xmm0, %xmm0, %xmm0
114 ; X86-NEXT: vpextrd $1, %xmm0, (%eax)
117 ; X64-LABEL: test_demanded_phsubd_128:
119 ; X64-NEXT: vphsubd %xmm0, %xmm0, %xmm0
120 ; X64-NEXT: vpextrd $1, %xmm0, (%rdi)
122 %1 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> zeroinitializer
123 %2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %1)
124 %3 = extractelement <4 x i32> %2, i32 1
; Demanded-elements test for @llvm.x86.ssse3.phadd.w.128 (128-bit, i16).
; The second operand is a splat of element 0 of %a1; only element 0 of the
; result is extracted (the extract index is written with type i16 here).
; The X86/X64 checks expect a single vphaddw that reads only %xmm0, followed
; by a vpextrw $0 to memory; no instruction for the splat appears in the checks.
; NOTE(review): presumably element 0 of the result depends only on the first
; operand - confirm against the intrinsic docs.
129 define void @test_demanded_phaddw_128(<8 x i16> %a0, <8 x i16> %a1, ptr%a2) nounwind {
130 ; X86-LABEL: test_demanded_phaddw_128:
132 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
133 ; X86-NEXT: vphaddw %xmm0, %xmm0, %xmm0
134 ; X86-NEXT: vpextrw $0, %xmm0, (%eax)
137 ; X64-LABEL: test_demanded_phaddw_128:
139 ; X64-NEXT: vphaddw %xmm0, %xmm0, %xmm0
140 ; X64-NEXT: vpextrw $0, %xmm0, (%rdi)
142 %1 = shufflevector <8 x i16> %a1, <8 x i16> undef, <8 x i32> zeroinitializer
143 %2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %1)
144 %3 = extractelement <8 x i16> %2, i16 0
; Demanded-elements test for @llvm.x86.ssse3.phsub.w.128 (128-bit, i16).
; The second operand is a splat of element 0 of %a1; only element 2 of the
; result is extracted (the extract index is written with type i16 here).
; The X86/X64 checks expect a single vphsubw that reads only %xmm0, followed
; by a vpextrw $2 to memory; no instruction for the splat appears in the checks.
; NOTE(review): presumably element 2 of the result depends only on the first
; operand - confirm against the intrinsic docs.
149 define void @test_demanded_phsubw_128(<8 x i16> %a0, <8 x i16> %a1, ptr%a2) nounwind {
150 ; X86-LABEL: test_demanded_phsubw_128:
152 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
153 ; X86-NEXT: vphsubw %xmm0, %xmm0, %xmm0
154 ; X86-NEXT: vpextrw $2, %xmm0, (%eax)
157 ; X64-LABEL: test_demanded_phsubw_128:
159 ; X64-NEXT: vphsubw %xmm0, %xmm0, %xmm0
160 ; X64-NEXT: vpextrw $2, %xmm0, (%rdi)
162 %1 = shufflevector <8 x i16> %a1, <8 x i16> undef, <8 x i32> zeroinitializer
163 %2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %1)
164 %3 = extractelement <8 x i16> %2, i16 2
; Demanded-elements test for @llvm.x86.avx.hadd.ps.256 (256-bit, float).
; The second operand is a splat of element 0 of %a1; only element 4 of the
; result is extracted and stored to %a2.
; The X86/X64 checks expect a vhaddps that reads only %ymm0, a vextractf128 $1
; to get the upper half, a vmovss store and a vzeroupper; no instruction for
; the splat appears in the checks.
; NOTE(review): presumably element 4 of the result depends only on the first
; operand - confirm against the intrinsic docs.
173 define void @test_demanded_haddps_256(<8 x float> %a0, <8 x float> %a1, ptr%a2) nounwind {
174 ; X86-LABEL: test_demanded_haddps_256:
176 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
177 ; X86-NEXT: vhaddps %ymm0, %ymm0, %ymm0
178 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
179 ; X86-NEXT: vmovss %xmm0, (%eax)
180 ; X86-NEXT: vzeroupper
183 ; X64-LABEL: test_demanded_haddps_256:
185 ; X64-NEXT: vhaddps %ymm0, %ymm0, %ymm0
186 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
187 ; X64-NEXT: vmovss %xmm0, (%rdi)
188 ; X64-NEXT: vzeroupper
190 %1 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> zeroinitializer
191 %2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %1)
192 %3 = extractelement <8 x float> %2, i32 4
193 store float %3, ptr%a2
; Demanded-elements test for @llvm.x86.avx.hsub.ps.256 (256-bit, float).
; The first operand is a splat of element 0 of %a0; only element 7 of the
; result is extracted and stored to %a2.
; The X86/X64 checks expect vhsubps on %ymm1 and %ymm0, a vextractf128 $1 to
; get the upper half, a vextractps $3 store and a vzeroupper; no instruction
; for the splat appears in the checks.
; NOTE(review): presumably element 7 of the result depends only on the second
; operand - confirm against the intrinsic docs.
197 define void @test_demanded_hsubps_256(<8 x float> %a0, <8 x float> %a1, ptr%a2) nounwind {
198 ; X86-LABEL: test_demanded_hsubps_256:
200 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
201 ; X86-NEXT: vhsubps %ymm1, %ymm0, %ymm0
202 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
203 ; X86-NEXT: vextractps $3, %xmm0, (%eax)
204 ; X86-NEXT: vzeroupper
207 ; X64-LABEL: test_demanded_hsubps_256:
209 ; X64-NEXT: vhsubps %ymm1, %ymm0, %ymm0
210 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
211 ; X64-NEXT: vextractps $3, %xmm0, (%rdi)
212 ; X64-NEXT: vzeroupper
214 %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
215 %2 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %1, <8 x float> %a1)
216 %3 = extractelement <8 x float> %2, i32 7
217 store float %3, ptr%a2
; Demanded-elements test for @llvm.x86.avx.hadd.pd.256 (256-bit, double).
; The second operand is a splat of element 0 of %a1; only element 2 of the
; result is extracted and stored to %a2.
; The X86/X64 checks expect a vhaddpd that reads only %ymm0, a vextractf128 $1
; to get the upper half, a vmovlpd store and a vzeroupper; no instruction for
; the splat appears in the checks.
; NOTE(review): presumably element 2 of the result depends only on the first
; operand - confirm against the intrinsic docs.
221 define void @test_demanded_haddpd_256(<4 x double> %a0, <4 x double> %a1, ptr%a2) nounwind {
222 ; X86-LABEL: test_demanded_haddpd_256:
224 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
225 ; X86-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
226 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
227 ; X86-NEXT: vmovlpd %xmm0, (%eax)
228 ; X86-NEXT: vzeroupper
231 ; X64-LABEL: test_demanded_haddpd_256:
233 ; X64-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
234 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
235 ; X64-NEXT: vmovlpd %xmm0, (%rdi)
236 ; X64-NEXT: vzeroupper
238 %1 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> zeroinitializer
239 %2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %1)
240 %3 = extractelement <4 x double> %2, i32 2
241 store double %3, ptr%a2
; Demanded-elements test for @llvm.x86.avx.hsub.pd.256 (256-bit, double).
; The second operand is a splat of element 0 of %a1; only element 2 of the
; result is extracted and stored to %a2.
; The X86/X64 checks expect a vhsubpd that reads only %ymm0, a vextractf128 $1
; to get the upper half, a vmovlpd store and a vzeroupper; no instruction for
; the splat appears in the checks.
; NOTE(review): presumably element 2 of the result depends only on the first
; operand - confirm against the intrinsic docs.
245 define void @test_demanded_hsubpd_256(<4 x double> %a0, <4 x double> %a1, ptr%a2) nounwind {
246 ; X86-LABEL: test_demanded_hsubpd_256:
248 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
249 ; X86-NEXT: vhsubpd %ymm0, %ymm0, %ymm0
250 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
251 ; X86-NEXT: vmovlpd %xmm0, (%eax)
252 ; X86-NEXT: vzeroupper
255 ; X64-LABEL: test_demanded_hsubpd_256:
257 ; X64-NEXT: vhsubpd %ymm0, %ymm0, %ymm0
258 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
259 ; X64-NEXT: vmovlpd %xmm0, (%rdi)
260 ; X64-NEXT: vzeroupper
262 %1 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> zeroinitializer
263 %2 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %1)
264 %3 = extractelement <4 x double> %2, i32 2
265 store double %3, ptr%a2
; Demanded-elements test for @llvm.x86.avx2.phadd.d (256-bit, i32).
; The first operand is a splat of element 0 of %a0; only element 7 of the
; result is extracted.
; The X86/X64 checks expect vphaddd on %ymm1 and %ymm0, a vextracti128 $1 to
; get the upper half, a vpextrd $3 to memory and a vzeroupper; no instruction
; for the splat appears in the checks.
; NOTE(review): presumably element 7 of the result depends only on the second
; operand - confirm against the intrinsic docs.
269 define void @test_demanded_phaddd_256(<8 x i32> %a0, <8 x i32> %a1, ptr%a2) nounwind {
270 ; X86-LABEL: test_demanded_phaddd_256:
272 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
273 ; X86-NEXT: vphaddd %ymm1, %ymm0, %ymm0
274 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm0
275 ; X86-NEXT: vpextrd $3, %xmm0, (%eax)
276 ; X86-NEXT: vzeroupper
279 ; X64-LABEL: test_demanded_phaddd_256:
281 ; X64-NEXT: vphaddd %ymm1, %ymm0, %ymm0
282 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm0
283 ; X64-NEXT: vpextrd $3, %xmm0, (%rdi)
284 ; X64-NEXT: vzeroupper
286 %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> zeroinitializer
287 %2 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %1, <8 x i32> %a1)
288 %3 = extractelement <8 x i32> %2, i32 7
; Demanded-elements test for @llvm.x86.avx2.phsub.d (256-bit, i32).
; The second operand is a splat of element 0 of %a1; only element 5 of the
; result is extracted.
; The X86/X64 checks expect a vphsubd that reads only %ymm0, a vextracti128 $1
; to get the upper half, a vpextrd $1 to memory and a vzeroupper; no
; instruction for the splat appears in the checks.
; NOTE(review): presumably element 5 of the result depends only on the first
; operand - confirm against the intrinsic docs.
293 define void @test_demanded_phsubd_256(<8 x i32> %a0, <8 x i32> %a1, ptr%a2) nounwind {
294 ; X86-LABEL: test_demanded_phsubd_256:
296 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
297 ; X86-NEXT: vphsubd %ymm0, %ymm0, %ymm0
298 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm0
299 ; X86-NEXT: vpextrd $1, %xmm0, (%eax)
300 ; X86-NEXT: vzeroupper
303 ; X64-LABEL: test_demanded_phsubd_256:
305 ; X64-NEXT: vphsubd %ymm0, %ymm0, %ymm0
306 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm0
307 ; X64-NEXT: vpextrd $1, %xmm0, (%rdi)
308 ; X64-NEXT: vzeroupper
310 %1 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> zeroinitializer
311 %2 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %1)
312 %3 = extractelement <8 x i32> %2, i32 5
; Demanded-elements test for @llvm.x86.avx2.phadd.w (256-bit, i16).
; The second operand is a splat of element 0 of %a1; only element 4 of the
; result is extracted.
; Unlike the sibling tests, the checks keep the splat: they expect a
; vpbroadcastw of %xmm1, then a 128-bit (xmm) vphaddw on the broadcast result,
; a vpextrw $4 to memory and a vzeroupper.
; NOTE(review): presumably element 4 of the result is computed from the second
; (splatted) operand, so the broadcast is live - confirm against the intrinsic
; docs.
317 define void @test_demanded_phaddw_256(<16 x i16> %a0, <16 x i16> %a1, ptr%a2) nounwind {
318 ; X86-LABEL: test_demanded_phaddw_256:
320 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
321 ; X86-NEXT: vpbroadcastw %xmm1, %xmm0
322 ; X86-NEXT: vphaddw %xmm0, %xmm0, %xmm0
323 ; X86-NEXT: vpextrw $4, %xmm0, (%eax)
324 ; X86-NEXT: vzeroupper
327 ; X64-LABEL: test_demanded_phaddw_256:
329 ; X64-NEXT: vpbroadcastw %xmm1, %xmm0
330 ; X64-NEXT: vphaddw %xmm0, %xmm0, %xmm0
331 ; X64-NEXT: vpextrw $4, %xmm0, (%rdi)
332 ; X64-NEXT: vzeroupper
334 %1 = shufflevector <16 x i16> %a1, <16 x i16> undef, <16 x i32> zeroinitializer
335 %2 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %1)
336 %3 = extractelement <16 x i16> %2, i32 4
; Demanded-elements test for @llvm.x86.avx2.phsub.w (256-bit, i16).
; The first operand is a splat of element 0 of %a0; only element 6 of the
; result is extracted.
; The X86/X64 checks expect a 128-bit (xmm) vphsubw on %xmm1 and %xmm0, a
; vpextrw $6 to memory and a vzeroupper; no instruction for the splat and no
; 128-bit lane extract appear in the checks.
; NOTE(review): presumably element 6 of the result depends only on the low
; half of the second operand - confirm against the intrinsic docs.
341 define void @test_demanded_phsubw_256(<16 x i16> %a0, <16 x i16> %a1, ptr%a2) nounwind {
342 ; X86-LABEL: test_demanded_phsubw_256:
344 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
345 ; X86-NEXT: vphsubw %xmm1, %xmm0, %xmm0
346 ; X86-NEXT: vpextrw $6, %xmm0, (%eax)
347 ; X86-NEXT: vzeroupper
350 ; X64-LABEL: test_demanded_phsubw_256:
352 ; X64-NEXT: vphsubw %xmm1, %xmm0, %xmm0
353 ; X64-NEXT: vpextrw $6, %xmm0, (%rdi)
354 ; X64-NEXT: vzeroupper
356 %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> zeroinitializer
357 %2 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %1, <16 x i16> %a1)
358 %3 = extractelement <16 x i16> %2, i32 6
; Declarations of the horizontal add/sub intrinsics called by the tests above.
; Each takes two vectors of the same type and returns a vector of that type.

; 128-bit floating-point variants (llvm.x86.sse3.*).
363 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
364 declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>)
365 declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>)
366 declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>)
; 128-bit integer variants (llvm.x86.ssse3.*).
368 declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>)
369 declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>)
370 declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>)
371 declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>)
; 256-bit floating-point variants (llvm.x86.avx.*).
373 declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>)
374 declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>)
375 declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>)
376 declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>)
; 256-bit integer variants (llvm.x86.avx2.*).
378 declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>)
379 declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>)
380 declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>)
381 declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>)