1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -O2 -mattr=avx512f -mtriple=x86_64-unknown | FileCheck %s --check-prefix=CHECK64
3 ; RUN: llc < %s -O2 -mattr=avx512f -mtriple=i386-unknown | FileCheck %s --check-prefix=CHECK32
4 ; RUN: llc < %s -O2 -mattr=avx512vl -mtriple=x86_64-unknown | FileCheck %s --check-prefix=CHECK64
5 ; RUN: llc < %s -O2 -mattr=avx512vl -mtriple=i386-unknown | FileCheck %s --check-prefix=CHECK32
; Merge-masked scalar move: bit 0 of %__U selects element 0 of %__B over %__W,
; and the result is inserted into %__A; expected to fold to one vmovss {%k1}.
; NOTE(review): the line defining %0 (the mask bit extracted from %__U) is
; elided from this view — confirm against the full file.
7 define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) local_unnamed_addr #0 {
8 ; CHECK64-LABEL: test_mm_mask_move_ss:
9 ; CHECK64: # %bb.0: # %entry
10 ; CHECK64-NEXT: kmovw %edi, %k1
11 ; CHECK64-NEXT: vmovss %xmm2, %xmm1, %xmm0 {%k1}
14 ; CHECK32-LABEL: test_mm_mask_move_ss:
15 ; CHECK32: # %bb.0: # %entry
16 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
17 ; CHECK32-NEXT: kmovw %eax, %k1
18 ; CHECK32-NEXT: vmovss %xmm2, %xmm1, %xmm0 {%k1}
22 %tobool.i = icmp ne i8 %0, 0
23 %__B.elt.i = extractelement <4 x float> %__B, i32 0
24 %__W.elt.i = extractelement <4 x float> %__W, i32 0
25 %vecext1.i = select i1 %tobool.i, float %__B.elt.i, float %__W.elt.i
26 %vecins.i = insertelement <4 x float> %__A, float %vecext1.i, i32 0
27 ret <4 x float> %vecins.i
; Zero-masked scalar move: element 0 of %__B or 0.0 (per mask bit %0) inserted
; into %__A; expected to fold to one vmovss {%k1} {z}.
; NOTE(review): the defining line for %0 is elided from this view.
30 define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) local_unnamed_addr #0 {
31 ; CHECK64-LABEL: test_mm_maskz_move_ss:
32 ; CHECK64: # %bb.0: # %entry
33 ; CHECK64-NEXT: kmovw %edi, %k1
34 ; CHECK64-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
37 ; CHECK32-LABEL: test_mm_maskz_move_ss:
38 ; CHECK32: # %bb.0: # %entry
39 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
40 ; CHECK32-NEXT: kmovw %eax, %k1
41 ; CHECK32-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
45 %tobool.i = icmp ne i8 %0, 0
46 %vecext.i = extractelement <4 x float> %__B, i32 0
47 %cond.i = select i1 %tobool.i, float %vecext.i, float 0.000000e+00
48 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
49 ret <4 x float> %vecins.i
; Double-precision analogue of test_mm_mask_move_ss: merge-masked select of
; element 0 (%__B vs %__W) into %__A; expected to fold to one vmovsd {%k1}.
; NOTE(review): the defining line for %0 is elided from this view.
52 define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) local_unnamed_addr #0 {
53 ; CHECK64-LABEL: test_mm_mask_move_sd:
54 ; CHECK64: # %bb.0: # %entry
55 ; CHECK64-NEXT: kmovw %edi, %k1
56 ; CHECK64-NEXT: vmovsd %xmm2, %xmm1, %xmm0 {%k1}
59 ; CHECK32-LABEL: test_mm_mask_move_sd:
60 ; CHECK32: # %bb.0: # %entry
61 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
62 ; CHECK32-NEXT: kmovw %eax, %k1
63 ; CHECK32-NEXT: vmovsd %xmm2, %xmm1, %xmm0 {%k1}
67 %tobool.i = icmp ne i8 %0, 0
68 %__B.elt.i = extractelement <2 x double> %__B, i32 0
69 %__W.elt.i = extractelement <2 x double> %__W, i32 0
70 %vecext1.i = select i1 %tobool.i, double %__B.elt.i, double %__W.elt.i
71 %vecins.i = insertelement <2 x double> %__A, double %vecext1.i, i32 0
72 ret <2 x double> %vecins.i
; Double-precision analogue of test_mm_maskz_move_ss: zero-masked select of
; element 0 of %__B into %__A; expected to fold to one vmovsd {%k1} {z}.
; NOTE(review): the defining line for %0 is elided from this view.
75 define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) local_unnamed_addr #0 {
76 ; CHECK64-LABEL: test_mm_maskz_move_sd:
77 ; CHECK64: # %bb.0: # %entry
78 ; CHECK64-NEXT: kmovw %edi, %k1
79 ; CHECK64-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
82 ; CHECK32-LABEL: test_mm_maskz_move_sd:
83 ; CHECK32: # %bb.0: # %entry
84 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
85 ; CHECK32-NEXT: kmovw %eax, %k1
86 ; CHECK32-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
90 %tobool.i = icmp ne i8 %0, 0
91 %vecext.i = extractelement <2 x double> %__B, i32 0
92 %cond.i = select i1 %tobool.i, double %vecext.i, double 0.000000e+00
93 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
94 ret <2 x double> %vecins.i
; %__A widened to 16 lanes and stored via llvm.masked.store.v16f32 with the
; i8 mask zero-extended to i16; expected to shrink to a scalar vmovss store {%k1}.
; NOTE(review): the line defining %0 (derived from %__U) is elided from this view.
97 define void @test_mm_mask_store_ss(ptr %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #1 {
98 ; CHECK64-LABEL: test_mm_mask_store_ss:
99 ; CHECK64: # %bb.0: # %entry
100 ; CHECK64-NEXT: kmovw %esi, %k1
101 ; CHECK64-NEXT: vmovss %xmm0, (%rdi) {%k1}
104 ; CHECK32-LABEL: test_mm_mask_store_ss:
105 ; CHECK32: # %bb.0: # %entry
106 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
107 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
108 ; CHECK32-NEXT: kmovw %ecx, %k1
109 ; CHECK32-NEXT: vmovss %xmm0, (%eax) {%k1}
112 %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
114 %conv2.i = zext i8 %0 to i16
115 %1 = bitcast i16 %conv2.i to <16 x i1>
116 tail call void @llvm.masked.store.v16f32.p0(<16 x float> %shuffle.i.i, ptr %__W, i32 16, <16 x i1> %1) #5
; %__A widened to 8 lanes and stored via llvm.masked.store.v8f64 with the i8
; mask bitcast to <8 x i1>; expected to shrink to a scalar vmovsd store {%k1}.
; NOTE(review): the line defining %0 (derived from %__U) is elided from this view.
120 define void @test_mm_mask_store_sd(ptr %__W, i8 zeroext %__U, <2 x double> %__A) local_unnamed_addr #1 {
121 ; CHECK64-LABEL: test_mm_mask_store_sd:
122 ; CHECK64: # %bb.0: # %entry
123 ; CHECK64-NEXT: kmovw %esi, %k1
124 ; CHECK64-NEXT: vmovsd %xmm0, (%rdi) {%k1}
127 ; CHECK32-LABEL: test_mm_mask_store_sd:
128 ; CHECK32: # %bb.0: # %entry
129 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
130 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
131 ; CHECK32-NEXT: kmovw %ecx, %k1
132 ; CHECK32-NEXT: vmovsd %xmm0, (%eax) {%k1}
135 %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
137 %1 = bitcast i8 %0 to <8 x i1>
138 tail call void @llvm.masked.store.v8f64.p0(<8 x double> %shuffle.i.i, ptr %__W, i32 16, <8 x i1> %1) #5
; Merge-masked scalar load: passthru is %__A with lanes 1-3 zeroed, widened to
; 16 lanes for llvm.masked.load.v16f32, then the low 4 lanes are extracted;
; expected to shrink to a scalar vmovss load {%k1}.
; NOTE(review): the line defining %0 (derived from %__U) is elided from this view.
142 define <4 x float> @test_mm_mask_load_ss(<4 x float> %__A, i8 zeroext %__U, ptr %__W) local_unnamed_addr #2 {
143 ; CHECK64-LABEL: test_mm_mask_load_ss:
144 ; CHECK64: # %bb.0: # %entry
145 ; CHECK64-NEXT: kmovw %edi, %k1
146 ; CHECK64-NEXT: vmovss (%rsi), %xmm0 {%k1}
149 ; CHECK32-LABEL: test_mm_mask_load_ss:
150 ; CHECK32: # %bb.0: # %entry
151 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
152 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
153 ; CHECK32-NEXT: kmovw %ecx, %k1
154 ; CHECK32-NEXT: vmovss (%eax), %xmm0 {%k1}
157 %shuffle.i = shufflevector <4 x float> %__A, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
158 %shuffle.i.i = shufflevector <4 x float> %shuffle.i, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
160 %conv2.i = zext i8 %0 to i16
161 %1 = bitcast i16 %conv2.i to <16 x i1>
162 %2 = tail call <16 x float> @llvm.masked.load.v16f32.p0(ptr %__W, i32 16, <16 x i1> %1, <16 x float> %shuffle.i.i) #5
163 %shuffle4.i = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
164 ret <4 x float> %shuffle4.i
; Double-precision analogue of test_mm_mask_load_ss: passthru is %__A with lane
; 1 zeroed, widened to 8 lanes for llvm.masked.load.v8f64, low 2 lanes
; extracted; expected to shrink to a scalar vmovsd load {%k1}.
; NOTE(review): the line defining %0 (derived from %__U) is elided from this view.
167 define <2 x double> @test_mm_mask_load_sd(<2 x double> %__A, i8 zeroext %__U, ptr %__W) local_unnamed_addr #2 {
168 ; CHECK64-LABEL: test_mm_mask_load_sd:
169 ; CHECK64: # %bb.0: # %entry
170 ; CHECK64-NEXT: kmovw %edi, %k1
171 ; CHECK64-NEXT: vmovsd (%rsi), %xmm0 {%k1}
174 ; CHECK32-LABEL: test_mm_mask_load_sd:
175 ; CHECK32: # %bb.0: # %entry
176 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
177 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
178 ; CHECK32-NEXT: kmovw %ecx, %k1
179 ; CHECK32-NEXT: vmovsd (%eax), %xmm0 {%k1}
182 %shuffle5.i = insertelement <2 x double> %__A, double 0.000000e+00, i32 1
183 %shuffle.i.i = shufflevector <2 x double> %shuffle5.i, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
185 %1 = bitcast i8 %0 to <8 x i1>
186 %2 = tail call <8 x double> @llvm.masked.load.v8f64.p0(ptr %__W, i32 16, <8 x i1> %1, <8 x double> %shuffle.i.i) #5
187 %shuffle3.i = shufflevector <8 x double> %2, <8 x double> undef, <2 x i32> <i32 0, i32 1>
188 ret <2 x double> %shuffle3.i
; Zero-masked scalar load: llvm.masked.load.v16f32 with zeroinitializer
; passthru, low 4 lanes extracted; expected to shrink to vmovss load {%k1} {z}.
; NOTE(review): the line defining %0 (derived from %__U) is elided from this view.
191 define <4 x float> @test_mm_maskz_load_ss(i8 zeroext %__U, ptr %__W) local_unnamed_addr #2 {
192 ; CHECK64-LABEL: test_mm_maskz_load_ss:
193 ; CHECK64: # %bb.0: # %entry
194 ; CHECK64-NEXT: kmovw %edi, %k1
195 ; CHECK64-NEXT: vmovss (%rsi), %xmm0 {%k1} {z}
198 ; CHECK32-LABEL: test_mm_maskz_load_ss:
199 ; CHECK32: # %bb.0: # %entry
200 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
201 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
202 ; CHECK32-NEXT: kmovw %ecx, %k1
203 ; CHECK32-NEXT: vmovss (%eax), %xmm0 {%k1} {z}
207 %conv2.i = zext i8 %0 to i16
208 %1 = bitcast i16 %conv2.i to <16 x i1>
209 %2 = tail call <16 x float> @llvm.masked.load.v16f32.p0(ptr %__W, i32 16, <16 x i1> %1, <16 x float> zeroinitializer) #5
210 %shuffle.i = shufflevector <16 x float> %2, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
211 ret <4 x float> %shuffle.i
; Zero-masked scalar load (double): llvm.masked.load.v8f64 with zeroinitializer
; passthru, low 2 lanes extracted; expected to shrink to vmovsd load {%k1} {z}.
; NOTE(review): the line defining %0 (derived from %__U) is elided from this view.
214 define <2 x double> @test_mm_maskz_load_sd(i8 zeroext %__U, ptr %__W) local_unnamed_addr #2 {
215 ; CHECK64-LABEL: test_mm_maskz_load_sd:
216 ; CHECK64: # %bb.0: # %entry
217 ; CHECK64-NEXT: kmovw %edi, %k1
218 ; CHECK64-NEXT: vmovsd (%rsi), %xmm0 {%k1} {z}
221 ; CHECK32-LABEL: test_mm_maskz_load_sd:
222 ; CHECK32: # %bb.0: # %entry
223 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
224 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
225 ; CHECK32-NEXT: kmovw %ecx, %k1
226 ; CHECK32-NEXT: vmovsd (%eax), %xmm0 {%k1} {z}
230 %1 = bitcast i8 %0 to <8 x i1>
231 %2 = tail call <8 x double> @llvm.masked.load.v8f64.p0(ptr %__W, i32 16, <8 x i1> %1, <8 x double> zeroinitializer) #5
232 %shuffle.i = shufflevector <8 x double> %2, <8 x double> undef, <2 x i32> <i32 0, i32 1>
233 ret <2 x double> %shuffle.i
236 ; The tests below match clang's newer codegen that uses 128-bit masked load/stores.
; 128-bit variant (newer clang codegen): llvm.masked.store.v4f32 with a <4 x i1>
; mask extracted from the i8; expected to fold to a scalar vmovss store {%k1}.
; NOTE(review): the line defining %0 (derived from %__U) is elided from this view.
238 define void @test_mm_mask_store_ss_2(ptr %__P, i8 zeroext %__U, <4 x float> %__A) {
239 ; CHECK64-LABEL: test_mm_mask_store_ss_2:
240 ; CHECK64: # %bb.0: # %entry
241 ; CHECK64-NEXT: kmovw %esi, %k1
242 ; CHECK64-NEXT: vmovss %xmm0, (%rdi) {%k1}
245 ; CHECK32-LABEL: test_mm_mask_store_ss_2:
246 ; CHECK32: # %bb.0: # %entry
247 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
248 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
249 ; CHECK32-NEXT: kmovw %ecx, %k1
250 ; CHECK32-NEXT: vmovss %xmm0, (%eax) {%k1}
254 %1 = bitcast i8 %0 to <8 x i1>
255 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
256 tail call void @llvm.masked.store.v4f32.p0(<4 x float> %__A, ptr %__P, i32 1, <4 x i1> %extract.i)
; 128-bit variant: llvm.masked.store.v2f64 with a <2 x i1> mask extracted from
; the i8; expected to fold to a scalar vmovsd store {%k1}.
; NOTE(review): the line defining %0 (derived from %__U) is elided from this view.
260 define void @test_mm_mask_store_sd_2(ptr %__P, i8 zeroext %__U, <2 x double> %__A) {
261 ; CHECK64-LABEL: test_mm_mask_store_sd_2:
262 ; CHECK64: # %bb.0: # %entry
263 ; CHECK64-NEXT: kmovw %esi, %k1
264 ; CHECK64-NEXT: vmovsd %xmm0, (%rdi) {%k1}
267 ; CHECK32-LABEL: test_mm_mask_store_sd_2:
268 ; CHECK32: # %bb.0: # %entry
269 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
270 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
271 ; CHECK32-NEXT: kmovw %ecx, %k1
272 ; CHECK32-NEXT: vmovsd %xmm0, (%eax) {%k1}
276 %1 = bitcast i8 %0 to <8 x i1>
277 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
278 tail call void @llvm.masked.store.v2f64.p0(<2 x double> %__A, ptr %__P, i32 1, <2 x i1> %extract.i)
; 128-bit variant of test_mm_mask_load_ss: llvm.masked.load.v4f32 with passthru
; %__A (lanes 1-3 zeroed); expected to fold to a scalar vmovss load {%k1}.
; NOTE(review): the defining line for %0 and the ret are elided from this view.
282 define <4 x float> @test_mm_mask_load_ss_2(<4 x float> %__A, i8 zeroext %__U, ptr readonly %__W) {
283 ; CHECK64-LABEL: test_mm_mask_load_ss_2:
284 ; CHECK64: # %bb.0: # %entry
285 ; CHECK64-NEXT: kmovw %edi, %k1
286 ; CHECK64-NEXT: vmovss (%rsi), %xmm0 {%k1}
289 ; CHECK32-LABEL: test_mm_mask_load_ss_2:
290 ; CHECK32: # %bb.0: # %entry
291 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
292 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
293 ; CHECK32-NEXT: kmovw %ecx, %k1
294 ; CHECK32-NEXT: vmovss (%eax), %xmm0 {%k1}
297 %shuffle.i = shufflevector <4 x float> %__A, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
299 %1 = bitcast i8 %0 to <8 x i1>
300 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
301 %2 = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %__W, i32 1, <4 x i1> %extract.i, <4 x float> %shuffle.i)
; 128-bit variant of test_mm_maskz_load_ss: llvm.masked.load.v4f32 with
; zeroinitializer passthru; expected to fold to vmovss load {%k1} {z}.
; NOTE(review): the defining line for %0 and the ret are elided from this view.
305 define <4 x float> @test_mm_maskz_load_ss_2(i8 zeroext %__U, ptr readonly %__W) {
306 ; CHECK64-LABEL: test_mm_maskz_load_ss_2:
307 ; CHECK64: # %bb.0: # %entry
308 ; CHECK64-NEXT: kmovw %edi, %k1
309 ; CHECK64-NEXT: vmovss (%rsi), %xmm0 {%k1} {z}
312 ; CHECK32-LABEL: test_mm_maskz_load_ss_2:
313 ; CHECK32: # %bb.0: # %entry
314 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
315 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
316 ; CHECK32-NEXT: kmovw %ecx, %k1
317 ; CHECK32-NEXT: vmovss (%eax), %xmm0 {%k1} {z}
321 %1 = bitcast i8 %0 to <8 x i1>
322 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
323 %2 = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %__W, i32 1, <4 x i1> %extract.i, <4 x float> zeroinitializer)
; 128-bit variant of test_mm_mask_load_sd: llvm.masked.load.v2f64 with passthru
; %__A (lane 1 zeroed); expected to fold to a scalar vmovsd load {%k1}.
; NOTE(review): the defining line for %0 and the ret are elided from this view.
327 define <2 x double> @test_mm_mask_load_sd_2(<2 x double> %__A, i8 zeroext %__U, ptr readonly %__W) {
328 ; CHECK64-LABEL: test_mm_mask_load_sd_2:
329 ; CHECK64: # %bb.0: # %entry
330 ; CHECK64-NEXT: kmovw %edi, %k1
331 ; CHECK64-NEXT: vmovsd (%rsi), %xmm0 {%k1}
334 ; CHECK32-LABEL: test_mm_mask_load_sd_2:
335 ; CHECK32: # %bb.0: # %entry
336 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
337 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
338 ; CHECK32-NEXT: kmovw %ecx, %k1
339 ; CHECK32-NEXT: vmovsd (%eax), %xmm0 {%k1}
342 %shuffle3.i = insertelement <2 x double> %__A, double 0.000000e+00, i32 1
344 %1 = bitcast i8 %0 to <8 x i1>
345 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
346 %2 = tail call <2 x double> @llvm.masked.load.v2f64.p0(ptr %__W, i32 1, <2 x i1> %extract.i, <2 x double> %shuffle3.i)
; 128-bit variant of test_mm_maskz_load_sd: llvm.masked.load.v2f64 with
; zeroinitializer passthru; expected to fold to vmovsd load {%k1} {z}.
; NOTE(review): the defining line for %0 and the ret are elided from this view.
350 define <2 x double> @test_mm_maskz_load_sd_2(i8 zeroext %__U, ptr readonly %__W) {
351 ; CHECK64-LABEL: test_mm_maskz_load_sd_2:
352 ; CHECK64: # %bb.0: # %entry
353 ; CHECK64-NEXT: kmovw %edi, %k1
354 ; CHECK64-NEXT: vmovsd (%rsi), %xmm0 {%k1} {z}
357 ; CHECK32-LABEL: test_mm_maskz_load_sd_2:
358 ; CHECK32: # %bb.0: # %entry
359 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
360 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
361 ; CHECK32-NEXT: kmovw %ecx, %k1
362 ; CHECK32-NEXT: vmovsd (%eax), %xmm0 {%k1} {z}
366 %1 = bitcast i8 %0 to <8 x i1>
367 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
368 %2 = tail call <2 x double> @llvm.masked.load.v2f64.p0(ptr %__W, i32 1, <2 x i1> %extract.i, <2 x double> zeroinitializer)
; Declarations for the masked load/store intrinsics exercised above:
; 512-bit forms (v16f32/v8f64) for the widened tests, 128-bit forms
; (v4f32/v2f64) for the *_2 tests.
373 declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32, <16 x i1>) #3
375 declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr, i32, <8 x i1>) #3
377 declare <16 x float> @llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>) #4
379 declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>) #4
381 declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>)
383 declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>)
385 declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)
387 declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>)