1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -O2 -mattr=avx512f -mtriple=x86_64-unknown | FileCheck %s --check-prefix=CHECK64
3 ; RUN: llc < %s -O2 -mattr=avx512f -mtriple=i386-unknown | FileCheck %s --check-prefix=CHECK32
4 ; RUN: llc < %s -O2 -mattr=avx512vl -mtriple=x86_64-unknown | FileCheck %s --check-prefix=CHECK64
5 ; RUN: llc < %s -O2 -mattr=avx512vl -mtriple=i386-unknown | FileCheck %s --check-prefix=CHECK32
; Scalar masked move (float). Lane 0 of the result is a select between lane 0
; of %__B (condition true) and lane 0 of %__W (condition false); that scalar is
; inserted into lane 0 of %__A, so the remaining lanes come from %__A.
; Both the x86_64 and the i386 expectations include a kmovw into %k1 followed
; by a vmovss written under %k1 without {z}.
; NOTE(review): %0, the i8 compared against zero, is not defined on any line
; visible here -- presumably it is derived from %__U; confirm in the full file.
7 define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) local_unnamed_addr #0 {
8 ; CHECK64-LABEL: test_mm_mask_move_ss:
9 ; CHECK64: # %bb.0: # %entry
10 ; CHECK64-NEXT: kmovw %edi, %k1
11 ; CHECK64-NEXT: vmovss %xmm2, %xmm1, %xmm0 {%k1}
14 ; CHECK32-LABEL: test_mm_mask_move_ss:
15 ; CHECK32: # %bb.0: # %entry
16 ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al
17 ; CHECK32-NEXT: kmovw %eax, %k1
18 ; CHECK32-NEXT: vmovss %xmm2, %xmm1, %xmm0 {%k1}
22 %tobool.i = icmp ne i8 %0, 0
23 %__B.elt.i = extractelement <4 x float> %__B, i32 0
24 %__W.elt.i = extractelement <4 x float> %__W, i32 0
25 %vecext1.i = select i1 %tobool.i, float %__B.elt.i, float %__W.elt.i
26 %vecins.i = insertelement <4 x float> %__A, float %vecext1.i, i32 0
27 ret <4 x float> %vecins.i
; Scalar zero-masked move (float). Lane 0 of the result is a select between
; lane 0 of %__B (condition true) and the constant 0.0 (condition false); that
; scalar is inserted into lane 0 of %__A, so the remaining lanes come from %__A.
; Both the x86_64 and the i386 expectations include a kmovw into %k1 followed
; by a vmovss written under %k1 with {z}.
; NOTE(review): %0, the i8 compared against zero, is not defined on any line
; visible here -- presumably it is derived from %__U; confirm in the full file.
30 define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) local_unnamed_addr #0 {
31 ; CHECK64-LABEL: test_mm_maskz_move_ss:
32 ; CHECK64: # %bb.0: # %entry
33 ; CHECK64-NEXT: kmovw %edi, %k1
34 ; CHECK64-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
37 ; CHECK32-LABEL: test_mm_maskz_move_ss:
38 ; CHECK32: # %bb.0: # %entry
39 ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al
40 ; CHECK32-NEXT: kmovw %eax, %k1
41 ; CHECK32-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
45 %tobool.i = icmp ne i8 %0, 0
46 %vecext.i = extractelement <4 x float> %__B, i32 0
47 %cond.i = select i1 %tobool.i, float %vecext.i, float 0.000000e+00
48 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
49 ret <4 x float> %vecins.i
; Scalar masked move (double). Lane 0 of the result is a select between lane 0
; of %__B (condition true) and lane 0 of %__W (condition false); that scalar is
; inserted into lane 0 of %__A, so lane 1 comes from %__A.
; Both the x86_64 and the i386 expectations include a kmovw into %k1 followed
; by a vmovsd written under %k1 without {z}.
; NOTE(review): %0, the i8 compared against zero, is not defined on any line
; visible here -- presumably it is derived from %__U; confirm in the full file.
52 define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) local_unnamed_addr #0 {
53 ; CHECK64-LABEL: test_mm_mask_move_sd:
54 ; CHECK64: # %bb.0: # %entry
55 ; CHECK64-NEXT: kmovw %edi, %k1
56 ; CHECK64-NEXT: vmovsd %xmm2, %xmm1, %xmm0 {%k1}
59 ; CHECK32-LABEL: test_mm_mask_move_sd:
60 ; CHECK32: # %bb.0: # %entry
61 ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al
62 ; CHECK32-NEXT: kmovw %eax, %k1
63 ; CHECK32-NEXT: vmovsd %xmm2, %xmm1, %xmm0 {%k1}
67 %tobool.i = icmp ne i8 %0, 0
68 %__B.elt.i = extractelement <2 x double> %__B, i32 0
69 %__W.elt.i = extractelement <2 x double> %__W, i32 0
70 %vecext1.i = select i1 %tobool.i, double %__B.elt.i, double %__W.elt.i
71 %vecins.i = insertelement <2 x double> %__A, double %vecext1.i, i32 0
72 ret <2 x double> %vecins.i
; Scalar zero-masked move (double). Lane 0 of the result is a select between
; lane 0 of %__B (condition true) and the constant 0.0 (condition false); that
; scalar is inserted into lane 0 of %__A, so lane 1 comes from %__A.
; Both the x86_64 and the i386 expectations include a kmovw into %k1 followed
; by a vmovsd written under %k1 with {z}.
; NOTE(review): %0, the i8 compared against zero, is not defined on any line
; visible here -- presumably it is derived from %__U; confirm in the full file.
75 define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) local_unnamed_addr #0 {
76 ; CHECK64-LABEL: test_mm_maskz_move_sd:
77 ; CHECK64: # %bb.0: # %entry
78 ; CHECK64-NEXT: kmovw %edi, %k1
79 ; CHECK64-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
82 ; CHECK32-LABEL: test_mm_maskz_move_sd:
83 ; CHECK32: # %bb.0: # %entry
84 ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al
85 ; CHECK32-NEXT: kmovw %eax, %k1
86 ; CHECK32-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
90 %tobool.i = icmp ne i8 %0, 0
91 %vecext.i = extractelement <2 x double> %__B, i32 0
92 %cond.i = select i1 %tobool.i, double %vecext.i, double 0.000000e+00
93 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
94 ret <2 x double> %vecins.i
; Scalar masked store (float), expressed through the 16 x float masked-store
; intrinsic. The float pointer %__W is bitcast to a <16 x float> pointer, %__A
; is widened to 16 lanes (lanes 0-3 from %__A, the rest undef), and the mask is
; an i8 zero-extended to i16 and bitcast to <16 x i1>. The intrinsic is called
; with an alignment argument of 16.
; Both the x86_64 and the i386 expectations include a kmovw into %k1 followed
; by a vmovss to memory written under %k1; the i386 expectations read the mask
; byte from the stack with movzbl.
; NOTE(review): %1, the i8 that feeds the zext, is not defined on any line
; visible here -- presumably it is derived from %__U; confirm in the full file.
97 define void @test_mm_mask_store_ss(float* %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #1 {
98 ; CHECK64-LABEL: test_mm_mask_store_ss:
99 ; CHECK64: # %bb.0: # %entry
100 ; CHECK64-NEXT: kmovw %esi, %k1
101 ; CHECK64-NEXT: vmovss %xmm0, (%rdi) {%k1}
104 ; CHECK32-LABEL: test_mm_mask_store_ss:
105 ; CHECK32: # %bb.0: # %entry
106 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
107 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
108 ; CHECK32-NEXT: kmovw %ecx, %k1
109 ; CHECK32-NEXT: vmovss %xmm0, (%eax) {%k1}
112 %0 = bitcast float* %__W to <16 x float>*
113 %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
115 %conv2.i = zext i8 %1 to i16
116 %2 = bitcast i16 %conv2.i to <16 x i1>
117 tail call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %shuffle.i.i, <16 x float>* %0, i32 16, <16 x i1> %2) #5
; Scalar masked store (double), expressed through the 8 x double masked-store
; intrinsic. The double pointer %__W is bitcast to an <8 x double> pointer,
; %__A is widened to 8 lanes (lanes 0-1 from %__A, the rest undef), and the
; mask is an i8 bitcast directly to <8 x i1>. The intrinsic is called with an
; alignment argument of 16.
; Both the x86_64 and the i386 expectations include a kmovw into %k1 followed
; by a vmovsd to memory written under %k1; the i386 expectations read the mask
; byte from the stack with movb.
; NOTE(review): %1, the i8 that is bitcast to the mask, is not defined on any
; line visible here -- presumably it is derived from %__U; confirm in the full
; file.
121 define void @test_mm_mask_store_sd(double* %__W, i8 zeroext %__U, <2 x double> %__A) local_unnamed_addr #1 {
122 ; CHECK64-LABEL: test_mm_mask_store_sd:
123 ; CHECK64: # %bb.0: # %entry
124 ; CHECK64-NEXT: kmovw %esi, %k1
125 ; CHECK64-NEXT: vmovsd %xmm0, (%rdi) {%k1}
128 ; CHECK32-LABEL: test_mm_mask_store_sd:
129 ; CHECK32: # %bb.0: # %entry
130 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
131 ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %cl
132 ; CHECK32-NEXT: kmovw %ecx, %k1
133 ; CHECK32-NEXT: vmovsd %xmm0, (%eax) {%k1}
136 %0 = bitcast double* %__W to <8 x double>*
137 %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
139 %2 = bitcast i8 %1 to <8 x i1>
140 tail call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %shuffle.i.i, <8 x double>* %0, i32 16, <8 x i1> %2) #5
; Scalar masked load (float), expressed through the 16 x float masked-load
; intrinsic. The pass-through vector keeps lane 0 of %__A and sets lanes 1-3 to
; 0.0 (shuffle indices 0,4,4,4, where index 4 is the 0.0 in the second
; operand); it is then widened to 16 lanes with undef. The mask is an i8
; zero-extended to i16 and bitcast to <16 x i1>, the intrinsic is called with
; an alignment argument of 16, and the low 4 lanes of its result are returned.
; Both the x86_64 and the i386 expectations include a kmovw into %k1 followed
; by a vmovss from memory written under %k1 without {z}.
; NOTE(review): %1, the i8 that feeds the zext, is not defined on any line
; visible here -- presumably it is derived from %__U; confirm in the full file.
144 define <4 x float> @test_mm_mask_load_ss(<4 x float> %__A, i8 zeroext %__U, float* %__W) local_unnamed_addr #2 {
145 ; CHECK64-LABEL: test_mm_mask_load_ss:
146 ; CHECK64: # %bb.0: # %entry
147 ; CHECK64-NEXT: kmovw %edi, %k1
148 ; CHECK64-NEXT: vmovss (%rsi), %xmm0 {%k1}
151 ; CHECK32-LABEL: test_mm_mask_load_ss:
152 ; CHECK32: # %bb.0: # %entry
153 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
154 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
155 ; CHECK32-NEXT: kmovw %ecx, %k1
156 ; CHECK32-NEXT: vmovss (%eax), %xmm0 {%k1}
159 %shuffle.i = shufflevector <4 x float> %__A, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
160 %0 = bitcast float* %__W to <16 x float>*
161 %shuffle.i.i = shufflevector <4 x float> %shuffle.i, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
163 %conv2.i = zext i8 %1 to i16
164 %2 = bitcast i16 %conv2.i to <16 x i1>
165 %3 = tail call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %0, i32 16, <16 x i1> %2, <16 x float> %shuffle.i.i) #5
166 %shuffle4.i = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
167 ret <4 x float> %shuffle4.i
; Scalar masked load (double), expressed through the 8 x double masked-load
; intrinsic. The pass-through vector is %__A with lane 1 replaced by 0.0,
; widened to 8 lanes with undef. The mask is an i8 bitcast directly to
; <8 x i1>, the intrinsic is called with an alignment argument of 16, and the
; low 2 lanes of its result are returned.
; Both the x86_64 and the i386 expectations include a kmovw into %k1 followed
; by a vmovsd from memory written under %k1 without {z}.
; NOTE(review): %1, the i8 that is bitcast to the mask, is not defined on any
; line visible here -- presumably it is derived from %__U; confirm in the full
; file.
170 define <2 x double> @test_mm_mask_load_sd(<2 x double> %__A, i8 zeroext %__U, double* %__W) local_unnamed_addr #2 {
171 ; CHECK64-LABEL: test_mm_mask_load_sd:
172 ; CHECK64: # %bb.0: # %entry
173 ; CHECK64-NEXT: kmovw %edi, %k1
174 ; CHECK64-NEXT: vmovsd (%rsi), %xmm0 {%k1}
177 ; CHECK32-LABEL: test_mm_mask_load_sd:
178 ; CHECK32: # %bb.0: # %entry
179 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
180 ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %cl
181 ; CHECK32-NEXT: kmovw %ecx, %k1
182 ; CHECK32-NEXT: vmovsd (%eax), %xmm0 {%k1}
185 %shuffle5.i = insertelement <2 x double> %__A, double 0.000000e+00, i32 1
186 %0 = bitcast double* %__W to <8 x double>*
187 %shuffle.i.i = shufflevector <2 x double> %shuffle5.i, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
189 %2 = bitcast i8 %1 to <8 x i1>
190 %3 = tail call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %0, i32 16, <8 x i1> %2, <8 x double> %shuffle.i.i) #5
191 %shuffle3.i = shufflevector <8 x double> %3, <8 x double> undef, <2 x i32> <i32 0, i32 1>
192 ret <2 x double> %shuffle3.i
; Scalar zero-masked load (float), expressed through the 16 x float masked-load
; intrinsic with a zeroinitializer pass-through. The mask is an i8
; zero-extended to i16 and bitcast to <16 x i1>, the intrinsic is called with
; an alignment argument of 16, and the low 4 lanes of its result are returned.
; Both the x86_64 and the i386 expectations include a kmovw into %k1 followed
; by a vmovss from memory written under %k1 with {z}.
; NOTE(review): %1, the i8 that feeds the zext, is not defined on any line
; visible here -- presumably it is derived from %__U; confirm in the full file.
195 define <4 x float> @test_mm_maskz_load_ss(i8 zeroext %__U, float* %__W) local_unnamed_addr #2 {
196 ; CHECK64-LABEL: test_mm_maskz_load_ss:
197 ; CHECK64: # %bb.0: # %entry
198 ; CHECK64-NEXT: kmovw %edi, %k1
199 ; CHECK64-NEXT: vmovss (%rsi), %xmm0 {%k1} {z}
202 ; CHECK32-LABEL: test_mm_maskz_load_ss:
203 ; CHECK32: # %bb.0: # %entry
204 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
205 ; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
206 ; CHECK32-NEXT: kmovw %ecx, %k1
207 ; CHECK32-NEXT: vmovss (%eax), %xmm0 {%k1} {z}
210 %0 = bitcast float* %__W to <16 x float>*
212 %conv2.i = zext i8 %1 to i16
213 %2 = bitcast i16 %conv2.i to <16 x i1>
214 %3 = tail call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %0, i32 16, <16 x i1> %2, <16 x float> zeroinitializer) #5
215 %shuffle.i = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
216 ret <4 x float> %shuffle.i
; Scalar zero-masked load (double), expressed through the 8 x double
; masked-load intrinsic with a zeroinitializer pass-through. The mask is an i8
; bitcast directly to <8 x i1>, the intrinsic is called with an alignment
; argument of 16, and the low 2 lanes of its result are returned.
; Both the x86_64 and the i386 expectations include a kmovw into %k1 followed
; by a vmovsd from memory written under %k1 with {z}.
; NOTE(review): %1, the i8 that is bitcast to the mask, is not defined on any
; line visible here -- presumably it is derived from %__U; confirm in the full
; file.
219 define <2 x double> @test_mm_maskz_load_sd(i8 zeroext %__U, double* %__W) local_unnamed_addr #2 {
220 ; CHECK64-LABEL: test_mm_maskz_load_sd:
221 ; CHECK64: # %bb.0: # %entry
222 ; CHECK64-NEXT: kmovw %edi, %k1
223 ; CHECK64-NEXT: vmovsd (%rsi), %xmm0 {%k1} {z}
226 ; CHECK32-LABEL: test_mm_maskz_load_sd:
227 ; CHECK32: # %bb.0: # %entry
228 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
229 ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %cl
230 ; CHECK32-NEXT: kmovw %ecx, %k1
231 ; CHECK32-NEXT: vmovsd (%eax), %xmm0 {%k1} {z}
234 %0 = bitcast double* %__W to <8 x double>*
236 %2 = bitcast i8 %1 to <8 x i1>
237 %3 = tail call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %0, i32 16, <8 x i1> %2, <8 x double> zeroinitializer) #5
238 %shuffle.i = shufflevector <8 x double> %3, <8 x double> undef, <2 x i32> <i32 0, i32 1>
239 ret <2 x double> %shuffle.i
242 ; The tests below match clang's newer codegen that uses 128-bit masked load/stores.
; Scalar masked store (float) using the 4 x float masked-store intrinsic.
; %__P is bitcast to a <4 x float> pointer; the mask is an i8 bitcast to
; <8 x i1> whose low 4 lanes are extracted with a shufflevector. %__A is stored
; unwidened, and the intrinsic is called with an alignment argument of 1.
; Both the x86_64 and the i386 expectations include a kmovw into %k1 followed
; by a vmovss to memory written under %k1.
; NOTE(review): %1, the i8 that is bitcast to the mask, is not defined on any
; line visible here -- presumably it is derived from %__U; confirm in the full
; file.
244 define void @test_mm_mask_store_ss_2(float* %__P, i8 zeroext %__U, <4 x float> %__A) {
245 ; CHECK64-LABEL: test_mm_mask_store_ss_2:
246 ; CHECK64: # %bb.0: # %entry
247 ; CHECK64-NEXT: kmovw %esi, %k1
248 ; CHECK64-NEXT: vmovss %xmm0, (%rdi) {%k1}
251 ; CHECK32-LABEL: test_mm_mask_store_ss_2:
252 ; CHECK32: # %bb.0: # %entry
253 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
254 ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %cl
255 ; CHECK32-NEXT: kmovw %ecx, %k1
256 ; CHECK32-NEXT: vmovss %xmm0, (%eax) {%k1}
259 %0 = bitcast float* %__P to <4 x float>*
261 %2 = bitcast i8 %1 to <8 x i1>
262 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
263 tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %__A, <4 x float>* %0, i32 1, <4 x i1> %extract.i)
; Scalar masked store (double) using the 2 x double masked-store intrinsic.
; %__P is bitcast to a <2 x double> pointer; the mask is an i8 bitcast to
; <8 x i1> whose low 2 lanes are extracted with a shufflevector. %__A is stored
; unwidened, and the intrinsic is called with an alignment argument of 1.
; Both the x86_64 and the i386 expectations include a kmovw into %k1 followed
; by a vmovsd to memory written under %k1.
; NOTE(review): %1, the i8 that is bitcast to the mask, is not defined on any
; line visible here -- presumably it is derived from %__U; confirm in the full
; file.
267 define void @test_mm_mask_store_sd_2(double* %__P, i8 zeroext %__U, <2 x double> %__A) {
268 ; CHECK64-LABEL: test_mm_mask_store_sd_2:
269 ; CHECK64: # %bb.0: # %entry
270 ; CHECK64-NEXT: kmovw %esi, %k1
271 ; CHECK64-NEXT: vmovsd %xmm0, (%rdi) {%k1}
274 ; CHECK32-LABEL: test_mm_mask_store_sd_2:
275 ; CHECK32: # %bb.0: # %entry
276 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
277 ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %cl
278 ; CHECK32-NEXT: kmovw %ecx, %k1
279 ; CHECK32-NEXT: vmovsd %xmm0, (%eax) {%k1}
282 %0 = bitcast double* %__P to <2 x double>*
284 %2 = bitcast i8 %1 to <8 x i1>
285 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
286 tail call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %__A, <2 x double>* %0, i32 1, <2 x i1> %extract.i)
; Scalar masked load (float) using the 4 x float masked-load intrinsic.
; The pass-through vector keeps lane 0 of %__A and sets lanes 1-3 to 0.0
; (shuffle indices 0,4,4,4, where index 4 is the 0.0 in the second operand).
; The mask is an i8 bitcast to <8 x i1> whose low 4 lanes are extracted with a
; shufflevector; the intrinsic is called with an alignment argument of 1.
; Both the x86_64 and the i386 expectations include a kmovw into %k1 followed
; by a vmovss from memory written under %k1 without {z}.
; NOTE(review): %1, the i8 that is bitcast to the mask, is not defined on any
; line visible here, and neither is the return of %3 -- confirm both in the
; full file.
290 define <4 x float> @test_mm_mask_load_ss_2(<4 x float> %__A, i8 zeroext %__U, float* readonly %__W) {
291 ; CHECK64-LABEL: test_mm_mask_load_ss_2:
292 ; CHECK64: # %bb.0: # %entry
293 ; CHECK64-NEXT: kmovw %edi, %k1
294 ; CHECK64-NEXT: vmovss (%rsi), %xmm0 {%k1}
297 ; CHECK32-LABEL: test_mm_mask_load_ss_2:
298 ; CHECK32: # %bb.0: # %entry
299 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
300 ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %cl
301 ; CHECK32-NEXT: kmovw %ecx, %k1
302 ; CHECK32-NEXT: vmovss (%eax), %xmm0 {%k1}
305 %shuffle.i = shufflevector <4 x float> %__A, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
306 %0 = bitcast float* %__W to <4 x float>*
308 %2 = bitcast i8 %1 to <8 x i1>
309 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
310 %3 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 1, <4 x i1> %extract.i, <4 x float> %shuffle.i)
; Scalar zero-masked load (float) using the 4 x float masked-load intrinsic
; with a zeroinitializer pass-through. The mask is an i8 bitcast to <8 x i1>
; whose low 4 lanes are extracted with a shufflevector; the intrinsic is called
; with an alignment argument of 1.
; Both the x86_64 and the i386 expectations include a kmovw into %k1 followed
; by a vmovss from memory written under %k1 with {z}.
; NOTE(review): %1, the i8 that is bitcast to the mask, is not defined on any
; line visible here, and neither is the return of %3 -- confirm both in the
; full file.
314 define <4 x float> @test_mm_maskz_load_ss_2(i8 zeroext %__U, float* readonly %__W) {
315 ; CHECK64-LABEL: test_mm_maskz_load_ss_2:
316 ; CHECK64: # %bb.0: # %entry
317 ; CHECK64-NEXT: kmovw %edi, %k1
318 ; CHECK64-NEXT: vmovss (%rsi), %xmm0 {%k1} {z}
321 ; CHECK32-LABEL: test_mm_maskz_load_ss_2:
322 ; CHECK32: # %bb.0: # %entry
323 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
324 ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %cl
325 ; CHECK32-NEXT: kmovw %ecx, %k1
326 ; CHECK32-NEXT: vmovss (%eax), %xmm0 {%k1} {z}
329 %0 = bitcast float* %__W to <4 x float>*
331 %2 = bitcast i8 %1 to <8 x i1>
332 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
333 %3 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 1, <4 x i1> %extract.i, <4 x float> zeroinitializer)
; Scalar masked load (double) using the 2 x double masked-load intrinsic.
; The pass-through vector is %__A with lane 1 replaced by 0.0. The mask is an
; i8 bitcast to <8 x i1> whose low 2 lanes are extracted with a shufflevector;
; the intrinsic is called with an alignment argument of 1.
; Both the x86_64 and the i386 expectations include a kmovw into %k1 followed
; by a vmovsd from memory written under %k1 without {z}.
; NOTE(review): %1, the i8 that is bitcast to the mask, is not defined on any
; line visible here, and neither is the return of %3 -- confirm both in the
; full file.
337 define <2 x double> @test_mm_mask_load_sd_2(<2 x double> %__A, i8 zeroext %__U, double* readonly %__W) {
338 ; CHECK64-LABEL: test_mm_mask_load_sd_2:
339 ; CHECK64: # %bb.0: # %entry
340 ; CHECK64-NEXT: kmovw %edi, %k1
341 ; CHECK64-NEXT: vmovsd (%rsi), %xmm0 {%k1}
344 ; CHECK32-LABEL: test_mm_mask_load_sd_2:
345 ; CHECK32: # %bb.0: # %entry
346 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
347 ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %cl
348 ; CHECK32-NEXT: kmovw %ecx, %k1
349 ; CHECK32-NEXT: vmovsd (%eax), %xmm0 {%k1}
352 %shuffle3.i = insertelement <2 x double> %__A, double 0.000000e+00, i32 1
353 %0 = bitcast double* %__W to <2 x double>*
355 %2 = bitcast i8 %1 to <8 x i1>
356 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
357 %3 = tail call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %0, i32 1, <2 x i1> %extract.i, <2 x double> %shuffle3.i)
; Scalar zero-masked load (double) using the 2 x double masked-load intrinsic
; with a zeroinitializer pass-through. The mask is an i8 bitcast to <8 x i1>
; whose low 2 lanes are extracted with a shufflevector; the intrinsic is called
; with an alignment argument of 1.
; Both the x86_64 and the i386 expectations include a kmovw into %k1 followed
; by a vmovsd from memory written under %k1 with {z}.
; NOTE(review): %1, the i8 that is bitcast to the mask, is not defined on any
; line visible here, and neither is the return of %3 -- confirm both in the
; full file.
361 define <2 x double> @test_mm_maskz_load_sd_2(i8 zeroext %__U, double* readonly %__W) {
362 ; CHECK64-LABEL: test_mm_maskz_load_sd_2:
363 ; CHECK64: # %bb.0: # %entry
364 ; CHECK64-NEXT: kmovw %edi, %k1
365 ; CHECK64-NEXT: vmovsd (%rsi), %xmm0 {%k1} {z}
368 ; CHECK32-LABEL: test_mm_maskz_load_sd_2:
369 ; CHECK32: # %bb.0: # %entry
370 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
371 ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %cl
372 ; CHECK32-NEXT: kmovw %ecx, %k1
373 ; CHECK32-NEXT: vmovsd (%eax), %xmm0 {%k1} {z}
376 %0 = bitcast double* %__W to <2 x double>*
378 %2 = bitcast i8 %1 to <8 x i1>
379 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
380 %3 = tail call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %0, i32 1, <2 x i1> %extract.i, <2 x double> zeroinitializer)
; Declarations of the masked load/store intrinsics called by the tests in this
; file. The first four operate on 16 x float and 8 x double vectors; the last
; four operate on 4 x float and 2 x double vectors. Stores take (value,
; pointer, i32 alignment, mask); loads take (pointer, i32 alignment, mask,
; pass-through) and return the loaded vector.
; NOTE(review): attribute groups #3 and #4 are referenced here but their
; definitions are not visible in this view.
385 declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>) #3
387 declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>) #3
389 declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>) #4
391 declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) #4
393 declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
395 declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
397 declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
399 declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)