1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s
4 ; Targets with AVX512BW and AVX512VL (e.g. skylake-avx512) support masked load/store for i8 and i16 vectors.
; Masked load of <16 x i8> with an undef passthru and an alignment argument of 4.
; The assertions expect vpsllw $7 + vpmovb2m on %xmm0 to materialize the mask
; in %k1, followed by a single zero-masked vmovdqu8 from (%rdi).
; %val is not referenced by the call below.
6 define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
7 ; CHECK-LABEL: test_mask_load_16xi8:
9 ; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
10 ; CHECK-NEXT: vpmovb2m %xmm0, %k1
11 ; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} {z}
13 %res = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %addr, i32 4, <16 x i1>%mask, <16 x i8> undef)
; Declaration of the masked-load intrinsic called above.
16 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
; Masked load of <32 x i8> whose passthru is the %val argument (alignment
; argument 4). The assertions expect the mask to be built in %k1 from %ymm0
; (vpsllw $7 + vpmovb2m) and the load to be folded into a masked vpblendmb
; that merges memory at (%rdi) with %ymm1.
18 define <32 x i8> @test_mask_load_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
19 ; CHECK-LABEL: test_mask_load_32xi8:
21 ; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
22 ; CHECK-NEXT: vpmovb2m %ymm0, %k1
23 ; CHECK-NEXT: vpblendmb (%rdi), %ymm1, %ymm0 {%k1}
25 %res = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* %addr, i32 4, <32 x i1>%mask, <32 x i8> %val)
; Declaration of the masked-load intrinsic called above.
28 declare <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>)
; Masked load of <64 x i8> whose passthru is the %val argument (alignment
; argument 4). Same expected shape as the 32 x i8 case but on 512-bit
; registers: mask built in %k1 from %zmm0, then a masked vpblendmb merging
; memory at (%rdi) with %zmm1.
30 define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> %val) {
31 ; CHECK-LABEL: test_mask_load_64xi8:
33 ; CHECK-NEXT: vpsllw $7, %zmm0, %zmm0
34 ; CHECK-NEXT: vpmovb2m %zmm0, %k1
35 ; CHECK-NEXT: vpblendmb (%rdi), %zmm1, %zmm0 {%k1}
37 %res = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* %addr, i32 4, <64 x i1>%mask, <64 x i8> %val)
; Declaration of the masked-load intrinsic called above.
40 declare <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>)
; Masked load of <8 x i16> with an undef passthru (alignment argument 4).
; The assertions expect the mask to be built with word-sized operations
; (vpsllw $15 + vpmovw2m on %xmm0) and a zero-masked vmovdqu16 from (%rdi).
; %val is not referenced by the call below.
42 define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
43 ; CHECK-LABEL: test_mask_load_8xi16:
45 ; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
46 ; CHECK-NEXT: vpmovw2m %xmm0, %k1
47 ; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
49 %res = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %addr, i32 4, <8 x i1>%mask, <8 x i16> undef)
; Declaration of the masked-load intrinsic called above.
52 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
; Masked load of <16 x i16> with a zeroinitializer passthru (alignment
; argument 4). Although the data elements are 16 bits wide, the assertions
; expect the 16-lane mask to be built with byte-sized operations on %xmm0
; (vpsllw $7 + vpmovb2m), followed by a zero-masked vmovdqu16 into %ymm0.
; %val is not referenced by the call below.
54 define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
55 ; CHECK-LABEL: test_mask_load_16xi16:
57 ; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
58 ; CHECK-NEXT: vpmovb2m %xmm0, %k1
59 ; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
61 %res = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* %addr, i32 4, <16 x i1>%mask, <16 x i16> zeroinitializer)
; Declaration of the masked-load intrinsic called above.
64 declare <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>)
; Masked load of <32 x i16> whose passthru is the %val argument (alignment
; argument 4). The assertions expect the 32-lane mask to be built with
; byte-sized operations on %ymm0 (vpsllw $7 + vpmovb2m) and the load to be
; folded into a masked vpblendmw merging memory at (%rdi) with %zmm1.
66 define <32 x i16> @test_mask_load_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i16> %val) {
67 ; CHECK-LABEL: test_mask_load_32xi16:
69 ; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
70 ; CHECK-NEXT: vpmovb2m %ymm0, %k1
71 ; CHECK-NEXT: vpblendmw (%rdi), %zmm1, %zmm0 {%k1}
73 %res = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* %addr, i32 4, <32 x i1>%mask, <32 x i16> %val)
; Declaration of the masked-load intrinsic called above.
76 declare <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>*, i32, <32 x i1>, <32 x i16>)
; Masked store of <16 x i8> %val to %addr (alignment argument 4).
; The assertions expect the mask to be built in %k1 from %xmm0
; (vpsllw $7 + vpmovb2m) and a single masked vmovdqu8 of %xmm1 to (%rdi).
78 define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
79 ; CHECK-LABEL: test_mask_store_16xi8:
81 ; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
82 ; CHECK-NEXT: vpmovb2m %xmm0, %k1
83 ; CHECK-NEXT: vmovdqu8 %xmm1, (%rdi) {%k1}
85 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %val, <16 x i8>* %addr, i32 4, <16 x i1>%mask)
; Declaration of the masked-store intrinsic called above.
88 declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
; Masked store of <32 x i8> %val to %addr (alignment argument 4).
; The assertions expect the mask to be built in %k1 from %ymm0, a single
; masked vmovdqu8 of %ymm1 to (%rdi), and a trailing vzeroupper.
90 define void @test_mask_store_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
91 ; CHECK-LABEL: test_mask_store_32xi8:
93 ; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
94 ; CHECK-NEXT: vpmovb2m %ymm0, %k1
95 ; CHECK-NEXT: vmovdqu8 %ymm1, (%rdi) {%k1}
96 ; CHECK-NEXT: vzeroupper
98 call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> %val, <32 x i8>* %addr, i32 4, <32 x i1>%mask)
; Declaration of the masked-store intrinsic called above.
101 declare void @llvm.masked.store.v32i8.p0v32i8(<32 x i8>, <32 x i8>*, i32, <32 x i1>)
; Masked store of <64 x i8> %val to %addr (alignment argument 4).
; The assertions expect the mask to be built in %k1 from %zmm0, a single
; masked vmovdqu8 of %zmm1 to (%rdi), and a trailing vzeroupper.
103 define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> %val) {
104 ; CHECK-LABEL: test_mask_store_64xi8:
106 ; CHECK-NEXT: vpsllw $7, %zmm0, %zmm0
107 ; CHECK-NEXT: vpmovb2m %zmm0, %k1
108 ; CHECK-NEXT: vmovdqu8 %zmm1, (%rdi) {%k1}
109 ; CHECK-NEXT: vzeroupper
111 call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> %val, <64 x i8>* %addr, i32 4, <64 x i1>%mask)
; Declaration of the masked-store intrinsic called above.
114 declare void @llvm.masked.store.v64i8.p0v64i8(<64 x i8>, <64 x i8>*, i32, <64 x i1>)
; Masked store of <8 x i16> %val to %addr (alignment argument 4).
; The assertions expect the mask to be built with word-sized operations
; (vpsllw $15 + vpmovw2m on %xmm0) and a single masked vmovdqu16 of %xmm1
; to (%rdi).
116 define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
117 ; CHECK-LABEL: test_mask_store_8xi16:
119 ; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
120 ; CHECK-NEXT: vpmovw2m %xmm0, %k1
121 ; CHECK-NEXT: vmovdqu16 %xmm1, (%rdi) {%k1}
123 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %val, <8 x i16>* %addr, i32 4, <8 x i1>%mask)
; Declaration of the masked-store intrinsic called above.
126 declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
; Masked store of <16 x i16> %val to %addr (alignment argument 4).
; The assertions expect the 16-lane mask to be built with byte-sized
; operations on %xmm0 (vpsllw $7 + vpmovb2m), a single masked vmovdqu16 of
; %ymm1 to (%rdi), and a trailing vzeroupper.
128 define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
129 ; CHECK-LABEL: test_mask_store_16xi16:
131 ; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
132 ; CHECK-NEXT: vpmovb2m %xmm0, %k1
133 ; CHECK-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1}
134 ; CHECK-NEXT: vzeroupper
136 call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %val, <16 x i16>* %addr, i32 4, <16 x i1>%mask)
; Declaration of the masked-store intrinsic called above.
139 declare void @llvm.masked.store.v16i16.p0v16i16(<16 x i16>, <16 x i16>*, i32, <16 x i1>)
; Masked store of <32 x i16> %val to %addr (alignment argument 4).
; The assertions expect the 32-lane mask to be built with byte-sized
; operations on %ymm0 (vpsllw $7 + vpmovb2m), a single masked vmovdqu16 of
; %zmm1 to (%rdi), and a trailing vzeroupper.
141 define void @test_mask_store_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i16> %val) {
142 ; CHECK-LABEL: test_mask_store_32xi16:
144 ; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
145 ; CHECK-NEXT: vpmovb2m %ymm0, %k1
146 ; CHECK-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
147 ; CHECK-NEXT: vzeroupper
149 call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> %val, <32 x i16>* %addr, i32 4, <32 x i1>%mask)
; Declaration of the masked-store intrinsic called above.
153 declare void @llvm.masked.store.v32i16.p0v32i16(<32 x i16>, <32 x i16>*, i32, <32 x i1>)
155 ; Make sure we scalarize masked loads of f16.
; Masked load of <16 x half> with a zeroinitializer passthru (alignment
; argument 4). %val is not referenced by the call below.
;
; Expected code shape (a per-lane branch sequence, not one masked vector load):
;   * the mask is moved into %k0 (vpsllw $7 + vpmovb2m on %xmm0);
;   * lane 0 tests %k0 directly; lanes 1..15 extract their bit with
;     kshiftrw $i / kmovd / testb $1 and branch on it;
;   * a set bit performs a 16-bit load (movswl) from 2*i(%rsi) and converts it
;     with vcvtph2ps; a clear bit leaves the lane holding a zero produced by
;     vxorps (copied between registers with vmovaps);
;   * after the last lane every value is converted back with vcvtps2ph $4 and
;     stored with movw to 2*i(%rax), where %rax was copied from %rdi on entry.
; NOTE(review): %rdi presumably holds a hidden return-slot pointer for the
; <16 x half> result - confirm against the calling convention.
156 define <16 x half> @test_mask_load_16xf16(<16 x i1> %mask, <16 x half>* %addr, <16 x half> %val) {
157 ; CHECK-LABEL: test_mask_load_16xf16:
159 ; CHECK-NEXT: movq %rdi, %rax
160 ; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
161 ; CHECK-NEXT: vpmovb2m %xmm0, %k0
162 ; CHECK-NEXT: kmovd %k0, %ecx
163 ; CHECK-NEXT: testb $1, %cl
164 ; CHECK-NEXT: je LBB12_1
165 ; CHECK-NEXT: ## %bb.2: ## %cond.load
166 ; CHECK-NEXT: movswl (%rsi), %ecx
167 ; CHECK-NEXT: vmovd %ecx, %xmm0
168 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm8
169 ; CHECK-NEXT: jmp LBB12_3
170 ; CHECK-NEXT: LBB12_1:
171 ; CHECK-NEXT: vxorps %xmm8, %xmm8, %xmm8
172 ; CHECK-NEXT: LBB12_3: ## %else
173 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
174 ; CHECK-NEXT: vxorps %xmm9, %xmm9, %xmm9
175 ; CHECK-NEXT: kshiftrw $1, %k0, %k1
176 ; CHECK-NEXT: kmovd %k1, %ecx
177 ; CHECK-NEXT: testb $1, %cl
178 ; CHECK-NEXT: je LBB12_4
179 ; CHECK-NEXT: ## %bb.5: ## %cond.load1
180 ; CHECK-NEXT: movswl 2(%rsi), %ecx
181 ; CHECK-NEXT: vmovd %ecx, %xmm0
182 ; CHECK-NEXT: vmovaps %xmm2, %xmm1
183 ; CHECK-NEXT: vmovaps %xmm2, %xmm7
184 ; CHECK-NEXT: vmovaps %xmm2, %xmm6
185 ; CHECK-NEXT: vmovaps %xmm2, %xmm5
186 ; CHECK-NEXT: vmovaps %xmm2, %xmm4
187 ; CHECK-NEXT: vmovaps %xmm2, %xmm3
188 ; CHECK-NEXT: vmovaps %xmm2, %xmm16
189 ; CHECK-NEXT: vmovaps %xmm2, %xmm15
190 ; CHECK-NEXT: vmovaps %xmm2, %xmm14
191 ; CHECK-NEXT: vmovaps %xmm2, %xmm13
192 ; CHECK-NEXT: vmovaps %xmm2, %xmm12
193 ; CHECK-NEXT: vmovaps %xmm2, %xmm11
194 ; CHECK-NEXT: vmovaps %xmm2, %xmm10
195 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm2
196 ; CHECK-NEXT: jmp LBB12_6
197 ; CHECK-NEXT: LBB12_4:
198 ; CHECK-NEXT: vmovaps %xmm2, %xmm1
199 ; CHECK-NEXT: vmovaps %xmm2, %xmm7
200 ; CHECK-NEXT: vmovaps %xmm2, %xmm6
201 ; CHECK-NEXT: vmovaps %xmm2, %xmm5
202 ; CHECK-NEXT: vmovaps %xmm2, %xmm4
203 ; CHECK-NEXT: vmovaps %xmm2, %xmm3
204 ; CHECK-NEXT: vmovaps %xmm2, %xmm16
205 ; CHECK-NEXT: vmovaps %xmm2, %xmm15
206 ; CHECK-NEXT: vmovaps %xmm2, %xmm14
207 ; CHECK-NEXT: vmovaps %xmm2, %xmm13
208 ; CHECK-NEXT: vmovaps %xmm2, %xmm12
209 ; CHECK-NEXT: vmovaps %xmm2, %xmm11
210 ; CHECK-NEXT: vmovaps %xmm2, %xmm10
211 ; CHECK-NEXT: LBB12_6: ## %else2
212 ; CHECK-NEXT: kshiftrw $2, %k0, %k1
213 ; CHECK-NEXT: kmovd %k1, %ecx
214 ; CHECK-NEXT: testb $1, %cl
215 ; CHECK-NEXT: je LBB12_8
216 ; CHECK-NEXT: ## %bb.7: ## %cond.load4
217 ; CHECK-NEXT: movswl 4(%rsi), %ecx
218 ; CHECK-NEXT: vmovd %ecx, %xmm0
219 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm1
220 ; CHECK-NEXT: LBB12_8: ## %else5
221 ; CHECK-NEXT: kshiftrw $3, %k0, %k1
222 ; CHECK-NEXT: kmovd %k1, %ecx
223 ; CHECK-NEXT: testb $1, %cl
224 ; CHECK-NEXT: je LBB12_10
225 ; CHECK-NEXT: ## %bb.9: ## %cond.load7
226 ; CHECK-NEXT: movswl 6(%rsi), %ecx
227 ; CHECK-NEXT: vmovd %ecx, %xmm0
228 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm7
229 ; CHECK-NEXT: LBB12_10: ## %else8
230 ; CHECK-NEXT: kshiftrw $4, %k0, %k1
231 ; CHECK-NEXT: kmovd %k1, %ecx
232 ; CHECK-NEXT: testb $1, %cl
233 ; CHECK-NEXT: je LBB12_12
234 ; CHECK-NEXT: ## %bb.11: ## %cond.load10
235 ; CHECK-NEXT: movswl 8(%rsi), %ecx
236 ; CHECK-NEXT: vmovd %ecx, %xmm0
237 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm6
238 ; CHECK-NEXT: LBB12_12: ## %else11
239 ; CHECK-NEXT: kshiftrw $5, %k0, %k1
240 ; CHECK-NEXT: kmovd %k1, %ecx
241 ; CHECK-NEXT: testb $1, %cl
242 ; CHECK-NEXT: je LBB12_14
243 ; CHECK-NEXT: ## %bb.13: ## %cond.load13
244 ; CHECK-NEXT: movswl 10(%rsi), %ecx
245 ; CHECK-NEXT: vmovd %ecx, %xmm0
246 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm5
247 ; CHECK-NEXT: LBB12_14: ## %else14
248 ; CHECK-NEXT: kshiftrw $6, %k0, %k1
249 ; CHECK-NEXT: kmovd %k1, %ecx
250 ; CHECK-NEXT: testb $1, %cl
251 ; CHECK-NEXT: je LBB12_16
252 ; CHECK-NEXT: ## %bb.15: ## %cond.load16
253 ; CHECK-NEXT: movswl 12(%rsi), %ecx
254 ; CHECK-NEXT: vmovd %ecx, %xmm0
255 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm4
256 ; CHECK-NEXT: LBB12_16: ## %else17
257 ; CHECK-NEXT: kshiftrw $7, %k0, %k1
258 ; CHECK-NEXT: kmovd %k1, %ecx
259 ; CHECK-NEXT: testb $1, %cl
260 ; CHECK-NEXT: je LBB12_18
261 ; CHECK-NEXT: ## %bb.17: ## %cond.load19
262 ; CHECK-NEXT: movswl 14(%rsi), %ecx
263 ; CHECK-NEXT: vmovd %ecx, %xmm0
264 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm3
265 ; CHECK-NEXT: LBB12_18: ## %else20
266 ; CHECK-NEXT: kshiftrw $8, %k0, %k1
267 ; CHECK-NEXT: kmovd %k1, %ecx
268 ; CHECK-NEXT: testb $1, %cl
269 ; CHECK-NEXT: je LBB12_20
270 ; CHECK-NEXT: ## %bb.19: ## %cond.load22
271 ; CHECK-NEXT: movswl 16(%rsi), %ecx
272 ; CHECK-NEXT: vmovd %ecx, %xmm0
273 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm16
274 ; CHECK-NEXT: LBB12_20: ## %else23
275 ; CHECK-NEXT: kshiftrw $9, %k0, %k1
276 ; CHECK-NEXT: kmovd %k1, %ecx
277 ; CHECK-NEXT: testb $1, %cl
278 ; CHECK-NEXT: je LBB12_22
279 ; CHECK-NEXT: ## %bb.21: ## %cond.load25
280 ; CHECK-NEXT: movswl 18(%rsi), %ecx
281 ; CHECK-NEXT: vmovd %ecx, %xmm0
282 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm15
283 ; CHECK-NEXT: LBB12_22: ## %else26
284 ; CHECK-NEXT: kshiftrw $10, %k0, %k1
285 ; CHECK-NEXT: kmovd %k1, %ecx
286 ; CHECK-NEXT: testb $1, %cl
287 ; CHECK-NEXT: je LBB12_24
288 ; CHECK-NEXT: ## %bb.23: ## %cond.load28
289 ; CHECK-NEXT: movswl 20(%rsi), %ecx
290 ; CHECK-NEXT: vmovd %ecx, %xmm0
291 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm14
292 ; CHECK-NEXT: LBB12_24: ## %else29
293 ; CHECK-NEXT: kshiftrw $11, %k0, %k1
294 ; CHECK-NEXT: kmovd %k1, %ecx
295 ; CHECK-NEXT: testb $1, %cl
296 ; CHECK-NEXT: je LBB12_26
297 ; CHECK-NEXT: ## %bb.25: ## %cond.load31
298 ; CHECK-NEXT: movswl 22(%rsi), %ecx
299 ; CHECK-NEXT: vmovd %ecx, %xmm0
300 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm13
301 ; CHECK-NEXT: LBB12_26: ## %else32
302 ; CHECK-NEXT: kshiftrw $12, %k0, %k1
303 ; CHECK-NEXT: kmovd %k1, %ecx
304 ; CHECK-NEXT: testb $1, %cl
305 ; CHECK-NEXT: je LBB12_28
306 ; CHECK-NEXT: ## %bb.27: ## %cond.load34
307 ; CHECK-NEXT: movswl 24(%rsi), %ecx
308 ; CHECK-NEXT: vmovd %ecx, %xmm0
309 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm12
310 ; CHECK-NEXT: LBB12_28: ## %else35
311 ; CHECK-NEXT: kshiftrw $13, %k0, %k1
312 ; CHECK-NEXT: kmovd %k1, %ecx
313 ; CHECK-NEXT: testb $1, %cl
314 ; CHECK-NEXT: je LBB12_30
315 ; CHECK-NEXT: ## %bb.29: ## %cond.load37
316 ; CHECK-NEXT: movswl 26(%rsi), %ecx
317 ; CHECK-NEXT: vmovd %ecx, %xmm0
318 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm11
319 ; CHECK-NEXT: LBB12_30: ## %else38
320 ; CHECK-NEXT: kshiftrw $14, %k0, %k1
321 ; CHECK-NEXT: kmovd %k1, %ecx
322 ; CHECK-NEXT: testb $1, %cl
323 ; CHECK-NEXT: je LBB12_32
324 ; CHECK-NEXT: ## %bb.31: ## %cond.load40
325 ; CHECK-NEXT: movswl 28(%rsi), %ecx
326 ; CHECK-NEXT: vmovd %ecx, %xmm0
327 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm10
328 ; CHECK-NEXT: LBB12_32: ## %else41
329 ; CHECK-NEXT: kshiftrw $15, %k0, %k0
330 ; CHECK-NEXT: kmovd %k0, %ecx
331 ; CHECK-NEXT: testb $1, %cl
332 ; CHECK-NEXT: je LBB12_34
333 ; CHECK-NEXT: ## %bb.33: ## %cond.load43
334 ; CHECK-NEXT: movswl 30(%rsi), %ecx
335 ; CHECK-NEXT: vmovd %ecx, %xmm0
336 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm9
337 ; CHECK-NEXT: LBB12_34: ## %else44
338 ; CHECK-NEXT: vcvtps2ph $4, %xmm8, %xmm0
339 ; CHECK-NEXT: vmovd %xmm0, %ecx
340 ; CHECK-NEXT: movw %cx, (%rax)
341 ; CHECK-NEXT: vcvtps2ph $4, %xmm2, %xmm0
342 ; CHECK-NEXT: vmovd %xmm0, %ecx
343 ; CHECK-NEXT: movw %cx, 2(%rax)
344 ; CHECK-NEXT: vcvtps2ph $4, %xmm1, %xmm0
345 ; CHECK-NEXT: vmovd %xmm0, %ecx
346 ; CHECK-NEXT: movw %cx, 4(%rax)
347 ; CHECK-NEXT: vcvtps2ph $4, %xmm7, %xmm0
348 ; CHECK-NEXT: vmovd %xmm0, %ecx
349 ; CHECK-NEXT: movw %cx, 6(%rax)
350 ; CHECK-NEXT: vcvtps2ph $4, %xmm6, %xmm0
351 ; CHECK-NEXT: vmovd %xmm0, %ecx
352 ; CHECK-NEXT: movw %cx, 8(%rax)
353 ; CHECK-NEXT: vcvtps2ph $4, %xmm5, %xmm0
354 ; CHECK-NEXT: vmovd %xmm0, %ecx
355 ; CHECK-NEXT: movw %cx, 10(%rax)
356 ; CHECK-NEXT: vcvtps2ph $4, %xmm4, %xmm0
357 ; CHECK-NEXT: vmovd %xmm0, %ecx
358 ; CHECK-NEXT: movw %cx, 12(%rax)
359 ; CHECK-NEXT: vcvtps2ph $4, %xmm3, %xmm0
360 ; CHECK-NEXT: vmovd %xmm0, %ecx
361 ; CHECK-NEXT: movw %cx, 14(%rax)
362 ; CHECK-NEXT: vcvtps2ph $4, %xmm16, %xmm0
363 ; CHECK-NEXT: vmovd %xmm0, %ecx
364 ; CHECK-NEXT: movw %cx, 16(%rax)
365 ; CHECK-NEXT: vcvtps2ph $4, %xmm15, %xmm0
366 ; CHECK-NEXT: vmovd %xmm0, %ecx
367 ; CHECK-NEXT: movw %cx, 18(%rax)
368 ; CHECK-NEXT: vcvtps2ph $4, %xmm14, %xmm0
369 ; CHECK-NEXT: vmovd %xmm0, %ecx
370 ; CHECK-NEXT: movw %cx, 20(%rax)
371 ; CHECK-NEXT: vcvtps2ph $4, %xmm13, %xmm0
372 ; CHECK-NEXT: vmovd %xmm0, %ecx
373 ; CHECK-NEXT: movw %cx, 22(%rax)
374 ; CHECK-NEXT: vcvtps2ph $4, %xmm12, %xmm0
375 ; CHECK-NEXT: vmovd %xmm0, %ecx
376 ; CHECK-NEXT: movw %cx, 24(%rax)
377 ; CHECK-NEXT: vcvtps2ph $4, %xmm11, %xmm0
378 ; CHECK-NEXT: vmovd %xmm0, %ecx
379 ; CHECK-NEXT: movw %cx, 26(%rax)
380 ; CHECK-NEXT: vcvtps2ph $4, %xmm10, %xmm0
381 ; CHECK-NEXT: vmovd %xmm0, %ecx
382 ; CHECK-NEXT: movw %cx, 28(%rax)
383 ; CHECK-NEXT: vcvtps2ph $4, %xmm9, %xmm0
384 ; CHECK-NEXT: vmovd %xmm0, %ecx
385 ; CHECK-NEXT: movw %cx, 30(%rax)
387 %res = call <16 x half> @llvm.masked.load.v16f16(<16 x half>* %addr, i32 4, <16 x i1>%mask, <16 x half> zeroinitializer)
; Declaration of the masked-load intrinsic called above.
; NOTE(review): this name is spelled without the pointer-type suffix that the
; other masked-load declarations in this file carry (e.g. ".p0v16i16") -
; confirm the short spelling is intentional.
390 declare <16 x half> @llvm.masked.load.v16f16(<16 x half>*, i32, <16 x i1>, <16 x half>)
392 ; Make sure we scalarize masked stores of f16.
; Masked store of <16 x half> %val to %addr (alignment argument 4).
;
; Expected code shape (a per-lane branch sequence, not one masked vector store):
;   * the mask is moved into %k0 (vpsllw $7 + vpmovb2m on %xmm0);
;   * lane 0 tests %k0 directly; lanes 1..15 extract their bit with
;     kshiftrw $i / kmovd / testb $1 and branch on it;
;   * a set bit converts the lane with vcvtps2ph $4 and stores 16 bits with
;     movw to 2*i(%rdi); a clear bit skips the store;
;   * lanes 0..6 convert directly from %xmm1..%xmm7; lanes 7..15 first reload
;     the value with vmovss from memory (the operand is matched by a regex, so
;     the exact address is not pinned by the assertions).
; NOTE(review): the vmovss reloads presumably read stack-passed vector
; elements - confirm against the calling convention.
393 define void @test_mask_store_16xf16(<16 x i1> %mask, <16 x half>* %addr, <16 x half> %val) {
394 ; CHECK-LABEL: test_mask_store_16xf16:
396 ; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
397 ; CHECK-NEXT: vpmovb2m %xmm0, %k0
398 ; CHECK-NEXT: kmovd %k0, %eax
399 ; CHECK-NEXT: testb $1, %al
400 ; CHECK-NEXT: je LBB13_2
401 ; CHECK-NEXT: ## %bb.1: ## %cond.store
402 ; CHECK-NEXT: vcvtps2ph $4, %xmm1, %xmm0
403 ; CHECK-NEXT: vmovd %xmm0, %eax
404 ; CHECK-NEXT: movw %ax, (%rdi)
405 ; CHECK-NEXT: LBB13_2: ## %else
406 ; CHECK-NEXT: kshiftrw $1, %k0, %k1
407 ; CHECK-NEXT: kmovd %k1, %eax
408 ; CHECK-NEXT: testb $1, %al
409 ; CHECK-NEXT: je LBB13_4
410 ; CHECK-NEXT: ## %bb.3: ## %cond.store1
411 ; CHECK-NEXT: vcvtps2ph $4, %xmm2, %xmm0
412 ; CHECK-NEXT: vmovd %xmm0, %eax
413 ; CHECK-NEXT: movw %ax, 2(%rdi)
414 ; CHECK-NEXT: LBB13_4: ## %else2
415 ; CHECK-NEXT: kshiftrw $2, %k0, %k1
416 ; CHECK-NEXT: kmovd %k1, %eax
417 ; CHECK-NEXT: testb $1, %al
418 ; CHECK-NEXT: je LBB13_6
419 ; CHECK-NEXT: ## %bb.5: ## %cond.store3
420 ; CHECK-NEXT: vcvtps2ph $4, %xmm3, %xmm0
421 ; CHECK-NEXT: vmovd %xmm0, %eax
422 ; CHECK-NEXT: movw %ax, 4(%rdi)
423 ; CHECK-NEXT: LBB13_6: ## %else4
424 ; CHECK-NEXT: kshiftrw $3, %k0, %k1
425 ; CHECK-NEXT: kmovd %k1, %eax
426 ; CHECK-NEXT: testb $1, %al
427 ; CHECK-NEXT: je LBB13_8
428 ; CHECK-NEXT: ## %bb.7: ## %cond.store5
429 ; CHECK-NEXT: vcvtps2ph $4, %xmm4, %xmm0
430 ; CHECK-NEXT: vmovd %xmm0, %eax
431 ; CHECK-NEXT: movw %ax, 6(%rdi)
432 ; CHECK-NEXT: LBB13_8: ## %else6
433 ; CHECK-NEXT: kshiftrw $4, %k0, %k1
434 ; CHECK-NEXT: kmovd %k1, %eax
435 ; CHECK-NEXT: testb $1, %al
436 ; CHECK-NEXT: je LBB13_10
437 ; CHECK-NEXT: ## %bb.9: ## %cond.store7
438 ; CHECK-NEXT: vcvtps2ph $4, %xmm5, %xmm0
439 ; CHECK-NEXT: vmovd %xmm0, %eax
440 ; CHECK-NEXT: movw %ax, 8(%rdi)
441 ; CHECK-NEXT: LBB13_10: ## %else8
442 ; CHECK-NEXT: kshiftrw $5, %k0, %k1
443 ; CHECK-NEXT: kmovd %k1, %eax
444 ; CHECK-NEXT: testb $1, %al
445 ; CHECK-NEXT: je LBB13_12
446 ; CHECK-NEXT: ## %bb.11: ## %cond.store9
447 ; CHECK-NEXT: vcvtps2ph $4, %xmm6, %xmm0
448 ; CHECK-NEXT: vmovd %xmm0, %eax
449 ; CHECK-NEXT: movw %ax, 10(%rdi)
450 ; CHECK-NEXT: LBB13_12: ## %else10
451 ; CHECK-NEXT: kshiftrw $6, %k0, %k1
452 ; CHECK-NEXT: kmovd %k1, %eax
453 ; CHECK-NEXT: testb $1, %al
454 ; CHECK-NEXT: je LBB13_14
455 ; CHECK-NEXT: ## %bb.13: ## %cond.store11
456 ; CHECK-NEXT: vcvtps2ph $4, %xmm7, %xmm0
457 ; CHECK-NEXT: vmovd %xmm0, %eax
458 ; CHECK-NEXT: movw %ax, 12(%rdi)
459 ; CHECK-NEXT: LBB13_14: ## %else12
460 ; CHECK-NEXT: kshiftrw $7, %k0, %k1
461 ; CHECK-NEXT: kmovd %k1, %eax
462 ; CHECK-NEXT: testb $1, %al
463 ; CHECK-NEXT: je LBB13_16
464 ; CHECK-NEXT: ## %bb.15: ## %cond.store13
465 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
466 ; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
467 ; CHECK-NEXT: vmovd %xmm0, %eax
468 ; CHECK-NEXT: movw %ax, 14(%rdi)
469 ; CHECK-NEXT: LBB13_16: ## %else14
470 ; CHECK-NEXT: kshiftrw $8, %k0, %k1
471 ; CHECK-NEXT: kmovd %k1, %eax
472 ; CHECK-NEXT: testb $1, %al
473 ; CHECK-NEXT: je LBB13_18
474 ; CHECK-NEXT: ## %bb.17: ## %cond.store15
475 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
476 ; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
477 ; CHECK-NEXT: vmovd %xmm0, %eax
478 ; CHECK-NEXT: movw %ax, 16(%rdi)
479 ; CHECK-NEXT: LBB13_18: ## %else16
480 ; CHECK-NEXT: kshiftrw $9, %k0, %k1
481 ; CHECK-NEXT: kmovd %k1, %eax
482 ; CHECK-NEXT: testb $1, %al
483 ; CHECK-NEXT: je LBB13_20
484 ; CHECK-NEXT: ## %bb.19: ## %cond.store17
485 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
486 ; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
487 ; CHECK-NEXT: vmovd %xmm0, %eax
488 ; CHECK-NEXT: movw %ax, 18(%rdi)
489 ; CHECK-NEXT: LBB13_20: ## %else18
490 ; CHECK-NEXT: kshiftrw $10, %k0, %k1
491 ; CHECK-NEXT: kmovd %k1, %eax
492 ; CHECK-NEXT: testb $1, %al
493 ; CHECK-NEXT: je LBB13_22
494 ; CHECK-NEXT: ## %bb.21: ## %cond.store19
495 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
496 ; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
497 ; CHECK-NEXT: vmovd %xmm0, %eax
498 ; CHECK-NEXT: movw %ax, 20(%rdi)
499 ; CHECK-NEXT: LBB13_22: ## %else20
500 ; CHECK-NEXT: kshiftrw $11, %k0, %k1
501 ; CHECK-NEXT: kmovd %k1, %eax
502 ; CHECK-NEXT: testb $1, %al
503 ; CHECK-NEXT: je LBB13_24
504 ; CHECK-NEXT: ## %bb.23: ## %cond.store21
505 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
506 ; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
507 ; CHECK-NEXT: vmovd %xmm0, %eax
508 ; CHECK-NEXT: movw %ax, 22(%rdi)
509 ; CHECK-NEXT: LBB13_24: ## %else22
510 ; CHECK-NEXT: kshiftrw $12, %k0, %k1
511 ; CHECK-NEXT: kmovd %k1, %eax
512 ; CHECK-NEXT: testb $1, %al
513 ; CHECK-NEXT: je LBB13_26
514 ; CHECK-NEXT: ## %bb.25: ## %cond.store23
515 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
516 ; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
517 ; CHECK-NEXT: vmovd %xmm0, %eax
518 ; CHECK-NEXT: movw %ax, 24(%rdi)
519 ; CHECK-NEXT: LBB13_26: ## %else24
520 ; CHECK-NEXT: kshiftrw $13, %k0, %k1
521 ; CHECK-NEXT: kmovd %k1, %eax
522 ; CHECK-NEXT: testb $1, %al
523 ; CHECK-NEXT: je LBB13_28
524 ; CHECK-NEXT: ## %bb.27: ## %cond.store25
525 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
526 ; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
527 ; CHECK-NEXT: vmovd %xmm0, %eax
528 ; CHECK-NEXT: movw %ax, 26(%rdi)
529 ; CHECK-NEXT: LBB13_28: ## %else26
530 ; CHECK-NEXT: kshiftrw $14, %k0, %k1
531 ; CHECK-NEXT: kmovd %k1, %eax
532 ; CHECK-NEXT: testb $1, %al
533 ; CHECK-NEXT: je LBB13_30
534 ; CHECK-NEXT: ## %bb.29: ## %cond.store27
535 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
536 ; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
537 ; CHECK-NEXT: vmovd %xmm0, %eax
538 ; CHECK-NEXT: movw %ax, 28(%rdi)
539 ; CHECK-NEXT: LBB13_30: ## %else28
540 ; CHECK-NEXT: kshiftrw $15, %k0, %k0
541 ; CHECK-NEXT: kmovd %k0, %eax
542 ; CHECK-NEXT: testb $1, %al
543 ; CHECK-NEXT: je LBB13_32
544 ; CHECK-NEXT: ## %bb.31: ## %cond.store29
545 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
546 ; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
547 ; CHECK-NEXT: vmovd %xmm0, %eax
548 ; CHECK-NEXT: movw %ax, 30(%rdi)
549 ; CHECK-NEXT: LBB13_32: ## %else30
551 call void @llvm.masked.store.v16f16.p0v16f16(<16 x half> %val, <16 x half>* %addr, i32 4, <16 x i1>%mask)
; Declaration of the masked-store intrinsic called above.
554 declare void @llvm.masked.store.v16f16.p0v16f16(<16 x half>, <16 x half>*, i32, <16 x i1>)