; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=knl | FileCheck %s -check-prefix=X32 -check-prefix=X32-KNL
; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=skx | FileCheck %s -check-prefix=X32 -check-prefix=X32-SKX
; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=knl | FileCheck %s -check-prefix=WIN32 -check-prefix=WIN32-KNL
; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=skx | FileCheck %s -check-prefix=WIN32 -check-prefix=WIN32-SKX
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=knl | FileCheck %s -check-prefix=WIN64 -check-prefix=WIN64-KNL
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=skx | FileCheck %s -check-prefix=WIN64 -check-prefix=WIN64-SKX
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -check-prefix=X64 -check-prefix=X64-KNL
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s -check-prefix=X64 -check-prefix=X64-SKX
declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
declare <16 x float> @func_float16(<16 x float>, <16 x float>)
declare i32 @func_int(i32, i32)
; test calling conventions - input parameters
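; testf16_inp takes its vector arguments directly and makes an intel_ocl_bicc
; call with an out-pointer: the checks below expect the inputs in %zmm0/%zmm1
; on the Darwin targets, loaded through %rcx/%rdx on Win64, and the alloca's
; address passed in %rdi (64-bit Darwin), %rcx (Win64), or on the stack (32-bit).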
define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
; X32-LABEL: testf16_inp:
; X32: ## %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-64, %esp
; X32-NEXT: subl $192, %esp
; X32-NEXT: vaddps %zmm1, %zmm0, %zmm0
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, (%esp)
; X32-NEXT: calll _func_float16_ptr
; X32-NEXT: vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: retl
;
; WIN32-LABEL: testf16_inp:
; WIN32: # %bb.0:
; WIN32-NEXT: pushl %ebp
; WIN32-NEXT: movl %esp, %ebp
; WIN32-NEXT: andl $-64, %esp
; WIN32-NEXT: subl $128, %esp
; WIN32-NEXT: vaddps %zmm1, %zmm0, %zmm0
; WIN32-NEXT: movl %esp, %eax
; WIN32-NEXT: pushl %eax
; WIN32-NEXT: calll _func_float16_ptr
; WIN32-NEXT: addl $4, %esp
; WIN32-NEXT: vaddps (%esp), %zmm0, %zmm0
; WIN32-NEXT: movl %ebp, %esp
; WIN32-NEXT: popl %ebp
; WIN32-NEXT: retl
;
; WIN64-LABEL: testf16_inp:
; WIN64: # %bb.0:
; WIN64-NEXT: pushq %rbp
; WIN64-NEXT: subq $176, %rsp
; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-NEXT: andq $-64, %rsp
; WIN64-NEXT: vmovaps (%rcx), %zmm0
; WIN64-NEXT: vaddps (%rdx), %zmm0, %zmm0
; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-NEXT: callq func_float16_ptr
; WIN64-NEXT: vaddps {{[0-9]+}}(%rsp), %zmm0, %zmm0
; WIN64-NEXT: leaq 48(%rbp), %rsp
; WIN64-NEXT: popq %rbp
; WIN64-NEXT: retq
;
; X64-LABEL: testf16_inp:
; X64: ## %bb.0:
; X64-NEXT: pushq %rbp
; X64-NEXT: movq %rsp, %rbp
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: andq $-64, %rsp
; X64-NEXT: subq $128, %rsp
; X64-NEXT: vaddps %zmm1, %zmm0, %zmm0
; X64-NEXT: movq %rsp, %rdi
; X64-NEXT: callq _func_float16_ptr
; X64-NEXT: vaddps (%rsp), %zmm0, %zmm0
; X64-NEXT: leaq -16(%rbp), %rsp
; X64-NEXT: popq %r12
; X64-NEXT: popq %r13
; X64-NEXT: popq %rbp
; X64-NEXT: retq
  %y = alloca <16 x float>, align 16
  %x = fadd <16 x float> %a, %b
  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>, <16 x float>* %y, align 16
  %3 = fadd <16 x float> %2, %1
  ret <16 x float> %3
}
; test calling conventions - preserved registers
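; testf16_regs keeps %b live across the intel_ocl_bicc call: on the 64-bit
; targets the checks hold it in %zmm16 with no spill or reload around the call,
; which only works if the callee preserves that register, while the 32-bit
; targets spill it to the aligned stack slot and reload it afterwards.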
define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
; X32-LABEL: testf16_regs:
; X32: ## %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-64, %esp
; X32-NEXT: subl $256, %esp ## imm = 0x100
; X32-NEXT: vmovaps %zmm1, {{[0-9]+}}(%esp) ## 64-byte Spill
; X32-NEXT: vaddps %zmm1, %zmm0, %zmm0
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, (%esp)
; X32-NEXT: calll _func_float16_ptr
; X32-NEXT: vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0 ## 64-byte Folded Reload
; X32-NEXT: vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: retl
;
; WIN32-LABEL: testf16_regs:
; WIN32: # %bb.0:
; WIN32-NEXT: pushl %ebp
; WIN32-NEXT: movl %esp, %ebp
; WIN32-NEXT: andl $-64, %esp
; WIN32-NEXT: subl $192, %esp
; WIN32-NEXT: vmovaps %zmm1, (%esp) # 64-byte Spill
; WIN32-NEXT: vaddps %zmm1, %zmm0, %zmm0
; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: pushl %eax
; WIN32-NEXT: calll _func_float16_ptr
; WIN32-NEXT: addl $4, %esp
; WIN32-NEXT: vaddps (%esp), %zmm0, %zmm0 # 64-byte Folded Reload
; WIN32-NEXT: vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
; WIN32-NEXT: movl %ebp, %esp
; WIN32-NEXT: popl %ebp
; WIN32-NEXT: retl
;
; WIN64-LABEL: testf16_regs:
; WIN64: # %bb.0:
; WIN64-NEXT: pushq %rbp
; WIN64-NEXT: subq $176, %rsp
; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-NEXT: andq $-64, %rsp
; WIN64-NEXT: vmovaps (%rdx), %zmm16
; WIN64-NEXT: vaddps (%rcx), %zmm16, %zmm0
; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-NEXT: callq func_float16_ptr
; WIN64-NEXT: vaddps %zmm16, %zmm0, %zmm0
; WIN64-NEXT: vaddps {{[0-9]+}}(%rsp), %zmm0, %zmm0
; WIN64-NEXT: leaq 48(%rbp), %rsp
; WIN64-NEXT: popq %rbp
; WIN64-NEXT: retq
;
; X64-LABEL: testf16_regs:
; X64: ## %bb.0:
; X64-NEXT: pushq %rbp
; X64-NEXT: movq %rsp, %rbp
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: andq $-64, %rsp
; X64-NEXT: subq $128, %rsp
; X64-NEXT: vmovaps %zmm1, %zmm16
; X64-NEXT: vaddps %zmm1, %zmm0, %zmm0
; X64-NEXT: movq %rsp, %rdi
; X64-NEXT: callq _func_float16_ptr
; X64-NEXT: vaddps %zmm16, %zmm0, %zmm0
; X64-NEXT: vaddps (%rsp), %zmm0, %zmm0
; X64-NEXT: leaq -16(%rbp), %rsp
; X64-NEXT: popq %r12
; X64-NEXT: popq %r13
; X64-NEXT: popq %rbp
; X64-NEXT: retq
  %y = alloca <16 x float>, align 16
  %x = fadd <16 x float> %a, %b
  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>, <16 x float>* %y, align 16
  %3 = fadd <16 x float> %1, %b
  %4 = fadd <16 x float> %2, %3
  ret <16 x float> %4
}
; test calling conventions - prolog and epilog
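; test_prolog_epilog is itself intel_ocl_bicc, so its own prologue/epilogue must
; preserve the convention's callee-saved state around the inner call: the checks
; below expect %k4-%k7 plus %zmm6-%zmm21 to be spilled and reloaded on Win64,
; and %k4-%k7 plus %zmm16-%zmm31 on 64-bit Darwin.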
define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
; X32-LABEL: test_prolog_epilog:
; X32: ## %bb.0:
; X32-NEXT: subl $12, %esp
; X32-NEXT: calll _func_float16
; X32-NEXT: addl $12, %esp
; X32-NEXT: retl
;
; WIN32-LABEL: test_prolog_epilog:
; WIN32: # %bb.0:
; WIN32-NEXT: calll _func_float16
; WIN32-NEXT: retl
;
; WIN64-KNL-LABEL: test_prolog_epilog:
; WIN64-KNL: # %bb.0:
; WIN64-KNL-NEXT: pushq %rbp
; WIN64-KNL-NEXT: subq $1328, %rsp # imm = 0x530
; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-KNL-NEXT: kmovw %k7, 1198(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT: kmovw %k6, 1196(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT: kmovw %k5, 1194(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT: kmovw %k4, 1192(%rbp) # 2-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm21, 1104(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm20, 992(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm19, 896(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm18, 832(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm17, 768(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm16, 704(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm15, 640(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm14, 576(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm13, 512(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm12, 448(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm11, 384(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm10, 320(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm9, 256(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm8, 192(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm7, 128(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: vmovaps %zmm6, 64(%rbp) # 64-byte Spill
; WIN64-KNL-NEXT: andq $-64, %rsp
; WIN64-KNL-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp)
; WIN64-KNL-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
; WIN64-KNL-NEXT: callq func_float16
; WIN64-KNL-NEXT: vmovaps 64(%rbp), %zmm6 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 128(%rbp), %zmm7 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 192(%rbp), %zmm8 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 256(%rbp), %zmm9 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 320(%rbp), %zmm10 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 384(%rbp), %zmm11 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 448(%rbp), %zmm12 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 512(%rbp), %zmm13 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 576(%rbp), %zmm14 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 640(%rbp), %zmm15 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 704(%rbp), %zmm16 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 768(%rbp), %zmm17 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 832(%rbp), %zmm18 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 896(%rbp), %zmm19 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 992(%rbp), %zmm20 # 64-byte Reload
; WIN64-KNL-NEXT: vmovaps 1104(%rbp), %zmm21 # 64-byte Reload
; WIN64-KNL-NEXT: kmovw 1192(%rbp), %k4 # 2-byte Reload
; WIN64-KNL-NEXT: kmovw 1194(%rbp), %k5 # 2-byte Reload
; WIN64-KNL-NEXT: kmovw 1196(%rbp), %k6 # 2-byte Reload
; WIN64-KNL-NEXT: kmovw 1198(%rbp), %k7 # 2-byte Reload
; WIN64-KNL-NEXT: leaq 1200(%rbp), %rsp
; WIN64-KNL-NEXT: popq %rbp
; WIN64-KNL-NEXT: retq
;
; WIN64-SKX-LABEL: test_prolog_epilog:
; WIN64-SKX: # %bb.0:
; WIN64-SKX-NEXT: pushq %rbp
; WIN64-SKX-NEXT: subq $1328, %rsp # imm = 0x530
; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-SKX-NEXT: kmovq %k7, 1192(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT: kmovq %k6, 1184(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT: kmovq %k5, 1176(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT: kmovq %k4, 1168(%rbp) # 8-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm21, 1056(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm20, 960(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm19, 896(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm18, 832(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm17, 768(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm16, 704(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm15, 640(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm14, 576(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm13, 512(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm12, 448(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm11, 384(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm10, 320(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm9, 256(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm8, 192(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm7, 128(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: vmovaps %zmm6, 64(%rbp) # 64-byte Spill
; WIN64-SKX-NEXT: andq $-64, %rsp
; WIN64-SKX-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp)
; WIN64-SKX-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-SKX-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
; WIN64-SKX-NEXT: callq func_float16
; WIN64-SKX-NEXT: vmovaps 64(%rbp), %zmm6 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 128(%rbp), %zmm7 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 192(%rbp), %zmm8 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 256(%rbp), %zmm9 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 320(%rbp), %zmm10 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 384(%rbp), %zmm11 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 448(%rbp), %zmm12 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 512(%rbp), %zmm13 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 576(%rbp), %zmm14 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 640(%rbp), %zmm15 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 704(%rbp), %zmm16 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 768(%rbp), %zmm17 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 832(%rbp), %zmm18 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 896(%rbp), %zmm19 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 960(%rbp), %zmm20 # 64-byte Reload
; WIN64-SKX-NEXT: vmovaps 1056(%rbp), %zmm21 # 64-byte Reload
; WIN64-SKX-NEXT: kmovq 1168(%rbp), %k4 # 8-byte Reload
; WIN64-SKX-NEXT: kmovq 1176(%rbp), %k5 # 8-byte Reload
; WIN64-SKX-NEXT: kmovq 1184(%rbp), %k6 # 8-byte Reload
; WIN64-SKX-NEXT: kmovq 1192(%rbp), %k7 # 8-byte Reload
; WIN64-SKX-NEXT: leaq 1200(%rbp), %rsp
; WIN64-SKX-NEXT: popq %rbp
; WIN64-SKX-NEXT: retq
;
; X64-KNL-LABEL: test_prolog_epilog:
; X64-KNL: ## %bb.0:
; X64-KNL-NEXT: pushq %rsi
; X64-KNL-NEXT: pushq %rdi
; X64-KNL-NEXT: subq $1064, %rsp ## imm = 0x428
; X64-KNL-NEXT: kmovw %k7, {{[0-9]+}}(%rsp) ## 2-byte Spill
; X64-KNL-NEXT: kmovw %k6, {{[0-9]+}}(%rsp) ## 2-byte Spill
; X64-KNL-NEXT: kmovw %k5, {{[0-9]+}}(%rsp) ## 2-byte Spill
; X64-KNL-NEXT: kmovw %k4, {{[0-9]+}}(%rsp) ## 2-byte Spill
; X64-KNL-NEXT: vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-KNL-NEXT: vmovups %zmm16, (%rsp) ## 64-byte Spill
; X64-KNL-NEXT: callq _func_float16
; X64-KNL-NEXT: vmovups (%rsp), %zmm16 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload
; X64-KNL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload
; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k4 ## 2-byte Reload
; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k5 ## 2-byte Reload
; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k6 ## 2-byte Reload
; X64-KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k7 ## 2-byte Reload
; X64-KNL-NEXT: addq $1064, %rsp ## imm = 0x428
; X64-KNL-NEXT: popq %rdi
; X64-KNL-NEXT: popq %rsi
; X64-KNL-NEXT: retq
;
; X64-SKX-LABEL: test_prolog_epilog:
; X64-SKX: ## %bb.0:
; X64-SKX-NEXT: pushq %rsi
; X64-SKX-NEXT: pushq %rdi
; X64-SKX-NEXT: subq $1192, %rsp ## imm = 0x4A8
; X64-SKX-NEXT: kmovq %k7, {{[0-9]+}}(%rsp) ## 8-byte Spill
; X64-SKX-NEXT: kmovq %k6, {{[0-9]+}}(%rsp) ## 8-byte Spill
; X64-SKX-NEXT: kmovq %k5, {{[0-9]+}}(%rsp) ## 8-byte Spill
; X64-SKX-NEXT: kmovq %k4, {{[0-9]+}}(%rsp) ## 8-byte Spill
; X64-SKX-NEXT: vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill
; X64-SKX-NEXT: vmovups %zmm16, (%rsp) ## 64-byte Spill
; X64-SKX-NEXT: callq _func_float16
; X64-SKX-NEXT: vmovups (%rsp), %zmm16 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload
; X64-SKX-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload
; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k4 ## 8-byte Reload
; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k5 ## 8-byte Reload
; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k6 ## 8-byte Reload
; X64-SKX-NEXT: kmovq {{[0-9]+}}(%rsp), %k7 ## 8-byte Reload
; X64-SKX-NEXT: addq $1192, %rsp ## imm = 0x4A8
; X64-SKX-NEXT: popq %rdi
; X64-SKX-NEXT: popq %rsi
; X64-SKX-NEXT: retq
  %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
  ret <16 x float> %c
}
declare <16 x float> @func_float16_mask(<16 x float>, <16 x i1>)
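; The <16 x i1> mask argument is expected in %k1 before each call below,
; materialized with kmovw on KNL and kmovd on SKX when it starts in a GPR.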
define <16 x float> @testf16_inp_mask(<16 x float> %a, i16 %mask) {
; X32-LABEL: testf16_inp_mask:
; X32: ## %bb.0:
; X32-NEXT: subl $12, %esp
; X32-NEXT: .cfi_def_cfa_offset 16
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: calll _func_float16_mask
; X32-NEXT: addl $12, %esp
; X32-NEXT: retl
;
; WIN32-LABEL: testf16_inp_mask:
; WIN32: # %bb.0:
; WIN32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; WIN32-NEXT: calll _func_float16_mask
; WIN32-NEXT: retl
;
; WIN64-KNL-LABEL: testf16_inp_mask:
; WIN64-KNL: # %bb.0:
; WIN64-KNL-NEXT: subq $40, %rsp
; WIN64-KNL-NEXT: .seh_stackalloc 40
; WIN64-KNL-NEXT: .seh_endprologue
; WIN64-KNL-NEXT: # kill: def $dx killed $dx def $edx
; WIN64-KNL-NEXT: vmovaps (%rcx), %zmm0
; WIN64-KNL-NEXT: kmovw %edx, %k1
; WIN64-KNL-NEXT: callq func_float16_mask
; WIN64-KNL-NEXT: nop
; WIN64-KNL-NEXT: addq $40, %rsp
; WIN64-KNL-NEXT: retq
; WIN64-KNL-NEXT: .seh_handlerdata
; WIN64-KNL-NEXT: .text
; WIN64-KNL-NEXT: .seh_endproc
;
; WIN64-SKX-LABEL: testf16_inp_mask:
; WIN64-SKX: # %bb.0:
; WIN64-SKX-NEXT: subq $40, %rsp
; WIN64-SKX-NEXT: .seh_stackalloc 40
; WIN64-SKX-NEXT: .seh_endprologue
; WIN64-SKX-NEXT: # kill: def $dx killed $dx def $edx
; WIN64-SKX-NEXT: vmovaps (%rcx), %zmm0
; WIN64-SKX-NEXT: kmovd %edx, %k1
; WIN64-SKX-NEXT: callq func_float16_mask
; WIN64-SKX-NEXT: nop
; WIN64-SKX-NEXT: addq $40, %rsp
; WIN64-SKX-NEXT: retq
; WIN64-SKX-NEXT: .seh_handlerdata
; WIN64-SKX-NEXT: .text
; WIN64-SKX-NEXT: .seh_endproc
;
; X64-KNL-LABEL: testf16_inp_mask:
; X64-KNL: ## %bb.0:
; X64-KNL-NEXT: pushq %rbp
; X64-KNL-NEXT: .cfi_def_cfa_offset 16
; X64-KNL-NEXT: pushq %r13
; X64-KNL-NEXT: .cfi_def_cfa_offset 24
; X64-KNL-NEXT: pushq %r12
; X64-KNL-NEXT: .cfi_def_cfa_offset 32
; X64-KNL-NEXT: .cfi_offset %r12, -32
; X64-KNL-NEXT: .cfi_offset %r13, -24
; X64-KNL-NEXT: .cfi_offset %rbp, -16
; X64-KNL-NEXT: kmovw %edi, %k1
; X64-KNL-NEXT: callq _func_float16_mask
; X64-KNL-NEXT: popq %r12
; X64-KNL-NEXT: popq %r13
; X64-KNL-NEXT: popq %rbp
; X64-KNL-NEXT: retq
;
; X64-SKX-LABEL: testf16_inp_mask:
; X64-SKX: ## %bb.0:
; X64-SKX-NEXT: pushq %rbp
; X64-SKX-NEXT: .cfi_def_cfa_offset 16
; X64-SKX-NEXT: pushq %r13
; X64-SKX-NEXT: .cfi_def_cfa_offset 24
; X64-SKX-NEXT: pushq %r12
; X64-SKX-NEXT: .cfi_def_cfa_offset 32
; X64-SKX-NEXT: .cfi_offset %r12, -32
; X64-SKX-NEXT: .cfi_offset %r13, -24
; X64-SKX-NEXT: .cfi_offset %rbp, -16
; X64-SKX-NEXT: kmovd %edi, %k1
; X64-SKX-NEXT: callq _func_float16_mask
; X64-SKX-NEXT: popq %r12
; X64-SKX-NEXT: popq %r13
; X64-SKX-NEXT: popq %rbp
; X64-SKX-NEXT: retq
  %imask = bitcast i16 %mask to <16 x i1>
  %1 = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1> %imask)
  ret <16 x float> %1
}
define intel_ocl_bicc <16 x float> @test_prolog_epilog_with_mask(<16 x float> %a, <16 x i32> %x1, <16 x i32> %x2, <16 x i1> %mask) nounwind {
; X32-LABEL: test_prolog_epilog_with_mask:
; X32: ## %bb.0:
; X32-NEXT: subl $12, %esp
; X32-NEXT: vpcmpeqd %zmm2, %zmm1, %k0
; X32-NEXT: kxorw %k1, %k0, %k1
; X32-NEXT: calll _func_float16_mask
; X32-NEXT: addl $12, %esp
; X32-NEXT: retl
;
; WIN32-LABEL: test_prolog_epilog_with_mask:
; WIN32: # %bb.0:
; WIN32-NEXT: vpcmpeqd %zmm2, %zmm1, %k0
; WIN32-NEXT: kxorw %k1, %k0, %k1
; WIN32-NEXT: calll _func_float16_mask
; WIN32-NEXT: retl
;
; WIN64-LABEL: test_prolog_epilog_with_mask:
; WIN64: # %bb.0:
; WIN64-NEXT: subq $40, %rsp
; WIN64-NEXT: vpcmpeqd %zmm2, %zmm1, %k0
; WIN64-NEXT: kxorw %k1, %k0, %k1
; WIN64-NEXT: callq func_float16_mask
; WIN64-NEXT: addq $40, %rsp
; WIN64-NEXT: retq
;
; X64-LABEL: test_prolog_epilog_with_mask:
; X64: ## %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: vpcmpeqd %zmm2, %zmm1, %k0
; X64-NEXT: kxorw %k1, %k0, %k1
; X64-NEXT: callq _func_float16_mask
; X64-NEXT: popq %rax
; X64-NEXT: retq
  %cmp_res = icmp eq <16 x i32> %x1, %x2
  %mask1 = xor <16 x i1> %cmp_res, %mask
  %c = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1> %mask1)
  ret <16 x float> %c
}