1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s
4 ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck -check-prefix=O0 %s
; 3072-byte, 64-byte-aligned global buffer. The tile loads and stores in the
; tests below address it at byte offsets 0, 1024 and 2048.
6 @buf = dso_local global [3072 x i8] zeroinitializer, align 64
; Internal, argument-less helper. The test functions below are expected to
; call it (see their `callq foo` assertions); the assertions kept here only
; pin the entry-block label for each of the three run configurations.
8 define internal void @foo() {
10 ; CHECK: # %bb.0: # %entry
14 ; IPRA: # %bb.0: # %entry
18 ; O0: # %bb.0: # %entry
; test_api: in the expected code two tiles are loaded (from @buf and
; @buf+1024) before a call to foo; afterwards a third tile is loaded from
; @buf+2048, tdpbssd combines the three, and the result is stored back to
; @buf+2048. The three assertion groups show how tile state is handled
; around that call in each run configuration.
24 define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
; Default run (no extra flags): both tiles that are live across the call are
; spilled to the stack before `callq foo` and reloaded afterwards, and
; ldtilecfg is reissued immediately after the call.
25 ; CHECK-LABEL: test_api:
27 ; CHECK-NEXT: pushq %rbp
28 ; CHECK-NEXT: pushq %r15
29 ; CHECK-NEXT: pushq %r14
30 ; CHECK-NEXT: pushq %rbx
31 ; CHECK-NEXT: subq $2120, %rsp # imm = 0x848
32 ; CHECK-NEXT: movl %esi, %ebx
33 ; CHECK-NEXT: movl %edi, %ebp
34 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
35 ; CHECK-NEXT: vmovups %zmm0, (%rsp)
36 ; CHECK-NEXT: movb $1, (%rsp)
37 ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
38 ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
39 ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
40 ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
41 ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
42 ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
43 ; CHECK-NEXT: ldtilecfg (%rsp)
44 ; CHECK-NEXT: movl $buf, %eax
45 ; CHECK-NEXT: movl $32, %r14d
46 ; CHECK-NEXT: movw $8, %r15w
47 ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm1
48 ; CHECK-NEXT: movabsq $64, %rax
49 ; CHECK-NEXT: tilestored %tmm1, 1088(%rsp,%rax) # 1024-byte Folded Spill
50 ; CHECK-NEXT: movl $buf+1024, %eax
51 ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2
52 ; CHECK-NEXT: movabsq $64, %rax
53 ; CHECK-NEXT: tilestored %tmm2, 64(%rsp,%rax) # 1024-byte Folded Spill
54 ; CHECK-NEXT: vzeroupper
55 ; CHECK-NEXT: callq foo
56 ; CHECK-NEXT: ldtilecfg (%rsp)
57 ; CHECK-NEXT: movl $buf+2048, %eax
58 ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0
59 ; CHECK-NEXT: movabsq $64, %rcx
60 ; CHECK-NEXT: tileloadd 1088(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload
61 ; CHECK-NEXT: tileloadd 64(%rsp,%rcx), %tmm2 # 1024-byte Folded Reload
62 ; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
63 ; CHECK-NEXT: tilestored %tmm0, (%rax,%r14)
64 ; CHECK-NEXT: addq $2120, %rsp # imm = 0x848
65 ; CHECK-NEXT: popq %rbx
66 ; CHECK-NEXT: popq %r14
67 ; CHECK-NEXT: popq %r15
68 ; CHECK-NEXT: popq %rbp
69 ; CHECK-NEXT: tilerelease
; Run with -enable-ipra, a single ldtilecfg is expected and the two tiles stay
; in registers across `callq foo` (no spill, no reload, no second ldtilecfg).
; NOTE(review): presumably IPRA can see that foo leaves tile state intact;
; confirm against the pass that inserts the tile config.
72 ; IPRA-LABEL: test_api:
74 ; IPRA-NEXT: subq $72, %rsp
75 ; IPRA-NEXT: vxorps %xmm0, %xmm0, %xmm0
76 ; IPRA-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
77 ; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp)
78 ; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp)
79 ; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp)
80 ; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp)
81 ; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp)
82 ; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp)
83 ; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp)
84 ; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
85 ; IPRA-NEXT: movl $buf, %eax
86 ; IPRA-NEXT: movl $32, %ecx
87 ; IPRA-NEXT: movw $8, %dx
88 ; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm0
89 ; IPRA-NEXT: movl $buf+1024, %eax
90 ; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm1
91 ; IPRA-NEXT: callq foo
92 ; IPRA-NEXT: movl $buf+2048, %eax
93 ; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm2
94 ; IPRA-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
95 ; IPRA-NEXT: tilestored %tmm2, (%rax,%rcx)
96 ; IPRA-NEXT: addq $72, %rsp
97 ; IPRA-NEXT: tilerelease
98 ; IPRA-NEXT: vzeroupper
; Run at -O0, the stack is realigned to 1024 bytes, tiles are passed between
; operations through stack slots (tilestored / tileloadd via leaq-computed
; addresses), and ldtilecfg is emitted repeatedly, once ahead of each group
; of tile instructions.
101 ; O0-LABEL: test_api:
103 ; O0-NEXT: pushq %rbp
104 ; O0-NEXT: movq %rsp, %rbp
105 ; O0-NEXT: andq $-1024, %rsp # imm = 0xFC00
106 ; O0-NEXT: subq $8192, %rsp # imm = 0x2000
107 ; O0-NEXT: vxorps %xmm0, %xmm0, %xmm0
108 ; O0-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
109 ; O0-NEXT: movb $1, {{[0-9]+}}(%rsp)
110 ; O0-NEXT: movw %si, %cx
111 ; O0-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
112 ; O0-NEXT: movw %di, %ax
113 ; O0-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
114 ; O0-NEXT: movl $buf, %esi
115 ; O0-NEXT: movl $32, %edi
116 ; O0-NEXT: movw $8, %dx
117 ; O0-NEXT: # implicit-def: $al
118 ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
119 ; O0-NEXT: movw %dx, {{[0-9]+}}(%rsp)
120 ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
121 ; O0-NEXT: tileloadd (%rsi,%rdi), %tmm0
122 ; O0-NEXT: movl $64, %edi
123 ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
124 ; O0-NEXT: movw $8, %dx
125 ; O0-NEXT: tilestored %tmm0, (%rsi,%rdi)
126 ; O0-NEXT: movl $32, %esi
127 ; O0-NEXT: movl $buf+1024, %edx
128 ; O0-NEXT: movw $8, %ax
129 ; O0-NEXT: # implicit-def: $al
130 ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
131 ; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp)
132 ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
133 ; O0-NEXT: tileloadd (%rdx,%rsi), %tmm0
134 ; O0-NEXT: movl $64, %esi
135 ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
136 ; O0-NEXT: movw $8, %ax
137 ; O0-NEXT: tilestored %tmm0, (%rdx,%rsi)
138 ; O0-NEXT: vzeroupper
140 ; O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
141 ; O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
142 ; O0-NEXT: # implicit-def: $al
143 ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
144 ; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp)
145 ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
146 ; O0-NEXT: movl $32, %esi
147 ; O0-NEXT: movl $buf+2048, %edx
148 ; O0-NEXT: tileloadd (%rdx,%rsi), %tmm0
149 ; O0-NEXT: movl $64, %esi
150 ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
151 ; O0-NEXT: tilestored %tmm0, (%rdx,%rsi)
152 ; O0-NEXT: movl $64, %edi
153 ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
154 ; O0-NEXT: movw $8, %si
155 ; O0-NEXT: # implicit-def: $al
156 ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
157 ; O0-NEXT: movw %si, {{[0-9]+}}(%rsp)
158 ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
159 ; O0-NEXT: tileloadd (%rdx,%rdi), %tmm0
160 ; O0-NEXT: movabsq $64, %rdx
161 ; O0-NEXT: tilestored %tmm0, 1024(%rsp,%rdx) # 1024-byte Folded Spill
162 ; O0-NEXT: movl $64, %r8d
163 ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
164 ; O0-NEXT: movw $8, %dx
165 ; O0-NEXT: # implicit-def: $al
166 ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
167 ; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp)
168 ; O0-NEXT: # implicit-def: $al
169 ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
170 ; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp)
171 ; O0-NEXT: # implicit-def: $al
172 ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
173 ; O0-NEXT: movw %si, {{[0-9]+}}(%rsp)
174 ; O0-NEXT: # implicit-def: $al
175 ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
176 ; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp)
177 ; O0-NEXT: # implicit-def: $dl
178 ; O0-NEXT: movb %dl, {{[0-9]+}}(%rsp)
179 ; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp)
180 ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
181 ; O0-NEXT: tileloadd (%rdi,%r8), %tmm2
182 ; O0-NEXT: movl $64, %edi
183 ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
184 ; O0-NEXT: tileloadd (%rdx,%rdi), %tmm0
185 ; O0-NEXT: movw $8, %dx
186 ; O0-NEXT: movabsq $64, %rdi
187 ; O0-NEXT: tileloadd 1024(%rsp,%rdi), %tmm1 # 1024-byte Folded Reload
188 ; O0-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
189 ; O0-NEXT: movl $64, %esi
190 ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
191 ; O0-NEXT: tilestored %tmm0, (%rdx,%rsi)
192 ; O0-NEXT: movl $64, %esi
193 ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
194 ; O0-NEXT: tileloadd (%rdx,%rsi), %tmm0
195 ; O0-NEXT: movl $32, %esi
196 ; O0-NEXT: movl $buf+2048, %edx
197 ; O0-NEXT: tilestored %tmm0, (%rdx,%rsi)
198 ; O0-NEXT: movq %rbp, %rsp
200 ; O0-NEXT: tilerelease
; IR under test: %3 and %4 are loaded from @buf and @buf+1024, %5 from
; @buf+2048; tdpbssd takes %5, %3 and %4 as its tile operands and the result
; %6 is stored back to @buf+2048.
202 %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32)
203 %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32)
205 %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32)
206 %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
207 tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %6)
; test_loop: branching control flow around tile code. A loop block loads a
; tile from @buf and stores it to @buf+2048; a separate block loads and
; stores a tile at @buf+1024. The expected code calls foo at several points,
; including between the load and the store inside the loop.
211 define dso_local i32 @test_loop(i32 %0) nounwind {
; Default run (no extra flags): every `callq foo` is followed by
; `ldtilecfg (%rsp)`, and the tile that is live across the call inside the
; loop is spilled before it and reloaded after it.
212 ; CHECK-LABEL: test_loop:
214 ; CHECK-NEXT: pushq %rbp
215 ; CHECK-NEXT: pushq %r15
216 ; CHECK-NEXT: pushq %r14
217 ; CHECK-NEXT: pushq %r13
218 ; CHECK-NEXT: pushq %r12
219 ; CHECK-NEXT: pushq %rbx
220 ; CHECK-NEXT: subq $1096, %rsp # imm = 0x448
221 ; CHECK-NEXT: movl %edi, %ebx
222 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
223 ; CHECK-NEXT: vmovups %zmm0, (%rsp)
224 ; CHECK-NEXT: movb $1, (%rsp)
225 ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
226 ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
227 ; CHECK-NEXT: vzeroupper
228 ; CHECK-NEXT: callq foo
229 ; CHECK-NEXT: ldtilecfg (%rsp)
230 ; CHECK-NEXT: testl %ebx, %ebx
231 ; CHECK-NEXT: jg .LBB2_4
232 ; CHECK-NEXT: # %bb.1: # %.preheader
233 ; CHECK-NEXT: movl $7, %ebp
234 ; CHECK-NEXT: movl $buf, %r14d
235 ; CHECK-NEXT: movl $32, %r15d
236 ; CHECK-NEXT: movw $8, %r12w
237 ; CHECK-NEXT: movl $buf+2048, %r13d
238 ; CHECK-NEXT: .p2align 4, 0x90
239 ; CHECK-NEXT: .LBB2_2: # =>This Inner Loop Header: Depth=1
240 ; CHECK-NEXT: tileloadd (%r14,%r15), %tmm0
241 ; CHECK-NEXT: movabsq $64, %rax
242 ; CHECK-NEXT: tilestored %tmm0, 64(%rsp,%rax) # 1024-byte Folded Spill
243 ; CHECK-NEXT: callq foo
244 ; CHECK-NEXT: ldtilecfg (%rsp)
245 ; CHECK-NEXT: movabsq $64, %rax
246 ; CHECK-NEXT: tileloadd 64(%rsp,%rax), %tmm0 # 1024-byte Folded Reload
247 ; CHECK-NEXT: tilestored %tmm0, (%r13,%r15)
248 ; CHECK-NEXT: callq foo
249 ; CHECK-NEXT: ldtilecfg (%rsp)
250 ; CHECK-NEXT: decl %ebp
251 ; CHECK-NEXT: cmpl $7, %ebp
252 ; CHECK-NEXT: jne .LBB2_2
253 ; CHECK-NEXT: # %bb.3:
254 ; CHECK-NEXT: cmpl $3, %ebx
255 ; CHECK-NEXT: jne .LBB2_4
256 ; CHECK-NEXT: # %bb.6:
257 ; CHECK-NEXT: testl %ebp, %ebp
258 ; CHECK-NEXT: jne .LBB2_5
259 ; CHECK-NEXT: # %bb.7:
260 ; CHECK-NEXT: incl %ebx
261 ; CHECK-NEXT: jmp .LBB2_8
262 ; CHECK-NEXT: .LBB2_4:
263 ; CHECK-NEXT: callq foo
264 ; CHECK-NEXT: ldtilecfg (%rsp)
265 ; CHECK-NEXT: movl $32, %eax
266 ; CHECK-NEXT: movl $buf+1024, %ecx
267 ; CHECK-NEXT: movw $8, %dx
268 ; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm0
269 ; CHECK-NEXT: tilestored %tmm0, (%rcx,%rax)
270 ; CHECK-NEXT: .LBB2_5:
271 ; CHECK-NEXT: decl %ebx
272 ; CHECK-NEXT: .LBB2_8:
273 ; CHECK-NEXT: movl %ebx, %eax
274 ; CHECK-NEXT: addq $1096, %rsp # imm = 0x448
275 ; CHECK-NEXT: popq %rbx
276 ; CHECK-NEXT: popq %r12
277 ; CHECK-NEXT: popq %r13
278 ; CHECK-NEXT: popq %r14
279 ; CHECK-NEXT: popq %r15
280 ; CHECK-NEXT: popq %rbp
281 ; CHECK-NEXT: tilerelease
; Run with -enable-ipra, ldtilecfg is expected once, before the first call,
; and the tile loaded in the loop stays in %tmm0 across `callq foo` with no
; spill or reload.
284 ; IPRA-LABEL: test_loop:
286 ; IPRA-NEXT: subq $72, %rsp
287 ; IPRA-NEXT: movl %edi, %eax
288 ; IPRA-NEXT: vxorps %xmm0, %xmm0, %xmm0
289 ; IPRA-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
290 ; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp)
291 ; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp)
292 ; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp)
293 ; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
294 ; IPRA-NEXT: callq foo
295 ; IPRA-NEXT: testl %edi, %edi
296 ; IPRA-NEXT: jg .LBB2_4
297 ; IPRA-NEXT: # %bb.1: # %.preheader
298 ; IPRA-NEXT: movl $7, %ecx
299 ; IPRA-NEXT: movl $buf, %edx
300 ; IPRA-NEXT: movl $32, %esi
301 ; IPRA-NEXT: movw $8, %di
302 ; IPRA-NEXT: movl $buf+2048, %r8d
303 ; IPRA-NEXT: .p2align 4, 0x90
304 ; IPRA-NEXT: .LBB2_2: # =>This Inner Loop Header: Depth=1
305 ; IPRA-NEXT: tileloadd (%rdx,%rsi), %tmm0
306 ; IPRA-NEXT: callq foo
307 ; IPRA-NEXT: tilestored %tmm0, (%r8,%rsi)
308 ; IPRA-NEXT: callq foo
309 ; IPRA-NEXT: decl %ecx
310 ; IPRA-NEXT: cmpl $7, %ecx
311 ; IPRA-NEXT: jne .LBB2_2
312 ; IPRA-NEXT: # %bb.3:
313 ; IPRA-NEXT: cmpl $3, %eax
314 ; IPRA-NEXT: jne .LBB2_4
315 ; IPRA-NEXT: # %bb.6:
316 ; IPRA-NEXT: testl %ecx, %ecx
317 ; IPRA-NEXT: jne .LBB2_5
318 ; IPRA-NEXT: # %bb.7:
319 ; IPRA-NEXT: incl %eax
320 ; IPRA-NEXT: jmp .LBB2_8
321 ; IPRA-NEXT: .LBB2_4:
322 ; IPRA-NEXT: callq foo
323 ; IPRA-NEXT: movl $32, %ecx
324 ; IPRA-NEXT: movl $buf+1024, %edx
325 ; IPRA-NEXT: movw $8, %si
326 ; IPRA-NEXT: tileloadd (%rdx,%rcx), %tmm0
327 ; IPRA-NEXT: tilestored %tmm0, (%rdx,%rcx)
328 ; IPRA-NEXT: .LBB2_5:
329 ; IPRA-NEXT: decl %eax
330 ; IPRA-NEXT: .LBB2_8:
331 ; IPRA-NEXT: addq $72, %rsp
332 ; IPRA-NEXT: tilerelease
333 ; IPRA-NEXT: vzeroupper
; Run at -O0, the stack is realigned to 1024 bytes, scalar values are kept
; in 4-byte spill slots between blocks, and each tile load is preceded by
; its own ldtilecfg, with tiles passed through stack slots.
336 ; O0-LABEL: test_loop:
338 ; O0-NEXT: pushq %rbp
339 ; O0-NEXT: movq %rsp, %rbp
340 ; O0-NEXT: andq $-1024, %rsp # imm = 0xFC00
341 ; O0-NEXT: subq $4096, %rsp # imm = 0x1000
342 ; O0-NEXT: vxorps %xmm0, %xmm0, %xmm0
343 ; O0-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
344 ; O0-NEXT: movb $1, {{[0-9]+}}(%rsp)
345 ; O0-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
346 ; O0-NEXT: vzeroupper
349 ; O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
350 ; O0-NEXT: xorl %eax, %eax
351 ; O0-NEXT: cmpl $0, %ecx
352 ; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
353 ; O0-NEXT: jg .LBB2_4
354 ; O0-NEXT: jmp .LBB2_3
356 ; O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
357 ; O0-NEXT: cmpl $3, %eax
358 ; O0-NEXT: je .LBB2_5
359 ; O0-NEXT: jmp .LBB2_4
360 ; O0-NEXT: .LBB2_3: # =>This Inner Loop Header: Depth=1
361 ; O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
362 ; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
363 ; O0-NEXT: movl $buf, %ecx
364 ; O0-NEXT: movl $32, %edx
365 ; O0-NEXT: movw $8, %ax
366 ; O0-NEXT: # implicit-def: $al
367 ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
368 ; O0-NEXT: movw %ax, {{[0-9]+}}(%rsp)
369 ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
370 ; O0-NEXT: tileloadd (%rcx,%rdx), %tmm0
371 ; O0-NEXT: movl $64, %edx
372 ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
373 ; O0-NEXT: movw $8, %ax
374 ; O0-NEXT: tilestored %tmm0, (%rcx,%rdx)
376 ; O0-NEXT: movl $64, %edx
377 ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
378 ; O0-NEXT: movw $8, %ax
379 ; O0-NEXT: # implicit-def: $al
380 ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
381 ; O0-NEXT: movw %ax, {{[0-9]+}}(%rsp)
382 ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
383 ; O0-NEXT: tileloadd (%rcx,%rdx), %tmm0
384 ; O0-NEXT: movl $32, %edx
385 ; O0-NEXT: movl $buf+2048, %ecx
386 ; O0-NEXT: movw $8, %ax
387 ; O0-NEXT: tilestored %tmm0, (%rcx,%rdx)
389 ; O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
390 ; O0-NEXT: addl $1, %eax
391 ; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
392 ; O0-NEXT: cmpl $0, %eax
393 ; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
394 ; O0-NEXT: je .LBB2_2
395 ; O0-NEXT: jmp .LBB2_3
398 ; O0-NEXT: movl $32, %edx
399 ; O0-NEXT: movl $buf+1024, %ecx
400 ; O0-NEXT: movw $8, %ax
401 ; O0-NEXT: # implicit-def: $al
402 ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
403 ; O0-NEXT: movw %ax, {{[0-9]+}}(%rsp)
404 ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
405 ; O0-NEXT: tileloadd (%rcx,%rdx), %tmm0
406 ; O0-NEXT: movl $64, %edx
407 ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
408 ; O0-NEXT: movw $8, %ax
409 ; O0-NEXT: tilestored %tmm0, (%rcx,%rdx)
410 ; O0-NEXT: movl $64, %edx
411 ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
412 ; O0-NEXT: movw $8, %ax
413 ; O0-NEXT: # implicit-def: $al
414 ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
415 ; O0-NEXT: movw %ax, {{[0-9]+}}(%rsp)
416 ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
417 ; O0-NEXT: tileloadd (%rcx,%rdx), %tmm0
418 ; O0-NEXT: movl $32, %edx
419 ; O0-NEXT: movl $buf+1024, %ecx
420 ; O0-NEXT: movw $8, %ax
421 ; O0-NEXT: tilestored %tmm0, (%rcx,%rdx)
422 ; O0-NEXT: jmp .LBB2_7
424 ; O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
425 ; O0-NEXT: cmpl $7, %eax
426 ; O0-NEXT: jne .LBB2_7
428 ; O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
429 ; O0-NEXT: addl $1, %eax
430 ; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
431 ; O0-NEXT: jmp .LBB2_8
433 ; O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
434 ; O0-NEXT: subl $1, %eax
435 ; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
437 ; O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
438 ; O0-NEXT: movq %rbp, %rsp
440 ; O0-NEXT: tilerelease
; IR under test: the entry compares %0 against 0 and branches either to the
; @buf+1024 load/store block (%11) or to the loop block (%6), whose tile is
; loaded from @buf and stored to @buf+2048.
445 %3 = icmp sgt i32 %0, 0
446 br i1 %3, label %11, label %6
448 %5 = icmp eq i32 %0, 3
449 br i1 %5, label %13, label %11
451 %7 = phi i32 [ %9, %6 ], [ 0, %2 ]
452 %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr @buf, i64 32)
454 tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %8)
457 %10 = icmp eq i32 %9, 0
458 br i1 %10, label %4, label %6
461 %12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32)
462 tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32, x86_amx %12)
465 %14 = icmp eq i32 %9, 7
466 br i1 %14, label %15, label %17
474 %20 = phi i32 [ %16, %15 ], [ %18, %17 ]
; test_loop2: a loop whose header calls foo; when %0 > 0 the body loads a
; tile from @buf, calls foo, stores the tile to @buf+2048, calls foo again
; and jumps back to the header, otherwise the loop is left.
478 define dso_local void @test_loop2(i32 %0) nounwind {
; Default run (no extra flags): the tile is spilled and reloaded around the
; call that sits between its load and its store. The last call in the loop
; body is followed directly by the back-edge jump; the next ldtilecfg is the
; one after the call in the loop header.
479 ; CHECK-LABEL: test_loop2:
481 ; CHECK-NEXT: pushq %rbp
482 ; CHECK-NEXT: pushq %r15
483 ; CHECK-NEXT: pushq %r14
484 ; CHECK-NEXT: pushq %r12
485 ; CHECK-NEXT: pushq %rbx
486 ; CHECK-NEXT: subq $1088, %rsp # imm = 0x440
487 ; CHECK-NEXT: movl %edi, %ebx
488 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
489 ; CHECK-NEXT: vmovups %zmm0, (%rsp)
490 ; CHECK-NEXT: movb $1, (%rsp)
491 ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
492 ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
493 ; CHECK-NEXT: movl $buf, %r14d
494 ; CHECK-NEXT: movl $32, %r15d
495 ; CHECK-NEXT: movw $8, %bp
496 ; CHECK-NEXT: movl $buf+2048, %r12d
497 ; CHECK-NEXT: .p2align 4, 0x90
498 ; CHECK-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1
499 ; CHECK-NEXT: vzeroupper
500 ; CHECK-NEXT: callq foo
501 ; CHECK-NEXT: ldtilecfg (%rsp)
502 ; CHECK-NEXT: testl %ebx, %ebx
503 ; CHECK-NEXT: jle .LBB3_3
504 ; CHECK-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1
505 ; CHECK-NEXT: tileloadd (%r14,%r15), %tmm0
506 ; CHECK-NEXT: movabsq $64, %rax
507 ; CHECK-NEXT: tilestored %tmm0, 64(%rsp,%rax) # 1024-byte Folded Spill
508 ; CHECK-NEXT: callq foo
509 ; CHECK-NEXT: ldtilecfg (%rsp)
510 ; CHECK-NEXT: movabsq $64, %rax
511 ; CHECK-NEXT: tileloadd 64(%rsp,%rax), %tmm0 # 1024-byte Folded Reload
512 ; CHECK-NEXT: tilestored %tmm0, (%r12,%r15)
513 ; CHECK-NEXT: callq foo
514 ; CHECK-NEXT: jmp .LBB3_1
515 ; CHECK-NEXT: .LBB3_3:
516 ; CHECK-NEXT: addq $1088, %rsp # imm = 0x440
517 ; CHECK-NEXT: popq %rbx
518 ; CHECK-NEXT: popq %r12
519 ; CHECK-NEXT: popq %r14
520 ; CHECK-NEXT: popq %r15
521 ; CHECK-NEXT: popq %rbp
522 ; CHECK-NEXT: tilerelease
; Run with -enable-ipra, ldtilecfg is expected once, ahead of the loop, and
; the tile stays in %tmm0 across `callq foo` with no spill or reload.
525 ; IPRA-LABEL: test_loop2:
527 ; IPRA-NEXT: subq $72, %rsp
528 ; IPRA-NEXT: vxorps %xmm0, %xmm0, %xmm0
529 ; IPRA-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
530 ; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp)
531 ; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp)
532 ; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp)
533 ; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
534 ; IPRA-NEXT: movl $buf, %eax
535 ; IPRA-NEXT: movl $32, %ecx
536 ; IPRA-NEXT: movw $8, %dx
537 ; IPRA-NEXT: movl $buf+2048, %esi
538 ; IPRA-NEXT: .p2align 4, 0x90
539 ; IPRA-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1
540 ; IPRA-NEXT: callq foo
541 ; IPRA-NEXT: testl %edi, %edi
542 ; IPRA-NEXT: jle .LBB3_3
543 ; IPRA-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1
544 ; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm0
545 ; IPRA-NEXT: callq foo
546 ; IPRA-NEXT: tilestored %tmm0, (%rsi,%rcx)
547 ; IPRA-NEXT: callq foo
548 ; IPRA-NEXT: jmp .LBB3_1
549 ; IPRA-NEXT: .LBB3_3:
550 ; IPRA-NEXT: addq $72, %rsp
551 ; IPRA-NEXT: tilerelease
552 ; IPRA-NEXT: vzeroupper
; Run at -O0, the stack is realigned to 1024 bytes, the loop counter lives
; in a 4-byte spill slot, and each tile load is preceded by its own
; ldtilecfg, with the tile passed through a stack slot.
555 ; O0-LABEL: test_loop2:
557 ; O0-NEXT: pushq %rbp
558 ; O0-NEXT: movq %rsp, %rbp
559 ; O0-NEXT: andq $-1024, %rsp # imm = 0xFC00
560 ; O0-NEXT: subq $3072, %rsp # imm = 0xC00
561 ; O0-NEXT: vxorps %xmm0, %xmm0, %xmm0
562 ; O0-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
563 ; O0-NEXT: movb $1, {{[0-9]+}}(%rsp)
564 ; O0-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
565 ; O0-NEXT: xorl %eax, %eax
566 ; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
567 ; O0-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1
568 ; O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
569 ; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
570 ; O0-NEXT: vzeroupper
572 ; O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
573 ; O0-NEXT: cmpl $0, %eax
574 ; O0-NEXT: jle .LBB3_3
575 ; O0-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1
576 ; O0-NEXT: movl $buf, %ecx
577 ; O0-NEXT: movl $32, %edx
578 ; O0-NEXT: movw $8, %ax
579 ; O0-NEXT: # implicit-def: $al
580 ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
581 ; O0-NEXT: movw %ax, {{[0-9]+}}(%rsp)
582 ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
583 ; O0-NEXT: tileloadd (%rcx,%rdx), %tmm0
584 ; O0-NEXT: movl $64, %edx
585 ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
586 ; O0-NEXT: movw $8, %ax
587 ; O0-NEXT: tilestored %tmm0, (%rcx,%rdx)
589 ; O0-NEXT: movl $64, %edx
590 ; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
591 ; O0-NEXT: movw $8, %ax
592 ; O0-NEXT: # implicit-def: $al
593 ; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
594 ; O0-NEXT: movw %ax, {{[0-9]+}}(%rsp)
595 ; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
596 ; O0-NEXT: tileloadd (%rcx,%rdx), %tmm0
597 ; O0-NEXT: movl $32, %edx
598 ; O0-NEXT: movl $buf+2048, %ecx
599 ; O0-NEXT: movw $8, %ax
600 ; O0-NEXT: tilestored %tmm0, (%rcx,%rdx)
602 ; O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
603 ; O0-NEXT: addl $1, %eax
604 ; O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
605 ; O0-NEXT: jmp .LBB3_1
607 ; O0-NEXT: movq %rbp, %rsp
609 ; O0-NEXT: tilerelease
; IR under test: %3 is the loop-carried counter; the branch on %0 > 0 selects
; the block that loads a tile from @buf and stores it to @buf+2048.
613 %3 = phi i32 [ 0, %1 ], [ %7, %5 ]
615 %4 = icmp sgt i32 %0, 0
616 br i1 %4, label %5, label %8
618 %6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr @buf, i64 32)
620 tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %6)
; Declarations of the AMX intrinsics used by the tests above: tile load,
; tile dot-product (tdpbssd) and tile store, all operating on x86_amx values.
628 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
629 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
630 declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)