; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
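
; Exercises the clang __tile1024i wrapper intrinsics at -O0. %struct.__tile1024i_str
; is the IR image of __tile1024i (i16 row, i16 col, padding, and a 1024-byte
; <256 x i32> tile payload); the AVX512 checks below pin down the unoptimized
; ldtilecfg/tileloadd/tdpbssd/tilestored sequences llc emits for it.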
%struct.__tile1024i_str = type <{ i16, i16, [60 x i8], <256 x i32> }>
@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
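
; @test_api builds three tiles a, b and c: when %cond is non-zero they are
; loaded from @buf (if.then), otherwise from @buf2 (if.else); if.end then
; computes c.tile = tdpbssd(c.tile, a.tile, b.tile) and stores c back to @buf.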
define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) #0 {
; AVX512-LABEL: test_api:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: movq %rsp, %rbp
; AVX512-NEXT: andq $-1024, %rsp # imm = 0xFC00
; AVX512-NEXT: subq $25600, %rsp # imm = 0x6400
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, %ax
; AVX512-NEXT: movw %si, %cx
; AVX512-NEXT: movl %edi, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: xorl %esi, %esi
; AVX512-NEXT: movl $1088, %edx # imm = 0x440
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq memset@PLT
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: xorl %esi, %esi
; AVX512-NEXT: movl $1088, %edx # imm = 0x440
; AVX512-NEXT: callq memset@PLT
; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: xorl %esi, %esi
; AVX512-NEXT: movl $1088, %edx # imm = 0x440
; AVX512-NEXT: callq memset@PLT
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: cmpl $0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: je .LBB0_2
; AVX512-NEXT: # %bb.1: # %if.then
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movabsq $buf, %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw (%rax), %si
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw 2(%rax), %dx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: addq $64, %rdx
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movabsq $buf, %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw (%rax), %si
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw 2(%rax), %dx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: addq $64, %rdx
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movabsq $buf, %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw (%rax), %si
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw 2(%rax), %dx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: addq $64, %rdx
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: jmp .LBB0_3
; AVX512-NEXT: .LBB0_2: # %if.else
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movabsq $buf2, %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw (%rax), %si
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw 2(%rax), %dx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: addq $64, %rdx
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movabsq $buf2, %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw (%rax), %si
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw 2(%rax), %dx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: addq $64, %rdx
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movabsq $buf2, %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw (%rax), %si
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw 2(%rax), %dx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: addq $64, %rdx
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: .LBB0_3: # %if.end
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: movl $1088, %edx # imm = 0x440
; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: callq memcpy@PLT
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: callq memcpy@PLT
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa64 64(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 128(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 192(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 256(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 320(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 384(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 448(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 512(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 576(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 640(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 704(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 768(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 832(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 896(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 960(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 1024(%rax), %zmm0
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm16
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm17
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm18
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm19
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm20
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm21
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm22
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm23
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm24
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm25
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm26
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm27
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm28
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm29
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm30
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm31
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT: vmovaps %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm31, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm30, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm29, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm28, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm27, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm26, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm25, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm24, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm23, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm22, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm21, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm20, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm19, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm18, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm17, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm16, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: movl $1024, %edx # imm = 0x400
; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq memcpy@PLT
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: callq memcpy@PLT
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: callq memcpy@PLT
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Reload
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: # kill: def $r8 killed $rax
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm16
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm17
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm18
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm19
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm20
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm21
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm22
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm23
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm24
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm25
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm26
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm27
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm28
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm29
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm30
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm31
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT: vmovaps %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm31, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm30, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm29, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm28, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm27, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm26, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm25, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm24, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm23, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm22, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm21, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm20, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm19, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm18, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm17, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm16, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %r8d
; AVX512-NEXT: movw %r8w, %di
; AVX512-NEXT: shrl $2, %r8d
; AVX512-NEXT: movw %r8w, %r9w
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: # implicit-def: $r9b
; AVX512-NEXT: movb %r9b, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp)
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movl $64, %r8d
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: tileloadd (%r10,%r8), %tmm0
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: tileloadd (%r10,%r8), %tmm1
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: tileloadd (%r10,%r8), %tmm2
; AVX512-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: addq $64, %rdi
; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq memcpy@PLT
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq memcpy@PLT
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX512-NEXT: # kill: def $rdi killed $rax
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: movl $64, %r8d
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: tileloadd (%rdi,%r8), %tmm0
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: movq %rbp, %rsp
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: tilerelease
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %m.addr.i85 = alloca i16, align 2
  %n.addr.i86 = alloca i16, align 2
  %base.addr.i87 = alloca ptr, align 8
  %stride.addr.i88 = alloca i64, align 8
  %tile.addr.i = alloca <256 x i32>, align 64
  %indirect-arg-temp.i5284 = alloca <256 x i32>, align 1024
  %m.addr.i81 = alloca i16, align 2
  %n.addr.i82 = alloca i16, align 2
  %k.addr.i = alloca i16, align 2
  %dst.addr.i83 = alloca <256 x i32>, align 64
  %src1.addr.i = alloca <256 x i32>, align 64
  %src2.addr.i = alloca <256 x i32>, align 64
  %indirect-arg-temp5.i80 = alloca <256 x i32>, align 1024
  %indirect-arg-temp4.i79 = alloca <256 x i32>, align 1024
  %indirect-arg-temp.i78 = alloca <256 x i32>, align 1024
  %m.addr.i74 = alloca i16, align 2
  %n.addr.i75 = alloca i16, align 2
  %base.addr.i76 = alloca ptr, align 8
  %stride.addr.i77 = alloca i64, align 8
  %m.addr.i70 = alloca i16, align 2
  %n.addr.i71 = alloca i16, align 2
  %base.addr.i72 = alloca ptr, align 8
  %stride.addr.i73 = alloca i64, align 8
  %m.addr.i66 = alloca i16, align 2
  %n.addr.i67 = alloca i16, align 2
  %base.addr.i68 = alloca ptr, align 8
  %stride.addr.i69 = alloca i64, align 8
  %m.addr.i62 = alloca i16, align 2
  %n.addr.i63 = alloca i16, align 2
  %base.addr.i64 = alloca ptr, align 8
  %stride.addr.i65 = alloca i64, align 8
  %m.addr.i58 = alloca i16, align 2
  %n.addr.i59 = alloca i16, align 2
  %base.addr.i60 = alloca ptr, align 8
  %stride.addr.i61 = alloca i64, align 8
  %m.addr.i = alloca i16, align 2
  %n.addr.i = alloca i16, align 2
  %base.addr.i56 = alloca ptr, align 8
  %stride.addr.i57 = alloca i64, align 8
  %base.addr.i50 = alloca ptr, align 8
  %stride.addr.i51 = alloca i64, align 8
  %indirect-arg-temp.i52 = alloca <256 x i32>, align 1024
  %c49 = alloca %struct.__tile1024i_str, align 64
  %dst.addr.i44 = alloca ptr, align 8
  %indirect-arg-temp.i = alloca <256 x i32>, align 1024
  %indirect-arg-temp4.i = alloca <256 x i32>, align 1024
  %indirect-arg-temp5.i = alloca <256 x i32>, align 1024
  %b43 = alloca %struct.__tile1024i_str, align 64
  %a42 = alloca %struct.__tile1024i_str, align 64
  %dst.addr.i35 = alloca ptr, align 8
  %base.addr.i36 = alloca ptr, align 8
  %stride.addr.i37 = alloca i64, align 8
  %dst.addr.i28 = alloca ptr, align 8
  %base.addr.i29 = alloca ptr, align 8
  %stride.addr.i30 = alloca i64, align 8
  %dst.addr.i21 = alloca ptr, align 8
  %base.addr.i22 = alloca ptr, align 8
  %stride.addr.i23 = alloca i64, align 8
  %dst.addr.i14 = alloca ptr, align 8
  %base.addr.i15 = alloca ptr, align 8
  %stride.addr.i16 = alloca i64, align 8
  %dst.addr.i7 = alloca ptr, align 8
  %base.addr.i8 = alloca ptr, align 8
  %stride.addr.i9 = alloca i64, align 8
  %dst.addr.i = alloca ptr, align 8
  %base.addr.i = alloca ptr, align 8
  %stride.addr.i = alloca i64, align 8
  %cond.addr = alloca i32, align 4
  %row.addr = alloca i16, align 2
  %col.addr = alloca i16, align 2
  %a = alloca %struct.__tile1024i_str, align 64
  %b = alloca %struct.__tile1024i_str, align 64
  %c = alloca %struct.__tile1024i_str, align 64
  store i32 %cond, ptr %cond.addr, align 4
  store i16 %row, ptr %row.addr, align 2
  store i16 %col, ptr %col.addr, align 2
  call void @llvm.memset.p0.i64(ptr align 64 %a, i8 0, i64 1088, i1 false)
  %0 = load i16, ptr %row.addr, align 2
  store i16 %0, ptr %a, align 64
  %col2 = getelementptr inbounds %struct.__tile1024i_str, ptr %a, i32 0, i32 1
  store i16 8, ptr %col2, align 2
  call void @llvm.memset.p0.i64(ptr align 64 %b, i8 0, i64 1088, i1 false)
  store i16 8, ptr %b, align 64
  %col4 = getelementptr inbounds %struct.__tile1024i_str, ptr %b, i32 0, i32 1
  %1 = load i16, ptr %col.addr, align 2
  store i16 %1, ptr %col4, align 2
  call void @llvm.memset.p0.i64(ptr align 64 %c, i8 0, i64 1088, i1 false)
  %2 = load i16, ptr %row.addr, align 2
  store i16 %2, ptr %c, align 64
  %col6 = getelementptr inbounds %struct.__tile1024i_str, ptr %c, i32 0, i32 1
  %3 = load i16, ptr %col.addr, align 2
  store i16 %3, ptr %col6, align 2
  %4 = load i32, ptr %cond.addr, align 4
  %tobool = icmp ne i32 %4, 0
  br i1 %tobool, label %if.then, label %if.else

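; At -O0 each __tile_loadd below is expanded in place: row/col are reloaded
; from the destination struct, @llvm.x86.tileloadd64.internal is called, and
; the loaded tile is stored into the struct's tile member.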
if.then: ; preds = %entry
  store ptr %a, ptr %dst.addr.i35, align 8
  store ptr @buf, ptr %base.addr.i36, align 8
  store i64 32, ptr %stride.addr.i37, align 8
  %5 = load ptr, ptr %dst.addr.i35, align 8
  %6 = load i16, ptr %5, align 64
  %7 = load ptr, ptr %dst.addr.i35, align 8
  %col.i39 = getelementptr inbounds %struct.__tile1024i_str, ptr %7, i32 0, i32 1
  %8 = load i16, ptr %col.i39, align 2
  %9 = load ptr, ptr %base.addr.i36, align 8
  %10 = load i64, ptr %stride.addr.i37, align 8
  store i16 %6, ptr %m.addr.i, align 2
  store i16 %8, ptr %n.addr.i, align 2
  store ptr %9, ptr %base.addr.i56, align 8
  store i64 %10, ptr %stride.addr.i57, align 8
  %11 = load i16, ptr %m.addr.i, align 2
  %12 = load i16, ptr %n.addr.i, align 2
  %13 = load ptr, ptr %base.addr.i56, align 8
  %14 = load i64, ptr %stride.addr.i57, align 8
  %15 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %11, i16 %12, ptr %13, i64 %14) #2
  %16 = bitcast x86_amx %15 to <256 x i32>
  %17 = load ptr, ptr %dst.addr.i35, align 8
  %tile.i41 = getelementptr inbounds %struct.__tile1024i_str, ptr %17, i32 0, i32 3
  store <256 x i32> %16, ptr %tile.i41, align 64
  store ptr %b, ptr %dst.addr.i28, align 8
  store ptr @buf, ptr %base.addr.i29, align 8
  store i64 32, ptr %stride.addr.i30, align 8
  %18 = load ptr, ptr %dst.addr.i28, align 8
  %19 = load i16, ptr %18, align 64
  %20 = load ptr, ptr %dst.addr.i28, align 8
  %col.i32 = getelementptr inbounds %struct.__tile1024i_str, ptr %20, i32 0, i32 1
  %21 = load i16, ptr %col.i32, align 2
  %22 = load ptr, ptr %base.addr.i29, align 8
  %23 = load i64, ptr %stride.addr.i30, align 8
  store i16 %19, ptr %m.addr.i58, align 2
  store i16 %21, ptr %n.addr.i59, align 2
  store ptr %22, ptr %base.addr.i60, align 8
  store i64 %23, ptr %stride.addr.i61, align 8
  %24 = load i16, ptr %m.addr.i58, align 2
  %25 = load i16, ptr %n.addr.i59, align 2
  %26 = load ptr, ptr %base.addr.i60, align 8
  %27 = load i64, ptr %stride.addr.i61, align 8
  %28 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %24, i16 %25, ptr %26, i64 %27) #2
  %29 = bitcast x86_amx %28 to <256 x i32>
  %30 = load ptr, ptr %dst.addr.i28, align 8
  %tile.i34 = getelementptr inbounds %struct.__tile1024i_str, ptr %30, i32 0, i32 3
  store <256 x i32> %29, ptr %tile.i34, align 64
  store ptr %c, ptr %dst.addr.i21, align 8
  store ptr @buf, ptr %base.addr.i22, align 8
  store i64 32, ptr %stride.addr.i23, align 8
  %31 = load ptr, ptr %dst.addr.i21, align 8
  %32 = load i16, ptr %31, align 64
  %33 = load ptr, ptr %dst.addr.i21, align 8
  %col.i25 = getelementptr inbounds %struct.__tile1024i_str, ptr %33, i32 0, i32 1
  %34 = load i16, ptr %col.i25, align 2
  %35 = load ptr, ptr %base.addr.i22, align 8
  %36 = load i64, ptr %stride.addr.i23, align 8
  store i16 %32, ptr %m.addr.i62, align 2
  store i16 %34, ptr %n.addr.i63, align 2
  store ptr %35, ptr %base.addr.i64, align 8
  store i64 %36, ptr %stride.addr.i65, align 8
  %37 = load i16, ptr %m.addr.i62, align 2
  %38 = load i16, ptr %n.addr.i63, align 2
  %39 = load ptr, ptr %base.addr.i64, align 8
  %40 = load i64, ptr %stride.addr.i65, align 8
  %41 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %37, i16 %38, ptr %39, i64 %40) #2
  %42 = bitcast x86_amx %41 to <256 x i32>
  %43 = load ptr, ptr %dst.addr.i21, align 8
  %tile.i27 = getelementptr inbounds %struct.__tile1024i_str, ptr %43, i32 0, i32 3
  store <256 x i32> %42, ptr %tile.i27, align 64
  br label %if.end

if.else: ; preds = %entry
  store ptr %a, ptr %dst.addr.i14, align 8
  store ptr @buf2, ptr %base.addr.i15, align 8
  store i64 32, ptr %stride.addr.i16, align 8
  %44 = load ptr, ptr %dst.addr.i14, align 8
  %45 = load i16, ptr %44, align 64
  %46 = load ptr, ptr %dst.addr.i14, align 8
  %col.i18 = getelementptr inbounds %struct.__tile1024i_str, ptr %46, i32 0, i32 1
  %47 = load i16, ptr %col.i18, align 2
  %48 = load ptr, ptr %base.addr.i15, align 8
  %49 = load i64, ptr %stride.addr.i16, align 8
  store i16 %45, ptr %m.addr.i66, align 2
  store i16 %47, ptr %n.addr.i67, align 2
  store ptr %48, ptr %base.addr.i68, align 8
  store i64 %49, ptr %stride.addr.i69, align 8
  %50 = load i16, ptr %m.addr.i66, align 2
  %51 = load i16, ptr %n.addr.i67, align 2
  %52 = load ptr, ptr %base.addr.i68, align 8
  %53 = load i64, ptr %stride.addr.i69, align 8
  %54 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %50, i16 %51, ptr %52, i64 %53) #2
  %55 = bitcast x86_amx %54 to <256 x i32>
  %56 = load ptr, ptr %dst.addr.i14, align 8
  %tile.i20 = getelementptr inbounds %struct.__tile1024i_str, ptr %56, i32 0, i32 3
  store <256 x i32> %55, ptr %tile.i20, align 64
  store ptr %b, ptr %dst.addr.i7, align 8
  store ptr @buf2, ptr %base.addr.i8, align 8
  store i64 32, ptr %stride.addr.i9, align 8
  %57 = load ptr, ptr %dst.addr.i7, align 8
  %58 = load i16, ptr %57, align 64
  %59 = load ptr, ptr %dst.addr.i7, align 8
  %col.i11 = getelementptr inbounds %struct.__tile1024i_str, ptr %59, i32 0, i32 1
  %60 = load i16, ptr %col.i11, align 2
  %61 = load ptr, ptr %base.addr.i8, align 8
  %62 = load i64, ptr %stride.addr.i9, align 8
  store i16 %58, ptr %m.addr.i70, align 2
  store i16 %60, ptr %n.addr.i71, align 2
  store ptr %61, ptr %base.addr.i72, align 8
  store i64 %62, ptr %stride.addr.i73, align 8
  %63 = load i16, ptr %m.addr.i70, align 2
  %64 = load i16, ptr %n.addr.i71, align 2
  %65 = load ptr, ptr %base.addr.i72, align 8
  %66 = load i64, ptr %stride.addr.i73, align 8
  %67 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %63, i16 %64, ptr %65, i64 %66) #2
  %68 = bitcast x86_amx %67 to <256 x i32>
  %69 = load ptr, ptr %dst.addr.i7, align 8
  %tile.i13 = getelementptr inbounds %struct.__tile1024i_str, ptr %69, i32 0, i32 3
  store <256 x i32> %68, ptr %tile.i13, align 64
  store ptr %c, ptr %dst.addr.i, align 8
  store ptr @buf2, ptr %base.addr.i, align 8
  store i64 32, ptr %stride.addr.i, align 8
  %70 = load ptr, ptr %dst.addr.i, align 8
  %71 = load i16, ptr %70, align 64
  %72 = load ptr, ptr %dst.addr.i, align 8
  %col.i = getelementptr inbounds %struct.__tile1024i_str, ptr %72, i32 0, i32 1
  %73 = load i16, ptr %col.i, align 2
  %74 = load ptr, ptr %base.addr.i, align 8
  %75 = load i64, ptr %stride.addr.i, align 8
  store i16 %71, ptr %m.addr.i74, align 2
  store i16 %73, ptr %n.addr.i75, align 2
  store ptr %74, ptr %base.addr.i76, align 8
  store i64 %75, ptr %stride.addr.i77, align 8
  %76 = load i16, ptr %m.addr.i74, align 2
  %77 = load i16, ptr %n.addr.i75, align 2
  %78 = load ptr, ptr %base.addr.i76, align 8
  %79 = load i64, ptr %stride.addr.i77, align 8
  %80 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %76, i16 %77, ptr %78, i64 %79) #2
  %81 = bitcast x86_amx %80 to <256 x i32>
  %82 = load ptr, ptr %dst.addr.i, align 8
  %tile.i = getelementptr inbounds %struct.__tile1024i_str, ptr %82, i32 0, i32 3
  store <256 x i32> %81, ptr %tile.i, align 64
  br label %if.end

if.end: ; preds = %if.else, %if.then
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %b43, ptr align 1 %b, i64 1088, i1 false) #2
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %a42, ptr align 1 %a, i64 1088, i1 false) #2
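  ; __tile_dpbssd(&c, a, b): m = a.row, n = b.col, k = a.col; the three tiles
  ; are passed indirectly as <256 x i32> temporaries and the product is
  ; written back into c's tile member.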
  store ptr %c, ptr %dst.addr.i44, align 8
  %83 = load i16, ptr %a42, align 64
  %col.i46 = getelementptr inbounds %struct.__tile1024i_str, ptr %b43, i32 0, i32 1
  %84 = load i16, ptr %col.i46, align 2
  %col1.i = getelementptr inbounds %struct.__tile1024i_str, ptr %a42, i32 0, i32 1
  %85 = load i16, ptr %col1.i, align 2
  %86 = load ptr, ptr %dst.addr.i44, align 8
  %tile.i47 = getelementptr inbounds %struct.__tile1024i_str, ptr %86, i32 0, i32 3
  %87 = load <256 x i32>, ptr %tile.i47, align 64
  %tile2.i = getelementptr inbounds %struct.__tile1024i_str, ptr %a42, i32 0, i32 3
  %88 = load <256 x i32>, ptr %tile2.i, align 64
  %tile3.i = getelementptr inbounds %struct.__tile1024i_str, ptr %b43, i32 0, i32 3
  %89 = load <256 x i32>, ptr %tile3.i, align 64
  store <256 x i32> %87, ptr %indirect-arg-temp.i, align 1024
  store <256 x i32> %88, ptr %indirect-arg-temp4.i, align 1024
  store <256 x i32> %89, ptr %indirect-arg-temp5.i, align 1024
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %indirect-arg-temp5.i80, ptr align 1 %indirect-arg-temp5.i, i64 1024, i1 false) #2
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %indirect-arg-temp4.i79, ptr align 1 %indirect-arg-temp4.i, i64 1024, i1 false) #2
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %indirect-arg-temp.i78, ptr align 1 %indirect-arg-temp.i, i64 1024, i1 false) #2
  %dst.i = load <256 x i32>, ptr %indirect-arg-temp.i78, align 1024
  %src1.i = load <256 x i32>, ptr %indirect-arg-temp4.i79, align 1024
  %src2.i = load <256 x i32>, ptr %indirect-arg-temp5.i80, align 1024
  store i16 %83, ptr %m.addr.i81, align 2
  store i16 %84, ptr %n.addr.i82, align 2
  store i16 %85, ptr %k.addr.i, align 2
  store <256 x i32> %dst.i, ptr %dst.addr.i83, align 64
  store <256 x i32> %src1.i, ptr %src1.addr.i, align 64
  store <256 x i32> %src2.i, ptr %src2.addr.i, align 64
  %90 = load i16, ptr %m.addr.i81, align 2
  %91 = load i16, ptr %n.addr.i82, align 2
  %92 = load i16, ptr %k.addr.i, align 2
  %93 = load <256 x i32>, ptr %dst.addr.i83, align 64
  %94 = bitcast <256 x i32> %93 to x86_amx
  %95 = load <256 x i32>, ptr %src1.addr.i, align 64
  %96 = bitcast <256 x i32> %95 to x86_amx
  %97 = load <256 x i32>, ptr %src2.addr.i, align 64
  %98 = bitcast <256 x i32> %97 to x86_amx
  %99 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %90, i16 %91, i16 %92, x86_amx %94, x86_amx %96, x86_amx %98) #2
  %100 = bitcast x86_amx %99 to <256 x i32>
  %101 = load ptr, ptr %dst.addr.i44, align 8
  %tile6.i = getelementptr inbounds %struct.__tile1024i_str, ptr %101, i32 0, i32 3
  store <256 x i32> %100, ptr %tile6.i, align 64
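  ; __tile_stored(buf, 32, c): c is copied by value and its tile payload is
  ; written out through @llvm.x86.tilestored64.internal.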
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %c49, ptr align 1 %c, i64 1088, i1 false) #2
  store ptr @buf, ptr %base.addr.i50, align 8
  store i64 32, ptr %stride.addr.i51, align 8
  %102 = load i16, ptr %c49, align 64
  %col.i54 = getelementptr inbounds %struct.__tile1024i_str, ptr %c49, i32 0, i32 1
  %103 = load i16, ptr %col.i54, align 2
  %104 = load ptr, ptr %base.addr.i50, align 8
  %105 = load i64, ptr %stride.addr.i51, align 8
  %tile.i55 = getelementptr inbounds %struct.__tile1024i_str, ptr %c49, i32 0, i32 3
  %106 = load <256 x i32>, ptr %tile.i55, align 64
  store <256 x i32> %106, ptr %indirect-arg-temp.i52, align 1024
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %indirect-arg-temp.i5284, ptr align 1 %indirect-arg-temp.i52, i64 1024, i1 false) #2
  %tile.i89 = load <256 x i32>, ptr %indirect-arg-temp.i5284, align 1024
  store i16 %102, ptr %m.addr.i85, align 2
  store i16 %103, ptr %n.addr.i86, align 2
  store ptr %104, ptr %base.addr.i87, align 8
  store i64 %105, ptr %stride.addr.i88, align 8
  store <256 x i32> %tile.i89, ptr %tile.addr.i, align 64
  %107 = load i16, ptr %m.addr.i85, align 2
  %108 = load i16, ptr %n.addr.i86, align 2
  %109 = load ptr, ptr %base.addr.i87, align 8
  %110 = load i64, ptr %stride.addr.i88, align 8
  %111 = load <256 x i32>, ptr %tile.addr.i, align 64
  %112 = bitcast <256 x i32> %111 to x86_amx
  call void @llvm.x86.tilestored64.internal(i16 %107, i16 %108, ptr %109, i64 %110, x86_amx %112) #2
  ret void
}

declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #1
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) #2
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #2
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #2
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #3

attributes #0 = { noinline nounwind optnone }
attributes #1 = { argmemonly nofree nosync nounwind willreturn writeonly }
attributes #2 = { nounwind }
attributes #3 = { argmemonly nofree nosync nounwind willreturn }