[Arm] Fix generating code with UB in NeonEmitter (#121802)
[llvm-project.git] / polly / docs / experiments / matmul / matmul.polly.interchanged+tiled.s
blob bf25833eec167303d5976f2674bc822612962048
1 .text
2 .file "matmul.c"
# Constant pool for init_array: one 8-byte double (0.5) used to scale values.
3 .section .rodata.cst8,"aM",@progbits,8
4 .p2align 3 # -- Begin function init_array
5 .LCPI0_0:
6 .quad 4602678819172646912 # double 0.5
7 .text
# ----------------------------------------------------------------------
# init_array (AT&T syntax, x86-64)
#
# Fills the global float arrays A and B with identical values.
# Reads no incoming argument registers; returns nothing meaningful.
#
# Register roles:
#   rax = pointer to the current row of B (advances 6144 bytes per row)
#   rcx = pointer to the current row of A (advances 6144 bytes per row)
#   r9  = row index i, 0..1535
#   r8d = 2*i (added to edx once per inner iteration)
#   rdi = odd column index 1,3,...,1535; elements rdi-1 and rdi are
#         written in each inner iteration (loop is unrolled by two)
#   edx = i * (rdi-1), i.e. row index times the even column index
#   xmm0 = 0.5 (loaded once from .LCPI0_0)
#
# Value stored at even column j:  (((i*j) & 1022) | 1)   * 0.5
# Value stored at odd  column j:  (((i*j) & 1023) + 1)   * 0.5
# both rounded from double to single precision before the store.
# Clobbers: rax, rcx, rdx, rsi, rdi, r8, r9, xmm0, xmm1, flags.
# ----------------------------------------------------------------------
8 .globl init_array
9 .p2align 4, 0x90
10 .type init_array,@function
11 init_array: # @init_array
12 .cfi_startproc
13 # %bb.0: # %entry
14 pushq %rbp
15 .cfi_def_cfa_offset 16
16 .cfi_offset %rbp, -16
17 movq %rsp, %rbp
18 .cfi_def_cfa_register %rbp
19 leaq B(%rip), %rax
20 leaq A(%rip), %rcx
21 xorl %r8d, %r8d
22 movsd .LCPI0_0(%rip), %xmm0 # xmm0 = mem[0],zero
23 xorl %r9d, %r9d
24 .p2align 4, 0x90
25 .LBB0_1: # %polly.loop_header
26 # =>This Loop Header: Depth=1
27 # Child Loop BB0_2 Depth 2
# Per-row setup: column index restarts at 1, running product restarts at 0.
28 movl $1, %edi
29 xorl %edx, %edx
30 .p2align 4, 0x90
31 .LBB0_2: # %polly.loop_header1
32 # Parent Loop BB0_1 Depth=1
33 # => This Inner Loop Header: Depth=2
# First element of the pair (column rdi-1): edx is even here (a multiple
# of r8d = 2*i), so the 1022 mask and the OR with 1 act as "& 1023, + 1".
34 movl %edx, %esi
35 andl $1022, %esi # imm = 0x3FE
36 orl $1, %esi
# Zero xmm1 before the convert, which only writes the low lane.
37 xorps %xmm1, %xmm1
38 cvtsi2sdl %esi, %xmm1
39 mulsd %xmm0, %xmm1
40 cvtsd2ss %xmm1, %xmm1
41 movss %xmm1, -4(%rcx,%rdi,4)
42 movss %xmm1, -4(%rax,%rdi,4)
# Second element of the pair (column rdi): r9 + edx = i * rdi.
43 leal (%r9,%rdx), %esi
44 andl $1023, %esi # imm = 0x3FF
45 addl $1, %esi
46 xorps %xmm1, %xmm1
47 cvtsi2sdl %esi, %xmm1
48 mulsd %xmm0, %xmm1
49 cvtsd2ss %xmm1, %xmm1
50 movss %xmm1, (%rcx,%rdi,4)
51 movss %xmm1, (%rax,%rdi,4)
# Advance two columns; the inner loop ends once rdi reaches 1537.
52 addq $2, %rdi
53 addl %r8d, %edx
54 cmpq $1537, %rdi # imm = 0x601
55 jne .LBB0_2
56 # %bb.3: # %polly.loop_exit3
57 # in Loop: Header=BB0_1 Depth=1
# Next row: step both row pointers by 6144 bytes and keep r8d = 2*r9.
58 addq $1, %r9
59 addq $6144, %rax # imm = 0x1800
60 addq $6144, %rcx # imm = 0x1800
61 addl $2, %r8d
62 cmpq $1536, %r9 # imm = 0x600
63 jne .LBB0_1
64 # %bb.4: # %polly.exiting
65 popq %rbp
66 .cfi_def_cfa %rsp, 8
67 retq
68 .Lfunc_end0:
69 .size init_array, .Lfunc_end0-init_array
70 .cfi_endproc
71 # -- End function
# ----------------------------------------------------------------------
# print_array (AT&T syntax, x86-64)
#
# Writes every element of the global float array C to stdout using the
# format string .L.str ("%lf "), 1536 rows of 1536 elements. A newline
# (character 10) is written after every column whose index is 79 modulo
# 80, and once more after each row.
#
# Register roles (all callee-saved, so they survive the calls):
#   r13 = pointer to the current row of C (advances 6144 bytes per row)
#   rbx = column index, 0..1535
#   r12 = 0xCCCCCCCD, multiplier for the divide-by-80 sequence
#   r14 = address of .L.str
#   r15 = 80*(column/80) + 79, compared against the column after fprintf
# The row counter is kept in the stack slot -48(%rbp).
# ----------------------------------------------------------------------
72 .globl print_array # -- Begin function print_array
73 .p2align 4, 0x90
74 .type print_array,@function
75 print_array: # @print_array
76 .cfi_startproc
77 # %bb.0: # %entry
78 pushq %rbp
79 .cfi_def_cfa_offset 16
80 .cfi_offset %rbp, -16
81 movq %rsp, %rbp
82 .cfi_def_cfa_register %rbp
83 pushq %r15
84 pushq %r14
85 pushq %r13
86 pushq %r12
87 pushq %rbx
# The value pushed here is irrelevant; the push reserves the 8-byte slot at
# -48(%rbp) that holds the row counter (released by addq $8, %rsp below).
88 pushq %rax
89 .cfi_offset %rbx, -56
90 .cfi_offset %r12, -48
91 .cfi_offset %r13, -40
92 .cfi_offset %r14, -32
93 .cfi_offset %r15, -24
94 leaq C(%rip), %r13
95 xorl %eax, %eax
96 movl $3435973837, %r12d # imm = 0xCCCCCCCD
97 leaq .L.str(%rip), %r14
98 .p2align 4, 0x90
99 .LBB1_1: # %for.cond1.preheader
100 # =>This Loop Header: Depth=1
101 # Child Loop BB1_2 Depth 2
102 movq %rax, -48(%rbp) # 8-byte Spill
103 movq stdout(%rip), %rsi
104 xorl %ebx, %ebx
105 .p2align 4, 0x90
106 .LBB1_2: # %for.body3
107 # Parent Loop BB1_1 Depth=1
108 # => This Inner Loop Header: Depth=2
# Division by multiply-and-shift: (ebx * 0xCCCCCCCD) >> 38 is ebx / 80 for
# 32-bit unsigned ebx. lea (*5) and shl 4 (*16) scale it back by 80, and
# adding 79 gives the last column index of the current group of 80.
109 movl %ebx, %eax
110 imulq %r12, %rax
111 shrq $38, %rax
112 leal (%rax,%rax,4), %r15d
113 shll $4, %r15d
114 addl $79, %r15d
# fprintf(stream, .L.str, (double)element): rsi holds the stdout value loaded
# before the loop or at the end of the previous iteration. al = 1 because
# one vector register (xmm0) carries an argument to this variadic call.
115 movss (%r13,%rbx,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
116 cvtss2sd %xmm0, %xmm0
117 movb $1, %al
118 movq %rsi, %rdi
119 movq %r14, %rsi
120 callq fprintf
# Skip the newline unless the column equals 80*(column/80) + 79.
121 cmpl %ebx, %r15d
122 jne .LBB1_4
123 # %bb.3: # %if.then
124 # in Loop: Header=BB1_2 Depth=2
# fputc(10, stdout)
125 movq stdout(%rip), %rsi
126 movl $10, %edi
127 callq fputc@PLT
128 .LBB1_4: # %for.inc
129 # in Loop: Header=BB1_2 Depth=2
# Next column; stdout is reloaded into rsi for the next fprintf, or for the
# end-of-row fputc when the loop exits.
130 addq $1, %rbx
131 movq stdout(%rip), %rsi
132 cmpq $1536, %rbx # imm = 0x600
133 jne .LBB1_2
134 # %bb.5: # %for.end
135 # in Loop: Header=BB1_1 Depth=1
# End of row: fputc(10, stream), then advance to the next row of C.
136 movl $10, %edi
137 callq fputc@PLT
138 movq -48(%rbp), %rax # 8-byte Reload
139 addq $1, %rax
140 addq $6144, %r13 # imm = 0x1800
141 cmpq $1536, %rax # imm = 0x600
142 jne .LBB1_1
143 # %bb.6: # %for.end12
# Drop the 8-byte slot, then restore callee-saved registers in reverse order.
144 addq $8, %rsp
145 popq %rbx
146 popq %r12
147 popq %r13
148 popq %r14
149 popq %r15
150 popq %rbp
151 .cfi_def_cfa %rsp, 8
152 retq
153 .Lfunc_end1:
154 .size print_array, .Lfunc_end1-print_array
155 .cfi_endproc
156 # -- End function
# ----------------------------------------------------------------------
# main (AT&T syntax, x86-64)
#
# Calls init_array, clears the 9437184 bytes of C with memset, then runs a
# five-deep loop nest that accumulates products of A and B elements into C
# using 4-float SSE vectors. Returns 0 in eax.
#
# Loop nest (the names i0/j0/k0/i/k are labels used in these comments only):
#   .LBB2_1  i0 = 0,64,...,1472   kept in -48(%rbp); -64(%rbp) = i0 + 64;
#                                 -56(%rbp) = A + i0*6144
#   .LBB2_2  j0 = 0,64,...,1472   kept in rdi (copy in -272(%rbp));
#                                 rax / -136(%rbp) = B + 240 + j0*4
#   .LBB2_3  k0 = 0,64,...,1472   kept in -144(%rbp);
#                                 -72(%rbp) = B + 240 + j0*4 + k0*6144;
#                                 rdx / -152(%rbp) = A + i0*6144 + k0*4
#   .LBB2_4  i  = i0 .. i0+63     kept in -376(%rbp); rdx steps 6144 bytes
#   .LBB2_5  k  = 0 .. 63         kept in r10; r9 steps 6144 bytes
#
# For one i, the 64 floats of C's row i starting at column j0 are held as 16
# vectors: 13 in xmm15..xmm3 and 3 in the stack slots -96, -112 and
# -128(%rbp). Each .LBB2_5 iteration broadcasts one float of A and adds its
# product with 16 vectors of B to those accumulators.
#
# Frame: 344 bytes below the five saved registers; the lowest slot used is
# -376(%rbp). All vector accesses use movaps, i.e. aligned 16-byte accesses.
# ----------------------------------------------------------------------
157 .globl main # -- Begin function main
158 .p2align 4, 0x90
159 .type main,@function
160 main: # @main
161 .cfi_startproc
162 # %bb.0: # %entry
163 pushq %rbp
164 .cfi_def_cfa_offset 16
165 .cfi_offset %rbp, -16
166 movq %rsp, %rbp
167 .cfi_def_cfa_register %rbp
168 pushq %r15
169 pushq %r14
170 pushq %r13
171 pushq %r12
172 pushq %rbx
173 subq $344, %rsp # imm = 0x158
174 .cfi_offset %rbx, -56
175 .cfi_offset %r12, -48
176 .cfi_offset %r13, -40
177 .cfi_offset %r14, -32
178 .cfi_offset %r15, -24
179 callq init_array
# memset(C, 0, 9437184); the zero in rax also initialises i0 in -48(%rbp).
180 leaq C(%rip), %rdi
181 xorl %eax, %eax
182 movq %rax, -48(%rbp) # 8-byte Spill
183 xorl %esi, %esi
184 movl $9437184, %edx # imm = 0x900000
185 callq memset@PLT
# -64(%rbp) = exclusive upper bound of the depth-4 loop (starts at 64);
# -56(%rbp) = pointer into A for the current i0 tile.
186 movl $64, %eax
187 movq %rax, -64(%rbp) # 8-byte Spill
188 leaq A(%rip), %rax
189 movq %rax, -56(%rbp) # 8-byte Spill
190 .p2align 4, 0x90
191 .LBB2_1: # %polly.loop_header8
192 # =>This Loop Header: Depth=1
193 # Child Loop BB2_2 Depth 2
194 # Child Loop BB2_3 Depth 3
195 # Child Loop BB2_4 Depth 4
196 # Child Loop BB2_5 Depth 5
# rax = B + 240: the innermost loop addresses B with displacements
# -240 .. 0 relative to this pointer. rdi = j0 = 0.
197 leaq B+240(%rip), %rax
198 xorl %edi, %edi
199 .p2align 4, 0x90
200 .LBB2_2: # %polly.loop_header14
201 # Parent Loop BB2_1 Depth=1
202 # => This Loop Header: Depth=2
203 # Child Loop BB2_3 Depth 3
204 # Child Loop BB2_4 Depth 4
205 # Child Loop BB2_5 Depth 5
# Precompute and spill the 15 column indices j0|4, j0|8, ..., j0|60.
# rdi starts at 0 and only ever advances by 64, so each OR equals an add.
206 movq %rdi, %rcx
207 orq $4, %rcx
208 movq %rcx, -80(%rbp) # 8-byte Spill
209 movq %rdi, %rcx
210 orq $8, %rcx
211 movq %rcx, -264(%rbp) # 8-byte Spill
212 movq %rdi, %rcx
213 orq $12, %rcx
214 movq %rcx, -256(%rbp) # 8-byte Spill
215 movq %rdi, %rcx
216 orq $16, %rcx
217 movq %rcx, -248(%rbp) # 8-byte Spill
218 movq %rdi, %rcx
219 orq $20, %rcx
220 movq %rcx, -240(%rbp) # 8-byte Spill
221 movq %rdi, %rcx
222 orq $24, %rcx
223 movq %rcx, -232(%rbp) # 8-byte Spill
224 movq %rdi, %rcx
225 orq $28, %rcx
226 movq %rcx, -224(%rbp) # 8-byte Spill
227 movq %rdi, %rcx
228 orq $32, %rcx
229 movq %rcx, -216(%rbp) # 8-byte Spill
230 movq %rdi, %rcx
231 orq $36, %rcx
232 movq %rcx, -208(%rbp) # 8-byte Spill
233 movq %rdi, %rcx
234 orq $40, %rcx
235 movq %rcx, -200(%rbp) # 8-byte Spill
236 movq %rdi, %rcx
237 orq $44, %rcx
238 movq %rcx, -192(%rbp) # 8-byte Spill
239 movq %rdi, %rcx
240 orq $48, %rcx
241 movq %rcx, -184(%rbp) # 8-byte Spill
242 movq %rdi, %rcx
243 orq $52, %rcx
244 movq %rcx, -176(%rbp) # 8-byte Spill
245 movq %rdi, %rcx
246 orq $56, %rcx
247 movq %rcx, -168(%rbp) # 8-byte Spill
248 movq %rdi, %rcx
249 orq $60, %rcx
250 movq %rcx, -160(%rbp) # 8-byte Spill
# rdx = A pointer for this i0 tile. The B pointer is stored twice: -136 is
# the per-j0 base (advanced at %bb.8), -72 is the copy advanced per k0.
# eax = k0 = 0, and j0 is saved because rdi is reused as scratch below.
251 movq -56(%rbp), %rdx # 8-byte Reload
252 movq %rax, -136(%rbp) # 8-byte Spill
253 movq %rax, -72(%rbp) # 8-byte Spill
254 xorl %eax, %eax
255 movq %rdi, -272(%rbp) # 8-byte Spill
256 .p2align 4, 0x90
257 .LBB2_3: # %polly.loop_header20
258 # Parent Loop BB2_1 Depth=1
259 # Parent Loop BB2_2 Depth=2
260 # => This Loop Header: Depth=3
261 # Child Loop BB2_4 Depth 4
262 # Child Loop BB2_5 Depth 5
# Save k0 and the A pointer for this k0 tile; rax = first row index (i0).
263 movq %rax, -144(%rbp) # 8-byte Spill
264 movq %rdx, -152(%rbp) # 8-byte Spill
265 movq -48(%rbp), %rax # 8-byte Reload
266 .p2align 4, 0x90
267 .LBB2_4: # %polly.loop_header26
268 # Parent Loop BB2_1 Depth=1
269 # Parent Loop BB2_2 Depth=2
270 # Parent Loop BB2_3 Depth=3
271 # => This Loop Header: Depth=4
272 # Child Loop BB2_5 Depth 5
# rax = C + i*6144 (3*i shifted left by 11), the start of row i of C.
# The addresses of the first 12 vectors of that row (columns j0 .. j0+44)
# are spilled to -368 .. -280(%rbp) for the store-back after the inner loop.
273 movq %rax, -376(%rbp) # 8-byte Spill
274 leaq (%rax,%rax,2), %rax
275 shlq $11, %rax
276 leaq C(%rip), %rsi
277 addq %rsi, %rax
278 leaq (%rax,%rdi,4), %rcx
279 movq %rcx, -368(%rbp) # 8-byte Spill
280 movq -80(%rbp), %rcx # 8-byte Reload
281 leaq (%rax,%rcx,4), %rcx
282 movq %rcx, -360(%rbp) # 8-byte Spill
283 movq -264(%rbp), %rbx # 8-byte Reload
284 leaq (%rax,%rbx,4), %rcx
285 movq %rcx, -352(%rbp) # 8-byte Spill
286 movq -256(%rbp), %r8 # 8-byte Reload
# Keep j0 in rsi: rdi is overwritten by the next instructions and is only
# restored from -272(%rbp) at the bottom of this loop.
287 movq %rdi, %rsi
288 leaq (%rax,%r8,4), %rdi
289 movq %rdi, -344(%rbp) # 8-byte Spill
290 movq -248(%rbp), %rdi # 8-byte Reload
291 leaq (%rax,%rdi,4), %rcx
292 movq %rcx, -336(%rbp) # 8-byte Spill
293 movq -240(%rbp), %r9 # 8-byte Reload
294 leaq (%rax,%r9,4), %rcx
295 movq %rcx, -328(%rbp) # 8-byte Spill
296 movq -232(%rbp), %r10 # 8-byte Reload
297 leaq (%rax,%r10,4), %rcx
298 movq %rcx, -320(%rbp) # 8-byte Spill
299 movq -224(%rbp), %r14 # 8-byte Reload
300 leaq (%rax,%r14,4), %rcx
301 movq %rcx, -312(%rbp) # 8-byte Spill
302 movq -216(%rbp), %r15 # 8-byte Reload
303 leaq (%rax,%r15,4), %rcx
304 movq %rcx, -304(%rbp) # 8-byte Spill
305 movq -208(%rbp), %r12 # 8-byte Reload
306 leaq (%rax,%r12,4), %rcx
307 movq %rcx, -296(%rbp) # 8-byte Spill
308 movq -200(%rbp), %r13 # 8-byte Reload
309 leaq (%rax,%r13,4), %rcx
310 movq %rcx, -288(%rbp) # 8-byte Spill
311 movq -192(%rbp), %r11 # 8-byte Reload
312 leaq (%rax,%r11,4), %rcx
313 movq %rcx, -280(%rbp) # 8-byte Spill
# Load the 16 accumulators from row i of C: columns j0 .. j0+48 go to
# xmm15 .. xmm3, and the last three vectors go through xmm0 to the stack.
314 movaps (%rax,%rsi,4), %xmm15
315 movq -80(%rbp), %rcx # 8-byte Reload
316 movaps (%rax,%rcx,4), %xmm14
317 movaps (%rax,%rbx,4), %xmm13
318 movaps (%rax,%r8,4), %xmm12
319 movaps (%rax,%rdi,4), %xmm11
320 movaps (%rax,%r9,4), %xmm10
321 movaps (%rax,%r10,4), %xmm9
322 movaps (%rax,%r14,4), %xmm8
323 movaps (%rax,%r15,4), %xmm7
324 movaps (%rax,%r12,4), %xmm6
325 movaps (%rax,%r13,4), %xmm5
326 movaps (%rax,%r11,4), %xmm4
327 movq -184(%rbp), %rcx # 8-byte Reload
328 movaps (%rax,%rcx,4), %xmm3
329 movq -176(%rbp), %rsi # 8-byte Reload
330 movaps (%rax,%rsi,4), %xmm0
331 movaps %xmm0, -96(%rbp) # 16-byte Spill
332 movq -168(%rbp), %rbx # 8-byte Reload
333 movaps (%rax,%rbx,4), %xmm0
334 movaps %xmm0, -112(%rbp) # 16-byte Spill
335 movq -160(%rbp), %rdi # 8-byte Reload
336 movaps (%rax,%rdi,4), %xmm0
337 movaps %xmm0, -128(%rbp) # 16-byte Spill
# Store-back addresses of the last four vectors stay in registers:
# r8 (j0|48), rcx (j0|52), rsi (j0|56), rax (j0|60).
# r9 = B pointer for this (j0, k0) tile, r10 = inner counter k = 0.
338 leaq (%rax,%rcx,4), %r8
339 leaq (%rax,%rsi,4), %rcx
340 leaq (%rax,%rbx,4), %rsi
341 leaq (%rax,%rdi,4), %rax
342 movq -72(%rbp), %r9 # 8-byte Reload
343 movl $0, %r10d
344 .p2align 4, 0x90
345 .LBB2_5: # %vector.ph
346 # Parent Loop BB2_1 Depth=1
347 # Parent Loop BB2_2 Depth=2
348 # Parent Loop BB2_3 Depth=3
349 # Parent Loop BB2_4 Depth=4
350 # => This Inner Loop Header: Depth=5
# Broadcast one float of A (at rdx + 4*r10) to all four lanes of xmm0, then
# for each of the 16 vectors at -240(%r9) .. (%r9): accumulator += A * B.
351 movss (%rdx,%r10,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
352 shufps $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0]
353 movaps -240(%r9), %xmm1
354 mulps %xmm0, %xmm1
355 addps %xmm1, %xmm15
356 movaps -224(%r9), %xmm1
357 mulps %xmm0, %xmm1
358 addps %xmm1, %xmm14
359 movaps -208(%r9), %xmm1
360 mulps %xmm0, %xmm1
361 addps %xmm1, %xmm13
362 movaps -192(%r9), %xmm1
363 mulps %xmm0, %xmm1
364 addps %xmm1, %xmm12
365 movaps -176(%r9), %xmm1
366 mulps %xmm0, %xmm1
367 addps %xmm1, %xmm11
368 movaps -160(%r9), %xmm1
369 mulps %xmm0, %xmm1
370 addps %xmm1, %xmm10
371 movaps -144(%r9), %xmm1
372 mulps %xmm0, %xmm1
373 addps %xmm1, %xmm9
374 movaps -128(%r9), %xmm1
375 mulps %xmm0, %xmm1
376 addps %xmm1, %xmm8
377 movaps -112(%r9), %xmm1
378 mulps %xmm0, %xmm1
379 addps %xmm1, %xmm7
380 movaps -96(%r9), %xmm1
381 mulps %xmm0, %xmm1
382 addps %xmm1, %xmm6
383 movaps -80(%r9), %xmm1
384 mulps %xmm0, %xmm1
385 addps %xmm1, %xmm5
386 movaps -64(%r9), %xmm1
387 mulps %xmm0, %xmm1
388 addps %xmm1, %xmm4
389 movaps -48(%r9), %xmm1
390 mulps %xmm0, %xmm1
391 addps %xmm1, %xmm3
# The last three accumulators live on the stack, so each update is a
# reload / add / spill through xmm2 (or xmm1 for the final one).
392 movaps -32(%r9), %xmm1
393 mulps %xmm0, %xmm1
394 movaps -96(%rbp), %xmm2 # 16-byte Reload
395 addps %xmm1, %xmm2
396 movaps %xmm2, -96(%rbp) # 16-byte Spill
397 movaps -16(%r9), %xmm1
398 mulps %xmm0, %xmm1
399 movaps -112(%rbp), %xmm2 # 16-byte Reload
400 addps %xmm1, %xmm2
401 movaps %xmm2, -112(%rbp) # 16-byte Spill
402 mulps (%r9), %xmm0
403 movaps -128(%rbp), %xmm1 # 16-byte Reload
404 addps %xmm0, %xmm1
405 movaps %xmm1, -128(%rbp) # 16-byte Spill
# Next k: r9 moves 6144 bytes further into B; 64 iterations in total.
406 addq $1, %r10
407 addq $6144, %r9 # imm = 0x1800
408 cmpq $64, %r10
409 jne .LBB2_5
410 # %bb.6: # %polly.loop_exit34
411 # in Loop: Header=BB2_4 Depth=4
# Write the 16 accumulators back to the C addresses computed above.
412 movq -368(%rbp), %rdi # 8-byte Reload
413 movaps %xmm15, (%rdi)
414 movq -360(%rbp), %rdi # 8-byte Reload
415 movaps %xmm14, (%rdi)
416 movq -352(%rbp), %rdi # 8-byte Reload
417 movaps %xmm13, (%rdi)
418 movq -344(%rbp), %rdi # 8-byte Reload
419 movaps %xmm12, (%rdi)
420 movq -336(%rbp), %rdi # 8-byte Reload
421 movaps %xmm11, (%rdi)
422 movq -328(%rbp), %rdi # 8-byte Reload
423 movaps %xmm10, (%rdi)
424 movq -320(%rbp), %rdi # 8-byte Reload
425 movaps %xmm9, (%rdi)
426 movq -312(%rbp), %rdi # 8-byte Reload
427 movaps %xmm8, (%rdi)
428 movq -304(%rbp), %rdi # 8-byte Reload
429 movaps %xmm7, (%rdi)
430 movq -296(%rbp), %rdi # 8-byte Reload
431 movaps %xmm6, (%rdi)
432 movq -288(%rbp), %rdi # 8-byte Reload
433 movaps %xmm5, (%rdi)
434 movq -280(%rbp), %rdi # 8-byte Reload
435 movaps %xmm4, (%rdi)
436 movaps %xmm3, (%r8)
437 movaps -96(%rbp), %xmm0 # 16-byte Reload
438 movaps %xmm0, (%rcx)
439 movaps -112(%rbp), %xmm0 # 16-byte Reload
440 movaps %xmm0, (%rsi)
441 movaps -128(%rbp), %xmm0 # 16-byte Reload
442 movaps %xmm0, (%rax)
# Next row i: the A pointer moves 6144 bytes further. The movq that
# restores j0 into rdi sits between the cmpq and the jne; movq does not
# modify flags, so the branch still tests the cmpq result.
443 movq -376(%rbp), %rax # 8-byte Reload
444 addq $1, %rax
445 addq $6144, %rdx # imm = 0x1800
446 cmpq -64(%rbp), %rax # 8-byte Folded Reload
447 movq -272(%rbp), %rdi # 8-byte Reload
448 jne .LBB2_4
449 # %bb.7: # %polly.loop_exit28
450 # in Loop: Header=BB2_3 Depth=3
# Next k0 tile: the B pointer advances 393216 bytes (64 * 6144); the A
# pointer is reset to its value at loop entry and advanced by 256 bytes.
451 movq -144(%rbp), %rax # 8-byte Reload
452 addq $64, %rax
453 addq $393216, -72(%rbp) # 8-byte Folded Spill
454 # imm = 0x60000
455 movq -152(%rbp), %rdx # 8-byte Reload
456 addq $256, %rdx # imm = 0x100
457 cmpq $1536, %rax # imm = 0x600
458 jb .LBB2_3
459 # %bb.8: # %polly.loop_exit22
460 # in Loop: Header=BB2_2 Depth=2
# Next j0 tile: rax = per-j0 B base pointer advanced by 256 bytes.
461 addq $64, %rdi
462 movq -136(%rbp), %rax # 8-byte Reload
463 addq $256, %rax # imm = 0x100
464 cmpq $1536, %rdi # imm = 0x600
465 jb .LBB2_2
466 # %bb.9: # %polly.loop_exit16
467 # in Loop: Header=BB2_1 Depth=1
# Next i0 tile: i0 and the row upper bound both advance by 64, and the A
# pointer by 393216 bytes (64 * 6144).
468 movq -48(%rbp), %rax # 8-byte Reload
469 movq %rax, %rcx
470 addq $64, %rcx
471 addq $64, -64(%rbp) # 8-byte Folded Spill
472 addq $393216, -56(%rbp) # 8-byte Folded Spill
473 # imm = 0x60000
474 movq %rcx, %rax
475 movq %rcx, -48(%rbp) # 8-byte Spill
476 cmpq $1536, %rcx # imm = 0x600
477 jb .LBB2_1
478 # %bb.10: # %polly.exiting
# Return 0: release the 344-byte frame and restore callee-saved registers.
479 xorl %eax, %eax
480 addq $344, %rsp # imm = 0x158
481 popq %rbx
482 popq %r12
483 popq %r13
484 popq %r14
485 popq %r15
486 popq %rbp
487 .cfi_def_cfa %rsp, 8
488 retq
489 .Lfunc_end2:
490 .size main, .Lfunc_end2-main
491 .cfi_endproc
492 # -- End function
# Global data. A, B and C are common symbols of 9437184 bytes each
# (1536 * 1536 * 4, matching the 6144-byte row stride and 4-byte elements
# used by the code above), requested with 16-byte alignment.
493 .type A,@object # @A
494 .comm A,9437184,16
495 .type B,@object # @B
496 .comm B,9437184,16
# Format string used by print_array: "%lf " plus the terminating NUL
# added by .asciz, 5 bytes in total.
497 .type .L.str,@object # @.str
498 .section .rodata.str1.1,"aMS",@progbits,1
499 .L.str:
500 .asciz "%lf "
501 .size .L.str, 5
503 .type C,@object # @C
504 .comm C,9437184,16
# Compiler identification string, followed by an empty .note.GNU-stack
# section (conventionally a marker that no executable stack is required).
506 .ident "clang version 8.0.0 (trunk 342834) (llvm/trunk 342856)"
507 .section ".note.GNU-stack","",@progbits