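# rofl0r-memcpy-test.git / amd.s
#
# GCC 10.3.0 output of amd.c: mymemcpy, a memcpy whose body is an AVX2
# copy routine in an inline-asm block (the #APP/#NO_APP region below).
# The routine appears to be modeled on glibc's __memmove_avx_unaligned:
# it dispatches on the length, using overlapping head/tail loads for
# small copies, an aligned 4-vector loop for medium ones, and
# non-temporal stores for copies over 1 MiB.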
        .file   "amd.c"
        .text
        .globl  mymemcpy
        .type   mymemcpy, @function
mymemcpy:
.LFB0:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        pushq   %r14
        pushq   %r13
        pushq   %r12
        pushq   %rbx
        .cfi_offset 14, -24
        .cfi_offset 13, -32
        .cfi_offset 12, -40
        .cfi_offset 3, -48
        movq    %rdi, -40(%rbp)
        movq    %rsi, -48(%rbp)
        movq    %rdx, -56(%rbp)
        movq    -48(%rbp), %rbx
        movq    -40(%rbp), %r13
        movq    -56(%rbp), %r14
#APP
# 27 "amd.c" 1
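# Inline asm body: move the operands back into the SysV argument
# registers (%rdi = dest, %rsi = src, %rdx = n) and preload the
# return value (dest) into %rax.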
        movq    %rbx, %rsi
        movq    %r13, %rdi
        movq    %r14, %rdx
        movq    %rdi, %rax
        cmp     $32, %rdx
        jb      less_vec
        cmp     $(32 * 2), %rdx
        ja      more_2x_vec
        vmovdqu (%rsi), %ymm0
        vmovdqu -32(%rsi,%rdx), %ymm1
        vmovdqu %ymm0, (%rdi)
        vmovdqu %ymm1, -32(%rdi,%rdx)
        vzeroupper
        jmp     .L2
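# less_vec: copies shorter than 32 bytes. Each size class first loads
# both the head and the tail of the region, then stores both; the two
# accesses may overlap, so every length in the class is covered
# without a byte loop.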
less_vec:
        cmpb    $32, %dl
        jae     between_32_63
        cmpb    $16, %dl
        jae     between_16_31
        cmpb    $8, %dl
        jae     between_8_15
        cmpb    $4, %dl
        jae     between_4_7
        cmpb    $1, %dl
        ja      between_2_3
        jb      1f
        movzbl  (%rsi), %ecx
        movb    %cl, (%rdi)
1:      # zero-length copies jump here, skipping the byte store
        jmp     .L2
between_32_63:
        vmovdqu (%rsi), %ymm0
        vmovdqu -32(%rsi,%rdx), %ymm1
        vmovdqu %ymm0, (%rdi)
        vmovdqu %ymm1, -32(%rdi,%rdx)
        vzeroupper
        jmp     .L2
between_16_31:
        vmovdqu (%rsi), %xmm0
        vmovdqu -16(%rsi,%rdx), %xmm1
        vmovdqu %xmm0, (%rdi)
        vmovdqu %xmm1, -16(%rdi,%rdx)
        jmp     .L2
between_8_15:
        movq    -8(%rsi,%rdx), %rcx
        movq    (%rsi), %rsi
        movq    %rcx, -8(%rdi,%rdx)
        movq    %rsi, (%rdi)
        jmp     .L2
between_4_7:
        movl    -4(%rsi,%rdx), %ecx
        movl    (%rsi), %esi
        movl    %ecx, -4(%rdi,%rdx)
        movl    %esi, (%rdi)
        jmp     .L2
between_2_3:
        movzwl  -2(%rsi,%rdx), %ecx
        movzwl  (%rsi), %esi
        movw    %cx, -2(%rdi,%rdx)
        movw    %si, (%rdi)
        jmp     .L2
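# more_2x_vec: n > 64 bytes. Up to 8 vectors (256 bytes) is still
# loop-free: 4 or 8 ymm registers are loaded from the head and the
# tail of the region (again possibly overlapping) and written back.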
more_2x_vec:
        cmpq    $(32 * 8), %rdx
        ja      more_8x_vec
        cmpq    $(32 * 4), %rdx
        jb      last_4x_vec
        vmovdqu (%rsi), %ymm0
        vmovdqu 32(%rsi), %ymm1
        vmovdqu (32 * 2)(%rsi), %ymm2
        vmovdqu (32 * 3)(%rsi), %ymm3
        vmovdqu -32(%rsi,%rdx), %ymm4
        vmovdqu -(32 * 2)(%rsi,%rdx), %ymm5
        vmovdqu -(32 * 3)(%rsi,%rdx), %ymm6
        vmovdqu -(32 * 4)(%rsi,%rdx), %ymm7
        vmovdqu %ymm0, (%rdi)
        vmovdqu %ymm1, 32(%rdi)
        vmovdqu %ymm2, (32 * 2)(%rdi)
        vmovdqu %ymm3, (32 * 3)(%rdi)
        vmovdqu %ymm4, -32(%rdi,%rdx)
        vmovdqu %ymm5, -(32 * 2)(%rdi,%rdx)
        vmovdqu %ymm6, -(32 * 3)(%rdi,%rdx)
        vmovdqu %ymm7, -(32 * 4)(%rdi,%rdx)
        vzeroupper
        jmp     .L2
last_4x_vec:
        vmovdqu (%rsi), %ymm0
        vmovdqu 32(%rsi), %ymm1
        vmovdqu -32(%rsi,%rdx), %ymm2
        vmovdqu -(32 * 2)(%rsi,%rdx), %ymm3
        vmovdqu %ymm0, (%rdi)
        vmovdqu %ymm1, 32(%rdi)
        vmovdqu %ymm2, -32(%rdi,%rdx)
        vmovdqu %ymm3, -(32 * 2)(%rdi,%rdx)
        vzeroupper
nop:
        jmp     .L2
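# more_8x_vec: n > 256 bytes. Copy forward unless the destination
# starts inside the source (dst > src), which needs the backward
# variant for memmove-style overlap; equal pointers are a no-op. The
# first vector and the last four are saved in ymm4-ymm8 up front, the
# destination is advanced to a 32-byte boundary (the skipped head is
# written from ymm4 at the end), and the main loop then uses aligned
# vmovdqa stores.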
more_8x_vec:
        cmpq    %rsi, %rdi
        ja      more_8x_vec_backward
        je      nop
        vmovdqu (%rsi), %ymm4
        vmovdqu -32(%rsi, %rdx), %ymm5
        vmovdqu -(32 * 2)(%rsi, %rdx), %ymm6
        vmovdqu -(32 * 3)(%rsi, %rdx), %ymm7
        vmovdqu -(32 * 4)(%rsi, %rdx), %ymm8
        movq    %rdi, %r11
        leaq    -32(%rdi, %rdx), %rcx
        movq    %rdi, %r8
        andq    $(32 - 1), %r8
        subq    $32, %r8
        subq    %r8, %rsi
        subq    %r8, %rdi
        addq    %r8, %rdx
        cmpq    $(1024*1024), %rdx
        ja      large_forward
loop_4x_vec_forward:
        vmovdqu (%rsi), %ymm0
        vmovdqu 32(%rsi), %ymm1
        vmovdqu (32 * 2)(%rsi), %ymm2
        vmovdqu (32 * 3)(%rsi), %ymm3
        addq    $(32 * 4), %rsi
        subq    $(32 * 4), %rdx
        vmovdqa %ymm0, (%rdi)
        vmovdqa %ymm1, 32(%rdi)
        vmovdqa %ymm2, (32 * 2)(%rdi)
        vmovdqa %ymm3, (32 * 3)(%rdi)
        addq    $(32 * 4), %rdi
        cmpq    $(32 * 4), %rdx
        ja      loop_4x_vec_forward
        vmovdqu %ymm5, (%rcx)
        vmovdqu %ymm6, -32(%rcx)
        vmovdqu %ymm7, -(32 * 2)(%rcx)
        vmovdqu %ymm8, -(32 * 3)(%rcx)
        vmovdqu %ymm4, (%r11)
        vzeroupper
        jmp     .L2
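# more_8x_vec_backward: the same scheme run from the end toward the
# start, used when dst > src so that a forward pass would clobber
# source bytes before reading them. Here the first four vectors and
# the last one are saved (ymm4-ymm8) and the end of the destination is
# aligned down to 32 bytes.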
more_8x_vec_backward:
        vmovdqu (%rsi), %ymm4
        vmovdqu 32(%rsi), %ymm5
        vmovdqu (32 * 2)(%rsi), %ymm6
        vmovdqu (32 * 3)(%rsi), %ymm7
        vmovdqu -32(%rsi,%rdx), %ymm8
        leaq    -32(%rdi, %rdx), %r11
        leaq    -32(%rsi, %rdx), %rcx
        movq    %r11, %r9
        movq    %r11, %r8
        andq    $(32 - 1), %r8
        subq    %r8, %rcx
        subq    %r8, %r9
        subq    %r8, %rdx
        cmpq    $(1024*1024), %rdx
        ja      large_backward
loop_4x_vec_backward:
        vmovdqu (%rcx), %ymm0
        vmovdqu -32(%rcx), %ymm1
        vmovdqu -(32 * 2)(%rcx), %ymm2
        vmovdqu -(32 * 3)(%rcx), %ymm3
        subq    $(32 * 4), %rcx
        subq    $(32 * 4), %rdx
        vmovdqa %ymm0, (%r9)
        vmovdqa %ymm1, -32(%r9)
        vmovdqa %ymm2, -(32 * 2)(%r9)
        vmovdqa %ymm3, -(32 * 3)(%r9)
        subq    $(32 * 4), %r9
        cmpq    $(32 * 4), %rdx
        ja      loop_4x_vec_backward
        vmovdqu %ymm4, (%rdi)
        vmovdqu %ymm5, 32(%rdi)
        vmovdqu %ymm6, (32 * 2)(%rdi)
        vmovdqu %ymm7, (32 * 3)(%rdi)
        vmovdqu %ymm8, (%r11)
        vzeroupper
        jmp     .L2
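# large_forward: for copies over 1 MiB whose ranges do not overlap,
# bypass the cache with non-temporal stores (vmovntdq), prefetching
# two and three 128-byte iterations ahead; the sfence below orders the
# streaming stores before the function returns. Overlapping ranges
# fall back to the regular forward loop.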
large_forward:
        leaq    (%rdi, %rdx), %r10
        cmpq    %r10, %rsi
        jb      loop_4x_vec_forward
loop_large_forward:
        prefetcht0 (32*4*2)(%rsi)
        prefetcht0 (32*4*2 + 64)(%rsi)
        prefetcht0 (32*4*3)(%rsi)
        prefetcht0 (32*4*3 + 64)(%rsi)
        vmovdqu (%rsi), %ymm0
        vmovdqu 32(%rsi), %ymm1
        vmovdqu (32 * 2)(%rsi), %ymm2
        vmovdqu (32 * 3)(%rsi), %ymm3
        addq    $(32*4), %rsi
        subq    $(32*4), %rdx
        vmovntdq %ymm0, (%rdi)
        vmovntdq %ymm1, 32(%rdi)
        vmovntdq %ymm2, (32 * 2)(%rdi)
        vmovntdq %ymm3, (32 * 3)(%rdi)
        addq    $(32*4), %rdi
        cmpq    $(32*4), %rdx
        ja      loop_large_forward
        sfence
        vmovdqu %ymm5, (%rcx)
        vmovdqu %ymm6, -32(%rcx)
        vmovdqu %ymm7, -(32 * 2)(%rcx)
        vmovdqu %ymm8, -(32 * 3)(%rcx)
        vmovdqu %ymm4, (%r11)
        vzeroupper
        jmp     .L2
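# large_backward: the non-temporal variant of the backward loop, with
# the same overlap check and the prefetch distances mirrored in the
# negative direction.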
large_backward:
        leaq    (%rcx, %rdx), %r10
        cmpq    %r10, %r9
        jb      loop_4x_vec_backward
loop_large_backward:
        prefetcht0 (-32 * 4 * 2)(%rcx)
        prefetcht0 (-32 * 4 * 2 - 64)(%rcx)
        prefetcht0 (-32 * 4 * 3)(%rcx)
        prefetcht0 (-32 * 4 * 3 - 64)(%rcx)
        vmovdqu (%rcx), %ymm0
        vmovdqu -32(%rcx), %ymm1
        vmovdqu -(32 * 2)(%rcx), %ymm2
        vmovdqu -(32 * 3)(%rcx), %ymm3
        subq    $(32*4), %rcx
        subq    $(32*4), %rdx
        vmovntdq %ymm0, (%r9)
        vmovntdq %ymm1, -32(%r9)
        vmovntdq %ymm2, -(32 * 2)(%r9)
        vmovntdq %ymm3, -(32 * 3)(%r9)
        subq    $(32 * 4), %r9
        cmpq    $(32 * 4), %rdx
        ja      loop_large_backward
        sfence
        vmovdqu %ymm4, (%rdi)
        vmovdqu %ymm5, 32(%rdi)
        vmovdqu %ymm6, (32 * 2)(%rdi)
        vmovdqu %ymm7, (32 * 3)(%rdi)
        vmovdqu %ymm8, (%r11)
        vzeroupper
        jmp     .L2
#NO_APP
.L2:
.L3:
        movq    -40(%rbp), %rax
        popq    %rbx
        popq    %r12
        popq    %r13
        popq    %r14
        popq    %rbp
        .cfi_def_cfa 7, 8
        ret
        .cfi_endproc
.LFE0:
        .size   mymemcpy, .-mymemcpy
        .ident  "GCC: (GNU) 10.3.0"
        .section        .note.GNU-stack,"",@progbits
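# For reference: given the operand spills above (-40(%rbp) = dest,
# -48(%rbp) = src, -56(%rbp) = n) and the reload of dest into %rax at
# .L2, amd.c plausibly looks like the sketch below. This is a
# reconstruction, not the actual source; the exact constraint and
# clobber lists are unknown.
#
#   void *mymemcpy(void *dest, const void *src, unsigned long n)
#   {
#       __asm__ volatile(/* the #APP...#NO_APP block above */
#                        : : "r"(dest), "r"(src), "r"(n) : "memory");
#       return dest;
#   }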