# Copyright 2020 Justine Alexandra Roberts Tunney
#
# Permission to use, copy, modify, and/or distribute this software for
# any purpose with or without fee is hereby granted, provided that the
# above copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
# PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.

# Copies memory.
#
# DEST and SRC must not overlap, unless DEST<=SRC.
#
# @param rdi is dest
# @param rsi is src
# @param rdx is number of bytes
# @return original rdi copied to rax
# @mode long
# @asyncsignalsafe
mymemcpy:
        mov %rdi,%rax
        .align 16
        .size mymemcpy,.-mymemcpy
        .type mymemcpy,@function
        .globl mymemcpy
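
#       Note: mymemcpy only stashes the return value (dest) in %rax and then
#       falls through the .align padding (NOP fill in a text section) into
#       __memcpy below, which does the actual copy and leaves %rax untouched.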

# Copies memory w/ minimal impact ABI.
#
# @param rdi is dest
# @param rsi is src
# @param rdx is number of bytes
# @clob flags,rcx,xmm3,xmm4
# @mode long
__memcpy:
        push %rbp
        mov %rsp,%rbp
        mov $.L__memcpytab.ro.size,%ecx
        cmp %rcx,%rdx
        cmovb %rdx,%rcx
## ifndef __PIC__
        jmp *__memcpytab(,%rcx,8)
## else
#       shl $3, %rcx                    # mul by 8
#       add __memcpytab@GOTPCREL(%rip), %rcx
#       jmp *(%rcx)                     # jump through the table entry
##      jmp *__memcpytab@GOTPCREL(%rip)(,%rcx,8)
## endif
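
#       Dispatch: %rcx = min(n, 32).  __memcpytab holds 33 absolute jump
#       targets filled in at startup by _init_memcpy/__memjmpinit: entries
#       0..31 copy exactly that many bytes, and entry 32 is the CPU-specific
#       bulk routine (.L16, .L16r, .L32 or .L32r) used for any n >= 32.
#       Roughly, in illustrative C (not part of the build):
#
#           size_t i = n < 32 ? n : 32;
#           goto *__memcpytab[i];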
.Lanchorpoint:
.L32r:  cmp $1024,%rdx
        jae .Lerms
.L32:   vmovdqu -32(%rsi,%rdx),%ymm4
        mov $32,%rcx
0:      add $32,%rcx
        vmovdqu -64(%rsi,%rcx),%ymm3
        vmovdqu %ymm3,-64(%rdi,%rcx)
        cmp %rcx,%rdx
        ja 0b
        vmovdqu %ymm4,-32(%rdi,%rdx)
        vxorps %ymm4,%ymm4,%ymm4
        vxorps %ymm3,%ymm3,%ymm3
        jmp .L0
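
#       Bulk-copy scheme (.L32 above; .L16 below is the same with XMM): the
#       final vector of the source is loaded into %ymm4/%xmm4 up front, the
#       body is copied forward one vector at a time, and the saved tail is
#       stored last, so every n >= 32 (or 16) is covered by possibly
#       overlapping stores instead of a scalar remainder loop.  The closing
#       vxorps/pxor zero the vector registers again, presumably so no copied
#       data lingers there after the call.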
.L16r:  cmp $1024,%rdx
        jae .Lerms
.L16:   movdqu -16(%rsi,%rdx),%xmm4
        mov $16,%rcx
0:      add $16,%rcx
        movdqu -32(%rsi,%rcx),%xmm3
        movdqu %xmm3,-32(%rdi,%rcx)
        cmp %rcx,%rdx
        ja 0b
        movdqu %xmm4,-16(%rdi,%rdx)
        pxor %xmm4,%xmm4
        pxor %xmm3,%xmm3
        jmp .L0
.L8:    push %rbx
        mov (%rsi),%rcx
        mov -8(%rsi,%rdx),%rbx
        mov %rcx,(%rdi)
        mov %rbx,-8(%rdi,%rdx)
1:      pop %rbx
.L0:    pop %rbp
        ret
.L4:    push %rbx
        mov (%rsi),%ecx
        mov -4(%rsi,%rdx),%ebx
        mov %ecx,(%rdi)
        mov %ebx,-4(%rdi,%rdx)
        jmp 1b
.L3:    push %rbx
        mov (%rsi),%cx
        mov -2(%rsi,%rdx),%bx
        mov %cx,(%rdi)
        mov %bx,-2(%rdi,%rdx)
        jmp 1b
.L2:    mov (%rsi),%cx
        mov %cx,(%rdi)
        jmp .L0
.L1:    mov (%rsi),%cl
        mov %cl,(%rdi)
        jmp .L0
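
#       The size classes above mirror the jump table: .L1/.L2 copy a single
#       byte/word exactly, while .L3, .L4 and .L8 copy the first and last
#       2/4/8 bytes with possibly overlapping accesses, covering every length
#       the table maps to them (3, 4..7 and 8..15 bytes).  %rbx is saved and
#       restored because only flags, rcx and xmm3/xmm4 may be clobbered per
#       the ABI note above.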
.Lerms:
#### if 1 || defined(TINY)
        cmp $1024*1024,%rdx             # past ~1 MiB, stream it instead
#### else
#       cmp kHalfCache3(%rip),%rdx      # cosmopolitan: half the L3 cache size
#### endif
        ja .Lnts
        push %rdi                       # rep movsb advances rdi/rsi, which this
        push %rsi                       # minimal ABI promises not to clobber
        mov %rdx,%rcx
        rep movsb                       # ERMS fast path: copy rcx bytes
        pop %rsi
        pop %rdi
        jmp .L0
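
#       Non-temporal path for very large copies: the first (possibly
#       overlapping) movdqu pair handles the unaligned head while the
#       destination is rounded up to 16 bytes, the body is then streamed with
#       movntdq so the copy does not evict the working set from cache, and
#       sfence orders the streaming stores before the ordinary store of the
#       final 16 bytes.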
.Lnts:  movdqu (%rsi),%xmm3
        movdqu %xmm3,(%rdi)
        lea 16(%rdi),%rcx
        and $-16,%rcx
        sub %rdi,%rcx
        add %rcx,%rdi
        add %rcx,%rsi
        sub %rcx,%rdx
        mov $16,%rcx
0:      add $16,%rcx
        movdqu -32(%rsi,%rcx),%xmm3
        movntdq %xmm3,-32(%rdi,%rcx)
        cmp %rcx,%rdx
        ja 0b
        sfence
        movdqu -16(%rsi,%rdx),%xmm3
        movdqu %xmm3,-16(%rdi,%rdx)
        pxor %xmm3,%xmm3
        jmp .L0
        .size __memcpy,.-__memcpy
        .type __memcpy,@function
        .globl __memcpy
        .hidden __memcpy

        .section .initro.300._init_memcpy,"a",@progbits
__memcpytab.ro:
        .byte .L0-.Lanchorpoint
        .byte .L1-.Lanchorpoint
        .byte .L2-.Lanchorpoint
        .byte .L3-.Lanchorpoint
        .rept 4
        .byte .L4-.Lanchorpoint
        .endr
        .rept 8
        .byte .L8-.Lanchorpoint
        .endr
        .rept 16
        .byte .L16-.Lanchorpoint
        .endr
        .equ .L__memcpytab.ro.size,.-__memcpytab.ro
        .size __memcpytab.ro,.-__memcpytab.ro
        .type __memcpytab.ro,@object
        .if .L__memcpytab.ro.size % 8
        .error "memcpytab alignment wrong"
        .endif
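
#       The four bytes below are not part of the size table: they are the
#       candidate targets for table entry 32 (the n>=32 bulk copy), indexed
#       by __memjmpinit with (ERMS ? 1 : 0) | (AVX ? 2 : 0).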
        .byte .L16-.Lanchorpoint        # SSE2
        .byte .L16r-.Lanchorpoint       # SSE2 + ERMS
        .byte .L32-.Lanchorpoint        # AVX
        .byte .L32r-.Lanchorpoint       # AVX + ERMS
        .byte 0,0,0,0
        .previous
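
#       Writable runtime jump table: 32 quads for the exact-size cases plus
#       one extra quad for the bulk-copy entry, all filled in at startup by
#       _init_memcpy via __memjmpinit.  Being @nobits, it starts out zeroed.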
        .section .piro.bss.init.2.300._init_memcpy,"aw",@nobits
        .align 8
__memcpytab:
        .rept .L__memcpytab.ro.size
        .quad 0
        .endr
        .quad 0
        .size __memcpytab,.-__memcpytab
        .type __memcpytab,@object
        .previous

        .section .init.300._init_memcpy,"ax",@progbits
        .align 8
_init_memcpy:
#       I had to add these 2 lines to match the ABI of __memjmpinit;
#       apparently cosmopolitan does some other trickery to pass these.
#       These 2 work for a static link.
## ifndef __PIC__
        mov $__memcpytab, %rdi
        mov $__memcpytab.ro, %rsi
## else
#       mov __memcpytab@GOTPCREL(%rip), %rdi
#       mov __memcpytab.ro@GOTPCREL(%rip), %rsi
## endif
# -----------------------
#       pushpop .L__memcpytab.ro.size,%rcx
        push $.L__memcpytab.ro.size
        pop %rcx
## ifndef __PIC__
        mov $.Lanchorpoint,%edx
## else
#       mov .Lanchorpoint@GOTPCREL(%rip),%edx
## endif
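
#       The testb below is the hand-expanded form of cosmopolitan's
#       X86_HAVE(AVX): AVX is bit 28 of CPUID.01H:ECX.  __kcpuids stores four
#       dwords (EAX,EBX,ECX,EDX) per leaf and leaf 1 is row 1, so the byte
#       holding that bit is at offset 1*16 + 2*4 + 28/8 = 27 and the mask is
#       1 << (28 % 8) = 0x10.  The resulting ZF is the "!ZF if AVX available"
#       argument that __memjmpinit expects.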
        testb $1 << (28 % 8), 28 / 8 + 1 * 16 + 2 * 4 +__kcpuids(%rip)
#       testb X86_HAVE(AVX)+__kcpuids(%rip)
## ifndef __PIC__
        call __memjmpinit
## else
#       call *__memjmpinit@GOTPCREL(%rip)
## endif
        ret                             # called from .init_array, so return
#       .init.end 300,_init_memcpy
        .size _init_memcpy,.-_init_memcpy
        .type _init_memcpy,@function
        .globl _init_memcpy
        .previous
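
#       Startup flow in this test (a sketch, assuming the usual .init_array
#       semantics): the constructor entry at the bottom of this file runs
#       _init_memcpy, which inspects __kcpuids (filled in beforehand only if
#       _init_kcpuids is also registered or called explicitly) and lets
#       __memjmpinit turn the 8-bit offset table into absolute jump targets.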

# cosmopolitan libc/nexgen32e/memjmpinit.S

        .section .text.startup,"ax",@progbits

# Initializes jump table for memset() and memcpy().
#
# @param !ZF if required cpu vector extensions are available
# @param rdi is address of 64-bit jump table
# @param rsi is address of 8-bit jump initializers
# @param rdx is address of indirect branch
# @param ecx is size of jump table
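#
#       For each of the %ecx one-byte initializers the target is rebuilt as
#       %rdx (the anchor point) plus the offset and stored as a 64-bit entry,
#       i.e. roughly: do { *table++ = anchor + *init++; } while (--count);
#       then one extra entry is appended for the bulk-copy routine, chosen
#       from the four trailing initializer bytes by CPU feature.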
__memjmpinit:
        push %rbp
        mov %rsp,%rbp
        setnz %r8b                      # r8b = 1 if the tested extension (AVX) is present
        shl %r8b                        # r8b = 2 if AVX, else 0
0:      xor %eax,%eax
        lodsb                           # next 8-bit offset from the .ro table
        add %rdx,%rax                   # offset + anchor = absolute target
        stosq                           # store 64-bit jump table entry
#       .loop 0b
        .byte 0x83,0xe9,0x01            # sub $1,%ecx
        jnz 0b
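
#       Select the bulk-copy entry: %al becomes 1 when ERMS is present
#       (bit 9 of CPUID.07H:EBX, hence offset 3*16 + 1*4 + 9/8 = 53 and mask
#       1 << (9 % 8) = 2 in __kcpuids), %r8b contributes 2 for AVX, and the
#       two are or'ed into an index 0..3 selecting one of the four variant
#       bytes that follow the 32-entry initializer table.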
        xor %eax,%eax
        testb $1 << (9 % 8), 9 / 8 + 3 * 16 + 1 * 4 +__kcpuids(%rip)
#       testb X86_HAVE(ERMS)+__kcpuids(%rip)
        setnz %al
        or %r8b,%al
        mov (%rsi,%rax),%al             # pick the chosen variant's 8-bit offset
        add %rdx,%rax
        stosq                           # extra entry: the n>=32 bulk-copy routine
        lodsq
        pop %rbp
        ret
        .size __memjmpinit,.-__memjmpinit
        .type __memjmpinit,@function
        .global __memjmpinit
        .hidden __memjmpinit
#       .endfn __memjmpinit,globl,hidden
#       .source __FILE__

# cosmopolitan: libc/nexgen32e/kcpuids.S

        .section .piro.bss.init.2.201._init_kcpuids,"aw",@nobits
        .align 8
__kcpuids:                              # one row of EAX,EBX,ECX,EDX per leaf
        .long 0,0,0,0                   # EAX=0 (Basic Processor Info)
        .long 0,0,0,0                   # EAX=1 (Processor Info)
        .long 0,0,0,0                   # EAX=2
        .long 0,0,0,0                   # EAX=7 (Extended Features)
        .long 0,0,0,0                   # EAX=0x80000001 (NexGen32e)
        .long 0,0,0,0                   # EAX=0x80000007 (APM)
        .long 0,0,0,0                   # EAX=0x16 (CPU Frequency)
        .size __kcpuids,.-__kcpuids
        .type __kcpuids,@object
        .global __kcpuids
        .previous
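
#       _init_kcpuids pushes the leaves it cares about onto the stack
#       (terminated by 0), pops and probes each supported one with CPUID,
#       and finally clears the AVX/AVX2 feature bits when the OS has not
#       enabled the SSE+AVX state components in XCR0, so the table never
#       advertises vector extensions the kernel will not context-switch.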
        .section .init.201._init_kcpuids,"ax",@progbits
_init_kcpuids:
        push %rbx
        push $0                         # end-of-list sentinel
        push $0x16
        push $0xffffffff80000007
        push $0xffffffff80000001
        push $7
        push $2
        push $1
#       rofl0r: added this line
## ifndef __PIC__
        mov $__kcpuids, %rdi
## else
#       mov __kcpuids@GOTPCREL(%rip), %rdi
## endif
        mov %rdi,%r8                    # keep the table base; stos advances %rdi
# -----------------------
        xor %eax,%eax
1:      xor %ecx,%ecx
        cpuid
        stos %eax,(%rdi)
#       stosl
        xchg %eax,%ebx
        stosl
        xchg %eax,%ecx
        stosl
        xchg %eax,%edx
        stosl
2:      pop %rax                        # next leaf from the stack list
        test %eax,%eax                  # zero is the end-of-list sentinel
        jz 3f
        cmp 0 * 16 + 0 * 4(%r8),%al     # low byte vs. CPUID.0 max basic leaf
        jbe 1b                          # supported (or an extended leaf): probe it
        add $4*4,%rdi                   # unsupported: leave its row zeroed
        jmp 2b
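
#       Feature-bit post-processing: AVX (leaf 1 ECX bit 28) is kept only if
#       OSXSAVE (leaf 1 ECX bit 27) is set and XGETBV reports that the OS
#       enabled both the SSE and AVX state components in XCR0 (bits 1 and 2,
#       i.e. 0x06); otherwise the AVX bit and the AVX2 bit (leaf 7 EBX bit 5)
#       are cleared with btr.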
3:      nop
        testb $1 << (28 % 8), 28 / 8 + 1 * 16 + 2 * 4(%r8)      # AVX advertised?
        jz 5f
        testb $1 << (27 % 8), 27 / 8 + 1 * 16 + 2 * 4(%r8)      # OSXSAVE?
        jz 4f
        xor %ecx,%ecx
        xgetbv                          # XCR0 into edx:eax
        and $0x02|0x04,%eax
        cmp $0x02|0x04,%eax             # SSE and AVX state both OS-enabled?
        je 5f
4:      btr $28,1 * 16 + 2 * 4(%r8)     # drop AVX
        btr $5,3 * 16 + 1 * 4(%r8)      # drop AVX2
5:      pop %rbx
        ret
        .size _init_kcpuids,.-_init_kcpuids
        .type _init_kcpuids,@function
        .globl _init_kcpuids

        .section .init_array,"aw"
        .align 8
#       .quad _init_kcpuids
        .quad _init_memcpy