# Copyright 2020 Justine Alexandra Roberts Tunney
#
# Permission to use, copy, modify, and/or distribute this software for
# any purpose with or without fee is hereby granted, provided that the
# above copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
# PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
# DEST and SRC must not overlap, unless DEST<=SRC.
# @param rdx is number of bytes
# @return original rdi copied to rax
	.size	mymemcpy,.-mymemcpy
	.type	mymemcpy,@function
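# At the C level this follows the standard memcpy contract; a hypothetical
# call (variable names are illustrative, not part of this file):
#   void *r = mymemcpy(dst, src, n);	/* r == dst */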
# Copies memory w/ minimal impact ABI.
# @param rdx is number of bytes
# @clob flags,rcx,xmm3,xmm4
	mov	$.L__memcpytab.ro.size,%ecx
	jmp	*__memcpytab(,%rcx,8)
#	shl	$3,%rcx				# mul by 8
#	add	__memcpytab@GOTPCREL(%rip),%rcx
##	jmp	*__memcpytab@GOTPCREL(%rip)(,%rcx,8)
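# Dispatch note: %rcx indexes __memcpytab, a table of 8-byte code pointers,
# hence the ,8 scale on the indirect jump.  Sizes up to 16 bytes get
# dedicated handlers (.L0 .. .L16); larger copies presumably land on the
# trailing entries that _init_memcpy installs from __memcpytab.ro according
# to the CPU feature bits probed into __kcpuids.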
.L32:	vmovdqu	-32(%rsi,%rdx),%ymm4
	vmovdqu	-64(%rsi,%rcx),%ymm3
	vmovdqu	%ymm3,-64(%rdi,%rcx)
	vmovdqu	%ymm4,-32(%rdi,%rdx)
	vxorps	%ymm4,%ymm4,%ymm4
	vxorps	%ymm3,%ymm3,%ymm3
.L16:	movdqu	-16(%rsi,%rdx),%xmm4
	movdqu	-32(%rsi,%rcx),%xmm3
	movdqu	%xmm3,-32(%rdi,%rcx)
	movdqu	%xmm4,-16(%rdi,%rdx)
	mov	-8(%rsi,%rdx),%rbx
	mov	%rbx,-8(%rdi,%rdx)
	mov	-4(%rsi,%rdx),%ebx
	mov	%ebx,-4(%rdi,%rdx)
#### if 1 || defined(TINY)
#	cmp	kHalfCache3(%rip),%rdx
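# (kHalfCache3 is Cosmopolitan's half-of-L3-cache-size variable; upstream,
# copies at least that large take the .Lnts path below so its movntdq
# non-temporal stores bypass the cache rather than polluting it.  The
# comparison appears to be commented out in this version.)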
.Lnts:	movdqu	(%rsi),%xmm3
	movdqu	-32(%rsi,%rcx),%xmm3
	movntdq	%xmm3,-32(%rdi,%rcx)
	movdqu	-16(%rsi,%rdx),%xmm3
	movdqu	%xmm3,-16(%rdi,%rdx)
	.size	__memcpy,.-__memcpy
	.type	__memcpy,@function

	.section .initro.300._init_memcpy,"a",@progbits
	.byte	.L0-.Lanchorpoint
	.byte	.L1-.Lanchorpoint
	.byte	.L2-.Lanchorpoint
	.byte	.L3-.Lanchorpoint
	.byte	.L4-.Lanchorpoint
	.byte	.L8-.Lanchorpoint
	.byte	.L16-.Lanchorpoint
	.equ	.L__memcpytab.ro.size,.-__memcpytab.ro
	.size	__memcpytab.ro,.-__memcpytab.ro
	.type	__memcpytab.ro,@object
	.if	.L__memcpytab.ro.size % 8
	.error	"memcpytab alignment wrong"
	.endif
	.byte	.L16-.Lanchorpoint		# SSE2
	.byte	.L16r-.Lanchorpoint		# SSE2 + ERMS
	.byte	.L32-.Lanchorpoint		# AVX
	.byte	.L32r-.Lanchorpoint		# AVX + ERMS
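# The four entries after .L__memcpytab.ro.size are candidate handlers for
# copies larger than 16 bytes; at startup, _init_memcpy/__memjmpinit appear
# to pick among them based on the SSE2/ERMS/AVX bits probed into __kcpuids.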
	.section .piro.bss.init.2.300._init_memcpy,"aw",@nobits
	.rept	.L__memcpytab.ro.size
	.size	__memcpytab,.-__memcpytab
	.type	__memcpytab,@object
	.section .init.300._init_memcpy,"ax",@progbits
# I had to add these two lines to match the ABI of __memjmpinit; apparently
# cosmopolitan does some other trickery to pass these. These two work for a
# static link.
	mov	$__memcpytab,%rdi
	mov	$__memcpytab.ro,%rsi
#	mov	__memcpytab@GOTPCREL(%rip),%rdi
#	mov	__memcpytab.ro@GOTPCREL(%rip),%rsi
# -----------------------
#	pushpop	.Lmemcpytab.ro.size,%rcx
	push	$.L__memcpytab.ro.size
#	mov	.Lanchorpoint@GOTPCREL(%rip),%edx
	mov	$.Lanchorpoint,%edx
	testb	$1 << (28 % 8), 28 / 8 + 1 * 16 + 2 * 4 + __kcpuids(%rip)
#	testb	X86_HAVE(AVX)+__kcpuids(%rip)
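# Offset math, given the __kcpuids layout (16 bytes per leaf, stored as
# EAX,EBX,ECX,EDX dwords): leaf 1 is row 1 (1*16), ECX is dword 2 (2*4),
# and AVX is CPUID.1:ECX bit 28, i.e. bit 28%8=4 of byte 28/8=3.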
#	call	*__memjmpinit@GOTPCREL(%rip)
#	.init.end 300,_init_memcpy
	.size	_init_memcpy,.-_init_memcpy
	.type	_init_memcpy,@function
# cosmopolitan libc/nexgen32e/memjmpinit.S
	.section .text.startup,"ax",@progbits
# Initializes jump table for memset() and memcpy().
# @param !ZF if required cpu vector extensions are available
# @param rdi is address of 64-bit jump table
# @param rsi is address of 8-bit jump initializers
# @param rdx is address of indirect branch
# @param ecx is size of jump table
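# A rough C-level sketch of the rebasing loop this routine implements, based
# on the parameters above (function and variable names are illustrative
# assumptions; the CPU-feature selection of the big-copy slot is omitted):
#
#   void memjmpinit_sketch(uint64_t *table, const uint8_t *inits,
#                          uintptr_t anchor, unsigned n) {
#     for (unsigned i = 0; i < n; ++i)
#       table[i] = anchor + inits[i];	/* 8-bit offsets -> code pointers */
#   }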
	.byte	0x83,0xe9,0x01			# sub $1,%ecx
	testb	$1 << (9 % 8), 9 / 8 + 3 * 16 + 1 * 4 + __kcpuids(%rip)
#	testb	X86_HAVE(ERMS)+__kcpuids(%rip)
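# Same offset scheme: ERMS (Enhanced REP MOVSB) is CPUID.7:EBX bit 9; the
# leaf-7 row sits at 3*16, EBX at 1*4, so test bit 9%8=1 of byte 9/8=1.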
	.size	__memjmpinit,.-__memjmpinit
	.type	__memjmpinit,@function
#	.endfn	__memjmpinit,globl,hidden

# cosmopolitan: libc/nexgen32e/kcpuids.S
	.section .piro.bss.init.2.201._init_kcpuids,"aw",@nobits
	.long	0,0,0,0				# EAX=0 (Basic Processor Info)
	.long	0,0,0,0				# EAX=1 (Processor Info)
	.long	0,0,0,0				# EAX=2
	.long	0,0,0,0				# EAX=7 (Extended Features)
	.long	0,0,0,0				# EAX=0x80000001 (NexGen32e)
	.long	0,0,0,0				# EAX=0x80000007 (APM)
	.long	0,0,0,0				# EAX=16h (CPU Frequency)
	.size	__kcpuids,.-__kcpuids
	.type	__kcpuids,@object
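# Each row above holds one CPUID leaf result as four dwords (EAX,EBX,ECX,EDX),
# i.e. 16 bytes per leaf.  The feature tests in this file compute byte offsets
# as row*16 + register*4 + bit/8 and test the mask 1<<(bit%8) in that byte.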
	.section .init.201._init_kcpuids,"ax",@progbits
	push	$0xffffffff80000007
	push	$0xffffffff80000001
# rofl0r: added this line
#	mov	__kcpuids@GOTPCREL(%rip),%rdi
# -----------------------
	test	%eax,%eax			# EAX = stacklist->pop()
	jz	3f				# EAX != 0 (EOL sentinel)
	cmp	0 * 16 + 0 * 4(%r8),%al		# EAX <= CPUID.0 max leaf
	jbe	1b				# CPUID too new to probe
	testb	$1 << (28 % 8), 28 / 8 + 1 * 16 + 2 * 4(%r8)
	testb	$1 << (27 % 8), 27 / 8 + 1 * 16 + 2 * 4(%r8)
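# Bits 28 and 27 of CPUID.1:ECX are AVX and OSXSAVE respectively; AVX is only
# usable when the OS has enabled the extended register state via XSAVE, which
# is presumably what gates the bit-clearing at label 4 below.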
4:	btr	$28,1 * 16 + 2 * 4(%r8)
	btr	$5,3 * 16 + 1 * 4(%r8)
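# Label 4 revokes the features when they can't be used: it clears the AVX bit
# (leaf-1 ECX, bit 28) and bit 5 of leaf-7 EBX (AVX2), so later feature tests
# treat these extensions as unavailable.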
	.size	_init_kcpuids,.-_init_kcpuids
	.type	_init_kcpuids,@function

	.section .init_array,"aw"