    Copyright © 1995-2012, The AROS Development Team. All rights reserved.
/*****************************************************************************
        AROS_LH3(void, CopyMem_SSE,
        AROS_LHA(CONST_APTR, source, A0),
        AROS_LHA(APTR, destination, A1),
        AROS_LHA(IPTR, len, D0),
        struct ExecBase *, SysBase, 104, Exec)  (if we're lucky)

        Copy some data from one location to another in memory, using an
        SSE-optimised copying method when there is enough data to make it
        worthwhile.

        source      - Pointer to the source area
        destination - Pointer to the destination area
        len         - Number of bytes to copy

        The source and destination areas *ARE* allowed to overlap.
******************************************************************************/
#include "aros/i386/asm.h"
#include <aros/config.h>
        .globl  AROS_SLIB_ENTRY(CopyMem_SSE,Exec,104)
        _FUNCTION(AROS_SLIB_ENTRY(CopyMem_SSE,Exec,104))
AROS_SLIB_ENTRY(CopyMem_SSE,Exec,104):
        jmp     AROS_SLIB_ENTRY(CopyMem,Exec,104)       /* the non-SSE version is faster for small copies */
        .globl  AROS_SLIB_ENTRY(CopyMemQuick_SSE,Exec,105)
        _FUNCTION(AROS_SLIB_ENTRY(CopyMemQuick_SSE,Exec,105))
AROS_SLIB_ENTRY(CopyMemQuick_SSE,Exec,105):
        jmp     AROS_SLIB_ENTRY(CopyMemQuick,Exec,105)  /* the non-SSE version is faster for small copies */
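
/*
** Both entry points hand small requests straight to the plain
** CopyMem/CopyMemQuick code, where the SSE set-up cost below would not pay
** off; only sufficiently large copies continue into the SSE path.
*/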
/*
** Okay, so the user wants to copy at least 4096 bytes.
**
** Align the stack so the xmm registers can be saved with 16-byte stores.
*/
        movntps %xmm1,16(%esp)
        movntps %xmm2,32(%esp)
        movntps %xmm3,48(%esp)
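
/*
** The xmm registers used below are parked in a 16-byte-aligned scratch area
** on the stack (movntps insists on that alignment); the matching movaps
** reloads at the end of the routine restore them.
*/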
        prefetchnta 128(%esi)
        prefetchnta 160(%esi)
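
/*
** prefetchnta pulls the upcoming source cache lines in with a non-temporal
** hint, so streaming this data through does not evict everything else from
** the caches.
*/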
/*
** check memory alignment
*/
        movl    %eax,%ecx               /* set count register */
        subl    %eax,%ebx               /* update "to-do" length */
        movl    %esi,%eax               /* now test the source pointer's alignment */
        subl    $64,%ebx                /* trick: pre-subtract 64 so the loop can test the sign flag instead of using cmp */
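
/*
** The -64 bias means every 64-byte round can finish with just
** "subl $64,%ebx; jge": the result turns negative exactly when fewer than 64
** bytes are left, so no separate cmp is required. For example, with 150
** bytes to do the biased count is 86; after one block 86-64=22 keeps the
** loop going, after the next 22-64=-42 stops it, and -42+64=22 bytes remain
** for the byte-wise tail.
*/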
.unaligned_copy:
        prefetchnta 160(%esi)           /* we prefetched above already, but the pointers may have moved during alignment */
        movups  (%esi),%xmm0            /* transfer the 64-byte block 16 bytes at a time */
        movups  16(%esi),%xmm1
        movups  32(%esi),%xmm2
        movups  48(%esi),%xmm3
        movntps %xmm0,(%edi)
        movntps %xmm1,16(%edi)
        movntps %xmm2,32(%edi)
        movntps %xmm3,48(%edi)
        addl    $64,%esi                /* advance to the next block */
        addl    $64,%edi
        subl    $64,%ebx                /* update count */
        jge     .unaligned_copy         /* continue */
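
/*
** Same 64-byte loop as above, used when the source turned out to be 16-byte
** aligned, so the cheaper aligned movaps loads can replace movups.
*/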
.aligned_copy:
        prefetchnta 160(%esi)           /* we prefetched above already, but the pointers may have moved during alignment */
        movaps  (%esi),%xmm0            /* transfer the 64-byte block 16 bytes at a time */
        movaps  16(%esi),%xmm1
        movaps  32(%esi),%xmm2
        movaps  48(%esi),%xmm3
        movntps %xmm0,(%edi)
        movntps %xmm1,16(%edi)
        movntps %xmm2,32(%edi)
        movntps %xmm3,48(%edi)
        addl    $64,%esi                /* advance to the next block */
        addl    $64,%edi
        subl    $64,%ebx                /* update count */
        jge     .aligned_copy           /* continue */
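
/*
** If the areas overlap with the destination above the source, copying
** forwards would overwrite source data before it has been read, so the copy
** is done backwards instead - that is what the descending variant below is
** for.
*/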
/*
** adjust pointers first
*/
        prefetchnta -191(%esi)
        prefetchnta -159(%esi)
        prefetchnta -127(%esi)
        prefetchnta -95(%esi)
        prefetchnta -63(%esi)
        prefetchnta -31(%esi)
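
/*
** The pointers now sit at the end of the areas, so the cache lines just
** below them are the ones fetched ahead of time here.
*/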
/*
** check memory alignment
*/
        movl    %eax,%ecx               /* set count register */
        subl    %eax,%ebx               /* update "to-do" length */
        std                             /* indicate we want to copy backwards */
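
/*
** With the direction flag set, string copies decrement %esi/%edi as they go,
** i.e. they run backwards through memory.
*/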
        movl    %esi,%eax               /* now test the source pointer's alignment */
        subl    $64,%ebx                /* trick again: pre-subtract 64 and test the sign flag instead of using cmp */
        je      .aligned_reverse_copy
.unaligned_reverse_copy:
        prefetchnta -191(%esi)          /* we prefetched above already, but the pointers may have moved during alignment */
        movups  -63(%esi),%xmm0         /* transfer the 64-byte block 16 bytes at a time */
        movups  -47(%esi),%xmm1
        movups  -31(%esi),%xmm2
        movups  -15(%esi),%xmm3
        movntps %xmm0,-63(%edi)
        movntps %xmm1,-47(%edi)
        movntps %xmm2,-31(%edi)
        movntps %xmm3,-15(%edi)
        subl    $64,%esi                /* step back to the previous block */
        subl    $64,%edi
        subl    $64,%ebx                /* update count */
        jge     .unaligned_reverse_copy /* continue */
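
/*
** The same descending loop, for the case where the source alignment allows
** the aligned movaps loads.
*/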
.aligned_reverse_copy:
        prefetchnta -192(%esi)          /* we prefetched above already, but the pointers may have moved during alignment */
        movaps  -63(%esi),%xmm0         /* transfer the 64-byte block 16 bytes at a time */
        movaps  -47(%esi),%xmm1
        movaps  -31(%esi),%xmm2
        movaps  -15(%esi),%xmm3
        movntps %xmm0,-63(%edi)
        movntps %xmm1,-47(%edi)
        movntps %xmm2,-31(%edi)
        movntps %xmm3,-15(%edi)
        subl    $64,%esi                /* step back to the previous block */
        subl    $64,%edi
        subl    $64,%ebx                /* update count */
        jge     .aligned_reverse_copy   /* continue */
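
/*
** Once jge falls through, %ebx has gone negative: %ebx+64 bytes (i.e. fewer
** than 64) are still outstanding for the byte-wise wrap-up below.
*/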
/*
** prefetch the stack save area - we will be leaving soon
**
** copy the remaining bytes
** note: no new std is needed here - the direction flag is still set from above
*/
/*
** restore everything
*/
        movaps  16(%esp),%xmm1
        movaps  32(%esp),%xmm2
        movaps  48(%esp),%xmm3