[AROS.git] arch/x86_64-all/exec/copymem.c
/*
    Copyright © 2009-2012, The AROS Development Team. All rights reserved.
    $Id$

    Desc: Copy memory.
    Lang: english
*/
#define DEBUG 0
#include <aros/debug.h>

#include <aros/libcall.h>
#include <proto/exec.h>

/* See rom/exec/copymem.c for documentation */
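/*
    Summary of the implementation below:
      1. Prefetch the start of the source block.
      2. For copies of 64 bytes or more, bring the source up to a 16-byte
         boundary when doing so also aligns the destination, then stream
         64 bytes per iteration through four XMM registers, storing with
         non-temporal movntps instructions.
      3. Copy whatever is left (or everything, if the destination cannot
         be 16-byte aligned) with the small rep-movs based helper.
*/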
/* An SSE (XMM) register is 16 bytes wide; the mask selects the low four
   address bits used for the alignment tests below. */
#define SSE_REG_SIZE 16
#define SSE_REG_MASK 0xF
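/* sfence orders the weakly-ordered non-temporal stores issued below;
   emms resets the MMX/x87 tag state (the copy loops themselves only
   touch XMM registers). */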
#define MEMFENCE __asm__ __volatile__ ("sfence":::"memory")
#define MMENABLE __asm__ __volatile__ ("emms":::"memory")
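/* Copy 'size' bytes one at a time with "rep movsb"; used for blocks of
   less than four bytes. Expects a local variable 'dummy' to receive the
   clobbered count register. */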
#define __byte_memcpy(src,dst,size)                          \
{                                                            \
    __asm__ __volatile__(                                    \
        "    rep; movsb"                                     \
        : "=&D" (dst), "=&S" (src), "=&c" (dummy)            \
        : "0" (dst), "1" (src), "2" (size)                   \
        : "memory");                                         \
}
#define __long_memcpy(src,dst,size)                          \
{                                                            \
    __asm__ __volatile__(                                    \
        "    rep; movsl" "\n\t"                              \
        "    testb $2,%b6" "\n\t"                            \
        "    je 1f" "\n\t"                                   \
        "    movsw" "\n"                                     \
        "1:  testb $1,%b6" "\n\t"                            \
        "    je 2f" "\n\t"                                   \
        "    movsb" "\n"                                     \
        "2:"                                                 \
        : "=&D" (dst), "=&S" (src), "=&c" (dummy)            \
        : "0" (dst), "1" (src), "2" (size >> 2), "q" (size)  \
        : "memory");                                         \
}
static __inline__ void __small_memcpy(const void * src, void * dst, ULONG size)
{
    register unsigned long int dummy;

    if( size < 4 ) {
        D(bug("[Exec] __byte_memcpy(%p, %p, %ld)\n", src, dst, size));
        __byte_memcpy(src, dst, size);
    }
    else
    {
        D(bug("[Exec] __long_memcpy(%p, %p, %ld)\n", src, dst, size));
        __long_memcpy(src, dst, size);
    }
}
AROS_LH3I(void, CopyMem,
    AROS_LHA(CONST_APTR, source, A0),
    AROS_LHA(APTR, dest, A1),
    AROS_LHA(IPTR, size, D0),
    struct ExecBase *, SysBase, 104, Exec)
{
    AROS_LIBFUNC_INIT
    if (!size) return;

    ULONG lcnt = (size >> 6); /* size / 64 */

    const void *src = source;
    void *dst = dest;

    D(bug("[Exec] CopyMem(%p, %p, %ld)\n", src, dst, size));
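    /* Prefetch the first ~288 bytes of the source with non-temporal hints,
       warming up the read stream without displacing existing cache lines
       more than necessary. */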
    __asm__ __volatile__ (
        "   prefetchnta (%0)\n"
        "   prefetchnta 32(%0)\n"
        "   prefetchnta 64(%0)\n"
        "   prefetchnta 96(%0)\n"
        "   prefetchnta 128(%0)\n"
        "   prefetchnta 160(%0)\n"
        "   prefetchnta 192(%0)\n"
        "   prefetchnta 256(%0)\n"
        "   prefetchnta 288(%0)\n"
        :
        : "r" (src) );
    if ((lcnt > 0) && (size >= (SSE_REG_SIZE * 4)))
    {
        D(bug("[Exec] CopyMem: Using SSE Copy.\n"));

        ULONG alignsize = ((SSE_REG_SIZE - ((IPTR)src & SSE_REG_MASK)));

        if ((((IPTR)src & SSE_REG_MASK) != 0) && (((IPTR)(dst + alignsize) & SSE_REG_MASK) == 0))
        {
            D(bug("[Exec] CopyMem: Aligning src to %d byte boundary (%d bytes) .. \n", SSE_REG_SIZE, alignsize));

            __small_memcpy(src, dst, alignsize);

            size -= alignsize;
            lcnt = (size >> 6); /* size / 64 */
            src += alignsize;
            dst += alignsize;
        }
        if (lcnt > 0) {
            if ((((IPTR)src & SSE_REG_MASK) == 0) && (((IPTR)dst & SSE_REG_MASK) == 0))
            {
                /*
                    SRC and DST are both aligned on a 16-byte boundary.
                    We can use movaps instead of movups since the
                    alignment constraints are met (a general-protection
                    fault would be triggered otherwise).
                */
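                /* The movntps stores below are non-temporal: they bypass
                   the cache on the assumption that the destination will
                   not be read back immediately, which is also why the
                   sfence at the end of the function is needed. */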
                size -= (lcnt << 6);
                for( ; lcnt > 0; lcnt--)
                {
                    D(bug("[Exec] CopyMem: SSE Aligned-Copy %p to %p.\n", src, dst));

                    __asm__ __volatile__ (
                        "   prefetchnta 320(%0)\n"
                        "   prefetchnta 352(%0)\n"
                        "   movaps (%0), %%xmm0\n"
                        "   movaps 16(%0), %%xmm1\n"
                        "   movaps 32(%0), %%xmm2\n"
                        "   movaps 48(%0), %%xmm3\n"
                        "   movntps %%xmm0, (%1)\n"
                        "   movntps %%xmm1, 16(%1)\n"
                        "   movntps %%xmm2, 32(%1)\n"
                        "   movntps %%xmm3, 48(%1)\n"
                        :
                        : "r" (src), "r" (dst)
                        : "memory");

                    src += (SSE_REG_SIZE * 4);
                    dst += (SSE_REG_SIZE * 4);
                }
            }
            else if (((IPTR)dst & SSE_REG_MASK) == 0)
            {
                /*
                    SRC is unaligned and DST is aligned on a 16-byte
                    boundary, so use unaligned loads (movups) with
                    aligned non-temporal stores.
                */
                size -= (lcnt << 6);
                for( ; lcnt > 0; lcnt--)
                {
                    D(bug("[Exec] CopyMem: SSE Unaligned-Copy %p to %p.\n", src, dst));

                    __asm__ __volatile__ (
                        "   prefetchnta 320(%0)\n"
                        "   prefetchnta 352(%0)\n"
                        "   movups (%0), %%xmm0\n"
                        "   movups 16(%0), %%xmm1\n"
                        "   movups 32(%0), %%xmm2\n"
                        "   movups 48(%0), %%xmm3\n"
                        "   movntps %%xmm0, (%1)\n"
                        "   movntps %%xmm1, 16(%1)\n"
                        "   movntps %%xmm2, 32(%1)\n"
                        "   movntps %%xmm3, 48(%1)\n"
                        :
                        : "r" (src), "r" (dst)
                        : "memory");

                    src += (SSE_REG_SIZE * 4);
                    dst += (SSE_REG_SIZE * 4);
                }
            }
        }
    }
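    /* If the destination is not (and cannot be brought to) 16-byte
       alignment, neither SSE branch above runs and 'size' is still the
       full remaining length, so the fallback below copies everything. */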
    if (size > 0)
    {
        D(bug("[Exec] CopyMem: Copy remaining %ld bytes.\n", size));
        __small_memcpy(src, dst, size);
    }

    /*
        Fence memory accesses, since the movntps stores above are
        weakly-ordered.
    */
    MEMFENCE;

    /*
        Allow FPU use again (emms clears the MMX state).
    */
    MMENABLE;

    D(bug("[Exec] CopyMem: Finished.\n"));

    AROS_LIBFUNC_EXIT
} /* CopyMem */
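/*
    Illustrative usage sketch (not part of this file): CopyMem() is the
    public exec.library entry point, so a caller only needs proto/exec.h.
    As with the generic rom/exec implementation, callers should not rely
    on overlapping source and destination areas being handled.

        #include <proto/exec.h>

        static UBYTE in[256], out[256];

        void demo(void)
        {
            CopyMem(in, out, sizeof(in));
        }
*/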