 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
#include <sys/asm_linkage.h>
ANSI_PRAGMA_WEAK(memmove,function)
ANSI_PRAGMA_WEAK(memcpy,function)
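/
/ For reference, the standard C interfaces implemented by this file
/ (prototypes as declared in <string.h>):
/	void *memcpy(void *dst, const void *src, size_t n);
/	void *memmove(void *dst, const void *src, size_t n);
/ memmove must handle overlapping buffers; memcpy need not.
/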
movl 0+12(%esp),%ecx / get number of bytes to move
pushl %esi / save off %edi, %esi
pushl %edi / and move destination
movl 8+ 4(%esp),%edi / destination buffer address
movl 8+ 8(%esp),%esi / source buffer address
cmpl %esi,%edi / if (source addr > dest addr)
leal -1(%esi,%ecx),%edx / %edx = src + size - 1
jbe .memcpy_post / jump if dst <= src
cmpl %edx,%edi
jbe .CopyLeft / jump if dst <= src + size - 1
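/
/ Illustrative C sketch of the dispatch above (informal):
/	if (dst <= src || dst > src + size - 1)
/		copy forward;		/ memcpy path, no harmful overlap
/	else
/		copy backward;		/ .CopyLeft: dst overlaps the tail of src
/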
movl 8+4(%esp),%edi / %edi = dest address
movl %edi, %eax / save this
movl 8+8(%esp),%esi / %esi = source address
movl 8+12(%esp),%ecx / %ecx = length of string
/ %edx scratch register
/ %eax scratch register
nop / this really helps, don't know why
/ note: cld is perf death on P4
ja .move_sse / not worth doing sse for less
movl %ecx,%edx / save byte cnt
shrl $2,%ecx / %ecx = number of words to move
rep ; smovl / move the words
andl $0x3,%edx / %edx = number of bytes left to move
jz .Return / %edx <= 3, so just unroll the loop
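/
/ Rough C equivalent of the word-copy path above (sketch only):
/	words = n >> 2;			/ whole 32-bit words
/	... rep ; smovl moves them ...
/	n &= 3;				/ 0..3 trailing bytes remain
/	if (n == 0) return (dst);	/ else fall into the unrolled
/					/ byte moves
/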
popl %edi / restore register variables
/ time to 16 byte align destination
jnz .sse_unaligned / jmp if dest is unaligned
.sse: / dest is aligned, check source
movl %ecx, %edx / get byte count
shrl $6, %edx / number of 64 byte blocks to move
jnz .sse_da / go to slow loop if source is unaligned
/ use aligned load since we're lucky
prefetcht0 568(%esi) / prefetch source & copy 64 byte at a time
prefetcht0 568(%edi) / prefetch dest & copy 64 byte at a time
movaps 0(%esi), %xmm0
movaps %xmm0, 0(%edi)
movaps 16(%esi), %xmm1
movaps %xmm1, 16(%edi)
movaps 32(%esi), %xmm2
movaps %xmm2, 32(%edi)
movaps 48(%esi), %xmm3
movaps %xmm3, 48(%edi)
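/
/ Informally, the unrolled block above is the body of a loop that
/ moves 64 bytes per iteration:
/	while (blocks--) {
/		copy 4 x 16 bytes via %xmm0-%xmm3;
/		src += 64; dst += 64;
/	}
/ with prefetcht0 pulling data 568 bytes ahead into cache.
/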
andl $63, %ecx / compute remaining bytes
movl 8+4(%esp), %eax / setup return value
/ use aligned load since we're lucky
prefetchnta 16384(%esi) / prefetch source & copy 64 byte at a time
movaps 0(%esi), %xmm0
movntps %xmm0, 0(%edi)
movaps 16(%esi), %xmm1
movntps %xmm1, 16(%edi)
movaps 32(%esi), %xmm2
movntps %xmm2, 32(%edi)
movaps 48(%esi), %xmm3
movntps %xmm3, 48(%edi)
#if defined(_SSE2_INSN)
#elif defined(_SSE_INSN)
#error "Must have either SSE or SSE2"
/ Make certain that destination buffer becomes aligned
neg %eax / subtract from 16 and get destination
andl $15, %eax / aligned on a 16 byte boundary
movl %ecx, %edx / saved count
subl %eax, %ecx / subtract from byte count
cmpl $64, %ecx / after aligning, will we still have 64 bytes?
cmovb %edx, %ecx / if not, restore original byte count,
cmovb 8+4(%esp), %eax / and restore return value,
jb .movew / and do a non-SSE move.
xchg %ecx, %eax / flip for copy
rep ; smovb / move the bytes
xchg %ecx, %eax / flip back
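/
/ Alignment arithmetic above, informally (assuming %eax holds the
/ destination address on entry to this sequence):
/	head = (-(uintptr_t)dst) & 15;	/ bytes to a 16-byte boundary
/	if (n - head < 64)
/		do a plain non-SSE copy (.movew) of the original n;
/	else
/		copy the head bytes, leaving dst 16-byte aligned and
/		n reduced by head before the SSE loop.
/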
/ use unaligned load since source doesn't line up
prefetchnta 16384(%esi) / prefetch source & copy 64 byte at a time
movups 0(%esi), %xmm0
movntps %xmm0, 0(%edi)
movups 16(%esi), %xmm1
movntps %xmm1, 16(%edi)
movups 32(%esi), %xmm2
movntps %xmm2, 32(%edi)
movups 48(%esi), %xmm3
movntps %xmm3, 48(%edi)
#if defined(_SSE2_INSN)
#elif defined(_SSE_INSN)
#error "Must have either SSE or SSE2"
/ use unaligned load since source doesn't line up
prefetcht0 568(%esi) / prefetch source & copy 64 byte at a time
movups 0(%esi), %xmm0
movaps %xmm0, 0(%edi)
movups 16(%esi), %xmm1
movaps %xmm1, 16(%edi)
movups 32(%esi), %xmm2
movaps %xmm2, 32(%edi)
movups 48(%esi), %xmm3
movaps %xmm3, 48(%edi)
/ .CopyLeft handles the memmove case where we must perform the copy backwards,
/ because of overlap between src and dst. This is not particularly optimized.
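/
/ Informal C sketch of the backward copy done here:
/	s = src + size - 1;
/	d = dst + size - 1;
/	while (size--)
/		*d-- = *s--;
/ realized with the direction flag set (std), so rep; smovb / smovl
/ decrement %esi and %edi.
/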
movl $3,%eax / heavily used constant
std / reverse direction bit (RtoL)
cmpl $12,%ecx / if (size < 12)
movl %edx,%esi / src = src + size - 1
leal -1(%ecx,%edi),%edi / dst = dst + size - 1
rep; smovb / do the byte copy
cld / reset direction flag to LtoR
popl %esi / restore registers
movl 4(%esp),%eax / set up return value
.BigCopyLeft: / } else {
movl %ecx,%esi / align source w/byte copy
leal -1(%edx,%edi),%edi
addl $1, %ecx / we need to ensure that the future
subl %ecx,%edx / copy is done on an aligned boundary
shrl $2,%ecx / do 4 byte copy RtoL
andl %eax,%edx / do 1 byte copy of what's left
jz .CleanupReturnLeft
addl %eax,%esi / rep; smovl instruction will decrement
addl %eax,%edi / %edi, %esi by four after each copy
/ adding 3 will restore pointers to byte
/ before last double word copied
/ which is where they are expected to
/ be for the single byte copy code
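/
/ Worked example of the +3 adjustment: if the last dword was copied
/ from source address A, the reverse rep; smovl leaves %esi at A - 4;
/ adding 3 moves it to A - 1, the byte just below that dword, which is
/ where the trailing byte copy expects it (and likewise for %edi on
/ the destination side).
/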
cld / reset direction flag to LtoR
popl %esi / restore registers
movl 4(%esp),%eax / set up return value