/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
	.file	"memcpy.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)
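
/
/ memmove: when dst <= src, or when the regions do not overlap at all, the
/ copy can safely be done forwards, so control falls through to the shared
/ memcpy code at .memcpy_post.  Only when dst lies inside (src, src + size)
/ does it branch to .CopyLeft for a backwards (right-to-left) copy.
/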
	ENTRY(memmove)
	movl	0+12(%esp),%ecx	/ get number of bytes to move
	pushl	%esi		/ save off %edi, %esi and move destination
	pushl	%edi
	movl	8+ 4(%esp),%edi	/ destination buffer address
	movl	8+ 8(%esp),%esi	/ source buffer address
	movl	%edi, %eax
	testl	%ecx,%ecx
	jz	.Return

	cmpl	%esi,%edi	/ if (source addr > dest addr)
	leal	-1(%esi,%ecx),%edx	/ %edx = src + size - 1
	jbe	.memcpy_post	/ jump if dst <= src
	cmpl	%edx,%edi
	jbe	.CopyLeft	/ jump if dst <= src + size - 1
	jmp	.memcpy_post

	ENTRY(memcpy)
	pushl	%esi
	pushl	%edi

	movl	8+4(%esp),%edi	/ %edi = dest address
	movl	%edi, %eax	/ save this
	movl	8+8(%esp),%esi	/ %esi = source address
	movl	8+12(%esp),%ecx	/ %ecx = length of string
				/ %edx scratch register
				/ %eax scratch register
.memcpy_post:
	nop			/ this really helps, don't know why
				/ note: cld is perf death on P4
	cmpl	$63,%ecx
	ja	.move_sse	/ not worth doing sse for less
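
/
/ .movew handles copies of up to 63 bytes (and the leftover bytes from the
/ SSE paths below): the bulk is moved a word at a time with rep smovl and
/ the remaining 0-3 bytes are covered by the unrolled byte moves that follow.
/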
.movew:
	movl	%ecx,%edx	/ save byte cnt
	shrl	$2,%ecx		/ %ecx = number of words to move
	rep ; smovl		/ move the words

	andl	$0x3,%edx	/ %edx = number of bytes left to move
	jz	.Return		/ %edx <= 3, so just unroll the loop

	movb	(%esi), %cl
	movb	%cl, (%edi)
	decl	%edx
	jz	.Return
	movb	1(%esi), %cl
	movb	%cl, 1(%edi)
	decl	%edx
	jz	.Return
	movb	2(%esi), %cl
	movb	%cl, 2(%edi)

.Return:
	popl	%edi		/ restore register variables
	popl	%esi
	ret

.move_sse:
	/
	/ time to 16 byte align destination
	/
	andl	$15, %eax
	jnz	.sse_unaligned	/ jmp if dest is unaligned
.sse:				/ dest is aligned, check source
	movl	%ecx, %edx	/ get byte count
	shrl	$6, %edx	/ number of 64 byte blocks to move
	testl	$15, %esi
	jnz	.sse_da		/ go to slow loop if source is unaligned
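
	/
	/ Copies larger than 64K take the non-temporal store loop below;
	/ movntps bypasses the caches, so a huge copy does not displace
	/ useful cached data.  Smaller copies fall through to ordinary
	/ cached stores.
	/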
	cmpl	$65535, %ecx
	ja	.sse_sa_nt_loop

	/
	/ use aligned loads since we're lucky
	/
.sse_sa_loop:
	prefetcht0 568(%esi)	/ prefetch source & copy 64 bytes at a time
	prefetcht0 568(%edi)	/ prefetch dest
	movaps	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_loop

.sse_cleanup:
	andl	$63, %ecx	/ compute remaining bytes
	movl	8+4(%esp), %eax	/ setup return value
	jz	.Return
	jmp	.movew

	/
	/ use aligned loads since we're lucky
	/
	.align 16
.sse_sa_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 bytes at a time
	movaps	(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_nt_loop
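	/
	/ movntps stores are weakly ordered, so a fence is needed before
	/ returning to guarantee the copied data is globally visible;
	/ mfence is used when SSE2 is available, otherwise sfence.
	/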
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup

	/
	/ Make certain that destination buffer becomes aligned
	/
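	/ On entry %eax holds (dest & 15); neg/and turn that into the number
	/ of bytes needed to reach the next 16 byte boundary.  If fewer than
	/ 64 bytes would remain after aligning, the original count and return
	/ value are restored and the copy finishes on the non-SSE .movew path.
	/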
.sse_unaligned:
	neg	%eax		/ subtract from 16 and get destination
	andl	$15, %eax	/ aligned on a 16 byte boundary
	movl	%ecx, %edx	/ saved count
	subl	%eax, %ecx	/ subtract from byte count
	cmpl	$64, %ecx	/ after aligning, will we still have 64 bytes?
	cmovb	%edx, %ecx	/ if not, restore original byte count,
	cmovb	8+4(%esp), %eax	/ and restore return value,
	jb	.movew		/ and do a non-SSE move.
	xchg	%ecx, %eax	/ flip for copy
	rep ; smovb		/ move the bytes
	xchg	%ecx, %eax	/ flip back
	jmp	.sse
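
	/
	/ .sse_da: destination is 16 byte aligned but the source is not, so
	/ the loops below pair unaligned loads (movups) with aligned stores.
	/ Copies larger than 64K again use the non-temporal store loop.
	/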
	.align 16
.sse_da:
	cmpl	$65535, %ecx
	jbe	.sse_da_loop

	/
	/ use unaligned loads since source doesn't line up
	/
.sse_da_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 bytes at a time
	movups	0(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup

	/
	/ use unaligned loads since source doesn't line up
	/
	.align 16
.sse_da_loop:
	prefetcht0 568(%esi)	/ prefetch source & copy 64 bytes at a time
	prefetcht0 568(%edi)
	movups	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_loop
	jmp	.sse_cleanup

	SET_SIZE(memcpy)
/
/ .CopyLeft handles the memmove case where we must perform the copy backwards,
/ because of overlap between src and dst.  This is not particularly optimized.
/
.CopyLeft:
	movl	$3,%eax			/ heavily used constant
	std				/ reverse direction bit (RtoL)
	cmpl	$12,%ecx		/ if (size < 12)
	ja	.BigCopyLeft		/ {
	movl	%edx,%esi		/     src = src + size - 1
	leal	-1(%ecx,%edi),%edi	/     dst = dst + size - 1
	rep;	smovb			/     do the byte copy
	cld				/     reset direction flag to LtoR
	popl	%edi			/ }
	popl	%esi			/ restore registers
	movl	4(%esp),%eax		/ set up return value
	ret				/ return(dba);
.BigCopyLeft:				/ } else {
	xchgl	%edx,%ecx
	movl	%ecx,%esi		/ align source w/byte copy
	leal	-1(%edx,%edi),%edi
	andl	%eax,%ecx
	jz	.SkipAlignLeft
	addl	$1, %ecx		/ we need to ensure that future
	subl	%ecx,%edx		/ copy is done on aligned boundary
	rep;	smovb
.SkipAlignLeft:
	movl	%edx,%ecx
	subl	%eax,%esi
	shrl	$2,%ecx			/ do 4 byte copy RtoL
	subl	%eax,%edi
	rep;	smovl
	andl	%eax,%edx		/ do 1 byte copy of what's left
	jz	.CleanupReturnLeft
	movl	%edx,%ecx
	addl	%eax,%esi		/ rep; smovl instruction will decrement
	addl	%eax,%edi		/ %edi, %esi by four after each copy
					/ adding 3 will restore pointers to byte
					/ before last double word copied
					/ which is where they are expected to
					/ be for the single byte copy code
	rep;	smovb
.CleanupReturnLeft:
	cld				/ reset direction flag to LtoR
	popl	%edi
	popl	%esi			/ restore registers
	movl	4(%esp),%eax		/ set up return value
	ret				/ return(dba);
	SET_SIZE(memmove)