import less(1)
[unleashed/tickless.git] / usr / src / lib / libc / capabilities / i386 / common / memset.s
blobceaf437c086693d332f573636551cae3161593fc
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * memset: fill a region of memory with one byte value.
 *
 * Arguments are read from the stack (offsets are relative to %esp after
 * %edi has been pushed):
 *	 8(%esp)	address of the region to fill
 *	12(%esp)	fill value; only the low byte is used
 *	16(%esp)	number of bytes to fill
 *
 * Returns the original region address in %eax.
 *
 * The store strategy is chosen from the byte count n:
 *	n <= 20		byte stores only (.byteset)
 *	20 < n <= 256	4-byte stores followed by a byte tail (.wordset)
 *	256 < n <= 511	byte stores up to an 8-byte boundary, then .wordset
 *	n > 511		byte/word stores up to a 64-byte boundary, then
 *			64-byte blocks through %xmm0; the count left after
 *			alignment selects non-temporal stores (movntps plus
 *			a fence) above 262143 bytes and movaps otherwise.
 *			The final (count & 63) bytes go through .wordset or
 *			.byteset.
 *
 * Registers: %edi, %ebx and %esi are saved and restored; %eax, %ecx,
 * %edx and %xmm0 are overwritten.
 */

	.file	"memset.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memset,function)

	ENTRY(memset)
	pushl	%edi			/ save register variable
	movl	8(%esp),%edi		/ %edi = string address
	movl	12(%esp),%eax		/ %al = byte to duplicate
	movl	16(%esp),%ecx		/ %ecx = number of copies

	/ For all basic blocks in this routine, maintain the following
	/ entry conditions:	%eax each byte is set to desired byte.
	/			NOTE: .byteset doesn't require this
	/			%ecx contains # bytes to set
	/			%edi contains address to set

	cld				/ make sure we go the right way...
	cmpl	$20,%ecx		/ strings with fewer than 20 chars should be byte set
	jbe	.byteset

	andl	$0xff, %eax		/ trim anything above low byte
	imul	$0x01010101, %eax	/ extend low byte to each byte

	cmpl	$256, %ecx		/ smaller areas don't benefit from alignment
	jbe	.wordset

	cmpl	$511, %ecx		/ areas smaller than this should be wordset
	jbe	.check_wordset

	/
	/ prep work for sse temporal and non-temporal
	/

	pushl	%ebx			/ more registers are needed
	pushl	%esi			/ for alignment work

	/
	/ align address to 64 byte boundaries.
	/

	movl	%ecx, %ebx		/ save byte count
	movl	%edi, %esi		/ esi is scratch register
	andl	$63, %esi		/ bytes to align to 64 byte align addr
	neg	%esi			/ compute count of bytes
	addl	$64, %esi		/ needed to align
	andl	$63, %esi		/ to 64 byte align addr
	jz	.sse_aligned		/ skip alignment if not needed
	subl	%esi, %ebx		/ ebx contains remainder of bytes to set
	movl	%esi, %ecx		/ alignment bytes
	shrl	$2,%ecx			/ %ecx = number of words to set
	rep; sstol
	movl	%esi,%ecx
	andl	$3,%ecx			/ %ecx = number of bytes left
	rep; sstob
	movl	%ebx, %ecx		/ remainder to be set

.sse_aligned:

	shr	$6, %ecx		/ number of 64 byte blocks to set

	/
	/ load xmm0 with bytes to be set
	/
	subl	$16,%esp		/ give ourselves some working room on the stack
	movl	%eax,(%esp)		/ copy eax into each of 4 bytes
	movl	%eax,4(%esp)		/ avoid pushl since it causes more interlocking
	movl	%eax,8(%esp)		/
	movl	%eax,12(%esp)		/
	movups	(%esp), %xmm0		/ unaligned load from stack into xmm0
	addl	$16,%esp		/ restore stack position

	cmpl	$262143, %ebx		/ blocks smaller than this allocate in the cache
	jbe	.sse_loop
	jmp	.sse_nt_loop		/ branch across alignment nops

	.align 16

.sse_nt_loop:
	movntps	%xmm0, (%edi)		/ block non-temporal store
	movntps	%xmm0, 16(%edi)		/ use sse rather than sse2
	movntps	%xmm0, 32(%edi)		/ so we work more places
	movntps	%xmm0, 48(%edi)		/

	addl	$64, %edi		/ increment dest address
	dec	%ecx			/ dec count of blocks
	jnz	.sse_nt_loop		/ jump if not done

	andl	$63, %ebx		/ remainder of bytes to copy
	movl	%ebx, %ecx		/ ecx contains remainder of bytes to set
	popl	%esi			/ restore stack config
	popl	%ebx			/
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	cmpl	$20, %ecx		/ compare and jump accordingly
	jbe	.byteset
	jmp	.wordset

	.align 16
.sse_loop:
	movaps	%xmm0, (%edi)		/ block copy w/ SSE
	movaps	%xmm0, 16(%edi)
	movaps	%xmm0, 32(%edi)
	movaps	%xmm0, 48(%edi)

	addl	$64, %edi		/ increment addr
	dec	%ecx			/ dec count of blocks
	jnz	.sse_loop		/ jump if not done

	andl	$63, %ebx		/ remainder of bytes to copy
	movl	%ebx, %ecx		/ in %ecx as normal
	popl	%esi			/ restore stack config
	popl	%ebx			/
	cmpl	$20, %ecx
	jbe	.byteset
	jmp	.wordset

.check_wordset:
	movl	%edi, %edx		/ save current store ptr
	andl	$7, %edi		/ check alignment
	movl	%edx,%edi		/ %edi = string address (movl leaves flags intact)
	jz	.wordset		/ all ok


.align_wordset:
	pushl	%ebx			/ more registers are needed
	pushl	%esi

	movl	%ecx, %ebx		/ save byte count
	movl	%edi, %esi
	andl	$7, %esi		/ offset within the 8 byte unit
	neg	%esi
	addl	$8, %esi
	andl	$7, %esi		/ esi = bytes needed to reach 8 byte alignment
	subl	%esi, %ebx		/ ebx contains remainder of bytes to copy
	movl	%esi, %ecx
	rep; sstob
	movl	%ebx, %ecx
	popl	%esi			/ restore stack config
	popl	%ebx			/

.wordset:
	movl	%ecx, %edx		/ save count
	shrl	$2,%ecx			/ %ecx = number of words to set
	rep; sstol
	movl	%edx,%ecx
	andl	$3,%ecx			/ %ecx = number of bytes left

.byteset:
	rep; sstob
	movl	8(%esp),%eax		/ return string address
	popl	%edi			/ restore register variable
	ret				/ return to caller; without this we fall off the end
	SET_SIZE(memset)