import less(1)
[unleashed/tickless.git] / usr / src / lib / libc / amd64 / gen / memcmp.s
blob5330e202d4fb5290bc6ca126d8815ef7a06e8516
1 /*
2 * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
3 * Use is subject to license terms.
4 */
6 /*
7 * Copyright (c) 2002 Advanced Micro Devices, Inc.
8 *
9 * All rights reserved.
11 * Redistribution and use in source and binary forms, with or
12 * without modification, are permitted provided that the
13 * following conditions are met:
15 * + Redistributions of source code must retain the above
16 * copyright notice, this list of conditions and the
17 * following disclaimer.
19 * + Redistributions in binary form must reproduce the above
20 * copyright notice, this list of conditions and the
21 * following disclaimer in the documentation and/or other
22 * materials provided with the distribution.
24 * + Neither the name of Advanced Micro Devices, Inc. nor the
25 * names of its contributors may be used to endorse or
26 * promote products derived from this software without
27 * specific prior written permission.
29 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
30 * CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
31 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
32 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
33 * DISCLAIMED. IN NO EVENT SHALL ADVANCED MICRO DEVICES,
34 * INC. OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
35 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
36 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
37 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
38 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
39 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
40 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
41 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
42 * POSSIBILITY OF SUCH DAMAGE.
44 * It is licensee's responsibility to comply with any export
45 * regulations applicable in licensee's jurisdiction.
46 */
48 .file "memcmp.s"
50 #include <sys/asm_linkage.h>
52 ANSI_PRAGMA_WEAK(memcmp,function)
54 #include "SYS.h"
55 #include "cache.h"
57 #define LABEL(s) .memcmp##s
59 ENTRY(memcmp) /* (const void *, const void*, size_t) */
61 LABEL(try1): /* dispatch on length: rdx is used as the remaining byte count throughout */
62 cmp $8, %rdx
63 jae LABEL(1after) /* 8 bytes or more: try the wider loops first */
65 LABEL(1): /* 1-byte */
66 test %rdx, %rdx
67 mov $0, %eax /* mov (unlike xor) leaves the flags set by test intact */
68 jz LABEL(exit) /* nothing left to compare: leave with eax == 0 */
70 LABEL(1loop): /* compare one byte per iteration */
71 movzbl (%rdi), %eax
72 movzbl (%rsi), %ecx
73 sub %ecx, %eax /* eax = byte at (%rdi) minus byte at (%rsi), both zero-extended */
74 jnz LABEL(exit) /* bytes differ: leave with the difference in eax */
76 dec %rdx /* sets ZF for the jnz below */
78 lea 1 (%rdi), %rdi /* lea advances the pointers without touching the flags */
79 lea 1 (%rsi), %rsi
81 jnz LABEL(1loop) /* loop while bytes remain (ZF from dec) */
83 LABEL(exit): /* common return point; eax holds the result */
84 rep /* NOTE(review): presumably the two-byte "rep; ret" form recommended for a ret that is a branch target - confirm */
85 ret
87 .p2align 4
89 LABEL(1after):
91 LABEL(8try): /* fewer than 32 bytes are handled here; longer lengths move on */
92 cmp $32, %rdx
93 jae LABEL(8after)
95 LABEL(8): /* 8-byte */
96 mov %edx, %ecx
97 shr $3, %ecx /* ecx = number of whole 8-byte words */
98 jz LABEL(1) /* fewer than 8 bytes left: byte loop */
100 .p2align 4
102 LABEL(8loop): /* compare one 8-byte word per iteration */
103 mov (%rsi), %rax
104 cmp (%rdi), %rax
105 jne LABEL(1) /* words differ: the byte loop locates the byte and computes the result */
107 sub $8, %rdx
108 dec %ecx /* sets ZF for the jnz below */
110 lea 8 (%rsi), %rsi /* lea keeps the flags from dec */
111 lea 8 (%rdi), %rdi
113 jnz LABEL(8loop)
115 LABEL(8skip):
116 and $7, %edx
117 jnz LABEL(1) /* 1..7 trailing bytes: finish in the byte loop */
119 xor %eax, %eax /* everything matched: result is 0 */
120 ret /* return directly; must not fall through into LABEL(8after) */
122 .p2align 4
124 LABEL(8after):
126 LABEL(32try): /* up to 2048 bytes are handled here; longer lengths move on */
127 cmp $2048, %rdx
128 ja LABEL(32after)
130 LABEL(32): /* 32-byte */
131 mov %edx, %ecx
132 shr $5, %ecx /* ecx = number of whole 32-byte chunks */
133 jz LABEL(8) /* fewer than 32 bytes left: 8-byte loop */
135 .p2align 4
137 LABEL(32loop): /* compare four 8-byte words per iteration */
138 mov (%rsi), %rax
139 mov 8 (%rsi), %r8
140 mov 16 (%rsi), %r9
141 mov 24 (%rsi), %r10
142 sub (%rdi), %rax /* each difference is zero iff the two words are equal */
143 sub 8 (%rdi), %r8
144 sub 16 (%rdi), %r9
145 sub 24 (%rdi), %r10
147 or %rax, %r8 /* fold the four differences into r10 */
148 or %r9, %r10
149 or %r8, %r10
150 jnz LABEL(8) /* some word differs: narrow it down in the 8-byte loop (pointers not yet advanced) */
152 sub $32, %rdx
153 dec %ecx /* sets ZF for the jnz below */
155 lea 32 (%rsi), %rsi /* lea keeps the flags from dec */
156 lea 32 (%rdi), %rdi
158 jnz LABEL(32loop)
160 LABEL(32skip):
161 and $31, %edx
162 jnz LABEL(8) /* 1..31 trailing bytes: finish in the narrower loops */
164 xor %eax, %eax /* everything matched: result is 0 */
165 ret /* return directly; falling into LABEL(32after) with rdx == 0 would make the alignment code wrap the count */
167 .p2align 4
169 LABEL(32after):
171 prefetchnta _sref_(.amd64cache1half) /* 3DNow: use prefetch */
173 LABEL(srctry):
174 mov %esi, %r8d /* align by source */
176 and $7, %r8d /* r8d = rsi modulo 8 */
177 jz LABEL(srcafter) /* not unaligned */
179 LABEL(src): /* align */
180 lea -8 (%r8, %rdx), %rdx /* rdx -= (8 - r8): pre-subtract the bytes the loop below compares */
181 sub $8, %r8d /* r8d = -(number of bytes until rsi is 8-byte aligned) */
184 LABEL(srcloop): /* byte-compare while r8d counts up to zero */
185 movzbl (%rdi), %eax
186 movzbl (%rsi), %ecx
187 sub %ecx, %eax
188 jnz LABEL(exit) /* bytes differ: leave with the difference in eax */
190 inc %r8d /* sets ZF for the jnz below */
192 lea 1 (%rdi), %rdi /* lea keeps the flags from inc */
193 lea 1 (%rsi), %rsi
195 jnz LABEL(srcloop) /* falls through to LABEL(srcafter) once rsi is aligned */
197 .p2align 4
199 LABEL(srcafter):
201 LABEL(64try):
202 mov _sref_(.amd64cache1half), %rcx /* NOTE(review): presumably half the L1 data cache size, from cache.h - confirm */
203 cmp %rdx, %rcx
204 cmova %rdx, %rcx /* rcx = min(.amd64cache1half, rdx), unsigned */
206 LABEL(64): /* 64-byte */
207 shr $6, %rcx /* rcx = number of 64-byte chunks this loop will compare */
208 jz LABEL(32)
210 .p2align 4
212 LABEL(64loop): /* compare 64 bytes per iteration as two 32-byte halves */
213 mov (%rsi), %rax
214 mov 8 (%rsi), %r8
215 sub (%rdi), %rax /* each difference is zero iff the two words are equal */
216 sub 8 (%rdi), %r8
217 or %r8, %rax
219 mov 16 (%rsi), %r9
220 mov 24 (%rsi), %r10
221 sub 16 (%rdi), %r9
222 sub 24 (%rdi), %r10
223 or %r10, %r9
225 or %r9, %rax /* rax != 0 iff any of bytes 0..31 differ */
226 jnz LABEL(32) /* mismatch: rescan from the chunk start in the 32-byte loop */
228 mov 32 (%rsi), %rax
229 mov 40 (%rsi), %r8
230 sub 32 (%rdi), %rax
231 sub 40 (%rdi), %r8
232 or %r8, %rax
234 mov 48 (%rsi), %r9
235 mov 56 (%rsi), %r10
236 sub 48 (%rdi), %r9
237 sub 56 (%rdi), %r10
238 or %r10, %r9
240 or %r9, %rax /* rax != 0 iff any of bytes 32..63 differ */
241 jnz LABEL(32) /* mismatch: pointers are still at the chunk start, so LABEL(32) rescans it */
243 lea 64 (%rsi), %rsi
244 lea 64 (%rdi), %rdi
246 sub $64, %rdx
247 dec %rcx /* sets ZF for the jnz below */
248 jnz LABEL(64loop)
250 LABEL(64skip):
251 cmp $2048, %rdx
252 ja LABEL(64after) /* more than 2048 bytes still left: continue in the prefetching loop */
254 test %edx, %edx
255 jnz LABEL(32) /* some bytes left: finish in the narrower loops */
257 xor %eax, %eax /* everything matched: result is 0 */
258 ret /* return directly; must not fall through into LABEL(64after) */
260 .p2align 4
262 LABEL(64after):
264 LABEL(pretry):
266 LABEL(pre): /* 64-byte prefetching */
267 mov _sref_(.amd64cache2half), %rcx /* NOTE(review): presumably half the L2 cache size, from cache.h - confirm */
268 cmp %rdx, %rcx
269 cmova %rdx, %rcx /* rcx = min(.amd64cache2half, rdx), unsigned */
271 shr $6, %rcx /* rcx = number of 64-byte chunks this section will compare */
272 jz LABEL(preskip)
274 prefetchnta 512 (%rsi) /* 3DNow: use prefetch */
275 prefetchnta 512 (%rdi) /* 3DNow: use prefetch */
277 mov (%rsi), %rax /* first 64-byte chunk is compared here, ahead of the aligned loop */
278 mov 8 (%rsi), %r9
279 mov 16 (%rsi), %r10
280 mov 24 (%rsi), %r11
281 sub (%rdi), %rax /* each difference is zero iff the two words are equal */
282 sub 8 (%rdi), %r9
283 sub 16 (%rdi), %r10
284 sub 24 (%rdi), %r11
286 or %r9, %rax
287 or %r11, %r10
288 or %r10, %rax /* rax != 0 iff any of bytes 0..31 differ */
289 jnz LABEL(32) /* mismatch: rescan from the chunk start in the 32-byte loop */
291 mov 32 (%rsi), %rax
292 mov 40 (%rsi), %r9
293 mov 48 (%rsi), %r10
294 mov 56 (%rsi), %r11
295 sub 32 (%rdi), %rax
296 sub 40 (%rdi), %r9
297 sub 48 (%rdi), %r10
298 sub 56 (%rdi), %r11
300 or %r9, %rax
301 or %r11, %r10
302 or %r10, %rax /* rax != 0 iff any of bytes 32..63 differ */
303 jnz LABEL(32)
305 lea 64 (%rsi), %rsi
306 lea 64 (%rdi), %rdi
308 sub $64, %rdx
309 dec %rcx /* NOTE(review): no zero check before entering preloop; relies on rcx having been at least 2 - confirm */
311 .p2align 4
313 LABEL(preloop): /* compare 64 bytes per iteration, prefetching 512 bytes ahead on both buffers */
314 prefetchnta 512 (%rsi) /* 3DNow: use prefetch */
315 prefetchnta 512 (%rdi) /* 3DNow: use prefetch */
317 mov (%rsi), %rax
318 mov 8 (%rsi), %r9
319 mov 16 (%rsi), %r10
320 mov 24 (%rsi), %r11
321 sub (%rdi), %rax
322 sub 8 (%rdi), %r9
323 sub 16 (%rdi), %r10
324 sub 24 (%rdi), %r11
326 or %r9, %rax
327 or %r11, %r10
328 or %r10, %rax /* rax != 0 iff any of bytes 0..31 differ */
329 jnz LABEL(32)
331 mov 32 (%rsi), %rax
332 mov 40 (%rsi), %r9
333 mov 48 (%rsi), %r10
334 mov 56 (%rsi), %r11
335 sub 32 (%rdi), %rax
336 sub 40 (%rdi), %r9
337 sub 48 (%rdi), %r10
338 sub 56 (%rdi), %r11
340 or %r9, %rax
341 or %r11, %r10
342 or %r10, %rax /* rax != 0 iff any of bytes 32..63 differ */
343 jnz LABEL(32) /* mismatch: pointers are still at the chunk start, so LABEL(32) rescans it */
345 lea 64 (%rsi), %rsi
346 lea 64 (%rdi), %rdi
348 sub $64, %rdx
349 dec %rcx /* sets ZF for the jnz below */
350 jnz LABEL(preloop)
353 LABEL(preskip):
354 cmp $2048, %rdx
355 ja LABEL(preafter) /* more than 2048 bytes still left: continue in the 128-byte loop */
357 test %edx, %edx
358 jnz LABEL(32) /* some bytes left: finish in the narrower loops */
360 xor %eax, %eax /* everything matched: result is 0 */
361 ret /* return directly; must not fall through into LABEL(preafter) */
363 .p2align 4
365 LABEL(preafter):
367 LABEL(128try):
369 LABEL(128): /* 128-byte */
370 mov %rdx, %rcx
371 shr $7, %rcx /* rcx = number of whole 128-byte chunks */
372 jz LABEL(128skip)
374 .p2align 4
376 LABEL(128loop): /* compare 128 bytes per iteration as two 64-byte halves, prefetching ahead */
377 prefetchnta 512 (%rsi) /* 3DNow: use prefetch */
378 prefetchnta 512 (%rdi) /* 3DNow: use prefetch */
380 mov (%rsi), %rax
381 mov 8 (%rsi), %r8
382 sub (%rdi), %rax /* each difference is zero iff the two words are equal */
383 sub 8 (%rdi), %r8
384 mov 16 (%rsi), %r9
385 mov 24 (%rsi), %r10
386 sub 16 (%rdi), %r9
387 sub 24 (%rdi), %r10
389 or %r8, %rax
390 or %r9, %r10
391 or %r10, %rax /* rax accumulates the differences of bytes 0..31 */
393 mov 32 (%rsi), %r8
394 mov 40 (%rsi), %r9
395 sub 32 (%rdi), %r8
396 sub 40 (%rdi), %r9
397 mov 48 (%rsi), %r10
398 mov 56 (%rsi), %r11
399 sub 48 (%rdi), %r10
400 sub 56 (%rdi), %r11
402 or %r9, %r8
403 or %r11, %r10
404 or %r10, %r8
406 or %r8, %rax /* rax != 0 iff any of bytes 0..63 differ */
407 jnz LABEL(32) /* mismatch: rescan from the chunk start in the 32-byte loop */
409 prefetchnta 576 (%rsi) /* 3DNow: use prefetch */
410 prefetchnta 576 (%rdi) /* 3DNow: use prefetch */
412 mov 64 (%rsi), %rax
413 mov 72 (%rsi), %r8
414 sub 64 (%rdi), %rax
415 sub 72 (%rdi), %r8
416 mov 80 (%rsi), %r9
417 mov 88 (%rsi), %r10
418 sub 80 (%rdi), %r9
419 sub 88 (%rdi), %r10
421 or %r8, %rax
422 or %r9, %r10
423 or %r10, %rax /* rax accumulates the differences of bytes 64..95 */
425 mov 96 (%rsi), %r8
426 mov 104 (%rsi), %r9
427 sub 96 (%rdi), %r8
428 sub 104 (%rdi), %r9
429 mov 112 (%rsi), %r10
430 mov 120 (%rsi), %r11
431 sub 112 (%rdi), %r10
432 sub 120 (%rdi), %r11
434 or %r9, %r8
435 or %r11, %r10
436 or %r10, %r8
438 or %r8, %rax /* rax != 0 iff any of bytes 64..127 differ */
439 jnz LABEL(32) /* mismatch: pointers are still at the chunk start, so LABEL(32) rescans it */
441 sub $128, %rdx
442 dec %rcx /* sets ZF for the jnz below */
444 lea 128 (%rsi), %rsi /* lea keeps the flags from dec */
445 lea 128 (%rdi), %rdi
447 jnz LABEL(128loop)
449 LABEL(128skip):
450 and $127, %edx
451 jnz LABEL(32) /* 1..127 trailing bytes: finish in the narrower loops */
453 xor %eax, %eax /* everything matched: result is 0 */
454 ret /* return directly; without it execution runs off the end of the function */
456 SET_SIZE(memcmp)