import less(1)
[unleashed/tickless.git] / usr / src / lib / libc / amd64 / gen / strcmp.s
blob5c0cfbdedfeccfc0d4fb31305a1b4c19dd8b3ec5
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 2009, Intel Corporation
24 * All rights reserved.
28 * str[n]cmp - compare chars between two strings
31 #include "SYS.h"
32 #include "proc64_id.h"
/* LABEL(s) builds a label name by pasting s onto the ".strcmp" prefix. */
34 #define LABEL(s) .strcmp##s
/*
 * UPDATE_STRNCMP_COUNTER (strncmp builds only): recompute the remaining
 * byte count held in %r11. It forms %r9 = %r11 + %rcx - 16, branches to
 * strcmp_exitz when that result is above the old %r11 (unsigned compare)
 * or is zero, and otherwise stores it back into %r11. %r9 is overwritten.
 * For plain strcmp builds the macro expands to nothing.
 */
36 #ifdef USE_AS_STRNCMP
38 * Since the counter, %r11, is unsigned, we branch to strcmp_exitz
39 * if the new counter > the old one or is 0.
41 #define UPDATE_STRNCMP_COUNTER \
42 /* calculate number of bytes left to compare */ \
43 lea -16(%rcx, %r11), %r9; \
44 cmp %r9, %r11; \
45 jb LABEL(strcmp_exitz); \
46 test %r9, %r9; \
47 je LABEL(strcmp_exitz); \
48 mov %r9, %r11
49 #else
50 #define UPDATE_STRNCMP_COUNTER
51 #endif
54 * This implementation uses SSE to compare up to 16 bytes at a time.
/*
 * Entry point: assembled as strncmp when USE_AS_STRNCMP is defined (a zero
 * count in %rdx exits through strcmp_exitz; otherwise the count is kept in
 * %r11), and as strcmp otherwise. When both pointers sit at offset 0x30 or
 * less within their 64-byte line, the first 16 bytes are compared directly;
 * any other case goes to crosscache. If those 16 bytes match with no null,
 * both pointers advance by 16 and execution continues into crosscache.
 */
56 #ifdef USE_AS_STRNCMP
57 ENTRY(strncmp)
58 test %rdx, %rdx /* zero-length compare? */
59 je LABEL(strcmp_exitz)
60 mov %rdx, %r11 /* %r11 = byte counter used by the strncmp paths */
61 #else
62 ENTRY(strcmp) /* (const char *, const char *) */
63 #endif
64 mov %esi, %ecx
65 mov %edi, %eax
66 and $0x3f, %rcx /* rsi alignment in cache line */
67 and $0x3f, %rax /* rdi alignment in cache line */
68 cmp $0x30, %ecx
69 ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
70 cmp $0x30, %eax
71 ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */
72 movlpd (%rdi), %xmm1 /* low 8 bytes at %rdi */
73 movlpd (%rsi), %xmm2 /* low 8 bytes at %rsi */
74 movhpd 8(%rdi), %xmm1 /* high 8 bytes at %rdi */
75 movhpd 8(%rsi), %xmm2 /* high 8 bytes at %rsi */
76 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
77 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
78 pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
79 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
80 pmovmskb %xmm1, %edx
81 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
82 jnz LABEL(less16bytes) /* If not, found mismatch or null char */
83 #ifdef USE_AS_STRNCMP
84 sub $16, %r11
85 jbe LABEL(strcmp_exitz) /* finish comparison */
86 #endif
87 add $16, %rsi /* prepare to search next 16 bytes */
88 add $16, %rdi /* prepare to search next 16 bytes */
91 * Determine rdi and rsi string offsets from 16-byte alignment.
92 * Use relative offset difference between the two to determine which case
93 * below to use.
/*
 * crosscache: round both pointers down to a 16-byte boundary and dispatch on
 * their offsets within 16 bytes (%ecx for rsi, %eax for rdi). Equal offsets
 * go to ashr_0. Otherwise, if the rsi offset is the smaller one, the offsets
 * and the pointers are swapped and %r8d is set to 0xffff to record it. The
 * offset difference then indexes unaligned_table, whose 32-bit signed entries
 * are added to the table address to form the jump target.
 */
95 .p2align 4
96 LABEL(crosscache):
97 and $0xfffffffffffffff0, %rsi /* force %rsi to be 16 byte aligned */
98 and $0xfffffffffffffff0, %rdi /* force %rdi to be 16 byte aligned */
99 mov $0xffff, %edx /* for equivalent offset */
100 xor %r8d, %r8d
101 and $0xf, %ecx /* offset of rsi */
102 and $0xf, %eax /* offset of rdi */
103 cmp %eax, %ecx
104 je LABEL(ashr_0) /* both strings have the same alignment */
105 ja LABEL(bigger)
106 mov %edx, %r8d /* r8d is offset flag for exit tail */
107 xchg %ecx, %eax
108 xchg %rsi, %rdi
109 LABEL(bigger):
110 mov %rcx, %r9
111 sub %rax, %r9 /* %r9 = larger offset - smaller offset */
112 lea LABEL(unaligned_table)(%rip), %r10
113 movslq (%r10, %r9, 4), %r9 /* sign-extended 32-bit table entry */
114 lea (%r10, %r9), %r10 /* entry is relative to the table address */
115 jmp *%r10 /* jump to corresponding case */
118 * ashr_0 handles the following cases:
119 * str1 offset = str2 offset
/*
 * ashr_0: both strings share the same offset within a 16-byte chunk, so the
 * aligned chunks line up byte for byte. The first chunk's result mask and the
 * expected mask in %edx are both shifted right by %cl to drop the bytes that
 * precede the string start. After that, loop_ashr_0 compares aligned chunks
 * with %rcx as the running index, two chunks per pass.
 */
121 .p2align 4
122 LABEL(ashr_0):
123 movdqa (%rsi), %xmm1
124 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
125 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
126 pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
127 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
128 pmovmskb %xmm1, %r9d
129 shr %cl, %edx /* adjust 0xffff for offset */
130 shr %cl, %r9d /* adjust for 16-byte offset */
131 sub %r9d, %edx
133 * edx must be the same with r9d if in left byte (16-rcx) is equal to
134 * the start from (16-rax) and no null char was seen.
136 jne LABEL(less32bytes) /* mismatch or null char */
137 UPDATE_STRNCMP_COUNTER
138 mov $16, %rcx /* index of the next chunk to load */
139 mov $16, %r9
140 pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
143 * Now both strings are aligned at 16-byte boundary. Loop over strings
144 * checking 32-bytes per iteration.
146 .p2align 4
147 LABEL(loop_ashr_0):
148 movdqa (%rsi, %rcx), %xmm1
149 movdqa (%rdi, %rcx), %xmm2
151 pcmpeqb %xmm1, %xmm0 /* null bytes in the rsi chunk */
152 pcmpeqb %xmm2, %xmm1 /* byte-wise equality */
153 psubb %xmm0, %xmm1
154 pmovmskb %xmm1, %edx
155 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
156 jnz LABEL(exit) /* mismatch or null char seen */
158 #ifdef USE_AS_STRNCMP
159 sub $16, %r11
160 jbe LABEL(strcmp_exitz)
161 #endif
162 add $16, %rcx
/* second chunk of this pass; same sequence as above */
163 movdqa (%rsi, %rcx), %xmm1
164 movdqa (%rdi, %rcx), %xmm2
166 pcmpeqb %xmm1, %xmm0
167 pcmpeqb %xmm2, %xmm1
168 psubb %xmm0, %xmm1
169 pmovmskb %xmm1, %edx
170 sub $0xffff, %edx
171 jnz LABEL(exit)
172 #ifdef USE_AS_STRNCMP
173 sub $16, %r11
174 jbe LABEL(strcmp_exitz)
175 #endif
176 add $16, %rcx
177 jmp LABEL(loop_ashr_0)
180 * ashr_1 handles the following cases:
181 * abs(str1 offset - str2 offset) = 15
/*
 * ashr_1: the first %rdi chunk is shifted left 15 bytes to line up with the
 * %rsi chunk, and the low %cl bits of both masks are dropped before testing.
 * In the loop, each comparison value is built from the previous %rdi chunk
 * (%xmm3) shifted right 1 byte merged with the next %rdi chunk shifted left
 * 15 bytes; %xmm4 carries the newly loaded chunk into the next pass.
 * nibble_ashr_1 runs when %r10 turns positive: it compares only the 15
 * bytes left in %xmm4 (expected mask 0x7fff) before gobble_ashr_1 issues the
 * next %rdi load.
 */
183 .p2align 4
184 LABEL(ashr_1):
185 pxor %xmm0, %xmm0
186 movdqa (%rdi), %xmm2
187 movdqa (%rsi), %xmm1
188 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
189 pslldq $15, %xmm2 /* shift first string to align with second */
190 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
191 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
192 pmovmskb %xmm2, %r9d
193 shr %cl, %edx /* adjust 0xffff for offset */
194 shr %cl, %r9d /* adjust for 16-byte offset */
195 sub %r9d, %edx
196 jnz LABEL(less32bytes) /* mismatch or null char seen */
197 movdqa (%rdi), %xmm3 /* keep first rdi chunk for the merge below */
198 UPDATE_STRNCMP_COUNTER
200 pxor %xmm0, %xmm0
201 mov $16, %rcx /* index for loads */
202 mov $1, %r9d /* rdi bytes already examined. Used in exit code */
204 * Setup %r10 value allows us to detect crossing a page boundary.
205 * When %r10 goes positive we are crossing a page boundary and
206 * need to do a nibble.
208 lea 1(%rdi), %r10
209 and $0xfff, %r10 /* offset into 4K page */
210 sub $0x1000, %r10 /* subtract 4K pagesize */
211 movdqa %xmm3, %xmm4 /* nibble path reads %xmm4 */
213 .p2align 4
214 LABEL(loop_ashr_1):
215 add $16, %r10
216 jg LABEL(nibble_ashr_1) /* cross page boundary */
218 LABEL(gobble_ashr_1):
219 movdqa (%rsi, %rcx), %xmm1
220 movdqa (%rdi, %rcx), %xmm2
221 movdqa %xmm2, %xmm4 /* store for next cycle */
223 psrldq $1, %xmm3 /* remaining bytes of previous rdi chunk */
224 pslldq $15, %xmm2 /* leading bytes of new rdi chunk */
225 por %xmm3, %xmm2 /* merge into one 16byte value */
227 pcmpeqb %xmm1, %xmm0
228 pcmpeqb %xmm2, %xmm1
229 psubb %xmm0, %xmm1
230 pmovmskb %xmm1, %edx
231 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
232 jnz LABEL(exit)
234 #ifdef USE_AS_STRNCMP
235 sub $16, %r11
236 jbe LABEL(strcmp_exitz)
237 #endif
238 add $16, %rcx
239 movdqa %xmm4, %xmm3
241 add $16, %r10
242 jg LABEL(nibble_ashr_1) /* cross page boundary */
244 movdqa (%rsi, %rcx), %xmm1
245 movdqa (%rdi, %rcx), %xmm2
246 movdqa %xmm2, %xmm4 /* store for next cycle */
248 psrldq $1, %xmm3
249 pslldq $15, %xmm2
250 por %xmm3, %xmm2 /* merge into one 16byte value */
252 pcmpeqb %xmm1, %xmm0
253 pcmpeqb %xmm2, %xmm1
254 psubb %xmm0, %xmm1
255 pmovmskb %xmm1, %edx
256 sub $0xffff, %edx
257 jnz LABEL(exit)
259 #ifdef USE_AS_STRNCMP
260 sub $16, %r11
261 jbe LABEL(strcmp_exitz)
262 #endif
263 add $16, %rcx
264 movdqa %xmm4, %xmm3
265 jmp LABEL(loop_ashr_1)
268 * Nibble avoids loads across page boundary. This is to avoid a potential
269 * access into unmapped memory.
271 .p2align 4
272 LABEL(nibble_ashr_1):
273 psrldq $1, %xmm4 /* 15 bytes of the last loaded rdi chunk */
274 movdqa (%rsi, %rcx), %xmm1
275 pcmpeqb %xmm1, %xmm0
276 pcmpeqb %xmm4, %xmm1
277 psubb %xmm0, %xmm1
278 pmovmskb %xmm1, %edx
279 sub $0x7fff, %edx /* zero iff low 15 bytes equal and non-null */
280 jnz LABEL(exit)
281 #ifdef USE_AS_STRNCMP
282 cmp $15, %r11 /* count used up within these 15 bytes? */
283 jbe LABEL(strcmp_exitz)
284 #endif
285 pxor %xmm0, %xmm0
286 sub $0x1000, %r10 /* subtract 4K from %r10 */
287 jmp LABEL(gobble_ashr_1)
290 * ashr_2 handles the following cases:
291 * abs(str1 offset - str2 offset) = 14
/*
 * ashr_2: the first %rdi chunk is shifted left 14 bytes to line up with the
 * %rsi chunk, and the low %cl bits of both masks are dropped before testing.
 * In the loop, each comparison value is built from the previous %rdi chunk
 * (%xmm3) shifted right 2 bytes merged with the next %rdi chunk shifted left
 * 14 bytes; %xmm4 carries the newly loaded chunk into the next pass.
 * nibble_ashr_2 runs when %r10 turns positive: it compares only the 14
 * bytes left in %xmm4 (expected mask 0x3fff) before gobble_ashr_2 issues the
 * next %rdi load.
 */
293 .p2align 4
294 LABEL(ashr_2):
295 pxor %xmm0, %xmm0
296 movdqa (%rdi), %xmm2
297 movdqa (%rsi), %xmm1
298 pcmpeqb %xmm1, %xmm0 /* null bytes in the rsi chunk */
299 pslldq $14, %xmm2 /* line rdi chunk up with rsi chunk */
300 pcmpeqb %xmm1, %xmm2
301 psubb %xmm0, %xmm2
302 pmovmskb %xmm2, %r9d
303 shr %cl, %edx /* drop mask bits below the string start */
304 shr %cl, %r9d
305 sub %r9d, %edx
306 jnz LABEL(less32bytes)
307 movdqa (%rdi), %xmm3 /* keep first rdi chunk for the merge below */
308 UPDATE_STRNCMP_COUNTER
310 pxor %xmm0, %xmm0
311 mov $16, %rcx /* index for loads */
312 mov $2, %r9d /* rdi bytes already examined. Used in exit code */
314 * Setup %r10 value allows us to detect crossing a page boundary.
315 * When %r10 goes positive we are crossing a page boundary and
316 * need to do a nibble.
318 lea 2(%rdi), %r10
319 and $0xfff, %r10 /* offset into 4K page */
320 sub $0x1000, %r10 /* subtract 4K pagesize */
321 movdqa %xmm3, %xmm4 /* nibble path reads %xmm4 */
323 .p2align 4
324 LABEL(loop_ashr_2):
325 add $16, %r10
326 jg LABEL(nibble_ashr_2)
328 LABEL(gobble_ashr_2):
329 movdqa (%rsi, %rcx), %xmm1
330 movdqa (%rdi, %rcx), %xmm2
331 movdqa %xmm2, %xmm4 /* saved for the next pass */
333 psrldq $2, %xmm3 /* remaining bytes of previous rdi chunk */
334 pslldq $14, %xmm2 /* leading bytes of new rdi chunk */
335 por %xmm3, %xmm2
337 pcmpeqb %xmm1, %xmm0
338 pcmpeqb %xmm2, %xmm1
339 psubb %xmm0, %xmm1
340 pmovmskb %xmm1, %edx
341 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
342 jnz LABEL(exit)
344 #ifdef USE_AS_STRNCMP
345 sub $16, %r11
346 jbe LABEL(strcmp_exitz)
347 #endif
349 add $16, %rcx
350 movdqa %xmm4, %xmm3
352 add $16, %r10
353 jg LABEL(nibble_ashr_2) /* cross page boundary */
355 movdqa (%rsi, %rcx), %xmm1
356 movdqa (%rdi, %rcx), %xmm2
357 movdqa %xmm2, %xmm4
359 psrldq $2, %xmm3
360 pslldq $14, %xmm2
361 por %xmm3, %xmm2
363 pcmpeqb %xmm1, %xmm0
364 pcmpeqb %xmm2, %xmm1
365 psubb %xmm0, %xmm1
366 pmovmskb %xmm1, %edx
367 sub $0xffff, %edx
368 jnz LABEL(exit)
370 #ifdef USE_AS_STRNCMP
371 sub $16, %r11
372 jbe LABEL(strcmp_exitz)
373 #endif
375 add $16, %rcx
376 movdqa %xmm4, %xmm3
377 jmp LABEL(loop_ashr_2)
379 .p2align 4
380 LABEL(nibble_ashr_2):
381 psrldq $2, %xmm4 /* 14 bytes of the last loaded rdi chunk */
382 movdqa (%rsi, %rcx), %xmm1
383 pcmpeqb %xmm1, %xmm0
384 pcmpeqb %xmm4, %xmm1
385 psubb %xmm0, %xmm1
386 pmovmskb %xmm1, %edx
387 sub $0x3fff, %edx /* zero iff low 14 bytes equal and non-null */
388 jnz LABEL(exit)
389 #ifdef USE_AS_STRNCMP
390 cmp $14, %r11 /* count used up within these 14 bytes? */
391 jbe LABEL(strcmp_exitz)
392 #endif
393 pxor %xmm0, %xmm0
394 sub $0x1000, %r10 /* subtract 4K from %r10 */
395 jmp LABEL(gobble_ashr_2)
398 * ashr_3 handles the following cases:
399 * abs(str1 offset - str2 offset) = 13
/*
 * ashr_3: the first %rdi chunk is shifted left 13 bytes to line up with the
 * %rsi chunk, and the low %cl bits of both masks are dropped before testing.
 * In the loop, each comparison value is built from the previous %rdi chunk
 * (%xmm3) shifted right 3 bytes merged with the next %rdi chunk shifted left
 * 13 bytes; %xmm4 carries the newly loaded chunk into the next pass.
 * nibble_ashr_3 runs when %r10 turns positive: it compares only the 13
 * bytes left in %xmm4 (expected mask 0x1fff) before gobble_ashr_3 issues the
 * next %rdi load.
 */
401 .p2align 4
402 LABEL(ashr_3):
403 pxor %xmm0, %xmm0
404 movdqa (%rdi), %xmm2
405 movdqa (%rsi), %xmm1
406 pcmpeqb %xmm1, %xmm0 /* null bytes in the rsi chunk */
407 pslldq $13, %xmm2 /* line rdi chunk up with rsi chunk */
408 pcmpeqb %xmm1, %xmm2
409 psubb %xmm0, %xmm2
410 pmovmskb %xmm2, %r9d
411 shr %cl, %edx /* drop mask bits below the string start */
412 shr %cl, %r9d
413 sub %r9d, %edx
414 jnz LABEL(less32bytes)
415 movdqa (%rdi), %xmm3 /* keep first rdi chunk for the merge below */
417 UPDATE_STRNCMP_COUNTER
419 pxor %xmm0, %xmm0
420 mov $16, %rcx /* index for loads */
421 mov $3, %r9d /* rdi bytes already examined. Used in exit code */
423 * Setup %r10 value allows us to detect crossing a page boundary.
424 * When %r10 goes positive we are crossing a page boundary and
425 * need to do a nibble.
427 lea 3(%rdi), %r10
428 and $0xfff, %r10 /* offset into 4K page */
429 sub $0x1000, %r10 /* subtract 4K pagesize */
430 movdqa %xmm3, %xmm4 /* nibble path reads %xmm4 */
432 .p2align 4
433 LABEL(loop_ashr_3):
434 add $16, %r10
435 jg LABEL(nibble_ashr_3)
437 LABEL(gobble_ashr_3):
438 movdqa (%rsi, %rcx), %xmm1
439 movdqa (%rdi, %rcx), %xmm2
440 movdqa %xmm2, %xmm4 /* saved for the next pass */
442 psrldq $3, %xmm3 /* remaining bytes of previous rdi chunk */
443 pslldq $13, %xmm2 /* leading bytes of new rdi chunk */
444 por %xmm3, %xmm2
446 pcmpeqb %xmm1, %xmm0
447 pcmpeqb %xmm2, %xmm1
448 psubb %xmm0, %xmm1
449 pmovmskb %xmm1, %edx
450 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
451 jnz LABEL(exit)
453 #ifdef USE_AS_STRNCMP
454 sub $16, %r11
455 jbe LABEL(strcmp_exitz)
456 #endif
458 add $16, %rcx
459 movdqa %xmm4, %xmm3
461 add $16, %r10
462 jg LABEL(nibble_ashr_3) /* cross page boundary */
464 movdqa (%rsi, %rcx), %xmm1
465 movdqa (%rdi, %rcx), %xmm2
466 movdqa %xmm2, %xmm4
468 psrldq $3, %xmm3
469 pslldq $13, %xmm2
470 por %xmm3, %xmm2
472 pcmpeqb %xmm1, %xmm0
473 pcmpeqb %xmm2, %xmm1
474 psubb %xmm0, %xmm1
475 pmovmskb %xmm1, %edx
476 sub $0xffff, %edx
477 jnz LABEL(exit)
479 #ifdef USE_AS_STRNCMP
480 sub $16, %r11
481 jbe LABEL(strcmp_exitz)
482 #endif
484 add $16, %rcx
485 movdqa %xmm4, %xmm3
486 jmp LABEL(loop_ashr_3)
488 .p2align 4
489 LABEL(nibble_ashr_3):
490 psrldq $3, %xmm4 /* 13 bytes of the last loaded rdi chunk */
491 movdqa (%rsi, %rcx), %xmm1
492 pcmpeqb %xmm1, %xmm0
493 pcmpeqb %xmm4, %xmm1
494 psubb %xmm0, %xmm1
495 pmovmskb %xmm1, %edx
496 sub $0x1fff, %edx /* zero iff low 13 bytes equal and non-null */
497 jnz LABEL(exit)
498 #ifdef USE_AS_STRNCMP
499 cmp $13, %r11 /* count used up within these 13 bytes? */
500 jbe LABEL(strcmp_exitz)
501 #endif
502 pxor %xmm0, %xmm0
503 sub $0x1000, %r10 /* subtract 4K from %r10 */
504 jmp LABEL(gobble_ashr_3)
507 * ashr_4 handles the following cases:
508 * abs(str1 offset - str2 offset) = 12
/*
 * ashr_4: the first %rdi chunk is shifted left 12 bytes to line up with the
 * %rsi chunk, and the low %cl bits of both masks are dropped before testing.
 * In the loop, each comparison value is built from the previous %rdi chunk
 * (%xmm3) shifted right 4 bytes merged with the next %rdi chunk shifted left
 * 12 bytes; %xmm4 carries the newly loaded chunk into the next pass.
 * nibble_ashr_4 runs when %r10 turns positive: it compares only the 12
 * bytes left in %xmm4 (expected mask 0x0fff) before gobble_ashr_4 issues the
 * next %rdi load.
 */
510 .p2align 4
511 LABEL(ashr_4):
512 pxor %xmm0, %xmm0
513 movdqa (%rdi), %xmm2
514 movdqa (%rsi), %xmm1
515 pcmpeqb %xmm1, %xmm0 /* null bytes in the rsi chunk */
516 pslldq $12, %xmm2 /* line rdi chunk up with rsi chunk */
517 pcmpeqb %xmm1, %xmm2
518 psubb %xmm0, %xmm2
519 pmovmskb %xmm2, %r9d
520 shr %cl, %edx /* drop mask bits below the string start */
521 shr %cl, %r9d
522 sub %r9d, %edx
523 jnz LABEL(less32bytes)
524 movdqa (%rdi), %xmm3 /* keep first rdi chunk for the merge below */
526 UPDATE_STRNCMP_COUNTER
528 pxor %xmm0, %xmm0
529 mov $16, %rcx /* index for loads */
530 mov $4, %r9d /* rdi bytes already examined. Used in exit code */
532 * Setup %r10 value allows us to detect crossing a page boundary.
533 * When %r10 goes positive we are crossing a page boundary and
534 * need to do a nibble.
536 lea 4(%rdi), %r10
537 and $0xfff, %r10 /* offset into 4K page */
538 sub $0x1000, %r10 /* subtract 4K pagesize */
539 movdqa %xmm3, %xmm4 /* nibble path reads %xmm4 */
541 .p2align 4
542 LABEL(loop_ashr_4):
543 add $16, %r10
544 jg LABEL(nibble_ashr_4)
546 LABEL(gobble_ashr_4):
547 movdqa (%rsi, %rcx), %xmm1
548 movdqa (%rdi, %rcx), %xmm2
549 movdqa %xmm2, %xmm4 /* saved for the next pass */
551 psrldq $4, %xmm3 /* remaining bytes of previous rdi chunk */
552 pslldq $12, %xmm2 /* leading bytes of new rdi chunk */
553 por %xmm3, %xmm2
555 pcmpeqb %xmm1, %xmm0
556 pcmpeqb %xmm2, %xmm1
557 psubb %xmm0, %xmm1
558 pmovmskb %xmm1, %edx
559 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
560 jnz LABEL(exit)
562 #ifdef USE_AS_STRNCMP
563 sub $16, %r11
564 jbe LABEL(strcmp_exitz)
565 #endif
567 add $16, %rcx
568 movdqa %xmm4, %xmm3
570 add $16, %r10
571 jg LABEL(nibble_ashr_4) /* cross page boundary */
573 movdqa (%rsi, %rcx), %xmm1
574 movdqa (%rdi, %rcx), %xmm2
575 movdqa %xmm2, %xmm4
577 psrldq $4, %xmm3
578 pslldq $12, %xmm2
579 por %xmm3, %xmm2
581 pcmpeqb %xmm1, %xmm0
582 pcmpeqb %xmm2, %xmm1
583 psubb %xmm0, %xmm1
584 pmovmskb %xmm1, %edx
585 sub $0xffff, %edx
586 jnz LABEL(exit)
588 #ifdef USE_AS_STRNCMP
589 sub $16, %r11
590 jbe LABEL(strcmp_exitz)
591 #endif
593 add $16, %rcx
594 movdqa %xmm4, %xmm3
595 jmp LABEL(loop_ashr_4)
597 .p2align 4
598 LABEL(nibble_ashr_4):
599 psrldq $4, %xmm4 /* 12 bytes of the last loaded rdi chunk */
600 movdqa (%rsi, %rcx), %xmm1
601 pcmpeqb %xmm1, %xmm0
602 pcmpeqb %xmm4, %xmm1
603 psubb %xmm0, %xmm1
604 pmovmskb %xmm1, %edx
605 sub $0x0fff, %edx /* zero iff low 12 bytes equal and non-null */
606 jnz LABEL(exit)
607 #ifdef USE_AS_STRNCMP
608 cmp $12, %r11 /* count used up within these 12 bytes? */
609 jbe LABEL(strcmp_exitz)
610 #endif
611 pxor %xmm0, %xmm0
612 sub $0x1000, %r10 /* subtract 4K from %r10 */
613 jmp LABEL(gobble_ashr_4)
616 * ashr_5 handles the following cases:
617 * abs(str1 offset - str2 offset) = 11
/*
 * ashr_5: the first %rdi chunk is shifted left 11 bytes to line up with the
 * %rsi chunk, and the low %cl bits of both masks are dropped before testing.
 * In the loop, each comparison value is built from the previous %rdi chunk
 * (%xmm3) shifted right 5 bytes merged with the next %rdi chunk shifted left
 * 11 bytes; %xmm4 carries the newly loaded chunk into the next pass.
 * nibble_ashr_5 runs when %r10 turns positive: it compares only the 11
 * bytes left in %xmm4 (expected mask 0x07ff) before gobble_ashr_5 issues the
 * next %rdi load.
 */
619 .p2align 4
620 LABEL(ashr_5):
621 pxor %xmm0, %xmm0
622 movdqa (%rdi), %xmm2
623 movdqa (%rsi), %xmm1
624 pcmpeqb %xmm1, %xmm0 /* null bytes in the rsi chunk */
625 pslldq $11, %xmm2 /* line rdi chunk up with rsi chunk */
626 pcmpeqb %xmm1, %xmm2
627 psubb %xmm0, %xmm2
628 pmovmskb %xmm2, %r9d
629 shr %cl, %edx /* drop mask bits below the string start */
630 shr %cl, %r9d
631 sub %r9d, %edx
632 jnz LABEL(less32bytes)
633 movdqa (%rdi), %xmm3 /* keep first rdi chunk for the merge below */
635 UPDATE_STRNCMP_COUNTER
637 pxor %xmm0, %xmm0
638 mov $16, %rcx /* index for loads */
639 mov $5, %r9d /* rdi bytes already examined. Used in exit code */
641 * Setup %r10 value allows us to detect crossing a page boundary.
642 * When %r10 goes positive we are crossing a page boundary and
643 * need to do a nibble.
645 lea 5(%rdi), %r10
646 and $0xfff, %r10 /* offset into 4K page */
647 sub $0x1000, %r10 /* subtract 4K pagesize */
648 movdqa %xmm3, %xmm4 /* nibble path reads %xmm4 */
650 .p2align 4
651 LABEL(loop_ashr_5):
652 add $16, %r10
653 jg LABEL(nibble_ashr_5)
655 LABEL(gobble_ashr_5):
656 movdqa (%rsi, %rcx), %xmm1
657 movdqa (%rdi, %rcx), %xmm2
658 movdqa %xmm2, %xmm4 /* saved for the next pass */
660 psrldq $5, %xmm3 /* remaining bytes of previous rdi chunk */
661 pslldq $11, %xmm2 /* leading bytes of new rdi chunk */
662 por %xmm3, %xmm2
664 pcmpeqb %xmm1, %xmm0
665 pcmpeqb %xmm2, %xmm1
666 psubb %xmm0, %xmm1
667 pmovmskb %xmm1, %edx
668 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
669 jnz LABEL(exit)
671 #ifdef USE_AS_STRNCMP
672 sub $16, %r11
673 jbe LABEL(strcmp_exitz)
674 #endif
676 add $16, %rcx
677 movdqa %xmm4, %xmm3
679 add $16, %r10
680 jg LABEL(nibble_ashr_5) /* cross page boundary */
682 movdqa (%rsi, %rcx), %xmm1
683 movdqa (%rdi, %rcx), %xmm2
684 movdqa %xmm2, %xmm4
686 psrldq $5, %xmm3
687 pslldq $11, %xmm2
688 por %xmm3, %xmm2
690 pcmpeqb %xmm1, %xmm0
691 pcmpeqb %xmm2, %xmm1
692 psubb %xmm0, %xmm1
693 pmovmskb %xmm1, %edx
694 sub $0xffff, %edx
695 jnz LABEL(exit)
697 #ifdef USE_AS_STRNCMP
698 sub $16, %r11
699 jbe LABEL(strcmp_exitz)
700 #endif
702 add $16, %rcx
703 movdqa %xmm4, %xmm3
704 jmp LABEL(loop_ashr_5)
706 .p2align 4
707 LABEL(nibble_ashr_5):
708 psrldq $5, %xmm4 /* 11 bytes of the last loaded rdi chunk */
709 movdqa (%rsi, %rcx), %xmm1
710 pcmpeqb %xmm1, %xmm0
711 pcmpeqb %xmm4, %xmm1
712 psubb %xmm0, %xmm1
713 pmovmskb %xmm1, %edx
714 sub $0x07ff, %edx /* zero iff low 11 bytes equal and non-null */
715 jnz LABEL(exit)
716 #ifdef USE_AS_STRNCMP
717 cmp $11, %r11 /* count used up within these 11 bytes? */
718 jbe LABEL(strcmp_exitz)
719 #endif
720 pxor %xmm0, %xmm0
721 sub $0x1000, %r10 /* subtract 4K from %r10 */
722 jmp LABEL(gobble_ashr_5)
725 * ashr_6 handles the following cases:
726 * abs(str1 offset - str2 offset) = 10
/*
 * ashr_6: the first %rdi chunk is shifted left 10 bytes to line up with the
 * %rsi chunk, and the low %cl bits of both masks are dropped before testing.
 * In the loop, each comparison value is built from the previous %rdi chunk
 * (%xmm3) shifted right 6 bytes merged with the next %rdi chunk shifted left
 * 10 bytes; %xmm4 carries the newly loaded chunk into the next pass.
 * nibble_ashr_6 runs when %r10 turns positive: it compares only the 10
 * bytes left in %xmm4 (expected mask 0x03ff) before gobble_ashr_6 issues the
 * next %rdi load.
 */
728 .p2align 4
729 LABEL(ashr_6):
730 pxor %xmm0, %xmm0
731 movdqa (%rdi), %xmm2
732 movdqa (%rsi), %xmm1
733 pcmpeqb %xmm1, %xmm0 /* null bytes in the rsi chunk */
734 pslldq $10, %xmm2 /* line rdi chunk up with rsi chunk */
735 pcmpeqb %xmm1, %xmm2
736 psubb %xmm0, %xmm2
737 pmovmskb %xmm2, %r9d
738 shr %cl, %edx /* drop mask bits below the string start */
739 shr %cl, %r9d
740 sub %r9d, %edx
741 jnz LABEL(less32bytes)
742 movdqa (%rdi), %xmm3 /* keep first rdi chunk for the merge below */
744 UPDATE_STRNCMP_COUNTER
746 pxor %xmm0, %xmm0
747 mov $16, %rcx /* index for loads */
748 mov $6, %r9d /* rdi bytes already examined. Used in exit code */
750 * Setup %r10 value allows us to detect crossing a page boundary.
751 * When %r10 goes positive we are crossing a page boundary and
752 * need to do a nibble.
754 lea 6(%rdi), %r10
755 and $0xfff, %r10 /* offset into 4K page */
756 sub $0x1000, %r10 /* subtract 4K pagesize */
757 movdqa %xmm3, %xmm4 /* nibble path reads %xmm4 */
759 .p2align 4
760 LABEL(loop_ashr_6):
761 add $16, %r10
762 jg LABEL(nibble_ashr_6)
764 LABEL(gobble_ashr_6):
765 movdqa (%rsi, %rcx), %xmm1
766 movdqa (%rdi, %rcx), %xmm2
767 movdqa %xmm2, %xmm4 /* saved for the next pass */
769 psrldq $6, %xmm3 /* remaining bytes of previous rdi chunk */
770 pslldq $10, %xmm2 /* leading bytes of new rdi chunk */
771 por %xmm3, %xmm2
773 pcmpeqb %xmm1, %xmm0
774 pcmpeqb %xmm2, %xmm1
775 psubb %xmm0, %xmm1
776 pmovmskb %xmm1, %edx
777 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
778 jnz LABEL(exit)
780 #ifdef USE_AS_STRNCMP
781 sub $16, %r11
782 jbe LABEL(strcmp_exitz)
783 #endif
785 add $16, %rcx
786 movdqa %xmm4, %xmm3
788 add $16, %r10
789 jg LABEL(nibble_ashr_6) /* cross page boundary */
791 movdqa (%rsi, %rcx), %xmm1
792 movdqa (%rdi, %rcx), %xmm2
793 movdqa %xmm2, %xmm4
795 psrldq $6, %xmm3
796 pslldq $10, %xmm2
797 por %xmm3, %xmm2
799 pcmpeqb %xmm1, %xmm0
800 pcmpeqb %xmm2, %xmm1
801 psubb %xmm0, %xmm1
802 pmovmskb %xmm1, %edx
803 sub $0xffff, %edx
804 jnz LABEL(exit)
806 #ifdef USE_AS_STRNCMP
807 sub $16, %r11
808 jbe LABEL(strcmp_exitz)
809 #endif
811 add $16, %rcx
812 movdqa %xmm4, %xmm3
813 jmp LABEL(loop_ashr_6)
815 .p2align 4
816 LABEL(nibble_ashr_6):
817 psrldq $6, %xmm4 /* 10 bytes of the last loaded rdi chunk */
818 movdqa (%rsi, %rcx), %xmm1
819 pcmpeqb %xmm1, %xmm0
820 pcmpeqb %xmm4, %xmm1
821 psubb %xmm0, %xmm1
822 pmovmskb %xmm1, %edx
823 sub $0x03ff, %edx /* zero iff low 10 bytes equal and non-null */
824 jnz LABEL(exit)
825 #ifdef USE_AS_STRNCMP
826 cmp $10, %r11 /* count used up within these 10 bytes? */
827 jbe LABEL(strcmp_exitz)
828 #endif
829 pxor %xmm0, %xmm0
830 sub $0x1000, %r10 /* subtract 4K from %r10 */
831 jmp LABEL(gobble_ashr_6)
834 * ashr_7 handles the following cases:
835 * abs(str1 offset - str2 offset) = 9
/*
 * ashr_7: the first %rdi chunk is shifted left 9 bytes to line up with the
 * %rsi chunk, and the low %cl bits of both masks are dropped before testing.
 * In the loop, each comparison value is built from the previous %rdi chunk
 * (%xmm3) shifted right 7 bytes merged with the next %rdi chunk shifted left
 * 9 bytes; %xmm4 carries the newly loaded chunk into the next pass.
 * nibble_ashr_7 runs when %r10 turns positive: it compares only the 9
 * bytes left in %xmm4 (expected mask 0x01ff) before gobble_ashr_7 issues the
 * next %rdi load.
 */
837 .p2align 4
838 LABEL(ashr_7):
839 pxor %xmm0, %xmm0
840 movdqa (%rdi), %xmm2
841 movdqa (%rsi), %xmm1
842 pcmpeqb %xmm1, %xmm0 /* null bytes in the rsi chunk */
843 pslldq $9, %xmm2 /* line rdi chunk up with rsi chunk */
844 pcmpeqb %xmm1, %xmm2
845 psubb %xmm0, %xmm2
846 pmovmskb %xmm2, %r9d
847 shr %cl, %edx /* drop mask bits below the string start */
848 shr %cl, %r9d
849 sub %r9d, %edx
850 jnz LABEL(less32bytes)
851 movdqa (%rdi), %xmm3 /* keep first rdi chunk for the merge below */
853 UPDATE_STRNCMP_COUNTER
855 pxor %xmm0, %xmm0
856 mov $16, %rcx /* index for loads */
857 mov $7, %r9d /* rdi bytes already examined. Used in exit code */
859 * Setup %r10 value allows us to detect crossing a page boundary.
860 * When %r10 goes positive we are crossing a page boundary and
861 * need to do a nibble.
863 lea 7(%rdi), %r10
864 and $0xfff, %r10 /* offset into 4K page */
865 sub $0x1000, %r10 /* subtract 4K pagesize */
866 movdqa %xmm3, %xmm4 /* nibble path reads %xmm4 */
868 .p2align 4
869 LABEL(loop_ashr_7):
870 add $16, %r10
871 jg LABEL(nibble_ashr_7)
873 LABEL(gobble_ashr_7):
874 movdqa (%rsi, %rcx), %xmm1
875 movdqa (%rdi, %rcx), %xmm2
876 movdqa %xmm2, %xmm4 /* saved for the next pass */
878 psrldq $7, %xmm3 /* remaining bytes of previous rdi chunk */
879 pslldq $9, %xmm2 /* leading bytes of new rdi chunk */
880 por %xmm3, %xmm2
882 pcmpeqb %xmm1, %xmm0
883 pcmpeqb %xmm2, %xmm1
884 psubb %xmm0, %xmm1
885 pmovmskb %xmm1, %edx
886 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
887 jnz LABEL(exit)
889 #ifdef USE_AS_STRNCMP
890 sub $16, %r11
891 jbe LABEL(strcmp_exitz)
892 #endif
894 add $16, %rcx
895 movdqa %xmm4, %xmm3
897 add $16, %r10
898 jg LABEL(nibble_ashr_7) /* cross page boundary */
900 movdqa (%rsi, %rcx), %xmm1
901 movdqa (%rdi, %rcx), %xmm2
902 movdqa %xmm2, %xmm4
904 psrldq $7, %xmm3
905 pslldq $9, %xmm2
906 por %xmm3, %xmm2
908 pcmpeqb %xmm1, %xmm0
909 pcmpeqb %xmm2, %xmm1
910 psubb %xmm0, %xmm1
911 pmovmskb %xmm1, %edx
912 sub $0xffff, %edx
913 jnz LABEL(exit)
915 #ifdef USE_AS_STRNCMP
916 sub $16, %r11
917 jbe LABEL(strcmp_exitz)
918 #endif
920 add $16, %rcx
921 movdqa %xmm4, %xmm3
922 jmp LABEL(loop_ashr_7)
924 .p2align 4
925 LABEL(nibble_ashr_7):
926 psrldq $7, %xmm4 /* 9 bytes of the last loaded rdi chunk */
927 movdqa (%rsi, %rcx), %xmm1
928 pcmpeqb %xmm1, %xmm0
929 pcmpeqb %xmm4, %xmm1
930 psubb %xmm0, %xmm1
931 pmovmskb %xmm1, %edx
932 sub $0x01ff, %edx /* zero iff low 9 bytes equal and non-null */
933 jnz LABEL(exit)
934 #ifdef USE_AS_STRNCMP
935 cmp $9, %r11 /* count used up within these 9 bytes? */
936 jbe LABEL(strcmp_exitz)
937 #endif
938 pxor %xmm0, %xmm0
939 sub $0x1000, %r10 /* subtract 4K from %r10 */
940 jmp LABEL(gobble_ashr_7)
943 * ashr_8 handles the following cases:
944 * abs(str1 offset - str2 offset) = 8
/*
 * ashr_8: the first %rdi chunk is shifted left 8 bytes to line up with the
 * %rsi chunk, and the low %cl bits of both masks are dropped before testing.
 * In the loop, each comparison value is built from the previous %rdi chunk
 * (%xmm3) shifted right 8 bytes merged with the next %rdi chunk shifted left
 * 8 bytes; %xmm4 carries the newly loaded chunk into the next pass.
 * nibble_ashr_8 runs when %r10 turns positive: it compares only the 8
 * bytes left in %xmm4 (expected mask 0x00ff) before gobble_ashr_8 issues the
 * next %rdi load.
 */
946 .p2align 4
947 LABEL(ashr_8):
948 pxor %xmm0, %xmm0
949 movdqa (%rdi), %xmm2
950 movdqa (%rsi), %xmm1
951 pcmpeqb %xmm1, %xmm0 /* null bytes in the rsi chunk */
952 pslldq $8, %xmm2 /* line rdi chunk up with rsi chunk */
953 pcmpeqb %xmm1, %xmm2
954 psubb %xmm0, %xmm2
955 pmovmskb %xmm2, %r9d
956 shr %cl, %edx /* drop mask bits below the string start */
957 shr %cl, %r9d
958 sub %r9d, %edx
959 jnz LABEL(less32bytes)
960 movdqa (%rdi), %xmm3 /* keep first rdi chunk for the merge below */
962 UPDATE_STRNCMP_COUNTER
964 pxor %xmm0, %xmm0
965 mov $16, %rcx /* index for loads */
966 mov $8, %r9d /* rdi bytes already examined. Used in exit code */
968 * Setup %r10 value allows us to detect crossing a page boundary.
969 * When %r10 goes positive we are crossing a page boundary and
970 * need to do a nibble.
972 lea 8(%rdi), %r10
973 and $0xfff, %r10 /* offset into 4K page */
974 sub $0x1000, %r10 /* subtract 4K pagesize */
975 movdqa %xmm3, %xmm4 /* nibble path reads %xmm4 */
977 .p2align 4
978 LABEL(loop_ashr_8):
979 add $16, %r10
980 jg LABEL(nibble_ashr_8)
982 LABEL(gobble_ashr_8):
983 movdqa (%rsi, %rcx), %xmm1
984 movdqa (%rdi, %rcx), %xmm2
985 movdqa %xmm2, %xmm4 /* saved for the next pass */
987 psrldq $8, %xmm3 /* remaining bytes of previous rdi chunk */
988 pslldq $8, %xmm2 /* leading bytes of new rdi chunk */
989 por %xmm3, %xmm2
991 pcmpeqb %xmm1, %xmm0
992 pcmpeqb %xmm2, %xmm1
993 psubb %xmm0, %xmm1
994 pmovmskb %xmm1, %edx
995 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
996 jnz LABEL(exit)
998 #ifdef USE_AS_STRNCMP
999 sub $16, %r11
1000 jbe LABEL(strcmp_exitz)
1001 #endif
1003 add $16, %rcx
1004 movdqa %xmm4, %xmm3
1006 add $16, %r10
1007 jg LABEL(nibble_ashr_8) /* cross page boundary */
1009 movdqa (%rsi, %rcx), %xmm1
1010 movdqa (%rdi, %rcx), %xmm2
1011 movdqa %xmm2, %xmm4
1013 psrldq $8, %xmm3
1014 pslldq $8, %xmm2
1015 por %xmm3, %xmm2
1017 pcmpeqb %xmm1, %xmm0
1018 pcmpeqb %xmm2, %xmm1
1019 psubb %xmm0, %xmm1
1020 pmovmskb %xmm1, %edx
1021 sub $0xffff, %edx
1022 jnz LABEL(exit)
1024 #ifdef USE_AS_STRNCMP
1025 sub $16, %r11
1026 jbe LABEL(strcmp_exitz)
1027 #endif
1029 add $16, %rcx
1030 movdqa %xmm4, %xmm3
1031 jmp LABEL(loop_ashr_8)
1033 .p2align 4
1034 LABEL(nibble_ashr_8):
1035 psrldq $8, %xmm4 /* 8 bytes of the last loaded rdi chunk */
1036 movdqa (%rsi, %rcx), %xmm1
1037 pcmpeqb %xmm1, %xmm0
1038 pcmpeqb %xmm4, %xmm1
1039 psubb %xmm0, %xmm1
1040 pmovmskb %xmm1, %edx
1041 sub $0x00ff, %edx /* zero iff low 8 bytes equal and non-null */
1042 jnz LABEL(exit)
1043 #ifdef USE_AS_STRNCMP
1044 cmp $8, %r11 /* count used up within these 8 bytes? */
1045 jbe LABEL(strcmp_exitz)
1046 #endif
1047 pxor %xmm0, %xmm0
1048 sub $0x1000, %r10 /* subtract 4K from %r10 */
1049 jmp LABEL(gobble_ashr_8)
1052 * ashr_9 handles the following cases:
1053 * abs(str1 offset - str2 offset) = 7
/*
 * ashr_9: the first %rdi chunk is shifted left 7 bytes to line up with the
 * %rsi chunk, and the low %cl bits of both masks are dropped before testing.
 * In the loop, each comparison value is built from the previous %rdi chunk
 * (%xmm3) shifted right 9 bytes merged with the next %rdi chunk shifted left
 * 7 bytes; %xmm4 carries the newly loaded chunk into the next pass.
 * nibble_ashr_9 runs when %r10 turns positive: it compares only the 7
 * bytes left in %xmm4 (expected mask 0x007f) before gobble_ashr_9 issues the
 * next %rdi load.
 */
1055 .p2align 4
1056 LABEL(ashr_9):
1057 pxor %xmm0, %xmm0
1058 movdqa (%rdi), %xmm2
1059 movdqa (%rsi), %xmm1
1060 pcmpeqb %xmm1, %xmm0 /* null bytes in the rsi chunk */
1061 pslldq $7, %xmm2 /* line rdi chunk up with rsi chunk */
1062 pcmpeqb %xmm1, %xmm2
1063 psubb %xmm0, %xmm2
1064 pmovmskb %xmm2, %r9d
1065 shr %cl, %edx /* drop mask bits below the string start */
1066 shr %cl, %r9d
1067 sub %r9d, %edx
1068 jnz LABEL(less32bytes)
1069 movdqa (%rdi), %xmm3 /* keep first rdi chunk for the merge below */
1071 UPDATE_STRNCMP_COUNTER
1073 pxor %xmm0, %xmm0
1074 mov $16, %rcx /* index for loads */
1075 mov $9, %r9d /* rdi bytes already examined. Used in exit code */
1077 * Setup %r10 value allows us to detect crossing a page boundary.
1078 * When %r10 goes positive we are crossing a page boundary and
1079 * need to do a nibble.
1081 lea 9(%rdi), %r10
1082 and $0xfff, %r10 /* offset into 4K page */
1083 sub $0x1000, %r10 /* subtract 4K pagesize */
1084 movdqa %xmm3, %xmm4 /* nibble path reads %xmm4 */
1086 .p2align 4
1087 LABEL(loop_ashr_9):
1088 add $16, %r10
1089 jg LABEL(nibble_ashr_9)
1091 LABEL(gobble_ashr_9):
1092 movdqa (%rsi, %rcx), %xmm1
1093 movdqa (%rdi, %rcx), %xmm2
1094 movdqa %xmm2, %xmm4 /* saved for the next pass */
1096 psrldq $9, %xmm3 /* remaining bytes of previous rdi chunk */
1097 pslldq $7, %xmm2 /* leading bytes of new rdi chunk */
1098 por %xmm3, %xmm2
1100 pcmpeqb %xmm1, %xmm0
1101 pcmpeqb %xmm2, %xmm1
1102 psubb %xmm0, %xmm1
1103 pmovmskb %xmm1, %edx
1104 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
1105 jnz LABEL(exit)
1107 #ifdef USE_AS_STRNCMP
1108 sub $16, %r11
1109 jbe LABEL(strcmp_exitz)
1110 #endif
1112 add $16, %rcx
1113 movdqa %xmm4, %xmm3
1115 add $16, %r10
1116 jg LABEL(nibble_ashr_9) /* cross page boundary */
1118 movdqa (%rsi, %rcx), %xmm1
1119 movdqa (%rdi, %rcx), %xmm2
1120 movdqa %xmm2, %xmm4
1122 psrldq $9, %xmm3
1123 pslldq $7, %xmm2
1124 por %xmm3, %xmm2
1126 pcmpeqb %xmm1, %xmm0
1127 pcmpeqb %xmm2, %xmm1
1128 psubb %xmm0, %xmm1
1129 pmovmskb %xmm1, %edx
1130 sub $0xffff, %edx
1131 jnz LABEL(exit)
1133 #ifdef USE_AS_STRNCMP
1134 sub $16, %r11
1135 jbe LABEL(strcmp_exitz)
1136 #endif
1138 add $16, %rcx
1139 movdqa %xmm4, %xmm3 /* store for next cycle */
1140 jmp LABEL(loop_ashr_9)
1142 .p2align 4
1143 LABEL(nibble_ashr_9):
1144 psrldq $9, %xmm4 /* 7 bytes of the last loaded rdi chunk */
1145 movdqa (%rsi, %rcx), %xmm1
1146 pcmpeqb %xmm1, %xmm0
1147 pcmpeqb %xmm4, %xmm1
1148 psubb %xmm0, %xmm1
1149 pmovmskb %xmm1, %edx
1150 sub $0x007f, %edx /* zero iff low 7 bytes equal and non-null */
1151 jnz LABEL(exit)
1152 #ifdef USE_AS_STRNCMP
1153 cmp $7, %r11 /* count used up within these 7 bytes? */
1154 jbe LABEL(strcmp_exitz)
1155 #endif
1156 pxor %xmm0, %xmm0
1157 sub $0x1000, %r10 /* subtract 4K from %r10 */
1158 jmp LABEL(gobble_ashr_9)
1161 * ashr_10 handles the following cases:
1162 * abs(str1 offset - str2 offset) = 6
/*
 * ashr_10: the first %rdi chunk is shifted left 6 bytes to line up with the
 * %rsi chunk, and the low %cl bits of both masks are dropped before testing.
 * In the loop, each comparison value is built from the previous %rdi chunk
 * (%xmm3) shifted right 10 bytes merged with the next %rdi chunk shifted
 * left 6 bytes; %xmm4 carries the newly loaded chunk into the next pass.
 * nibble_ashr_10 runs when %r10 turns positive: it compares only the 6
 * bytes left in %xmm4 (expected mask 0x003f) before gobble_ashr_10 issues
 * the next %rdi load.
 */
1164 .p2align 4
1165 LABEL(ashr_10):
1166 pxor %xmm0, %xmm0
1167 movdqa (%rdi), %xmm2
1168 movdqa (%rsi), %xmm1
1169 pcmpeqb %xmm1, %xmm0 /* null bytes in the rsi chunk */
1170 pslldq $6, %xmm2 /* line rdi chunk up with rsi chunk */
1171 pcmpeqb %xmm1, %xmm2
1172 psubb %xmm0, %xmm2
1173 pmovmskb %xmm2, %r9d
1174 shr %cl, %edx /* drop mask bits below the string start */
1175 shr %cl, %r9d
1176 sub %r9d, %edx
1177 jnz LABEL(less32bytes)
1178 movdqa (%rdi), %xmm3 /* keep first rdi chunk for the merge below */
1180 UPDATE_STRNCMP_COUNTER
1182 pxor %xmm0, %xmm0
1183 mov $16, %rcx /* index for loads */
1184 mov $10, %r9d /* rdi bytes already examined. Used in exit code */
1186 * Setup %r10 value allows us to detect crossing a page boundary.
1187 * When %r10 goes positive we are crossing a page boundary and
1188 * need to do a nibble.
1190 lea 10(%rdi), %r10
1191 and $0xfff, %r10 /* offset into 4K page */
1192 sub $0x1000, %r10 /* subtract 4K pagesize */
1193 movdqa %xmm3, %xmm4 /* nibble path reads %xmm4 */
1195 .p2align 4
1196 LABEL(loop_ashr_10):
1197 add $16, %r10
1198 jg LABEL(nibble_ashr_10)
1200 LABEL(gobble_ashr_10):
1201 movdqa (%rsi, %rcx), %xmm1
1202 movdqa (%rdi, %rcx), %xmm2
1203 movdqa %xmm2, %xmm4 /* saved for the next pass */
1205 psrldq $10, %xmm3 /* remaining bytes of previous rdi chunk */
1206 pslldq $6, %xmm2 /* leading bytes of new rdi chunk */
1207 por %xmm3, %xmm2
1209 pcmpeqb %xmm1, %xmm0
1210 pcmpeqb %xmm2, %xmm1
1211 psubb %xmm0, %xmm1
1212 pmovmskb %xmm1, %edx
1213 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
1214 jnz LABEL(exit)
1216 #ifdef USE_AS_STRNCMP
1217 sub $16, %r11
1218 jbe LABEL(strcmp_exitz)
1219 #endif
1221 add $16, %rcx
1222 movdqa %xmm4, %xmm3
1224 add $16, %r10
1225 jg LABEL(nibble_ashr_10) /* cross page boundary */
1227 movdqa (%rsi, %rcx), %xmm1
1228 movdqa (%rdi, %rcx), %xmm2
1229 movdqa %xmm2, %xmm4
1231 psrldq $10, %xmm3
1232 pslldq $6, %xmm2
1233 por %xmm3, %xmm2
1235 pcmpeqb %xmm1, %xmm0
1236 pcmpeqb %xmm2, %xmm1
1237 psubb %xmm0, %xmm1
1238 pmovmskb %xmm1, %edx
1239 sub $0xffff, %edx
1240 jnz LABEL(exit)
1242 #ifdef USE_AS_STRNCMP
1243 sub $16, %r11
1244 jbe LABEL(strcmp_exitz)
1245 #endif
1247 add $16, %rcx
1248 movdqa %xmm4, %xmm3
1249 jmp LABEL(loop_ashr_10)
1251 .p2align 4
1252 LABEL(nibble_ashr_10):
1253 psrldq $10, %xmm4 /* 6 bytes of the last loaded rdi chunk */
1254 movdqa (%rsi, %rcx), %xmm1
1255 pcmpeqb %xmm1, %xmm0
1256 pcmpeqb %xmm4, %xmm1
1257 psubb %xmm0, %xmm1
1258 pmovmskb %xmm1, %edx
1259 sub $0x003f, %edx /* zero iff low 6 bytes equal and non-null */
1260 jnz LABEL(exit)
1261 #ifdef USE_AS_STRNCMP
1262 cmp $6, %r11 /* count used up within these 6 bytes? */
1263 jbe LABEL(strcmp_exitz)
1264 #endif
1265 pxor %xmm0, %xmm0
1266 sub $0x1000, %r10 /* subtract 4K from %r10 */
1267 jmp LABEL(gobble_ashr_10)
1270 * ashr_11 handles the following cases:
1271 * abs(str1 offset - str2 offset) = 5
/*
 * ashr_11: the first %rdi chunk is shifted left 5 bytes to line up with the
 * %rsi chunk, and the low %cl bits of both masks are dropped before testing.
 * In the loop, each comparison value is built from the previous %rdi chunk
 * (%xmm3) shifted right 11 bytes merged with the next %rdi chunk shifted
 * left 5 bytes; %xmm4 carries the newly loaded chunk into the next pass.
 * nibble_ashr_11 runs when %r10 turns positive: it compares only the 5
 * bytes left in %xmm4 (expected mask 0x001f) before gobble_ashr_11 issues
 * the next %rdi load.
 */
1273 .p2align 4
1274 LABEL(ashr_11):
1275 pxor %xmm0, %xmm0
1276 movdqa (%rdi), %xmm2
1277 movdqa (%rsi), %xmm1
1278 pcmpeqb %xmm1, %xmm0 /* null bytes in the rsi chunk */
1279 pslldq $5, %xmm2 /* line rdi chunk up with rsi chunk */
1280 pcmpeqb %xmm1, %xmm2
1281 psubb %xmm0, %xmm2
1282 pmovmskb %xmm2, %r9d
1283 shr %cl, %edx /* drop mask bits below the string start */
1284 shr %cl, %r9d
1285 sub %r9d, %edx
1286 jnz LABEL(less32bytes)
1287 movdqa (%rdi), %xmm3 /* keep first rdi chunk for the merge below */
1289 UPDATE_STRNCMP_COUNTER
1291 pxor %xmm0, %xmm0
1292 mov $16, %rcx /* index for loads */
1293 mov $11, %r9d /* rdi bytes already examined. Used in exit code */
1295 * Setup %r10 value allows us to detect crossing a page boundary.
1296 * When %r10 goes positive we are crossing a page boundary and
1297 * need to do a nibble.
1299 lea 11(%rdi), %r10
1300 and $0xfff, %r10 /* offset into 4K page */
1301 sub $0x1000, %r10 /* subtract 4K pagesize */
1302 movdqa %xmm3, %xmm4 /* nibble path reads %xmm4 */
1304 .p2align 4
1305 LABEL(loop_ashr_11):
1306 add $16, %r10
1307 jg LABEL(nibble_ashr_11)
1309 LABEL(gobble_ashr_11):
1310 movdqa (%rsi, %rcx), %xmm1
1311 movdqa (%rdi, %rcx), %xmm2
1312 movdqa %xmm2, %xmm4 /* saved for the next pass */
1314 psrldq $11, %xmm3 /* remaining bytes of previous rdi chunk */
1315 pslldq $5, %xmm2 /* leading bytes of new rdi chunk */
1316 por %xmm3, %xmm2
1318 pcmpeqb %xmm1, %xmm0
1319 pcmpeqb %xmm2, %xmm1
1320 psubb %xmm0, %xmm1
1321 pmovmskb %xmm1, %edx
1322 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
1323 jnz LABEL(exit)
1325 #ifdef USE_AS_STRNCMP
1326 sub $16, %r11
1327 jbe LABEL(strcmp_exitz)
1328 #endif
1330 add $16, %rcx
1331 movdqa %xmm4, %xmm3
1333 add $16, %r10
1334 jg LABEL(nibble_ashr_11) /* cross page boundary */
1336 movdqa (%rsi, %rcx), %xmm1
1337 movdqa (%rdi, %rcx), %xmm2
1338 movdqa %xmm2, %xmm4
1340 psrldq $11, %xmm3
1341 pslldq $5, %xmm2
1342 por %xmm3, %xmm2
1344 pcmpeqb %xmm1, %xmm0
1345 pcmpeqb %xmm2, %xmm1
1346 psubb %xmm0, %xmm1
1347 pmovmskb %xmm1, %edx
1348 sub $0xffff, %edx
1349 jnz LABEL(exit)
1351 #ifdef USE_AS_STRNCMP
1352 sub $16, %r11
1353 jbe LABEL(strcmp_exitz)
1354 #endif
1356 add $16, %rcx
1357 movdqa %xmm4, %xmm3
1358 jmp LABEL(loop_ashr_11)
1360 .p2align 4
1361 LABEL(nibble_ashr_11):
1362 psrldq $11, %xmm4 /* 5 bytes of the last loaded rdi chunk */
1363 movdqa (%rsi, %rcx), %xmm1
1364 pcmpeqb %xmm1, %xmm0
1365 pcmpeqb %xmm4, %xmm1
1366 psubb %xmm0, %xmm1
1367 pmovmskb %xmm1, %edx
1368 sub $0x001f, %edx /* zero iff low 5 bytes equal and non-null */
1369 jnz LABEL(exit)
1370 #ifdef USE_AS_STRNCMP
1371 cmp $5, %r11 /* count used up within these 5 bytes? */
1372 jbe LABEL(strcmp_exitz)
1373 #endif
1374 pxor %xmm0, %xmm0
1375 sub $0x1000, %r10 /* subtract 4K from %r10 */
1376 jmp LABEL(gobble_ashr_11)
1379 * ashr_12 handles the following cases:
1380 * abs(str1 offset - str2 offset) = 4
/*
 * ashr_12: the first %rdi chunk is shifted left 4 bytes to line up with the
 * %rsi chunk, and the low %cl bits of both masks are dropped before testing.
 * In the loop, each comparison value is built from the previous %rdi chunk
 * (%xmm3) shifted right 12 bytes merged with the next %rdi chunk shifted
 * left 4 bytes; %xmm4 carries the newly loaded chunk into the next pass.
 * Control leaves for nibble_ashr_12 whenever %r10 turns positive.
 */
1382 .p2align 4
1383 LABEL(ashr_12):
1384 pxor %xmm0, %xmm0
1385 movdqa (%rdi), %xmm2
1386 movdqa (%rsi), %xmm1
1387 pcmpeqb %xmm1, %xmm0 /* null bytes in the rsi chunk */
1388 pslldq $4, %xmm2 /* line rdi chunk up with rsi chunk */
1389 pcmpeqb %xmm1, %xmm2
1390 psubb %xmm0, %xmm2
1391 pmovmskb %xmm2, %r9d
1392 shr %cl, %edx /* drop mask bits below the string start */
1393 shr %cl, %r9d
1394 sub %r9d, %edx
1395 jnz LABEL(less32bytes)
1396 movdqa (%rdi), %xmm3 /* keep first rdi chunk for the merge below */
1398 UPDATE_STRNCMP_COUNTER
1400 pxor %xmm0, %xmm0
1401 mov $16, %rcx /* index for loads */
1402 mov $12, %r9d /* rdi bytes already examined. Used in exit code */
1404 * Setup %r10 value allows us to detect crossing a page boundary.
1405 * When %r10 goes positive we are crossing a page boundary and
1406 * need to do a nibble.
1408 lea 12(%rdi), %r10
1409 and $0xfff, %r10 /* offset into 4K page */
1410 sub $0x1000, %r10 /* subtract 4K pagesize */
1411 movdqa %xmm3, %xmm4 /* copy of the current rdi chunk */
1413 .p2align 4
1414 LABEL(loop_ashr_12):
1415 add $16, %r10
1416 jg LABEL(nibble_ashr_12)
1418 LABEL(gobble_ashr_12):
1419 movdqa (%rsi, %rcx), %xmm1
1420 movdqa (%rdi, %rcx), %xmm2
1421 movdqa %xmm2, %xmm4 /* saved for the next pass */
1423 psrldq $12, %xmm3 /* remaining bytes of previous rdi chunk */
1424 pslldq $4, %xmm2 /* leading bytes of new rdi chunk */
1425 por %xmm3, %xmm2
1427 pcmpeqb %xmm1, %xmm0
1428 pcmpeqb %xmm2, %xmm1
1429 psubb %xmm0, %xmm1
1430 pmovmskb %xmm1, %edx
1431 sub $0xffff, %edx /* zero iff all 16 bytes equal and non-null */
1432 jnz LABEL(exit)
1434 #ifdef USE_AS_STRNCMP
1435 sub $16, %r11
1436 jbe LABEL(strcmp_exitz)
1437 #endif
1439 add $16, %rcx
1440 movdqa %xmm4, %xmm3
1442 add $16, %r10
1443 jg LABEL(nibble_ashr_12) /* cross page boundary */
1445 movdqa (%rsi, %rcx), %xmm1
1446 movdqa (%rdi, %rcx), %xmm2
1447 movdqa %xmm2, %xmm4
1449 psrldq $12, %xmm3
1450 pslldq $4, %xmm2
1451 por %xmm3, %xmm2
1453 pcmpeqb %xmm1, %xmm0
1454 pcmpeqb %xmm2, %xmm1
1455 psubb %xmm0, %xmm1
1456 pmovmskb %xmm1, %edx
1457 sub $0xffff, %edx
1458 jnz LABEL(exit)
1460 #ifdef USE_AS_STRNCMP
1461 sub $16, %r11
1462 jbe LABEL(strcmp_exitz)
1463 #endif
1465 add $16, %rcx
1466 movdqa %xmm4, %xmm3
1467 jmp LABEL(loop_ashr_12)
1469 .p2align 4
1470 LABEL(nibble_ashr_12):
1471 psrldq $12, %xmm4
1472 movdqa (%rsi, %rcx), %xmm1
1473 pcmpeqb %xmm1, %xmm0
1474 pcmpeqb %xmm4, %xmm1
1475 psubb %xmm0, %xmm1
1476 pmovmskb %xmm1, %edx
1477 sub $0x000f, %edx
1478 jnz LABEL(exit)
1479 #ifdef USE_AS_STRNCMP
1480 cmp $4, %r11
1481 jbe LABEL(strcmp_exitz)
1482 #endif
1483 pxor %xmm0, %xmm0
1484 sub $0x1000, %r10 /* subtract 4K from %r10 */
1485 jmp LABEL(gobble_ashr_12)
1488 * ashr_13 handles the following cases:
1489 * abs(str1 offset - str2 offset) = 3
1491 .p2align 4
1492 LABEL(ashr_13):
/* Case handler: compare 16 bytes at a time with the %rdi data moved up by 3 bytes so that it lines up with the %rsi data. */
1493 pxor %xmm0, %xmm0 /* xmm0 = 0, reference value for NUL detection */
1494 movdqa (%rdi), %xmm2 /* first 16-byte chunk at (%rdi) */
1495 movdqa (%rsi), %xmm1 /* first 16-byte chunk at (%rsi) */
1496 pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff in every byte where the %rsi chunk is NUL */
1497 pslldq $3, %xmm2 /* move the %rdi chunk up by 3 bytes */
1498 pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff in every byte where the chunks match */
1499 psubb %xmm0, %xmm2 /* a byte keeps its top bit only if it matches and is not NUL */
1500 pmovmskb %xmm2, %r9d /* r9d = one bit per byte */
/* NOTE(review): %edx and %cl are loaded before this label, outside the visible chunk; presumably a 16-bit all-ones mask and a byte offset -- confirm. */
1501 shr %cl, %edx
1502 shr %cl, %r9d /* drop the bits below the offset in %cl */
1503 sub %r9d, %edx /* non-zero when some remaining byte differs or is NUL */
1504 jnz LABEL(less32bytes) /* outcome is decided inside the first chunk */
1505 movdqa (%rdi), %xmm3 /* reload the unshifted %rdi chunk; the loop treats it as the previous chunk */
1507 UPDATE_STRNCMP_COUNTER /* strncmp only: recompute the remaining count in %r11, leaving via strcmp_exitz when it is used up */
1509 pxor %xmm0, %xmm0 /* reset the NUL reference */
1510 mov $16, %rcx /* index for loads */
1511 mov $13, %r9d /* rdi bytes already examined. Used in exit code */
1513 * Setup %r10 value allows us to detect crossing a page boundary.
1514 * When %r10 goes positive we are crossing a page boundary and
1515 * need to do a nibble.
1517 lea 13(%rdi), %r10 /* together with the next two lines: r10 = ((%rdi + 13) & 0xfff) - 0x1000 */
1518 and $0xfff, %r10 /* offset into 4K page */
1519 sub $0x1000, %r10 /* subtract 4K pagesize */
1520 movdqa %xmm3, %xmm4 /* xmm4 = copy of the previous %rdi chunk, consumed by the nibble path */
1522 .p2align 4
1523 LABEL(loop_ashr_13):
1524 add $16, %r10 /* account for the next 16-byte step */
1525 jg LABEL(nibble_ashr_13) /* positive: page boundary reached, check the leftover bytes first */
1527 LABEL(gobble_ashr_13):
1528 movdqa (%rsi, %rcx), %xmm1 /* next %rsi chunk */
1529 movdqa (%rdi, %rcx), %xmm2 /* next %rdi chunk */
1530 movdqa %xmm2, %xmm4 /* keep an unshifted copy for the following step */
1532 psrldq $13, %xmm3 /* previous chunk: its top 3 bytes moved to the bottom */
1533 pslldq $3, %xmm2 /* current chunk: its low 13 bytes moved up by 3 */
1534 por %xmm3, %xmm2 /* xmm2 = 16 %rdi bytes lined up with xmm1 */
1536 pcmpeqb %xmm1, %xmm0 /* NUL bytes in the %rsi chunk */
1537 pcmpeqb %xmm2, %xmm1 /* matching bytes */
1538 psubb %xmm0, %xmm1 /* keep only bytes that match and are not NUL */
1539 pmovmskb %xmm1, %edx
1540 sub $0xffff, %edx /* zero only when all 16 bytes match and none is NUL */
1541 jnz LABEL(exit)
1543 #ifdef USE_AS_STRNCMP
1544 sub $16, %r11 /* 16 more bytes consumed */
1545 jbe LABEL(strcmp_exitz) /* count used up without a difference */
1546 #endif
1548 add $16, %rcx /* advance the load index */
1549 movdqa %xmm4, %xmm3 /* current chunk becomes the previous chunk */
/* Second copy of the same step (the loop body is unrolled twice). */
1551 add $16, %r10
1552 jg LABEL(nibble_ashr_13) /* cross page boundary */
1554 movdqa (%rsi, %rcx), %xmm1
1555 movdqa (%rdi, %rcx), %xmm2
1556 movdqa %xmm2, %xmm4
1558 psrldq $13, %xmm3
1559 pslldq $3, %xmm2
1560 por %xmm3, %xmm2
1562 pcmpeqb %xmm1, %xmm0
1563 pcmpeqb %xmm2, %xmm1
1564 psubb %xmm0, %xmm1
1565 pmovmskb %xmm1, %edx
1566 sub $0xffff, %edx
1567 jnz LABEL(exit)
1569 #ifdef USE_AS_STRNCMP
1570 sub $16, %r11
1571 jbe LABEL(strcmp_exitz)
1572 #endif
1574 add $16, %rcx
1575 movdqa %xmm4, %xmm3
1576 jmp LABEL(loop_ashr_13)
1578 .p2align 4
1579 LABEL(nibble_ashr_13):
/* Page-boundary path: test the 3 %rdi bytes still held in xmm4 before the loop loads the next %rdi chunk. */
1580 psrldq $13, %xmm4 /* leave only the top 3 bytes of the previous %rdi chunk, moved to the bottom */
1581 movdqa (%rsi, %rcx), %xmm1
1582 pcmpeqb %xmm1, %xmm0
1583 pcmpeqb %xmm4, %xmm1
1584 psubb %xmm0, %xmm1
1585 pmovmskb %xmm1, %edx
1586 sub $0x0007, %edx /* zero only when those 3 bytes match and none is NUL */
1587 jnz LABEL(exit)
1588 #ifdef USE_AS_STRNCMP
1589 cmp $3, %r11 /* at most 3 bytes left to compare: nothing differs within the count */
1590 jbe LABEL(strcmp_exitz)
1591 #endif
1592 pxor %xmm0, %xmm0 /* reset the NUL reference before rejoining the loop */
1593 sub $0x1000, %r10 /* subtract 4K from %r10 */
1594 jmp LABEL(gobble_ashr_13)
1597 * ashr_14 handles the following cases:
1598 * abs(str1 offset - str2 offset) = 2
1600 .p2align 4
1601 LABEL(ashr_14):
/* Case handler: compare 16 bytes at a time with the %rdi data moved up by 2 bytes so that it lines up with the %rsi data. */
1602 pxor %xmm0, %xmm0 /* xmm0 = 0, reference value for NUL detection */
1603 movdqa (%rdi), %xmm2 /* first 16-byte chunk at (%rdi) */
1604 movdqa (%rsi), %xmm1 /* first 16-byte chunk at (%rsi) */
1605 pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff in every byte where the %rsi chunk is NUL */
1606 pslldq $2, %xmm2 /* move the %rdi chunk up by 2 bytes */
1607 pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff in every byte where the chunks match */
1608 psubb %xmm0, %xmm2 /* a byte keeps its top bit only if it matches and is not NUL */
1609 pmovmskb %xmm2, %r9d /* r9d = one bit per byte */
/* NOTE(review): %edx and %cl are loaded before this label, outside the visible chunk; presumably a 16-bit all-ones mask and a byte offset -- confirm. */
1610 shr %cl, %edx
1611 shr %cl, %r9d /* drop the bits below the offset in %cl */
1612 sub %r9d, %edx /* non-zero when some remaining byte differs or is NUL */
1613 jnz LABEL(less32bytes) /* outcome is decided inside the first chunk */
1614 movdqa (%rdi), %xmm3 /* reload the unshifted %rdi chunk; the loop treats it as the previous chunk */
1616 UPDATE_STRNCMP_COUNTER /* strncmp only: recompute the remaining count in %r11, leaving via strcmp_exitz when it is used up */
1618 pxor %xmm0, %xmm0 /* reset the NUL reference */
1619 mov $16, %rcx /* index for loads */
1620 mov $14, %r9d /* rdi bytes already examined. Used in exit code */
1622 * Setup %r10 value allows us to detect crossing a page boundary.
1623 * When %r10 goes positive we are crossing a page boundary and
1624 * need to do a nibble.
1626 lea 14(%rdi), %r10 /* together with the next two lines: r10 = ((%rdi + 14) & 0xfff) - 0x1000 */
1627 and $0xfff, %r10 /* offset into 4K page */
1628 sub $0x1000, %r10 /* subtract 4K pagesize */
1629 movdqa %xmm3, %xmm4 /* xmm4 = copy of the previous %rdi chunk, consumed by the nibble path */
1631 .p2align 4
1632 LABEL(loop_ashr_14):
1633 add $16, %r10 /* account for the next 16-byte step */
1634 jg LABEL(nibble_ashr_14) /* positive: page boundary reached, check the leftover bytes first */
1636 LABEL(gobble_ashr_14):
1637 movdqa (%rsi, %rcx), %xmm1 /* next %rsi chunk */
1638 movdqa (%rdi, %rcx), %xmm2 /* next %rdi chunk */
1639 movdqa %xmm2, %xmm4 /* keep an unshifted copy for the following step */
1641 psrldq $14, %xmm3 /* previous chunk: its top 2 bytes moved to the bottom */
1642 pslldq $2, %xmm2 /* current chunk: its low 14 bytes moved up by 2 */
1643 por %xmm3, %xmm2 /* xmm2 = 16 %rdi bytes lined up with xmm1 */
1645 pcmpeqb %xmm1, %xmm0 /* NUL bytes in the %rsi chunk */
1646 pcmpeqb %xmm2, %xmm1 /* matching bytes */
1647 psubb %xmm0, %xmm1 /* keep only bytes that match and are not NUL */
1648 pmovmskb %xmm1, %edx
1649 sub $0xffff, %edx /* zero only when all 16 bytes match and none is NUL */
1650 jnz LABEL(exit)
1652 #ifdef USE_AS_STRNCMP
1653 sub $16, %r11 /* 16 more bytes consumed */
1654 jbe LABEL(strcmp_exitz) /* count used up without a difference */
1655 #endif
1657 add $16, %rcx /* advance the load index */
1658 movdqa %xmm4, %xmm3 /* current chunk becomes the previous chunk */
/* Second copy of the same step (the loop body is unrolled twice). */
1660 add $16, %r10
1661 jg LABEL(nibble_ashr_14) /* cross page boundary */
1663 movdqa (%rsi, %rcx), %xmm1
1664 movdqa (%rdi, %rcx), %xmm2
1665 movdqa %xmm2, %xmm4
1667 psrldq $14, %xmm3
1668 pslldq $2, %xmm2
1669 por %xmm3, %xmm2
1671 pcmpeqb %xmm1, %xmm0
1672 pcmpeqb %xmm2, %xmm1
1673 psubb %xmm0, %xmm1
1674 pmovmskb %xmm1, %edx
1675 sub $0xffff, %edx
1676 jnz LABEL(exit)
1678 #ifdef USE_AS_STRNCMP
1679 sub $16, %r11
1680 jbe LABEL(strcmp_exitz)
1681 #endif
1683 add $16, %rcx
1684 movdqa %xmm4, %xmm3
1685 jmp LABEL(loop_ashr_14)
1687 .p2align 4
1688 LABEL(nibble_ashr_14):
/* Page-boundary path: test the 2 %rdi bytes still held in xmm4 before the loop loads the next %rdi chunk. */
1689 psrldq $14, %xmm4 /* leave only the top 2 bytes of the previous %rdi chunk, moved to the bottom */
1690 movdqa (%rsi, %rcx), %xmm1
1691 pcmpeqb %xmm1, %xmm0
1692 pcmpeqb %xmm4, %xmm1
1693 psubb %xmm0, %xmm1
1694 pmovmskb %xmm1, %edx
1695 sub $0x0003, %edx /* zero only when those 2 bytes match and none is NUL */
1696 jnz LABEL(exit)
1697 #ifdef USE_AS_STRNCMP
1698 cmp $2, %r11 /* at most 2 bytes left to compare: nothing differs within the count */
1699 jbe LABEL(strcmp_exitz)
1700 #endif
1701 pxor %xmm0, %xmm0 /* reset the NUL reference before rejoining the loop */
1702 sub $0x1000, %r10 /* subtract 4K from %r10 */
1703 jmp LABEL(gobble_ashr_14)
1706 * ashr_15 handles the following cases:
1707 * abs(str1 offset - str2 offset) = 1
1709 .p2align 4
1710 LABEL(ashr_15):
/* Case handler: compare 16 bytes at a time with the %rdi data moved up by 1 byte so that it lines up with the %rsi data. */
1711 pxor %xmm0, %xmm0 /* xmm0 = 0, reference value for NUL detection */
1712 movdqa (%rdi), %xmm2 /* first 16-byte chunk at (%rdi) */
1713 movdqa (%rsi), %xmm1 /* first 16-byte chunk at (%rsi) */
1714 pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff in every byte where the %rsi chunk is NUL */
1715 pslldq $1, %xmm2 /* move the %rdi chunk up by 1 byte */
1716 pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff in every byte where the chunks match */
1717 psubb %xmm0, %xmm2 /* a byte keeps its top bit only if it matches and is not NUL */
1718 pmovmskb %xmm2, %r9d /* r9d = one bit per byte */
/* NOTE(review): %edx and %cl are loaded before this label, outside the visible chunk; presumably a 16-bit all-ones mask and a byte offset -- confirm. */
1719 shr %cl, %edx
1720 shr %cl, %r9d /* drop the bits below the offset in %cl */
1721 sub %r9d, %edx /* non-zero when some remaining byte differs or is NUL */
1722 jnz LABEL(less32bytes) /* outcome is decided inside the first chunk */
1724 movdqa (%rdi), %xmm3 /* reload the unshifted %rdi chunk; the loop treats it as the previous chunk */
1726 UPDATE_STRNCMP_COUNTER /* strncmp only: recompute the remaining count in %r11, leaving via strcmp_exitz when it is used up */
1728 pxor %xmm0, %xmm0 /* reset the NUL reference */
1729 mov $16, %rcx /* index for loads */
1730 mov $15, %r9d /* rdi bytes already examined. Used in exit code */
1732 * Setup %r10 value allows us to detect crossing a page boundary.
1733 * When %r10 goes positive we are crossing a page boundary and
1734 * need to do a nibble.
1736 lea 15(%rdi), %r10 /* together with the next two lines: r10 = ((%rdi + 15) & 0xfff) - 0x1000 */
1737 and $0xfff, %r10 /* offset into 4K page */
1738 sub $0x1000, %r10 /* subtract 4K pagesize */
1739 movdqa %xmm3, %xmm4 /* xmm4 = copy of the previous %rdi chunk, consumed by the nibble path */
1741 .p2align 4
1742 LABEL(loop_ashr_15):
1743 add $16, %r10 /* account for the next 16-byte step */
1744 jg LABEL(nibble_ashr_15) /* positive: page boundary reached, check the leftover byte first */
1746 LABEL(gobble_ashr_15):
1747 movdqa (%rsi, %rcx), %xmm1 /* next %rsi chunk */
1748 movdqa (%rdi, %rcx), %xmm2 /* next %rdi chunk */
1749 movdqa %xmm2, %xmm4 /* keep an unshifted copy for the following step */
1751 psrldq $15, %xmm3 /* previous chunk: its top byte moved to the bottom */
1752 pslldq $1, %xmm2 /* current chunk: its low 15 bytes moved up by 1 */
1753 por %xmm3, %xmm2 /* xmm2 = 16 %rdi bytes lined up with xmm1 */
1755 pcmpeqb %xmm1, %xmm0 /* NUL bytes in the %rsi chunk */
1756 pcmpeqb %xmm2, %xmm1 /* matching bytes */
1757 psubb %xmm0, %xmm1 /* keep only bytes that match and are not NUL */
1758 pmovmskb %xmm1, %edx
1759 sub $0xffff, %edx /* zero only when all 16 bytes match and none is NUL */
1760 jnz LABEL(exit)
1762 #ifdef USE_AS_STRNCMP
1763 sub $16, %r11 /* 16 more bytes consumed */
1764 jbe LABEL(strcmp_exitz) /* count used up without a difference */
1765 #endif
1767 add $16, %rcx /* advance the load index */
1768 movdqa %xmm4, %xmm3 /* current chunk becomes the previous chunk */
/* Second copy of the same step (the loop body is unrolled twice). */
1770 add $16, %r10
1771 jg LABEL(nibble_ashr_15) /* cross page boundary */
1773 movdqa (%rsi, %rcx), %xmm1
1774 movdqa (%rdi, %rcx), %xmm2
1775 movdqa %xmm2, %xmm4
1777 psrldq $15, %xmm3
1778 pslldq $1, %xmm2
1779 por %xmm3, %xmm2
1781 pcmpeqb %xmm1, %xmm0
1782 pcmpeqb %xmm2, %xmm1
1783 psubb %xmm0, %xmm1
1784 pmovmskb %xmm1, %edx
1785 sub $0xffff, %edx
1786 jnz LABEL(exit)
1788 #ifdef USE_AS_STRNCMP
1789 sub $16, %r11
1790 jbe LABEL(strcmp_exitz)
1791 #endif
1793 add $16, %rcx
1794 movdqa %xmm4, %xmm3
1795 jmp LABEL(loop_ashr_15)
1797 .p2align 4
1798 LABEL(nibble_ashr_15):
/* Page-boundary path: test the single %rdi byte still held in xmm4 before the loop loads the next %rdi chunk. */
1799 psrldq $15, %xmm4 /* leave only the top byte of the previous %rdi chunk, moved to the bottom */
1800 movdqa (%rsi, %rcx), %xmm1
1801 pcmpeqb %xmm1, %xmm0
1802 pcmpeqb %xmm4, %xmm1
1803 psubb %xmm0, %xmm1
1804 pmovmskb %xmm1, %edx
1805 sub $0x0001, %edx /* zero only when that byte matches and is not NUL */
1806 jnz LABEL(exit)
1807 #ifdef USE_AS_STRNCMP
1808 cmp $1, %r11 /* at most 1 byte left to compare: nothing differs within the count */
1809 jbe LABEL(strcmp_exitz)
1810 #endif
1811 pxor %xmm0, %xmm0 /* reset the NUL reference before rejoining the loop */
1812 sub $0x1000, %r10 /* subtract 4K from %r10 */
1813 jmp LABEL(gobble_ashr_15)
1815 .p2align 4
1816 LABEL(exit):
/* Shared exit for the ashr_N loops: turn the load index in %rcx and the per-case byte count in %r9 into final pointers, then compute the result from the deciding byte. */
1817 lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */
1818 LABEL(less32bytes):
1819 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
1820 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
1821 test %r8d, %r8d /* NOTE(review): %r8d is written before the visible chunk; per the comment two lines down it marks swapped operands -- confirm */
1822 jz LABEL(ret)
1823 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
1825 .p2align 4
1826 LABEL(ret):
1827 LABEL(less16bytes):
1829 * Check to see if BSF is fast on this processor. If not, use a different
1830 * exit tail.
1832 testl $USE_BSF,.memops_method(%rip)
1833 jz LABEL(AMD_exit)
1834 bsf %rdx, %rdx /* find and store bit index in %rdx */
1836 #ifdef USE_AS_STRNCMP
1837 sub %rdx, %r11 /* strncmp: is the deciding byte still inside the remaining count? */
1838 jbe LABEL(strcmp_exitz) /* no: leave through the zero-result tail */
1839 #endif
1840 xor %ecx, %ecx /* clear %ecx */
1841 xor %eax, %eax /* clear %eax */
1843 movb (%rsi, %rdx), %cl /* byte of the second operand at the index found by bsf */
1844 movb (%rdi, %rdx), %al /* byte of the first operand at the same index */
1846 sub %ecx, %eax /* eax = first byte minus second byte */
/* NOTE(review): the embedded numbering skips 1847-1848 here and no return instruction is visible; presumably a ret was lost from this listing -- verify against the original file. */
1849 #ifdef USE_AS_STRNCMP
1850 LABEL(strcmp_exitz):
1851 xor %eax, %eax /* zero result */
1853 #endif
1856 * This exit tail does not use the bsf instruction.
1858 .p2align 4
1859 LABEL(AMD_exit):
/* Locate the lowest set bit of %dl with a chain of single-bit tests and branch to the ByteN tail for that bit. */
1860 test %dl, %dl
1861 jz LABEL(next_8_bytes) /* nothing set in the low 8 bits: continue with %dh */
1863 test $0x01, %dl
1864 jnz LABEL(Byte0)
1866 test $0x02, %dl
1867 jnz LABEL(Byte1)
1869 test $0x04, %dl
1870 jnz LABEL(Byte2)
1872 test $0x08, %dl
1873 jnz LABEL(Byte3)
1875 test $0x10, %dl
1876 jnz LABEL(Byte4)
1878 test $0x20, %dl
1879 jnz LABEL(Byte5)
1881 test $0x40, %dl
1882 jnz LABEL(Byte6)
1884 #ifdef USE_AS_STRNCMP
1885 sub $7, %r11 /* strncmp: byte 7 must still be inside the remaining count */
1886 jbe LABEL(strcmp_exitz)
1887 #endif
1888 movzx 7(%rsi), %ecx /* %dl is non-zero with bits 0-6 clear, so byte 7 decides */
1889 movzx 7(%rdi), %eax
1891 sub %ecx, %eax /* eax = byte at 7(%rdi) minus byte at 7(%rsi) */
/* NOTE(review): the embedded numbering skips 1892-1893 and no return instruction is visible; presumably a ret was lost from this listing -- verify against the original file. */
1894 .p2align 4
1895 LABEL(Byte0):
/* Tail for a deciding byte at offset 0: eax = byte at (%rdi) minus byte at (%rsi). */
/* NOTE(review): the remark below says the counter check is never needed for byte 0, yet the check appears as live code in this listing, and the embedded numbering skips 1896 and 1902; presumably the block was commented out in the original file -- verify. */
1897 * never need to handle byte 0 for strncmpy
1898 #ifdef USE_AS_STRNCMP
1899 sub $0, %r11
1900 jbe LABEL(strcmp_exitz)
1901 #endif
1903 movzx (%rsi), %ecx /* zero-extended byte of the second operand */
1904 movzx (%rdi), %eax /* zero-extended byte of the first operand */
1906 sub %ecx, %eax
/* NOTE(review): no return instruction is visible after this line (numbering skips 1907-1908); presumably a ret was lost from this listing -- verify. */
1909 .p2align 4
1910 LABEL(Byte1):
/* Tail for a deciding byte at offset 1: eax = byte at 1(%rdi) minus byte at 1(%rsi). */
1912 #ifdef USE_AS_STRNCMP
1913 sub $1, %r11 /* strncmp: byte 1 must still be inside the remaining count */
1914 jbe LABEL(strcmp_exitz)
1915 #endif
1916 movzx 1(%rsi), %ecx /* zero-extended byte of the second operand */
1917 movzx 1(%rdi), %eax /* zero-extended byte of the first operand */
1919 sub %ecx, %eax
/* NOTE(review): no return instruction is visible after this line (numbering skips 1920-1921); presumably a ret was lost from this listing -- verify. */
1922 .p2align 4
1923 LABEL(Byte2):
/* Tail for a deciding byte at offset 2: eax = byte at 2(%rdi) minus byte at 2(%rsi). */
1925 #ifdef USE_AS_STRNCMP
1926 sub $2, %r11 /* strncmp: byte 2 must still be inside the remaining count */
1927 jbe LABEL(strcmp_exitz)
1928 #endif
1929 movzx 2(%rsi), %ecx /* zero-extended byte of the second operand */
1930 movzx 2(%rdi), %eax /* zero-extended byte of the first operand */
1932 sub %ecx, %eax
/* NOTE(review): no return instruction is visible after this line (numbering skips 1933-1934); presumably a ret was lost from this listing -- verify. */
1935 .p2align 4
1936 LABEL(Byte3):
/* Tail for a deciding byte at offset 3: eax = byte at 3(%rdi) minus byte at 3(%rsi). */
1938 #ifdef USE_AS_STRNCMP
1939 sub $3, %r11 /* strncmp: byte 3 must still be inside the remaining count */
1940 jbe LABEL(strcmp_exitz)
1941 #endif
1942 movzx 3(%rsi), %ecx /* zero-extended byte of the second operand */
1943 movzx 3(%rdi), %eax /* zero-extended byte of the first operand */
1945 sub %ecx, %eax
/* NOTE(review): no return instruction is visible after this line (numbering skips 1946-1947); presumably a ret was lost from this listing -- verify. */
1948 .p2align 4
1949 LABEL(Byte4):
/* Tail for a deciding byte at offset 4: eax = byte at 4(%rdi) minus byte at 4(%rsi). */
1951 #ifdef USE_AS_STRNCMP
1952 sub $4, %r11 /* strncmp: byte 4 must still be inside the remaining count */
1953 jbe LABEL(strcmp_exitz)
1954 #endif
1955 movzx 4(%rsi), %ecx /* zero-extended byte of the second operand */
1956 movzx 4(%rdi), %eax /* zero-extended byte of the first operand */
1958 sub %ecx, %eax
/* NOTE(review): no return instruction is visible after this line (numbering skips 1959-1960); presumably a ret was lost from this listing -- verify. */
1961 .p2align 4
1962 LABEL(Byte5):
/* Tail for a deciding byte at offset 5: eax = byte at 5(%rdi) minus byte at 5(%rsi). */
1964 #ifdef USE_AS_STRNCMP
1965 sub $5, %r11 /* strncmp: byte 5 must still be inside the remaining count */
1966 jbe LABEL(strcmp_exitz)
1967 #endif
1968 movzx 5(%rsi), %ecx /* zero-extended byte of the second operand */
1969 movzx 5(%rdi), %eax /* zero-extended byte of the first operand */
1971 sub %ecx, %eax
/* NOTE(review): no return instruction is visible after this line (numbering skips 1972-1973); presumably a ret was lost from this listing -- verify. */
1974 .p2align 4
1975 LABEL(Byte6):
/* Tail for a deciding byte at offset 6: eax = byte at 6(%rdi) minus byte at 6(%rsi). */
1977 #ifdef USE_AS_STRNCMP
1978 sub $6, %r11 /* strncmp: byte 6 must still be inside the remaining count */
1979 jbe LABEL(strcmp_exitz)
1980 #endif
1981 movzx 6(%rsi), %ecx /* zero-extended byte of the second operand */
1982 movzx 6(%rdi), %eax /* zero-extended byte of the first operand */
1984 sub %ecx, %eax
/* NOTE(review): no return instruction is visible after this line (numbering skips 1985-1986); presumably a ret was lost from this listing -- verify. */
1987 .p2align 4
1988 LABEL(next_8_bytes):
/* Reached from AMD_exit when %dl is zero: step both pointers past the first 8 bytes and repeat the single-bit tests on %dh, reusing the ByteN tails. */
1989 add $8, %rdi /* skip the 8 bytes covered by %dl */
1990 add $8, %rsi
1991 #ifdef USE_AS_STRNCMP
1992 sub $8, %r11 /* strncmp: those 8 bytes count against the remaining length */
1993 jbe LABEL(strcmp_exitz)
1994 #endif
1995 test $0x01, %dh
1996 jnz LABEL(Byte0)
1998 test $0x02, %dh
1999 jnz LABEL(Byte1)
2001 test $0x04, %dh
2002 jnz LABEL(Byte2)
2004 test $0x08, %dh
2005 jnz LABEL(Byte3)
2007 test $0x10, %dh
2008 jnz LABEL(Byte4)
2010 test $0x20, %dh
2011 jnz LABEL(Byte5)
2013 test $0x40, %dh
2014 jnz LABEL(Byte6)
2016 #ifdef USE_AS_STRNCMP
2017 sub $7, %r11 /* strncmp: byte 7 must still be inside the remaining count */
2018 jbe LABEL(strcmp_exitz)
2019 #endif
2020 movzx 7(%rsi), %ecx /* none of bits 0-6 of %dh set: fall back to byte 7 */
2021 movzx 7(%rdi), %eax
2023 sub %ecx, %eax /* eax = byte at 7(%rdi) minus byte at 7(%rsi) */
/* NOTE(review): no return instruction is visible after this line (numbering skips 2024-2025); presumably a ret was lost from this listing -- verify. */
2026 .pushsection .rodata
2027 .p2align 4
2028 LABEL(unaligned_table):
/* Sixteen 32-bit entries, each the distance from this table to one ashr_N handler: ashr_0 first, then ashr_15 down to ashr_1. */
/* NOTE(review): the code that indexes this table lies outside the visible chunk -- check the index computation there before reordering entries. */
2029 .int LABEL(ashr_0) - LABEL(unaligned_table)
2030 .int LABEL(ashr_15) - LABEL(unaligned_table)
2031 .int LABEL(ashr_14) - LABEL(unaligned_table)
2032 .int LABEL(ashr_13) - LABEL(unaligned_table)
2033 .int LABEL(ashr_12) - LABEL(unaligned_table)
2034 .int LABEL(ashr_11) - LABEL(unaligned_table)
2035 .int LABEL(ashr_10) - LABEL(unaligned_table)
2036 .int LABEL(ashr_9) - LABEL(unaligned_table)
2037 .int LABEL(ashr_8) - LABEL(unaligned_table)
2038 .int LABEL(ashr_7) - LABEL(unaligned_table)
2039 .int LABEL(ashr_6) - LABEL(unaligned_table)
2040 .int LABEL(ashr_5) - LABEL(unaligned_table)
2041 .int LABEL(ashr_4) - LABEL(unaligned_table)
2042 .int LABEL(ashr_3) - LABEL(unaligned_table)
2043 .int LABEL(ashr_2) - LABEL(unaligned_table)
2044 .int LABEL(ashr_1) - LABEL(unaligned_table)
2045 .popsection
/* Close whichever entry point was opened by ENTRY() at the top of the file. NOTE(review): SET_SIZE is defined in an included header that is not visible here; presumably it records the symbol size -- confirm. */
2046 #ifdef USE_AS_STRNCMP
2047 SET_SIZE(strncmp)
2048 #else
2049 SET_SIZE(strcmp) /* (const char *, const char *) */
2050 #endif