4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2009, Intel Corporation
24 * All rights reserved.
28 * str[n]cpy - copy [n] chars from second operand into first operand
31 #include "proc64_id.h"
33 #define LABEL(s) .strcpy##s
38 jz LABEL
(strncpy_exitz
)
41 ENTRY
(strcpy
) /* (char *, const char *) */
45 and $
0xfffffffffffffff0, %rsi
/* force rsi 16 byte align */
47 mov
%rdi
, %rax
/* save destination address for return value */
50 pxor
%xmm0
, %xmm0
/* clear %xmm0 for null char checks */
51 pcmpeqb
(%rsi
), %xmm0
/* check 16 bytes in src for null */
53 shr
%cl
, %edx
/* adjust for offset from 16byte boundary */
54 test
%edx
, %edx
/* edx will be 0 if chars are non-null */
55 jnz LABEL
(less16bytes
) /* null char found in first 16 bytes examined */
58 * Check if the count is satisfied in first 16 bytes examined.
60 lea
-16(%r8, %rcx
), %r11
62 jle LABEL
(less16bytes
)
64 mov
%rcx
, %r9 /* rsi alignment offset */
68 jz LABEL
(ashr_0
) /* src and dest are both 16 byte aligned */
70 neg %r10 /* max src bytes remaining in current dqword */
72 pxor
%xmm0
, %xmm0
/* clear %xmm0, may be polluted by unaligned operation */
73 pcmpeqb
16(%rsi
), %xmm0
/* check next 16 bytes in src for a null */
76 jnz LABEL
(less32bytes
) /* null char found in first 32 bytes examined */
80 * If strncpy count <= 16 go to exit case
83 jbe LABEL
(less32bytes_strncpy_truncation
)
86 * At least 16 bytes to copy to destination string. Move them now.
87 * Don't worry about alignment.
91 mov
8(%rsi
, %r9), %rdx
95 * So far the destination rdi may be aligned by 16; re-calculate rsi and
96 * jump to the corresponding src/dest relative offset case.
97 * rcx is offset of rsi
98 * rdx is offset of rdi
100 and $
0xfffffffffffffff0, %rdi
/* force rdi 16 byte align */
101 mov
%rax
, %rdx
/* rax contains original rdi */
102 xor %rdi
, %rdx
/* same effect as "and $0xf, %rdx" */
103 #ifdef USE_AS_STRNCPY
105 * Will now do 16 byte aligned stores. Stores may overlap some bytes
106 * (i.e. store twice) if destination was unaligned. Compensate here.
108 add %rdx
, %r8 /* compensate for overlap */
111 add $
16, %rdi
/* next 16 bytes for dest */
114 * Align src to a 16-byte boundary. Could be up or down depending on
115 * whether src offset - dest offset > 0 (up) or
116 * src offset - dest offset < 0 (down).
118 sub %rdx
, %r9 /* src offset - dest offset */
120 lea
16(%r9, %rsi
), %rsi
121 mov
%esi
, %ecx
/* for new src offset */
122 and $
0xfffffffffffffff0, %rsi
/* force rsi 16 byte align */
124 and $
0xf, %ecx
/* new src offset is 0 if rsi/rdi have same alignment */
127 #ifdef USE_AS_STRNCPY
128 xor %edx
, %edx
/* In case unaligned_exit is taken */
131 * Jump to case corresponding to source/dest string relative offsets
132 * Index = (16 + (src offset - dest offset)) % 16
136 neg %r10 /* max src bytes remaining in current dqword */
137 lea LABEL
(unaligned_table
)(%rip
), %r11
138 movslq
(%r11, %rcx
, 4), %rcx
139 lea
(%r11, %rcx
), %rcx
143 * ashr_0 handles the following cases:
144 * src alignment offset = dest alignment offset
148 #ifdef USE_AS_STRNCPY
150 jbe LABEL
(strncpy_truncation_aligned
)
152 movdqa
(%rsi
), %xmm1
/* fetch 16 bytes from src string */
153 movdqa
%xmm1
, (%rdi
) /* store 16 bytes into dest string */
156 pcmpeqb
(%rsi
), %xmm0
/* check 16 bytes in src for a null */
159 test
%edx
, %edx
/* edx will be 0 if chars are non-null */
160 jnz LABEL
(aligned_16bytes
) /* exit tail */
163 #ifdef USE_AS_STRNCPY
165 jbe LABEL
(strncpy_truncation_aligned
)
167 movdqa
(%rsi
, %rcx
), %xmm1
168 movdqa
%xmm1
, (%rdi
, %rcx
)
170 pcmpeqb
(%rsi
, %rcx
), %xmm0
173 jnz LABEL
(aligned_exit
)
175 #ifdef USE_AS_STRNCPY
177 jbe LABEL
(strncpy_truncation_aligned
)
179 movdqa
(%rsi
, %rcx
), %xmm1
180 movdqa
%xmm1
, (%rdi
, %rcx
)
182 pcmpeqb
(%rsi
, %rcx
), %xmm0
185 jnz LABEL
(aligned_exit
)
187 #ifdef USE_AS_STRNCPY
189 jbe LABEL
(strncpy_truncation_aligned
)
191 movdqa
(%rsi
, %rcx
), %xmm1
192 movdqa
%xmm1
, (%rdi
, %rcx
)
195 pcmpeqb
(%rsi
, %rcx
), %xmm0
198 jnz LABEL
(aligned_exit
)
200 #ifdef USE_AS_STRNCPY
202 jbe LABEL
(strncpy_truncation_aligned
)
204 movdqa
(%rsi
, %rcx
), %xmm1
205 movdqa
%xmm1
, (%rdi
, %rcx
)
207 pcmpeqb
(%rsi
, %rcx
), %xmm0
210 jz LABEL
(ashr_0_loop
)
211 jmp LABEL
(aligned_exit
)
215 * ashr_15 handles the following cases:
216 * (16 + (src offset - dest offset)) % 16 = 15
218 * Based on the above operation, starting from (%r9 + rsi) to the left of this
219 * cache bank, there is no null byte.
223 xor %ecx
, %ecx
/* clear index */
224 #ifdef USE_AS_STRNCPY
226 jbe LABEL
(unaligned_exit
)
228 testl $USE_SSSE3
, .memops_method(%rip) /* use sse2 or ssse3? */
229 jz LABEL
(ashr_15_use_sse2
)
232 LABEL
(ashr_15_use_ssse3
):
233 movdqa
16(%rsi
, %rcx
), %xmm3
237 jnz LABEL
(unaligned_exit
)
238 #ifdef USE_AS_STRNCPY
240 jbe LABEL
(strncpy_truncation_unaligned
)
243 #palignr $15, (%rsi, %rcx), %xmm3
244 .byte 0x66, 0x0F, 0x3A ,0x0F
245 .byte 0x1c, 0x0e, 0x0f
247 movdqa
%xmm3
, (%rdi
, %rcx
)
250 #ifdef USE_AS_STRNCPY
252 jbe LABEL
(unaligned_exit
)
254 movdqa
16(%rsi
, %rcx
), %xmm3
258 jnz LABEL
(unaligned_exit
)
259 #ifdef USE_AS_STRNCPY
261 jbe LABEL
(strncpy_truncation_unaligned
)
264 #palignr $15, (%rsi, %rcx), %xmm3
265 .byte 0x66, 0x0F, 0x3A ,0x0F
266 .byte 0x1c, 0x0e, 0x0f
268 movdqa
%xmm3
, (%rdi
, %rcx
)
271 #ifdef USE_AS_STRNCPY
273 jbe LABEL
(unaligned_exit
)
275 jmp LABEL
(ashr_15_use_ssse3
)
278 LABEL
(ashr_15_use_sse2
):
279 pcmpeqb
16(%rsi
, %rcx
), %xmm0
282 jnz LABEL
(unaligned_exit
)
283 #ifdef USE_AS_STRNCPY
285 jbe LABEL
(strncpy_truncation_unaligned
)
288 movdqa
16(%rsi
, %rcx
), %xmm3
289 movdqa
(%rsi
, %rcx
), %xmm2
295 movdqa
%xmm3
, (%rdi
, %rcx
)
297 #ifdef USE_AS_STRNCPY
299 jbe LABEL
(unaligned_exit
)
301 pcmpeqb
16(%rsi
, %rcx
), %xmm0
304 jnz LABEL
(unaligned_exit
)
305 #ifdef USE_AS_STRNCPY
307 jbe LABEL
(strncpy_truncation_unaligned
)
310 movdqa
16(%rsi
, %rcx
), %xmm3
311 movdqa
(%rsi
, %rcx
), %xmm2
317 movdqa
%xmm3
, (%rdi
, %rcx
)
319 #ifdef USE_AS_STRNCPY
321 jbe LABEL
(unaligned_exit
)
323 jmp LABEL
(ashr_15_use_sse2
)
327 * ashr_14 handles the following cases:
328 * (16 + (src offset - dest offset)) % 16 = 14
330 * Based on the above operation, starting from (%r9 + rsi) to the left of this
331 * cache bank, there is no null byte.
335 xor %ecx
, %ecx
/* clear index */
336 #ifdef USE_AS_STRNCPY
338 jbe LABEL
(unaligned_exit
)
340 testl $USE_SSSE3
, .memops_method(%rip) /* use sse2 or ssse3? */
341 jz LABEL
(ashr_14_use_sse2
)
344 LABEL
(ashr_14_use_ssse3
):
345 movdqa
16(%rsi
, %rcx
), %xmm3
349 jnz LABEL
(unaligned_exit
)
350 #ifdef USE_AS_STRNCPY
352 jbe LABEL
(strncpy_truncation_unaligned
)
355 #palignr $14, (%rsi, %rcx), %xmm3
356 .byte 0x66, 0x0F, 0x3A ,0x0F
357 .byte 0x1c, 0x0e, 0x0e
359 movdqa
%xmm3
, (%rdi
, %rcx
)
362 #ifdef USE_AS_STRNCPY
364 jbe LABEL
(unaligned_exit
)
366 movdqa
16(%rsi
, %rcx
), %xmm3
370 jnz LABEL
(unaligned_exit
)
371 #ifdef USE_AS_STRNCPY
373 jbe LABEL
(strncpy_truncation_unaligned
)
376 #palignr $14, (%rsi, %rcx), %xmm3
377 .byte 0x66, 0x0F, 0x3A ,0x0F
378 .byte 0x1c, 0x0e, 0x0e
380 movdqa
%xmm3
, (%rdi
, %rcx
)
382 #ifdef USE_AS_STRNCPY
384 jbe LABEL
(unaligned_exit
)
386 jmp LABEL
(ashr_14_use_ssse3
)
389 LABEL
(ashr_14_use_sse2
):
390 pcmpeqb
16(%rsi
, %rcx
), %xmm0
393 jnz LABEL
(unaligned_exit
)
394 #ifdef USE_AS_STRNCPY
396 jbe LABEL
(strncpy_truncation_unaligned
)
399 movdqa
16(%rsi
, %rcx
), %xmm3
400 movdqa
(%rsi
, %rcx
), %xmm2
406 movdqa
%xmm3
, (%rdi
, %rcx
)
409 #ifdef USE_AS_STRNCPY
411 jbe LABEL
(unaligned_exit
)
413 pcmpeqb
16(%rsi
, %rcx
), %xmm0
416 jnz LABEL
(unaligned_exit
)
417 #ifdef USE_AS_STRNCPY
419 jbe LABEL
(strncpy_truncation_unaligned
)
422 movdqa
16(%rsi
, %rcx
), %xmm3
423 movdqa
(%rsi
, %rcx
), %xmm2
429 movdqa
%xmm3
, (%rdi
, %rcx
)
431 #ifdef USE_AS_STRNCPY
433 jbe LABEL
(unaligned_exit
)
435 jmp LABEL
(ashr_14_use_sse2
)
439 * ashr_13 handles the following cases:
440 * (16 + (src offset - dest offset)) % 16 = 13
442 * Based on the above operation, starting from (%r9 + rsi) to the left of this
443 * cache bank, there is no null byte.
447 xor %ecx
, %ecx
/* clear index */
448 #ifdef USE_AS_STRNCPY
450 jbe LABEL
(unaligned_exit
)
452 testl $USE_SSSE3
, .memops_method(%rip) /* use sse2 or ssse3? */
453 jz LABEL
(ashr_13_use_sse2
)
456 LABEL
(ashr_13_use_ssse3
):
457 movdqa
16(%rsi
, %rcx
), %xmm3
461 jnz LABEL
(unaligned_exit
)
462 #ifdef USE_AS_STRNCPY
464 jbe LABEL
(strncpy_truncation_unaligned
)
467 #palignr $13, (%rsi, %rcx), %xmm3
468 .byte 0x66, 0x0F, 0x3A ,0x0F
469 .byte 0x1c, 0x0e, 0x0d
471 movdqa
%xmm3
, (%rdi
, %rcx
)
474 #ifdef USE_AS_STRNCPY
476 jbe LABEL
(unaligned_exit
)
478 movdqa
16(%rsi
, %rcx
), %xmm3
482 jnz LABEL
(unaligned_exit
)
483 #ifdef USE_AS_STRNCPY
485 jbe LABEL
(strncpy_truncation_unaligned
)
488 #palignr $13, (%rsi, %rcx), %xmm3
489 .byte 0x66, 0x0F, 0x3A ,0x0F
490 .byte 0x1c, 0x0e, 0x0d
492 movdqa
%xmm3
, (%rdi
, %rcx
)
494 #ifdef USE_AS_STRNCPY
496 jbe LABEL
(unaligned_exit
)
498 jmp LABEL
(ashr_13_use_ssse3
)
501 LABEL
(ashr_13_use_sse2
):
502 pcmpeqb
16(%rsi
, %rcx
), %xmm0
505 jnz LABEL
(unaligned_exit
)
506 #ifdef USE_AS_STRNCPY
508 jbe LABEL
(strncpy_truncation_unaligned
)
511 movdqa
16(%rsi
, %rcx
), %xmm3
512 movdqa
(%rsi
, %rcx
), %xmm2
518 movdqa
%xmm3
, (%rdi
, %rcx
)
521 #ifdef USE_AS_STRNCPY
523 jbe LABEL
(unaligned_exit
)
525 pcmpeqb
16(%rsi
, %rcx
), %xmm0
528 jnz LABEL
(unaligned_exit
)
529 #ifdef USE_AS_STRNCPY
531 jbe LABEL
(strncpy_truncation_unaligned
)
534 movdqa
16(%rsi
, %rcx
), %xmm3
535 movdqa
(%rsi
, %rcx
), %xmm2
541 movdqa
%xmm3
, (%rdi
, %rcx
)
543 #ifdef USE_AS_STRNCPY
545 jbe LABEL
(unaligned_exit
)
547 jmp LABEL
(ashr_13_use_sse2
)
551 * ashr_12 handles the following cases:
552 * (16 + (src offset - dest offset)) % 16 = 12
554 * Based on the above operation, starting from (%r9 + rsi) to the left of this
555 * cache bank, there is no null byte.
559 xor %ecx
, %ecx
/* clear index */
560 #ifdef USE_AS_STRNCPY
562 jbe LABEL
(unaligned_exit
)
564 testl $USE_SSSE3
, .memops_method(%rip) /* use sse2 or ssse3? */
565 jz LABEL
(ashr_12_use_sse2
)
568 LABEL
(ashr_12_use_ssse3
):
569 movdqa
16(%rsi
, %rcx
), %xmm3
573 jnz LABEL
(unaligned_exit
)
574 #ifdef USE_AS_STRNCPY
576 jbe LABEL
(strncpy_truncation_unaligned
)
579 #palignr $12, (%rsi, %rcx), %xmm3
580 .byte 0x66, 0x0F, 0x3A ,0x0F
581 .byte 0x1c, 0x0e, 0x0c
583 movdqa
%xmm3
, (%rdi
, %rcx
)
586 #ifdef USE_AS_STRNCPY
588 jbe LABEL
(unaligned_exit
)
590 movdqa
16(%rsi
, %rcx
), %xmm3
594 jnz LABEL
(unaligned_exit
)
595 #ifdef USE_AS_STRNCPY
597 jbe LABEL
(strncpy_truncation_unaligned
)
600 #palignr $12, (%rsi, %rcx), %xmm3
601 .byte 0x66, 0x0F, 0x3A ,0x0F
602 .byte 0x1c, 0x0e, 0x0c
604 movdqa
%xmm3
, (%rdi
, %rcx
)
606 #ifdef USE_AS_STRNCPY
608 jbe LABEL
(unaligned_exit
)
610 jmp LABEL
(ashr_12_use_ssse3
)
613 LABEL
(ashr_12_use_sse2
):
614 pcmpeqb
16(%rsi
, %rcx
), %xmm0
617 jnz LABEL
(unaligned_exit
)
618 #ifdef USE_AS_STRNCPY
620 jbe LABEL
(strncpy_truncation_unaligned
)
623 movdqa
16(%rsi
, %rcx
), %xmm3
624 movdqa
(%rsi
, %rcx
), %xmm2
630 movdqa
%xmm3
, (%rdi
, %rcx
)
633 #ifdef USE_AS_STRNCPY
635 jbe LABEL
(unaligned_exit
)
637 pcmpeqb
16(%rsi
, %rcx
), %xmm0
640 jnz LABEL
(unaligned_exit
)
641 #ifdef USE_AS_STRNCPY
643 jbe LABEL
(strncpy_truncation_unaligned
)
646 movdqa
16(%rsi
, %rcx
), %xmm3
647 movdqa
(%rsi
, %rcx
), %xmm2
653 movdqa
%xmm3
, (%rdi
, %rcx
)
655 #ifdef USE_AS_STRNCPY
657 jbe LABEL
(unaligned_exit
)
659 jmp LABEL
(ashr_12_use_sse2
)
663 * ashr_11 handles the following cases:
664 * (16 + (src offset - dest offset)) % 16 = 11
666 * Based on the above operation, starting from (%r9 + rsi) to the left of this
667 * cache bank, there is no null byte.
671 xor %ecx
, %ecx
/* clear index */
672 #ifdef USE_AS_STRNCPY
674 jbe LABEL
(unaligned_exit
)
676 testl $USE_SSSE3
, .memops_method(%rip) /* use sse2 or ssse3? */
677 jz LABEL
(ashr_11_use_sse2
)
680 LABEL
(ashr_11_use_ssse3
):
681 movdqa
16(%rsi
, %rcx
), %xmm3
685 jnz LABEL
(unaligned_exit
)
686 #ifdef USE_AS_STRNCPY
688 jbe LABEL
(strncpy_truncation_unaligned
)
691 #palignr $11, (%rsi, %rcx), %xmm3
692 .byte 0x66, 0x0F, 0x3A ,0x0F
693 .byte 0x1c, 0x0e, 0x0b
695 movdqa
%xmm3
, (%rdi
, %rcx
)
698 #ifdef USE_AS_STRNCPY
700 jbe LABEL
(unaligned_exit
)
702 movdqa
16(%rsi
, %rcx
), %xmm3
706 jnz LABEL
(unaligned_exit
)
707 #ifdef USE_AS_STRNCPY
709 jbe LABEL
(strncpy_truncation_unaligned
)
712 #palignr $11, (%rsi, %rcx), %xmm3
713 .byte 0x66, 0x0F, 0x3A ,0x0F
714 .byte 0x1c, 0x0e, 0x0b
716 movdqa
%xmm3
, (%rdi
, %rcx
)
718 #ifdef USE_AS_STRNCPY
720 jbe LABEL
(unaligned_exit
)
722 jmp LABEL
(ashr_11_use_ssse3
)
725 LABEL
(ashr_11_use_sse2
):
726 pcmpeqb
16(%rsi
, %rcx
), %xmm0
729 jnz LABEL
(unaligned_exit
)
730 #ifdef USE_AS_STRNCPY
732 jbe LABEL
(strncpy_truncation_unaligned
)
735 movdqa
16(%rsi
, %rcx
), %xmm3
736 movdqa
(%rsi
, %rcx
), %xmm2
742 movdqa
%xmm3
, (%rdi
, %rcx
)
745 #ifdef USE_AS_STRNCPY
747 jbe LABEL
(unaligned_exit
)
749 pcmpeqb
16(%rsi
, %rcx
), %xmm0
752 jnz LABEL
(unaligned_exit
)
753 #ifdef USE_AS_STRNCPY
755 jbe LABEL
(strncpy_truncation_unaligned
)
758 movdqa
16(%rsi
, %rcx
), %xmm3
759 movdqa
(%rsi
, %rcx
), %xmm2
765 movdqa
%xmm3
, (%rdi
, %rcx
)
767 #ifdef USE_AS_STRNCPY
769 jbe LABEL
(unaligned_exit
)
771 jmp LABEL
(ashr_11_use_sse2
)
775 * ashr_10 handles the following cases:
776 * (16 + (src offset - dest offset)) % 16 = 10
778 * Based on the above operation, starting from (%r9 + rsi) to the left of this
779 * cache bank, there is no null byte.
783 xor %ecx
, %ecx
/* clear index */
784 #ifdef USE_AS_STRNCPY
786 jbe LABEL
(unaligned_exit
)
788 testl $USE_SSSE3
, .memops_method(%rip) /* use sse2 or ssse3? */
789 jz LABEL
(ashr_10_use_sse2
)
792 LABEL
(ashr_10_use_ssse3
):
793 movdqa
16(%rsi
, %rcx
), %xmm3
797 jnz LABEL
(unaligned_exit
)
798 #ifdef USE_AS_STRNCPY
800 jbe LABEL
(strncpy_truncation_unaligned
)
803 #palignr $10, (%rsi, %rcx), %xmm3
804 .byte 0x66, 0x0F, 0x3A ,0x0F
805 .byte 0x1c, 0x0e, 0x0a
807 movdqa
%xmm3
, (%rdi
, %rcx
)
810 #ifdef USE_AS_STRNCPY
812 jbe LABEL
(unaligned_exit
)
814 movdqa
16(%rsi
, %rcx
), %xmm3
818 jnz LABEL
(unaligned_exit
)
819 #ifdef USE_AS_STRNCPY
821 jbe LABEL
(strncpy_truncation_unaligned
)
824 #palignr $10, (%rsi, %rcx), %xmm3
825 .byte 0x66, 0x0F, 0x3A ,0x0F
826 .byte 0x1c, 0x0e, 0x0a
828 movdqa
%xmm3
, (%rdi
, %rcx
)
830 #ifdef USE_AS_STRNCPY
832 jbe LABEL
(unaligned_exit
)
834 jmp LABEL
(ashr_10_use_ssse3
)
837 LABEL
(ashr_10_use_sse2
):
838 pcmpeqb
16(%rsi
, %rcx
), %xmm0
841 jnz LABEL
(unaligned_exit
)
842 #ifdef USE_AS_STRNCPY
844 jbe LABEL
(strncpy_truncation_unaligned
)
847 movdqa
16(%rsi
, %rcx
), %xmm3
848 movdqa
(%rsi
, %rcx
), %xmm2
854 movdqa
%xmm3
, (%rdi
, %rcx
)
857 #ifdef USE_AS_STRNCPY
859 jbe LABEL
(unaligned_exit
)
861 pcmpeqb
16(%rsi
, %rcx
), %xmm0
864 jnz LABEL
(unaligned_exit
)
865 #ifdef USE_AS_STRNCPY
867 jbe LABEL
(strncpy_truncation_unaligned
)
870 movdqa
16(%rsi
, %rcx
), %xmm3
871 movdqa
(%rsi
, %rcx
), %xmm2
877 movdqa
%xmm3
, (%rdi
, %rcx
)
879 #ifdef USE_AS_STRNCPY
881 jbe LABEL
(unaligned_exit
)
883 jmp LABEL
(ashr_10_use_sse2
)
887 * ashr_9 handles the following cases:
888 * (16 + (src offset - dest offset)) % 16 = 9
890 * Based on the above operation, starting from (%r9 + rsi) to the left of this
891 * cache bank, there is no null byte.
895 xor %ecx
, %ecx
/* clear index */
896 #ifdef USE_AS_STRNCPY
898 jbe LABEL
(unaligned_exit
)
900 testl $USE_SSSE3
, .memops_method(%rip) /* use sse2 or ssse3? */
901 jz LABEL
(ashr_9_use_sse2
)
904 LABEL
(ashr_9_use_ssse3
):
905 movdqa
16(%rsi
, %rcx
), %xmm3
909 jnz LABEL
(unaligned_exit
)
910 #ifdef USE_AS_STRNCPY
912 jbe LABEL
(strncpy_truncation_unaligned
)
915 #palignr $9, (%rsi, %rcx), %xmm3
916 .byte 0x66, 0x0F, 0x3A ,0x0F
917 .byte 0x1c, 0x0e, 0x09
919 movdqa
%xmm3
, (%rdi
, %rcx
)
922 #ifdef USE_AS_STRNCPY
924 jbe LABEL
(unaligned_exit
)
926 movdqa
16(%rsi
, %rcx
), %xmm3
930 jnz LABEL
(unaligned_exit
)
931 #ifdef USE_AS_STRNCPY
933 jbe LABEL
(strncpy_truncation_unaligned
)
936 #palignr $9, (%rsi, %rcx), %xmm3
937 .byte 0x66, 0x0F, 0x3A ,0x0F
938 .byte 0x1c, 0x0e, 0x09
940 movdqa
%xmm3
, (%rdi
, %rcx
)
942 #ifdef USE_AS_STRNCPY
944 jbe LABEL
(unaligned_exit
)
946 jmp LABEL
(ashr_9_use_ssse3
)
949 LABEL
(ashr_9_use_sse2
):
950 pcmpeqb
16(%rsi
, %rcx
), %xmm0
953 jnz LABEL
(unaligned_exit
)
954 #ifdef USE_AS_STRNCPY
956 jbe LABEL
(strncpy_truncation_unaligned
)
959 movdqa
16(%rsi
, %rcx
), %xmm3
960 movdqa
(%rsi
, %rcx
), %xmm2
966 movdqa
%xmm3
, (%rdi
, %rcx
)
969 #ifdef USE_AS_STRNCPY
971 jbe LABEL
(unaligned_exit
)
973 pcmpeqb
16(%rsi
, %rcx
), %xmm0
976 jnz LABEL
(unaligned_exit
)
977 #ifdef USE_AS_STRNCPY
979 jbe LABEL
(strncpy_truncation_unaligned
)
982 movdqa
16(%rsi
, %rcx
), %xmm3
983 movdqa
(%rsi
, %rcx
), %xmm2
989 movdqa
%xmm3
, (%rdi
, %rcx
)
991 #ifdef USE_AS_STRNCPY
993 jbe LABEL
(unaligned_exit
)
995 jmp LABEL
(ashr_9_use_sse2
)
999 * ashr_8 handles the following cases:
1000 * (16 + (src offset - dest offset)) % 16 = 8
1002 * Based on the above operation, starting from (%r9 + rsi) to the left of this
1003 * cache bank, there is no null byte.
1007 xor %ecx
, %ecx
/* clear index */
1008 #ifdef USE_AS_STRNCPY
1010 jbe LABEL
(unaligned_exit
)
1012 testl $USE_SSSE3
, .memops_method(%rip) /* use sse2 or ssse3? */
1013 jz LABEL
(ashr_8_use_sse2
)
1016 LABEL
(ashr_8_use_ssse3
):
1017 movdqa
16(%rsi
, %rcx
), %xmm3
1018 pcmpeqb
%xmm3
, %xmm0
1019 pmovmskb
%xmm0
, %edx
1021 jnz LABEL
(unaligned_exit
)
1022 #ifdef USE_AS_STRNCPY
1024 jbe LABEL
(strncpy_truncation_unaligned
)
1027 #palignr $8, (%rsi, %rcx), %xmm3
1028 .byte 0x66, 0x0F, 0x3A ,0x0F
1029 .byte 0x1c, 0x0e, 0x08
1031 movdqa
%xmm3
, (%rdi
, %rcx
)
1034 #ifdef USE_AS_STRNCPY
1036 jbe LABEL
(unaligned_exit
)
1038 movdqa
16(%rsi
, %rcx
), %xmm3
1039 pcmpeqb
%xmm3
, %xmm0
1040 pmovmskb
%xmm0
, %edx
1042 jnz LABEL
(unaligned_exit
)
1043 #ifdef USE_AS_STRNCPY
1045 jbe LABEL
(strncpy_truncation_unaligned
)
1048 #palignr $8, (%rsi, %rcx), %xmm3
1049 .byte 0x66, 0x0F, 0x3A ,0x0F
1050 .byte 0x1c, 0x0e, 0x08
1052 movdqa
%xmm3
, (%rdi
, %rcx
)
1054 #ifdef USE_AS_STRNCPY
1056 jbe LABEL
(unaligned_exit
)
1058 jmp LABEL
(ashr_8_use_ssse3
)
1061 LABEL
(ashr_8_use_sse2
):
1062 pcmpeqb
16(%rsi
, %rcx
), %xmm0
1063 pmovmskb
%xmm0
, %edx
1065 jnz LABEL
(unaligned_exit
)
1066 #ifdef USE_AS_STRNCPY
1068 jbe LABEL
(strncpy_truncation_unaligned
)
1071 movdqa
16(%rsi
, %rcx
), %xmm3
1072 movdqa
(%rsi
, %rcx
), %xmm2
1078 movdqa
%xmm3
, (%rdi
, %rcx
)
1081 #ifdef USE_AS_STRNCPY
1083 jbe LABEL
(unaligned_exit
)
1085 pcmpeqb
16(%rsi
, %rcx
), %xmm0
1086 pmovmskb
%xmm0
, %edx
1088 jnz LABEL
(unaligned_exit
)
1089 #ifdef USE_AS_STRNCPY
1091 jbe LABEL
(strncpy_truncation_unaligned
)
1094 movdqa
16(%rsi
, %rcx
), %xmm3
1095 movdqa
(%rsi
, %rcx
), %xmm2
1101 movdqa
%xmm3
, (%rdi
, %rcx
)
1103 #ifdef USE_AS_STRNCPY
1105 jbe LABEL
(unaligned_exit
)
1107 jmp LABEL
(ashr_8_use_sse2
)
1111 * ashr_7 handles the following cases:
1112 * (16 + (src offset - dest offset)) % 16 = 7
1114 * Based on the above operation, starting from (%r9 + rsi) to the left of this
1115 * cache bank, there is no null byte.
1119 xor %ecx
, %ecx
/* clear index */
1120 #ifdef USE_AS_STRNCPY
1122 jbe LABEL
(unaligned_exit
)
1124 testl $USE_SSSE3
, .memops_method(%rip) /* use sse2 or ssse3? */
1125 jz LABEL
(ashr_7_use_sse2
)
1128 LABEL
(ashr_7_use_ssse3
):
1129 movdqa
16(%rsi
, %rcx
), %xmm3
1130 pcmpeqb
%xmm3
, %xmm0
1131 pmovmskb
%xmm0
, %edx
1133 jnz LABEL
(unaligned_exit
)
1134 #ifdef USE_AS_STRNCPY
1136 jbe LABEL
(strncpy_truncation_unaligned
)
1139 #palignr $7, (%rsi, %rcx), %xmm3
1140 .byte 0x66, 0x0F, 0x3A ,0x0F
1141 .byte 0x1c, 0x0e, 0x07
1143 movdqa
%xmm3
, (%rdi
, %rcx
)
1146 #ifdef USE_AS_STRNCPY
1148 jbe LABEL
(unaligned_exit
)
1150 movdqa
16(%rsi
, %rcx
), %xmm3
1151 pcmpeqb
%xmm3
, %xmm0
1152 pmovmskb
%xmm0
, %edx
1154 jnz LABEL
(unaligned_exit
)
1155 #ifdef USE_AS_STRNCPY
1157 jbe LABEL
(strncpy_truncation_unaligned
)
1160 #palignr $7, (%rsi, %rcx), %xmm3
1161 .byte 0x66, 0x0F, 0x3A ,0x0F
1162 .byte 0x1c, 0x0e, 0x07
1164 movdqa
%xmm3
, (%rdi
, %rcx
)
1166 #ifdef USE_AS_STRNCPY
1168 jbe LABEL
(unaligned_exit
)
1170 jmp LABEL
(ashr_7_use_ssse3
)
1173 LABEL
(ashr_7_use_sse2
):
1174 pcmpeqb
16(%rsi
, %rcx
), %xmm0
1175 pmovmskb
%xmm0
, %edx
1177 jnz LABEL
(unaligned_exit
)
1178 #ifdef USE_AS_STRNCPY
1180 jbe LABEL
(strncpy_truncation_unaligned
)
1183 movdqa
16(%rsi
, %rcx
), %xmm3
1184 movdqa
(%rsi
, %rcx
), %xmm2
1190 movdqa
%xmm3
, (%rdi
, %rcx
)
1193 #ifdef USE_AS_STRNCPY
1195 jbe LABEL
(unaligned_exit
)
1197 pcmpeqb
16(%rsi
, %rcx
), %xmm0
1198 pmovmskb
%xmm0
, %edx
1200 jnz LABEL
(unaligned_exit
)
1201 #ifdef USE_AS_STRNCPY
1203 jbe LABEL
(strncpy_truncation_unaligned
)
1206 movdqa
16(%rsi
, %rcx
), %xmm3
1207 movdqa
(%rsi
, %rcx
), %xmm2
1213 movdqa
%xmm3
, (%rdi
, %rcx
)
1215 #ifdef USE_AS_STRNCPY
1217 jbe LABEL
(unaligned_exit
)
1219 jmp LABEL
(ashr_7_use_sse2
)
1223 * ashr_6 handles the following cases:
1224 * (16 + (src offset - dest offset)) % 16 = 6
1226 * Based on the above operation, starting from (%r9 + rsi) to the left of this
1227 * cache bank, there is no null byte.
1231 xor %ecx
, %ecx
/* clear index */
1232 #ifdef USE_AS_STRNCPY
1234 jbe LABEL
(unaligned_exit
)
1236 testl $USE_SSSE3
, .memops_method(%rip) /* use sse2 or ssse3? */
1237 jz LABEL
(ashr_6_use_sse2
)
1240 LABEL
(ashr_6_use_ssse3
):
1241 movdqa
16(%rsi
, %rcx
), %xmm3
1242 pcmpeqb
%xmm3
, %xmm0
1243 pmovmskb
%xmm0
, %edx
1245 jnz LABEL
(unaligned_exit
)
1246 #ifdef USE_AS_STRNCPY
1248 jbe LABEL
(strncpy_truncation_unaligned
)
1251 #palignr $6, (%rsi, %rcx), %xmm3
1252 .byte 0x66, 0x0F, 0x3A ,0x0F
1253 .byte 0x1c, 0x0e, 0x06
1255 movdqa
%xmm3
, (%rdi
, %rcx
)
1258 #ifdef USE_AS_STRNCPY
1260 jbe LABEL
(unaligned_exit
)
1262 movdqa
16(%rsi
, %rcx
), %xmm3
1263 pcmpeqb
%xmm3
, %xmm0
1264 pmovmskb
%xmm0
, %edx
1266 jnz LABEL
(unaligned_exit
)
1267 #ifdef USE_AS_STRNCPY
1269 jbe LABEL
(strncpy_truncation_unaligned
)
1272 #palignr $6, (%rsi, %rcx), %xmm3
1273 .byte 0x66, 0x0F, 0x3A ,0x0F
1274 .byte 0x1c, 0x0e, 0x06
1276 movdqa
%xmm3
, (%rdi
, %rcx
)
1278 #ifdef USE_AS_STRNCPY
1280 jbe LABEL
(unaligned_exit
)
1282 jmp LABEL
(ashr_6_use_ssse3
)
1285 LABEL
(ashr_6_use_sse2
):
1286 pcmpeqb
16(%rsi
, %rcx
), %xmm0
1287 pmovmskb
%xmm0
, %edx
1289 jnz LABEL
(unaligned_exit
)
1290 #ifdef USE_AS_STRNCPY
1292 jbe LABEL
(strncpy_truncation_unaligned
)
1295 movdqa
16(%rsi
, %rcx
), %xmm3
1296 movdqa
(%rsi
, %rcx
), %xmm2
1302 movdqa
%xmm3
, (%rdi
, %rcx
)
1305 #ifdef USE_AS_STRNCPY
1307 jbe LABEL
(unaligned_exit
)
1309 pcmpeqb
16(%rsi
, %rcx
), %xmm0
1310 pmovmskb
%xmm0
, %edx
1312 jnz LABEL
(unaligned_exit
)
1313 #ifdef USE_AS_STRNCPY
1315 jbe LABEL
(strncpy_truncation_unaligned
)
1318 movdqa
16(%rsi
, %rcx
), %xmm3
1319 movdqa
(%rsi
, %rcx
), %xmm2
1325 movdqa
%xmm3
, (%rdi
, %rcx
)
1327 #ifdef USE_AS_STRNCPY
1329 jbe LABEL
(unaligned_exit
)
1331 jmp LABEL
(ashr_6_use_sse2
)
1335 * ashr_5 handles the following cases:
1336 * (16 + (src offset - dest offset)) % 16 = 5
1338 * Based on the above operation, starting from (%r9 + rsi) to the left of this
1339 * cache bank, there is no null byte.
1343 xor %ecx
, %ecx
/* clear index */
1344 #ifdef USE_AS_STRNCPY
1346 jbe LABEL
(unaligned_exit
)
1348 testl $USE_SSSE3
, .memops_method(%rip) /* use sse2 or ssse3? */
1349 jz LABEL
(ashr_5_use_sse2
)
1352 LABEL
(ashr_5_use_ssse3
):
1353 movdqa
16(%rsi
, %rcx
), %xmm3
1354 pcmpeqb
%xmm3
, %xmm0
1355 pmovmskb
%xmm0
, %edx
1357 jnz LABEL
(unaligned_exit
)
1358 #ifdef USE_AS_STRNCPY
1360 jbe LABEL
(strncpy_truncation_unaligned
)
1363 #palignr $5, (%rsi, %rcx), %xmm3
1364 .byte 0x66, 0x0F, 0x3A ,0x0F
1365 .byte 0x1c, 0x0e, 0x05
1367 movdqa
%xmm3
, (%rdi
, %rcx
)
1370 #ifdef USE_AS_STRNCPY
1372 jbe LABEL
(unaligned_exit
)
1374 movdqa
16(%rsi
, %rcx
), %xmm3
1375 pcmpeqb
%xmm3
, %xmm0
1376 pmovmskb
%xmm0
, %edx
1378 jnz LABEL
(unaligned_exit
)
1379 #ifdef USE_AS_STRNCPY
1381 jbe LABEL
(strncpy_truncation_unaligned
)
1384 #palignr $5, (%rsi, %rcx), %xmm3
1385 .byte 0x66, 0x0F, 0x3A ,0x0F
1386 .byte 0x1c, 0x0e, 0x05
1388 movdqa
%xmm3
, (%rdi
, %rcx
)
1390 #ifdef USE_AS_STRNCPY
1392 jbe LABEL
(unaligned_exit
)
1394 jmp LABEL
(ashr_5_use_ssse3
)
1397 LABEL
(ashr_5_use_sse2
):
1398 pcmpeqb
16(%rsi
, %rcx
), %xmm0
1399 pmovmskb
%xmm0
, %edx
1401 jnz LABEL
(unaligned_exit
)
1402 #ifdef USE_AS_STRNCPY
1404 jbe LABEL
(strncpy_truncation_unaligned
)
1407 movdqa
16(%rsi
, %rcx
), %xmm3
1408 movdqa
(%rsi
, %rcx
), %xmm2
1414 movdqa
%xmm3
, (%rdi
, %rcx
)
1417 #ifdef USE_AS_STRNCPY
1419 jbe LABEL
(unaligned_exit
)
1421 pcmpeqb
16(%rsi
, %rcx
), %xmm0
1422 pmovmskb
%xmm0
, %edx
1424 jnz LABEL
(unaligned_exit
)
1425 #ifdef USE_AS_STRNCPY
1427 jbe LABEL
(strncpy_truncation_unaligned
)
1430 movdqa
16(%rsi
, %rcx
), %xmm3
1431 movdqa
(%rsi
, %rcx
), %xmm2
1437 movdqa
%xmm3
, (%rdi
, %rcx
)
1439 #ifdef USE_AS_STRNCPY
1441 jbe LABEL
(unaligned_exit
)
1443 jmp LABEL
(ashr_5_use_sse2
)
1447 * ashr_4 handles the following cases:
1448 * (16 + (src offset - dest offset)) % 16 = 4
1450 * Based on the above operation, starting from (%r9 + rsi) to the left of this
1451 * cache bank, there is no null byte.
1455 xor %ecx
, %ecx
/* clear index */
1456 #ifdef USE_AS_STRNCPY
1458 jbe LABEL
(unaligned_exit
)
1460 testl $USE_SSSE3
, .memops_method(%rip) /* use sse2 or ssse3? */
1461 jz LABEL
(ashr_4_use_sse2
)
1464 LABEL
(ashr_4_use_ssse3
):
1465 movdqa
16(%rsi
, %rcx
), %xmm3
1466 pcmpeqb
%xmm3
, %xmm0
1467 pmovmskb
%xmm0
, %edx
1469 jnz LABEL
(unaligned_exit
)
1470 #ifdef USE_AS_STRNCPY
1472 jbe LABEL
(strncpy_truncation_unaligned
)
1475 #palignr $4, (%rsi, %rcx), %xmm3
1476 .byte 0x66, 0x0F, 0x3A ,0x0F
1477 .byte 0x1c, 0x0e, 0x04
1479 movdqa
%xmm3
, (%rdi
, %rcx
)
1482 #ifdef USE_AS_STRNCPY
1484 jbe LABEL
(unaligned_exit
)
1486 movdqa
16(%rsi
, %rcx
), %xmm3
1487 pcmpeqb
%xmm3
, %xmm0
1488 pmovmskb
%xmm0
, %edx
1490 jnz LABEL
(unaligned_exit
)
1491 #ifdef USE_AS_STRNCPY
1493 jbe LABEL
(strncpy_truncation_unaligned
)
1496 #palignr $4, (%rsi, %rcx), %xmm3
1497 .byte 0x66, 0x0F, 0x3A ,0x0F
1498 .byte 0x1c, 0x0e, 0x04
1500 movdqa
%xmm3
, (%rdi
, %rcx
)
1502 #ifdef USE_AS_STRNCPY
1504 jbe LABEL
(unaligned_exit
)
1506 jmp LABEL
(ashr_4_use_ssse3
)
1509 LABEL
(ashr_4_use_sse2
):
1510 pcmpeqb
16(%rsi
, %rcx
), %xmm0
1511 pmovmskb
%xmm0
, %edx
1513 jnz LABEL
(unaligned_exit
)
1514 #ifdef USE_AS_STRNCPY
1516 jbe LABEL
(strncpy_truncation_unaligned
)
1519 movdqa
16(%rsi
, %rcx
), %xmm3
1520 movdqa
(%rsi
, %rcx
), %xmm2
1526 movdqa
%xmm3
, (%rdi
, %rcx
)
1529 #ifdef USE_AS_STRNCPY
1531 jbe LABEL
(unaligned_exit
)
1533 pcmpeqb
16(%rsi
, %rcx
), %xmm0
1534 pmovmskb
%xmm0
, %edx
1536 jnz LABEL
(unaligned_exit
)
1537 #ifdef USE_AS_STRNCPY
1539 jbe LABEL
(strncpy_truncation_unaligned
)
1542 movdqa
16(%rsi
, %rcx
), %xmm3
1543 movdqa
(%rsi
, %rcx
), %xmm2
1549 movdqa
%xmm3
, (%rdi
, %rcx
)
1551 #ifdef USE_AS_STRNCPY
1553 jbe LABEL
(unaligned_exit
)
1555 jmp LABEL
(ashr_4_use_sse2
)
1559 * ashr_3 handles the following cases:
1560 * (16 + (src offset - dest offset)) % 16 = 3
1562 * Based on the above operation, starting from (%r9 + rsi) to the left of this
1563 * cache bank, there is no null byte.
1567 xor %ecx
, %ecx
/* clear index */
1568 #ifdef USE_AS_STRNCPY
1570 jbe LABEL
(unaligned_exit
)
1572 testl $USE_SSSE3
, .memops_method(%rip) /* use sse2 or ssse3? */
1573 jz LABEL
(ashr_3_use_sse2
)
1576 LABEL
(ashr_3_use_ssse3
):
1577 movdqa
16(%rsi
, %rcx
), %xmm3
1578 pcmpeqb
%xmm3
, %xmm0
1579 pmovmskb
%xmm0
, %edx
1581 jnz LABEL
(unaligned_exit
)
1582 #ifdef USE_AS_STRNCPY
1584 jbe LABEL
(strncpy_truncation_unaligned
)
1587 #palignr $3, (%rsi, %rcx), %xmm3
1588 .byte 0x66, 0x0F, 0x3A ,0x0F
1589 .byte 0x1c, 0x0e, 0x03
1591 movdqa
%xmm3
, (%rdi
, %rcx
)
1594 #ifdef USE_AS_STRNCPY
1596 jbe LABEL
(unaligned_exit
)
1598 movdqa
16(%rsi
, %rcx
), %xmm3
1599 pcmpeqb
%xmm3
, %xmm0
1600 pmovmskb
%xmm0
, %edx
1602 jnz LABEL
(unaligned_exit
)
1603 #ifdef USE_AS_STRNCPY
1605 jbe LABEL
(strncpy_truncation_unaligned
)
1608 #palignr $3, (%rsi, %rcx), %xmm3
1609 .byte 0x66, 0x0F, 0x3A ,0x0F
1610 .byte 0x1c, 0x0e, 0x03
1612 movdqa
%xmm3
, (%rdi
, %rcx
)
1614 #ifdef USE_AS_STRNCPY
1616 jbe LABEL
(unaligned_exit
)
1618 jmp LABEL
(ashr_3_use_ssse3
)
1621 LABEL
(ashr_3_use_sse2
):
1622 pcmpeqb
16(%rsi
, %rcx
), %xmm0
1623 pmovmskb
%xmm0
, %edx
1625 jnz LABEL
(unaligned_exit
)
1626 #ifdef USE_AS_STRNCPY
1628 jbe LABEL
(strncpy_truncation_unaligned
)
1631 movdqa
16(%rsi
, %rcx
), %xmm3
1632 movdqa
(%rsi
, %rcx
), %xmm2
1638 movdqa
%xmm3
, (%rdi
, %rcx
)
1641 #ifdef USE_AS_STRNCPY
1643 jbe LABEL
(unaligned_exit
)
1645 pcmpeqb
16(%rsi
, %rcx
), %xmm0
1646 pmovmskb
%xmm0
, %edx
1648 jnz LABEL
(unaligned_exit
)
1649 #ifdef USE_AS_STRNCPY
1651 jbe LABEL
(strncpy_truncation_unaligned
)
1654 movdqa
16(%rsi
, %rcx
), %xmm3
1655 movdqa
(%rsi
, %rcx
), %xmm2
1661 movdqa
%xmm3
, (%rdi
, %rcx
)
1663 #ifdef USE_AS_STRNCPY
1665 jbe LABEL
(unaligned_exit
)
1667 jmp LABEL
(ashr_3_use_sse2
)
1671 * ashr_2 handles the following cases:
1672 * (16 + (src offset - dest offset)) % 16 = 2
1674 * Based on the above operation, starting from (%r9 + rsi) to the left of this
1675 * cache bank, there is no null byte.
1679 xor %ecx
, %ecx
/* clear index */
1680 #ifdef USE_AS_STRNCPY
1682 jbe LABEL
(unaligned_exit
)
1684 testl $USE_SSSE3
, .memops_method(%rip) /* use sse2 or ssse3? */
1685 jz LABEL
(ashr_2_use_sse2
)
1688 LABEL
(ashr_2_use_ssse3
):
1689 movdqa
16(%rsi
, %rcx
), %xmm3
1690 pcmpeqb
%xmm3
, %xmm0
1691 pmovmskb
%xmm0
, %edx
1693 jnz LABEL
(unaligned_exit
)
1694 #ifdef USE_AS_STRNCPY
1696 jbe LABEL
(strncpy_truncation_unaligned
)
1699 #palignr $2, (%rsi, %rcx), %xmm3
1700 .byte 0x66, 0x0F, 0x3A ,0x0F
1701 .byte 0x1c, 0x0e, 0x02
1703 movdqa
%xmm3
, (%rdi
, %rcx
)
1706 #ifdef USE_AS_STRNCPY
1708 jbe LABEL
(unaligned_exit
)
1710 movdqa
16(%rsi
, %rcx
), %xmm3
1711 pcmpeqb
%xmm3
, %xmm0
1712 pmovmskb
%xmm0
, %edx
1714 jnz LABEL
(unaligned_exit
)
1715 #ifdef USE_AS_STRNCPY
1717 jbe LABEL
(strncpy_truncation_unaligned
)
1720 #palignr $2, (%rsi, %rcx), %xmm3
1721 .byte 0x66, 0x0F, 0x3A ,0x0F
1722 .byte 0x1c, 0x0e, 0x02
1724 movdqa
%xmm3
, (%rdi
, %rcx
)
1726 #ifdef USE_AS_STRNCPY
1728 jbe LABEL
(unaligned_exit
)
1730 jmp LABEL
(ashr_2_use_ssse3
)
1733 LABEL
(ashr_2_use_sse2
):
1734 pcmpeqb
16(%rsi
, %rcx
), %xmm0
1735 pmovmskb
%xmm0
, %edx
1737 jnz LABEL
(unaligned_exit
)
1738 #ifdef USE_AS_STRNCPY
1740 jbe LABEL
(strncpy_truncation_unaligned
)
1743 movdqa
16(%rsi
, %rcx
), %xmm3
1744 movdqa
(%rsi
, %rcx
), %xmm2
1750 movdqa
%xmm3
, (%rdi
, %rcx
)
1753 #ifdef USE_AS_STRNCPY
1755 jbe LABEL
(unaligned_exit
)
1757 pcmpeqb
16(%rsi
, %rcx
), %xmm0
1758 pmovmskb
%xmm0
, %edx
1760 jnz LABEL
(unaligned_exit
)
1761 #ifdef USE_AS_STRNCPY
1763 jbe LABEL
(strncpy_truncation_unaligned
)
1766 movdqa
16(%rsi
, %rcx
), %xmm3
1767 movdqa
(%rsi
, %rcx
), %xmm2
1773 movdqa
%xmm3
, (%rdi
, %rcx
)
1775 #ifdef USE_AS_STRNCPY
1777 jbe LABEL
(unaligned_exit
)
1779 jmp LABEL
(ashr_2_use_sse2
)
1783 * ashr_1 handles the following cases:
1784 * (16 + (src offset - dest offset)) % 16 = 1
1786 * Based on the above operation, starting from (%r9 + rsi) to the left of this
1787 * cache bank, there is no null byte.
1791 xor %ecx
, %ecx
/* clear index */
1792 #ifdef USE_AS_STRNCPY
1794 jbe LABEL
(unaligned_exit
)
1796 testl $USE_SSSE3
, .memops_method(%rip) /* use sse2 or ssse3? */
1797 jz LABEL
(ashr_1_use_sse2
)
1800 LABEL
(ashr_1_use_ssse3
):
1801 movdqa
16(%rsi
, %rcx
), %xmm3
1802 pcmpeqb
%xmm3
, %xmm0
1803 pmovmskb
%xmm0
, %edx
1805 jnz LABEL
(unaligned_exit
)
1806 #ifdef USE_AS_STRNCPY
1808 jbe LABEL
(strncpy_truncation_unaligned
)
1811 #palignr $1, (%rsi, %rcx), %xmm3
1812 .byte 0x66, 0x0F, 0x3A ,0x0F
1813 .byte 0x1c, 0x0e, 0x01
1815 movdqa
%xmm3
, (%rdi
, %rcx
)
1818 #ifdef USE_AS_STRNCPY
1820 jbe LABEL
(unaligned_exit
)
1822 movdqa
16(%rsi
, %rcx
), %xmm3
1823 pcmpeqb
%xmm3
, %xmm0
1824 pmovmskb
%xmm0
, %edx
1826 jnz LABEL
(unaligned_exit
)
1827 #ifdef USE_AS_STRNCPY
1829 jbe LABEL
(strncpy_truncation_unaligned
)
1831 #palignr $1, (%rsi, %rcx), %xmm3
1832 .byte 0x66, 0x0F, 0x3A ,0x0F
1833 .byte 0x1c, 0x0e, 0x01
1835 movdqa
%xmm3
, (%rdi
, %rcx
)
1837 #ifdef USE_AS_STRNCPY
1839 jbe LABEL
(unaligned_exit
)
1841 jmp LABEL
(ashr_1_use_ssse3
)
1844 LABEL
(ashr_1_use_sse2
):
1845 pcmpeqb
16(%rsi
, %rcx
), %xmm0
1846 pmovmskb
%xmm0
, %edx
1848 jnz LABEL
(unaligned_exit
)
1849 #ifdef USE_AS_STRNCPY
1851 jbe LABEL
(strncpy_truncation_unaligned
)
1853 movdqa
16(%rsi
, %rcx
), %xmm3
1854 movdqa
(%rsi
, %rcx
), %xmm2
1860 movdqa
%xmm3
, (%rdi
, %rcx
)
1863 #ifdef USE_AS_STRNCPY
1865 jbe LABEL
(unaligned_exit
)
1867 pcmpeqb
16(%rsi
, %rcx
), %xmm0
1868 pmovmskb
%xmm0
, %edx
1870 jnz LABEL
(unaligned_exit
)
1871 #ifdef USE_AS_STRNCPY
1873 jbe LABEL
(strncpy_truncation_unaligned
)
1876 movdqa
16(%rsi
, %rcx
), %xmm3
1877 movdqa
(%rsi
, %rcx
), %xmm2
1883 movdqa
%xmm3
, (%rdi
, %rcx
)
1885 #ifdef USE_AS_STRNCPY
1887 jbe LABEL
(unaligned_exit
)
1889 jmp LABEL
(ashr_1_use_sse2
)
1894 * Up to 32 bytes are copied in the case of strcpy.
1899 LABEL
(unaligned_exit
):
1900 add %r9, %rsi
/* r9 holds offset of rsi */
1903 shl
%cl
, %edx
/* after shl, calculate the exact number to be filled */
1906 LABEL
(aligned_exit
):
1907 add %rcx
, %rdi
/* locate exact address for rdi */
1909 add %rcx
, %rsi
/* locate exact address for rsi */
1910 LABEL
(aligned_16bytes
):
1911 #ifdef USE_AS_STRNCPY
1913 * Null found in the 16 bytes checked. Set the bit in the bitmask corresponding
1914 * to the strncpy count argument. We will copy up to the null (inclusive)
1915 * or the count, whichever comes first.
1921 ja LABEL
(strncpy_tail
)
1923 LABEL
(strncpy_tail
):
1926 * Check to see if BSF is fast on this processor. If not, use a
1927 * different exit tail.
1929 testb $USE_BSF
, .memops_method(%rip)
1931 bsf
%rdx
, %rcx
/* Find byte with null char */
1932 lea LABEL
(tail_table
)(%rip
), %r11
1933 movslq
(%r11, %rcx
, 4), %rcx
1934 lea
(%r11, %rcx
), %rcx
1937 #ifdef USE_AS_STRNCPY
1939 * Count reached before null found.
1942 LABEL
(less32bytes_strncpy_truncation
):
1944 LABEL
(strncpy_truncation_unaligned
):
1945 add %r9, %rsi
/* next src char to copy */
1946 LABEL
(strncpy_truncation_aligned
):
1949 add $
16, %r8 /* compensation */
1951 lea LABEL
(tail_table
)(%rip
), %r11
1952 movslq
(%r11, %rcx
, 4), %rcx
1953 lea
(%r11, %rcx
), %rcx
1957 LABEL
(strncpy_exitz
):
1965 jz LABEL
(AMD_exit_more_8
)
1982 LABEL
(tail_7
): /* 8 bytes */
1985 #ifdef USE_AS_STRNCPY
1988 jnz LABEL
(strncpy_fill_tail
)
1992 #ifdef USE_AS_STRNCPY
1994 * Null terminated src string shorter than count. Fill the rest of the
1995 * destination with null chars.
1998 LABEL
(strncpy_fill_tail
):
2005 jz LABEL
(strncpy_fill_less_8
)
2008 LABEL
(strncpy_fill_less_8
):
2011 jz LABEL
(strncpy_fill_return
)
2012 LABEL
(strncpy_fill_less_7
):
2014 mov
%al
, (%rdi
, %rcx
)
2015 jnz LABEL
(strncpy_fill_less_7
)
2016 LABEL
(strncpy_fill_return
):
2022 LABEL
(tail_0
): /* 1 byte */
2025 #ifdef USE_AS_STRNCPY
2028 jnz LABEL
(strncpy_fill_tail
)
2033 LABEL
(tail_1
): /* 2 bytes */
2036 #ifdef USE_AS_STRNCPY
2039 jnz LABEL
(strncpy_fill_tail
)
2044 LABEL
(tail_2
): /* 3 bytes */
2049 #ifdef USE_AS_STRNCPY
2052 jnz LABEL
(strncpy_fill_tail
)
2057 LABEL
(tail_3
): /* 4 bytes */
2060 #ifdef USE_AS_STRNCPY
2063 jnz LABEL
(strncpy_fill_tail
)
2068 LABEL
(tail_4
): /* 5 bytes */
2073 #ifdef USE_AS_STRNCPY
2076 jnz LABEL
(strncpy_fill_tail
)
2081 LABEL
(tail_5
): /* 6 bytes */
2086 #ifdef USE_AS_STRNCPY
2089 jnz LABEL
(strncpy_fill_tail
)
2094 LABEL
(tail_6
): /* 7 bytes */
2099 #ifdef USE_AS_STRNCPY
2102 jnz LABEL
(strncpy_fill_tail
)
2107 LABEL
(tail_8
): /* 9 bytes */
2112 #ifdef USE_AS_STRNCPY
2115 jnz LABEL
(strncpy_fill_tail
)
2120 LABEL
(AMD_exit_more_8
):
2122 jz LABEL
(AMD_exit_more_16
)
2139 LABEL
(tail_15
): /* 16 bytes */
2144 #ifdef USE_AS_STRNCPY
2147 jnz LABEL
(strncpy_fill_tail
)
2152 LABEL
(tail_9
): /* 10 bytes */
2157 #ifdef USE_AS_STRNCPY
2160 jnz LABEL
(strncpy_fill_tail
)
2165 LABEL
(tail_10
): /* 11 bytes */
2170 #ifdef USE_AS_STRNCPY
2173 jnz LABEL
(strncpy_fill_tail
)
2178 LABEL
(tail_11
): /* 12 bytes */
2183 #ifdef USE_AS_STRNCPY
2186 jnz LABEL
(strncpy_fill_tail
)
2191 LABEL
(tail_12
): /* 13 bytes */
2196 #ifdef USE_AS_STRNCPY
2199 jnz LABEL
(strncpy_fill_tail
)
2204 LABEL
(tail_13
): /* 14 bytes */
2209 #ifdef USE_AS_STRNCPY
2212 jnz LABEL
(strncpy_fill_tail
)
2217 LABEL
(tail_14
): /* 15 bytes */
2222 #ifdef USE_AS_STRNCPY
2225 jnz LABEL
(strncpy_fill_tail
)
2230 LABEL
(AMD_exit_more_16
):
2233 jz LABEL
(AMD_exit_more_24
)
2250 LABEL
(tail_23
): /* 24 bytes */
2257 #ifdef USE_AS_STRNCPY
2260 jnz LABEL
(strncpy_fill_tail
)
2265 LABEL
(tail_16
): /* 17 bytes */
2272 #ifdef USE_AS_STRNCPY
2275 jnz LABEL
(strncpy_fill_tail
)
2280 LABEL
(tail_17
): /* 18 bytes */
2287 #ifdef USE_AS_STRNCPY
2290 jnz LABEL
(strncpy_fill_tail
)
2295 LABEL
(tail_18
): /* 19 bytes */
2302 #ifdef USE_AS_STRNCPY
2305 jnz LABEL
(strncpy_fill_tail
)
2310 LABEL
(tail_19
): /* 20 bytes */
2317 #ifdef USE_AS_STRNCPY
2320 jnz LABEL
(strncpy_fill_tail
)
2325 LABEL
(tail_20
): /* 21 bytes */
2332 #ifdef USE_AS_STRNCPY
2335 jnz LABEL
(strncpy_fill_tail
)
2340 LABEL
(tail_21
): /* 22 bytes */
2347 #ifdef USE_AS_STRNCPY
2350 jnz LABEL
(strncpy_fill_tail
)
2355 LABEL
(tail_22
): /* 23 bytes */
2362 #ifdef USE_AS_STRNCPY
2365 jnz LABEL
(strncpy_fill_tail
)
2370 LABEL
(AMD_exit_more_24
):
2387 LABEL
(tail_31
): /* 32 bytes */
2396 #ifdef USE_AS_STRNCPY
2399 jnz LABEL
(strncpy_fill_tail
)
2404 LABEL
(tail_24
): /* 25 bytes */
2413 #ifdef USE_AS_STRNCPY
2416 jnz LABEL
(strncpy_fill_tail
)
2421 LABEL
(tail_25
): /* 26 bytes */
2430 #ifdef USE_AS_STRNCPY
2433 jnz LABEL
(strncpy_fill_tail
)
2438 LABEL
(tail_26
): /* 27 bytes */
2447 #ifdef USE_AS_STRNCPY
2450 jnz LABEL
(strncpy_fill_tail
)
2455 LABEL
(tail_27
): /* 28 bytes */
2464 #ifdef USE_AS_STRNCPY
2467 jnz LABEL
(strncpy_fill_tail
)
2472 LABEL
(tail_28
): /* 29 bytes */
2481 #ifdef USE_AS_STRNCPY
2484 jnz LABEL
(strncpy_fill_tail
)
2489 LABEL
(tail_29
): /* 30 bytes */
2498 #ifdef USE_AS_STRNCPY
2501 jnz LABEL
(strncpy_fill_tail
)
2506 LABEL
(tail_30
): /* 31 bytes */
2515 #ifdef USE_AS_STRNCPY
2518 jnz LABEL
(strncpy_fill_tail
)
2522 .pushsection .rodata
2525 .int LABEL(tail_0) - LABEL(tail_table) /* 1 byte */
2526 .int LABEL(tail_1) - LABEL(tail_table)
2527 .int LABEL(tail_2) - LABEL(tail_table)
2528 .int LABEL(tail_3) - LABEL(tail_table)
2529 .int LABEL(tail_4) - LABEL(tail_table)
2530 .int LABEL(tail_5) - LABEL(tail_table)
2531 .int LABEL(tail_6) - LABEL(tail_table)
2532 .int LABEL(tail_7) - LABEL(tail_table)
2533 .int LABEL(tail_8) - LABEL(tail_table)
2534 .int LABEL(tail_9) - LABEL(tail_table)
2535 .int LABEL(tail_10) - LABEL(tail_table)
2536 .int LABEL(tail_11) - LABEL(tail_table)
2537 .int LABEL(tail_12) - LABEL(tail_table)
2538 .int LABEL(tail_13) - LABEL(tail_table)
2539 .int LABEL(tail_14) - LABEL(tail_table)
2540 .int LABEL(tail_15) - LABEL(tail_table)
2541 .int LABEL(tail_16) - LABEL(tail_table)
2542 .int LABEL(tail_17) - LABEL(tail_table)
2543 .int LABEL(tail_18) - LABEL(tail_table)
2544 .int LABEL(tail_19) - LABEL(tail_table)
2545 .int LABEL(tail_20) - LABEL(tail_table)
2546 .int LABEL(tail_21) - LABEL(tail_table)
2547 .int LABEL(tail_22) - LABEL(tail_table)
2548 .int LABEL(tail_23) - LABEL(tail_table)
2549 .int LABEL(tail_24) - LABEL(tail_table)
2550 .int LABEL(tail_25) - LABEL(tail_table)
2551 .int LABEL(tail_26) - LABEL(tail_table)
2552 .int LABEL(tail_27) - LABEL(tail_table)
2553 .int LABEL(tail_28) - LABEL(tail_table)
2554 .int LABEL(tail_29) - LABEL(tail_table)
2555 .int LABEL(tail_30) - LABEL(tail_table)
2556 .int LABEL(tail_31) - LABEL(tail_table) /* 32 bytes */
/*
 * unaligned_table: 16 .int entries, one per ashr_N handler (N = 0..15).
 * Each entry is the offset of LABEL(ashr_N) relative to the start of this
 * table, not an absolute address.
 * NOTE(review): the ashr_1 comment gives its case as
 * (16 + (src offset - dest offset)) % 16 = 1, so entry N presumably serves
 * a relative misalignment of N; the dispatch code is outside this view --
 * confirm.
 */
2559 LABEL
(unaligned_table
):
2560 .int LABEL(ashr_0) - LABEL(unaligned_table)
2561 .int LABEL(ashr_1) - LABEL(unaligned_table)
2562 .int LABEL(ashr_2) - LABEL(unaligned_table)
2563 .int LABEL(ashr_3) - LABEL(unaligned_table)
2564 .int LABEL(ashr_4) - LABEL(unaligned_table)
2565 .int LABEL(ashr_5) - LABEL(unaligned_table)
2566 .int LABEL(ashr_6) - LABEL(unaligned_table)
2567 .int LABEL(ashr_7) - LABEL(unaligned_table)
2568 .int LABEL(ashr_8) - LABEL(unaligned_table)
2569 .int LABEL(ashr_9) - LABEL(unaligned_table)
2570 .int LABEL(ashr_10) - LABEL(unaligned_table)
2571 .int LABEL(ashr_11) - LABEL(unaligned_table)
2572 .int LABEL(ashr_12) - LABEL(unaligned_table)
2573 .int LABEL(ashr_13) - LABEL(unaligned_table)
2574 .int LABEL(ashr_14) - LABEL(unaligned_table)
2575 .int LABEL(ashr_15) - LABEL(unaligned_table)
2578 #ifdef USE_AS_STRNCPY
2581 SET_SIZE
(strcpy
) /* (char *, const char *) */