1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
22 /*
23 * Copyright (c) 2009, Intel Corporation
24 * All rights reserved.
25 */
27 /*
28 * str[n]cpy - copy [n] chars from second operand into first operand
29 */
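/*
 * High-level flow of the implementation below (summary annotation):
 *  - Probe the aligned 16-byte block containing src with pcmpeqb/pmovmskb
 *    and shift the mask by the source offset, so a null (or, for strncpy,
 *    an exhausted count) in the first bytes is handled immediately.
 *  - Otherwise copy 16 bytes, realign the destination, and dispatch through
 *    unaligned_table on (16 + (src offset - dest offset)) % 16 to one of
 *    the ashr_0..ashr_15 loops, which use palignr (SSSE3) or
 *    psrldq/pslldq/por (SSE2) to build 16-byte aligned stores.
 *  - Exit paths locate the terminating byte with bsf or a bit-test chain
 *    and jump through tail_table to copy the last 1..32 bytes; strncpy also
 *    zero-fills any remaining destination bytes.
 */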
30 #include "SYS.h"
31 #include "proc64_id.h"
33 #define LABEL(s) .strcpy##s
35 #ifdef USE_AS_STRNCPY
36 ENTRY(strncpy)
37 test %edx, %edx
38 jz LABEL(strncpy_exitz)
39 mov %rdx, %r8
40 #else
41 ENTRY(strcpy) /* (char *, const char *) */
42 xor %rdx, %rdx
43 #endif
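/*
 * Register roles used throughout (annotation):
 *  %rdi - current destination pointer     %rsi - aligned source base
 *  %rax - original destination, the return value
 *  %rcx - source offset within its 16-byte block, later the copy index
 *  %rdx - null-byte bitmask from pmovmskb (strncpy count on entry)
 *  %r8  - remaining strncpy count         %r9  - saved source offset
 *  %r10 - source bytes left in the current 16-byte block
 */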
44 mov %esi, %ecx
45 and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */
46 and $0xf, %rcx
47 mov %rdi, %rax /* save destination address for return value */
50 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
51 pcmpeqb (%rsi), %xmm0 /* check 16 bytes in src for null */
52 pmovmskb %xmm0, %edx
53 shr %cl, %edx /* adjust for offset from 16byte boundary */
54 test %edx, %edx /* edx will be 0 if chars are non-null */
55 jnz LABEL(less16bytes) /* null char found in first 16 bytes examined */
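/*
 * Example of the probe above (annotation): if src == 0x1009, %rsi is
 * rounded down to 0x1000 and %cl == 9.  pmovmskb yields one bit per byte
 * of the aligned block, and "shr %cl, %edx" drops the 9 bits for bytes
 * that precede src, so bit 0 of %edx now corresponds to src[0].
 */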
56 #ifdef USE_AS_STRNCPY
57 /*
58 * Check if the count is satisfied in first 16 bytes examined.
59 */
60 lea -16(%r8, %rcx), %r11
61 cmp $0, %r11
62 jle LABEL(less16bytes)
63 #endif
64 mov %rcx, %r9 /* rsi alignment offset */
65 or %edi, %ecx
66 and $0xf, %ecx
67 lea -16(%r9), %r10
68 jz LABEL(ashr_0) /* src and dest are both 16 byte aligned */
70 neg %r10 /* max src bytes remaining in current dqword */
72 pxor %xmm0, %xmm0 /* clear %xmm0, may be polluted by unaligned operation */
73 pcmpeqb 16(%rsi), %xmm0 /* check next 16 bytes in src for a null */
74 pmovmskb %xmm0, %edx
75 test %edx, %edx
76 jnz LABEL(less32bytes) /* null char found in first 32 bytes examined */
78 #ifdef USE_AS_STRNCPY
79 /*
80 * If strncpy count <= 16 go to exit case
81 */
82 sub $16, %r8
83 jbe LABEL(less32bytes_strncpy_truncation)
84 #endif
85 /*
86 * At least 16 bytes to copy to destination string. Move them now.
87 * Don't worry about alignment.
88 */
89 mov (%rsi, %r9), %rdx
90 mov %rdx, (%rdi)
91 mov 8(%rsi, %r9), %rdx
92 mov %rdx, 8(%rdi)
94 /*
95 * so far destination rdi may be aligned by 16, re-calculate rsi and
96 * jump to corresponding src/dest relative offset case.
97 * rcx is offset of rsi
98 * rdx is offset of rdi
99 */
100 and $0xfffffffffffffff0, %rdi /* force rdi 16 byte align */
101 mov %rax, %rdx /* rax contains original rdi */
102 xor %rdi, %rdx /* same effect as "and $0xf, %rdx" */
103 #ifdef USE_AS_STRNCPY
104 /*
105 * Will now do 16 byte aligned stores. Stores may overlap some bytes
106 * (ie store twice) if destination was unaligned. Compensate here.
107 */
108 add %rdx, %r8 /* compensate for overlap */
109 #endif
111 add $16, %rdi /* next 16 bytes for dest */
113 /*
114 * align src to 16-byte boundary. Could be up or down depending on
115 * whether src offset - dest offset > 0 (up) or
116 * src offset - dest offset < 0 (down).
117 */
118 sub %rdx, %r9 /* src offset - dest offset */
120 lea 16(%r9, %rsi), %rsi
121 mov %esi, %ecx /* for new src offset */
122 and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */
124 and $0xf, %ecx /* new src offset is 0 if rsi/rdi have same alignment */
125 jz LABEL(ashr_0)
127 #ifdef USE_AS_STRNCPY
128 xor %edx, %edx /* In case unaligned_exit is taken */
129 #endif
130 /*
131 * Jump to case corresponding to source/dest string relative offsets
132 * Index = (16 + (src offset - dest offset)) % 16
133 */
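/*
 * Example (annotation): with a source offset of 3 and a destination offset
 * of 7 the index is (16 + 3 - 7) % 16 == 12, so control transfers to
 * ashr_12, which extracts each aligned store from a pair of adjacent
 * source blocks shifted right by 12 bytes.
 */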
134 lea -16(%rcx), %r10
135 mov %rcx, %r9
136 neg %r10 /* max src bytes remaining in current dqword */
137 lea LABEL(unaligned_table)(%rip), %r11
138 movslq (%r11, %rcx, 4), %rcx
139 lea (%r11, %rcx), %rcx
140 jmp *%rcx
142 /*
143 * ashr_0 handles the following cases:
144 * src alignment offset = dest alignment offset
145 */
146 .p2align 5
147 LABEL(ashr_0):
148 #ifdef USE_AS_STRNCPY
149 sub $16, %r8
150 jbe LABEL(strncpy_truncation_aligned)
151 #endif
152 movdqa (%rsi), %xmm1 /* fetch 16 bytes from src string */
153 movdqa %xmm1, (%rdi) /* store 16 bytes into dest string */
154 add $16, %rsi
155 add $16, %rdi
156 pcmpeqb (%rsi), %xmm0 /* check 16 bytes in src for a null */
157 pmovmskb %xmm0, %edx
159 test %edx, %edx /* edx will be 0 if chars are non-null */
160 jnz LABEL(aligned_16bytes) /* exit tail */
162 LABEL(ashr_0_loop):
163 #ifdef USE_AS_STRNCPY
164 sub $16, %r8
165 jbe LABEL(strncpy_truncation_aligned)
166 #endif
167 movdqa (%rsi, %rcx), %xmm1
168 movdqa %xmm1, (%rdi, %rcx)
169 add $16, %rcx
170 pcmpeqb (%rsi, %rcx), %xmm0
171 pmovmskb %xmm0, %edx
172 test %edx, %edx
173 jnz LABEL(aligned_exit)
175 #ifdef USE_AS_STRNCPY
176 sub $16, %r8
177 jbe LABEL(strncpy_truncation_aligned)
178 #endif
179 movdqa (%rsi, %rcx), %xmm1
180 movdqa %xmm1, (%rdi, %rcx)
181 add $16, %rcx
182 pcmpeqb (%rsi, %rcx), %xmm0
183 pmovmskb %xmm0, %edx
184 test %edx, %edx
185 jnz LABEL(aligned_exit)
187 #ifdef USE_AS_STRNCPY
188 sub $16, %r8
189 jbe LABEL(strncpy_truncation_aligned)
190 #endif
191 movdqa (%rsi, %rcx), %xmm1
192 movdqa %xmm1, (%rdi, %rcx)
194 add $16, %rcx
195 pcmpeqb (%rsi, %rcx), %xmm0
196 pmovmskb %xmm0, %edx
197 test %edx, %edx
198 jnz LABEL(aligned_exit)
200 #ifdef USE_AS_STRNCPY
201 sub $16, %r8
202 jbe LABEL(strncpy_truncation_aligned)
203 #endif
204 movdqa (%rsi, %rcx), %xmm1
205 movdqa %xmm1, (%rdi, %rcx)
206 add $16, %rcx
207 pcmpeqb (%rsi, %rcx), %xmm0
208 pmovmskb %xmm0, %edx
209 test %edx, %edx
210 jz LABEL(ashr_0_loop)
211 jmp LABEL(aligned_exit)
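/*
 * The ashr_0 loop above is unrolled four times: each copy of the body
 * moves one aligned 16-byte block with movdqa, advances the shared index
 * in %rcx, and rechecks the next block for a null (and, for strncpy, for
 * count exhaustion) before continuing.
 */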
214 /*
215 * ashr_15 handles the following cases:
216 * (16 + (src offset - dest offset)) % 16 = 15
217 *
218 * Based on above operation, start from (%r9 + rsi) to the left of this cache
219 * bank, there is no null byte.
220 */
221 .p2align 4
222 LABEL(ashr_15):
223 xor %ecx, %ecx /* clear index */
224 #ifdef USE_AS_STRNCPY
225 cmp %r10, %r8
226 jbe LABEL(unaligned_exit)
227 #endif
228 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
229 jz LABEL(ashr_15_use_sse2)
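/*
 * .memops_method (see the proc64_id.h include) is a flag word describing
 * the preferred memory-operation variants for the running CPU, presumably
 * filled in when the library identifies the processor at startup.  The
 * USE_SSSE3 bit selects the palignr loop below; otherwise the SSE2 loop
 * emulates the same byte rotation with psrldq/pslldq/por.
 */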
231 .p2align 4
232 LABEL(ashr_15_use_ssse3):
233 movdqa 16(%rsi, %rcx), %xmm3
234 pcmpeqb %xmm3, %xmm0
235 pmovmskb %xmm0, %edx
236 test %edx, %edx
237 jnz LABEL(unaligned_exit)
238 #ifdef USE_AS_STRNCPY
239 sub $16, %r8
240 jbe LABEL(strncpy_truncation_unaligned)
241 #endif
243 #palignr $15, (%rsi, %rcx), %xmm3
244 .byte 0x66, 0x0F, 0x3A ,0x0F
245 .byte 0x1c, 0x0e, 0x0f
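/*
 * The two .byte lines above are the hand-assembled encoding of the
 * "palignr $15, (%rsi, %rcx), %xmm3" shown in the comment, presumably kept
 * as raw bytes for assemblers without SSSE3 mnemonics.  palignr forms the
 * 32-byte concatenation of %xmm3 (high) and the source block (low) and
 * writes the 16 bytes starting at byte 15 into %xmm3, ready for the
 * aligned store below.
 */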
247 movdqa %xmm3, (%rdi, %rcx)
248 add $16, %rcx
250 #ifdef USE_AS_STRNCPY
251 cmp %r10, %r8
252 jbe LABEL(unaligned_exit)
253 #endif
254 movdqa 16(%rsi, %rcx), %xmm3
255 pcmpeqb %xmm3, %xmm0
256 pmovmskb %xmm0, %edx
257 test %edx, %edx
258 jnz LABEL(unaligned_exit)
259 #ifdef USE_AS_STRNCPY
260 sub $16, %r8
261 jbe LABEL(strncpy_truncation_unaligned)
262 #endif
264 #palignr $15, (%rsi, %rcx), %xmm3
265 .byte 0x66, 0x0F, 0x3A ,0x0F
266 .byte 0x1c, 0x0e, 0x0f
268 movdqa %xmm3, (%rdi, %rcx)
269 add $16, %rcx
271 #ifdef USE_AS_STRNCPY
272 cmp %r10, %r8
273 jbe LABEL(unaligned_exit)
274 #endif
275 jmp LABEL(ashr_15_use_ssse3)
277 .p2align 4
278 LABEL(ashr_15_use_sse2):
279 pcmpeqb 16(%rsi, %rcx), %xmm0
280 pmovmskb %xmm0, %edx
281 test %edx, %edx
282 jnz LABEL(unaligned_exit)
283 #ifdef USE_AS_STRNCPY
284 sub $16, %r8
285 jbe LABEL(strncpy_truncation_unaligned)
286 #endif
288 movdqa 16(%rsi, %rcx), %xmm3
289 movdqa (%rsi, %rcx), %xmm2
291 psrldq $15, %xmm2
292 pslldq $1, %xmm3
293 por %xmm2, %xmm3
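/*
 * SSE2 equivalent of palignr $15 (annotation): psrldq leaves only the top
 * byte of the lower source block in the low lane, pslldq frees the bottom
 * byte of the upper block, and por merges them into the same 16-byte value
 * the SSSE3 path produces.
 */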
295 movdqa %xmm3, (%rdi, %rcx)
296 add $16, %rcx
297 #ifdef USE_AS_STRNCPY
298 cmp %r10, %r8
299 jbe LABEL(unaligned_exit)
300 #endif
301 pcmpeqb 16(%rsi, %rcx), %xmm0
302 pmovmskb %xmm0, %edx
303 test %edx, %edx
304 jnz LABEL(unaligned_exit)
305 #ifdef USE_AS_STRNCPY
306 sub $16, %r8
307 jbe LABEL(strncpy_truncation_unaligned)
308 #endif
310 movdqa 16(%rsi, %rcx), %xmm3
311 movdqa (%rsi, %rcx), %xmm2
313 psrldq $15, %xmm2
314 pslldq $1, %xmm3
315 por %xmm2, %xmm3
317 movdqa %xmm3, (%rdi, %rcx)
318 add $16, %rcx
319 #ifdef USE_AS_STRNCPY
320 cmp %r10, %r8
321 jbe LABEL(unaligned_exit)
322 #endif
323 jmp LABEL(ashr_15_use_sse2)
326 /*
327 * ashr_14 handles the following cases:
328 * (16 + (src offset - dest offset)) % 16 = 14
329 *
330 * Based on above operation, start from (%r9 + rsi) to the left of this cache
331 * bank, there is no null byte.
332 */
333 .p2align 4
334 LABEL(ashr_14):
335 xor %ecx, %ecx /* clear index */
336 #ifdef USE_AS_STRNCPY
337 cmp %r10, %r8
338 jbe LABEL(unaligned_exit)
339 #endif
340 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
341 jz LABEL(ashr_14_use_sse2)
343 .p2align 4
344 LABEL(ashr_14_use_ssse3):
345 movdqa 16(%rsi, %rcx), %xmm3
346 pcmpeqb %xmm3, %xmm0
347 pmovmskb %xmm0, %edx
348 test %edx, %edx
349 jnz LABEL(unaligned_exit)
350 #ifdef USE_AS_STRNCPY
351 sub $16, %r8
352 jbe LABEL(strncpy_truncation_unaligned)
353 #endif
355 #palignr $14, (%rsi, %rcx), %xmm3
356 .byte 0x66, 0x0F, 0x3A ,0x0F
357 .byte 0x1c, 0x0e, 0x0e
359 movdqa %xmm3, (%rdi, %rcx)
360 add $16, %rcx
362 #ifdef USE_AS_STRNCPY
363 cmp %r10, %r8
364 jbe LABEL(unaligned_exit)
365 #endif
366 movdqa 16(%rsi, %rcx), %xmm3
367 pcmpeqb %xmm3, %xmm0
368 pmovmskb %xmm0, %edx
369 test %edx, %edx
370 jnz LABEL(unaligned_exit)
371 #ifdef USE_AS_STRNCPY
372 sub $16, %r8
373 jbe LABEL(strncpy_truncation_unaligned)
374 #endif
376 #palignr $14, (%rsi, %rcx), %xmm3
377 .byte 0x66, 0x0F, 0x3A ,0x0F
378 .byte 0x1c, 0x0e, 0x0e
380 movdqa %xmm3, (%rdi, %rcx)
381 add $16, %rcx
382 #ifdef USE_AS_STRNCPY
383 cmp %r10, %r8
384 jbe LABEL(unaligned_exit)
385 #endif
386 jmp LABEL(ashr_14_use_ssse3)
388 .p2align 4
389 LABEL(ashr_14_use_sse2):
390 pcmpeqb 16(%rsi, %rcx), %xmm0
391 pmovmskb %xmm0, %edx
392 test %edx, %edx
393 jnz LABEL(unaligned_exit)
394 #ifdef USE_AS_STRNCPY
395 sub $16, %r8
396 jbe LABEL(strncpy_truncation_unaligned)
397 #endif
399 movdqa 16(%rsi, %rcx), %xmm3
400 movdqa (%rsi, %rcx), %xmm2
402 psrldq $14, %xmm2
403 pslldq $2, %xmm3
404 por %xmm2, %xmm3
406 movdqa %xmm3, (%rdi, %rcx)
407 add $16, %rcx
409 #ifdef USE_AS_STRNCPY
410 cmp %r10, %r8
411 jbe LABEL(unaligned_exit)
412 #endif
413 pcmpeqb 16(%rsi, %rcx), %xmm0
414 pmovmskb %xmm0, %edx
415 test %edx, %edx
416 jnz LABEL(unaligned_exit)
417 #ifdef USE_AS_STRNCPY
418 sub $16, %r8
419 jbe LABEL(strncpy_truncation_unaligned)
420 #endif
422 movdqa 16(%rsi, %rcx), %xmm3
423 movdqa (%rsi, %rcx), %xmm2
425 psrldq $14, %xmm2
426 pslldq $2, %xmm3
427 por %xmm2, %xmm3
429 movdqa %xmm3, (%rdi, %rcx)
430 add $16, %rcx
431 #ifdef USE_AS_STRNCPY
432 cmp %r10, %r8
433 jbe LABEL(unaligned_exit)
434 #endif
435 jmp LABEL(ashr_14_use_sse2)
438 /*
439 * ashr_13 handles the following cases:
440 * (16 + (src offset - dest offset)) % 16 = 13
441 *
442 * Based on above operation, start from (%r9 + rsi) to the left of this cache
443 * bank, there is no null byte.
444 */
445 .p2align 4
446 LABEL(ashr_13):
447 xor %ecx, %ecx /* clear index */
448 #ifdef USE_AS_STRNCPY
449 cmp %r10, %r8
450 jbe LABEL(unaligned_exit)
451 #endif
452 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
453 jz LABEL(ashr_13_use_sse2)
455 .p2align 4
456 LABEL(ashr_13_use_ssse3):
457 movdqa 16(%rsi, %rcx), %xmm3
458 pcmpeqb %xmm3, %xmm0
459 pmovmskb %xmm0, %edx
460 test %edx, %edx
461 jnz LABEL(unaligned_exit)
462 #ifdef USE_AS_STRNCPY
463 sub $16, %r8
464 jbe LABEL(strncpy_truncation_unaligned)
465 #endif
467 #palignr $13, (%rsi, %rcx), %xmm3
468 .byte 0x66, 0x0F, 0x3A ,0x0F
469 .byte 0x1c, 0x0e, 0x0d
471 movdqa %xmm3, (%rdi, %rcx)
472 add $16, %rcx
474 #ifdef USE_AS_STRNCPY
475 cmp %r10, %r8
476 jbe LABEL(unaligned_exit)
477 #endif
478 movdqa 16(%rsi, %rcx), %xmm3
479 pcmpeqb %xmm3, %xmm0
480 pmovmskb %xmm0, %edx
481 test %edx, %edx
482 jnz LABEL(unaligned_exit)
483 #ifdef USE_AS_STRNCPY
484 sub $16, %r8
485 jbe LABEL(strncpy_truncation_unaligned)
486 #endif
488 #palignr $13, (%rsi, %rcx), %xmm3
489 .byte 0x66, 0x0F, 0x3A ,0x0F
490 .byte 0x1c, 0x0e, 0x0d
492 movdqa %xmm3, (%rdi, %rcx)
493 add $16, %rcx
494 #ifdef USE_AS_STRNCPY
495 cmp %r10, %r8
496 jbe LABEL(unaligned_exit)
497 #endif
498 jmp LABEL(ashr_13_use_ssse3)
500 .p2align 4
501 LABEL(ashr_13_use_sse2):
502 pcmpeqb 16(%rsi, %rcx), %xmm0
503 pmovmskb %xmm0, %edx
504 test %edx, %edx
505 jnz LABEL(unaligned_exit)
506 #ifdef USE_AS_STRNCPY
507 sub $16, %r8
508 jbe LABEL(strncpy_truncation_unaligned)
509 #endif
511 movdqa 16(%rsi, %rcx), %xmm3
512 movdqa (%rsi, %rcx), %xmm2
514 psrldq $13, %xmm2
515 pslldq $3, %xmm3
516 por %xmm2, %xmm3
518 movdqa %xmm3, (%rdi, %rcx)
519 add $16, %rcx
521 #ifdef USE_AS_STRNCPY
522 cmp %r10, %r8
523 jbe LABEL(unaligned_exit)
524 #endif
525 pcmpeqb 16(%rsi, %rcx), %xmm0
526 pmovmskb %xmm0, %edx
527 test %edx, %edx
528 jnz LABEL(unaligned_exit)
529 #ifdef USE_AS_STRNCPY
530 sub $16, %r8
531 jbe LABEL(strncpy_truncation_unaligned)
532 #endif
534 movdqa 16(%rsi, %rcx), %xmm3
535 movdqa (%rsi, %rcx), %xmm2
537 psrldq $13, %xmm2
538 pslldq $3, %xmm3
539 por %xmm2, %xmm3
541 movdqa %xmm3, (%rdi, %rcx)
542 add $16, %rcx
543 #ifdef USE_AS_STRNCPY
544 cmp %r10, %r8
545 jbe LABEL(unaligned_exit)
546 #endif
547 jmp LABEL(ashr_13_use_sse2)
550 /*
551 * ashr_12 handles the following cases:
552 * (16 + (src offset - dest offset)) % 16 = 12
553 *
554 * Based on above operation, start from (%r9 + rsi) to the left of this cache
555 * bank, there is no null byte.
556 */
557 .p2align 4
558 LABEL(ashr_12):
559 xor %ecx, %ecx /* clear index */
560 #ifdef USE_AS_STRNCPY
561 cmp %r10, %r8
562 jbe LABEL(unaligned_exit)
563 #endif
564 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
565 jz LABEL(ashr_12_use_sse2)
567 .p2align 4
568 LABEL(ashr_12_use_ssse3):
569 movdqa 16(%rsi, %rcx), %xmm3
570 pcmpeqb %xmm3, %xmm0
571 pmovmskb %xmm0, %edx
572 test %edx, %edx
573 jnz LABEL(unaligned_exit)
574 #ifdef USE_AS_STRNCPY
575 sub $16, %r8
576 jbe LABEL(strncpy_truncation_unaligned)
577 #endif
579 #palignr $12, (%rsi, %rcx), %xmm3
580 .byte 0x66, 0x0F, 0x3A ,0x0F
581 .byte 0x1c, 0x0e, 0x0c
583 movdqa %xmm3, (%rdi, %rcx)
584 add $16, %rcx
586 #ifdef USE_AS_STRNCPY
587 cmp %r10, %r8
588 jbe LABEL(unaligned_exit)
589 #endif
590 movdqa 16(%rsi, %rcx), %xmm3
591 pcmpeqb %xmm3, %xmm0
592 pmovmskb %xmm0, %edx
593 test %edx, %edx
594 jnz LABEL(unaligned_exit)
595 #ifdef USE_AS_STRNCPY
596 sub $16, %r8
597 jbe LABEL(strncpy_truncation_unaligned)
598 #endif
600 #palignr $12, (%rsi, %rcx), %xmm3
601 .byte 0x66, 0x0F, 0x3A ,0x0F
602 .byte 0x1c, 0x0e, 0x0c
604 movdqa %xmm3, (%rdi, %rcx)
605 add $16, %rcx
606 #ifdef USE_AS_STRNCPY
607 cmp %r10, %r8
608 jbe LABEL(unaligned_exit)
609 #endif
610 jmp LABEL(ashr_12_use_ssse3)
612 .p2align 4
613 LABEL(ashr_12_use_sse2):
614 pcmpeqb 16(%rsi, %rcx), %xmm0
615 pmovmskb %xmm0, %edx
616 test %edx, %edx
617 jnz LABEL(unaligned_exit)
618 #ifdef USE_AS_STRNCPY
619 sub $16, %r8
620 jbe LABEL(strncpy_truncation_unaligned)
621 #endif
623 movdqa 16(%rsi, %rcx), %xmm3
624 movdqa (%rsi, %rcx), %xmm2
626 psrldq $12, %xmm2
627 pslldq $4, %xmm3
628 por %xmm2, %xmm3
630 movdqa %xmm3, (%rdi, %rcx)
631 add $16, %rcx
633 #ifdef USE_AS_STRNCPY
634 cmp %r10, %r8
635 jbe LABEL(unaligned_exit)
636 #endif
637 pcmpeqb 16(%rsi, %rcx), %xmm0
638 pmovmskb %xmm0, %edx
639 test %edx, %edx
640 jnz LABEL(unaligned_exit)
641 #ifdef USE_AS_STRNCPY
642 sub $16, %r8
643 jbe LABEL(strncpy_truncation_unaligned)
644 #endif
646 movdqa 16(%rsi, %rcx), %xmm3
647 movdqa (%rsi, %rcx), %xmm2
649 psrldq $12, %xmm2
650 pslldq $4, %xmm3
651 por %xmm2, %xmm3
653 movdqa %xmm3, (%rdi, %rcx)
654 add $16, %rcx
655 #ifdef USE_AS_STRNCPY
656 cmp %r10, %r8
657 jbe LABEL(unaligned_exit)
658 #endif
659 jmp LABEL(ashr_12_use_sse2)
662 /*
663 * ashr_11 handles the following cases:
664 * (16 + (src offset - dest offset)) % 16 = 11
665 *
666 * Based on above operation, start from (%r9 + rsi) to the left of this cache
667 * bank, there is no null byte.
668 */
669 .p2align 4
670 LABEL(ashr_11):
671 xor %ecx, %ecx /* clear index */
672 #ifdef USE_AS_STRNCPY
673 cmp %r10, %r8
674 jbe LABEL(unaligned_exit)
675 #endif
676 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
677 jz LABEL(ashr_11_use_sse2)
679 .p2align 4
680 LABEL(ashr_11_use_ssse3):
681 movdqa 16(%rsi, %rcx), %xmm3
682 pcmpeqb %xmm3, %xmm0
683 pmovmskb %xmm0, %edx
684 test %edx, %edx
685 jnz LABEL(unaligned_exit)
686 #ifdef USE_AS_STRNCPY
687 sub $16, %r8
688 jbe LABEL(strncpy_truncation_unaligned)
689 #endif
691 #palignr $11, (%rsi, %rcx), %xmm3
692 .byte 0x66, 0x0F, 0x3A ,0x0F
693 .byte 0x1c, 0x0e, 0x0b
695 movdqa %xmm3, (%rdi, %rcx)
696 add $16, %rcx
698 #ifdef USE_AS_STRNCPY
699 cmp %r10, %r8
700 jbe LABEL(unaligned_exit)
701 #endif
702 movdqa 16(%rsi, %rcx), %xmm3
703 pcmpeqb %xmm3, %xmm0
704 pmovmskb %xmm0, %edx
705 test %edx, %edx
706 jnz LABEL(unaligned_exit)
707 #ifdef USE_AS_STRNCPY
708 sub $16, %r8
709 jbe LABEL(strncpy_truncation_unaligned)
710 #endif
712 #palignr $11, (%rsi, %rcx), %xmm3
713 .byte 0x66, 0x0F, 0x3A ,0x0F
714 .byte 0x1c, 0x0e, 0x0b
716 movdqa %xmm3, (%rdi, %rcx)
717 add $16, %rcx
718 #ifdef USE_AS_STRNCPY
719 cmp %r10, %r8
720 jbe LABEL(unaligned_exit)
721 #endif
722 jmp LABEL(ashr_11_use_ssse3)
724 .p2align 4
725 LABEL(ashr_11_use_sse2):
726 pcmpeqb 16(%rsi, %rcx), %xmm0
727 pmovmskb %xmm0, %edx
728 test %edx, %edx
729 jnz LABEL(unaligned_exit)
730 #ifdef USE_AS_STRNCPY
731 sub $16, %r8
732 jbe LABEL(strncpy_truncation_unaligned)
733 #endif
735 movdqa 16(%rsi, %rcx), %xmm3
736 movdqa (%rsi, %rcx), %xmm2
738 psrldq $11, %xmm2
739 pslldq $5, %xmm3
740 por %xmm2, %xmm3
742 movdqa %xmm3, (%rdi, %rcx)
743 add $16, %rcx
745 #ifdef USE_AS_STRNCPY
746 cmp %r10, %r8
747 jbe LABEL(unaligned_exit)
748 #endif
749 pcmpeqb 16(%rsi, %rcx), %xmm0
750 pmovmskb %xmm0, %edx
751 test %edx, %edx
752 jnz LABEL(unaligned_exit)
753 #ifdef USE_AS_STRNCPY
754 sub $16, %r8
755 jbe LABEL(strncpy_truncation_unaligned)
756 #endif
758 movdqa 16(%rsi, %rcx), %xmm3
759 movdqa (%rsi, %rcx), %xmm2
761 psrldq $11, %xmm2
762 pslldq $5, %xmm3
763 por %xmm2, %xmm3
765 movdqa %xmm3, (%rdi, %rcx)
766 add $16, %rcx
767 #ifdef USE_AS_STRNCPY
768 cmp %r10, %r8
769 jbe LABEL(unaligned_exit)
770 #endif
771 jmp LABEL(ashr_11_use_sse2)
774 /*
775 * ashr_10 handles the following cases:
776 * (16 + (src offset - dest offset)) % 16 = 10
777 *
778 * Based on above operation, start from (%r9 + rsi) to the left of this cache
779 * bank, there is no null byte.
780 */
781 .p2align 4
782 LABEL(ashr_10):
783 xor %ecx, %ecx /* clear index */
784 #ifdef USE_AS_STRNCPY
785 cmp %r10, %r8
786 jbe LABEL(unaligned_exit)
787 #endif
788 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
789 jz LABEL(ashr_10_use_sse2)
791 .p2align 4
792 LABEL(ashr_10_use_ssse3):
793 movdqa 16(%rsi, %rcx), %xmm3
794 pcmpeqb %xmm3, %xmm0
795 pmovmskb %xmm0, %edx
796 test %edx, %edx
797 jnz LABEL(unaligned_exit)
798 #ifdef USE_AS_STRNCPY
799 sub $16, %r8
800 jbe LABEL(strncpy_truncation_unaligned)
801 #endif
803 #palignr $10, (%rsi, %rcx), %xmm3
804 .byte 0x66, 0x0F, 0x3A ,0x0F
805 .byte 0x1c, 0x0e, 0x0a
807 movdqa %xmm3, (%rdi, %rcx)
808 add $16, %rcx
810 #ifdef USE_AS_STRNCPY
811 cmp %r10, %r8
812 jbe LABEL(unaligned_exit)
813 #endif
814 movdqa 16(%rsi, %rcx), %xmm3
815 pcmpeqb %xmm3, %xmm0
816 pmovmskb %xmm0, %edx
817 test %edx, %edx
818 jnz LABEL(unaligned_exit)
819 #ifdef USE_AS_STRNCPY
820 sub $16, %r8
821 jbe LABEL(strncpy_truncation_unaligned)
822 #endif
824 #palignr $10, (%rsi, %rcx), %xmm3
825 .byte 0x66, 0x0F, 0x3A ,0x0F
826 .byte 0x1c, 0x0e, 0x0a
828 movdqa %xmm3, (%rdi, %rcx)
829 add $16, %rcx
830 #ifdef USE_AS_STRNCPY
831 cmp %r10, %r8
832 jbe LABEL(unaligned_exit)
833 #endif
834 jmp LABEL(ashr_10_use_ssse3)
836 .p2align 4
837 LABEL(ashr_10_use_sse2):
838 pcmpeqb 16(%rsi, %rcx), %xmm0
839 pmovmskb %xmm0, %edx
840 test %edx, %edx
841 jnz LABEL(unaligned_exit)
842 #ifdef USE_AS_STRNCPY
843 sub $16, %r8
844 jbe LABEL(strncpy_truncation_unaligned)
845 #endif
847 movdqa 16(%rsi, %rcx), %xmm3
848 movdqa (%rsi, %rcx), %xmm2
850 psrldq $10, %xmm2
851 pslldq $6, %xmm3
852 por %xmm2, %xmm3
854 movdqa %xmm3, (%rdi, %rcx)
855 add $16, %rcx
857 #ifdef USE_AS_STRNCPY
858 cmp %r10, %r8
859 jbe LABEL(unaligned_exit)
860 #endif
861 pcmpeqb 16(%rsi, %rcx), %xmm0
862 pmovmskb %xmm0, %edx
863 test %edx, %edx
864 jnz LABEL(unaligned_exit)
865 #ifdef USE_AS_STRNCPY
866 sub $16, %r8
867 jbe LABEL(strncpy_truncation_unaligned)
868 #endif
870 movdqa 16(%rsi, %rcx), %xmm3
871 movdqa (%rsi, %rcx), %xmm2
873 psrldq $10, %xmm2
874 pslldq $6, %xmm3
875 por %xmm2, %xmm3
877 movdqa %xmm3, (%rdi, %rcx)
878 add $16, %rcx
879 #ifdef USE_AS_STRNCPY
880 cmp %r10, %r8
881 jbe LABEL(unaligned_exit)
882 #endif
883 jmp LABEL(ashr_10_use_sse2)
886 /*
887 * ashr_9 handles the following cases:
888 * (16 + (src offset - dest offset)) % 16 = 9
889 *
890 * Based on above operation, start from (%r9 + rsi) to the left of this cache
891 * bank, there is no null byte.
892 */
893 .p2align 4
894 LABEL(ashr_9):
895 xor %ecx, %ecx /* clear index */
896 #ifdef USE_AS_STRNCPY
897 cmp %r10, %r8
898 jbe LABEL(unaligned_exit)
899 #endif
900 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
901 jz LABEL(ashr_9_use_sse2)
903 .p2align 4
904 LABEL(ashr_9_use_ssse3):
905 movdqa 16(%rsi, %rcx), %xmm3
906 pcmpeqb %xmm3, %xmm0
907 pmovmskb %xmm0, %edx
908 test %edx, %edx
909 jnz LABEL(unaligned_exit)
910 #ifdef USE_AS_STRNCPY
911 sub $16, %r8
912 jbe LABEL(strncpy_truncation_unaligned)
913 #endif
915 #palignr $9, (%rsi, %rcx), %xmm3
916 .byte 0x66, 0x0F, 0x3A ,0x0F
917 .byte 0x1c, 0x0e, 0x09
919 movdqa %xmm3, (%rdi, %rcx)
920 add $16, %rcx
922 #ifdef USE_AS_STRNCPY
923 cmp %r10, %r8
924 jbe LABEL(unaligned_exit)
925 #endif
926 movdqa 16(%rsi, %rcx), %xmm3
927 pcmpeqb %xmm3, %xmm0
928 pmovmskb %xmm0, %edx
929 test %edx, %edx
930 jnz LABEL(unaligned_exit)
931 #ifdef USE_AS_STRNCPY
932 sub $16, %r8
933 jbe LABEL(strncpy_truncation_unaligned)
934 #endif
936 #palignr $9, (%rsi, %rcx), %xmm3
937 .byte 0x66, 0x0F, 0x3A ,0x0F
938 .byte 0x1c, 0x0e, 0x09
940 movdqa %xmm3, (%rdi, %rcx)
941 add $16, %rcx
942 #ifdef USE_AS_STRNCPY
943 cmp %r10, %r8
944 jbe LABEL(unaligned_exit)
945 #endif
946 jmp LABEL(ashr_9_use_ssse3)
948 .p2align 4
949 LABEL(ashr_9_use_sse2):
950 pcmpeqb 16(%rsi, %rcx), %xmm0
951 pmovmskb %xmm0, %edx
952 test %edx, %edx
953 jnz LABEL(unaligned_exit)
954 #ifdef USE_AS_STRNCPY
955 sub $16, %r8
956 jbe LABEL(strncpy_truncation_unaligned)
957 #endif
959 movdqa 16(%rsi, %rcx), %xmm3
960 movdqa (%rsi, %rcx), %xmm2
962 psrldq $9, %xmm2
963 pslldq $7, %xmm3
964 por %xmm2, %xmm3
966 movdqa %xmm3, (%rdi, %rcx)
967 add $16, %rcx
969 #ifdef USE_AS_STRNCPY
970 cmp %r10, %r8
971 jbe LABEL(unaligned_exit)
972 #endif
973 pcmpeqb 16(%rsi, %rcx), %xmm0
974 pmovmskb %xmm0, %edx
975 test %edx, %edx
976 jnz LABEL(unaligned_exit)
977 #ifdef USE_AS_STRNCPY
978 sub $16, %r8
979 jbe LABEL(strncpy_truncation_unaligned)
980 #endif
982 movdqa 16(%rsi, %rcx), %xmm3
983 movdqa (%rsi, %rcx), %xmm2
985 psrldq $9, %xmm2
986 pslldq $7, %xmm3
987 por %xmm2, %xmm3
989 movdqa %xmm3, (%rdi, %rcx)
990 add $16, %rcx
991 #ifdef USE_AS_STRNCPY
992 cmp %r10, %r8
993 jbe LABEL(unaligned_exit)
994 #endif
995 jmp LABEL(ashr_9_use_sse2)
998 /*
999 * ashr_8 handles the following cases:
1000 * (16 + (src offset - dest offset)) % 16 = 8
1001 *
1002 * Based on above operation, start from (%r9 + rsi) to the left of this cache
1003 * bank, there is no null byte.
1004 */
1005 .p2align 4
1006 LABEL(ashr_8):
1007 xor %ecx, %ecx /* clear index */
1008 #ifdef USE_AS_STRNCPY
1009 cmp %r10, %r8
1010 jbe LABEL(unaligned_exit)
1011 #endif
1012 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
1013 jz LABEL(ashr_8_use_sse2)
1015 .p2align 4
1016 LABEL(ashr_8_use_ssse3):
1017 movdqa 16(%rsi, %rcx), %xmm3
1018 pcmpeqb %xmm3, %xmm0
1019 pmovmskb %xmm0, %edx
1020 test %edx, %edx
1021 jnz LABEL(unaligned_exit)
1022 #ifdef USE_AS_STRNCPY
1023 sub $16, %r8
1024 jbe LABEL(strncpy_truncation_unaligned)
1025 #endif
1027 #palignr $8, (%rsi, %rcx), %xmm3
1028 .byte 0x66, 0x0F, 0x3A ,0x0F
1029 .byte 0x1c, 0x0e, 0x08
1031 movdqa %xmm3, (%rdi, %rcx)
1032 add $16, %rcx
1034 #ifdef USE_AS_STRNCPY
1035 cmp %r10, %r8
1036 jbe LABEL(unaligned_exit)
1037 #endif
1038 movdqa 16(%rsi, %rcx), %xmm3
1039 pcmpeqb %xmm3, %xmm0
1040 pmovmskb %xmm0, %edx
1041 test %edx, %edx
1042 jnz LABEL(unaligned_exit)
1043 #ifdef USE_AS_STRNCPY
1044 sub $16, %r8
1045 jbe LABEL(strncpy_truncation_unaligned)
1046 #endif
1048 #palignr $8, (%rsi, %rcx), %xmm3
1049 .byte 0x66, 0x0F, 0x3A ,0x0F
1050 .byte 0x1c, 0x0e, 0x08
1052 movdqa %xmm3, (%rdi, %rcx)
1053 add $16, %rcx
1054 #ifdef USE_AS_STRNCPY
1055 cmp %r10, %r8
1056 jbe LABEL(unaligned_exit)
1057 #endif
1058 jmp LABEL(ashr_8_use_ssse3)
1060 .p2align 4
1061 LABEL(ashr_8_use_sse2):
1062 pcmpeqb 16(%rsi, %rcx), %xmm0
1063 pmovmskb %xmm0, %edx
1064 test %edx, %edx
1065 jnz LABEL(unaligned_exit)
1066 #ifdef USE_AS_STRNCPY
1067 sub $16, %r8
1068 jbe LABEL(strncpy_truncation_unaligned)
1069 #endif
1071 movdqa 16(%rsi, %rcx), %xmm3
1072 movdqa (%rsi, %rcx), %xmm2
1074 psrldq $8, %xmm2
1075 pslldq $8, %xmm3
1076 por %xmm2, %xmm3
1078 movdqa %xmm3, (%rdi, %rcx)
1079 add $16, %rcx
1081 #ifdef USE_AS_STRNCPY
1082 cmp %r10, %r8
1083 jbe LABEL(unaligned_exit)
1084 #endif
1085 pcmpeqb 16(%rsi, %rcx), %xmm0
1086 pmovmskb %xmm0, %edx
1087 test %edx, %edx
1088 jnz LABEL(unaligned_exit)
1089 #ifdef USE_AS_STRNCPY
1090 sub $16, %r8
1091 jbe LABEL(strncpy_truncation_unaligned)
1092 #endif
1094 movdqa 16(%rsi, %rcx), %xmm3
1095 movdqa (%rsi, %rcx), %xmm2
1097 psrldq $8, %xmm2
1098 pslldq $8, %xmm3
1099 por %xmm2, %xmm3
1101 movdqa %xmm3, (%rdi, %rcx)
1102 add $16, %rcx
1103 #ifdef USE_AS_STRNCPY
1104 cmp %r10, %r8
1105 jbe LABEL(unaligned_exit)
1106 #endif
1107 jmp LABEL(ashr_8_use_sse2)
1110 /*
1111 * ashr_7 handles the following cases:
1112 * (16 + (src offset - dest offset)) % 16 = 7
1113 *
1114 * Based on above operation, start from (%r9 + rsi) to the left of this cache
1115 * bank, there is no null byte.
1116 */
1117 .p2align 4
1118 LABEL(ashr_7):
1119 xor %ecx, %ecx /* clear index */
1120 #ifdef USE_AS_STRNCPY
1121 cmp %r10, %r8
1122 jbe LABEL(unaligned_exit)
1123 #endif
1124 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
1125 jz LABEL(ashr_7_use_sse2)
1127 .p2align 4
1128 LABEL(ashr_7_use_ssse3):
1129 movdqa 16(%rsi, %rcx), %xmm3
1130 pcmpeqb %xmm3, %xmm0
1131 pmovmskb %xmm0, %edx
1132 test %edx, %edx
1133 jnz LABEL(unaligned_exit)
1134 #ifdef USE_AS_STRNCPY
1135 sub $16, %r8
1136 jbe LABEL(strncpy_truncation_unaligned)
1137 #endif
1139 #palignr $7, (%rsi, %rcx), %xmm3
1140 .byte 0x66, 0x0F, 0x3A ,0x0F
1141 .byte 0x1c, 0x0e, 0x07
1143 movdqa %xmm3, (%rdi, %rcx)
1144 add $16, %rcx
1146 #ifdef USE_AS_STRNCPY
1147 cmp %r10, %r8
1148 jbe LABEL(unaligned_exit)
1149 #endif
1150 movdqa 16(%rsi, %rcx), %xmm3
1151 pcmpeqb %xmm3, %xmm0
1152 pmovmskb %xmm0, %edx
1153 test %edx, %edx
1154 jnz LABEL(unaligned_exit)
1155 #ifdef USE_AS_STRNCPY
1156 sub $16, %r8
1157 jbe LABEL(strncpy_truncation_unaligned)
1158 #endif
1160 #palignr $7, (%rsi, %rcx), %xmm3
1161 .byte 0x66, 0x0F, 0x3A ,0x0F
1162 .byte 0x1c, 0x0e, 0x07
1164 movdqa %xmm3, (%rdi, %rcx)
1165 add $16, %rcx
1166 #ifdef USE_AS_STRNCPY
1167 cmp %r10, %r8
1168 jbe LABEL(unaligned_exit)
1169 #endif
1170 jmp LABEL(ashr_7_use_ssse3)
1172 .p2align 4
1173 LABEL(ashr_7_use_sse2):
1174 pcmpeqb 16(%rsi, %rcx), %xmm0
1175 pmovmskb %xmm0, %edx
1176 test %edx, %edx
1177 jnz LABEL(unaligned_exit)
1178 #ifdef USE_AS_STRNCPY
1179 sub $16, %r8
1180 jbe LABEL(strncpy_truncation_unaligned)
1181 #endif
1183 movdqa 16(%rsi, %rcx), %xmm3
1184 movdqa (%rsi, %rcx), %xmm2
1186 psrldq $7, %xmm2
1187 pslldq $9, %xmm3
1188 por %xmm2, %xmm3
1190 movdqa %xmm3, (%rdi, %rcx)
1191 add $16, %rcx
1193 #ifdef USE_AS_STRNCPY
1194 cmp %r10, %r8
1195 jbe LABEL(unaligned_exit)
1196 #endif
1197 pcmpeqb 16(%rsi, %rcx), %xmm0
1198 pmovmskb %xmm0, %edx
1199 test %edx, %edx
1200 jnz LABEL(unaligned_exit)
1201 #ifdef USE_AS_STRNCPY
1202 sub $16, %r8
1203 jbe LABEL(strncpy_truncation_unaligned)
1204 #endif
1206 movdqa 16(%rsi, %rcx), %xmm3
1207 movdqa (%rsi, %rcx), %xmm2
1209 psrldq $7, %xmm2
1210 pslldq $9, %xmm3
1211 por %xmm2, %xmm3
1213 movdqa %xmm3, (%rdi, %rcx)
1214 add $16, %rcx
1215 #ifdef USE_AS_STRNCPY
1216 cmp %r10, %r8
1217 jbe LABEL(unaligned_exit)
1218 #endif
1219 jmp LABEL(ashr_7_use_sse2)
1222 /*
1223 * ashr_6 handles the following cases:
1224 * (16 + (src offset - dest offset)) % 16 = 6
1225 *
1226 * Based on above operation, start from (%r9 + rsi) to the left of this cache
1227 * bank, there is no null byte.
1228 */
1229 .p2align 4
1230 LABEL(ashr_6):
1231 xor %ecx, %ecx /* clear index */
1232 #ifdef USE_AS_STRNCPY
1233 cmp %r10, %r8
1234 jbe LABEL(unaligned_exit)
1235 #endif
1236 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
1237 jz LABEL(ashr_6_use_sse2)
1239 .p2align 4
1240 LABEL(ashr_6_use_ssse3):
1241 movdqa 16(%rsi, %rcx), %xmm3
1242 pcmpeqb %xmm3, %xmm0
1243 pmovmskb %xmm0, %edx
1244 test %edx, %edx
1245 jnz LABEL(unaligned_exit)
1246 #ifdef USE_AS_STRNCPY
1247 sub $16, %r8
1248 jbe LABEL(strncpy_truncation_unaligned)
1249 #endif
1251 #palignr $6, (%rsi, %rcx), %xmm3
1252 .byte 0x66, 0x0F, 0x3A ,0x0F
1253 .byte 0x1c, 0x0e, 0x06
1255 movdqa %xmm3, (%rdi, %rcx)
1256 add $16, %rcx
1258 #ifdef USE_AS_STRNCPY
1259 cmp %r10, %r8
1260 jbe LABEL(unaligned_exit)
1261 #endif
1262 movdqa 16(%rsi, %rcx), %xmm3
1263 pcmpeqb %xmm3, %xmm0
1264 pmovmskb %xmm0, %edx
1265 test %edx, %edx
1266 jnz LABEL(unaligned_exit)
1267 #ifdef USE_AS_STRNCPY
1268 sub $16, %r8
1269 jbe LABEL(strncpy_truncation_unaligned)
1270 #endif
1272 #palignr $6, (%rsi, %rcx), %xmm3
1273 .byte 0x66, 0x0F, 0x3A ,0x0F
1274 .byte 0x1c, 0x0e, 0x06
1276 movdqa %xmm3, (%rdi, %rcx)
1277 add $16, %rcx
1278 #ifdef USE_AS_STRNCPY
1279 cmp %r10, %r8
1280 jbe LABEL(unaligned_exit)
1281 #endif
1282 jmp LABEL(ashr_6_use_ssse3)
1284 .p2align 4
1285 LABEL(ashr_6_use_sse2):
1286 pcmpeqb 16(%rsi, %rcx), %xmm0
1287 pmovmskb %xmm0, %edx
1288 test %edx, %edx
1289 jnz LABEL(unaligned_exit)
1290 #ifdef USE_AS_STRNCPY
1291 sub $16, %r8
1292 jbe LABEL(strncpy_truncation_unaligned)
1293 #endif
1295 movdqa 16(%rsi, %rcx), %xmm3
1296 movdqa (%rsi, %rcx), %xmm2
1298 psrldq $6, %xmm2
1299 pslldq $10, %xmm3
1300 por %xmm2, %xmm3
1302 movdqa %xmm3, (%rdi, %rcx)
1303 add $16, %rcx
1305 #ifdef USE_AS_STRNCPY
1306 cmp %r10, %r8
1307 jbe LABEL(unaligned_exit)
1308 #endif
1309 pcmpeqb 16(%rsi, %rcx), %xmm0
1310 pmovmskb %xmm0, %edx
1311 test %edx, %edx
1312 jnz LABEL(unaligned_exit)
1313 #ifdef USE_AS_STRNCPY
1314 sub $16, %r8
1315 jbe LABEL(strncpy_truncation_unaligned)
1316 #endif
1318 movdqa 16(%rsi, %rcx), %xmm3
1319 movdqa (%rsi, %rcx), %xmm2
1321 psrldq $6, %xmm2
1322 pslldq $10, %xmm3
1323 por %xmm2, %xmm3
1325 movdqa %xmm3, (%rdi, %rcx)
1326 add $16, %rcx
1327 #ifdef USE_AS_STRNCPY
1328 cmp %r10, %r8
1329 jbe LABEL(unaligned_exit)
1330 #endif
1331 jmp LABEL(ashr_6_use_sse2)
1334 /*
1335 * ashr_5 handles the following cases:
1336 * (16 + (src offset - dest offset)) % 16 = 5
1337 *
1338 * Based on above operation, start from (%r9 + rsi) to the left of this cache
1339 * bank, there is no null byte.
1340 */
1341 .p2align 4
1342 LABEL(ashr_5):
1343 xor %ecx, %ecx /* clear index */
1344 #ifdef USE_AS_STRNCPY
1345 cmp %r10, %r8
1346 jbe LABEL(unaligned_exit)
1347 #endif
1348 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
1349 jz LABEL(ashr_5_use_sse2)
1351 .p2align 4
1352 LABEL(ashr_5_use_ssse3):
1353 movdqa 16(%rsi, %rcx), %xmm3
1354 pcmpeqb %xmm3, %xmm0
1355 pmovmskb %xmm0, %edx
1356 test %edx, %edx
1357 jnz LABEL(unaligned_exit)
1358 #ifdef USE_AS_STRNCPY
1359 sub $16, %r8
1360 jbe LABEL(strncpy_truncation_unaligned)
1361 #endif
1363 #palignr $5, (%rsi, %rcx), %xmm3
1364 .byte 0x66, 0x0F, 0x3A ,0x0F
1365 .byte 0x1c, 0x0e, 0x05
1367 movdqa %xmm3, (%rdi, %rcx)
1368 add $16, %rcx
1370 #ifdef USE_AS_STRNCPY
1371 cmp %r10, %r8
1372 jbe LABEL(unaligned_exit)
1373 #endif
1374 movdqa 16(%rsi, %rcx), %xmm3
1375 pcmpeqb %xmm3, %xmm0
1376 pmovmskb %xmm0, %edx
1377 test %edx, %edx
1378 jnz LABEL(unaligned_exit)
1379 #ifdef USE_AS_STRNCPY
1380 sub $16, %r8
1381 jbe LABEL(strncpy_truncation_unaligned)
1382 #endif
1384 #palignr $5, (%rsi, %rcx), %xmm3
1385 .byte 0x66, 0x0F, 0x3A ,0x0F
1386 .byte 0x1c, 0x0e, 0x05
1388 movdqa %xmm3, (%rdi, %rcx)
1389 add $16, %rcx
1390 #ifdef USE_AS_STRNCPY
1391 cmp %r10, %r8
1392 jbe LABEL(unaligned_exit)
1393 #endif
1394 jmp LABEL(ashr_5_use_ssse3)
1396 .p2align 4
1397 LABEL(ashr_5_use_sse2):
1398 pcmpeqb 16(%rsi, %rcx), %xmm0
1399 pmovmskb %xmm0, %edx
1400 test %edx, %edx
1401 jnz LABEL(unaligned_exit)
1402 #ifdef USE_AS_STRNCPY
1403 sub $16, %r8
1404 jbe LABEL(strncpy_truncation_unaligned)
1405 #endif
1407 movdqa 16(%rsi, %rcx), %xmm3
1408 movdqa (%rsi, %rcx), %xmm2
1410 psrldq $5, %xmm2
1411 pslldq $11, %xmm3
1412 por %xmm2, %xmm3
1414 movdqa %xmm3, (%rdi, %rcx)
1415 add $16, %rcx
1417 #ifdef USE_AS_STRNCPY
1418 cmp %r10, %r8
1419 jbe LABEL(unaligned_exit)
1420 #endif
1421 pcmpeqb 16(%rsi, %rcx), %xmm0
1422 pmovmskb %xmm0, %edx
1423 test %edx, %edx
1424 jnz LABEL(unaligned_exit)
1425 #ifdef USE_AS_STRNCPY
1426 sub $16, %r8
1427 jbe LABEL(strncpy_truncation_unaligned)
1428 #endif
1430 movdqa 16(%rsi, %rcx), %xmm3
1431 movdqa (%rsi, %rcx), %xmm2
1433 psrldq $5, %xmm2
1434 pslldq $11, %xmm3
1435 por %xmm2, %xmm3
1437 movdqa %xmm3, (%rdi, %rcx)
1438 add $16, %rcx
1439 #ifdef USE_AS_STRNCPY
1440 cmp %r10, %r8
1441 jbe LABEL(unaligned_exit)
1442 #endif
1443 jmp LABEL(ashr_5_use_sse2)
1446 /*
1447 * ashr_4 handles the following cases:
1448 * (16 + (src offset - dest offset)) % 16 = 4
1449 *
1450 * Based on above operation, start from (%r9 + rsi) to the left of this cache
1451 * bank, there is no null byte.
1452 */
1453 .p2align 4
1454 LABEL(ashr_4):
1455 xor %ecx, %ecx /* clear index */
1456 #ifdef USE_AS_STRNCPY
1457 cmp %r10, %r8
1458 jbe LABEL(unaligned_exit)
1459 #endif
1460 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
1461 jz LABEL(ashr_4_use_sse2)
1463 .p2align 4
1464 LABEL(ashr_4_use_ssse3):
1465 movdqa 16(%rsi, %rcx), %xmm3
1466 pcmpeqb %xmm3, %xmm0
1467 pmovmskb %xmm0, %edx
1468 test %edx, %edx
1469 jnz LABEL(unaligned_exit)
1470 #ifdef USE_AS_STRNCPY
1471 sub $16, %r8
1472 jbe LABEL(strncpy_truncation_unaligned)
1473 #endif
1475 #palignr $4, (%rsi, %rcx), %xmm3
1476 .byte 0x66, 0x0F, 0x3A ,0x0F
1477 .byte 0x1c, 0x0e, 0x04
1479 movdqa %xmm3, (%rdi, %rcx)
1480 add $16, %rcx
1482 #ifdef USE_AS_STRNCPY
1483 cmp %r10, %r8
1484 jbe LABEL(unaligned_exit)
1485 #endif
1486 movdqa 16(%rsi, %rcx), %xmm3
1487 pcmpeqb %xmm3, %xmm0
1488 pmovmskb %xmm0, %edx
1489 test %edx, %edx
1490 jnz LABEL(unaligned_exit)
1491 #ifdef USE_AS_STRNCPY
1492 sub $16, %r8
1493 jbe LABEL(strncpy_truncation_unaligned)
1494 #endif
1496 #palignr $4, (%rsi, %rcx), %xmm3
1497 .byte 0x66, 0x0F, 0x3A ,0x0F
1498 .byte 0x1c, 0x0e, 0x04
1500 movdqa %xmm3, (%rdi, %rcx)
1501 add $16, %rcx
1502 #ifdef USE_AS_STRNCPY
1503 cmp %r10, %r8
1504 jbe LABEL(unaligned_exit)
1505 #endif
1506 jmp LABEL(ashr_4_use_ssse3)
1508 .p2align 4
1509 LABEL(ashr_4_use_sse2):
1510 pcmpeqb 16(%rsi, %rcx), %xmm0
1511 pmovmskb %xmm0, %edx
1512 test %edx, %edx
1513 jnz LABEL(unaligned_exit)
1514 #ifdef USE_AS_STRNCPY
1515 sub $16, %r8
1516 jbe LABEL(strncpy_truncation_unaligned)
1517 #endif
1519 movdqa 16(%rsi, %rcx), %xmm3
1520 movdqa (%rsi, %rcx), %xmm2
1522 psrldq $4, %xmm2
1523 pslldq $12, %xmm3
1524 por %xmm2, %xmm3
1526 movdqa %xmm3, (%rdi, %rcx)
1527 add $16, %rcx
1529 #ifdef USE_AS_STRNCPY
1530 cmp %r10, %r8
1531 jbe LABEL(unaligned_exit)
1532 #endif
1533 pcmpeqb 16(%rsi, %rcx), %xmm0
1534 pmovmskb %xmm0, %edx
1535 test %edx, %edx
1536 jnz LABEL(unaligned_exit)
1537 #ifdef USE_AS_STRNCPY
1538 sub $16, %r8
1539 jbe LABEL(strncpy_truncation_unaligned)
1540 #endif
1542 movdqa 16(%rsi, %rcx), %xmm3
1543 movdqa (%rsi, %rcx), %xmm2
1545 psrldq $4, %xmm2
1546 pslldq $12, %xmm3
1547 por %xmm2, %xmm3
1549 movdqa %xmm3, (%rdi, %rcx)
1550 add $16, %rcx
1551 #ifdef USE_AS_STRNCPY
1552 cmp %r10, %r8
1553 jbe LABEL(unaligned_exit)
1554 #endif
1555 jmp LABEL(ashr_4_use_sse2)
1558 /*
1559 * ashr_3 handles the following cases:
1560 * (16 + (src offset - dest offset)) % 16 = 3
1561 *
1562 * Based on above operation, start from (%r9 + rsi) to the left of this cache
1563 * bank, there is no null byte.
1564 */
1565 .p2align 4
1566 LABEL(ashr_3):
1567 xor %ecx, %ecx /* clear index */
1568 #ifdef USE_AS_STRNCPY
1569 cmp %r10, %r8
1570 jbe LABEL(unaligned_exit)
1571 #endif
1572 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
1573 jz LABEL(ashr_3_use_sse2)
1575 .p2align 4
1576 LABEL(ashr_3_use_ssse3):
1577 movdqa 16(%rsi, %rcx), %xmm3
1578 pcmpeqb %xmm3, %xmm0
1579 pmovmskb %xmm0, %edx
1580 test %edx, %edx
1581 jnz LABEL(unaligned_exit)
1582 #ifdef USE_AS_STRNCPY
1583 sub $16, %r8
1584 jbe LABEL(strncpy_truncation_unaligned)
1585 #endif
1587 #palignr $3, (%rsi, %rcx), %xmm3
1588 .byte 0x66, 0x0F, 0x3A ,0x0F
1589 .byte 0x1c, 0x0e, 0x03
1591 movdqa %xmm3, (%rdi, %rcx)
1592 add $16, %rcx
1594 #ifdef USE_AS_STRNCPY
1595 cmp %r10, %r8
1596 jbe LABEL(unaligned_exit)
1597 #endif
1598 movdqa 16(%rsi, %rcx), %xmm3
1599 pcmpeqb %xmm3, %xmm0
1600 pmovmskb %xmm0, %edx
1601 test %edx, %edx
1602 jnz LABEL(unaligned_exit)
1603 #ifdef USE_AS_STRNCPY
1604 sub $16, %r8
1605 jbe LABEL(strncpy_truncation_unaligned)
1606 #endif
1608 #palignr $3, (%rsi, %rcx), %xmm3
1609 .byte 0x66, 0x0F, 0x3A ,0x0F
1610 .byte 0x1c, 0x0e, 0x03
1612 movdqa %xmm3, (%rdi, %rcx)
1613 add $16, %rcx
1614 #ifdef USE_AS_STRNCPY
1615 cmp %r10, %r8
1616 jbe LABEL(unaligned_exit)
1617 #endif
1618 jmp LABEL(ashr_3_use_ssse3)
1620 .p2align 4
1621 LABEL(ashr_3_use_sse2):
1622 pcmpeqb 16(%rsi, %rcx), %xmm0
1623 pmovmskb %xmm0, %edx
1624 test %edx, %edx
1625 jnz LABEL(unaligned_exit)
1626 #ifdef USE_AS_STRNCPY
1627 sub $16, %r8
1628 jbe LABEL(strncpy_truncation_unaligned)
1629 #endif
1631 movdqa 16(%rsi, %rcx), %xmm3
1632 movdqa (%rsi, %rcx), %xmm2
1634 psrldq $3, %xmm2
1635 pslldq $13, %xmm3
1636 por %xmm2, %xmm3
1638 movdqa %xmm3, (%rdi, %rcx)
1639 add $16, %rcx
1641 #ifdef USE_AS_STRNCPY
1642 cmp %r10, %r8
1643 jbe LABEL(unaligned_exit)
1644 #endif
1645 pcmpeqb 16(%rsi, %rcx), %xmm0
1646 pmovmskb %xmm0, %edx
1647 test %edx, %edx
1648 jnz LABEL(unaligned_exit)
1649 #ifdef USE_AS_STRNCPY
1650 sub $16, %r8
1651 jbe LABEL(strncpy_truncation_unaligned)
1652 #endif
1654 movdqa 16(%rsi, %rcx), %xmm3
1655 movdqa (%rsi, %rcx), %xmm2
1657 psrldq $3, %xmm2
1658 pslldq $13, %xmm3
1659 por %xmm2, %xmm3
1661 movdqa %xmm3, (%rdi, %rcx)
1662 add $16, %rcx
1663 #ifdef USE_AS_STRNCPY
1664 cmp %r10, %r8
1665 jbe LABEL(unaligned_exit)
1666 #endif
1667 jmp LABEL(ashr_3_use_sse2)
1670 /*
1671 * ashr_2 handles the following cases:
1672 * (16 + (src offset - dest offset)) % 16 = 2
1673 *
1674 * Based on above operation, start from (%r9 + rsi) to the left of this cache
1675 * bank, there is no null byte.
1676 */
1677 .p2align 4
1678 LABEL(ashr_2):
1679 xor %ecx, %ecx /* clear index */
1680 #ifdef USE_AS_STRNCPY
1681 cmp %r10, %r8
1682 jbe LABEL(unaligned_exit)
1683 #endif
1684 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
1685 jz LABEL(ashr_2_use_sse2)
1687 .p2align 4
1688 LABEL(ashr_2_use_ssse3):
1689 movdqa 16(%rsi, %rcx), %xmm3
1690 pcmpeqb %xmm3, %xmm0
1691 pmovmskb %xmm0, %edx
1692 test %edx, %edx
1693 jnz LABEL(unaligned_exit)
1694 #ifdef USE_AS_STRNCPY
1695 sub $16, %r8
1696 jbe LABEL(strncpy_truncation_unaligned)
1697 #endif
1699 #palignr $2, (%rsi, %rcx), %xmm3
1700 .byte 0x66, 0x0F, 0x3A ,0x0F
1701 .byte 0x1c, 0x0e, 0x02
1703 movdqa %xmm3, (%rdi, %rcx)
1704 add $16, %rcx
1706 #ifdef USE_AS_STRNCPY
1707 cmp %r10, %r8
1708 jbe LABEL(unaligned_exit)
1709 #endif
1710 movdqa 16(%rsi, %rcx), %xmm3
1711 pcmpeqb %xmm3, %xmm0
1712 pmovmskb %xmm0, %edx
1713 test %edx, %edx
1714 jnz LABEL(unaligned_exit)
1715 #ifdef USE_AS_STRNCPY
1716 sub $16, %r8
1717 jbe LABEL(strncpy_truncation_unaligned)
1718 #endif
1720 #palignr $2, (%rsi, %rcx), %xmm3
1721 .byte 0x66, 0x0F, 0x3A ,0x0F
1722 .byte 0x1c, 0x0e, 0x02
1724 movdqa %xmm3, (%rdi, %rcx)
1725 add $16, %rcx
1726 #ifdef USE_AS_STRNCPY
1727 cmp %r10, %r8
1728 jbe LABEL(unaligned_exit)
1729 #endif
1730 jmp LABEL(ashr_2_use_ssse3)
1732 .p2align 4
1733 LABEL(ashr_2_use_sse2):
1734 pcmpeqb 16(%rsi, %rcx), %xmm0
1735 pmovmskb %xmm0, %edx
1736 test %edx, %edx
1737 jnz LABEL(unaligned_exit)
1738 #ifdef USE_AS_STRNCPY
1739 sub $16, %r8
1740 jbe LABEL(strncpy_truncation_unaligned)
1741 #endif
1743 movdqa 16(%rsi, %rcx), %xmm3
1744 movdqa (%rsi, %rcx), %xmm2
1746 psrldq $2, %xmm2
1747 pslldq $14, %xmm3
1748 por %xmm2, %xmm3
1750 movdqa %xmm3, (%rdi, %rcx)
1751 add $16, %rcx
1753 #ifdef USE_AS_STRNCPY
1754 cmp %r10, %r8
1755 jbe LABEL(unaligned_exit)
1756 #endif
1757 pcmpeqb 16(%rsi, %rcx), %xmm0
1758 pmovmskb %xmm0, %edx
1759 test %edx, %edx
1760 jnz LABEL(unaligned_exit)
1761 #ifdef USE_AS_STRNCPY
1762 sub $16, %r8
1763 jbe LABEL(strncpy_truncation_unaligned)
1764 #endif
1766 movdqa 16(%rsi, %rcx), %xmm3
1767 movdqa (%rsi, %rcx), %xmm2
1769 psrldq $2, %xmm2
1770 pslldq $14, %xmm3
1771 por %xmm2, %xmm3
1773 movdqa %xmm3, (%rdi, %rcx)
1774 add $16, %rcx
1775 #ifdef USE_AS_STRNCPY
1776 cmp %r10, %r8
1777 jbe LABEL(unaligned_exit)
1778 #endif
1779 jmp LABEL(ashr_2_use_sse2)
1782 /*
1783 * ashr_1 handles the following cases:
1784 * (16 + (src offset - dest offset)) % 16 = 1
1785 *
1786 * Based on above operation, start from (%r9 + rsi) to the left of this cache
1787 * bank, there is no null byte.
1788 */
1789 .p2align 4
1790 LABEL(ashr_1):
1791 xor %ecx, %ecx /* clear index */
1792 #ifdef USE_AS_STRNCPY
1793 cmp %r10, %r8
1794 jbe LABEL(unaligned_exit)
1795 #endif
1796 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
1797 jz LABEL(ashr_1_use_sse2)
1799 .p2align 4
1800 LABEL(ashr_1_use_ssse3):
1801 movdqa 16(%rsi, %rcx), %xmm3
1802 pcmpeqb %xmm3, %xmm0
1803 pmovmskb %xmm0, %edx
1804 test %edx, %edx
1805 jnz LABEL(unaligned_exit)
1806 #ifdef USE_AS_STRNCPY
1807 sub $16, %r8
1808 jbe LABEL(strncpy_truncation_unaligned)
1809 #endif
1811 #palignr $1, (%rsi, %rcx), %xmm3
1812 .byte 0x66, 0x0F, 0x3A ,0x0F
1813 .byte 0x1c, 0x0e, 0x01
1815 movdqa %xmm3, (%rdi, %rcx)
1816 add $16, %rcx
1818 #ifdef USE_AS_STRNCPY
1819 cmp %r10, %r8
1820 jbe LABEL(unaligned_exit)
1821 #endif
1822 movdqa 16(%rsi, %rcx), %xmm3
1823 pcmpeqb %xmm3, %xmm0
1824 pmovmskb %xmm0, %edx
1825 test %edx, %edx
1826 jnz LABEL(unaligned_exit)
1827 #ifdef USE_AS_STRNCPY
1828 sub $16, %r8
1829 jbe LABEL(strncpy_truncation_unaligned)
1830 #endif
1831 #palignr $1, (%rsi, %rcx), %xmm3
1832 .byte 0x66, 0x0F, 0x3A ,0x0F
1833 .byte 0x1c, 0x0e, 0x01
1835 movdqa %xmm3, (%rdi, %rcx)
1836 add $16, %rcx
1837 #ifdef USE_AS_STRNCPY
1838 cmp %r10, %r8
1839 jbe LABEL(unaligned_exit)
1840 #endif
1841 jmp LABEL(ashr_1_use_ssse3)
1843 .p2align 4
1844 LABEL(ashr_1_use_sse2):
1845 pcmpeqb 16(%rsi, %rcx), %xmm0
1846 pmovmskb %xmm0, %edx
1847 test %edx, %edx
1848 jnz LABEL(unaligned_exit)
1849 #ifdef USE_AS_STRNCPY
1850 sub $16, %r8
1851 jbe LABEL(strncpy_truncation_unaligned)
1852 #endif
1853 movdqa 16(%rsi, %rcx), %xmm3
1854 movdqa (%rsi, %rcx), %xmm2
1856 psrldq $1, %xmm2
1857 pslldq $15, %xmm3
1858 por %xmm2, %xmm3
1860 movdqa %xmm3, (%rdi, %rcx)
1861 add $16, %rcx
1863 #ifdef USE_AS_STRNCPY
1864 cmp %r10, %r8
1865 jbe LABEL(unaligned_exit)
1866 #endif
1867 pcmpeqb 16(%rsi, %rcx), %xmm0
1868 pmovmskb %xmm0, %edx
1869 test %edx, %edx
1870 jnz LABEL(unaligned_exit)
1871 #ifdef USE_AS_STRNCPY
1872 sub $16, %r8
1873 jbe LABEL(strncpy_truncation_unaligned)
1874 #endif
1876 movdqa 16(%rsi, %rcx), %xmm3
1877 movdqa (%rsi, %rcx), %xmm2
1879 psrldq $1, %xmm2
1880 pslldq $15, %xmm3
1881 por %xmm2, %xmm3
1883 movdqa %xmm3, (%rdi, %rcx)
1884 add $16, %rcx
1885 #ifdef USE_AS_STRNCPY
1886 cmp %r10, %r8
1887 jbe LABEL(unaligned_exit)
1888 #endif
1889 jmp LABEL(ashr_1_use_sse2)
1892 /*
1893 * Exit tail code:
1894 * Up to 32 bytes are copied in the case of strcpy.
1895 */
1896 .p2align 4
1897 LABEL(less32bytes):
1898 xor %ecx, %ecx
1899 LABEL(unaligned_exit):
1900 add %r9, %rsi /* r9 holds offset of rsi */
1901 mov %rcx, %r9
1902 mov %r10, %rcx
1903 shl %cl, %edx /* after shl, calculate the exact number to be filled */
1904 mov %r9, %rcx
1905 .p2align 4
1906 LABEL(aligned_exit):
1907 add %rcx, %rdi /* locate exact address for rdi */
1908 LABEL(less16bytes):
1909 add %rcx, %rsi /* locate exact address for rsi */
1910 LABEL(aligned_16bytes):
1911 #ifdef USE_AS_STRNCPY
1912 /*
1913 * Null found in 16bytes checked. Set bit in bitmask corresponding to
1914 * the strncpy count argument. We will copy to the null (inclusive)
1915 * or count whichever comes first.
1916 */
1917 mov $1, %r9d
1918 lea -1(%r8), %rcx
1919 shl %cl, %r9d
1920 cmp $32, %r8
1921 ja LABEL(strncpy_tail)
1922 or %r9d, %edx
1923 LABEL(strncpy_tail):
1924 #endif
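/*
 * Example of the count bit set above (annotation): with a remaining count
 * of 5 and the first null at byte 9, %r9d == 1 << 4 is OR-ed into the
 * mask, so the tail code below stops after byte index 4 and copies exactly
 * 5 bytes, i.e. whichever of the null or the count limit comes first wins.
 */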
1925 /*
1926 * Check to see if BSF is fast on this processor. If not, use a
1927 * different exit tail.
1928 */
1929 testb $USE_BSF, .memops_method(%rip)
1930 jz LABEL(AMD_exit)
1931 bsf %rdx, %rcx /* Find byte with null char */
1932 lea LABEL(tail_table)(%rip), %r11
1933 movslq (%r11, %rcx, 4), %rcx
1934 lea (%r11, %rcx), %rcx
1935 jmp *%rcx
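/*
 * bsf yields the index of the lowest set bit, i.e. the position of the
 * terminating byte, and tail_table stores offsets relative to its own
 * base, so the movslq/lea/jmp sequence is a position-independent dispatch
 * to the tail_N routine that copies index + 1 bytes.  AMD_exit below is
 * the alternative used when USE_BSF is clear (bsf not fast on that CPU);
 * it locates the same byte with a chain of bit tests.
 */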
1937 #ifdef USE_AS_STRNCPY
1938 /*
1939 * Count reached before null found.
1940 */
1941 .p2align 4
1942 LABEL(less32bytes_strncpy_truncation):
1943 xor %ecx, %ecx
1944 LABEL(strncpy_truncation_unaligned):
1945 add %r9, %rsi /* next src char to copy */
1946 LABEL(strncpy_truncation_aligned):
1947 add %rcx, %rdi
1948 add %rcx, %rsi
1949 add $16, %r8 /* compensation */
1950 lea -1(%r8), %rcx
1951 lea LABEL(tail_table)(%rip), %r11
1952 movslq (%r11, %rcx, 4), %rcx
1953 lea (%r11, %rcx), %rcx
1954 jmp *%rcx
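/*
 * Truncation path (annotation): the count ran out before a null was seen,
 * so after the $16 compensation %r8 is the exact number of bytes still
 * owed and the indirect jump lands on the tail_N routine that copies
 * precisely that many bytes, with no terminating null, as strncpy
 * truncation requires.
 */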
1956 .p2align 4
1957 LABEL(strncpy_exitz):
1958 mov %rdi, %rax
1959 ret
1960 #endif
1962 .p2align 4
1963 LABEL(AMD_exit):
1964 test %dl, %dl
1965 jz LABEL(AMD_exit_more_8)
1966 test $0x01, %dl
1967 jnz LABEL(tail_0)
1968 test $0x02, %dl
1969 jnz LABEL(tail_1)
1970 test $0x04, %dl
1971 jnz LABEL(tail_2)
1972 test $0x08, %dl
1973 jnz LABEL(tail_3)
1974 test $0x10, %dl
1975 jnz LABEL(tail_4)
1976 test $0x20, %dl
1977 jnz LABEL(tail_5)
1978 test $0x40, %dl
1979 jnz LABEL(tail_6)
1981 .p2align 4
1982 LABEL(tail_7): /* 8 bytes */
1983 mov (%rsi), %rcx
1984 mov %rcx, (%rdi)
1985 #ifdef USE_AS_STRNCPY
1986 mov $8, %cl
1987 sub $8, %r8
1988 jnz LABEL(strncpy_fill_tail)
1989 #endif
1990 ret
1992 #ifdef USE_AS_STRNCPY
1993 /*
1994 * Null terminated src string shorter than count. Fill the rest of the
1995 * destination with null chars.
1996 */
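/*
 * Example (annotation): strncpy(dst, "ab", 8) reaches this path after
 * tail_2 copies "ab\0" (3 bytes); %cl holds 3 and %r8 holds the remaining
 * 5, so the rep stosq / byte loop below stores 5 more null bytes and all
 * 8 destination bytes end up written, as strncpy requires.
 */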
1997 .p2align 4
1998 LABEL(strncpy_fill_tail):
1999 mov %rax, %rdx
2000 movzx %cl, %rax
2001 mov %r8, %rcx
2002 add %rax, %rdi
2003 xor %eax, %eax
2004 shr $3, %ecx
2005 jz LABEL(strncpy_fill_less_8)
2007 rep stosq
2008 LABEL(strncpy_fill_less_8):
2009 mov %r8, %rcx
2010 and $7, %rcx
2011 jz LABEL(strncpy_fill_return)
2012 LABEL(strncpy_fill_less_7):
2013 sub $1, %ecx
2014 mov %al, (%rdi, %rcx)
2015 jnz LABEL(strncpy_fill_less_7)
2016 LABEL(strncpy_fill_return):
2017 mov %rdx, %rax
2018 ret
2019 #endif
2021 .p2align 4
2022 LABEL(tail_0): /* 1 byte */
2023 mov (%rsi), %cl
2024 mov %cl, (%rdi)
2025 #ifdef USE_AS_STRNCPY
2026 mov $1, %cl
2027 sub $1, %r8
2028 jnz LABEL(strncpy_fill_tail)
2029 #endif
2030 ret
2032 .p2align 4
2033 LABEL(tail_1): /* 2 bytes */
2034 mov (%rsi), %cx
2035 mov %cx, (%rdi)
2036 #ifdef USE_AS_STRNCPY
2037 mov $2, %cl
2038 sub $2, %r8
2039 jnz LABEL(strncpy_fill_tail)
2040 #endif
2041 ret
2043 .p2align 4
2044 LABEL(tail_2): /* 3 bytes */
2045 mov (%rsi), %cx
2046 mov %cx, (%rdi)
2047 mov 1(%rsi), %cx
2048 mov %cx, 1(%rdi)
2049 #ifdef USE_AS_STRNCPY
2050 mov $3, %cl
2051 sub $3, %r8
2052 jnz LABEL(strncpy_fill_tail)
2053 #endif
2054 ret
2056 .p2align 4
2057 LABEL(tail_3): /* 4 bytes */
2058 mov (%rsi), %ecx
2059 mov %ecx, (%rdi)
2060 #ifdef USE_AS_STRNCPY
2061 mov $4, %cl
2062 sub $4, %r8
2063 jnz LABEL(strncpy_fill_tail)
2064 #endif
2065 ret
2067 .p2align 4
2068 LABEL(tail_4): /* 5 bytes */
2069 mov (%rsi), %ecx
2070 mov %ecx, (%rdi)
2071 mov 1(%rsi), %edx
2072 mov %edx, 1(%rdi)
2073 #ifdef USE_AS_STRNCPY
2074 mov $5, %cl
2075 sub $5, %r8
2076 jnz LABEL(strncpy_fill_tail)
2077 #endif
2078 ret
2080 .p2align 4
2081 LABEL(tail_5): /* 6 bytes */
2082 mov (%rsi), %ecx
2083 mov %ecx, (%rdi)
2084 mov 2(%rsi), %edx
2085 mov %edx, 2(%rdi)
2086 #ifdef USE_AS_STRNCPY
2087 mov $6, %cl
2088 sub $6, %r8
2089 jnz LABEL(strncpy_fill_tail)
2090 #endif
2091 ret
2093 .p2align 4
2094 LABEL(tail_6): /* 7 bytes */
2095 mov (%rsi), %ecx
2096 mov %ecx, (%rdi)
2097 mov 3(%rsi), %edx
2098 mov %edx,3(%rdi)
2099 #ifdef USE_AS_STRNCPY
2100 mov $7, %cl
2101 sub $7, %r8
2102 jnz LABEL(strncpy_fill_tail)
2103 #endif
2104 ret
2106 .p2align 4
2107 LABEL(tail_8): /* 9 bytes */
2108 mov (%rsi), %rcx
2109 mov %rcx, (%rdi)
2110 mov 5(%rsi), %edx
2111 mov %edx, 5(%rdi)
2112 #ifdef USE_AS_STRNCPY
2113 mov $9, %cl
2114 sub $9, %r8
2115 jnz LABEL(strncpy_fill_tail)
2116 #endif
2117 ret
2119 .p2align 4
2120 LABEL(AMD_exit_more_8):
2121 test %dh, %dh
2122 jz LABEL(AMD_exit_more_16)
2123 test $0x01, %dh
2124 jnz LABEL(tail_8)
2125 test $0x02, %dh
2126 jnz LABEL(tail_9)
2127 test $0x04, %dh
2128 jnz LABEL(tail_10)
2129 test $0x08, %dh
2130 jnz LABEL(tail_11)
2131 test $0x10, %dh
2132 jnz LABEL(tail_12)
2133 test $0x20, %dh
2134 jnz LABEL(tail_13)
2135 test $0x40, %dh
2136 jnz LABEL(tail_14)
2138 .p2align 4
2139 LABEL(tail_15): /* 16 bytes */
2140 mov (%rsi), %rcx
2141 mov %rcx, (%rdi)
2142 mov 8(%rsi), %rdx
2143 mov %rdx, 8(%rdi)
2144 #ifdef USE_AS_STRNCPY
2145 mov $16, %cl
2146 sub $16, %r8
2147 jnz LABEL(strncpy_fill_tail)
2148 #endif
2149 ret
2151 .p2align 4
2152 LABEL(tail_9): /* 10 bytes */
2153 mov (%rsi), %rcx
2154 mov %rcx, (%rdi)
2155 mov 6(%rsi), %edx
2156 mov %edx, 6(%rdi)
2157 #ifdef USE_AS_STRNCPY
2158 mov $10, %cl
2159 sub $10, %r8
2160 jnz LABEL(strncpy_fill_tail)
2161 #endif
2162 ret
2164 .p2align 4
2165 LABEL(tail_10): /* 11 bytes */
2166 mov (%rsi), %rcx
2167 mov %rcx, (%rdi)
2168 mov 7(%rsi), %edx
2169 mov %edx, 7(%rdi)
2170 #ifdef USE_AS_STRNCPY
2171 mov $11, %cl
2172 sub $11, %r8
2173 jnz LABEL(strncpy_fill_tail)
2174 #endif
2175 ret
2177 .p2align 4
2178 LABEL(tail_11): /* 12 bytes */
2179 mov (%rsi), %rcx
2180 mov %rcx, (%rdi)
2181 mov 8(%rsi), %edx
2182 mov %edx, 8(%rdi)
2183 #ifdef USE_AS_STRNCPY
2184 mov $12, %cl
2185 sub $12, %r8
2186 jnz LABEL(strncpy_fill_tail)
2187 #endif
2188 ret
2190 .p2align 4
2191 LABEL(tail_12): /* 13 bytes */
2192 mov (%rsi), %rcx
2193 mov %rcx, (%rdi)
2194 mov 5(%rsi), %rcx
2195 mov %rcx, 5(%rdi)
2196 #ifdef USE_AS_STRNCPY
2197 mov $13, %cl
2198 sub $13, %r8
2199 jnz LABEL(strncpy_fill_tail)
2200 #endif
2201 ret
2203 .p2align 4
2204 LABEL(tail_13): /* 14 bytes */
2205 mov (%rsi), %rcx
2206 mov %rcx, (%rdi)
2207 mov 6(%rsi), %rcx
2208 mov %rcx, 6(%rdi)
2209 #ifdef USE_AS_STRNCPY
2210 mov $14, %cl
2211 sub $14, %r8
2212 jnz LABEL(strncpy_fill_tail)
2213 #endif
2214 ret
2216 .p2align 4
2217 LABEL(tail_14): /* 15 bytes */
2218 mov (%rsi), %rcx
2219 mov %rcx, (%rdi)
2220 mov 7(%rsi), %rcx
2221 mov %rcx, 7(%rdi)
2222 #ifdef USE_AS_STRNCPY
2223 mov $15, %cl
2224 sub $15, %r8
2225 jnz LABEL(strncpy_fill_tail)
2226 #endif
2227 ret
2229 .p2align 4
2230 LABEL(AMD_exit_more_16):
2231 shr $16, %edx
2232 test %dl, %dl
2233 jz LABEL(AMD_exit_more_24)
2234 test $0x01, %dl
2235 jnz LABEL(tail_16)
2236 test $0x02, %dl
2237 jnz LABEL(tail_17)
2238 test $0x04, %dl
2239 jnz LABEL(tail_18)
2240 test $0x08, %dl
2241 jnz LABEL(tail_19)
2242 test $0x10, %dl
2243 jnz LABEL(tail_20)
2244 test $0x20, %dl
2245 jnz LABEL(tail_21)
2246 test $0x40, %dl
2247 jnz LABEL(tail_22)
2249 .p2align 4
2250 LABEL(tail_23): /* 24 bytes */
2251 mov (%rsi), %rcx
2252 mov %rcx, (%rdi)
2253 mov 8(%rsi), %rdx
2254 mov %rdx, 8(%rdi)
2255 mov 16(%rsi), %rcx
2256 mov %rcx, 16(%rdi)
2257 #ifdef USE_AS_STRNCPY
2258 mov $24, %cl
2259 sub $24, %r8
2260 jnz LABEL(strncpy_fill_tail)
2261 #endif
2262 ret
2264 .p2align 4
2265 LABEL(tail_16): /* 17 bytes */
2266 mov (%rsi), %rcx
2267 mov %rcx, (%rdi)
2268 mov 8(%rsi), %rdx
2269 mov %rdx, 8(%rdi)
2270 mov 16(%rsi), %cl
2271 mov %cl, 16(%rdi)
2272 #ifdef USE_AS_STRNCPY
2273 mov $17, %cl
2274 sub $17, %r8
2275 jnz LABEL(strncpy_fill_tail)
2276 #endif
2277 ret
2279 .p2align 4
2280 LABEL(tail_17): /* 18 bytes */
2281 mov (%rsi), %rcx
2282 mov %rcx, (%rdi)
2283 mov 8(%rsi), %rdx
2284 mov %rdx, 8(%rdi)
2285 mov 16(%rsi), %cx
2286 mov %cx, 16(%rdi)
2287 #ifdef USE_AS_STRNCPY
2288 mov $18, %cl
2289 sub $18, %r8
2290 jnz LABEL(strncpy_fill_tail)
2291 #endif
2292 ret
2294 .p2align 4
2295 LABEL(tail_18): /* 19 bytes */
2296 mov (%rsi), %rcx
2297 mov %rcx, (%rdi)
2298 mov 8(%rsi), %rdx
2299 mov %rdx, 8(%rdi)
2300 mov 15(%rsi), %ecx
2301 mov %ecx,15(%rdi)
2302 #ifdef USE_AS_STRNCPY
2303 mov $19, %cl
2304 sub $19, %r8
2305 jnz LABEL(strncpy_fill_tail)
2306 #endif
2307 ret
2309 .p2align 4
2310 LABEL(tail_19): /* 20 bytes */
2311 mov (%rsi), %rcx
2312 mov %rcx, (%rdi)
2313 mov 8(%rsi), %rdx
2314 mov %rdx, 8(%rdi)
2315 mov 16(%rsi), %ecx
2316 mov %ecx, 16(%rdi)
2317 #ifdef USE_AS_STRNCPY
2318 mov $20, %cl
2319 sub $20, %r8
2320 jnz LABEL(strncpy_fill_tail)
2321 #endif
2322 ret
2324 .p2align 4
2325 LABEL(tail_20): /* 21 bytes */
2326 mov (%rsi), %rcx
2327 mov %rcx, (%rdi)
2328 mov 8(%rsi), %rdx
2329 mov %rdx, 8(%rdi)
2330 mov 13(%rsi), %rcx
2331 mov %rcx, 13(%rdi)
2332 #ifdef USE_AS_STRNCPY
2333 mov $21, %cl
2334 sub $21, %r8
2335 jnz LABEL(strncpy_fill_tail)
2336 #endif
2337 ret
2339 .p2align 4
2340 LABEL(tail_21): /* 22 bytes */
2341 mov (%rsi), %rcx
2342 mov %rcx, (%rdi)
2343 mov 8(%rsi), %rdx
2344 mov %rdx, 8(%rdi)
2345 mov 14(%rsi), %rcx
2346 mov %rcx, 14(%rdi)
2347 #ifdef USE_AS_STRNCPY
2348 mov $22, %cl
2349 sub $22, %r8
2350 jnz LABEL(strncpy_fill_tail)
2351 #endif
2352 ret
2354 .p2align 4
2355 LABEL(tail_22): /* 23 bytes */
2356 mov (%rsi), %rcx
2357 mov %rcx, (%rdi)
2358 mov 8(%rsi), %rdx
2359 mov %rdx, 8(%rdi)
2360 mov 15(%rsi), %rcx
2361 mov %rcx, 15(%rdi)
2362 #ifdef USE_AS_STRNCPY
2363 mov $23, %cl
2364 sub $23, %r8
2365 jnz LABEL(strncpy_fill_tail)
2366 #endif
2367 ret
2369 .p2align 4
2370 LABEL(AMD_exit_more_24):
2371 test $0x01, %dh
2372 jnz LABEL(tail_24)
2373 test $0x02, %dh
2374 jnz LABEL(tail_25)
2375 test $0x04, %dh
2376 jnz LABEL(tail_26)
2377 test $0x08, %dh
2378 jnz LABEL(tail_27)
2379 test $0x10, %dh
2380 jnz LABEL(tail_28)
2381 test $0x20, %dh
2382 jnz LABEL(tail_29)
2383 test $0x40, %dh
2384 jnz LABEL(tail_30)
2386 .p2align 4
2387 LABEL(tail_31): /* 32 bytes */
2388 mov (%rsi), %rcx
2389 mov %rcx, (%rdi)
2390 mov 8(%rsi), %rdx
2391 mov %rdx, 8(%rdi)
2392 mov 16(%rsi), %rcx
2393 mov %rcx, 16(%rdi)
2394 mov 24(%rsi), %rdx
2395 mov %rdx, 24(%rdi)
2396 #ifdef USE_AS_STRNCPY
2397 mov $32, %cl
2398 sub $32, %r8
2399 jnz LABEL(strncpy_fill_tail)
2400 #endif
2401 ret
2403 .p2align 4
2404 LABEL(tail_24): /* 25 bytes */
2405 mov (%rsi), %rcx
2406 mov %rcx, (%rdi)
2407 mov 8(%rsi), %rdx
2408 mov %rdx, 8(%rdi)
2409 mov 16(%rsi), %rcx
2410 mov %rcx, 16(%rdi)
2411 mov 21(%rsi), %edx
2412 mov %edx, 21(%rdi)
2413 #ifdef USE_AS_STRNCPY
2414 mov $25, %cl
2415 sub $25, %r8
2416 jnz LABEL(strncpy_fill_tail)
2417 #endif
2418 ret
2420 .p2align 4
2421 LABEL(tail_25): /* 26 bytes */
2422 mov (%rsi), %rcx
2423 mov %rcx, (%rdi)
2424 mov 8(%rsi), %rdx
2425 mov %rdx, 8(%rdi)
2426 mov 16(%rsi), %rcx
2427 mov %rcx, 16(%rdi)
2428 mov 22(%rsi), %edx
2429 mov %edx, 22(%rdi)
2430 #ifdef USE_AS_STRNCPY
2431 mov $26, %cl
2432 sub $26, %r8
2433 jnz LABEL(strncpy_fill_tail)
2434 #endif
2435 ret
2437 .p2align 4
2438 LABEL(tail_26): /* 27 bytes */
2439 mov (%rsi), %rcx
2440 mov %rcx, (%rdi)
2441 mov 8(%rsi), %rdx
2442 mov %rdx, 8(%rdi)
2443 mov 16(%rsi), %rcx
2444 mov %rcx, 16(%rdi)
2445 mov 23(%rsi), %edx
2446 mov %edx, 23(%rdi)
2447 #ifdef USE_AS_STRNCPY
2448 mov $27, %cl
2449 sub $27, %r8
2450 jnz LABEL(strncpy_fill_tail)
2451 #endif
2452 ret
2454 .p2align 4
2455 LABEL(tail_27): /* 28 bytes */
2456 mov (%rsi), %rcx
2457 mov %rcx, (%rdi)
2458 mov 8(%rsi), %rdx
2459 mov %rdx, 8(%rdi)
2460 mov 16(%rsi), %rcx
2461 mov %rcx, 16(%rdi)
2462 mov 24(%rsi), %edx
2463 mov %edx, 24(%rdi)
2464 #ifdef USE_AS_STRNCPY
2465 mov $28, %cl
2466 sub $28, %r8
2467 jnz LABEL(strncpy_fill_tail)
2468 #endif
2469 ret
2471 .p2align 4
2472 LABEL(tail_28): /* 29 bytes */
2473 mov (%rsi), %rcx
2474 mov %rcx, (%rdi)
2475 mov 8(%rsi), %rdx
2476 mov %rdx, 8(%rdi)
2477 mov 16(%rsi), %rcx
2478 mov %rcx, 16(%rdi)
2479 mov 21(%rsi), %rdx
2480 mov %rdx, 21(%rdi)
2481 #ifdef USE_AS_STRNCPY
2482 mov $29, %cl
2483 sub $29, %r8
2484 jnz LABEL(strncpy_fill_tail)
2485 #endif
2486 ret
2488 .p2align 4
2489 LABEL(tail_29): /* 30 bytes */
2490 mov (%rsi), %rcx
2491 mov %rcx, (%rdi)
2492 mov 8(%rsi), %rdx
2493 mov %rdx, 8(%rdi)
2494 mov 16(%rsi), %rcx
2495 mov %rcx, 16(%rdi)
2496 mov 22(%rsi), %rdx
2497 mov %rdx, 22(%rdi)
2498 #ifdef USE_AS_STRNCPY
2499 mov $30, %cl
2500 sub $30, %r8
2501 jnz LABEL(strncpy_fill_tail)
2502 #endif
2503 ret
2505 .p2align 4
2506 LABEL(tail_30): /* 31 bytes */
2507 mov (%rsi), %rcx
2508 mov %rcx, (%rdi)
2509 mov 8(%rsi), %rdx
2510 mov %rdx, 8(%rdi)
2511 mov 16(%rsi), %rcx
2512 mov %rcx, 16(%rdi)
2513 mov 23(%rsi), %rdx
2514 mov %rdx, 23(%rdi)
2515 #ifdef USE_AS_STRNCPY
2516 mov $31, %cl
2517 sub $31, %r8
2518 jnz LABEL(strncpy_fill_tail)
2519 #endif
2520 ret
2522 .pushsection .rodata
2523 .p2align 4
2524 LABEL(tail_table):
2525 .int LABEL(tail_0) - LABEL(tail_table) /* 1 byte */
2526 .int LABEL(tail_1) - LABEL(tail_table)
2527 .int LABEL(tail_2) - LABEL(tail_table)
2528 .int LABEL(tail_3) - LABEL(tail_table)
2529 .int LABEL(tail_4) - LABEL(tail_table)
2530 .int LABEL(tail_5) - LABEL(tail_table)
2531 .int LABEL(tail_6) - LABEL(tail_table)
2532 .int LABEL(tail_7) - LABEL(tail_table)
2533 .int LABEL(tail_8) - LABEL(tail_table)
2534 .int LABEL(tail_9) - LABEL(tail_table)
2535 .int LABEL(tail_10) - LABEL(tail_table)
2536 .int LABEL(tail_11) - LABEL(tail_table)
2537 .int LABEL(tail_12) - LABEL(tail_table)
2538 .int LABEL(tail_13) - LABEL(tail_table)
2539 .int LABEL(tail_14) - LABEL(tail_table)
2540 .int LABEL(tail_15) - LABEL(tail_table)
2541 .int LABEL(tail_16) - LABEL(tail_table)
2542 .int LABEL(tail_17) - LABEL(tail_table)
2543 .int LABEL(tail_18) - LABEL(tail_table)
2544 .int LABEL(tail_19) - LABEL(tail_table)
2545 .int LABEL(tail_20) - LABEL(tail_table)
2546 .int LABEL(tail_21) - LABEL(tail_table)
2547 .int LABEL(tail_22) - LABEL(tail_table)
2548 .int LABEL(tail_23) - LABEL(tail_table)
2549 .int LABEL(tail_24) - LABEL(tail_table)
2550 .int LABEL(tail_25) - LABEL(tail_table)
2551 .int LABEL(tail_26) - LABEL(tail_table)
2552 .int LABEL(tail_27) - LABEL(tail_table)
2553 .int LABEL(tail_28) - LABEL(tail_table)
2554 .int LABEL(tail_29) - LABEL(tail_table)
2555 .int LABEL(tail_30) - LABEL(tail_table)
2556 .int LABEL(tail_31) - LABEL(tail_table) /* 32 bytes */
2558 .p2align 4
2559 LABEL(unaligned_table):
2560 .int LABEL(ashr_0) - LABEL(unaligned_table)
2561 .int LABEL(ashr_1) - LABEL(unaligned_table)
2562 .int LABEL(ashr_2) - LABEL(unaligned_table)
2563 .int LABEL(ashr_3) - LABEL(unaligned_table)
2564 .int LABEL(ashr_4) - LABEL(unaligned_table)
2565 .int LABEL(ashr_5) - LABEL(unaligned_table)
2566 .int LABEL(ashr_6) - LABEL(unaligned_table)
2567 .int LABEL(ashr_7) - LABEL(unaligned_table)
2568 .int LABEL(ashr_8) - LABEL(unaligned_table)
2569 .int LABEL(ashr_9) - LABEL(unaligned_table)
2570 .int LABEL(ashr_10) - LABEL(unaligned_table)
2571 .int LABEL(ashr_11) - LABEL(unaligned_table)
2572 .int LABEL(ashr_12) - LABEL(unaligned_table)
2573 .int LABEL(ashr_13) - LABEL(unaligned_table)
2574 .int LABEL(ashr_14) - LABEL(unaligned_table)
2575 .int LABEL(ashr_15) - LABEL(unaligned_table)
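/*
 * Both tables above hold 32-bit offsets relative to the table base rather
 * than absolute addresses, keeping this .rodata position independent; the
 * dispatch sites reload the base with a %rip-relative lea, fetch the
 * offset with movslq, and add the two before the indirect jmp.
 */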
2576 .popsection
2578 #ifdef USE_AS_STRNCPY
2579 SET_SIZE(strncpy)
2580 #else
2581 SET_SIZE(strcpy) /* (char *, const char *) */
2582 #endif