import less(1)
[unleashed/tickless.git] / usr / src / lib / libc / amd64 / gen / strlen.s
blob02a93b50c42bb01a31b8e911ed62620f79907efb
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, Intel Corporation
 * All rights reserved.
 */

/*
 * strlen - calculate the length of string
 */
#include "SYS.h"
#include "proc64_id.h"

#define LABEL(s) .strlen##s

	/*
	 * This implementation uses SSE instructions to compare up to 16 bytes
	 * at a time looking for the end of string (null char).
	 *
	 * Register usage:
	 *	%rdi	s on entry; negated at LABEL(exit) so that it can be
	 *		added to %rsi to form a byte offset from s
	 *	%rsi	address of the NEXT 16-byte block to examine (it is
	 *		always advanced past a block before the result of
	 *		examining that block is tested)
	 *	%rcx	s & 15 (misalignment); later the bit index from bsf
	 *	%edx	pmovmskb mask: bit i set => byte i of the block is null
	 *	%xmm0	16 zero bytes going into each pcmpeqb
	 *	%rax	result: offset of the first null byte from s
	 */
	ENTRY(strlen)			/* (const char *s) */
	mov	%rdi, %rsi		/* keep original %rdi value */
	mov	%rsi, %rcx
	pxor	%xmm0, %xmm0		/* 16 null chars */
	and	$15, %rcx
	jz	LABEL(align16_loop)	/* string is 16 byte aligned */

	/*
	 * Unaligned case. Round down to 16-byte boundary before comparing
	 * 16 bytes for a null char. The code then compensates for any extra chars
	 * preceding the start of the string.
	 */
LABEL(unalign16):
	and	$0xfffffffffffffff0, %rsi

	pcmpeqb	(%rsi), %xmm0
	lea	16(%rdi), %rsi		/* so that -16(%rdi,%rsi) is 0 at exit */
	pmovmskb %xmm0, %edx

	shr	%cl, %edx	/* Compensate for bytes preceding the string */
	test	%edx, %edx
	jnz	LABEL(exit)
	sub	%rcx, %rsi	/* no null, adjust to next 16-byte boundary */
	pxor	%xmm0, %xmm0	/* clear xmm0, may have been changed... */

	/*
	 * Main loop, unrolled four times. A compare that finds no null byte
	 * leaves %xmm0 all zero, so it does not need to be cleared again
	 * between compares.
	 */
	.p2align 4
LABEL(align16_loop):			/* 16 byte aligned */
	pcmpeqb	(%rsi), %xmm0		/* look for null bytes */
	pmovmskb %xmm0, %edx		/* move each byte mask of %xmm0 to edx */

	add	$16, %rsi		/* prepare to search next 16 bytes */
	test	%edx, %edx		/* if no null byte, %edx must be 0 */
	jnz	LABEL(exit)		/* found a null */

	pcmpeqb	(%rsi), %xmm0
	pmovmskb %xmm0, %edx
	add	$16, %rsi
	test	%edx, %edx
	jnz	LABEL(exit)

	pcmpeqb	(%rsi), %xmm0
	pmovmskb %xmm0, %edx
	add	$16, %rsi
	test	%edx, %edx
	jnz	LABEL(exit)

	pcmpeqb	(%rsi), %xmm0
	pmovmskb %xmm0, %edx
	add	$16, %rsi
	test	%edx, %edx
	jz	LABEL(align16_loop)

	/*
	 * A null byte was found. %edx is non-zero and %rsi is 16 bytes past
	 * the position that bit 0 of %edx corresponds to, so that position's
	 * offset from s is %rsi - 16 - s.
	 */
	.p2align 4
LABEL(exit):
	neg	%rdi
	/*
	 * Check to see if BSF is fast on this processor. If not, use a different
	 * exit tail to find first bit set indicating null byte match.
	 */
	testl	$USE_BSF, .memops_method(%rip)
	jz	LABEL(AMD_exit)

	lea	-16(%rdi, %rsi), %rax	/* calculate exact offset */
	bsf	%edx, %ecx	/* Least significant 1 bit is index of null */
	lea	(%rax, %rcx),%rax
	ret

	/*
	 * This exit tail does not use the bsf instruction.
	 *
	 * The mask is scanned one bit at a time, low byte first. If %dl is
	 * non-zero and none of its bits 0-6 is set, bit 7 must be set, hence
	 * the unconditional add of 7 at the end of the chain.
	 */
	.p2align 4
LABEL(AMD_exit):
	lea	-16(%rdi, %rsi), %rax
	test	%dl, %dl
	jz	LABEL(exit_high)	/* first null is in bytes 8-15 */
	test	$0x01, %dl
	jnz	LABEL(exit_tail0)

	test	$0x02, %dl
	jnz	LABEL(exit_tail1)

	.p2align 4
	test	$0x04, %dl
	jnz	LABEL(exit_tail2)

	test	$0x08, %dl
	jnz	LABEL(exit_tail3)

	test	$0x10, %dl
	jnz	LABEL(exit_tail4)

	test	$0x20, %dl
	jnz	LABEL(exit_tail5)

	test	$0x40, %dl
	jnz	LABEL(exit_tail6)
	add	$7, %rax
	ret

	/*
	 * %dl was zero, so the first set bit is in %dh. Bias the result by 8
	 * and reuse the same tails; falling out of the chain means bit 7 of
	 * %dh (byte 15).
	 */
	.p2align 4
LABEL(exit_high):
	add	$8, %rax
	test	$0x01, %dh
	jnz	LABEL(exit_tail0)

	test	$0x02, %dh
	jnz	LABEL(exit_tail1)

	test	$0x04, %dh
	jnz	LABEL(exit_tail2)

	test	$0x08, %dh
	jnz	LABEL(exit_tail3)

	test	$0x10, %dh
	jnz	LABEL(exit_tail4)

	test	$0x20, %dh
	jnz	LABEL(exit_tail5)

	test	$0x40, %dh
	jnz	LABEL(exit_tail6)
	add	$7, %rax
	ret

	/*
	 * Exit tails: add the index of the null byte within its 8-byte half
	 * to %rax and return. Each tail must end in its own ret; without it
	 * control would fall through into the following tail and corrupt the
	 * result.
	 */
	.p2align 4
LABEL(exit_tail0):
	xor	%ecx, %ecx		/* %rax already holds the result */
	ret

	.p2align 4
LABEL(exit_tail1):
	add	$1, %rax
	ret

	.p2align 4
LABEL(exit_tail2):
	add	$2, %rax
	ret

	.p2align 4
LABEL(exit_tail3):
	add	$3, %rax
	ret

	.p2align 4
LABEL(exit_tail4):
	add	$4, %rax
	ret

	.p2align 4
LABEL(exit_tail5):
	add	$5, %rax
	ret

	.p2align 4
LABEL(exit_tail6):
	add	$6, %rax
	ret
	SET_SIZE(strlen)