clarify the purpose of this project
[nyanglibc.git] / string / memchr.s
blob 96db3bde831cc955309677595c6efc673556996f
#-----------------------------------------------------------------------
# void *memchr(const void *s, int c, size_t n)
#
# Syntax: GAS, AT&T operand order (op src, dst).
# ABI:    System V AMD64 register arguments.
# In:     rdi = s, esi = c (only the low byte is used), rdx = n
# Out:    rax = address of the first byte equal to (unsigned char)c
#               within s[0 .. n-1], or 0 when there is none.
# Clobb:  rax, rcx, rdx, rdi, xmm0-xmm4, flags.  No stack usage.
#
# Strategy: broadcast c into all 16 bytes of xmm1, then compare 16-byte
# blocks with pcmpeqb and turn each result into a bit mask with
# pmovmskb (bit i set <=> byte i matched).  Apart from one unaligned
# load that is only issued when it stays inside a 64-byte line, every
# load is 16-byte aligned, so whole blocks may be read even when only
# part of them belongs to the buffer; the "_1" exit paths then check
# the match index against the remaining length before returning.
#-----------------------------------------------------------------------
	.text
	.globl	memchr
	.type	memchr,@function
	.align	1<<4
memchr:
	movd	%esi, %xmm1
	mov	%edi, %ecx		# ecx = low address bits of s
	punpcklbw %xmm1, %xmm1		# c -> 2 copies
	test	%rdx, %rdx
	jz	.Lreturn_null		# n == 0: nothing to search
	punpcklbw %xmm1, %xmm1		# c -> 4 copies (low dword)
	and	$63, %ecx		# ecx = offset of s inside its 64-byte line
	pshufd	$0, %xmm1, %xmm1	# xmm1 = c in all 16 bytes
	cmp	$48, %ecx
	ja	.Lcrosscache		# a 16-byte load at s would cross the line

	# Offset <= 48: the unaligned 16-byte load stays inside the line.
	movdqu	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	.Lmatches_1		# match index still has to be checked against n
	sub	$16, %rdx
	jbe	.Lreturn_null		# n <= 16 and no match
	add	$16, %rdi
	and	$15, %ecx		# ecx = s & 15
	and	$-16, %rdi		# rdi = first 16-byte boundary after s
	add	%rcx, %rdx		# re-count the bytes between that boundary and s+16
	sub	$64, %rdx		# rdx is kept biased by -64 from here on
	jbe	.Lexit_loop		# <= 64 bytes left from rdi
	jmp	.Lloop_prolog

	.p2align 4
.Lcrosscache:
	and	$15, %ecx		# ecx = s & 15
	and	$-16, %rdi		# rdi = s rounded down to 16 bytes
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	sar	%cl, %eax		# drop mask bits of the bytes that precede s
	test	%eax, %eax
	je	.Lunaligned_no_match
	bsf	%eax, %eax		# rax = match index relative to s
	sub	%rax, %rdx
	jbe	.Lreturn_null		# index >= n: match lies past the buffer
	add	%rdi, %rax
	add	%rcx, %rax		# rax = aligned base + (s & 15) + index
	ret

	.p2align 4
.Lunaligned_no_match:
	# rcx < 16.  Compute n - (16 - rcx) rather than (n + rcx) - 16 so
	# that a very large n cannot wrap around.
	neg	%rcx
	add	$16, %rcx		# rcx = bytes of the first block that were searched
	sub	%rcx, %rdx
	jbe	.Lreturn_null		# buffer ended inside the first block
	add	$16, %rdi
	sub	$64, %rdx		# rdx is kept biased by -64 from here on
	jbe	.Lexit_loop

	.p2align 4
.Lloop_prolog:
	# rdi is 16-byte aligned and more than 64 bytes remain from rdi:
	# check four blocks without any length test.
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	.Lmatches
	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	.Lmatches16
	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	.Lmatches32
	movdqa	48(%rdi), %xmm4
	pcmpeqb	%xmm1, %xmm4
	add	$64, %rdi		# advanced before the test; .Lmatches0 undoes it with -16
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	.Lmatches0
	test	$0x3f, %rdi
	jz	.Lalign64_loop		# already 64-byte aligned

	# Not yet 64-byte aligned: check one more 64-byte group, then
	# round rdi down to a 64-byte boundary for the main loop.
	sub	$64, %rdx
	jbe	.Lexit_loop
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	.Lmatches
	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	.Lmatches16
	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	.Lmatches32
	movdqa	48(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	add	$64, %rdi
	test	%eax, %eax
	jnz	.Lmatches0
	mov	%rdi, %rcx
	and	$-64, %rdi		# rdi = 64-byte aligned (moves back by rdi & 63)
	and	$63, %ecx
	add	%rcx, %rdx		# give the stepped-back bytes back to the counter

	.p2align 4
.Lalign64_loop:
	sub	$64, %rdx
	jbe	.Lexit_loop		# <= 64 bytes left: finish with length checks
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4
	pmaxub	%xmm0, %xmm3		# OR the four results together;
	pmaxub	%xmm2, %xmm4		# xmm0 and xmm2 stay intact,
	pmaxub	%xmm3, %xmm4		# xmm3 and xmm4 are overwritten
	pmovmskb %xmm4, %eax
	add	$64, %rdi
	test	%eax, %eax
	jz	.Lalign64_loop

	# A match exists somewhere in the 64 bytes just examined.
	sub	$64, %rdi
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	.Lmatches
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	.Lmatches16
	movdqa	32(%rdi), %xmm3		# redo block 2; its result was merged away above
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	48(%rdi), %xmm1		# redo block 3; xmm1 is no longer needed afterwards
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	.Lmatches32
	pmovmskb %xmm1, %eax		# not in blocks 0-2, so it is in block 3
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
.Lexit_loop:
	# Every entry is "sub $64, %rdx; jbe", so rdx = (bytes left from
	# rdi) - 64, in [-64, 0]; 32-bit arithmetic on edx is sufficient.
	add	$32, %edx		# edx = bytes left - 32
	jle	.Lexit_loop_32		# <= 32 bytes left
	movdqa	(%rdi), %xmm0		# more than 32 left: blocks 0 and 1 are fully inside
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	.Lmatches
	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	.Lmatches16
	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	.Lmatches32_1		# rdx = bytes valid in block 2
	sub	$16, %edx		# edx = bytes valid in block 3
	jle	.Lreturn_null
	pcmpeqb	48(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	.Lmatches48_1
	xor	%eax, %eax
	ret

	.p2align 4
.Lexit_loop_32:
	add	$32, %edx		# edx = bytes left from rdi (0..32); upper rdx zeroed
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	.Lmatches_1
	sub	$16, %edx		# edx = bytes valid in block 1
	jbe	.Lreturn_null
	pcmpeqb	16(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	.Lmatches16_1
	xor	%eax, %eax
	ret

	# ---- Exits without a length check: the block is fully in range.
	.p2align 4
.Lmatches0:
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax	# rdi was already advanced by 64: block 3 is at rdi-16
	ret

	.p2align 4
.Lmatches:
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
.Lmatches16:
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
.Lmatches32:
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	# ---- Exits with a length check: rdx = bytes of this block that
	# ---- belong to the buffer; a match at index >= rdx is rejected.
	.p2align 4
.Lmatches_1:
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	.Lreturn_null
	add	%rdi, %rax
	ret

	.p2align 4
.Lmatches16_1:
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	.Lreturn_null
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
.Lmatches32_1:
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	.Lreturn_null
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
.Lmatches48_1:
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	.Lreturn_null
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
.Lreturn_null:
	xor	%eax, %eax		# rax = NULL
	ret
	.size	memchr,.-memchr

	# __memchr is an alias for the same entry point.
	.globl	__memchr
	.set	__memchr,memchr