clarify the purpose of this project
[nyanglibc.git] / string / memrchr.s
blob 266f854b82f94cca5311126388f2a5149b865dcc
/*----------------------------------------------------------------------
 * void *__memrchr(const void *s, int c, size_t n)
 *
 * Syntax: GNU as, AT&T operand order.   ABI: System V AMD64.
 *
 * Scans the n bytes starting at s backwards for the byte (unsigned char)c
 * and returns the address of the last occurrence, or NULL if there is none.
 *
 * In:    %rdi = s, %esi = c (only the low byte is used), %rdx = n
 * Out:   %rax = address of the last matching byte, or 0
 * Clobb: %rax, %rcx, %rdx, %rsi, %rdi, %xmm0-%xmm4, flags
 * Stack: none (leaf function)
 *
 * Register roles inside the function:
 *   %xmm1 = c replicated into all 16 byte lanes
 *   %rdi  = base of the chunk currently being examined
 *   %rdx  = byte budget: how many bytes of [s, %rdi + 64) are still
 *           inside the caller's buffer
 *
 * Aligned 16-byte loads may touch bytes just outside [s, s + n); they
 * never cross a 16-byte boundary, and every candidate match is masked or
 * bounds-checked before it is returned.
 *
 * Every exit path ends in its own `ret`: the handlers are laid out back
 * to back, so a path without one would fall through into the next label.
 *--------------------------------------------------------------------*/
        .text
        .globl  __memrchr
        .type   __memrchr,@function
        .align  1<<4
__memrchr:
        movd    %esi, %xmm1
        sub     $16, %rdx                       # rdx = n - 16
        jbe     .Llength_less16                 # n <= 16: masked small-buffer path

        /* Broadcast the low byte of c to all 16 lanes of xmm1.  */
        punpcklbw %xmm1, %xmm1
        punpcklbw %xmm1, %xmm1
        add     %rdx, %rdi                      # rdi = s + n - 16 (last 16 bytes)
        pshufd  $0, %xmm1, %xmm1

        /* Check the (possibly unaligned) final 16 bytes first.  */
        movdqu  (%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     .Lmatches0

        /* Step back 64 bytes; if that address is not 16-byte aligned,
           round it up to the next 16-byte boundary and grow the budget
           by the bytes that will be examined a second time.  */
        sub     $64, %rdi
        mov     %edi, %ecx
        and     $15, %ecx
        jz      .Lloop_prolog
        add     $16, %rdi
        add     $16, %rdx
        and     $-16, %rdi
        sub     %rcx, %rdx

        .p2align 4
.Lloop_prolog:
        /* First 64-byte chunk, highest 16 bytes first.  */
        sub     $64, %rdx
        jbe     .Lexit_loop                     # chunk is not entirely inside the buffer
        movdqa  48(%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     .Lmatches48
        movdqa  32(%rdi), %xmm2
        pcmpeqb %xmm1, %xmm2
        pmovmskb %xmm2, %eax
        test    %eax, %eax
        jnz     .Lmatches32
        movdqa  16(%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        test    %eax, %eax
        jnz     .Lmatches16
        movdqa  (%rdi), %xmm4
        pcmpeqb %xmm1, %xmm4
        pmovmskb %xmm4, %eax
        test    %eax, %eax
        jnz     .Lmatches0

        /* Second 64-byte chunk, same pattern.  */
        sub     $64, %rdi
        sub     $64, %rdx
        jbe     .Lexit_loop
        movdqa  48(%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     .Lmatches48
        movdqa  32(%rdi), %xmm2
        pcmpeqb %xmm1, %xmm2
        pmovmskb %xmm2, %eax
        test    %eax, %eax
        jnz     .Lmatches32
        movdqa  16(%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        test    %eax, %eax
        jnz     .Lmatches16
        movdqa  (%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        test    %eax, %eax
        jnz     .Lmatches0

        /* Bring rdi to a 64-byte boundary for the main loop, again
           rounding up and crediting the re-examined bytes to rdx.  */
        mov     %edi, %ecx
        and     $63, %ecx
        jz      .Lalign64_loop
        add     $64, %rdi
        add     $64, %rdx
        and     $-64, %rdi
        sub     %rcx, %rdx

        .p2align 4
.Lalign64_loop:
        /* Main loop: one 64-byte-aligned chunk per iteration.  The four
           compare results are merged with pmaxub so a single mask test
           tells whether any of the 64 bytes matched.  */
        sub     $64, %rdi
        sub     $64, %rdx
        jbe     .Lexit_loop
        movdqa  (%rdi), %xmm0
        movdqa  16(%rdi), %xmm2
        movdqa  32(%rdi), %xmm3
        movdqa  48(%rdi), %xmm4
        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm1, %xmm2
        pcmpeqb %xmm1, %xmm3
        pcmpeqb %xmm1, %xmm4
        pmaxub  %xmm3, %xmm0
        pmaxub  %xmm4, %xmm2
        pmaxub  %xmm0, %xmm2
        pmovmskb %xmm2, %eax
        test    %eax, %eax
        jz      .Lalign64_loop

        /* A match exists in this chunk; locate the highest 16-byte part
           that contains one.  xmm4 and xmm3 still hold the compare
           results for offsets 48 and 32.  */
        pmovmskb %xmm4, %eax
        test    %eax, %eax
        jnz     .Lmatches48
        pmovmskb %xmm3, %eax
        test    %eax, %eax
        jnz     .Lmatches32
        /* xmm0 and xmm2 were overwritten by the merge: redo offsets 16 and 0.  */
        movdqa  16(%rdi), %xmm2
        pcmpeqb %xmm1, %xmm2
        pcmpeqb (%rdi), %xmm1                   # xmm1 is dead after this point
        pmovmskb %xmm2, %eax
        test    %eax, %eax
        jnz     .Lmatches16
        /* Only offset 0 is left, so its mask is known to be non-zero.  */
        pmovmskb %xmm1, %eax
        bsr     %eax, %eax
        add     %rdi, %rax
        ret

        .p2align 4
.Lexit_loop:
        /* Tail: the chunk at rdi is only partly inside the buffer.
           After undoing the last subtraction, edx = number of bytes at
           the top of [rdi, rdi + 64) that belong to the buffer.  */
        add     $64, %edx
        cmp     $32, %edx
        jbe     .Lexit_loop_32
        /* More than 32 valid bytes: offsets 48 and 32 are fully valid.  */
        movdqa  48(%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     .Lmatches48
        movdqa  32(%rdi), %xmm2
        pcmpeqb %xmm1, %xmm2
        pmovmskb %xmm2, %eax
        test    %eax, %eax
        jnz     .Lmatches32
        movdqa  16(%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        test    %eax, %eax
        jnz     .Lmatches16_1                   # may lie before s: bounds-checked there
        cmp     $48, %edx
        jbe     .Lreturn_null                   # offset 0 part is entirely before s
        pcmpeqb (%rdi), %xmm1
        pmovmskb %xmm1, %eax
        test    %eax, %eax
        jnz     .Lmatches0_1
        xor     %eax, %eax
        ret

        .p2align 4
.Lexit_loop_32:
        /* At most 32 valid bytes: only offsets 48 and 32 can hold any.  */
        movdqa  48(%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     .Lmatches48_1
        cmp     $16, %edx
        jbe     .Lreturn_null
        pcmpeqb 32(%rdi), %xmm1
        pmovmskb %xmm1, %eax
        test    %eax, %eax
        jnz     .Lmatches32_1
        xor     %eax, %eax
        ret

        /* .LmatchesNN: eax = non-zero match mask for the 16 bytes at
           rdi + NN, all of which are inside the buffer.  bsr selects the
           highest set bit, i.e. the last matching byte.  */
        .p2align 4
.Lmatches0:
        bsr     %eax, %eax
        add     %rdi, %rax
        ret

        .p2align 4
.Lmatches16:
        bsr     %eax, %eax
        lea     16(%rax, %rdi), %rax
        ret

        .p2align 4
.Lmatches32:
        bsr     %eax, %eax
        lea     32(%rax, %rdi), %rax
        ret

        .p2align 4
.Lmatches48:
        bsr     %eax, %eax
        lea     48(%rax, %rdi), %rax
        ret

        /* .LmatchesNN_1: as above, but the 16 bytes at rdi + NN may begin
           before s.  With rdx = valid bytes at the top of the chunk, the
           match at chunk offset NN + idx is inside the buffer only when
           rdx - (64 - NN) + idx >= 0; the comparison is deliberately
           signed.  Checking the highest match suffices because every
           lower match lies even further before s.  */
        .p2align 4
.Lmatches0_1:
        bsr     %eax, %eax
        sub     $64, %rdx
        add     %rax, %rdx
        jl      .Lreturn_null
        add     %rdi, %rax
        ret

        .p2align 4
.Lmatches16_1:
        bsr     %eax, %eax
        sub     $48, %rdx
        add     %rax, %rdx
        jl      .Lreturn_null
        lea     16(%rdi, %rax), %rax
        ret

        .p2align 4
.Lmatches32_1:
        bsr     %eax, %eax
        sub     $32, %rdx
        add     %rax, %rdx
        jl      .Lreturn_null
        lea     32(%rdi, %rax), %rax
        ret

        .p2align 4
.Lmatches48_1:
        bsr     %eax, %eax
        sub     $16, %rdx
        add     %rax, %rdx
        jl      .Lreturn_null
        lea     48(%rdi, %rax), %rax
        ret

        .p2align 4
.Lreturn_null:
        xor     %eax, %eax
        ret

        .p2align 4
.Llength_less16_offset0:
        /* n <= 16 and s is 16-byte aligned; edx = n.  */
        test    %edx, %edx
        jz      .Lreturn_null                   # n == 0
        mov     %dl, %cl
        pcmpeqb (%rdi), %xmm1
        mov     $1, %edx
        sal     %cl, %edx
        sub     $1, %edx                        # edx = (1 << n) - 1: keep the first n lanes
        pmovmskb %xmm1, %eax
        and     %edx, %eax
        test    %eax, %eax
        jz      .Lreturn_null
        bsr     %eax, %eax
        add     %rdi, %rax
        ret

        .p2align 4
.Llength_less16:
        /* n <= 16.  Broadcast c, then work on aligned 16-byte blocks and
           mask away the lanes that fall outside [s, s + n).  */
        punpcklbw %xmm1, %xmm1
        punpcklbw %xmm1, %xmm1
        add     $16, %edx                       # edx = n again
        pshufd  $0, %xmm1, %xmm1
        mov     %edi, %ecx
        and     $15, %ecx                       # ecx = s & 15
        jz      .Llength_less16_offset0
        mov     %cl, %dh
        mov     %ecx, %esi                      # rsi = misalignment, added back at the end
        add     %dl, %dh
        and     $-16, %rdi                      # rdi = aligned block containing s
        sub     $16, %dh                        # dh = (s & 15) + n - 16
        ja      .Llength_less16_part2           # buffer spills into the next block

        /* Buffer lies entirely inside one aligned block.  */
        pcmpeqb (%rdi), %xmm1
        pmovmskb %xmm1, %eax
        sar     %cl, %eax                       # drop the lanes before s
        mov     %dl, %cl
        mov     $1, %edx
        sal     %cl, %edx
        sub     $1, %edx                        # edx = (1 << n) - 1
        and     %edx, %eax
        test    %eax, %eax
        jz      .Lreturn_null
        bsr     %eax, %eax
        add     %rdi, %rax
        add     %rsi, %rax
        ret

        .p2align 4
.Llength_less16_part2:
        /* Buffer straddles two aligned blocks; dh = number of buffer
           bytes in the upper block.  Search the upper block first.  */
        movdqa  16(%rdi), %xmm2
        pcmpeqb %xmm1, %xmm2
        pmovmskb %xmm2, %eax
        mov     %dh, %cl
        mov     $1, %edx
        sal     %cl, %edx
        sub     $1, %edx                        # keep only the lanes below s + n
        and     %edx, %eax
        test    %eax, %eax
        jnz     .Llength_less16_part2_return
        /* Nothing in the upper block: try the lower one, from s upward.  */
        pcmpeqb (%rdi), %xmm1
        pmovmskb %xmm1, %eax
        mov     %esi, %ecx
        sar     %cl, %eax                       # drop the lanes before s
        test    %eax, %eax
        jz      .Lreturn_null
        bsr     %eax, %eax
        add     %rdi, %rax
        add     %rsi, %rax
        ret

        .p2align 4
.Llength_less16_part2_return:
        bsr     %eax, %eax
        lea     16(%rax, %rdi), %rax
        ret

        .size   __memrchr,.-__memrchr
        .weak   memrchr
        memrchr = __memrchr