1 /* Copyright (C) 2000, 2002 Free Software Foundation, Inc.
2 Contributed by Richard Henderson (rth@tamu.edu)
3 EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */
21 /* Copy no more than COUNT bytes of the null-terminated string from SRC to DST.
24 This is an internal routine used by strncpy, stpncpy, and strncat.
25 As such, it uses special linkage conventions to make implementation
26 of these public functions more efficient.
34 Furthermore, COUNT may not be zero.
37 t0 = last word written
38 t8 = bitmask (with one bit set) indicating the last byte written
39 t10 = bitmask (with one bit set) indicating the byte position of
40 the end of the range specified by COUNT
41 a0 = unaligned address of the last *word* written
42 a2 = the number of full words left in COUNT
44 Furthermore, v0, a3-a5, t11, and t12 are untouched. */
53 /* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
54 doesn't like putting the entry point for a procedure somewhere in the
55 middle of the procedure descriptor. Work around this by putting the
56 aligned copy in its own procedure descriptor */
/* Head of the aligned copy: builds the first word to store and a
   null-byte mask for the first source word.
   NOTE(review): the label for this code is not visible in this view.
   The co-aligned setup code branches to stxncpy_aligned with t0 and t1
   loaded as described below, so that label presumably sits just above
   this point -- confirm against the full file.  */
65 /* On entry to this basic block:
66 t0 == the first destination word for masking back in
67 t1 == the first source word. */
69 /* Create the 1st output word and detect 0's in the 1st input word. */
/* t2 starts as all ones. mskqh clears its bytes below the byte offset
   held in the low bits of a1, so the ornot further down forces those
   leading bytes of the source word (which are not part of the string)
   to 0xff and they can never be taken for a null.  */
70 lda t2, -1 # E : build a mask against false zero
71 mskqh t2, a1, t2 # U : detection in the src word (stall)
72 mskqh t1, a1, t3 # U :
73 ornot t1, t2, t2 # E : (stall)
/* t0 keeps only the destination bytes below the offset and t3 holds the
   source bytes from the offset upwards. OR-ing the two gives the word
   to store. cmpbge against zero sets bit i of t7 when byte i of t2 is
   0x00.  */
75 mskql t0, a1, t0 # U : assemble the first output word
76 cmpbge zero, t2, t7 # E : bits set iff null found
77 or t0, t3, t0 # E : (stall)
/* NOTE(review): the embedded line numbers jump from 77 to 85, so the
   instructions that act on t7 and on the remaining count are missing
   from this view.  */
/* Aligned copy loop (only its tail survives in this view).  */
85 /* On entry to this basic block:
86 t0 == a source word not containing a null. */
/* NOTE(review): the two bullet lines below are what is left of a block
   comment about padding. Its opening and closing lines, the $a_loop
   label and the store, load and count instructions of the loop (the
   embedded numbering jumps from 91 to 101 and from 101 to 104) are not
   present in this view.  */
90 * separate store quads from load quads
91 * limit of 1 bcond/quad to permit training
/* cmpbge against zero sets bit i of t7 when byte i of t0 is 0x00.  */
101 cmpbge zero, t0, t7 # E :
/* No null byte in this word: go round the aligned loop again.  */
104 beq t7, $a_loop # U :
/* Aligned copy, final word: store the last word, merging it with the
   existing destination word when only part of it is to be written.
   NOTE(review): the $a_eos label itself is not visible in this view;
   the end-of-count stub after this code branches to it.  */
109 /* Take care of the final (partial) word store. At this point
110 the end-of-count bit is set in t7 iff it applies.
112 On entry to this basic block we have:
113 t0 == the source word containing the null
114 t7 == the cmpbge mask that found it. */
/* t7 AND (-t7) leaves only the lowest set bit of t7, i.e. the first
   terminating byte position. The result is kept in t8.  */
117 negq t7, t8 # E : find low bit set
118 and t7, t8, t8 # E : (stall)
119 /* For the sake of the cache, don't read a destination word
120 if we're not going to need it. */
/* Bit 7 set means the final byte is the last byte of the word, so the
   whole word comes from the source and can be stored as it is.  */
121 and t8, 0x80, t6 # E : (stall)
122 bne t6, 1f # U : (stall)
124 /* We're doing a partial word store and so need to combine
125 our source and original destination words. */
126 ldq_u t1, 0(a0) # L :
/* NOTE(review): embedded line 127 is missing from this view. On this
   fall-through path t6 is zero, so as shown t7 would simply equal t8.
   For t7 to select every byte up to and including the final one, t6
   would have to be t8 - 1 here, the way the first-word path of the
   unaligned entry code computes it with an explicit subq. Confirm
   against the full file.  */
128 or t8, t6, t7 # E : (stall)
129 zapnot t0, t7, t0 # U : clear src bytes > null (stall)
131 zap t1, t7, t1 # .. e1 : clear dst bytes <= null
132 or t0, t1, t0 # e1 : (stall)
/* Store the (possibly merged) final word and return through t9.  */
136 1: stq_u t0, 0(a0) # L :
137 ret (t9) # L0 : Latency=3
/* Aligned copy, count exhausted: hand over to the final-word code.  */
141 /* Add the end-of-count bit to the eos detection bitmask. */
/* NOTE(review): the label and the instruction(s) that merge the count
   bit into t7 (embedded lines 142-143) are missing from this view. Only
   the jump into the shared final-word handler survives.  */
144 br $a_eos # L0 : Latency=3
/* Entry sequence: derive the loop counter and the end-of-count byte mask
   from a2, then dispatch to the aligned or the unaligned copy.
   NOTE(review): the entry label and its directives are not visible in
   this view.  */
157 /* Are source and destination co-aligned? */
/* NOTE(review): embedded line 158 is missing, so the instruction that
   first sets t1 is not shown. Presumably it combines a0 and a1 so that
   the low three bits of t1 are zero exactly when both pointers have the
   same offset within a quadword -- confirm against the full file.  */
159 and a0, 7, t0 # E : find dest misalignment
160 and t1, 7, t1 # E : (stall)
161 addq a2, t0, a2 # E : bias count by dest misalignment (stall)
/* NOTE(review): the comment on the srl below speaks of (count - 1), but
   the subtraction is not visible here (the embedded numbering jumps from
   161 to 164).  */
/* Split the biased count into a byte position inside the final word
   (t2) and a whole-word loop counter (a2). t10 = 1 shifted left by t2
   marks the last counted byte.  */
164 and a2, 7, t2 # E : (stall)
165 srl a2, 3, a2 # U : a2 = loop counter = (count - 1)/8 (stall)
166 addq zero, 1, t10 # E :
168 sll t10, t2, t10 # U : t10 = bitmask of last count byte
/* Non-zero low bits in t1: take the unaligned path.  */
169 bne t1, $unaligned # U :
170 /* We are co-aligned; take care of a partial first word. */
171 ldq_u t1, 0(a1) # L : load first src word
/* t0 == 0 means the destination offset is zero, so no existing
   destination bytes need to be merged into the first word.  */
174 beq t0, stxncpy_aligned # U : avoid loading dest word if not needed
175 ldq_u t0, 0(a0) # L :
179 br stxncpy_aligned # .. e1 :
186 /* The source and destination are not co-aligned. Align the destination
187 and cope. We have to be very careful about not reading too much and causing a SEGV. */
/* NOTE(review): the label for this block is not visible in this view.
   The unaligned entry code branches to $u_head after setting up t0, t1
   and t6 as listed below, so this is presumably $u_head -- confirm
   against the full file.  */
192 /* We know just enough now to be able to assemble the first
193 full source word. We can still find a zero at the end of it
194 that prevents us from outputting the whole thing.
196 On entry to this basic block:
197 t0 == the first dest word, unmasked
198 t1 == the shifted low bits of the first source word
199 t6 == bytemask that is -1 in dest word bytes */
201 ldq_u t2, 8(a1) # L : Latency=3 load second src word
203 mskql t0, a0, t0 # U : mask trailing garbage in dst
/* extqh moves the low bytes of the second source word up so that they
   fill the byte positions the shifted first source word left empty.  */
204 extqh t2, a1, t4 # U : (3 cycle stall on t2)
206 or t1, t4, t1 # E : first aligned src word complete (stall)
207 mskqh t1, a0, t1 # U : mask leading garbage in src (stall)
208 or t0, t1, t0 # E : first output word complete (stall)
/* OR-ing in t6 forces the preserved destination bytes to 0xff, so the
   null test below reacts only to bytes that came from the source.  */
209 or t0, t6, t6 # E : mask original data for zero test (stall)
211 cmpbge zero, t6, t7 # E :
/* a2 == 0: the count runs out inside this first word.  */
212 beq a2, $u_eocfin # U :
/* A null was found in the first output word: finish with the
   final-word code.  */
216 bne t7, $u_final # U :
217 mskql t6, a1, t6 # U : mask out bits already seen
218 stq_u t0, 0(a0) # L : store first output word
/* NOTE(review): embedded lines 219-220 and 222-223 are missing here,
   the target label $u_late_head_exit is not in this view, and nothing
   visible advances a0, a1 or a2 before the main loop is entered.  */
221 cmpbge zero, t2, t7 # E : find nulls in second partial
224 bne t7, $u_late_head_exit # U :
226 /* Finally, we've got all the stupid leading edge cases taken care
227 of and we can set up to enter the main loop. */
/* Prime the loop registers: t1 gets the shifted high bytes of the word
   already read, t0 the shifted low bytes of the next word, and t7 the
   null mask of that next word.  */
228 extql t2, a1, t1 # U : position hi-bits of lo word
230 ldq_u t2, 8(a1) # L : read next high-order source word
233 extqh t2, a1, t0 # U : position lo-bits of hi word (stall)
234 cmpbge zero, t2, t7 # E :
238 /* Unaligned copy main loop. In order to avoid reading too much,
239 the loop is structured to detect zeros in aligned source words.
240 This has, unfortunately, effectively pulled half of a loop
241 iteration out into the head and half into the tail, but it does
242 prevent nastiness from accumulating in the very thing we want
243 to run as fast as possible.
245 On entry to this basic block:
246 t0 == the shifted low-order bits from the current source word
247 t1 == the shifted high-order bits from the previous source word
248 t2 == the unshifted current source word
250 We further know that t2 does not contain a null terminator. */
/* NOTE(review): the $u_loop label and several instructions are missing
   from this view (the embedded numbering skips 251-253, 257-258, 260,
   262-263 and 266). The store below uses offset -8, so a0 has
   presumably been advanced by 8 by one of the missing instructions. The
   pointer update for a1 and the test that leaves the loop at
   end-of-count are not shown either. Confirm against the full file.  */
254 or t0, t1, t0 # E : current dst word now complete
255 subq a2, 1, a2 # E : decrement word count
256 extql t2, a1, t1 # U : extract high bits for next time
259 stq_u t0, -8(a0) # L : save the current word
261 ldq_u t2, 8(a1) # L : Latency=3 load high word for next time
264 extqh t2, a1, t0 # U : extract low bits (2 cycle stall)
265 cmpbge zero, t2, t7 # E : test new word for eos
/* No null in the word just loaded: keep looping.  */
267 beq t7, $u_loop # U :
/* Unaligned copy, null seen in the source word just read.
   NOTE(review): the label for this block is not visible in this view.  */
269 /* We've found a zero somewhere in the source word we just read.
270 If it resides in the lower half, we have one (probably partial)
271 word to write out, and if it resides in the upper half, we
272 have one full and one partial word left to write out.
274 On entry to this basic block:
275 t0 == the shifted low-order bits from the current source word
276 t1 == the shifted high-order bits from the previous source word
277 t2 == the unshifted current source word. */
279 or t0, t1, t0 # E : first (partial) source word complete
281 cmpbge zero, t0, t7 # E : is the null in this first bit? (stall)
282 bne t7, $u_final # U : (stall)
/* The assembled word holds no null, so it is stored whole. The
   remaining high-order source bytes form one more (partial) word.  */
284 stq_u t0, 0(a0) # L : the null was in the high-order bits
/* NOTE(review): embedded lines 285-289 are missing. Nothing visible
   advances a0 or adjusts a2 between the store above and the code
   below.  */
290 extql t2, a1, t0 # U :
291 cmpbge zero, t0, t7 # E :
/* t6 is the null mask with the end-of-count bit (t10) added. cmoveq
   copies it into t7 only when a2 is zero, i.e. when no full words
   remain in the count.  */
292 or t7, t10, t6 # E : (stall)
293 cmoveq a2, t6, t7 # E : Latency=2, extra map slot (stall)
/* Unaligned copy, final word: same merge-and-store scheme as the aligned
   final-word code.
   NOTE(review): the $u_final label itself is not visible in this view.
   Several branches elsewhere in the file target it.  */
295 /* Take care of a final (probably partial) result word.
296 On entry to this basic block:
297 t0 == assembled source word
298 t7 == cmpbge mask that found the null. */
/* (-t7) AND t7 leaves only the lowest set bit, kept in t8.  */
300 negq t7, t6 # E : isolate low bit set
301 and t6, t7, t8 # E : (stall)
/* Bit 7 set means the final byte is the last byte of the word, so the
   word can be stored without reading the destination.  */
302 and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
303 bne t6, 1f # U : (stall)
305 ldq_u t1, 0(a0) # L :
/* NOTE(review): embedded line 306 is missing from this view. On this
   fall-through path t6 is zero, so as shown t7 would simply equal t8.
   For t7 to select every byte up to and including the final one, t6
   would have to be t8 - 1 here, the way the first-word path of the
   unaligned entry code computes it with an explicit subq. Confirm
   against the full file.  */
307 or t6, t8, t7 # E : (stall)
308 zapnot t0, t7, t0 # U : kill source bytes > null
310 zap t1, t7, t1 # U : kill dest bytes <= null
311 or t0, t1, t0 # E : (stall)
/* Store the (possibly merged) final word and return through t9.  */
315 1: stq_u t0, 0(a0) # L :
316 ret (t9) # L0 : Latency=3
318 /* Got to end-of-count before end of string.
319 On entry to this basic block:
320 t1 == the shifted high-order bits from the previous source word */
/* NOTE(review): embedded lines 321-322 are missing, so the label and
   the instruction that sets t6 are not shown. Given the shift below,
   t6 is presumably the source byte offset (a1 AND 7) -- confirm against
   the full file.  */
323 sll t10, t6, t6 # U : (stall)
324 and t6, 0xff, t6 # E : (stall)
/* When the branch is taken the final source load below is skipped.
   Presumably this is the case where every byte up to the count limit is
   already present in t1.  */
325 bne t6, 1f # U : (stall)
327 ldq_u t2, 8(a1) # L : load final src word
329 extqh t2, a1, t0 # U : extract low bits for last word (stall)
330 or t1, t0, t1 # E : (stall)
332 1: cmpbge zero, t1, t7 # E :
/* NOTE(review): embedded lines 333 and 336 are missing. The final-word
   code expects the assembled word in t0 and the end mask in t7, but
   nothing visible here copies t1 to t0 or adds the count bit t10 to
   t7.  */
335 $u_eocfin: # end-of-count, final word
337 br $u_final # L0 : Latency=3
339 /* Unaligned copy entry point. */
/* NOTE(review): the $unaligned label (target of the branch in the
   co-alignment check) is presumably at this point but is not visible in
   this view.  */
343 ldq_u t1, 0(a1) # L : load first source word
344 and a0, 7, t4 # E : find dest misalignment
345 and a1, 7, t5 # E : find src misalignment
346 /* Conditionally load the first destination word and a bytemask
347 with 0xff indicating that the destination byte is sacrosanct. */
/* NOTE(review): embedded lines 348-351 and 353-354 are missing. The
   instruction that initialises t6 and the branch that skips to local
   label 1 are not shown.  */
352 ldq_u t0, 0(a0) # L :
355 mskql t6, a0, t6 # U :
358 1: subq a1, t4, a1 # E : sub dest misalignment from src addr
360 /* If source misalignment is larger than dest misalignment, we need
361 extra startup checks to avoid SEGV. */
/* t8 = 1 when the destination offset (t4) is smaller than the source
   offset (t5). When it is 0 the extra checks are skipped.  */
363 cmplt t4, t5, t8 # E :
364 extql t1, a1, t1 # U : shift src into place
365 lda t2, -1 # E : for creating masks later
366 beq t8, $u_head # U : (stall)
/* t2 becomes a validity mask. After the same shift as the data, its
   zero bytes mark positions of t1 that hold no real source bytes. t3
   flags those positions so they can be removed from t7. cmoveq folds
   the end-of-count bit (t10, via t5) into t7 only when a2 is zero.  */
368 mskqh t2, t5, t2 # U : begin src byte validity mask
369 cmpbge zero, t1, t7 # E : is there a zero?
370 extql t2, a1, t2 # U :
371 or t7, t10, t5 # E : test for end-of-count too
373 cmpbge zero, t2, t3 # E :
374 cmoveq a2, t5, t7 # E : Latency=2, extra map slot
375 nop # E : keep with cmoveq
376 andnot t7, t3, t7 # E : (stall)
/* Neither a null nor the count limit within the valid bytes: carry on
   with the normal head code.  */
378 beq t7, $u_head # U :
379 /* At this point we've found a zero in the first partial word of
380 the source. We need to isolate the valid source data and mask
381 it into the original destination data. (Incidentally, we know
382 that we'll need at least one byte of that original dest word.) */
383 ldq_u t0, 0(a0) # L :
384 negq t7, t6 # E : build bitmask of bytes <= zero
385 mskqh t1, t4, t1 # U :
/* NOTE(review): embedded lines 386-387 are missing. As far as this view
   shows, t8 still holds the cmplt result, so the instruction that
   isolates the lowest set bit of t7 into t8 is presumably among the
   missing lines -- confirm against the full file.  */
/* t6 = t8 - 1 selects every byte below the final one. OR-ing t8 back in
   gives the mask of all bytes up to and including it.  */
388 subq t8, 1, t6 # E : (stall)
389 or t6, t8, t7 # E : (stall)
390 zapnot t2, t7, t2 # U : prepare source word; mirror changes (stall)
392 zapnot t1, t7, t1 # U : to source validity mask
393 andnot t0, t2, t0 # E : zero place for source to reside
394 or t0, t1, t0 # E : and put it there (stall both t0, t1)
395 stq_u t0, 0(a0) # L : (stall)
397 ret (t9) # L0 : Latency=3