1 /* Copyright (C) 2000 Free Software Foundation, Inc.
2 Contributed by Richard Henderson (rth@tamu.edu)
3 EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 /* Copy a null-terminated string from SRC to DST.
23 This is an internal routine used by strcpy, stpcpy, and strcat.
24 As such, it uses special linkage conventions to make implementation
25 of these public functions more efficient.
33 t8 = bitmask (with one bit set) indicating the last byte written
34 a0 = unaligned address of the last *word* written
36 Furthermore, v0, a3-a5, t11, and t12 are untouched.
47 /* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
48 doesn't like putting the entry point for a procedure somewhere in the
49 middle of the procedure descriptor. Work around this by putting the
50 aligned copy in its own procedure descriptor */
/* Co-aligned copy path, presumably the stxcpy_aligned target that the
   dispatch code branches to.  The first source word is merged with the
   destination bytes that must survive, following words are tested for a
   null byte with cmpbge, and the word containing the null is written by
   the tail of this block before returning through t9.  */
59 /* On entry to this basic block:
60 t0 == the first destination word for masking back in
61 t1 == the first source word. */
63 /* Create the 1st output word and detect 0's in the 1st input word. */
/* t2 becomes the source word with every byte below the (a1 & 7) offset
   forced to 0xff, so the cmpbge cannot flag bytes that precede the
   string.  t3 holds the source bytes at or above that offset and t0 the
   destination bytes below it; their OR is the first output word.  */
64 lda t2, -1 # E : build a mask against false zero
65 mskqh t2, a1, t2 # U : detection in the src word (stall)
66 mskqh t1, a1, t3 # U :
67 ornot t1, t2, t2 # E : (stall)
69 mskql t0, a1, t0 # U : assemble the first output word
70 cmpbge zero, t2, t10 # E : bits set iff null found
71 or t0, t3, t1 # E : (stall)
72 bne t10, $a_eos # U : (stall)
74 /* On entry to this basic block:
75 t0 == the first destination word for masking back in
76 t1 == a source word not containing a null. */
77 /* Nops here to separate store quads from load quads */
/* NOTE(review): only the load, test and back-branch of the loop are
   present in this listing; the $a_loop label, the store of t1 and the
   a0 and a1 increments are not shown -- confirm against the complete
   file.  */
85 ldq_u t1, 0(a1) # L : Latency=3
87 cmpbge zero, t1, t10 # E : (3 cycle stall)
88 beq t10, $a_loop # U : (stall for t10)
90 /* Take care of the final (partial) word store.
91 On entry to this basic block we have:
92 t1 == the source word containing the null
93 t10 == the cmpbge mask that found it. */
95 negq t10, t6 # E : find low bit set
96 and t10, t6, t8 # E : (stall)
97 /* For the sake of the cache, don't read a destination word
98 if we're not going to need it. */
/* t8 has exactly one bit set; bit 7 (0x80) means the null is the last
   byte of the word, so the whole source word is stored as is.  */
99 and t8, 0x80, t6 # E : (stall)
100 bne t6, 1f # U : (stall)
102 /* We're doing a partial word store and so need to combine
103 our source and original destination words. */
/* NOTE(review): on this fall-through path t6 is 0 (the 0x80 test just
   failed), yet the zapnot and or below treat it as the mask of bytes
   below the null.  The instruction deriving that mask from t8
   (presumably t8 - 1) is not visible in this listing -- confirm.  */
104 ldq_u t0, 0(a0) # L : Latency=3
106 zapnot t1, t6, t1 # U : clear src bytes >= null (stall)
107 or t8, t6, t10 # E : (stall)
109 zap t0, t10, t0 # E : clear dst bytes <= null
110 or t0, t1, t1 # E : (stall)
114 1: stq_u t1, 0(a0) # L :
115 ret (t9) # L0 : Latency=3
/* Entry dispatch: choose between the co-aligned path (stxcpy_aligned)
   and the $unaligned path from the low three bits held in t0.
   NOTE(review): the instruction that forms t0 from a0 and a1 is not
   visible in this listing -- confirm against the complete file.  */
128 /* Are source and destination co-aligned? */
131 and t0, 7, t0 # E : (stall)
132 bne t0, $unaligned # U : (stall)
134 /* We are co-aligned; take care of a partial first word. */
/* When a0 is already 8-byte aligned (t0 == 0) the destination word is
   not loaded and t0 == 0 is handed to stxcpy_aligned in its place.  */
135 ldq_u t1, 0(a1) # L : load first src word
136 and a0, 7, t0 # E : take care not to load a word ...
138 beq t0, stxcpy_aligned # U : ... if we wont need it (stall)
140 ldq_u t0, 0(a0) # L :
141 br stxcpy_aligned # L0 : Latency=3
/* Head of the unaligned path ($u_head is the name the startup code
   branches to -- presumably this block).  It assembles and stores the
   first destination word, leaving for $u_final or $u_late_head_exit
   when a null byte turns up before the main loop can be entered.  */
146 /* The source and destination are not co-aligned. Align the destination
147 and cope. We have to be very careful about not reading too much and
152 /* We know just enough now to be able to assemble the first
153 full source word. We can still find a zero at the end of it
154 that prevents us from outputting the whole thing.
156 On entry to this basic block:
157 t0 == the first dest word, for masking back in, if needed else 0
158 t1 == the low bits of the first source word
159 t6 == bytemask that is -1 in dest word bytes */
/* NOTE(review): t4 (the part extracted from the second source word) is
   never merged into t1 in the lines shown, and the first cmpbge below
   tests t6 exactly as it was on entry.  The instructions that would
   combine them appear to be missing from this listing -- confirm
   against the complete file.  */
161 ldq_u t2, 8(a1) # L :
163 extql t1, a1, t1 # U : (stall on a1)
164 extqh t2, a1, t4 # U : (stall on a1)
166 mskql t0, a0, t0 # U :
168 mskqh t1, a0, t1 # U : (stall on t1)
169 or t0, t1, t1 # E : (stall on t1)
172 cmpbge zero, t6, t10 # E : (stall)
173 lda t6, -1 # E : for masking just below
174 bne t10, $u_final # U : (stall)
176 mskql t6, a1, t6 # U : mask out the bits we have
177 or t6, t2, t2 # E : already extracted before (stall)
178 cmpbge zero, t2, t10 # E : testing eos (stall)
179 bne t10, $u_late_head_exit # U : (stall)
181 /* Finally, we've got all the stupid leading edge cases taken care
182 of and we can set up to enter the main loop. */
/* NOTE(review): the load below reads 8(a1) a second time with no
   visible a0 or a1 increment in between, and it is annotated with slot
   U whereas ldq_u is annotated L elsewhere in this listing --
   confirm.  */
184 stq_u t1, 0(a0) # L : store first output word
186 extql t2, a1, t0 # U : position ho-bits of lo word
187 ldq_u t2, 8(a1) # U : read next high-order source word
190 cmpbge zero, t2, t10 # E : (stall for t2)
192 bne t10, $u_eos # U : (stall)
/* One iteration per aligned source word: the bytes carried over in t0
   are combined with the part extracted from t2, the result is stored,
   and the next source word is fetched and tested for a null byte.  */
194 /* Unaligned copy main loop. In order to avoid reading too much,
195 the loop is structured to detect zeros in aligned source words.
196 This has, unfortunately, effectively pulled half of a loop
197 iteration out into the head and half into the tail, but it does
198 prevent nastiness from accumulating in the very thing we want
199 to run as fast as possible.
201 On entry to this basic block:
202 t0 == the shifted high-order bits from the previous source word
203 t2 == the unshifted current source word
205 We further know that t2 does not contain a null terminator. */
/* NOTE(review): t3 ("low bits for next time") is never copied into t0,
   and the store to -8(a0) implies a0 has already been advanced by 8;
   neither instruction is visible in this listing, nor is the $u_loop
   label -- confirm against the complete file.  */
209 extqh t2, a1, t1 # U : extract high bits for current word
210 addq a1, 8, a1 # E : (stall)
211 extql t2, a1, t3 # U : extract low bits for next time (stall)
214 or t0, t1, t1 # E : current dst word now complete
215 ldq_u t2, 0(a1) # L : Latency=3 load high word for next time
216 stq_u t1, -8(a0) # L : save the current word (stall)
219 cmpbge zero, t2, t10 # E : test new word for eos
220 beq t10, $u_loop # U : (stall)
/* End-of-string handling for the unaligned path (presumably the $u_eos
   target): decides whether one or two destination words remain and
   leaves the last one in t1, with its cmpbge mask in t10, for the
   final-word store.  */
224 /* We've found a zero somewhere in the source word we just read.
225 If it resides in the lower half, we have one (probably partial)
226 word to write out, and if it resides in the upper half, we
227 have one full and one partial word left to write out.
229 On entry to this basic block:
230 t0 == the shifted high-order bits from the previous source word
231 t2 == the unshifted current source word. */
/* NOTE(review): when the branch to $u_final is not taken, t1 is stored
   at 0(a0) and execution continues into the final-word code, which
   stores to 0(a0) again.  The a0 increment expected between the two
   stores is not visible in this listing -- confirm against the
   complete file.  */
233 extqh t2, a1, t1 # U :
234 or t0, t1, t1 # E : first (partial) source word complete (stall)
235 cmpbge zero, t1, t10 # E : is the null in this first bit? (stall)
236 bne t10, $u_final # U : (stall)
239 stq_u t1, 0(a0) # L : the null was in the high-order bits
241 extql t2, a1, t1 # U :
242 cmpbge zero, t1, t10 # E : (stall)
/* Final-word store for the unaligned path (presumably the $u_final
   target).  The lowest set bit of t10 is isolated into t8; if that is
   bit 7 the word is stored whole, otherwise it is merged with the
   existing destination word first.  Returns through t9.  */
244 /* Take care of a final (probably partial) result word.
245 On entry to this basic block:
246 t1 == assembled source word
247 t10 == cmpbge mask that found the null. */
249 negq t10, t6 # E : isolate low bit set
250 and t6, t10, t8 # E : (stall)
251 and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
252 bne t6, 1f # U : (stall)
/* NOTE(review): on this fall-through path t6 is 0 (the 0x80 test just
   failed), yet the or and zapnot below treat it as the mask of bytes
   below the null; the instruction deriving that mask from t8
   (presumably t8 - 1) is not visible in this listing.  The load is also
   annotated with slot E whereas ldq_u is annotated L elsewhere in this
   listing -- confirm both against the complete file.  */
254 ldq_u t0, 0(a0) # E :
256 or t6, t8, t10 # E : (stall)
257 zapnot t1, t6, t1 # U : kill source bytes >= null (stall)
259 zap t0, t10, t0 # U : kill dest bytes <= null (2 cycle data stall)
260 or t0, t1, t1 # E : (stall)
264 1: stq_u t1, 0(a0) # L :
265 ret (t9) # L0 : Latency=3
269 /* Unaligned copy entry point. */
/* Records the destination and source misalignment in t4 and t5 and
   rebases a1 by the destination misalignment.  Control goes to $u_head
   unless the source misalignment is the larger one and the valid part
   of the first source word already contains a null; that case is
   finished here with a single merged store.  */
273 ldq_u t1, 0(a1) # L : load first source word
274 and a0, 7, t4 # E : find dest misalignment
275 and a1, 7, t5 # E : find src misalignment
276 /* Conditionally load the first destination word and a bytemask
277 with 0xff indicating that the destination byte is sacrosanct. */
/* NOTE(review): the load below is described as conditional, but neither
   the guarding branch nor the initialisation of t0 and t6 is visible in
   this listing -- confirm against the complete file.  */
282 ldq_u t0, 0(a0) # L :
285 mskql t6, a0, t6 # U :
290 subq a1, t4, a1 # E : sub dest misalignment from src addr
291 /* If source misalignment is larger than dest misalignment, we need
292 extra startup checks to avoid SEGV. */
/* t8 = (t4 < t5).  Only in that case are the bytes of t1 that precede
   the string start forced to 0xff (in t3) so that the remainder can be
   tested for a null without false hits.  */
293 cmplt t4, t5, t8 # E :
294 beq t8, $u_head # U :
295 lda t2, -1 # E : mask out leading garbage in source
297 mskqh t2, t5, t2 # U :
298 ornot t1, t2, t3 # E : (stall)
299 cmpbge zero, t3, t10 # E : is there a zero? (stall)
300 beq t10, $u_head # U : (stall)
302 /* At this point we've found a zero in the first partial word of
303 the source. We need to isolate the valid source data and mask
304 it into the original destination data. (Incidentally, we know
305 that we'll need at least one byte of that original dest word.) */
/* NOTE(review): as shown, t6 still holds the negated t10 when it is
   ORed with t8, so t10 does not become the "bytes <= zero" mask the
   comments describe; the step that would set t6 to the bits below t8
   (presumably t8 - 1) is not visible in this listing -- confirm
   against the complete file.  */
307 ldq_u t0, 0(a0) # L :
308 negq t10, t6 # E : build bitmask of bytes <= zero
309 and t6, t10, t8 # E : (stall)
313 or t6, t8, t10 # E : (stall)
314 srl t8, t5, t8 # U : adjust final null return value
315 zapnot t2, t10, t2 # U : prepare source word; mirror changes (stall)
317 and t1, t2, t1 # E : to source validity mask
318 extql t2, a1, t2 # U :
319 extql t1, a1, t1 # U : (stall)
320 andnot t0, t2, t0 # .. e1 : zero place for source to reside (stall)
322 or t0, t1, t1 # e1 : and put it there
323 stq_u t1, 0(a0) # .. e0 : (stall)