4 ! by Toshiyasu Morita (tm@netcom.com)
5 ! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
6 ! SH5 code Copyright 2002 SuperH Ltd.
8 ! Entry: ARG0: destination pointer
12 ! Exit: RESULT: destination pointer
13 ! any other registers in the range r0-r7: trashed
15 ! Notes: Usually one wants to do small reads and write a longword, but
16 ! unfortunately it is difficult in some cases to concatenate bytes
17 ! into a longword on the SH, so this does a longword read and small
20 ! This implementation makes two assumptions about how it is called:
22 ! 1.: If the byte count is nonzero, the address of the last byte to be
23 ! copied is unsigned greater than the address of the first byte to
24 ! be copied. This could be easily swapped for a signed comparison,
25 ! but the algorithm used needs some comparison.
27 ! 2.: When there are two or three bytes in the last word of an 11-or-more
28 ! bytes memory chunk to be copied, the rest of the word can be read
29 ! without side effects.
30 ! This could be easily changed by increasing the minimum size of
31 ! a fast memcpy and the amount subtracted from r7 before L_2l_loop by 2,
32 ! however, this would cost a few extra cycles on average.
33 ! For SHmedia, the assumption is that any quadword can be read in its
34 ! entirety if at least one byte is included in the copy.
/* Unaligned-access helper macros.
   Each macro expands to a lo/hi instruction pair separated by ';':
     - the "lo" instruction is given pointer P, offset O and register D0;
     - the "hi" instruction is given pointer P, register D1 and the offset
       O+7 (the .q forms) or O+3 (the .l forms), i.e. presumably the
       offset of the last byte of an 8-byte / 4-byte datum.
   Parameters: P = base pointer register, O = constant byte offset,
   D0 / D1 = registers named in the lo / hi instruction respectively.
   LDUAx use the ld* instructions, STUAx the st* instructions.
   NOTE(review): the names presumably abbreviate LoaD/STore UnAligned
   Quadword/Longword.  Whether the caller still has to merge D0 and D1
   after a load is not visible in this excerpt -- confirm against the
   SHmedia instruction set manual.  */
43 #define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
44 #define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
45 #define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
46 #define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
54 movi (L1-L0+63*32 + 1) & 0xffff,r1
67 L4_7: /* 4..7 byte memcpy cntd. */
74 L2_3: /* 2 or 3 byte memcpy cntd. */
83 L8_15: /* 8..15 byte memcpy cntd. */
90 /* 2 or 3 byte memcpy */
100 /* 4 .. 7 byte memcpy */
101 LDUAL (r3, 0, r0, r1)
109 /* 8 .. 15 byte memcpy */
110 LDUAQ (r3, 0, r0, r1)
118 /* 16 .. 24 byte memcpy */
119 LDUAQ (r3, 0, r0, r1)
120 LDUAQ (r3, 8, r8, r9)
143 movi 64+8, r27 // could subtract r7 from that.
192 #else /* ! SHMEDIA, i.e. SH1 .. SH4 / SHcompact */
210 #ifdef __LITTLE_ENDIAN__
211 ! Little endian version copies with increasing addresses.
212 mov DST,TMP1 ! Save return value
213 mov #11,r0 ! Check if small number of bytes
215 ! COUNT becomes src end address
216 SL(bf, L_small, add SRC,COUNT)
218 tst r1,SRC ! check if source even
219 SL(bt, L_even, mov COUNT,r7)
220 mov.b @SRC+,r0 ! no, make it even.
223 L_even: tst r1,DST ! check if destination is even
225 SL(bf, L_odddst, mov #2,r1)
226 tst r1,DST ! check if destination is 4-byte aligned
228 SL(bt, L_al4dst, sub SRC,r0)
231 ! add #2,DST DST is dead here.
238 add #-6,r7 ! r7 := src end address minus 9.
241 mov.l @SRC+,TMP0 ! Read & write two longwords per iteration
256 mov.l @SRC+,DST ! Read longword, write longword per iteration
258 SL(bf, L_al4both_loop, mov.l DST,@(r0,SRC))
265 SL(bt, L_al4src, add #-1,DST)
274 mov.l @SRC+,r0 ! Read longword, write byte, word, byte per iteration
281 SL(bf, L_odd_loop, add #4,DST)
282 .align 2 ! avoid nop in more frequently executed code.
301 #else /* ! __LITTLE_ENDIAN__ */
302 ! Big endian version copies with decreasing addresses.
308 SL(bf, L_small, add #-1,SRC)
320 SL(bf, L_odddst, add #8,r7)
346 nop ! avoid nop in executed code.
353 SL(bt, L_al4both_loop,
358 nop ! avoid nop in executed code.
394 #endif /* ! __LITTLE_ENDIAN__ */
395 #endif /* ! SHMEDIA */