2 Copyright (c) 2024, Synopsys, Inc. All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1) Redistributions of source code must retain the above copyright notice,
8 this list of conditions and the following disclaimer.
10 2) Redistributions in binary form must reproduce the above copyright notice,
11 this list of conditions and the following disclaimer in the documentation
12 and/or other materials provided with the distribution.
14 3) Neither the name of the Synopsys, Inc., nor the names of its contributors
15 may be used to endorse or promote products derived from this software
16 without specific prior written permission.
18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 POSSIBILITY OF SUCH DAMAGE.
37 ; The 64-bit crunching implementation.
39 #if defined (__ARC64_ARCH32__) && !defined(__ARC64_LL64__)
43 ; If the destination is greater than the source
46 ; or if the source plus count is smaller than the destination
49 ; We can safely perform a normal memcpy. Otherwise, we need to perform it
; Dispatch to the forward copy: the branch is taken when the preceding
; compare (not visible in this extract) produced an unsigned "lower" result.
; The ".d" suffix makes the following lsr.f execute in the branch delay slot,
; so r11 = r2 >> 4 is computed on both the taken and the fall-through path.
; NOTE(review): the flags written by ".f" are presumably what the later
; "no 16 byte chunks" conditional branches test -- confirm in the full file.
51 blo.d @.L_normal_memcpy
52 lsr.f r11, r2, 4 ; counter for 16-byte chunks
57 ; The only thing that changes between memcpy and memmove is copy direction
58 ; in case the dest and src address memory locations overlap
59 ; More detailed information is in the forwards copy and at the end of
84 ; Return if there are no 16 byte chunks
; Backward copy loop, one 16-byte chunk per iteration, counted by r11.
; NOTE(review): the loads/stores forming the loop body are not visible in
; this extract.
87 .L_write_backwards_16_bytes:
; dbnz decrements r11 and branches back while the result is non-zero; with
; ".d" the instruction that follows it runs in the delay slot on every pass.
95 dbnz.d r11, @.L_write_backwards_16_bytes
; Skip the 16-byte loop when the Z flag is set.
; NOTE(review): Z presumably comes from "lsr.f r11, r2, 4", i.e. there are no
; full 16-byte chunks to copy -- confirm in the full file.
; The mov sits in the delay slot of beq.d, so r3 = r0 is set up on both
; paths; all stores then go through r3 and r0 itself is left unmodified.
101 beq.d @.L_write_forwards_15_bytes
102 mov r3, r0 ; work on a copy of "r0"
; Forward copy loop, one 16-byte chunk per iteration (loop body not visible
; in this extract).
104 .L_write_forwards_16_bytes:
; Decrement the chunk counter r11 and loop while it is non-zero; the next
; instruction executes in the delay slot.
112 dbnz.d r11, @.L_write_forwards_16_bytes
; Entry point for the sub-16-byte remainder; also the target of the beq.d
; above when there are no full chunks.
116 .L_write_forwards_15_bytes:
143 ; If the destination is greater than the source
146 ; or if the source plus count is smaller than the destination
149 ; We can safely perform a normal memcpy. Otherwise, we need to perform it
; Dispatch to the forward copy: the branch is taken when the preceding
; compare (not visible in this extract) produced an unsigned "lower" result.
; LSRP.f runs in the delay slot of blo.d, so r12 = r2 >> 5 (the number of
; 32-byte chunks) is available on both the taken and the fall-through path.
; NOTE(review): LSRP is not defined in this extract; presumably a
; pointer-width shift macro -- confirm in the project's asm header.
151 blo.d @.L_normal_memcpy
152 LSRP.f r12, r2, 5 ; counter for 32-byte chunks
157 ; The only thing that changes between memcpy and memmove is copy direction
158 ; in case the dest and src address memory locations overlap
159 ; More detailed information is in the forwards copy and at the end of
162 ; Set both r0 and r1 to point to the end of each memory location
189 ; Jump if there are no 32 byte chunks
; Backward copy loop: each iteration moves 32 bytes from the region ending at
; r1 to the region ending at r3; r12 holds the number of iterations left.
192 .L_write_backwards_32_bytes: ; Take care of 32 byte chunks
193 #if defined (__ARC64_M128__)
; ".aw" applies the -16 offset to the base register before the access and
; writes the new address back, so r1/r3 step down by 16 per instruction.
; Two register pairs (r4r5, r6r7) carry 2 x 16 = 32 bytes per iteration.
; Both loads are issued before either store.
195 lddl.aw r4r5, [r1, -16]
196 lddl.aw r6r7, [r1, -16]
198 stdl.aw r4r5, [r3, -16]
199 stdl.aw r6r7, [r3, -16]
; Plain dbnz (no ".d"): decrement r12 and loop while non-zero, with no
; delay-slot instruction in this variant.
200 dbnz r12, @.L_write_backwards_32_bytes
202 #elif defined (__ARC64_ARCH64__) || ( defined (__ARC64_ARCH32__) && defined (__ARC64_LL64__) )
; 8-byte-granular variant; the earlier loads/stores of this body are not
; visible in this extract. The final store of r10 is placed in the delay
; slot of dbnz.d, so it executes on every pass, including the last one.
; NOTE(review): ST64 is not defined here; presumably a 64-bit store macro.
212 dbnz.d r12, @.L_write_backwards_32_bytes
213 ST64.aw r10, [r3, -8]
; Reached only when neither configuration above matches: fail the build.
216 # error Unknown configuration
223 ;LSRP.f r12, r2, 5 ; Moved up
; Skip the 32-byte loop when the Z flag is set.
; NOTE(review): per the "Moved up" remark, Z presumably comes from the
; earlier "LSRP.f r12, r2, 5", i.e. there are no full 32-byte chunks.
; MOVP runs in the delay slot of beq.d, so r3 = r0 is set up on both paths
; and all stores go through r3 instead of r0.
225 beq.d @.L_write_forwards_31_bytes
226 MOVP r3, r0 ; do not clobber the "dest"
; Forward copy loop: each iteration moves 32 bytes from [r1] to [r3]; r12
; holds the number of iterations left.
228 .L_write_forwards_32_bytes: ; Take care of 32 byte chunks
229 #if defined (__ARC64_M128__)
; ".ab" accesses memory at the current base address and then adds +16 to the
; base register, so r1/r3 step up by 16 per instruction.
; Two register pairs (r4r5, r6r7) carry 2 x 16 = 32 bytes per iteration.
; Both loads are issued before either store.
231 lddl.ab r4r5, [r1, +16]
232 lddl.ab r6r7, [r1, +16]
234 stdl.ab r4r5, [r3, +16]
235 stdl.ab r6r7, [r3, +16]
; Plain dbnz (no ".d"): decrement r12 and loop while non-zero, with no
; delay-slot instruction in this variant.
236 dbnz r12, @.L_write_forwards_32_bytes
238 #elif defined (__ARC64_ARCH64__) || ( defined (__ARC64_ARCH32__) && defined (__ARC64_LL64__) )
; 8-byte-granular variant; the earlier loads/stores of this body are not
; visible in this extract.
247 dbnz.d r12, @.L_write_forwards_32_bytes
248 ST64.ab r10, [r3, +8] ; Shove store in delay slot
; Reached only when neither configuration above matches: fail the build.
251 # error Unknown configuration
; Keep only bits 4..0 of the byte count. The beq.d above bypasses this mask;
; presumably that is safe because with zero 32-byte chunks r2 is already
; below 32 -- confirm against the full file.
254 bmsk_s r2, r2, 4 ; From now on, only the remainder (count % 32) matters
257 ; The remainder bits indicating how many more bytes to copy
258 ; .------------------------.
259 ; | b4 | b3 | b2 | b1 | b0 |
260 ; `------------------------'
; Forward tail copy for the remaining 0..31 bytes, driven by the bits of r2.
; Each bbit0.d branches to the next local label "1:" when the tested bit is
; clear, skipping the copy for that size; the copy instructions and the "1:"
; labels themselves are not visible in this extract.
262 .L_write_forwards_31_bytes:
263 bbit0.d r2, 2, @1f ; is b2 set? then copy 4 bytes
; Delay slot of the bbit0.d above, so it runs whether or not the branch is
; taken: r12 = r2 >> 3. With r2 below 32 this leaves (b4,b3) in the two low
; bits of r12, the starting value for the branch-index computation.
264 lsr r12, r2, 3 ; see the notes below
; NOTE(review): the delay-slot instructions of the next two bbit0.d are not
; visible here; the notes below suggest they hold the xor and asl steps.
268 bbit0.d r2, 1, @1f ; is b1 set? then copy 2 bytes
273 bbit0.d r2, 0, @1f ; is b0 set? then copy 1 byte
278 ; Interpreting bits (b4,b3) [1] and how they correlate to branch index:
280 ; (b4,b3) | bytes to copy | branch index
281 ; --------+---------------+-------------
287 ; To go from (b4,b3) to branch index, the bits must be flipped.
288 ; In other words, they must be XORed with 11b [2].
290 ; Last but not least, "bi" jumps at boundaries of 4. We need to double
291 ; the index to jump 8 bytes [3].
293 ; Hence, the 3 operations for calculating the branch index that are spread
294 ; in "bbit0" delay slots:
297 ; xor r12, r12, 3 [2]
298 ; asl r12, r12, 1 [3]