2 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
3 * xthal_memcpy and xthal_bcopy
5 * This file is subject to the terms and conditions of the GNU General Public
6 * License. See the file "COPYING" in the main directory of this archive
9 * Copyright (C) 2002 - 2012 Tensilica Inc.
12 #include <linux/linkage.h>
13 #include <variant/core.h>
14 #include <asm/asmmacro.h>
17 * void *memcpy(void *dst, const void *src, size_t len);
19 * This function is intended to do the same thing as the standard
20 * library function memcpy() for most cases.
21 * However, where the source and/or destination references
22 * an instruction RAM or ROM or a data RAM or ROM, that
23 * source and/or destination will always be accessed with
24 * 32-bit load and store instructions (as required for these types of devices).
28 * !!!!!!! Handling of IRAM/IROM has not yet
29 * !!!!!!! been implemented.
31 * The (general case) algorithm is as follows:
32 * If destination is unaligned, align it by conditionally
33 * copying 1 and 2 bytes.
34 * If source is aligned,
35 * do 16 bytes with a loop, and then finish up with
36 * 8, 4, 2, and 1 byte copies conditional on the length;
37 * else (if source is unaligned),
38 * do the same, but use SRC to align the source data.
39 * This code tries to use fall-through branches for the common
40 * case of aligned source and destination and multiple of 4 (or 8) length.
/*
 * Forward byte-at-a-time copy loop control (presumably the .Lbytecopy
 * path that the unaligned-destination code branches to for short copies).
 * Registers as used here: a3 = src, a4 = len.
 * NOTE(review): the load/store body of the loop is not visible in this
 * excerpt; only the loop set-up and back-edge are documented.
 */
64 .byte 0 # 1 mod 4 alignment for LOOPNEZ
65 # (0 mod 4 alignment for LBEG)
/* Zero-overhead-loop variant: a4 is the trip count; a zero count skips
 * straight to .Lbytecopydone. */
68 loopnez a4, .Lbytecopydone
69 #else /* !XCHAL_HAVE_LOOPS */
/* Branch-based variant: bail out on len == 0, otherwise precompute the
 * source end address so the loop can compare pointers. */
70 beqz a4, .Lbytecopydone
71 add a7, a3, a4 # a7 = end address for source
72 #endif /* !XCHAL_HAVE_LOOPS */
/* Back-edge of the branch-based variant: repeat until src reaches a7. */
79 bne a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
80 #endif /* !XCHAL_HAVE_LOOPS */
85 * Destination is unaligned
/*
 * Destination alignment fix-up for the forward copy.
 * .Ldst1mod2 is reached when bit 0 of dst is set, .Ldst2mod4 when bit 1
 * is set; both rejoin the main algorithm at .Ldstaligned.
 * Registers as used here: a4 = len, a5 = working dst pointer.
 * NOTE(review): the instructions that actually copy the leading 1 or 2
 * bytes are not visible in this excerpt.
 */
89 .Ldst1mod2: # dst is only byte aligned
/* Fewer than 7 bytes: not worth aligning, use the byte loop instead. */
90 _bltui a4, 7, .Lbytecopy # do short copies byte by byte
/* Bit 1 of a5 clear means dst is now word aligned; otherwise fall
 * through to the 2-mod-4 case below. */
98 _bbci.l a5, 1, .Ldstaligned # if dst is now aligned, then
99 # return to main algorithm
100 .Ldst2mod4: # dst 16-bit aligned
/* Fewer than 6 bytes: use the byte loop instead. */
102 _bltui a4, 6, .Lbytecopy # do short copies byte by byte
110 j .Ldstaligned # dst is now aligned, return to main algorithm
/*
 * memcpy entry point and word-aligned fast path.
 *   In:  a2 = dst, a3 = src, a4 = len
 *   Out: a2 = dst (left untouched; a5 is the working dst pointer)
 * NOTE(review): the symbol/label line for the entry point and the bodies
 * of the copy loops are not visible in this excerpt.
 */
115 entry sp, 16 # minimal stack frame
116 # a2/ dst, a3/ src, a4/ len
117 mov a5, a2 # copy dst so that a2 is return value
/* Dispatch on the two low bits of dst; the fix-up code returns to
 * .Ldstaligned once dst is word aligned. */
119 _bbsi.l a2, 0, .Ldst1mod2 # if dst is 1 mod 2
120 _bbsi.l a2, 1, .Ldst2mod4 # if dst is 2 mod 4
121 .Ldstaligned: # return here from .Ldst?mod? once dst is aligned
/* a7 = len / 16. */
122 srli a7, a4, 4 # number of loop iterations with 16B
/* a8 = 3 is the low-two-bit mask; _bany branches if src has either bit
 * set.  a8 still holds 3 on arrival at .Lsrcunaligned, which uses it as
 * the mask for the misalignment offset. */
124 movi a8, 3 # if source is not aligned,
125 _bany a3, a8, .Lsrcunaligned # then use shifting copy
127 * Destination and source are word-aligned, use word copy.
129 # copy 16 bytes per iteration for word-aligned dst and word-aligned src
/* Zero-overhead-loop variant: a7 iterations, skipped when a7 == 0. */
131 loopnez a7, .Loop1done
132 #else /* !XCHAL_HAVE_LOOPS */
/* Branch-based variant: a8 becomes the src address at which the 16-byte
 * loop stops.  NOTE(review): the instructions that load a8 before this
 * add are not visible here. */
135 add a8, a8, a3 # a8 = end of last 16B source chunk
136 #endif /* !XCHAL_HAVE_LOOPS */
148 #if !XCHAL_HAVE_LOOPS
149 bne a3, a8, .Loop1 # continue loop if a3:src != a8:src_end
150 #endif /* !XCHAL_HAVE_LOOPS */
189 * Destination is aligned, Source is unaligned
/*
 * Forward copy with word-aligned dst and unaligned src (shifting copy).
 * Registers as used here: a3 = src, a4 = len, a7 = number of 16-byte
 * iterations, a8 = 3 (mask set by the caller path), a11 = src & 3.
 * NOTE(review): the loop body and the tail copies are not visible in
 * this excerpt.
 */
194 _beqz a4, .Ldone # avoid loading anything for zero-length copies
195 # copy 16 bytes per iteration for word-aligned dst and unaligned src
196 __ssa8 a3 # set shift amount from byte offset
198 /* set to 1 when running on ISS (simulator) with the
199 lint or ferret client, or 0 to save a few cycles */
200 #define SIM_CHECKS_ALIGNMENT 1
201 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
/* Round src down to a word boundary so the l32i loads below are aligned;
 * the removed offset is kept in a11 and added back after the loop. */
202 and a11, a3, a8 # save unalignment offset for below
203 sub a3, a3, a11 # align a3
205 l32i a6, a3, 0 # load first word
/* Zero-overhead-loop variant: a7 iterations, skipped when a7 == 0. */
207 loopnez a7, .Loop2done
208 #else /* !XCHAL_HAVE_LOOPS */
/* Branch-based variant: a10 becomes the src address at which the loop
 * stops.  NOTE(review): the instructions that load a10 before this add
 * are not visible here. */
211 add a10, a10, a3 # a10 = end of last 16B source chunk
212 #endif /* !XCHAL_HAVE_LOOPS */
228 #if !XCHAL_HAVE_LOOPS
229 bne a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
230 #endif /* !XCHAL_HAVE_LOOPS */
253 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
/* Undo the earlier rounding so a3 again points at the true src position. */
254 add a3, a3, a11 # readjust a3 with correct misalignment
278 * void bcopy(const void *src, void *dest, size_t n);
/*
 * bcopy entry point.  Arguments arrive as a2 = src, a3 = dst, a4 = len,
 * i.e. src and dst are in the opposite registers from memmove.
 * NOTE(review): presumably the instructions between the entry and the
 * jump rearrange the registers into memmove's layout before joining the
 * shared code; they are not visible in this excerpt -- confirm.
 */
283 entry sp, 16 # minimal stack frame
284 # a2=src, a3=dst, a4=len
288 j .Lmovecommon # go to common code for memmove+bcopy
293 * void *memmove(void *dst, const void *src, size_t len);
295 * This function is intended to do the same thing as the standard
296 * library function memmove() for most cases.
297 * However, where the source and/or destination references
298 * an instruction RAM or ROM or a data RAM or ROM, that
299 * source and/or destination will always be accessed with
300 * 32-bit load and store instructions (as required for these types of devices).
304 * !!!!!!! Handling of IRAM/IROM has not yet
305 * !!!!!!! been implemented.
307 * The (general case) algorithm is as follows:
308 * If end of source doesn't overlap destination then use memcpy.
309 * Otherwise do memcpy backwards.
/*
 * Backward byte-at-a-time copy loop control (presumably the
 * .Lbackbytecopy path that the backward unaligned-destination code
 * branches to for short copies).
 * Registers as used here: a3 = src (walking downwards), a4 = len.
 * NOTE(review): the load/store body of the loop is not visible in this
 * excerpt; only the loop set-up and back-edge are documented.
 */
330 .byte 0 # 1 mod 4 alignment for LOOPNEZ
331 # (0 mod 4 alignment for LBEG)
/* Zero-overhead-loop variant: a4 is the trip count; a zero count skips
 * straight to .Lbackbytecopydone. */
334 loopnez a4, .Lbackbytecopydone
335 #else /* !XCHAL_HAVE_LOOPS */
/* Branch-based variant: bail out on len == 0, otherwise precompute the
 * address at which the downward walk must stop. */
336 beqz a4, .Lbackbytecopydone
337 sub a7, a3, a4 # a7 = start address for source
338 #endif /* !XCHAL_HAVE_LOOPS */
344 #if !XCHAL_HAVE_LOOPS
345 bne a3, a7, .Lbacknextbyte # continue loop if
346 # $a3:src != $a7:src_start
347 #endif /* !XCHAL_HAVE_LOOPS */
352 * Destination is unaligned
/*
 * Destination alignment fix-up for the backward copy.
 * .Lbackdst1mod2 is reached when bit 0 of a5 is set, .Lbackdst2mod4 when
 * bit 1 is set; both rejoin the main algorithm at .Lbackdstaligned.
 * Registers as used here: a4 = len, a5 = working dst pointer.
 * NOTE(review): the instructions that actually copy the 1 or 2 bytes are
 * not visible in this excerpt.
 */
356 .Lbackdst1mod2: # dst is only byte aligned
/* Fewer than 7 bytes: not worth aligning, use the backward byte loop. */
357 _bltui a4, 7, .Lbackbytecopy # do short copies byte by byte
/* Bit 1 of a5 clear means dst is now word aligned; otherwise fall
 * through to the 2-mod-4 case below. */
365 _bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned, then
366 # return to main algorithm
367 .Lbackdst2mod4: # dst 16-bit aligned
/* Fewer than 6 bytes: use the backward byte loop. */
369 _bltui a4, 6, .Lbackbytecopy # do short copies byte by byte
377 j .Lbackdstaligned # dst is now aligned,
378 # return to main algorithm
/*
 * memmove entry point, word-aligned backward loop control and tail
 * dispatch.
 *   In:  a2 = dst, a3 = src, a4 = len
 *   Out: a2 = dst (left untouched; a5 is the working dst pointer)
 * NOTE(review): the symbol/label line, the computation of a6, the
 * pointer adjustments before the alignment tests, and all load/store
 * bodies are not visible in this excerpt.
 */
383 entry sp, 16 # minimal stack frame
384 # a2/ dst, a3/ src, a4/ len
385 mov a5, a2 # copy dst so that a2 is return value
/* Unsigned compare: when a6 >= len the forward copy at .Lcommon is used.
 * NOTE(review): a6 is presumably dst - src, computed on a line not shown
 * here -- confirm. */
388 bgeu a6, a4, .Lcommon
/* Dispatch on the two low bits of the working dst pointer; the fix-up
 * code returns to .Lbackdstaligned. */
393 _bbsi.l a5, 0, .Lbackdst1mod2 # if dst is 1 mod 2
394 _bbsi.l a5, 1, .Lbackdst2mod4 # if dst is 2 mod 4
395 .Lbackdstaligned: # return here from .Lbackdst?mod? once dst is aligned
/* a7 = len / 16. */
396 srli a7, a4, 4 # number of loop iterations with 16B
/* a8 = 3 is the low-two-bit mask; _bany branches if src has either bit
 * set.  a8 still holds 3 on arrival at .Lbacksrcunaligned. */
398 movi a8, 3 # if source is not aligned,
399 _bany a3, a8, .Lbacksrcunaligned # then use shifting copy
401 * Destination and source are word-aligned, use word copy.
403 # copy 16 bytes per iteration for word-aligned dst and word-aligned src
/* Zero-overhead-loop variant: a7 iterations, skipped when a7 == 0. */
405 loopnez a7, .backLoop1done
406 #else /* !XCHAL_HAVE_LOOPS */
407 beqz a7, .backLoop1done
/* a8 becomes the src address at which the downward loop stops.
 * NOTE(review): the instruction that loads a8 before this sub is not
 * visible here. */
409 sub a8, a3, a8 # a8 = start of first 16B source chunk
410 #endif /* !XCHAL_HAVE_LOOPS */
422 #if !XCHAL_HAVE_LOOPS
423 bne a3, a8, .backLoop1 # continue loop if a3:src != a8:src_start
424 #endif /* !XCHAL_HAVE_LOOPS */
/* Tail dispatch on the low four bits of len: bit 3 clear skips the
 * 8-byte copy; bits 2, 1 and 0 then select the remaining smaller copies
 * (.Lback3, .Lback4, .Lback5).  Each stage re-tests only the lower bits. */
426 bbci.l a4, 3, .Lback2
435 bbsi.l a4, 2, .Lback3
436 bbsi.l a4, 1, .Lback4
437 bbsi.l a4, 0, .Lback5
445 bbsi.l a4, 1, .Lback4
446 bbsi.l a4, 0, .Lback5
454 bbsi.l a4, 0, .Lback5
465 * Destination is aligned, Source is unaligned
/*
 * Backward copy with word-aligned dst and unaligned src (shifting copy).
 * Registers as used here: a3 = src, a4 = len, a7 = number of 16-byte
 * iterations, a8 = 3 (mask set by the caller path), a11 = src & 3.
 * NOTE(review): the loop body and the tail copy instructions are not
 * visible in this excerpt.
 */
470 _beqz a4, .Lbackdone # avoid loading anything for zero-length copies
471 # copy 16 bytes per iteration for word-aligned dst and unaligned src
472 __ssa8 a3 # set shift amount from byte offset
/* NOTE(review): SIM_CHECKS_ALIGNMENT is also defined earlier in this
 * file with the same value; the repeat is harmless but redundant. */
473 #define SIM_CHECKS_ALIGNMENT 1 /* set to 1 when running on ISS with
474 * the lint or ferret client, or 0
475 * to save a few cycles */
476 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
/* Round src down to a word boundary so the l32i loads below are aligned;
 * the removed offset is kept in a11 and added back before the small
 * tail copies. */
477 and a11, a3, a8 # save unalignment offset for below
478 sub a3, a3, a11 # align a3
480 l32i a6, a3, 0 # load first word
/* Zero-overhead-loop variant: a7 iterations, skipped when a7 == 0. */
482 loopnez a7, .backLoop2done
483 #else /* !XCHAL_HAVE_LOOPS */
484 beqz a7, .backLoop2done
/* a10 becomes the src address at which the downward loop stops.
 * NOTE(review): the instruction that loads a10 before this sub is not
 * visible here. */
486 sub a10, a3, a10 # a10 = start of first 16B source chunk
487 #endif /* !XCHAL_HAVE_LOOPS */
503 #if !XCHAL_HAVE_LOOPS
504 bne a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
505 #endif /* !XCHAL_HAVE_LOOPS */
/* Tail dispatch on the low bits of len: bit 3 clear skips the 8-byte
 * shifting copy, bit 2 clear skips the 4-byte shifting copy. */
507 bbci.l a4, 3, .Lback12
519 bbci.l a4, 2, .Lback13
528 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
/* Undo the earlier rounding so a3 again points at the true src position. */
529 add a3, a3, a11 # readjust a3 with correct misalignment
/* Bits 1 and 0 of len select the final small copies (.Lback14, .Lback15). */
531 bbsi.l a4, 1, .Lback14
532 bbsi.l a4, 0, .Lback15
543 bbsi.l a4, 0, .Lback15