arch/xtensa/lib/memcopy.S

   1 /*
   2  * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
   3  * xthal_memcpy and xthal_bcopy
   4  *
   5  * This file is subject to the terms and conditions of the GNU General Public
   6  * License.  See the file "COPYING" in the main directory of this archive
   7  * for more details.
   8  *
   9  * Copyright (C) 2002 - 2005 Tensilica Inc.
  10  */
  11
  12 #include <xtensa/coreasm.h>
  13
  14         .macro  src_b   r, w0, w1
     /*
      * Emit SRC with destination r.  The order of the two source words
      * follows the configured byte order: (w0, w1) when __XTENSA_EB__ is
      * defined, (w1, w0) otherwise.
      */
  15 #ifndef __XTENSA_EB__
  16         src     \r, \w1, \w0
  17 #else /* __XTENSA_EB__ */
  18         src     \r, \w0, \w1
  19 #endif /* __XTENSA_EB__ */
  20         .endm
  21
  22         .macro  ssa8    r
     /*
      * Set the shift amount from the byte offset held in r, using the
      * variant that matches the configured byte order: SSA8B when
      * __XTENSA_EB__ is defined, SSA8L otherwise.
      */
  23 #ifndef __XTENSA_EB__
  24         ssa8l   \r
  25 #else /* __XTENSA_EB__ */
  26         ssa8b   \r
  27 #endif /* __XTENSA_EB__ */
  28         .endm
  29
  30
  31 /*
  32  * void *memcpy(void *dst, const void *src, size_t len);
  33  * void *memmove(void *dst, const void *src, size_t len);
  34  * void *bcopy(const void *src, void *dst, size_t len);
  35  *
  36  * This function is intended to do the same thing as the standard
  37  * library function memcpy() (or bcopy()) for most cases.
  38  * However, where the source and/or destination references
  39  * an instruction RAM or ROM or a data RAM or ROM, that
  40  * source and/or destination will always be accessed with
  41  * 32-bit load and store instructions (as required for these
  42  * types of devices).
  43  *
  44  * !!!!!!!  XTFIXME:
  45  * !!!!!!!  Handling of IRAM/IROM has not yet been implemented, so
  46  * !!!!!!!  the 32-bit-only access described above is NOT guaranteed.
  47  *
  48  * The bcopy version is provided here to avoid the overhead
  49  * of an extra call, for callers that require this convention.
  50  *
  51  * The (general case) algorithm is as follows:
  52  *   If destination is unaligned, align it by conditionally
  53  *     copying 1 and 2 bytes.
  54  *   If source is aligned,
  55  *     do 16 bytes with a loop, and then finish up with
  56  *     8, 4, 2, and 1 byte copies conditional on the length;
  57  *   else (if source is unaligned),
  58  *     do the same, but use SRC to align the source data.
  59  *   This code tries to use fall-through branches for the common
  60  *     case of aligned source and destination and multiple
  61  *     of 4 (or 8) length.
  62  *
  63  * Register use:
  64  *      a0/ return address
  65  *      a1/ stack pointer
  66  *      a2/ return value
  67  *      a3/ src
  68  *      a4/ length
  69  *      a5/ dst
  70  *      a6/ tmp
  71  *      a7/ tmp
  72  *      a8/ tmp
  73  *      a9/ tmp
  74  *      a10/ tmp
  75  *      a11/ tmp
  76  */
  77
  78         .text
  79         .align  4
  80         .global bcopy
  81         .type   bcopy,@function
     /*
      * void *bcopy(const void *src, void *dst, size_t len);
      *
      * Entry point for callers that pass (src, dst, len).  The first two
      * arguments are exchanged so that the registers hold what .Lcommon
      * expects (a2 = dst / return value, a3 = src, a5 = working dst), and
      * control then joins the shared copy code.
      */
  82 bcopy:
  83         entry   sp, 16          # minimal stack frame
  84         # a2=src, a3=dst, a4=len
  85         mov     a5, a3          # a5 = dst (working destination pointer)
  86         mov     a3, a2          # a3 = src
  87         mov     a2, a5          # a2 = dst, which is also the return value
  88         j       .Lcommon        # go to common code for memcpy+bcopy
  89
  90
  91 /*
  92  * Byte by byte copy
  93  */
  94         .align  4
  95         .byte   0               # 1 mod 4 alignment for LOOPNEZ
  96                                 # (0 mod 4 alignment for LBEG)
  97 .Lbytecopy:
     /*
      * Copy a4 bytes from a3 (src) to a5 (dst) one byte at a time, then
      * return to the caller.  A zero length copies nothing.
      *
      * Without zero-overhead loops, a7 holds the address one past the
      * last source byte and the loop runs until a3 reaches it.  The loop
      * test is an equality test on purpose: a signed "less than" on
      * addresses ends the loop early when the buffer crosses 0x80000000.
      */
  98 #if XCHAL_HAVE_LOOPS
  99         loopnez a4, .Lbytecopydone      # repeat a4 times, skip if a4 == 0
 100 #else /* !XCHAL_HAVE_LOOPS */
 101         beqz    a4, .Lbytecopydone      # nothing to copy
 102         add     a7, a3, a4      # a7 = end address for source
 103 #endif /* !XCHAL_HAVE_LOOPS */
 104 .Lnextbyte:
 105         l8ui    a6, a3, 0       # a6 = *src
 106         addi    a3, a3, 1       # src++
 107         s8i     a6, a5, 0       # *dst = a6
 108         addi    a5, a5, 1       # dst++
 109 #if !XCHAL_HAVE_LOOPS
 110         bne     a3, a7, .Lnextbyte      # continue until src reaches the end
 111 #endif /* !XCHAL_HAVE_LOOPS */
 112 .Lbytecopydone:
 113         retw
 114
 115 /*
 116  * Destination is unaligned
 117  */
 118
 119         .align  4
     /*
      * Entered from .Lcommon when dst is not word-aligned.  Copies one
      * and/or two leading bytes so that dst becomes word-aligned, then
      * jumps back to .Ldstaligned.  Copies shorter than the thresholds
      * tested below are handed to .Lbytecopy instead.
      * Registers: a3 = src, a4 = remaining length, a5 = dst; a6 and a7
      * are scratch.
      */
 120 .Ldst1mod2:     # dst is only byte aligned (bit 0 of dst is set)
 121         _bltui  a4, 7, .Lbytecopy       # fewer than 7 bytes: copy byte by byte
 122
 123         # copy 1 byte
 124         l8ui    a6, a3,  0
 125         addi    a3, a3,  1      # src += 1
 126         addi    a4, a4, -1      # len -= 1
 127         s8i     a6, a5,  0
 128         addi    a5, a5,  1      # dst += 1
 129         _bbci.l a5, 1, .Ldstaligned     # if bit 1 of dst is clear, dst is now
 130                                         # word-aligned: return to main algorithm
 131 .Ldst2mod4:     # dst 16-bit aligned
 132         # copy 2 bytes
 133         _bltui  a4, 6, .Lbytecopy       # fewer than 6 bytes: copy byte by byte
 134         l8ui    a6, a3,  0      # byte loads: src alignment is not known here
 135         l8ui    a7, a3,  1
 136         addi    a3, a3,  2      # src += 2
 137         addi    a4, a4, -2      # len -= 2
 138         s8i     a6, a5,  0
 139         s8i     a7, a5,  1
 140         addi    a5, a5,  2      # dst += 2
 141         j       .Ldstaligned    # dst is now aligned, return to main algorithm
 142
 143         .align  4
     /*
      * void *memmove(void *dst, const void *src, size_t len);
      *
      * In:  a2 = dst, a3 = src, a4 = len.    Out: a2 = dst.
      *
      * Unlike memcpy, the two buffers may overlap.  When dst lies inside
      * [src, src + len) a forward copy would overwrite source bytes
      * before they have been read, so that case is copied backward, one
      * byte at a time, starting with the last byte.  Every other case
      * joins the forward copy at .Lcommon.
      *
      * This entry point is placed ahead of memcpy so that the code
      * between .Ldstaligned and .Lsrcunaligned does not grow.
      */
             .global memmove
             .type   memmove,@function
     memmove:
             entry   sp, 16          # minimal stack frame
             mov     a5, a2          # a5 = working dst, a2 stays the return value
             sub     a6, a2, a3      # a6 = dst - src (modulo 2^32)
             bgeu    a6, a4, .Lcommon        # dst not in [src, src+len): go forward
             /* Here 0 <= dst - src < len, hence len is not zero. */
             add     a3, a3, a4      # a3 = one past the last source byte
             add     a5, a5, a4      # a5 = one past the last destination byte
     .Lbackbyte:
             addi    a3, a3, -1
             l8ui    a6, a3,  0
             addi    a5, a5, -1
             s8i     a6, a5,  0
             addi    a4, a4, -1
             bnez    a4, .Lbackbyte  # until all len bytes have been moved
             retw

     /*
      * void *memcpy(void *dst, const void *src, size_t len);
      *
      * In:  a2 = dst, a3 = src, a4 = len.    Out: a2 = dst.
      * Forward copy.  bcopy and the non-overlapping memmove cases enter
      * at .Lcommon with a2 = dst, a3 = src, a4 = len and a5 = dst.
      */
             .align  4
 144         .global memcpy
 145         .type   memcpy,@function
 146 memcpy:
 150
 151         entry   sp, 16          # minimal stack frame
 152         # a2/ dst, a3/ src, a4/ len
 153         mov     a5, a2          # copy dst so that a2 is return value
 154 .Lcommon:
 155         _bbsi.l a2, 0, .Ldst1mod2       # if dst is 1 mod 2
 156         _bbsi.l a2, 1, .Ldst2mod4       # if dst is 2 mod 4
 157 .Ldstaligned:   # return here from .Ldst?mod? once dst is aligned
 158         srli    a7, a4, 4       # number of loop iterations with 16B
 159                                 # per iteration
 160         movi    a8, 3           # if source is not aligned,
 161         _bany   a3, a8, .Lsrcunaligned  # then use shifting copy
 162         /*
 163          * Destination and source are word-aligned, use word copy.
 164          */
 165         # copy 16 bytes per iteration for word-aligned dst and word-aligned src
 166 #if XCHAL_HAVE_LOOPS
 167         loopnez a7, .Loop1done
 168 #else /* !XCHAL_HAVE_LOOPS */
 169         beqz    a7, .Loop1done
 170         slli    a8, a7, 4
 171         add     a8, a8, a3      # a8 = end of last 16B source chunk
 172 #endif /* !XCHAL_HAVE_LOOPS */
 173 .Loop1:
 174         l32i    a6, a3,  0      # loads and stores alternate between a6
 175         l32i    a7, a3,  4      # and a7 so that each store uses a word
 176         s32i    a6, a5,  0      # that was loaded earlier
 177         l32i    a6, a3,  8
 178         s32i    a7, a5,  4
 179         l32i    a7, a3, 12
 180         s32i    a6, a5,  8
 181         addi    a3, a3, 16
 182         s32i    a7, a5, 12
 183         addi    a5, a5, 16
 184 #if !XCHAL_HAVE_LOOPS
             /*
              * a3 advances in steps of 16 from its start value towards
              * a8 = start + 16 * iterations, so it reaches a8 exactly.
              * Test for equality: a signed "less than" on addresses ends
              * the loop early when the buffer crosses 0x80000000.
              */
 185         bne     a3, a8, .Loop1
 186 #endif /* !XCHAL_HAVE_LOOPS */
 187 .Loop1done:
             # the low four bits of len (a4) select the tail copies below
 188         bbci.l  a4, 3, .L2
 189         # copy 8 bytes
 190         l32i    a6, a3,  0
 191         l32i    a7, a3,  4
 192         addi    a3, a3,  8
 193         s32i    a6, a5,  0
 194         s32i    a7, a5,  4
 195         addi    a5, a5,  8
 196 .L2:
 197         bbsi.l  a4, 2, .L3
 198         bbsi.l  a4, 1, .L4
 199         bbsi.l  a4, 0, .L5
 200         retw
 201 .L3:
 202         # copy 4 bytes
 203         l32i    a6, a3,  0
 204         addi    a3, a3,  4
 205         s32i    a6, a5,  0
 206         addi    a5, a5,  4
 207         bbsi.l  a4, 1, .L4
 208         bbsi.l  a4, 0, .L5
 209         retw
 210 .L4:
 211         # copy 2 bytes
 212         l16ui   a6, a3,  0
 213         addi    a3, a3,  2
 214         s16i    a6, a5,  0
 215         addi    a5, a5,  2
 216         bbsi.l  a4, 0, .L5
 217         retw
 218 .L5:
 219         # copy 1 byte
 220         l8ui    a6, a3,  0
 221         s8i     a6, a5,  0
 222         retw
 223
 224 /*
 225  * Destination is aligned, Source is unaligned
 226  */
 227
 228         .align  4
     /*
      * Forward copy for a word-aligned dst and a src that is not
      * word-aligned.  Entered from .Ldstaligned with:
      *      a3 = src, a4 = len, a5 = dst,
      *      a7 = len / 16 (number of 16-byte iterations), a8 = 3.
      * Source words are read with aligned 32-bit loads and each pair of
      * neighbouring words is combined with src_b into one destination
      * word.  a6 always carries the most recently loaded, not yet fully
      * consumed source word into the next step.
      */
 229 .Lsrcunaligned:
 230         _beqz   a4, .Ldone      # avoid loading anything for zero-length copies
 231         # copy 16 bytes per iteration for word-aligned dst and unaligned src
 232         ssa8    a3              # set shift amount from byte offset
 233 #define SIM_CHECKS_ALIGNMENT    1       /* set to 1 when running on ISS (simulator) with the
 234                                            lint or ferret client, or 0 to save a few cycles */
 235 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
 236         and     a11, a3, a8     # save unalignment offset for below
 237         sub     a3, a3, a11     # align a3
 238 #endif
 239         l32i    a6, a3, 0       # load first word
 240 #if XCHAL_HAVE_LOOPS
 241         loopnez a7, .Loop2done
 242 #else /* !XCHAL_HAVE_LOOPS */
 243         beqz    a7, .Loop2done
 244         slli    a10, a7, 4
 245         add     a10, a10, a3    # a10 = end of last 16B source chunk
 246 #endif /* !XCHAL_HAVE_LOOPS */
 247 .Loop2:
 248         l32i    a7, a3,  4
 249         l32i    a8, a3,  8
 250         src_b   a6, a6, a7
 251         s32i    a6, a5,  0
 252         l32i    a9, a3, 12
 253         src_b   a7, a7, a8
 254         s32i    a7, a5,  4
 255         l32i    a6, a3, 16      # first word of the next 16-byte chunk
 256         src_b   a8, a8, a9
 257         s32i    a8, a5,  8
 258         addi    a3, a3, 16
 259         src_b   a9, a9, a6
 260         s32i    a9, a5, 12
 261         addi    a5, a5, 16
 262 #if !XCHAL_HAVE_LOOPS
             /*
              * a3 advances in steps of 16 from its start value towards
              * a10 = start + 16 * iterations, so it reaches a10 exactly.
              * Test for equality: a signed "less than" on addresses ends
              * the loop early when the buffer crosses 0x80000000.
              */
 263         bne     a3, a10, .Loop2
 264 #endif /* !XCHAL_HAVE_LOOPS */
 265 .Loop2done:
             # the low four bits of len (a4) select the tail copies below
 266         bbci.l  a4, 3, .L12
 267         # copy 8 bytes
 268         l32i    a7, a3,  4
 269         l32i    a8, a3,  8
 270         src_b   a6, a6, a7
 271         s32i    a6, a5,  0
 272         addi    a3, a3,  8
 273         src_b   a7, a7, a8
 274         s32i    a7, a5,  4
 275         addi    a5, a5,  8
 276         mov     a6, a8          # keep the last loaded word for the next step
 277 .L12:
 278         bbci.l  a4, 2, .L13
 279         # copy 4 bytes
 280         l32i    a7, a3,  4
 281         addi    a3, a3,  4
 282         src_b   a6, a6, a7
 283         s32i    a6, a5,  0
 284         addi    a5, a5,  4
 285         mov     a6, a7          # keep the last loaded word for the next step
 286 .L13:
 287 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
 288         add     a3, a3, a11     # readjust a3 with correct misalignment
 289 #endif
             # the remaining 0-3 bytes are copied with byte loads and stores
 290         bbsi.l  a4, 1, .L14
 291         bbsi.l  a4, 0, .L15
 292 .Ldone: retw
 293 .L14:
 294         # copy 2 bytes
 295         l8ui    a6, a3,  0
 296         l8ui    a7, a3,  1
 297         addi    a3, a3,  2
 298         s8i     a6, a5,  0
 299         s8i     a7, a5,  1
 300         addi    a5, a5,  2
 301         bbsi.l  a4, 0, .L15
 302         retw
 303 .L15:
 304         # copy 1 byte
 305         l8ui    a6, a3,  0
 306         s8i     a6, a5,  0
 307         retw
 308 \f
 309 /*
 310  * Local Variables:
 311  * mode:fundamental
 312  * comment-start: "# "
 313  * comment-start-skip: "# *"
 314  * End:
 315  */