newlib/libc/machine/xtensa/memcpy.S

   1 /* ANSI C standard library function memcpy.
   2
   3    Copyright (c) 2002-2008 Tensilica Inc.
   4
   5    Permission is hereby granted, free of charge, to any person obtaining
   6    a copy of this software and associated documentation files (the
   7    "Software"), to deal in the Software without restriction, including
   8    without limitation the rights to use, copy, modify, merge, publish,
   9    distribute, sublicense, and/or sell copies of the Software, and to
  10    permit persons to whom the Software is furnished to do so, subject to
  11    the following conditions:
  12
  13    The above copyright notice and this permission notice shall be included
  14    in all copies or substantial portions of the Software.
  15
  16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  17    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  18    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  19    IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  20    CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  21    TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  22    SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
  23
  24 #include "xtensa-asm.h"
  25
  26 /* If the Xtensa Unaligned Load Exception option is not used, this
  27    code can run a few cycles faster by relying on the low address bits
  28    being ignored.  However, if the code is then run with an Xtensa ISS
  29    client that checks for unaligned accesses, it will produce a lot of
  30    warning messages.  Set this flag to disable the use of unaligned
  31    accesses and keep the ISS happy.  */
  32
     /* The configuration-derived definition below is commented out and the
        flag is forced to 1, so the unaligned-source path always rounds the
        source pointer down to a word boundary before its word loads.  */
  33 /* #define UNALIGNED_ADDRESSES_CHECKED XCHAL_UNALIGNED_LOAD_EXCEPTION */
  34 #define UNALIGNED_ADDRESSES_CHECKED 1
  35
  36
  37 /* void *memcpy (void *dst, const void *src, size_t len)
  38
  39    The algorithm is as follows:
  40
  41    If the destination is unaligned, align it by conditionally
  42    copying 1- and/or 2-byte pieces.
  43
  44    If the source is aligned, copy 16 bytes with a loop, and then finish up
  45    with 8, 4, 2, and 1-byte copies conditional on the length.
  46
  47    Else (if source is unaligned), do the same, but use SRC to align the
  48    source data.
  49
  50    This code tries to use fall-through branches for the common
  51    case of aligned source and destination and multiple of 4 (or 8) length.  */
  52
  53
  54 /* Byte by byte copy.  */
     /* Fallback path: copies a4 bytes from the address in a3 to the
        address in a5, one byte per iteration, and then returns.  It is
        entered at .Lbytecopy from the unaligned-destination code when
        fewer than 7 (or 6) bytes remain.  a6 is scratch; a7 holds the
        source end address when XCHAL_HAVE_LOOPS is 0.  a2 is not
        touched here.  */
  55
  56         .text
  57         .begin schedule
  58         .align  XCHAL_INST_FETCH_WIDTH
  59         .literal_position
  60 __memcpy_aux:
  61
  62         /* Skip bytes to get proper alignment for three-byte loop */
  63 .skip XCHAL_INST_FETCH_WIDTH - 3
  64
  65 .Lbytecopy:
  66 #if XCHAL_HAVE_LOOPS
  67         loopnez a4, 2f          // hardware loop: body runs a4 times, skipped if a4 == 0
  68 #else
  69         beqz    a4, 2f          // nothing left to copy
  70         add     a7, a3, a4      // a7 = end address for source
  71 #endif
  72 1:      l8ui    a6, a3, 0       // a6 = *src
  73         addi    a3, a3, 1
  74 #if XTENSA_ESP32_PSRAM_CACHE_FIX
  75         nop                     // NOTE(review): padding between the load and the store,
  76         nop                     // presumably required by the PSRAM cache workaround;
  77         nop                     // confirm against the erratum description
  78 #endif
  79         s8i     a6, a5, 0       // *dst = a6
  80         addi    a5, a5, 1
  81 #if XTENSA_ESP32_PSRAM_CACHE_FIX
  82         memw                    // barrier after every byte store in workaround builds
  83 #endif
  84 #if !XCHAL_HAVE_LOOPS
  85         bltu    a3, a7, 1b      // repeat until src reaches the end address
  86 #endif
  87 2:      leaf_return
  88
  89
  90 /* Destination is unaligned.  */
     /* Reached from the memcpy entry when bit 0 (.Ldst1mod2) or bit 1
        (.Ldst2mod4) of dst is set.  Copies one and/or two leading bytes
        so that a5 becomes word-aligned, adjusting a3 and a4 to match,
        then jumps to .Ldstaligned.  Short copies are handed to
        .Lbytecopy instead.  Uses a6 and a7 as scratch.  */
  91
  92         .align  4
  93 .Ldst1mod2: // dst is only byte aligned
  94
  95         /* Do short copies byte-by-byte.  */
  96         bltui   a4, 7, .Lbytecopy       // fewer than 7 bytes: not worth aligning
  97
  98         /* Copy 1 byte.  */
  99         l8ui    a6, a3, 0
 100         addi    a3, a3, 1
 101         addi    a4, a4, -1              // one byte less to copy
 102         s8i     a6, a5, 0
 103 #if XTENSA_ESP32_PSRAM_CACHE_FIX
 104         memw
 105 #endif
 106         addi    a5, a5, 1               // a5 is now even
 107
 108         /* Return to main algorithm if dst is now aligned.  */
 109         bbci.l  a5, 1, .Ldstaligned     // bit 1 clear: a5 is a multiple of 4
 110
 111 .Ldst2mod4: // dst has 16-bit alignment
 112
 113         /* Do short copies byte-by-byte.  */
 114         bltui   a4, 6, .Lbytecopy       // fewer than 6 bytes: not worth aligning
 115
 116         /* Copy 2 bytes.  */
 117         l8ui    a6, a3, 0               // byte loads: src alignment has not been checked yet
 118         l8ui    a7, a3, 1
 119         addi    a3, a3, 2
 120         addi    a4, a4, -2              // two bytes less to copy
 121         s8i     a6, a5, 0
 122         s8i     a7, a5, 1
 123 #if XTENSA_ESP32_PSRAM_CACHE_FIX
 124         memw
 125 #endif
 126         addi    a5, a5, 2
 127
 128         /* dst is now aligned; return to main algorithm.  */
 129         j       .Ldstaligned
 130
 131
 132         .align  4
 133         .global memcpy
 134         .type   memcpy, @function
     /* void *memcpy (void *dst, const void *src, size_t len)

        In:   a2 = dst, a3 = src, a4 = len.
        Out:  a2 = dst (a2 is never written; a5 is the working dst pointer).
        This block uses a5-a8 as scratch.

        The entry dispatches on the low bits of dst and src.  The code that
        follows .Ldstaligned in this block handles the case where both
        pointers are word-aligned: a 16-byte-per-iteration word loop, then
        8/4/2/1-byte pieces selected by the low four bits of a4.  */
 135 memcpy:
 136         leaf_entry sp, 16       // macro from xtensa-asm.h; NOTE(review): presumably the ABI prologue -- confirm
 137         /* a2 = dst, a3 = src, a4 = len */
 138
 139         mov     a5, a2          // copy dst so that a2 is return value
 140         bbsi.l  a2, 0, .Ldst1mod2       // dst is odd
 141         bbsi.l  a2, 1, .Ldst2mod4       // dst is even but not a multiple of 4
 142 .Ldstaligned:
 143
 144         /* Get number of loop iterations with 16B per iteration.  */
 145         srli    a7, a4, 4       // a7 = a4 / 16
 146
 147         /* Check if source is aligned.  */
 148         slli    a8, a3, 30      // a8 = low two bits of src, moved to the top
 149         bnez    a8, .Lsrcunaligned      // nonzero: src is not word-aligned
 150
 151         /* Destination and source are word-aligned, use word copy.  */
 152 #if XCHAL_HAVE_LOOPS
 153         loopnez a7, 2f          // hardware loop: body runs a7 times, skipped if a7 == 0
 154 #else
 155         beqz    a7, 2f          // no full 16-byte chunk
 156         slli    a8, a7, 4       // a8 = bytes covered by the loop (a7 is reused as data below)
 157         add     a8, a8, a3      // a8 = end of last 16B source chunk
 158 #endif
 159
 160 #if XTENSA_ESP32_PSRAM_CACHE_FIX
 161
     /* Workaround variant: two loads, two stores, then a memw barrier,
        twice per iteration.  */
 162 1:      l32i    a6, a3, 0
 163         l32i    a7, a3, 4
 164         s32i    a6, a5, 0
 165         s32i    a7, a5, 4
 166         memw
 167         l32i    a6, a3, 8
 168         l32i    a7, a3, 12
 169         s32i    a6, a5, 8
 170         s32i    a7, a5, 12
 171         memw
 172
 173         addi    a3, a3, 16
 174         addi    a5, a5, 16
 175
 176 #else
 177
     /* Default variant: the same four word copies with loads and stores
        interleaved, alternating between a6 and a7.  */
 178 1:      l32i    a6, a3, 0
 179         l32i    a7, a3, 4
 180         s32i    a6, a5, 0
 181         l32i    a6, a3, 8
 182         s32i    a7, a5, 4
 183         l32i    a7, a3, 12
 184         s32i    a6, a5, 8
 185         addi    a3, a3, 16
 186         s32i    a7, a5, 12
 187         addi    a5, a5, 16
 188
 189 #endif
 190
 191
 192 #if !XCHAL_HAVE_LOOPS
 193         bltu    a3, a8, 1b      // repeat until src reaches the end of the last chunk
 194 #endif
 195
 196         /* Copy any leftover pieces smaller than 16B.  */
 197 2:      bbci.l  a4, 3, 3f       // length bit 3 clear: no 8-byte piece
 198
 199         /* Copy 8 bytes.  */
 200         l32i    a6, a3, 0
 201         l32i    a7, a3, 4
 202         addi    a3, a3, 8
 203         s32i    a6, a5, 0
 204         s32i    a7, a5, 4
 205         addi    a5, a5, 8
 206
 207 3:      bbsi.l  a4, 2, 4f       // length bit 2 set: 4-byte piece
 208         bbsi.l  a4, 1, 5f       // length bit 1 set: 2-byte piece
 209         bbsi.l  a4, 0, 6f       // length bit 0 set: final byte
 210 #if XTENSA_ESP32_PSRAM_CACHE_FIX
 211         memw
 212 #endif
 213         leaf_return             // remaining length was a multiple of 8
 214
 215         .align 4
 216         /* Copy 4 bytes.  */
 217 4:      l32i    a6, a3, 0
 218         addi    a3, a3, 4
 219         s32i    a6, a5, 0
 220         addi    a5, a5, 4
 221         bbsi.l  a4, 1, 5f
 222         bbsi.l  a4, 0, 6f
 223 #if XTENSA_ESP32_PSRAM_CACHE_FIX
 224         memw
 225 #endif
 226         leaf_return
 227
 228         /* Copy 2 bytes.  */
 229 5:      l16ui   a6, a3, 0       // halfword access: a3 and a5 have only advanced by multiples of 4
 230         addi    a3, a3, 2
 231         s16i    a6, a5, 0
 232         addi    a5, a5, 2
 233         bbsi.l  a4, 0, 6f
 234 #if XTENSA_ESP32_PSRAM_CACHE_FIX
 235         memw
 236 #endif
 237         leaf_return
 238
 239         /* Copy 1 byte.  */
 240 6:      l8ui    a6, a3, 0
 241         s8i     a6, a5, 0
 242
     /* Common exit; also the target of the zero-length check at
        .Lsrcunaligned.  */
 243 .Ldone:
 244 #if XTENSA_ESP32_PSRAM_CACHE_FIX
 245         memw
 246 #endif
 247         leaf_return
 248
 249
 250 /* Destination is aligned; source is unaligned.  */
     /* Entered from .Ldstaligned with:
          a3 = src (low two bits nonzero), a5 = dst (word-aligned),
          a4 = remaining length, a7 = a4 / 16, a8 = a3 << 30.
        Uses a6-a11 as scratch; a2 (the return value) is not touched.

        Each destination word is built from two neighbouring source words
        with src_b, using the shift amount that ssa8 sets from the low
        bits of a3.  a6 carries the most recently loaded source word from
        one step to the next.  The final 2- and 1-byte pieces are copied
        with byte accesses from the true (unaligned) source address.

        NOTE(review): ssa8 and src_b are not defined in this file;
        presumably they are endian-aware wrappers from xtensa-asm.h --
        confirm there.

        NOTE(review): unlike the aligned-source loop, the word copies in
        this path issue no memw between stores when
        XTENSA_ESP32_PSRAM_CACHE_FIX is set; only the return paths do.
        Confirm whether that is intentional.  */
 251
 252         .align  4
 253 .Lsrcunaligned:
 254         /* Avoid loading anything for zero-length copies.  */
 255         beqz    a4, .Ldone
 256
 257         /* Copy 16 bytes per iteration for word-aligned dst and
 258            unaligned src.  */
 259         ssa8    a3              // set shift amount from byte offset
 260 #if UNALIGNED_ADDRESSES_CHECKED
 261         srli    a11, a8, 30     // save unalignment offset for below
 262         sub     a3, a3, a11     // align a3
 263 #endif
 264         l32i    a6, a3, 0       // load first word
 265 #if XCHAL_HAVE_LOOPS
 266         loopnez a7, 2f          // hardware loop: body runs a7 times, skipped if a7 == 0
 267 #else
 268         beqz    a7, 2f          // no full 16-byte chunk
 269         slli    a10, a7, 4      // a10 = bytes covered by the loop (a7 is reused as data below)
 270         add     a10, a10, a3    // a10 = end of last 16B source chunk
 271 #endif
 272 1:      l32i    a7, a3, 4
 273         l32i    a8, a3, 8
 274         src_b   a6, a6, a7      // combine carried word with the next one
 275         s32i    a6, a5, 0
 276         l32i    a9, a3, 12
 277         src_b   a7, a7, a8
 278         s32i    a7, a5, 4
 279         l32i    a6, a3, 16      // becomes the carried word for the next step
 280         src_b   a8, a8, a9
 281         s32i    a8, a5, 8
 282         addi    a3, a3, 16
 283         src_b   a9, a9, a6
 284         s32i    a9, a5, 12
 285         addi    a5, a5, 16
 286 #if !XCHAL_HAVE_LOOPS
 287         bltu    a3, a10, 1b     // repeat until src reaches the end of the last chunk
 288 #endif
 289
 290 2:      bbci.l  a4, 3, 3f       // length bit 3 clear: no 8-byte piece
 291
 292         /* Copy 8 bytes.  */
 293         l32i    a7, a3, 4
 294         l32i    a8, a3, 8
 295         src_b   a6, a6, a7
 296         s32i    a6, a5, 0
 297         addi    a3, a3, 8
 298         src_b   a7, a7, a8
 299         s32i    a7, a5, 4
 300         addi    a5, a5, 8
 301         mov     a6, a8          // keep the last loaded word as the carried word
 302
 303 3:      bbci.l  a4, 2, 4f       // length bit 2 clear: no 4-byte piece
 304
 305         /* Copy 4 bytes.  */
 306         l32i    a7, a3, 4
 307         addi    a3, a3, 4
 308         src_b   a6, a6, a7
 309         s32i    a6, a5, 0
 310         addi    a5, a5, 4
 311         mov     a6, a7          // keep the last loaded word as the carried word
 312 4:
 313 #if UNALIGNED_ADDRESSES_CHECKED
 314         add     a3, a3, a11     // readjust a3 with correct misalignment
 315 #endif
 316         bbsi.l  a4, 1, 5f       // length bit 1 set: 2-byte piece
 317         bbsi.l  a4, 0, 6f       // length bit 0 set: final byte
     /* No 1- or 2-byte tail.  Word stores always precede this point
        (a4 is nonzero here and its two low bits are clear), so issue the
        same barrier as the other return paths in workaround builds.  */
     #if XTENSA_ESP32_PSRAM_CACHE_FIX
             memw
     #endif
 318         leaf_return
 319
 320         /* Copy 2 bytes.  */
 321 5:      l8ui    a6, a3, 0       // byte accesses: a3 is not halfword-aligned in general
 322         l8ui    a7, a3, 1
 323         addi    a3, a3, 2
 324         s8i     a6, a5, 0
 325         s8i     a7, a5, 1
 326         addi    a5, a5, 2
 327         bbsi.l  a4, 0, 6f
 328 #if XTENSA_ESP32_PSRAM_CACHE_FIX
 329         memw
 330 #endif
 331         leaf_return
 332
 333         /* Copy 1 byte.  */
 334 6:      l8ui    a6, a3, 0
 335         s8i     a6, a5, 0
 336 #if XTENSA_ESP32_PSRAM_CACHE_FIX
 337         memw
 338 #endif
 339         leaf_return
 340
 341         .end schedule
 342
 343         .size   memcpy, . - memcpy