newlib/libc/machine/arm/memcpy-armv7m.S

   1 /*
   2  * Copyright (c) 2013 ARM Ltd
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 3. The name of the company may not be used to endorse or promote
  14  *    products derived from this software without specific prior written
  15  *    permission.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
  18  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20  * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  21  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  22  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  23  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  24  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  25  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27  */
  28
  29 /* This memcpy routine is optimised for Cortex-M3/M4 cores with/without
  30    unaligned access.
  31
  32    If compiled with GCC, this file should be enclosed within following
  33    pre-processing check:
  34    if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
  35
  36    Prototype: void *memcpy (void *dst, const void *src, size_t count);
  37
  38    The job will be done in 5 steps.
  39    Step 1: Align src/dest pointers; copy misaligned if both cannot be aligned
  40    Step 2: Repeatedly copy big blocks of size __OPT_BIG_BLOCK_SIZE
  41    Step 3: Repeatedly copy mid blocks of size __OPT_MID_BLOCK_SIZE
  42    Step 4: Copy word by word
  43    Step 5: Copy byte-to-byte
  44
  45    Tunable options:
  46      __OPT_BIG_BLOCK_SIZE: Size of big block in bytes.  Default to 64.
  47      __OPT_MID_BLOCK_SIZE: Size of mid block in bytes.  Default to 16.
  48  */
  49 #include "arm_asm.h"
  50
  51 #ifndef __OPT_BIG_BLOCK_SIZE /* big-block size in bytes; may be predefined by the build */
  52 #define __OPT_BIG_BLOCK_SIZE (4 * 16) /* default: 16 words = 64 bytes */
  53 #endif
  54
  55 #ifndef __OPT_MID_BLOCK_SIZE /* mid-block size in bytes; may be predefined by the build */
  56 #define __OPT_MID_BLOCK_SIZE (4 * 4) /* default: 4 words = 16 bytes */
  57 #endif
  58 /* BEGIN_UNROLL_BIG_BLOCK opens an .irp loop whose "offset" walks the byte offset of every word in one big block; only 16, 32 and 64 are accepted.  */
  59 #if __OPT_BIG_BLOCK_SIZE == 16
  60 #define BEGIN_UNROLL_BIG_BLOCK \
  61   .irp offset, 0,4,8,12
  62 #elif __OPT_BIG_BLOCK_SIZE == 32
  63 #define BEGIN_UNROLL_BIG_BLOCK \
  64   .irp offset, 0,4,8,12,16,20,24,28
  65 #elif __OPT_BIG_BLOCK_SIZE == 64
  66 #define BEGIN_UNROLL_BIG_BLOCK \
  67   .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
  68 #else
  69 #error "Illegal __OPT_BIG_BLOCK_SIZE"
  70 #endif
  71 /* BEGIN_UNROLL_MID_BLOCK does the same for one mid block; only 8 and 16 are accepted.  */
  72 #if __OPT_MID_BLOCK_SIZE == 8
  73 #define BEGIN_UNROLL_MID_BLOCK \
  74   .irp offset, 0,4
  75 #elif __OPT_MID_BLOCK_SIZE == 16
  76 #define BEGIN_UNROLL_MID_BLOCK \
  77   .irp offset, 0,4,8,12
  78 #else
  79 #error "Illegal __OPT_MID_BLOCK_SIZE"
  80 #endif
  81 /* END_UNROLL closes the .irp opened by either BEGIN_UNROLL_* macro.  */
  82 #define END_UNROLL .endr
  83
  84         .syntax unified
  85         .text
  86         .align  2
  87         .global memcpy
  88         .thumb
  89         .thumb_func
  90         .fnstart
  91         .cfi_sections .debug_frame
  92         .cfi_startproc
  93         .type   memcpy, %function
  94 memcpy:
  95         @ r0: dst (advanced as bytes are stored)
  96         @ r1: src (advanced as bytes are loaded)
  97         @ r2: len in bytes (kept biased by the current chunk size inside the loops); r3 is scratch
  98 #ifdef __ARM_FEATURE_UNALIGNED
  99         /* When unaligned access is supported, ip is not otherwise used in the
 100            function body, so it holds the original dst for the return value.  */
 101         prologue push_ip=HAVE_PAC_LEAF
 102         mov     ip, r0
 103 #else /* ip is used as scratch in the misaligned-src path below.  NOTE(review): prologue/epilogue are not defined in this file (presumably arm_asm.h); the 0 argument presumably saves/restores r0 -- confirm */
 104         prologue 0 push_ip=HAVE_PAC_LEAF
 105 #endif /* __ARM_FEATURE_UNALIGNED */
 106         orr     r3, r1, r0
 107         ands    r3, r3, #3 /* Z is set only when src and dst are both word-aligned */
 108         bne     .Lmisaligned_copy
 109
 110 .Lbig_block:
 111         subs    r2, __OPT_BIG_BLOCK_SIZE /* bias len by one big block; a borrow means less than one block remains */
 112         blo     .Lmid_block
 113
 114         /* Kernel loop for big block copy */
 115         .align 2
 116 .Lbig_block_loop:
 117         BEGIN_UNROLL_BIG_BLOCK
 118 #ifdef __ARM_ARCH_7EM__
 119         ldr     r3, [r1], #4
 120         str     r3, [r0], #4
 121         END_UNROLL
 122 #else /* __ARM_ARCH_7M__ */
 123         ldr     r3, [r1, \offset]
 124         str     r3, [r0, \offset]
 125         END_UNROLL
 126         adds    r0, __OPT_BIG_BLOCK_SIZE
 127         adds    r1, __OPT_BIG_BLOCK_SIZE
 128 #endif
 129         subs    r2, __OPT_BIG_BLOCK_SIZE
 130         bhs .Lbig_block_loop /* no borrow: at least one more full big block remains */
 131
 132 .Lmid_block:
 133         adds    r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE /* swap the big-block bias for a mid-block bias; carry clear means less than one mid block remains */
 134         blo     .Lcopy_word_by_word
 135
 136         /* Kernel loop for mid-block copy */
 137         .align 2
 138 .Lmid_block_loop:
 139         BEGIN_UNROLL_MID_BLOCK
 140 #ifdef __ARM_ARCH_7EM__
 141         ldr     r3, [r1], #4
 142         str     r3, [r0], #4
 143         END_UNROLL
 144 #else /* __ARM_ARCH_7M__ */
 145         ldr     r3, [r1, \offset]
 146         str     r3, [r0, \offset]
 147         END_UNROLL
 148         adds    r0, __OPT_MID_BLOCK_SIZE
 149         adds    r1, __OPT_MID_BLOCK_SIZE
 150 #endif
 151         subs    r2, __OPT_MID_BLOCK_SIZE
 152         bhs     .Lmid_block_loop
 153
 154 .Lcopy_word_by_word:
 155         adds    r2, __OPT_MID_BLOCK_SIZE - 4 /* swap the mid-block bias for a one-word bias */
 156         blo     .Lcopy_less_than_4
 157
 158         /* Kernel loop for small block copy */
 159         .align 2
 160 .Lcopy_word_by_word_loop:
 161         ldr     r3, [r1], #4
 162         str     r3, [r0], #4
 163         subs    r2, #4
 164         bhs     .Lcopy_word_by_word_loop
 165
 166 .Lcopy_less_than_4:
 167         adds    r2, #4 /* remove the one-word bias: r2 = bytes left (0..3) */
 168         beq     .Ldone
 169
 170         lsls    r2, r2, #31 /* bit 0 of the count decides Z (ne = one odd byte), bit 1 goes to carry (two more bytes) */
 171         itt ne
 172         ldrbne  r3, [r1], #1
 173         strbne  r3, [r0], #1
 174
 175         bcc     .Ldone /* carry from the lsls above is still live: clear means no 2-byte remainder */
 176 #ifdef __ARM_FEATURE_UNALIGNED
 177         ldrh    r3, [r1]
 178         strh    r3, [r0]
 179 #else
 180         ldrb    r3, [r1]
 181         strb    r3, [r0]
 182         ldrb    r3, [r1, #1]
 183         strb    r3, [r0, #1]
 184 #endif /* __ARM_FEATURE_UNALIGNED */
 185
 186 .Ldone:
 187         .cfi_remember_state
 188 #ifdef __ARM_FEATURE_UNALIGNED
 189         mov     r0, ip /* return the original dst saved at entry */
 190         epilogue push_ip=HAVE_PAC_LEAF
 191 #else
 192         epilogue 0 push_ip=HAVE_PAC_LEAF
 193 #endif /*  __ARM_FEATURE_UNALIGNED */
 194 /* Slow path, reached from the bne after the initial alignment test.  */
 195         .align 2
 196 .Lmisaligned_copy:
 197         .cfi_restore_state
 198 #ifdef __ARM_FEATURE_UNALIGNED
 199         /* Define label DST_ALIGNED to BIG_BLOCK.  It will go to aligned copy
 200            once destination is adjusted to aligned.  */
 201 #define Ldst_aligned Lbig_block
 202
 203         /* Copy word by word using LDR/STR when the hardware handles unaligned
 204         accesses.  NOTE(review): this comment used to say "SCTLR.A is set"; unaligned LDR/STR normally need alignment trapping to be off (CCR.UNALIGN_TRP on ARMv7-M) -- confirm.  */
 205
 206         cmp     r2, #8
 207         blo     .Lbyte_copy
 208
 209         /* If src is aligned, just go to the big block loop.  */
 210         lsls    r3, r1, #30 /* Z is set when the low two bits of src are zero */
 211         beq     .Ldst_aligned
 212 #else
 213         /* If len < 12, misalignment adjustment has more overhead than a
 214         plain byte-by-byte copy.  Also, len must be >= 8 to guarantee that
 215         the code afterward works correctly.  */
 216         cmp     r2, #12
 217         blo     .Lbyte_copy
 218 #endif /* __ARM_FEATURE_UNALIGNED */
 219
 220         /* Align dst only, not trying to align src.  That is because
 221         handling an aligned src with a misaligned dst needs more overhead than
 222         the reverse.  By doing this, the worst case is when the initial src is
 223         aligned: an additional copy of up to 4 bytes will be executed, which is
 224         acceptable.  */
 225
 226         ands    r3, r0, #3
 227         beq     .Ldst_aligned
 228
 229         rsb     r3, #4 /* r3 = 4 - (dst & 3): bytes needed to word-align dst (1..3) */
 230         subs    r2, r3
 231
 232         lsls    r3, r3, #31 /* same flag trick as .Lcopy_less_than_4: ne = copy one byte, carry = copy two bytes */
 233         itt ne
 234         ldrbne  r3, [r1], #1
 235         strbne  r3, [r0], #1
 236
 237         bcc .Ldst_aligned
 238
 239 #ifdef __ARM_FEATURE_UNALIGNED
 240         ldrh    r3, [r1], #2
 241         strh    r3, [r0], #2
 242         b       .Ldst_aligned
 243 #else
 244         ldrb    r3, [r1], #1
 245         strb    r3, [r0], #1
 246         ldrb    r3, [r1], #1
 247         strb    r3, [r0], #1
 248         /* Now dst is aligned.  */
 249 .Ldst_aligned:
 250         /* If r1 is aligned now, it means r0/r1 had the same misalignment,
 251         and they are both aligned now.  Go to the aligned copy.  */
 252         ands    r3, r1, #3 /* r3 = src misalignment; 1..3 when the branch below is not taken */
 253         beq     .Lbig_block
 254
 255         /* dst is aligned, but src isn't.  Misaligned copy.  */
 256
 257         push    {r4, r5}
 258         .cfi_adjust_cfa_offset 8
 259         .cfi_rel_offset 4, 0
 260         .cfi_rel_offset 5, 4
 261         subs    r2, #4 /* bias len by one word for the loop in mis_src_copy */
 262
 263         /* Move r1 back by the misaligned byte count to make it word-aligned.
 264         r1 must point at the unaligned address again after the loop, so keep
 265         (4 - misalignment) in ip and subtract it from r1 afterward.  */
 266         subs    r1, r3
 267         rsb     ip, r3, #4
 268
 269         /* Pre-load one word */
 270         ldr     r4, [r1], #4
 271
 272         cmp     r3, #2
 273         beq     .Lmisaligned_copy_2_2
 274         cmp     r3, #3
 275         beq     .Lmisaligned_copy_3_1
 276
 277         .macro mis_src_copy shift /* shift = 8 * src misalignment; r4 = previously loaded word, r3 = newly loaded word, r5 = scratch */
 278 1:
 279 #ifdef __ARM_BIG_ENDIAN
 280         lsls    r4, r4, \shift
 281 #else
 282         lsrs    r4, r4, \shift
 283 #endif
 284         ldr     r3, [r1], #4
 285 #ifdef __ARM_BIG_ENDIAN
 286         lsrs    r5, r3, 32-\shift
 287 #else
 288         lsls    r5, r3, 32-\shift
 289 #endif
 290         orr     r4, r4, r5 /* merge the unused part of the previous word with the start of the new word */
 291         str     r4, [r0], #4
 292         mov     r4, r3 /* the new word becomes the previous word for the next iteration */
 293         subs    r2, #4
 294         bhs     1b
 295         .endm
 296
 297 .Lmisaligned_copy_1_3:
 298         mis_src_copy shift=8
 299         b       .Lsrc_misaligned_tail
 300
 301 .Lmisaligned_copy_3_1:
 302         mis_src_copy shift=24
 303         b       .Lsrc_misaligned_tail
 304
 305 .Lmisaligned_copy_2_2:
 306         /* For 2_2 misalignment, ldr is still faster than 2 x ldrh.  */
 307         mis_src_copy shift=16
 308
 309 .Lsrc_misaligned_tail:
 310         adds    r2, #4 /* remove the one-word bias: r2 = bytes left (0..3) */
 311         subs    r1, ip /* step r1 back to the first src byte that has not been stored yet */
 312         pop     {r4, r5}
 313         .cfi_restore 4
 314         .cfi_restore 5
 315         .cfi_adjust_cfa_offset -8
 316
 317 #endif /* __ARM_FEATURE_UNALIGNED */
 318
 319 .Lbyte_copy:
 320         subs    r2, #4
 321         blo     .Lcopy_less_than_4 /* fewer than 4 bytes: the shared tail re-adds the 4 and finishes */
 322
 323 .Lbyte_copy_loop:
 324         subs    r2, #1 /* sets the flags tested by the bhs below; the ldrb/strb in between leave them alone */
 325         ldrb    r3, [r1], #1
 326         strb    r3, [r0], #1
 327         bhs     .Lbyte_copy_loop /* the loop copies len - 3 bytes in total */
 328 /* The last three bytes are copied without a loop.  */
 329         ldrb    r3, [r1]
 330         strb    r3, [r0]
 331         ldrb    r3, [r1, #1]
 332         strb    r3, [r0, #1]
 333         ldrb    r3, [r1, #2]
 334         strb    r3, [r0, #2]
 335
 336 #ifdef __ARM_FEATURE_UNALIGNED
 337         mov     r0, ip /* return the original dst saved at entry */
 338         epilogue push_ip=HAVE_PAC_LEAF
 339 #else
 340         epilogue 0 push_ip=HAVE_PAC_LEAF
 341 #endif /* __ARM_FEATURE_UNALIGNED */
 342         .cfi_endproc
 343         .cantunwind
 344         .fnend
 345         .size   memcpy, .-memcpy