newlib/libc/machine/arm/aeabi_memcpy-armv7a.S

   1 /*
   2  * Copyright (c) 2014 ARM Ltd
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 3. The name of the company may not be used to endorse or promote
  14  *    products derived from this software without specific prior written
  15  *    permission.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
  18  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  19  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20  * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  21  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  22  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  23  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  24  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  25  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27  */
  28
  29 #include "arm-acle-compat.h"
  30
  31 /* NOTE: This ifdef MUST match the one in aeabi_memcpy.c.  */
  32 #if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED) && \
  33         (defined (__ARM_NEON__) || !defined (__SOFTFP__))
  34
   35         .syntax unified
   36         .global __aeabi_memcpy
   37         .type   __aeabi_memcpy, %function
   38 __aeabi_memcpy:
   39         /* Copy n (r2) bytes from src (r1) to dst (r0); n >= 0, pointers valid.
   40           If there are at least 8 bytes to copy, use LDRD/STRD.
   41           If src and dst are misaligned with different offsets,
   42           first copy byte by byte until dst is aligned,
   43           and then copy word by word with LDR/STR from the unaligned src.
   44           When less than 8 left, copy a word and then byte by byte.  */
   45
   46        /* Save registers (r0 is restored on return, r4/r5 are scratch):
   47           optimized push {r0, r4, r5, lr}.
   48           To try and improve performance, stack layout changed,
   49           i.e., not keeping the stack looking like users expect
   50           (highest numbered register at highest address).  */
   51         push {r0, lr}
   52         strd r4, r5, [sp, #-8]!
   53
   54         /* Get copying of tiny blocks out of the way first.  */
   55         /* Are there at least 4 bytes to copy?  From here on r2 = n - 4.  */
   56         subs    r2, r2, #4
   57         blt     copy_less_than_4       /* If n < 4.  */
   58
   59         /* Check word alignment.  */
   60         ands    ip, r0, #3             /* ip = last 2 bits of dst.  */
   61         bne     dst_not_word_aligned   /* If dst is not word-aligned.  */
   62
   63         /* Get here if dst is word-aligned.  */
   64         ands    ip, r1, #3             /* ip = last 2 bits of src.  */
   65         bne     src_not_word_aligned   /* If src is not word-aligned.  */
   66 word_aligned:
   67         /* Get here if source and dst both are word-aligned.
   68            The number of bytes remaining to copy is r2+4.  */
   69
   70         /* Are there at least 64 bytes to copy?  */
   71         subs    r2, r2, #60
   72         blt     copy_less_than_64                /* If fewer than 64 bytes remain.  */
   73
   74         /* First, align the destination buffer to 8-bytes,
   75            to make sure double loads and stores don't cross cache line boundary,
   76            as they are then more expensive even if the data is in the cache
   77            (require two load/store issue cycles instead of one).
   78            If only one of the buffers is not 8-bytes aligned,
   79            then it's more important to align dst than src,
   80            because there is more penalty for stores
   81            than loads that cross cacheline boundary.
   82            This check and realignment are only worth doing
   83            if there is a lot to copy.  */
   84
   85         /* Get here if dst is word aligned,
   86            i.e., the 2 least significant bits are 0.
   87            If dst is not 2w aligned (i.e., bit 2 of dst, value 4, is set),
   88            then copy 1 word (4 bytes).  */
   89         ands    r3, r0, #4
   90         beq     two_word_aligned  /* If dst already two-word aligned.  */
   91         ldr     r3, [r1], #4
   92         str     r3, [r0], #4
   93         subs    r2, r2, #4
   94         blt     copy_less_than_64
   95
   96 two_word_aligned:
   97         /* TODO: Align to cacheline (useful for PLD optimization).  */
   98
   99         /* Every loop iteration copies 64 bytes; r2 = bytes remaining - 64.  */
  100 1:
  101         .irp    offset, #0, #8, #16, #24, #32, #40, #48, #56
  102         ldrd    r4, r5, [r1, \offset]
  103         strd    r4, r5, [r0, \offset]
  104         .endr
  105
  106         add     r0, r0, #64
  107         add     r1, r1, #64
  108         subs    r2, r2, #64
  109         bge     1b                     /* If at least 64 more bytes remain.  */
  110
  111 copy_less_than_64:
  112
  113         /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
  114            Rebias the count to (remaining - 8); negative means under 8 left.  */
  115         adds    r2, r2, #56
  116         blt     copy_less_than_8
  117
  118         /* Copy 8 bytes at a time.  */
  119 2:
  120         ldrd    r4, r5, [r1], #8
  121         strd    r4, r5, [r0], #8
  122         subs    r2, r2, #8
  123         bge     2b                     /* If at least 8 more bytes remain.  */
  124
  125 copy_less_than_8:
  126
  127         /* Get here if less than 8 bytes to copy, -8 <= r2 < 0.
  128            Check if there is more to copy.  */
  129         cmn     r2, #8
  130         beq     return                          /* If r2 + 8 == 0.  */
  131
  132         /* Rebias the count to (remaining - 4); negative means under 4 left.  */
  133         adds    r2, r2, #4
  134         blt     copy_less_than_4
  135
  136         /* Copy 4 bytes.  */
  137         ldr     r3, [r1], #4
  138         str     r3, [r0], #4
  139
  140 copy_less_than_4:
  141         /* Get here if less than 4 bytes to copy; (r2 + 4) mod 4 bytes remain.  */
  142
  143         /* Restore the count, check if there is more to copy.  */
  144         adds    r2, r2, #4
  145         beq     return                          /* If r2 == 0.  */
  146
  147         /* Get here with the low 2 bits of r2 holding the byte count.  */
  148         /* Shift left by 31: Z = !bit0 of the old r2, C = bit1 of it.  */
  149         lsls    r2, r2, #31
  150
  151         /* Copy byte by byte.
  152            Condition ne means the last bit of r2 is 1: copy one byte.
  153            Condition cs means the second to last bit of r2 is set,
  154            i.e., the count is 2 or 3: copy two more bytes.  */
  155         itt     ne
  156         ldrbne  r3, [r1], #1
  157         strbne  r3, [r0], #1
  158
  159         itttt   cs
  160         ldrbcs  r4, [r1], #1
  161         ldrbcs  r5, [r1]
  162         strbcs  r4, [r0], #1
  163         strbcs  r5, [r0]
  164
  165 return:
  166         /* Restore registers: optimized pop {r0, r4, r5, pc}   */
  167         ldrd r4, r5, [sp], #8
  168         pop {r0, pc}         /* This is the only return point of memcpy.  */
  169
  170 dst_not_word_aligned:
  171
  172        /* Get here when dst is not aligned and ip has the last 2 bits of dst,
  173           i.e., ip is the offset of dst from word.
  174           The number of bytes that remains to copy is r2 + 4,
  175           i.e., there are at least 4 bytes to copy.
  176           Write a partial word (1 to 3 bytes), such that dst becomes
  177           word-aligned.  */
  178
  179        /* If dst is at ip bytes offset from a word (with 0 < ip < 4),
  180           then there are (4 - ip) bytes to fill up to align dst to the next
  181           word.  */
  182         rsb     ip, ip, #4                 /* ip = #4 - ip.  */
  183         cmp     ip, #2
  184
  185        /* Copy ip bytes: one if ip > 2, one if ip >= 2, and one always.  */
  186         itt     gt
  187         ldrbgt  r3, [r1], #1
  188         strbgt  r3, [r0], #1
  189
  190         itt     ge
  191         ldrbge  r4, [r1], #1
  192         strbge  r4, [r0], #1
  193
  194         ldrb    lr, [r1], #1               /* lr is free: it was saved on entry.  */
  195         strb    lr, [r0], #1
  196
  197        /* Update the count.
  198           ip holds the number of bytes we have just copied.  */
  199         subs    r2, r2, ip                        /* r2 = r2 - ip.  */
  200         blt     copy_less_than_4                  /* If r2 < ip.  */
  201
  202        /* Get here if there are at least 4 bytes to copy.
  203           Check if src is aligned.  If beforehand src and dst were not word
  204           aligned but congruent (same offset), then now they are both
  205           word-aligned, and we can copy the rest efficiently (without
  206           shifting).  */
  207         ands    ip, r1, #3                    /* ip = last 2 bits of src.  */
  208         beq     word_aligned                  /* If r1 is word-aligned.  */
  209
  210 src_not_word_aligned:
  211        /* Get here when src is not word-aligned, but dst is word-aligned.
  212           The number of bytes that remains to copy is r2+4.  */
  213
  214        /* Copy word by word using LDR from the unaligned src; this relies on
  215           hardware unaligned access support, i.e., SCTLR.A being clear.  */
  216         subs    r2, r2, #60
  217         blt     8f
  218
  219 7:
  220         /* Copy 64 bytes in every loop iteration.  */
  221         .irp    offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
  222         ldr     r3, [r1, \offset]
  223         str     r3, [r0, \offset]
  224         .endr
  225
  226         add     r0, r0, #64
  227         add     r1, r1, #64
  228         subs    r2, r2, #64
  229         bge     7b
  230
  231 8:
  232         /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
  233            Rebias the count to (remaining - 4); negative means under 4 left.  */
  234         adds    r2, r2, #60
  235         blt     copy_less_than_4
  236
  237 9:
  238        /* Get here if there is less than 64 but at least 4 bytes to copy,
  239           where the number of bytes to copy is r2+4.  */
  240         ldr     r3, [r1], #4
  241         str     r3, [r0], #4
  242         subs    r2, r2, #4
  243         bge     9b
  244
  245         b       copy_less_than_4
 246
 247
 248         .syntax unified
 249         .global __aeabi_memcpy4
 250         .type   __aeabi_memcpy4, %function
 251 __aeabi_memcpy4:
 252         /* Assumes that both of its arguments are 4-byte aligned.  */
 253
 254         push {r0, lr}
 255         strd r4, r5, [sp, #-8]!
 256
 257         /* Is there at least 4 bytes to copy?  */
 258         subs    r2, r2, #4
 259         blt     copy_less_than_4       /* If n < 4.  */
 260
 261         bl      word_aligned
 262
 263         .syntax unified
 264         .global __aeabi_memcpy8
 265         .type   __aeabi_memcpy8, %function
 266 __aeabi_memcpy8:
 267         /* Assumes that both of its arguments are 8-byte aligned.  */
 268
 269         push {r0, lr}
 270         strd r4, r5, [sp, #-8]!
 271
 272         /* Is there at least 4 bytes to copy?  */
 273         subs    r2, r2, #4
 274         blt     copy_less_than_4        /* If n < 4.  */
 275
 276         /* Is there at least 8 bytes to copy?  */
 277         subs    r2, r2, #4
 278         blt     copy_less_than_8        /* If n < 8.  */
 279
 280         /* Is there at least 64 bytes to copy?  */
 281         subs    r2, r2, #56
 282         blt     copy_less_than_64       /* if n + 8 < 64.  */
 283
 284         bl      two_word_aligned
 285
 286 #endif