libc/AOR_v20.02/string/aarch64/memcpy.S

   1 /*
   2  * memcpy - copy memory area
   3  *
   4  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   5  * See https://llvm.org/LICENSE.txt for license information.
   6  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   7  */
   8
   9 /* Assumptions:
  10  *
  11  * ARMv8-a, AArch64, unaligned accesses.
  12  *
  13  */
  14
  15 #include "../asmdefs.h"
  16
   17 #define dstin   x0      /* Argument 0: destination start.  Never written by the routine.  */
   18 #define src     x1      /* Argument 1: source start (adjusted by the forward large copy).  */
   19 #define count   x2      /* Argument 2: number of bytes to copy.  */
   20 #define dst     x3      /* Destination cursor of the forward large copy.  */
   21 #define srcend  x4      /* src + count: one past the last source byte.  */
   22 #define dstend  x5      /* dstin + count: one past the last destination byte.  */
   23 #define A_l     x6      /* A..D: 16-byte data pairs (_l, _h) moved with ldp/stp.  */
   24 #define A_lw    w6      /* 32-bit view of A_l for 4-byte and 1-byte accesses.  */
   25 #define A_h     x7
   26 #define B_l     x8
   27 #define B_lw    w8      /* 32-bit view of B_l.  */
   28 #define B_h     x9
   29 #define C_l     x10
   30 #define C_lw    w10     /* 32-bit view of C_l.  */
   31 #define C_h     x11
   32 #define D_l     x12
   33 #define D_h     x13
   34 #define E_l     x14     /* Same register as tmp1: only usable once tmp1 is dead.  */
   35 #define E_h     x15
   36 #define F_l     x16
   37 #define F_h     x17
   38 #define G_l     count   /* G and H reuse count, dst, src and srcend as data  */
   39 #define G_h     dst     /* registers, so they may only be loaded once those  */
   40 #define H_l     src     /* values are no longer needed.  */
   41 #define H_h     srcend
   42 #define tmp1    x14     /* Scratch; same register as E_l.  */
  43
  44 /* This implementation handles overlaps and supports both memcpy and memmove
  45    from a single entry point.  It uses unaligned accesses and branchless
  46    sequences to keep the code small, simple and improve performance.
  47
  48    Copies are split into 3 main cases: small copies of up to 32 bytes, medium
  49    copies of up to 128 bytes, and large copies.  The overhead of the overlap
  50    check is negligible since it is only required for large copies.
  51
  52    Large copies use a software pipelined loop processing 64 bytes per iteration.
  53    The destination pointer is 16-byte aligned to minimize unaligned accesses.
   54    The loop tail is handled by always copying 64 bytes from the end (or, for backwards copies, from the start).
  55 */
  56
   57 ENTRY (__memcpy_aarch64)
   58 ENTRY_ALIAS (__memmove_aarch64)
              /* In:   dstin (x0) = destination, src (x1) = source,
                       count (x2) = number of bytes to copy.
                 Out:  x0 is never written, so it still holds dstin on return.
                 Uses: x1-x17 and the condition flags as scratch; the stack is
                       not accessed.  */
   59         add     srcend, src, count      /* One past the last source byte.  */
   60         add     dstend, dstin, count    /* One past the last destination byte.  */
   61         cmp     count, 128
   62         b.hi    L(copy_long)            /* count > 128 (unsigned).  */
   63         cmp     count, 32
   64         b.hi    L(copy32_128)           /* 33 <= count <= 128.  */
   65
   66         /* Small copies: 0..32 bytes.  */
   67         cmp     count, 16
   68         b.lo    L(copy16)               /* count < 16.  */
   69         ldp     A_l, A_h, [src]         /* 16..32 bytes: first 16 bytes...  */
   70         ldp     D_l, D_h, [srcend, -16] /* ...and last 16; they overlap when count < 32.  */
   71         stp     A_l, A_h, [dstin]       /* Both loads are done before either store.  */
   72         stp     D_l, D_h, [dstend, -16]
   73         ret
   74
   75         /* Copy 8-15 bytes.  */
   76 L(copy16):
   77         tbz     count, 3, L(copy8)      /* count < 16 here, so bit 3 clear means count < 8.  */
   78         ldr     A_l, [src]              /* First 8 bytes.  */
   79         ldr     A_h, [srcend, -8]       /* Last 8 bytes (overlapping the first 8).  */
   80         str     A_l, [dstin]
   81         str     A_h, [dstend, -8]
   82         ret
   83
   84         .p2align 3
   85         /* Copy 4-7 bytes.  */
   86 L(copy8):
   87         tbz     count, 2, L(copy4)      /* count < 8 here, so bit 2 clear means count < 4.  */
   88         ldr     A_lw, [src]             /* First 4 bytes.  */
   89         ldr     B_lw, [srcend, -4]      /* Last 4 bytes (overlapping the first 4).  */
   90         str     A_lw, [dstin]
   91         str     B_lw, [dstend, -4]
   92         ret
   93
   94         /* Copy 0..3 bytes using a branchless sequence.  */
   95 L(copy4):
   96         cbz     count, L(copy0)         /* Nothing to copy.  */
   97         lsr     tmp1, count, 1          /* count / 2: 0 for count 1, 1 for count 2 or 3.  */
   98         ldrb    A_lw, [src]             /* First byte.  */
   99         ldrb    C_lw, [srcend, -1]      /* Last byte.  */
  100         ldrb    B_lw, [src, tmp1]       /* Byte at index count / 2.  */
  101         strb    A_lw, [dstin]           /* The three stores cover every byte for count 1..3.  */
  102         strb    B_lw, [dstin, tmp1]
  103         strb    C_lw, [dstend, -1]
  104 L(copy0):
  105         ret
  106
  107         .p2align 4
  108         /* Medium copies: 33..128 bytes.  */
  109 L(copy32_128):
  110         ldp     A_l, A_h, [src]         /* A, B = first 32 source bytes.  */
  111         ldp     B_l, B_h, [src, 16]
  112         ldp     C_l, C_h, [srcend, -32] /* C, D = last 32 source bytes.  */
  113         ldp     D_l, D_h, [srcend, -16]
  114         cmp     count, 64
  115         b.hi    L(copy128)              /* 65 <= count <= 128.  */
  116         stp     A_l, A_h, [dstin]       /* 33..64 bytes: first 32 and last 32 (overlapping when count < 64).  */
  117         stp     B_l, B_h, [dstin, 16]
  118         stp     C_l, C_h, [dstend, -32]
  119         stp     D_l, D_h, [dstend, -16]
  120         ret
  121
  122         .p2align 4
  123         /* Copy 65..128 bytes.  */
  124 L(copy128):
  125         ldp     E_l, E_h, [src, 32]     /* E, F = source bytes 32..63.  */
  126         ldp     F_l, F_h, [src, 48]
  127         cmp     count, 96
  128         b.ls    L(copy96)               /* count <= 96: first 64 plus last 32 bytes cover it.  */
  129         ldp     G_l, G_h, [srcend, -64] /* Overwrites count and dst, which are not read again.  */
  130         ldp     H_l, H_h, [srcend, -48] /* Overwrites src and srcend, so it has to be the last load.  */
  131         stp     G_l, G_h, [dstend, -64]
  132         stp     H_l, H_h, [dstend, -48]
  133 L(copy96):
  134         stp     A_l, A_h, [dstin]       /* First 64 bytes: A, B, E, F.  */
  135         stp     B_l, B_h, [dstin, 16]
  136         stp     E_l, E_h, [dstin, 32]
  137         stp     F_l, F_h, [dstin, 48]
  138         stp     C_l, C_h, [dstend, -32] /* Last 32 bytes: C, D.  */
  139         stp     D_l, D_h, [dstend, -16]
  140         ret
  141
  142         .p2align 4
  143         /* Copy more than 128 bytes.  */
  144 L(copy_long):
  145         /* Use backwards copy if there is an overlap.  */
  146         sub     tmp1, dstin, src
  147         cbz     tmp1, L(copy0)          /* dstin == src: return without copying.  */
  148         cmp     tmp1, count
  149         b.lo    L(copy_long_backwards)  /* Unsigned dstin - src < count: dstin lies inside the source.  */
  150
  151         /* Copy 16 bytes and then align dst to 16-byte alignment.  */
  152
  153         ldp     D_l, D_h, [src]         /* First 16 source bytes.  */
  154         and     tmp1, dstin, 15         /* tmp1 = dstin modulo 16.  */
  155         bic     dst, dstin, 15          /* dst = dstin rounded down to a multiple of 16.  */
  156         sub     src, src, tmp1          /* Move src back by the same amount.  */
  157         add     count, count, tmp1      /* Count is now 16 too large.  */
  158         ldp     A_l, A_h, [src, 16]
  159         stp     D_l, D_h, [dstin]       /* Store the first 16 bytes at the original (unaligned) start.  */
  160         ldp     B_l, B_h, [src, 32]
  161         ldp     C_l, C_h, [src, 48]
  162         ldp     D_l, D_h, [src, 64]!    /* Pre-index with writeback: src += 64.  */
  163         subs    count, count, 128 + 16  /* Test and readjust count.  */
  164         b.ls    L(copy64_from_end)
  165
  166 L(loop64):                              /* Store the 64 bytes already loaded while loading the next 64.  */
  167         stp     A_l, A_h, [dst, 16]     /* dst is a multiple of 16, so these stores are 16-byte aligned.  */
  168         ldp     A_l, A_h, [src, 16]
  169         stp     B_l, B_h, [dst, 32]
  170         ldp     B_l, B_h, [src, 32]
  171         stp     C_l, C_h, [dst, 48]
  172         ldp     C_l, C_h, [src, 48]
  173         stp     D_l, D_h, [dst, 64]!    /* Writeback: dst += 64.  */
  174         ldp     D_l, D_h, [src, 64]!    /* Writeback: src += 64.  */
  175         subs    count, count, 64
  176         b.hi    L(loop64)
  177
  178         /* Write the last iteration and copy 64 bytes from the end.  */
  179 L(copy64_from_end):
  180         ldp     E_l, E_h, [srcend, -64] /* E_l shares x14 with tmp1, which is no longer needed here.  */
  181         stp     A_l, A_h, [dst, 16]     /* Stores of the pending A..D are interleaved with the tail loads.  */
  182         ldp     A_l, A_h, [srcend, -48]
  183         stp     B_l, B_h, [dst, 32]
  184         ldp     B_l, B_h, [srcend, -32]
  185         stp     C_l, C_h, [dst, 48]
  186         ldp     C_l, C_h, [srcend, -16]
  187         stp     D_l, D_h, [dst, 64]
  188         stp     E_l, E_h, [dstend, -64] /* Last 64 bytes, addressed from the end of the destination.  */
  189         stp     A_l, A_h, [dstend, -48]
  190         stp     B_l, B_h, [dstend, -32]
  191         stp     C_l, C_h, [dstend, -16]
  192         ret
  193
  194         .p2align 4
  195
  196         /* Large backwards copy for overlapping copies.
  197            Copy the last 16 bytes and then align dstend to 16-byte alignment.  */
  198 L(copy_long_backwards):
  199         ldp     D_l, D_h, [srcend, -16] /* Last 16 source bytes.  */
  200         and     tmp1, dstend, 15        /* tmp1 = dstend modulo 16.  */
  201         sub     srcend, srcend, tmp1    /* Move srcend back by the same amount.  */
  202         sub     count, count, tmp1
  203         ldp     A_l, A_h, [srcend, -16]
  204         stp     D_l, D_h, [dstend, -16] /* Store the last 16 bytes; dstend is still the original end here.  */
  205         ldp     B_l, B_h, [srcend, -32]
  206         ldp     C_l, C_h, [srcend, -48]
  207         ldp     D_l, D_h, [srcend, -64]!        /* Pre-index with writeback: srcend -= 64.  */
  208         sub     dstend, dstend, tmp1    /* dstend is now a multiple of 16.  */
  209         subs    count, count, 128
  210         b.ls    L(copy64_from_start)
  211
  212 L(loop64_backwards):                    /* Mirror of loop64, walking down from the end.  */
  213         stp     A_l, A_h, [dstend, -16]
  214         ldp     A_l, A_h, [srcend, -16]
  215         stp     B_l, B_h, [dstend, -32]
  216         ldp     B_l, B_h, [srcend, -32]
  217         stp     C_l, C_h, [dstend, -48]
  218         ldp     C_l, C_h, [srcend, -48]
  219         stp     D_l, D_h, [dstend, -64]!        /* Writeback: dstend -= 64.  */
  220         ldp     D_l, D_h, [srcend, -64]!        /* Writeback: srcend -= 64.  */
  221         subs    count, count, 64
  222         b.hi    L(loop64_backwards)
  223
  224         /* Write the last iteration and copy 64 bytes from the start.  */
  225 L(copy64_from_start):
  226         ldp     G_l, G_h, [src, 48]     /* src is unmodified on this path; G overwrites count and dst, unused from here on.  */
  227         stp     A_l, A_h, [dstend, -16] /* Stores of the pending A..D are interleaved with the head loads.  */
  228         ldp     A_l, A_h, [src, 32]
  229         stp     B_l, B_h, [dstend, -32]
  230         ldp     B_l, B_h, [src, 16]
  231         stp     C_l, C_h, [dstend, -48]
  232         ldp     C_l, C_h, [src]
  233         stp     D_l, D_h, [dstend, -64]
  234         stp     G_l, G_h, [dstin, 48]   /* First 64 bytes, addressed from the start of the destination.  */
  235         stp     A_l, A_h, [dstin, 32]
  236         stp     B_l, B_h, [dstin, 16]
  237         stp     C_l, C_h, [dstin]
  238         ret
  239
  240 END (__memcpy_aarch64)