libc/AOR_v20.02/string/aarch64/memcpy-advsimd.S

   1 /*
   2  * memcpy - copy memory area
   3  *
   4  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   5  * See https://llvm.org/LICENSE.txt for license information.
   6  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   7  */
   8
   9 /* Assumptions:
  10  *
  11  * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
  12  *
  13  */
  14
  15 #include "../asmdefs.h"
  16
   17 #define dstin   x0      /* Destination pointer argument; never written.  */
   18 #define src     x1      /* Source pointer argument.  */
   19 #define count   x2      /* Byte count argument.  */
   20 #define dst     x3      /* Working destination pointer (long forward copy).  */
   21 #define srcend  x4      /* src + count.  */
   22 #define dstend  x5      /* dstin + count.  */
   23 #define A_l     x6      /* A_l/A_h: 64-bit temporaries (8..15 byte copy).  */
   24 #define A_lw    w6      /* 32-bit view of A_l.  */
   25 #define A_h     x7
   26 #define B_l     x8      /* Not referenced in this routine.  */
   27 #define B_lw    w8      /* 32-bit view of B_l.  */
   28 #define B_h     x9      /* Not referenced in this routine.  */
   29 #define C_lw    w10     /* Byte temporary (0..3 byte copy).  */
   30 #define tmp1    x14     /* Scratch register.  */
   31
   32 #define A_q     q0      /* A_q..H_q: 128-bit SIMD temporaries.  */
   33 #define B_q     q1
   34 #define C_q     q2
   35 #define D_q     q3
   36 #define E_q     q4
   37 #define F_q     q5
   38 #define G_q     q6
   39 #define H_q     q7
  40
  41 /* This implementation handles overlaps and supports both memcpy and memmove
   42    from a single entry point.  It uses unaligned accesses and branchless
   43    sequences to keep the code small and simple and to improve performance.
  44
  45    Copies are split into 3 main cases: small copies of up to 32 bytes, medium
  46    copies of up to 128 bytes, and large copies.  The overhead of the overlap
  47    check is negligible since it is only required for large copies.
  48
  49    Large copies use a software pipelined loop processing 64 bytes per iteration.
  50    The source pointer is 16-byte aligned to minimize unaligned accesses.
  51    The loop tail is handled by always copying 64 bytes from the end.
  52 */
  53
   54 ENTRY (__memcpy_aarch64_simd)  /* In: dstin = x0, src = x1, count = x2.  */
   55 ENTRY_ALIAS (__memmove_aarch64_simd)  /* memmove uses the same code.  */
   56         add     srcend, src, count      /* srcend = src + count.  */
   57         add     dstend, dstin, count    /* dstend = dstin + count.  */
   58         cmp     count, 128
   59         b.hi    L(copy_long)            /* count > 128.  */
   60         cmp     count, 32
   61         b.hi    L(copy32_128)           /* 33 <= count <= 128.  */
   62
   63         /* Small copies: 0..32 bytes.  */
   64         cmp     count, 16
   65         b.lo    L(copy16)               /* count < 16.  */
   66         ldr     A_q, [src]              /* First 16 bytes.  */
   67         ldr     B_q, [srcend, -16]      /* Last 16 bytes; may overlap the first 16.  */
   68         str     A_q, [dstin]            /* Both loads precede both stores.  */
   69         str     B_q, [dstend, -16]
   70         ret                             /* x0 (dstin) is never written.  */
   71
   72         /* Copy 8-15 bytes.  */
   73 L(copy16):
   74         tbz     count, 3, L(copy8)      /* count < 16 here; bit 3 clear means count < 8.  */
   75         ldr     A_l, [src]              /* First 8 bytes.  */
   76         ldr     A_h, [srcend, -8]       /* Last 8 bytes; may overlap the first 8.  */
   77         str     A_l, [dstin]
   78         str     A_h, [dstend, -8]
   79         ret
   80
   81         .p2align 3
   82         /* Copy 4-7 bytes.  */
   83 L(copy8):
   84         tbz     count, 2, L(copy4)      /* count < 8 here; bit 2 clear means count < 4.  */
   85         ldr     A_lw, [src]             /* First 4 bytes.  */
   86         ldr     B_lw, [srcend, -4]      /* Last 4 bytes; may overlap the first 4.  */
   87         str     A_lw, [dstin]
   88         str     B_lw, [dstend, -4]
   89         ret
   90
   91         /* Copy 0..3 bytes using a branchless sequence.  */
   92 L(copy4):
   93         cbz     count, L(copy0)         /* count == 0: nothing to copy.  */
   94         lsr     tmp1, count, 1          /* tmp1 = count / 2 (0 or 1 here).  */
   95         ldrb    A_lw, [src]             /* First byte.  */
   96         ldrb    C_lw, [srcend, -1]      /* Last byte.  */
   97         ldrb    B_lw, [src, tmp1]       /* Byte at index count / 2.  */
   98         strb    A_lw, [dstin]           /* The three byte stores cover counts  */
   99         strb    B_lw, [dstin, tmp1]     /* 1, 2 and 3; for 1 and 2 some of  */
  100         strb    C_lw, [dstend, -1]      /* them hit the same byte.  */
  101 L(copy0):
  102         ret                             /* Also the exit for a long copy with src == dstin.  */
  103
  104         .p2align 4
  105         /* Medium copies: 33..128 bytes.  */
  106 L(copy32_128):
  107         ldp     A_q, B_q, [src]         /* First 32 bytes.  */
  108         ldp     C_q, D_q, [srcend, -32]  /* Last 32 bytes.  */
  109         cmp     count, 64
  110         b.hi    L(copy128)              /* 65 <= count <= 128.  */
  111         stp     A_q, B_q, [dstin]       /* 33..64 bytes: head and tail suffice.  */
  112         stp     C_q, D_q, [dstend, -32]
  113         ret
  114
  115         .p2align 4
  116         /* Copy 65..128 bytes.  */
  117 L(copy128):
  118         ldp     E_q, F_q, [src, 32]     /* Bytes 32..63.  */
  119         cmp     count, 96
  120         b.ls    L(copy96)               /* count <= 96: first 64 + last 32 suffice.  */
  121         ldp     G_q, H_q, [srcend, -64]  /* 97..128 bytes: also the 32 bytes before the last 32.  */
  122         stp     G_q, H_q, [dstend, -64]  /* First store; every load is already done.  */
  123 L(copy96):
  124         stp     A_q, B_q, [dstin]
  125         stp     E_q, F_q, [dstin, 32]
  126         stp     C_q, D_q, [dstend, -32]
  127         ret
  128
  129         /* Copy more than 128 bytes.  */
  130 L(copy_long):
  131         /* Use backwards copy if there is an overlap.  */
  132         sub     tmp1, dstin, src        /* tmp1 = dstin - src.  */
  133         cmp     tmp1, count
  134         b.lo    L(copy_long_backwards)  /* Unsigned tmp1 < count: dstin is inside the source.  */
  135
  136         /* Copy 16 bytes and then align src to 16-byte alignment.  */
  137         ldr     D_q, [src]              /* First 16 bytes, read before src is aligned.  */
  138         and     tmp1, src, 15           /* tmp1 = src & 15.  */
  139         bic     src, src, 15            /* Clear the low 4 bits of src.  */
  140         sub     dst, dstin, tmp1        /* Move dst back by the same amount.  */
  141         add     count, count, tmp1      /* Count is now 16 too large.  */
  142         ldp     A_q, B_q, [src, 16]     /* Preload 64 bytes for the loop.  */
  143         str     D_q, [dstin]            /* Stored before the next load reuses D_q.  */
  144         ldp     C_q, D_q, [src, 48]
  145         subs    count, count, 128 + 16  /* Test and readjust count.  */
  146         b.ls    L(copy64_from_end)      /* Preloaded 64 + last 64 bytes finish the copy.  */
  147 L(loop64):
  148         stp     A_q, B_q, [dst, 16]     /* Store the 64 bytes loaded earlier...  */
  149         ldp     A_q, B_q, [src, 80]     /* ...while loading the next 64.  */
  150         stp     C_q, D_q, [dst, 48]
  151         ldp     C_q, D_q, [src, 112]
  152         add     src, src, 64
  153         add     dst, dst, 64
  154         subs    count, count, 64
  155         b.hi    L(loop64)               /* Loop while count stays above zero.  */
  156
  157         /* Write the last iteration and copy 64 bytes from the end.  */
  158 L(copy64_from_end):
  159         ldp     E_q, F_q, [srcend, -64]  /* Tail: bytes srcend-64 .. srcend-33.  */
  160         stp     A_q, B_q, [dst, 16]     /* Flush the pending A/B pair.  */
  161         ldp     A_q, B_q, [srcend, -32]  /* Tail: last 32 bytes (A/B reused).  */
  162         stp     C_q, D_q, [dst, 48]
  163         stp     E_q, F_q, [dstend, -64]
  164         stp     A_q, B_q, [dstend, -32]
  165         ret
  166
  167         /* Large backwards copy for overlapping copies.
  168            Copy 16 bytes and then align srcend to 16-byte alignment.  */
  169 L(copy_long_backwards):
  170         cbz     tmp1, L(copy0)          /* tmp1 = dstin - src; zero means nothing to move.  */
  171         ldr     D_q, [srcend, -16]      /* Last 16 bytes, read before srcend is aligned.  */
  172         and     tmp1, srcend, 15        /* tmp1 = srcend & 15.  */
  173         bic     srcend, srcend, 15      /* Clear the low 4 bits of srcend.  */
  174         sub     count, count, tmp1      /* count = aligned srcend - src.  */
  175         ldp     A_q, B_q, [srcend, -32]  /* Preload 64 bytes for the loop.  */
  176         str     D_q, [dstend, -16]      /* Store the last 16 bytes (dstend still unadjusted).  */
  177         ldp     C_q, D_q, [srcend, -64]
  178         sub     dstend, dstend, tmp1    /* Move dstend back by the same amount.  */
  179         subs    count, count, 128
  180         b.ls    L(copy64_from_start)    /* Preloaded 64 + first 64 bytes finish the copy.  */
  181
  182 L(loop64_backwards):
  183         stp     A_q, B_q, [dstend, -32]  /* Store the 64 bytes loaded earlier...  */
  184         ldp     A_q, B_q, [srcend, -96]  /* ...while loading the next 64 below them.  */
  185         stp     C_q, D_q, [dstend, -64]
  186         ldp     C_q, D_q, [srcend, -128]
  187         sub     srcend, srcend, 64
  188         sub     dstend, dstend, 64
  189         subs    count, count, 64
  190         b.hi    L(loop64_backwards)     /* Loop while count stays above zero.  */
  191
  192         /* Write the last iteration and copy 64 bytes from the start.  */
  193 L(copy64_from_start):
  194         ldp     E_q, F_q, [src, 32]     /* Head: bytes 32..63.  */
  195         stp     A_q, B_q, [dstend, -32]  /* Flush the pending A/B pair.  */
  196         ldp     A_q, B_q, [src]         /* Head: first 32 bytes (A/B reused).  */
  197         stp     C_q, D_q, [dstend, -64]
  198         stp     E_q, F_q, [dstin, 32]
  199         stp     A_q, B_q, [dstin]
  200         ret
  201
  202 END (__memcpy_aarch64_simd)