/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */
#include "../asmdefs.h"
/* To build as stpcpy, define BUILD_STPCPY before compiling this file.

   To test the page crossing code path more thoroughly, compile with
   -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
   entry path.  This option is not intended for production use.  */
/* Arguments and results.  */
/* Locals and temporaries.  */
#ifdef BUILD_STPCPY
#define STRCPY __stpcpy_aarch64
#else
#define STRCPY __strcpy_aarch64
#endif
	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
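	/* For example, with X = 0x4141414100414141 ('A' bytes and a NUL in
	   byte 3):
	     X - REP8_01      = 0x4040403fff404040
	     ~(X | REP8_7f)   = 0x8080808080808080
	     AND of the two   = 0x0000000080000000
	   so a 0x80 "syndrome" bit appears exactly in the byte that was
	   zero; bytes below the first NUL can never produce a false match
	   (bytes above it can, which the code accounts for when locating
	   the NUL).  */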
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
	/* AArch64 systems have a minimum page size of 4k.  We can do a quick
	   page size check for crossing this boundary on entry and if we
	   do not, then we can short-circuit much of the entry code.  We
	   expect early page-crossing strings to be rare (probability of
	   16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
	   predictable, even with random strings.

	   We don't bother checking for larger page sizes; the cost of setting
	   up the correct page size is just not worth the extra gain from
	   a small reduction in the cases taking the slow path.  Note that
	   we only care about whether the first fetch, which may be
	   misaligned, crosses a page boundary - after that we move to aligned
	   fetches for the remainder of the string.  */
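	/* Concretely, with MIN_PAGE_SIZE = 4096 the first 16-byte fetch can
	   only straddle a page when srcin's offset within the page is above
	   4096 - 16 = 4080, which for random pointers happens with
	   probability of roughly 16/4096 ~= 0.4%.  */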
#ifdef STRCPY_TEST_PAGE_CROSS
	/* Make everything that isn't Qword aligned look like a page cross.  */
#define MIN_PAGE_P2 4
#else
#define MIN_PAGE_P2 12
#endif

#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
	/* For moderately short strings, the fastest way to do the copy is to
	   calculate the length of the string in the same way as strlen, then
	   essentially do a memcpy of the result.  This avoids the need for
	   multiple byte copies and further means that by the time we
	   reach the bulk copy loop we know we can always use DWord
	   accesses.  We expect __strcpy_aarch64 to rarely be called repeatedly
	   with the same source string, so branch prediction is likely to
	   always be difficult - we mitigate against this by preferring
	   conditional select operations over branches whenever this is
	   possible.  */
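	/* Roughly, the fast path behaves like: load 16 bytes, find the NUL
	   within them, then store the already-loaded bytes back with a
	   couple of stores whose width depends on the length, letting the
	   stores overlap instead of copying byte by byte.  */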
	and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
	mov zeroones, #REP8_01
	and to_align, srcin, #15
	cmp tmp2, #(MIN_PAGE_SIZE - 16)
	/* The first fetch will straddle a (possible) page boundary iff
	   srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
	   aligned string will never fail the page align check, so will
	   always take the fast path.  */
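	/* For example, with 4k pages: a srcin page offset of 0xff5 gives
	   tmp2 = 0xff5 > 0xff0 = MIN_PAGE_SIZE - 16, so the 16 bytes at
	   0xff5..0x1004 would cross into the next page and the slow entry
	   path is used; a 16-byte aligned srcin has tmp2 <= 0xff0 and so
	   always falls through to the fast path.  */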
	ldp data1, data2, [srcin]
	/* Because we expect the end to be found within 16 characters
	   (profiling shows this is the most common case), it's worth
	   swapping the bytes now to save having to recalculate the
	   termination syndrome later.  We preserve data1 and data2
	   so that we can re-use the values later on.  */
	sub tmp1, tmp2, zeroones
	orr tmp2, tmp2, #REP8_7f
	bics has_nul1, tmp1, tmp2
	sub tmp3, tmp4, zeroones
	orr tmp4, tmp4, #REP8_7f
	sub tmp1, data1, zeroones
	orr tmp2, data1, #REP8_7f
	bics has_nul1, tmp1, tmp2
	sub tmp3, data2, zeroones
	orr tmp4, data2, #REP8_7f
	bics has_nul2, tmp3, tmp4
	/* The string is short (<=16 bytes).  We don't know exactly how
	   short though, yet.  Work out the exact length so that we can
	   quickly select the optimal copy strategy.  */
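	/* For example (little-endian): a NUL in byte 5 of a Dword yields the
	   syndrome 0x0000800000000000; after rev this becomes
	   0x0000000000800000, counting the leading zeros gives pos = 40
	   bits, and 40 >> 3 = 5 recovers the byte index of the NUL.  */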
	rev has_nul2, has_nul2
	add dst, dstin, pos, lsr #3 /* Bits to bytes. */
	lsr data2, data2, pos
	lsl data2, data2, pos
	rev has_nul1, has_nul1
	add dst, dstin, pos, lsr #3 /* Bits to bytes. */
	subs tmp2, pos, #24 /* Pos in bits. */
	lsr data2, data1, pos
	lsr data1, data1, #32
	lsr data2, data1, tmp2
	/* 4->7 bytes to copy.  */
	str data2w, [dst, #-3]
	/* 2->3 bytes to copy.  */
	lsr data1, data1, #48
	/* Fall-through, one byte (max) to go.  */
	/* Null-terminated string.  Last character must be zero!  */
	/* Aligning here ensures that the entry code and main loop all lie
	   within one 64-byte cache line.  */
	sub to_align, to_align, #16
	stp data1, data2, [dstin]
	sub src, srcin, to_align
	sub dst, dstin, to_align
	b L(entry_no_page_cross)
	/* The inner loop deals with two Dwords at a time.  This has a
	   slightly higher start-up cost, but we should win quite quickly,
	   especially on cores with a high number of issue slots per
	   cycle, as we get much better parallelism out of the operations.  */
	stp data1, data2, [dst], #16
L(entry_no_page_cross):
	ldp data1, data2, [src], #16
	sub tmp1, data1, zeroones
	orr tmp2, data1, #REP8_7f
	sub tmp3, data2, zeroones
	orr tmp4, data2, #REP8_7f
	bic has_nul1, tmp1, tmp2
	bics has_nul2, tmp3, tmp4
	ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
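	/* The bics sets Z only when has_nul2 is zero (no NUL in the second
	   Dword); the ccmp then tests has_nul1 in that case and forces Z
	   clear otherwise, so a single conditional branch on "eq" is enough
	   to decide whether either Dword contained a NUL.  */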
	/* Since we know we are copying at least 16 bytes, the fastest way
	   to deal with the tail is to determine the location of the
	   trailing NUL, then (re)copy the 16 bytes leading up to that.  */
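	/* In other words, the final load/store pair is positioned to end
	   exactly at the trailing NUL: e.g. if the NUL is byte 5 of the last
	   16-byte chunk, src and dst advance by 6 and the 16 bytes ending at
	   the NUL are re-copied, overlapping bytes already written rather
	   than copying a variable-sized remainder.  */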
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
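	/* Concretely: the subtraction borrows out of the zero byte into the
	   next more significant byte, and if that byte is 0x01 it also ends
	   up with a 0x80 syndrome bit.  With little-endian data the spurious
	   bit corresponds to a byte after the first NUL and is harmless;
	   with big-endian data it corresponds to the character just before
	   the NUL, so the syndrome has to be recomputed from byte-swapped
	   data.  */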
	csel data1, data1, data2, ne
	sub tmp1, data1, zeroones
	orr tmp2, data1, #REP8_7f
	bic has_nul1, tmp1, tmp2
	csel has_nul1, has_nul1, has_nul2, ne
	rev has_nul1, has_nul1
	csel pos, pos, tmp1, ne
	add src, src, pos, lsr #3
	add dst, dst, pos, lsr #3
	ldp data1, data2, [src, #-32]
	stp data1, data2, [dst, #-16]
	/* Start by loading two words at [srcin & ~15], then forcing the
	   bytes that precede srcin to 0xff.  This means they never look
	   like termination bytes.  */
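	/* E.g. if srcin is 3 bytes past a 16-byte boundary, the three bytes
	   preceding srcin occupy one end of the first loaded Dword (the low
	   end for little-endian); OR-ing them with 0xff means the NUL test
	   can never match on them, while the bytes from srcin onwards are
	   left unchanged.  */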
	ldp data1, data2, [src]
	lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
	lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
	lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
	orr data1, data1, tmp2
	orr data2a, data2, tmp2
	csinv data1, data1, xzr, lt
	csel data2, data2, data2a, lt
	sub tmp1, data1, zeroones
	orr tmp2, data1, #REP8_7f
	sub tmp3, data2, zeroones
	orr tmp4, data2, #REP8_7f
	bic has_nul1, tmp1, tmp2
	bics has_nul2, tmp3, tmp4
	ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
	b.eq L(page_cross_ok)
	/* We now need to make data1 and data2 look like they've been
	   loaded directly from srcin.  Do a rotate on the 128-bit value.  */
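	/* In effect (little-endian case): the 128-bit value data2:data1 is
	   shifted right by 8 * to_align bits so that the byte at srcin ends
	   up in byte 0 of data1 again; the shift/orr sequences below build
	   the two 64-bit halves of that result and a csel selects between
	   them depending on whether to_align is below 8.  */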
	lsl tmp1, to_align, #3 /* Bytes->bits. */
	neg tmp2, to_align, lsl #3
	lsl data1a, data1, tmp1
	lsr tmp4, data2, tmp2
	lsl data2, data2, tmp1
	orr tmp4, tmp4, data1a
	csel data1, tmp4, data2, lt
	sub tmp1, tmp2, zeroones
	orr tmp2, tmp2, #REP8_7f
	sub tmp3, tmp4, zeroones
	orr tmp4, tmp4, #REP8_7f
	lsr data1a, data1, tmp1
	lsl tmp4, data2, tmp2
	lsr data2, data2, tmp1
	orr tmp4, tmp4, data1a
	csel data1, tmp4, data2, lt
	sub tmp1, data1, zeroones
	orr tmp2, data1, #REP8_7f
	sub tmp3, data2, zeroones
	orr tmp4, data2, #REP8_7f
	bic has_nul1, tmp1, tmp2
	cbnz has_nul1, L(fp_le8)
	bic has_nul2, tmp3, tmp4