newlib/libc/machine/aarch64/strncmp.S

   1 /*
   2  * strncmp - compare two strings
   3  *
   4  * Copyright (c) 2013-2022, Arm Limited.
   5  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
   6  */
   7 #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
   8 /* See strcmp-stub.c  */
   9 #else
  10
  11 /* Assumptions:
  12  *
  13  * ARMv8-a, AArch64.
  14  * MTE compatible.
  15  */
  16
  17 #include "asmdefs.h"
  18
  19 #define REP8_01 0x0101010101010101
  20 #define REP8_7f 0x7f7f7f7f7f7f7f7f
  21
  22 /* Parameters and result.  */
  23 #define src1            x0
  24 #define src2            x1
  25 #define limit           x2
  26 #define result          x0
  27
  28 /* Internal variables.  */
  29 #define data1           x3
  30 #define data1w          w3
  31 #define data2           x4
  32 #define data2w          w4
  33 #define has_nul         x5
  34 #define diff            x6
  35 #define syndrome        x7
  36 #define tmp1            x8
  37 #define tmp2            x9
  38 #define tmp3            x10
  39 #define zeroones        x11
  40 #define pos             x12
  41 #define mask            x13
  42 #define endloop         x14
  43 #define count           mask
  44 #define offset          pos
  45 #define neg_offset      x15
  46
  47 /* Define endian dependent shift operations.
  48    On big-endian early bytes are at MSB and on little-endian LSB.
  49    LS_FW means shifting towards early bytes.
  50    LS_BK means shifting towards later bytes.
  51    */
  52 #ifdef __AARCH64EB__
  53 #define LS_FW lsl
  54 #define LS_BK lsr
  55 #else
  56 #define LS_FW lsr
  57 #define LS_BK lsl
  58 #endif
  59
  60 ENTRY (strncmp)
  61         PTR_ARG (0)
  62         PTR_ARG (1)
  63         SIZE_ARG (2)
  64         cbz     limit, L(ret0)
  65         eor     tmp1, src1, src2
  66         mov     zeroones, #REP8_01
  67         tst     tmp1, #7
  68         and     count, src1, #7
  69         b.ne    L(misaligned8)
  70         cbnz    count, L(mutual_align)
  71
  72         /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
  73            (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
  74            can be done in parallel across the entire word.  */
  75         .p2align 4
  76 L(loop_aligned):
  77         ldr     data1, [src1], #8
  78         ldr     data2, [src2], #8
  79 L(start_realigned):
  80         subs    limit, limit, #8
  81         sub     tmp1, data1, zeroones
  82         orr     tmp2, data1, #REP8_7f
  83         eor     diff, data1, data2      /* Non-zero if differences found.  */
  84         csinv   endloop, diff, xzr, hi  /* Last Dword or differences.  */
  85         bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
  86         ccmp    endloop, #0, #0, eq
  87         b.eq    L(loop_aligned)
  88         /* End of main loop */
  89
  90 L(full_check):
  91 #ifndef __AARCH64EB__
  92         orr     syndrome, diff, has_nul
  93         add     limit, limit, 8 /* Rewind limit to before last subs. */
  94 L(syndrome_check):
  95         /* Limit was reached. Check if the NUL byte or the difference
  96            is before the limit. */
  97         rev     syndrome, syndrome
  98         rev     data1, data1
  99         clz     pos, syndrome
 100         rev     data2, data2
 101         lsl     data1, data1, pos
 102         cmp     limit, pos, lsr #3
 103         lsl     data2, data2, pos
 104         /* But we need to zero-extend (char is unsigned) the value and then
 105            perform a signed 32-bit subtraction.  */
 106         lsr     data1, data1, #56
 107         sub     result, data1, data2, lsr #56
 108         csel result, result, xzr, hi
 109         ret
 110 #else
 111         /* Not reached the limit, must have found the end or a diff.  */
 112         tbz     limit, #63, L(not_limit)
 113         add     tmp1, limit, 8
 114         cbz     limit, L(not_limit)
 115
 116         lsl     limit, tmp1, #3 /* Bits -> bytes.  */
 117         mov     mask, #~0
 118         lsr     mask, mask, limit
 119         bic     data1, data1, mask
 120         bic     data2, data2, mask
 121
 122         /* Make sure that the NUL byte is marked in the syndrome.  */
 123         orr     has_nul, has_nul, mask
 124
 125 L(not_limit):
 126         /* For big-endian we cannot use the trick with the syndrome value
 127            as carry-propagation can corrupt the upper bits if the trailing
 128            bytes in the string contain 0x01.  */
 129         /* However, if there is no NUL byte in the dword, we can generate
 130            the result directly.  We can't just subtract the bytes as the
 131            MSB might be significant.  */
 132         cbnz    has_nul, 1f
 133         cmp     data1, data2
 134         cset    result, ne
 135         cneg    result, result, lo
 136         ret
 137 1:
 138         /* Re-compute the NUL-byte detection, using a byte-reversed value.  */
 139         rev     tmp3, data1
 140         sub     tmp1, tmp3, zeroones
 141         orr     tmp2, tmp3, #REP8_7f
 142         bic     has_nul, tmp1, tmp2
 143         rev     has_nul, has_nul
 144         orr     syndrome, diff, has_nul
 145         clz     pos, syndrome
 146         /* The most-significant-non-zero bit of the syndrome marks either the
 147            first bit that is different, or the top bit of the first zero byte.
 148            Shifting left now will bring the critical information into the
 149            top bits.  */
 150 L(end_quick):
 151         lsl     data1, data1, pos
 152         lsl     data2, data2, pos
 153         /* But we need to zero-extend (char is unsigned) the value and then
 154            perform a signed 32-bit subtraction.  */
 155         lsr     data1, data1, #56
 156         sub     result, data1, data2, lsr #56
 157         ret
 158 #endif
 159
 160 L(mutual_align):
 161         /* Sources are mutually aligned, but are not currently at an
 162            alignment boundary.  Round down the addresses and then mask off
 163            the bytes that precede the start point.
 164            We also need to adjust the limit calculations, but without
 165            overflowing if the limit is near ULONG_MAX.  */
 166         bic     src1, src1, #7
 167         bic     src2, src2, #7
 168         ldr     data1, [src1], #8
 169         neg     tmp3, count, lsl #3     /* 64 - bits(bytes beyond align). */
 170         ldr     data2, [src2], #8
 171         mov     tmp2, #~0
 172         LS_FW   tmp2, tmp2, tmp3        /* Shift (count & 63).  */
 173         /* Adjust the limit and ensure it doesn't overflow.  */
 174         adds    limit, limit, count
 175         csinv   limit, limit, xzr, lo
 176         orr     data1, data1, tmp2
 177         orr     data2, data2, tmp2
 178         b       L(start_realigned)
 179
 180         .p2align 4
 181         /* Don't bother with dwords for up to 16 bytes.  */
 182 L(misaligned8):
 183         cmp     limit, #16
 184         b.hs    L(try_misaligned_words)
 185
 186 L(byte_loop):
 187         /* Perhaps we can do better than this.  */
 188         ldrb    data1w, [src1], #1
 189         ldrb    data2w, [src2], #1
 190         subs    limit, limit, #1
 191         ccmp    data1w, #1, #0, hi      /* NZCV = 0b0000.  */
 192         ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
 193         b.eq    L(byte_loop)
 194 L(done):
 195         sub     result, data1, data2
 196         ret
 197         /* Align the SRC1 to a dword by doing a bytewise compare and then do
 198            the dword loop.  */
 199 L(try_misaligned_words):
 200         cbz     count, L(src1_aligned)
 201
 202         neg     count, count
 203         and     count, count, #7
 204         sub     limit, limit, count
 205
 206 L(page_end_loop):
 207         ldrb    data1w, [src1], #1
 208         ldrb    data2w, [src2], #1
 209         cmp     data1w, #1
 210         ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
 211         b.ne    L(done)
 212         subs    count, count, #1
 213         b.hi    L(page_end_loop)
 214
 215         /* The following diagram explains the comparison of misaligned strings.
 216            The bytes are shown in natural order. For little-endian, it is
 217            reversed in the registers. The "x" bytes are before the string.
 218            The "|" separates data that is loaded at one time.
 219            src1     | a a a a a a a a | b b b c c c c c | . . .
 220            src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .
 221
 222            After shifting in each step, the data looks like this:
 223                         STEP_A              STEP_B              STEP_C
 224            data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
 225            data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c
 226
 227            The bytes with "0" are eliminated from the syndrome via mask.
 228
 229            Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
 230            time from SRC2. The comparison happens in 3 steps. After each step
 231            the loop can exit, or read from SRC1 or SRC2. */
 232 L(src1_aligned):
 233         /* Calculate offset from 8 byte alignment to string start in bits. No
 234            need to mask offset since shifts are ignoring upper bits. */
 235         lsl     offset, src2, #3
 236         bic     src2, src2, #0xf
 237         mov     mask, -1
 238         neg     neg_offset, offset
 239         ldr     data1, [src1], #8
 240         ldp     tmp1, tmp2, [src2], #16
 241         LS_BK   mask, mask, neg_offset
 242         and     neg_offset, neg_offset, #63     /* Need actual value for cmp later. */
 243         /* Skip the first compare if data in tmp1 is irrelevant. */
 244         tbnz    offset, 6, L(misaligned_mid_loop)
 245
 246 L(loop_misaligned):
 247         /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
 248         LS_FW   data2, tmp1, offset
 249         LS_BK   tmp1, tmp2, neg_offset
 250         subs    limit, limit, #8
 251         orr     data2, data2, tmp1      /* 8 bytes from SRC2 combined from two regs.*/
 252         sub     has_nul, data1, zeroones
 253         eor     diff, data1, data2      /* Non-zero if differences found.  */
 254         orr     tmp3, data1, #REP8_7f
 255         csinv   endloop, diff, xzr, hi  /* If limit, set to all ones. */
 256         bic     has_nul, has_nul, tmp3  /* Non-zero if NUL byte found in SRC1. */
 257         orr     tmp3, endloop, has_nul
 258         cbnz    tmp3, L(full_check)
 259
 260         ldr     data1, [src1], #8
 261 L(misaligned_mid_loop):
 262         /* STEP_B: Compare first part of data1 to second part of tmp2. */
 263         LS_FW   data2, tmp2, offset
 264 #ifdef __AARCH64EB__
 265         /* For big-endian we do a byte reverse to avoid carry-propagation
 266         problem described above. This way we can reuse the has_nul in the
 267         next step and also use syndrome value trick at the end. */
 268         rev     tmp3, data1
 269         #define data1_fixed tmp3
 270 #else
 271         #define data1_fixed data1
 272 #endif
 273         sub     has_nul, data1_fixed, zeroones
 274         orr     tmp3, data1_fixed, #REP8_7f
 275         eor     diff, data2, data1      /* Non-zero if differences found.  */
 276         bic     has_nul, has_nul, tmp3  /* Non-zero if NUL terminator.  */
 277 #ifdef __AARCH64EB__
 278         rev     has_nul, has_nul
 279 #endif
 280         cmp     limit, neg_offset, lsr #3
 281         orr     syndrome, diff, has_nul
 282         bic     syndrome, syndrome, mask        /* Ignore later bytes. */
 283         csinv   tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
 284         cbnz    tmp3, L(syndrome_check)
 285
 286         /* STEP_C: Compare second part of data1 to first part of tmp1. */
 287         ldp     tmp1, tmp2, [src2], #16
 288         cmp     limit, #8
 289         LS_BK   data2, tmp1, neg_offset
 290         eor     diff, data2, data1      /* Non-zero if differences found.  */
 291         orr     syndrome, diff, has_nul
 292         and     syndrome, syndrome, mask        /* Ignore earlier bytes. */
 293         csinv   tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
 294         cbnz    tmp3, L(syndrome_check)
 295
 296         ldr     data1, [src1], #8
 297         sub     limit, limit, #8
 298         b       L(loop_misaligned)
 299
 300 #ifdef  __AARCH64EB__
 301 L(syndrome_check):
 302         clz     pos, syndrome
 303         cmp     pos, limit, lsl #3
 304         b.lo    L(end_quick)
 305 #endif
 306
 307 L(ret0):
 308         mov     result, #0
 309         ret
 310 END(strncmp)
 311 #endif