libc/AOR_v20.02/string/aarch64/strcmp.S

   1 /*
   2  * strcmp - compare two strings
   3  *
   4  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   5  * See https://llvm.org/LICENSE.txt for license information.
   6  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   7  */
   8
   9 /* Assumptions:
  10  *
  11  * ARMv8-a, AArch64
  12  */
  13
  14 #include "../asmdefs.h"
  15
   16 #define REP8_01 0x0101010101010101      /* 0x01 in every byte; loaded into zeroones.  */
   17 #define REP8_7f 0x7f7f7f7f7f7f7f7f      /* 0x7f in every byte; ORR mask in the NUL test.  */
   18 #define REP8_80 0x8080808080808080      /* 0x80 in every byte; not referenced by the code below.  */
   19
   20 /* Parameters and result.  src1 and result are both x0, so the first
         argument register is overwritten by the return value.  */
   21 #define src1            x0
   22 #define src2            x1
   23 #define result          x0
   24
   25 /* Internal variables.  data1w/data2w are the 32-bit views of data1/data2
         and are used by the byte-at-a-time loop.  */
   26 #define data1           x2              /* Current dword (or byte) read from src1.  */
   27 #define data1w          w2
   28 #define data2           x3              /* Current dword (or byte) read from src2.  */
   29 #define data2w          w3
   30 #define has_nul         x4              /* Non-zero if data1 contains a NUL byte.  */
   31 #define diff            x5              /* data1 ^ data2.  */
   32 #define syndrome        x6              /* diff | has_nul.  */
   33 #define tmp1            x7              /* Scratch.  */
   34 #define tmp2            x8              /* Scratch.  */
   35 #define tmp3            x9              /* Scratch; only used on the big-endian path.  */
   36 #define zeroones        x10             /* Holds REP8_01 for the whole routine.  */
   37 #define pos             x11             /* Leading-zero count of the syndrome.  */
  38
  39         /* Start of performance-critical section  -- one 64B cache line.  */
   40 ENTRY (__strcmp_aarch64)
              /* Compare the NUL-terminated byte strings at src1 (x0) and src2
                 (x1).  result (x0) is zero when no difference is found up to
                 and including the terminating NUL; otherwise it is non-zero
                 and its sign follows the unsigned comparison of the strings
                 at the first point of difference.  Only x0-x11 and the
                 condition flags are written; no stack is used.

                 The low three address bits select one of three strategies:
                   - both pointers 8-byte aligned:      L(loop_aligned)
                   - same non-zero offset within 8:     L(mutual_align)
                   - different offsets within 8:        L(misaligned8)  */
   41         eor     tmp1, src1, src2
   42         mov     zeroones, #REP8_01
   43         tst     tmp1, #7                /* Do the low 3 address bits differ?  */
   44         b.ne    L(misaligned8)
   45         ands    tmp1, src1, #7          /* tmp1 = offset of src1 within its dword.  */
   46         b.ne    L(mutual_align)         /* tmp1 stays live into L(mutual_align).  */
   47         /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
   48            (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
   49            can be done in parallel across the entire word.  */
   50 L(loop_aligned):
   51         ldr     data1, [src1], #8
   52         ldr     data2, [src2], #8
   53 L(start_realigned):
              /* Also entered from L(mutual_align) with data1/data2 already
                 loaded and the bytes before the string start forced to 0xff.  */
   54         sub     tmp1, data1, zeroones
   55         orr     tmp2, data1, #REP8_7f
   56         eor     diff, data1, data2      /* Non-zero if differences found.  */
   57         bic     has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
   58         orr     syndrome, diff, has_nul
   59         cbz     syndrome, L(loop_aligned)
   60         /* End of performance-critical section  -- one 64B cache line.  */
   61
   62 L(end):
              /* Reached with syndrome != 0: data1/data2 hold the dword pair that
                 contains the first difference or the first NUL of src1.  */
   63 #ifndef __AARCH64EB__
   64         rev     syndrome, syndrome      /* Put the first string byte at the MSB.  */
   65         rev     data1, data1
   66         /* The MS-non-zero bit of the syndrome marks either the first bit
   67            that is different, or the top bit of the first zero byte.
   68            Shifting left now will bring the critical information into the
   69            top bits.  */
   70         clz     pos, syndrome
   71         rev     data2, data2
   72         lsl     data1, data1, pos
   73         lsl     data2, data2, pos
   74         /* But we need to zero-extend (char is unsigned) the value and then
   75            perform a signed 32-bit subtraction.  */
              /* After the shifts right by 56 both operands are in 0..255.  The
                 subtract itself is done on the X registers.  */
   76         lsr     data1, data1, #56
   77         sub     result, data1, data2, lsr #56
   78         ret
   79 #else
   80         /* For big-endian we cannot use the trick with the syndrome value
   81            as carry-propagation can corrupt the upper bits if the trailing
   82            bytes in the string contain 0x01.  */
   83         /* However, if there is no NUL byte in the dword, we can generate
   84            the result directly.  We can't just subtract the bytes as the
   85            MSB might be significant.  */
   86         cbnz    has_nul, 1f
   87         cmp     data1, data2            /* Unsigned dword compare.  */
   88         cset    result, ne              /* result = 1 (diff is non-zero here).  */
   89         cneg    result, result, lo      /* result = -1 if data1 < data2.  */
   90         ret
   91 1:
   92         /* Re-compute the NUL-byte detection, using a byte-reversed value.  */
   93         rev     tmp3, data1
   94         sub     tmp1, tmp3, zeroones
   95         orr     tmp2, tmp3, #REP8_7f
   96         bic     has_nul, tmp1, tmp2
   97         rev     has_nul, has_nul        /* Back to memory byte order.  */
   98         orr     syndrome, diff, has_nul
   99         clz     pos, syndrome
  100         /* The MS-non-zero bit of the syndrome marks either the first bit
  101            that is different, or the top bit of the first zero byte.
  102            Shifting left now will bring the critical information into the
  103            top bits.  */
  104         lsl     data1, data1, pos
  105         lsl     data2, data2, pos
  106         /* But we need to zero-extend (char is unsigned) the value and then
  107            perform a signed 32-bit subtraction.  */
              /* After the shifts right by 56 both operands are in 0..255.  The
                 subtract itself is done on the X registers.  */
  108         lsr     data1, data1, #56
  109         sub     result, data1, data2, lsr #56
  110         ret
  111 #endif
  112
  113 L(mutual_align):
  114         /* Sources are mutually aligned, but are not currently at an
  115            alignment boundary.  Round down the addresses and then mask off
  116            the bytes that precede the start point.  */
              /* On entry tmp1 = src1 & 7, left by the ANDS at function entry.
                 The mask built in tmp2 covers exactly those leading bytes;
                 ORing it into both dwords makes them 0xff in each, so they
                 compare equal and are not seen as NUL.  */
  117         bic     src1, src1, #7
  118         bic     src2, src2, #7
  119         lsl     tmp1, tmp1, #3          /* Bytes beyond alignment -> bits.  */
  120         ldr     data1, [src1], #8
  121         neg     tmp1, tmp1              /* Bits to alignment -64.  */
  122         ldr     data2, [src2], #8
  123         mov     tmp2, #~0
  124 #ifdef __AARCH64EB__
  125         /* Big-endian.  Early bytes are at MSB.  */
  126         lsl     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
  127 #else
  128         /* Little-endian.  Early bytes are at LSB.  */
  129         lsr     tmp2, tmp2, tmp1        /* Shift (tmp1 & 63).  */
  130 #endif
  131         orr     data1, data1, tmp2
  132         orr     data2, data2, tmp2
  133         b       L(start_realigned)
  134
  135 L(misaligned8):
  136         /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
  137            checking to make sure that we don't access beyond page boundary in
  138            SRC2.  */
  139         tst     src1, #7
  140         b.eq    L(loop_misaligned)
  141 L(do_misaligned):
              /* Byte-at-a-time loop.  Runs until src1 is 8-byte aligned, or
                 exits to L(done) on a NUL in src1 or a mismatch.  */
  142         ldrb    data1w, [src1], #1
  143         ldrb    data2w, [src2], #1
  144         cmp     data1w, #1              /* C set iff data1w is not NUL.  */
  145         ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
  146         b.ne    L(done)                 /* NUL in src1, or bytes differ.  */
  147         tst     src1, #7
  148         b.ne    L(do_misaligned)
  149
  150 L(loop_misaligned):
  151         /* Test if we are within the last dword of the end of a 4K page.  If
  152            yes then jump back to the byte-wise loop to compare a byte at a time.  */
  153         and     tmp1, src2, #0xff8      /* Page-offset bits 3..11 of src2.  */
  154         eor     tmp1, tmp1, #0xff8      /* Zero iff all of those bits are set.  */
  155         cbz     tmp1, L(do_misaligned)
  156         ldr     data1, [src1], #8
  157         ldr     data2, [src2], #8
  158
              /* Same difference/NUL test as in L(start_realigned).  */
  159         sub     tmp1, data1, zeroones
  160         orr     tmp2, data1, #REP8_7f
  161         eor     diff, data1, data2      /* Non-zero if differences found.  */
  162         bic     has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
  163         orr     syndrome, diff, has_nul
  164         cbz     syndrome, L(loop_misaligned)
  165         b       L(end)
  166
  167 L(done):
              /* data1/data2 hold the two bytes zero-extended by LDRB, so the
                 result is their plain difference.  */
  168         sub     result, data1, data2
  169         ret
  170
  171 END (__strcmp_aarch64)