libc/AOR_v20.02/string/aarch64/strchrnul.S

   1 /*
   2  * strchrnul - find a character or nul in a string
   3  *
   4  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   5  * See https://llvm.org/LICENSE.txt for license information.
   6  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   7  */
   8
   9 /* Assumptions:
  10  *
  11  * ARMv8-a, AArch64
  12  * Neon Available.
  13  */
  14
  15 #include "../asmdefs.h"
  16
  17 /* Arguments and results.  */
  18 #define srcin           x0      /* In: address of the string to search.  */
  19 #define chrin           w1      /* In: character to find; its low byte is broadcast into vrepchr.  */
  20
  21 #define result          x0      /* Out: address of the first byte equal to chrin or nul.  */
  22
  23 #define src             x2      /* Read cursor: 32-byte aligned, post-incremented by 32 on each load.  */
  24 #define tmp1            x3      /* Scratch: padding count, then padding mask, then syndrome.  */
  25 #define wtmp2           w4      /* Holds the constant 0x40100401 until it is copied into vrepmask.  */
  26 #define tmp3            x5      /* Scratch: all-ones seed for the padding mask, then the unmasked syndrome.  */
  27
  28 #define vrepchr         v0      /* chrin replicated into all 16 byte lanes.  */
  29 #define vdata1          v1      /* Bytes 0-15 of the current 32-byte hunk.  */
  30 #define vdata2          v2      /* Bytes 16-31 of the current 32-byte hunk.  */
  31 #define vhas_nul1       v3      /* cmeq result: lanes of vdata1 that are nul.  */
  32 #define vhas_nul2       v4      /* cmeq result: lanes of vdata2 that are nul.  */
  33 #define vhas_chr1       v5      /* Lanes of vdata1 equal to chrin; later ORed with vhas_nul1.  */
  34 #define vhas_chr2       v6      /* Lanes of vdata2 equal to chrin; later ORed with vhas_nul2.  */
  35 #define vrepmask        v7      /* wtmp2 replicated into all four 32-bit lanes.  */
  36 #define vend1           v16     /* Reduction result; its d[0] is moved to a general register.  */
  37
  38 /* Core algorithm.
  39
  40    For each 32-byte hunk we calculate a 64-bit syndrome value, with two
  41    bits per byte (the lowest-addressed byte is always in bits 0 and 1,
  42    for both big and little-endian systems).  For each tuple, bit 0 is set
  43    iff the relevant byte matched the requested character or nul; bit 1 is
  44    always clear.  Since the bits in the syndrome reflect exactly the order
  45    in which things occur in the original string, a count_trailing_zeros()
  46    operation will identify exactly which byte is causing the termination.  */
  47
  48 /* Locals and temporaries.  */
  49
  50 ENTRY (__strchrnul_aarch64)     /* In: srcin = string, chrin = character.  Out: result = address of the first byte that equals chrin or is nul.  */
  51         /* Magic constant 0x40100401 to allow us to identify which lane
  52            matches the termination condition.  */
  53         mov     wtmp2, #0x0401          /* Low 16 bits of the constant.  */
  54         movk    wtmp2, #0x4010, lsl #16 /* High 16 bits: wtmp2 = 0x40100401.  */
  55         dup     vrepchr.16b, chrin      /* Broadcast the low byte of chrin to every lane.  */
  56         bic     src, srcin, #31         /* Work with aligned 32-byte hunks.  */
  57         dup     vrepmask.4s, wtmp2      /* Bytes of each word become 0x01, 0x04, 0x10, 0x40: one distinct even bit each.  */
  58         ands    tmp1, srcin, #31        /* tmp1 = number of padding bytes before srcin in its hunk; Z set if none.  */
  59         b.eq    L(loop)                 /* Already aligned: no padding to mask, go straight to the main loop.  */
  60
  61         /* Input string is not 32-byte aligned.  Rather than forcing
  62            the padding bytes to a safe value, we calculate the syndrome
  63            for all the bytes, but then mask off those bits of the
  64            syndrome that are related to the padding.  */
  65         ld1     {vdata1.16b, vdata2.16b}, [src], #32   /* Load the aligned hunk that contains srcin; src += 32.  */
  66         neg     tmp1, tmp1              /* tmp1 = -padding.  */
  67         cmeq    vhas_nul1.16b, vdata1.16b, #0          /* All-ones in each lane holding nul.  */
  68         cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b /* All-ones in each lane holding chrin.  */
  69         cmeq    vhas_nul2.16b, vdata2.16b, #0
  70         cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
  71         orr     vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b    /* Either condition terminates the search.  */
  72         orr     vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
  73         and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b     /* Reduce each hit lane to its single position bit.  */
  74         and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
  75         lsl     tmp1, tmp1, #1          /* tmp1 = -2 * padding, since the syndrome has two bits per byte.  */
  76         addp    vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
  77         mov     tmp3, #~0               /* All ones, shifted below to form the padding mask.  */
  78         addp    vend1.16b, vend1.16b, vend1.16b         // 128->64
  79         lsr     tmp1, tmp3, tmp1        /* Shift count is taken mod 64, so the low 2 * padding bits remain set.  */
  80
  81         mov     tmp3, vend1.d[0]        /* tmp3 = syndrome for all 32 bytes, padding included.  */
  82         bic     tmp1, tmp3, tmp1        // Mask padding bits.
  83         cbnz    tmp1, L(tail)           /* A hit at or after srcin in the first hunk: finish up.  */
  84
  85 L(loop):
  86         ld1     {vdata1.16b, vdata2.16b}, [src], #32   /* Load the next aligned 32 bytes; src += 32.  */
  87         cmeq    vhas_nul1.16b, vdata1.16b, #0
  88         cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
  89         cmeq    vhas_nul2.16b, vdata2.16b, #0
  90         cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
  91         /* Use a fast check for the termination condition.  */
  92         orr     vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
  93         orr     vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
  94         orr     vend1.16b, vhas_chr1.16b, vhas_chr2.16b        /* Non-zero iff any of the 32 bytes hit.  */
  95         addp    vend1.2d, vend1.2d, vend1.2d   /* Add the two halves; lanes are 0x00 or 0xff, so the sum is zero only if both halves are.  */
  96         mov     tmp1, vend1.d[0]
  97         cbz     tmp1, L(loop)           /* No chrin or nul in this hunk: keep scanning.  */
  98
  99         /* Termination condition found.  Now need to establish exactly why
 100            we terminated.  */
 101         and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b     /* Same position-bit reduction as for the first hunk.  */
 102         and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
 103         addp    vend1.16b, vhas_chr1.16b, vhas_chr2.16b         // 256->128
 104         addp    vend1.16b, vend1.16b, vend1.16b         // 128->64
 105
 106         mov     tmp1, vend1.d[0]        /* tmp1 = syndrome; non-zero on this path.  */
 107 L(tail):
 108         /* Count the trailing zeros, by bit reversing...  */
 109         rbit    tmp1, tmp1
 110         /* Re-bias source.  */
 111         sub     src, src, #32           /* Undo the post-increment: src = start of the hunk with the hit.  */
 112         clz     tmp1, tmp1      /* ... and counting the leading zeros.  */
 113         /* tmp1 is twice the offset into the fragment.  */
 114         add     result, src, tmp1, lsr #1       /* result = hunk start + byte offset of the first hit.  */
 115         ret
 116
 117 END (__strchrnul_aarch64)