newlib/libc/machine/aarch64/strchrnul.S

   1 /*
   2  * strchrnul - find a character or nul in a string
   3  *
   4  * Copyright (c) 2014-2022, Arm Limited.
   5  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
   6  */
   7 #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
   8 /* See strchrnul-stub.c  */
   9 #else
  10
  11 /* Assumptions:
  12  *
  13  * ARMv8-a, AArch64
  14  * Neon Available.
  15  */
  16
  17 #include "asmdefs.h"
  18
  19 /* Arguments and results.  */
  20 #define srcin           x0      /* In: address of the string to scan.  */
  21 #define chrin           w1      /* In: character to find; only its low byte is replicated.  */
  22
  23 #define result          x0      /* Out: address of the match or of the nul (same register as srcin).  */
  24
  25 #define src             x2      /* 32-byte aligned read pointer; post-incremented by 32 on each load.  */
  26 #define tmp1            x3      /* Scratch: misalignment, then padding mask, then syndrome.  */
  27 #define wtmp2           w4      /* Scratch: holds the 0x40100401 lane constant.  */
  28 #define tmp3            x5      /* Scratch: all-ones, then raw syndrome (unaligned path only).  */
  29
  30 #define vrepchr         v0      /* chrin replicated into all 16 bytes.  */
  31 #define vdata1          v1      /* First 16 bytes of the current 32-byte hunk.  */
  32 #define vdata2          v2      /* Second 16 bytes of the current 32-byte hunk.  */
  33 #define vhas_nul1       v3      /* Despite the name: 0xff where the byte is chr OR nul (first half).  */
  34 #define vhas_nul2       v4      /* Same as vhas_nul1, for the second half.  */
  35 #define vhas_chr1       v5      /* 0xff where the byte == chr; later reused for the lane-masked match.  */
  36 #define vhas_chr2       v6      /* Same as vhas_chr1, for the second half.  */
  37 #define vrepmask        v7      /* 0x40100401 replicated into all four 32-bit lanes.  */
  38 #define vend1           v16     /* Reduced match vector; its low 64 bits are read back as the syndrome.  */
  39
  40 /* Core algorithm.
  41
  42    For each 32-byte hunk we calculate a 64-bit syndrome value, with
  43    two bits per byte (LSB is always in bits 0 and 1, for both big
  44    and little-endian systems).  For each tuple, bit 0 is set iff
  45    the relevant byte matched the requested character or nul.  Since the
  46    bits in the syndrome reflect exactly the order in which things occur
  47    in the original string a count_trailing_zeros() operation will
  48    identify exactly which byte is causing the termination.  */
  49
  50 /* strchrnul: srcin (x0) = string, chrin (w1) = character to find.  */
  51 /* Returns in x0 the address of the first byte that equals chrin or is nul.  Uses x2-x5, v0-v7, v16 and the flags.  */
  52 ENTRY (strchrnul)
  53         PTR_ARG (0)                     /* Macro from asmdefs.h (not shown here); NOTE(review): presumably prepares pointer argument 0 - confirm.  */
  54         /* Magic constant 0x40100401 to allow us to identify which lane
  55            matches the termination condition.  */
  56         mov     wtmp2, #0x0401
  57         movk    wtmp2, #0x4010, lsl #16 /* wtmp2 = 0x40100401.  */
  58         dup     vrepchr.16b, chrin      /* Low byte of chrin in every byte lane.  */
  59         bic     src, srcin, #31         /* Work with aligned 32-byte hunks.  */
  60         dup     vrepmask.4s, wtmp2      /* Per-byte masks 0x01, 0x04, 0x10, 0x40, repeated for each word.  */
  61         ands    tmp1, srcin, #31        /* tmp1 = misalignment of srcin (0..31); Z set when aligned.  */
  62         b.eq    L(loop)                 /* Already aligned: no padding to mask, enter the main loop.  */
  63
  64         /* Input string is not 32-byte aligned.  Rather than forcing
  65            the padding bytes to a safe value, we calculate the syndrome
  66            for all the bytes, but then mask off those bits of the
  67            syndrome that are related to the padding.  */
  68         ld1     {vdata1.16b, vdata2.16b}, [src], #32   /* Load the aligned hunk containing srcin; src += 32.  */
  69         neg     tmp1, tmp1              /* tmp1 = -misalignment.  */
  70         cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b /* 0xff where byte == chr.  */
  71         cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
  72         cmhs    vhas_nul1.16b, vhas_chr1.16b, vdata1.16b       /* Unsigned >=: true if chr matched (0xff >= x) or byte is 0 (0 >= 0).  */
  73         cmhs    vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
  74         and     vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b     /* Reduce each 0xff to that byte's own mask bit.  */
  75         and     vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
  76         lsl     tmp1, tmp1, #1          /* tmp1 = -2 * misalignment (two syndrome bits per byte).  */
  77         addp    vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
  78         mov     tmp3, #~0               /* All ones, source for the padding mask.  */
  79         addp    vend1.16b, vend1.16b, vend1.16b         // 128->64
  80         lsr     tmp1, tmp3, tmp1        /* Shift count is taken mod 64, so tmp1 = low 2 * misalignment bits set.  */
  81
  82         mov     tmp3, vend1.d[0]        /* tmp3 = 64-bit syndrome for the whole hunk.  */
  83         bic     tmp1, tmp3, tmp1        // Mask padding bits.
  84         cbnz    tmp1, L(tail)           /* chr or nul found within the first hunk.  */
  85
  86         .p2align 4                      /* Align the loop entry to 16 bytes.  */
  87 L(loop):
  88         ld1     {vdata1.16b, vdata2.16b}, [src], #32   /* Next aligned 32 bytes; src += 32.  */
  89         cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b /* 0xff where byte == chr.  */
  90         cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
  91         cmhs    vhas_nul1.16b, vhas_chr1.16b, vdata1.16b       /* 0xff where byte == chr or byte == 0.  */
  92         cmhs    vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
  93         orr     vend1.16b, vhas_nul1.16b, vhas_nul2.16b        /* Merge both halves: only "any match?" matters here.  */
  94         umaxp   vend1.16b, vend1.16b, vend1.16b        /* Pairwise max folds 128 -> 64 bits; nonzero iff some byte matched.  */
  95         mov     tmp1, vend1.d[0]
  96         cbz     tmp1, L(loop)           /* Neither chr nor nul in this hunk: keep scanning.  */
  97
  98         /* Termination condition found.  Now need to establish exactly why
  99            we terminated.  */
 100         and     vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b     /* Reduce each 0xff to that byte's own mask bit.  */
 101         and     vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
 102         addp    vend1.16b, vhas_chr1.16b, vhas_chr2.16b         // 256->128
 103         addp    vend1.16b, vend1.16b, vend1.16b         // 128->64
 104
 105         mov     tmp1, vend1.d[0]        /* tmp1 = syndrome: two bits per byte, lower bit set on a match.  */
 106 L(tail):                                /* Reached with a nonzero syndrome in tmp1 and src 32 past the hunk.  */
 107         /* Count the trailing zeros, by bit reversing...  */
 108         rbit    tmp1, tmp1
 109         /* Re-bias source.  */
 110         sub     src, src, #32           /* Undo the post-increment of the last load.  */
 111         clz     tmp1, tmp1      /* ... and counting the leading zeros.  */
 112         /* tmp1 is twice the offset into the fragment.  */
 113         add     result, src, tmp1, lsr #1       /* result = hunk base + byte offset of the first match.  */
 114         ret
 115
 116 END (strchrnul)
 117 #endif