arch/arm64/lib/memcmp.S

   1 /* SPDX-License-Identifier: GPL-2.0-only */
   2 /*
   3  * Copyright (C) 2013 ARM Ltd.
   4  * Copyright (C) 2013 Linaro.
   5  *
   6  * This code is based on glibc cortex strings work originally authored by Linaro.
   7  * The original code can be found @
   8  *
   9  * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
  10  * files/head:/src/aarch64/
  11  */
  12
  13 #include <linux/linkage.h>
  14 #include <asm/assembler.h>
  15
  16 /*
  17 * Compare two memory areas. (When the two areas have different alignment
  18 * offsets, the unaligned accesses are handled by the hardware.)
  19 *
  20 * Parameters:
  21 *  x0 - const memory area 1 pointer
  22 *  x1 - const memory area 2 pointer
  23 *  x2 - the maximum number of bytes to compare
  24 * Returns:
  25 *  x0 - the comparison result: less than, equal to, or greater than zero
  26 */
  27
  28 /* Parameters and result.  */
  29 src1            .req    x0      /* Pointer to the first memory area.  */
  30 src2            .req    x1      /* Pointer to the second memory area.  */
  31 limit           .req    x2      /* Maximum number of bytes to compare.  */
  32 result          .req    x0      /* Return value; shares x0 with src1.  */
  33
  34 /* Internal variables.  */
  35 data1           .req    x3      /* Data loaded from src1.  */
  36 data1w          .req    w3      /* 32-bit view of data1, used by the byte loads.  */
  37 data2           .req    x4      /* Data loaded from src2.  */
  38 data2w          .req    w4      /* 32-bit view of data2, used by the byte loads.  */
  39 has_nul         .req    x5      /* Not referenced by the visible memcmp code.  */
  40 diff            .req    x6      /* data1 ^ data2: non-zero where the inputs differ.  */
  41 endloop         .req    x7      /* Non-zero when a Dword loop must stop.  */
  42 tmp1            .req    x8      /* Scratch.  */
  43 tmp2            .req    x9      /* Scratch.  */
  44 tmp3            .req    x10     /* Scratch.  */
  45 pos             .req    x11     /* Byte counter, load offset or CLZ bit count.  */
  46 limit_wd        .req    x12     /* Remaining length counted in Dwords.  */
  47 mask            .req    x13     /* Mask of the bytes beyond the limit.  */
  48
  49 SYM_FUNC_START_WEAK_PI(memcmp) /* Compare up to 'limit' bytes of src1 and src2.  */
  50         cbz     limit, .Lret0   /* Zero length: return 0.  */
  51         eor     tmp1, src1, src2
  52         tst     tmp1, #7        /* Do the low three address bits differ?  */
  53         b.ne    .Lmisaligned8
  54         ands    tmp1, src1, #7  /* tmp1 = offset of src1 within its Dword.  */
  55         b.ne    .Lmutual_align
  56         sub     limit_wd, limit, #1 /* limit != 0, so no underflow.  */
  57         lsr     limit_wd, limit_wd, #3 /* Convert to Dwords.  */
  58         /*
  59         * Both source addresses are on an 8-byte alignment boundary.
  60         * Compare eight bytes at a time.
  61         */
  62 .Lloop_aligned:
  63         ldr     data1, [src1], #8
  64         ldr     data2, [src2], #8
  65 .Lstart_realigned:
  66         subs    limit_wd, limit_wd, #1  /* Carry clear once the count borrows.  */
  67         eor     diff, data1, data2      /* Non-zero if differences found.  */
  68         csinv   endloop, diff, xzr, cs  /* Last Dword or differences.  */
  69         cbz     endloop, .Lloop_aligned
  70
  71         /* Not reached the limit, must have found a diff.  */
  72         tbz     limit_wd, #63, .Lnot_limit
  73
  74         /* Limit % 8 == 0 => the whole last Dword is valid data. */
  75         ands    limit, limit, #7
  76         b.eq    .Lnot_limit
  77         /*
  78         * Fewer than 8 bytes remain. Extract the valid data
  79         * from the last eight bytes of the intended memory range.
  80         */
  81         lsl     limit, limit, #3        /* bytes-> bits.  */
  82         mov     mask, #~0
  83 CPU_BE( lsr     mask, mask, limit )
  84 CPU_LE( lsl     mask, mask, limit )
  85         bic     data1, data1, mask      /* Clear the bytes beyond the limit.  */
  86         bic     data2, data2, mask
  87
  88         orr     diff, diff, mask        /* Mark the end of the significant data.  */
  89         b       .Lnot_limit
  90
  91 .Lmutual_align:
  92         /*
  93         * Sources are mutually aligned, but are not currently at an
  94         * alignment boundary. Round down the addresses and then mask off
  95         * the bytes that precede the start point.
  96         */
  97         bic     src1, src1, #7
  98         bic     src2, src2, #7
  99         ldr     data1, [src1], #8
 100         ldr     data2, [src2], #8
 101         /*
 102         * We cannot simply add the alignment offset (tmp1) to limit before
 103         * computing the Dword count, since that addition could overflow.
 104         */
 105         sub     limit_wd, limit, #1 /* limit != 0, so no underflow.  */
 106         and     tmp3, limit_wd, #7
 107         lsr     limit_wd, limit_wd, #3
 108         add     tmp3, tmp3, tmp1
 109         add     limit_wd, limit_wd, tmp3, lsr #3
 110         add     limit, limit, tmp1 /* Adjust the limit for the extra.  */
 111
 112         lsl     tmp1, tmp1, #3 /* Bytes beyond alignment -> bits.  */
 113         neg     tmp1, tmp1 /* Bits to alignment -64.  */
 114         mov     tmp2, #~0
 115         /* Mask off the non-intended bytes before the start address.  */
 116 CPU_BE( lsl     tmp2, tmp2, tmp1 ) /* Big-endian.  Early bytes are at MSB.  */
 117         /* Little-endian.  Early bytes are at LSB.  */
 118 CPU_LE( lsr     tmp2, tmp2, tmp1 )
 119
 120         orr     data1, data1, tmp2 /* Force the leading bytes to compare equal.  */
 121         orr     data2, data2, tmp2
 122         b       .Lstart_realigned
 123
 124         /* src1 and src2 have different alignment offsets.  */
 125 .Lmisaligned8:
 126         cmp     limit, #8
 127         b.lo    .Ltiny8proc /* limit < 8: compare byte by byte.  */
 128
 129         and     tmp1, src1, #7
 130         neg     tmp1, tmp1
 131         add     tmp1, tmp1, #8 /* Valid length in the first 8 bytes of src1.  */
 132         and     tmp2, src2, #7
 133         neg     tmp2, tmp2
 134         add     tmp2, tmp2, #8 /* Valid length in the first 8 bytes of src2.  */
 135         subs    tmp3, tmp1, tmp2 /* tmp3 = tmp1 - tmp2; flags select below.  */
 136         csel    pos, tmp1, tmp2, hi /* Choose the maximum.  */
 137
 138         sub     limit, limit, pos /* limit >= 8 >= pos, so no underflow.  */
 139         /* Compare the leading bytes in the first 8-byte segment.  */
 140 .Ltinycmp:
 141         ldrb    data1w, [src1], #1
 142         ldrb    data2w, [src2], #1
 143         subs    pos, pos, #1
 144         ccmp    data1w, data2w, #0, ne  /* NZCV = 0b0000.  */
 145         b.eq    .Ltinycmp
 146         cbnz    pos, 1f /* A diff occurred before the last byte.  */
 147         cmp     data1w, data2w /* Last byte: the loop exit did not compare it.  */
 148         b.eq    .Lstart_align
 149 1:
 150         sub     result, data1, data2 /* Difference of the zero-extended bytes.  */
 151         ret
 152
 153 .Lstart_align:
 154         lsr     limit_wd, limit, #3
 155         cbz     limit_wd, .Lremain8 /* Fewer than 8 bytes left.  */
 156
 157         ands    xzr, src1, #7
 158         b.eq    .Lrecal_offset
 159         /* Process more leading bytes to make src1 aligned.  */
 160         add     src1, src1, tmp3 /* tmp3 < 0 here: step src1 back to its boundary.  */
 161         add     src2, src2, tmp3
 162         sub     limit, limit, tmp3 /* Account for the bytes stepped back over.  */
 163         lsr     limit_wd, limit, #3
 164         cbz     limit_wd, .Lremain8
 165         /* Load 8 bytes from the aligned SRC1.  */
 166         ldr     data1, [src1], #8
 167         ldr     data2, [src2], #8
 168
 169         subs    limit_wd, limit_wd, #1
 170         eor     diff, data1, data2  /* Non-zero if differences found.  */
 171         csinv   endloop, diff, xzr, ne /* All-ones when limit_wd reached 0.  */
 172         cbnz    endloop, .Lunequal_proc
 173         /* How far is the current SRC2 from the alignment boundary?  */
 174         and     tmp3, tmp3, #7
 175
 176 .Lrecal_offset: /* src1 is aligned now.  */
 177         neg     pos, tmp3
 178 .Lloopcmp_proc:
 179         /*
 180         * Divide the eight bytes into two parts. First, step src2 back
 181         * to an alignment boundary, load eight bytes and compare from
 182         * the SRC2 alignment boundary. If all 8 bytes are equal, then start
 183         * the second part's comparison. Otherwise finish the comparison.
 184         * This special handling guarantees that all the accesses stay in the
 185         * thread/task space, avoiding out-of-range accesses.
 186         */
 187         ldr     data1, [src1,pos]
 188         ldr     data2, [src2,pos]
 189         eor     diff, data1, data2  /* Non-zero if differences found.  */
 190         cbnz    diff, .Lnot_limit
 191
 192         /* The second part of the comparison.  */
 193         ldr     data1, [src1], #8
 194         ldr     data2, [src2], #8
 195         eor     diff, data1, data2  /* Non-zero if differences found.  */
 196         subs    limit_wd, limit_wd, #1
 197         csinv   endloop, diff, xzr, ne /* If limit_wd is 0, finish the compare.  */
 198         cbz     endloop, .Lloopcmp_proc
 199 .Lunequal_proc:
 200         cbz     diff, .Lremain8 /* No diff: only the tail bytes remain.  */
 201
 202 /* A difference occurred in the latest comparison. */
 203 .Lnot_limit:
 204 /*
 205 * For little endian, byte-reverse so that the lowest-addressed bytes land
 206 * in the MSBs; the following CLZ then counts how many leading bits are equal.
 207 */
 208 CPU_LE( rev     diff, diff )
 209 CPU_LE( rev     data1, data1 )
 210 CPU_LE( rev     data2, data2 )
 211
 212         /*
 213         * The MS-non-zero bit of DIFF marks either the first bit
 214         * that is different, or the end of the significant data.
 215         * Shifting left now will bring the critical information into the
 216         * top bits.
 217         */
 218         clz     pos, diff
 219         lsl     data1, data1, pos
 220         lsl     data2, data2, pos
 221         /*
 222         * We need to zero-extend (char is unsigned) the value and then
 223         * perform a signed subtraction.
 224         */
 225         lsr     data1, data1, #56
 226         sub     result, data1, data2, lsr #56
 227         ret
 228
 229 .Lremain8:
 230         /* Limit % 8 == 0 => all data are equal. */
 231         ands    limit, limit, #7
 232         b.eq    .Lret0
 233
 234 .Ltiny8proc: /* Compare the remaining bytes one at a time.  */
 235         ldrb    data1w, [src1], #1
 236         ldrb    data2w, [src2], #1
 237         subs    limit, limit, #1
 238
 239         ccmp    data1w, data2w, #0, ne  /* NZCV = 0b0000. */
 240         b.eq    .Ltiny8proc
 241         sub     result, data1, data2 /* Zero if the loop ended on equal bytes.  */
 242         ret
 243 .Lret0:
 244         mov     result, #0
 245         ret
 246 SYM_FUNC_END_PI(memcmp)
 247 EXPORT_SYMBOL_NOKASAN(memcmp)