arch/arm64/lib/memcmp.S

   1 /*
   2  * Copyright (C) 2013 ARM Ltd.
   3  * Copyright (C) 2013 Linaro.
   4  *
   5  * This code is based on glibc cortex strings work originally authored by Linaro
   6  * and re-licensed under GPLv2 for the Linux kernel. The original code can
   7  * be found @
   8  *
   9  * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
  10  * files/head:/src/aarch64/
  11  *
  12  * This program is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU General Public License version 2 as
  14  * published by the Free Software Foundation.
  15  *
  16  * This program is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU General Public License
  22  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  23  */
  24
  25 #include <linux/linkage.h>
  26 #include <asm/assembler.h>
  27
  28 /*
  29 * Compare two memory areas (when the two areas have different alignment
  30 * offsets, the unaligned accesses are handled by the hardware).
  31 *
  32 * Parameters:
  33 *  x0 - const memory area 1 pointer
  34 *  x1 - const memory area 2 pointer
  35 *  x2 - the maximum number of bytes to compare
  36 * Returns:
  37 *  x0 - the comparison result: less than, equal to, or greater than zero
  38 */
  39
  40 /* Parameters and result.  x0 is both the first argument and the result.  */
  41 src1            .req    x0      /* Pointer into memory area 1.  */
  42 src2            .req    x1      /* Pointer into memory area 2.  */
  43 limit           .req    x2      /* Number of bytes to compare.  */
  44 result          .req    x0      /* Return value; shares x0 with src1.  */
  45
  46 /* Internal variables.  */
  47 data1           .req    x3      /* Data loaded from src1.  */
  48 data1w          .req    w3      /* 32-bit view of data1, used by byte loads.  */
  49 data2           .req    x4      /* Data loaded from src2.  */
  50 data2w          .req    w4      /* 32-bit view of data2, used by byte loads.  */
  51 has_nul         .req    x5      /* Not referenced by the memcmp code in this file.  */
  52 diff            .req    x6      /* data1 ^ data2; non-zero if they differ.  */
  53 endloop         .req    x7      /* Non-zero when a Dword loop must stop.  */
  54 tmp1            .req    x8      /* Scratch.  */
  55 tmp2            .req    x9      /* Scratch.  */
  56 tmp3            .req    x10     /* Scratch.  */
  57 pos             .req    x11     /* Byte countdown, load offset or CLZ shift.  */
  58 limit_wd        .req    x12     /* Length counted in 8-byte Dwords.  */
  59 mask            .req    x13     /* Mask for the bytes beyond the limit.  */
  60
  61 ENTRY(memcmp)
             /*
              * Compare up to `limit` bytes at src1 and src2. The result is
              * zero when they are equal, otherwise less than or greater
              * than zero.
              *
              * Dispatch:
              *   limit == 0                      -> .Lret0
              *   (src1 ^ src2) & 7 != 0          -> .Lmisaligned8
              *   same offset, src1 & 7 != 0      -> .Lmutual_align
              *   otherwise (both 8-byte aligned) -> .Lloop_aligned
              */
  62         cbz     limit, .Lret0
  63         eor     tmp1, src1, src2
  64         tst     tmp1, #7
  65         b.ne    .Lmisaligned8
  66         ands    tmp1, src1, #7  /* tmp1 = offset of src1 within its Dword.  */
  67         b.ne    .Lmutual_align
  68         sub     limit_wd, limit, #1 /* limit != 0, so no underflow.  */
  69         lsr     limit_wd, limit_wd, #3 /* (limit - 1) / 8: index of the last Dword.  */
  70         /*
  71         * Both source addresses are at an alignment boundary.
  72         * Compare eight bytes per iteration.
  73         */
  74 .Lloop_aligned:
  75         ldr     data1, [src1], #8
  76         ldr     data2, [src2], #8
  77 .Lstart_realigned:
  78         subs    limit_wd, limit_wd, #1  /* Carry is clear only if limit_wd was 0.  */
  79         eor     diff, data1, data2      /* Non-zero if differences found.  */
  80         csinv   endloop, diff, xzr, cs  /* Last Dword or differences.  */
  81         cbz     endloop, .Lloop_aligned
  82
  83         /* Bit 63 clear: the limit was not reached, so a diff was found.  */
  84         tbz     limit_wd, #63, .Lnot_limit
  85
  86         /* Last Dword. If limit % 8 == 0, all eight bytes of it are valid.  */
  87         ands    limit, limit, #7
  88         b.eq    .Lnot_limit
  89         /*
  90         * Fewer than 8 bytes remain. Extract only the valid data from the
  91         * last eight bytes of the intended memory range.
  92         */
  93         lsl     limit, limit, #3        /* bytes-> bits.  */
  94         mov     mask, #~0
  95 CPU_BE( lsr     mask, mask, limit )
  96 CPU_LE( lsl     mask, mask, limit )
  97         bic     data1, data1, mask      /* Clear the bytes beyond the limit.  */
  98         bic     data2, data2, mask
  99
 100         orr     diff, diff, mask        /* Mark the end of the valid data.  */
 101         b       .Lnot_limit
 102
 103 .Lmutual_align:
 104         /*
 105         * Sources are mutually aligned, but are not currently at an
 106         * alignment boundary. Round down the addresses and then mask off
 107         * the bytes that precede the start point.
 108         */
 109         bic     src1, src1, #7
 110         bic     src2, src2, #7
 111         ldr     data1, [src1], #8
 112         ldr     data2, [src2], #8
 113         /*
 114         * We cannot add the alignment offset (tmp1) to limit before taking
 115         * the Dword count, since that addition could make limit overflow.
              * Instead the low three bits and the Dword count are adjusted
              * separately.
 116         */
 117         sub     limit_wd, limit, #1/* limit != 0, so no underflow.  */
 118         and     tmp3, limit_wd, #7  /* (limit - 1) % 8.  */
 119         lsr     limit_wd, limit_wd, #3  /* (limit - 1) / 8.  */
 120         add     tmp3, tmp3, tmp1    /* Add the alignment offset to the remainder.  */
 121         add     limit_wd, limit_wd, tmp3, lsr #3  /* Carry it into the Dword count.  */
 122         add     limit, limit, tmp1/* Adjust the limit for the extra.  */
 123
 124         lsl     tmp1, tmp1, #3/* Bytes beyond alignment -> bits.  */
 125         neg     tmp1, tmp1/* Bits to alignment -64.  */
 126         mov     tmp2, #~0
 127         /* Mask off the unintended bytes before the start address.  */
 128 CPU_BE( lsl     tmp2, tmp2, tmp1 )/* Big-endian.  Early bytes are at MSB.  */
 129         /* Little-endian.  Early bytes are at LSB.  */
 130 CPU_LE( lsr     tmp2, tmp2, tmp1 )
 131
 132         orr     data1, data1, tmp2  /* Set the same bits in both so they compare equal.  */
 133         orr     data2, data2, tmp2
 134         b       .Lstart_realigned
 135
 136         /* src1 and src2 have different alignment offsets.  */
 137 .Lmisaligned8:
 138         cmp     limit, #8
 139         b.lo    .Ltiny8proc /* limit < 8: compare byte by byte.  */
 140
 141         and     tmp1, src1, #7
 142         neg     tmp1, tmp1
 143         add     tmp1, tmp1, #8/* Valid length in the first 8 bytes of src1.  */
 144         and     tmp2, src2, #7
 145         neg     tmp2, tmp2
 146         add     tmp2, tmp2, #8/* Valid length in the first 8 bytes of src2.  */
 147         subs    tmp3, tmp1, tmp2    /* tmp3 = tmp1 - tmp2; flags feed the csel.  */
 148         csel    pos, tmp1, tmp2, hi /* Choose the maximum.  */
 149
 150         sub     limit, limit, pos   /* Bytes left after the leading byte loop.  */
 151         /*
              * Compare the leading bytes in the first 8-byte segment. The loop
              * continues while bytes remain (pos != 0) and the bytes are equal;
              * once pos reaches zero, ccmp sets NZCV to 0 so b.eq falls through.
              */
 152 .Ltinycmp:
 153         ldrb    data1w, [src1], #1
 154         ldrb    data2w, [src2], #1
 155         subs    pos, pos, #1
 156         ccmp    data1w, data2w, #0, ne  /* NZCV = 0b0000.  */
 157         b.eq    .Ltinycmp
 158         cbnz    pos, 1f /* A difference occurred before the last byte.  */
 159         cmp     data1w, data2w  /* Last leading byte: not compared by ccmp.  */
 160         b.eq    .Lstart_align
 161 1:
 162         sub     result, data1, data2    /* Difference of the two loaded bytes.  */
 163         ret
 164
 165 .Lstart_align:
 166         lsr     limit_wd, limit, #3
 167         cbz     limit_wd, .Lremain8 /* Less than one Dword left.  */
 168
 169         ands    xzr, src1, #7   /* Is src1 aligned after the byte loop?  */
 170         b.eq    .Lrecal_offset
 171         /*
              * src1 is not aligned, so the byte loop ran for tmp2 bytes and
              * tmp3 (tmp1 - tmp2) is negative. Step both pointers back by that
              * amount and re-process those bytes so that src1 becomes aligned.
              */
 172         add     src1, src1, tmp3 /* Step src1 back to its alignment boundary.  */
 173         add     src2, src2, tmp3
 174         sub     limit, limit, tmp3  /* Count the re-processed bytes again.  */
 175         lsr     limit_wd, limit, #3
 176         cbz     limit_wd, .Lremain8
 177         /* Load 8 bytes from the aligned src1.  */
 178         ldr     data1, [src1], #8
 179         ldr     data2, [src2], #8
 180
 181         subs    limit_wd, limit_wd, #1
 182         eor     diff, data1, data2  /* Non-zero if differences found.  */
 183         csinv   endloop, diff, xzr, ne  /* diff, or all-ones if limit_wd is now 0.  */
 184         cbnz    endloop, .Lunequal_proc
 185         /* How far is the current src2 from its alignment boundary?  */
 186         and     tmp3, tmp3, #7
 187
 188 .Lrecal_offset:/* src1 is aligned now.  */
 189         neg     pos, tmp3   /* Negative offset back to the src2 boundary.  */
 190 .Lloopcmp_proc:
 191         /*
 192         * Split each eight-byte step into two parts. First, step src2 back
 193         * to an alignment boundary, load eight bytes and compare starting
 194         * from the src2 alignment boundary. If all 8 bytes are equal, then
 195         * do the second part's comparison. Otherwise finish the comparison.
 196         * This special handling guarantees that all accesses stay within
 197         * the thread/task space, avoiding out-of-range accesses.
 198         */
 199         ldr     data1, [src1,pos]
 200         ldr     data2, [src2,pos]
 201         eor     diff, data1, data2  /* Non-zero if differences found.  */
 202         cbnz    diff, .Lnot_limit
 203
 204         /* The second part: eight bytes from the aligned src1.  */
 205         ldr     data1, [src1], #8
 206         ldr     data2, [src2], #8
 207         eor     diff, data1, data2  /* Non-zero if differences found.  */
 208         subs    limit_wd, limit_wd, #1
 209         csinv   endloop, diff, xzr, ne/* If limit_wd is 0, finish the comparison.  */
 210         cbz     endloop, .Lloopcmp_proc
 211 .Lunequal_proc:
 212         cbz     diff, .Lremain8 /* No diff: only the last limit % 8 bytes remain.  */
 213
 214 /* A difference occurred in the latest comparison.  */
 215 .Lnot_limit:
 216 /*
 217 * For little endian, byte-reverse so that the equal low-order bits end up
 218 * at the MSB; the following CLZ then counts how many equal bits exist.
 219 */
 220 CPU_LE( rev     diff, diff )
 221 CPU_LE( rev     data1, data1 )
 222 CPU_LE( rev     data2, data2 )
 223
 224         /*
 225         * The MS-non-zero bit of DIFF marks either the first bit
 226         * that is different, or the end of the significant data.
 227         * Shifting left now will bring the critical information into the
 228         * top bits.
 229         */
 230         clz     pos, diff
 231         lsl     data1, data1, pos
 232         lsl     data2, data2, pos
 233         /*
 234         * We need to zero-extend (char is unsigned) the value and then
 235         * perform a signed subtraction.
 236         */
 237         lsr     data1, data1, #56
 238         sub     result, data1, data2, lsr #56
 239         ret
 240
 241 .Lremain8:
 242         /* limit % 8 == 0 => nothing is left to compare; all data are equal.  */
 243         ands    limit, limit, #7
 244         b.eq    .Lret0
 245
             /*
              * Byte-by-byte loop over the remaining `limit` bytes (non-zero
              * here). It stops on the first differing pair or when limit
              * reaches zero, where ccmp sets NZCV to 0 so b.eq falls through.
              */
 246 .Ltiny8proc:
 247         ldrb    data1w, [src1], #1
 248         ldrb    data2w, [src2], #1
 249         subs    limit, limit, #1
 250
 251         ccmp    data1w, data2w, #0, ne  /* NZCV = 0b0000.  */
 252         b.eq    .Ltiny8proc
 253         sub     result, data1, data2    /* Difference of the last loaded bytes.  */
 254         ret
 255 .Lret0:
 256         mov     result, #0
 257         ret
 258 ENDPIPROC(memcmp)