arch/arm64/lib/memmove.S

   1 /* SPDX-License-Identifier: GPL-2.0-only */
   2 /*
   3  * Copyright (C) 2013 ARM Ltd.
   4  * Copyright (C) 2013 Linaro.
   5  *
   6  * This code is based on glibc cortex strings work originally authored by
   7  * Linaro. The original code can be found @
   8  *
   9  * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
  10  * files/head:/src/aarch64/
  11  */
  12
  13 #include <linux/linkage.h>
  14 #include <asm/assembler.h>
  15 #include <asm/cache.h>
  16
  17 /*
  18  * Move a buffer from src to dest (alignment handled by the hardware).
  19  * If dest < src or there is no overlap, use memcpy; otherwise copy in reverse order.
  20  *
  21  * Parameters:
  22  *      x0 - dest
  23  *      x1 - src
  24  *      x2 - n
  25  * Returns:
  26  *      x0 - dest
  27  */
  28 dstin   .req    x0      /* dest as passed in; the memmove code that follows never writes it */
  29 src     .req    x1      /* source pointer; set to src + n, then stepped downwards */
  30 count   .req    x2      /* n on entry, then updated as the copy proceeds */
  31 tmp1    .req    x3      /* scratch: overlap bound, tail chunk size, small-copy data */
  32 tmp1w   .req    w3      /* 32-bit view of tmp1 */
  33 tmp2    .req    x4      /* src & 15 while src is being aligned */
  34 tmp2w   .req    w4      /* 32-bit view of tmp2; not referenced in the memmove code that follows */
  35 tmp3    .req    x5      /* not referenced in the memmove code that follows */
  36 tmp3w   .req    w5      /* 32-bit view of tmp3; likewise not referenced */
  37 dst     .req    x6      /* working destination pointer; set to dest + n, stepped downwards */
  38
  39 A_l     .req    x7      /* A..D: four register pairs, 64 bytes in total, moved with ldp/stp */
  40 A_h     .req    x8
  41 B_l     .req    x9      /* B pair: bytes at offsets -32..-17 in the 64-byte block copies */
  42 B_h     .req    x10
  43 C_l     .req    x11     /* C pair: bytes at offsets -48..-33 in the 64-byte block copies */
  44 C_h     .req    x12
  45 D_l     .req    x13     /* D pair: bytes at offsets -64..-49; its ldp/stp carry the writeback */
  46 D_h     .req    x14
  47
  48         .weak memmove           /* memmove is emitted as a weak symbol */
  49 SYM_FUNC_START_ALIAS(__memmove)
  50 SYM_FUNC_START_PI(memmove)      /* x0 = dest, x1 = src, x2 = n; copies n bytes, coping with overlap */
  51         cmp     dstin, src
  52         b.lo    __memcpy                /* dest < src (unsigned): hand off to __memcpy */
  53         add     tmp1, src, count        /* tmp1 = src + n, one past the end of the source */
  54         cmp     dstin, tmp1
  55         b.hs    __memcpy                /* dest >= src + n: no overlap, hand off to __memcpy.  */
  56
  57         add     dst, dstin, count       /* dst = dest + n: one past the end of the destination */
  58         add     src, src, count         /* src = src + n: everything below copies downwards */
  59         cmp     count, #16
  60         b.lo    .Ltail15  /* n < 16: copy it directly, possibly with unaligned accesses. */
  61
  62         ands    tmp2, src, #15     /* tmp2 = bytes to copy until the (descending) src is 16-byte aligned.  */
  63         b.eq    .LSrcAligned
  64         sub     count, count, tmp2 /* account for the tmp2 bytes copied by the alignment code */
  65         /*
  66         * Copy the tmp2 (1..15) misaligned bytes first, one power-of-two chunk
  67         * per set bit of tmp2, so that src becomes 16-byte aligned. The few
  68         * extra instructions are cheap, and later loads use an aligned src.
  69         */
  70         tbz     tmp2, #0, 1f            /* bit 0 clear: no single byte to copy */
  71         ldrb    tmp1w, [src, #-1]!      /* pre-index: step the pointer down, then access */
  72         strb    tmp1w, [dst, #-1]!
  73 1:
  74         tbz     tmp2, #1, 2f            /* bit 1 clear: no 2-byte chunk */
  75         ldrh    tmp1w, [src, #-2]!
  76         strh    tmp1w, [dst, #-2]!
  77 2:
  78         tbz     tmp2, #2, 3f            /* bit 2 clear: no 4-byte chunk */
  79         ldr     tmp1w, [src, #-4]!
  80         str     tmp1w, [dst, #-4]!
  81 3:
  82         tbz     tmp2, #3, .LSrcAligned  /* bit 3 clear: no 8-byte chunk */
  83         ldr     tmp1, [src, #-8]!
  84         str     tmp1, [dst, #-8]!
  85
  86 .LSrcAligned:
  87         cmp     count, #64
  88         b.ge    .Lcpy_over64            /* 64 or more bytes remain (signed compare) */
  89
  90         /*
  91         * Deal with small copies quickly by dropping straight into the
  92         * exit block.
  93         */
  94 .Ltail63:
  95         /*
  96         * Copy up to 48 bytes of data. At this point we only need the
  97         * bottom 6 bits of count to be accurate.
  98         */
  99         ands    tmp1, count, #0x30      /* tmp1 = count bits 5:4, i.e. 0, 16, 32 or 48 */
 100         b.eq    .Ltail15                /* no whole 16-byte chunk left */
 101         cmp     tmp1w, #0x20
 102         b.eq    1f                      /* 32: run the last two ldp/stp pairs */
 103         b.lt    2f                      /* 16: run only the last ldp/stp pair */
 104         ldp     A_l, A_h, [src, #-16]!  /* 48: fall through all three pairs */
 105         stp     A_l, A_h, [dst, #-16]!
 106 1:
 107         ldp     A_l, A_h, [src, #-16]!
 108         stp     A_l, A_h, [dst, #-16]!
 109 2:
 110         ldp     A_l, A_h, [src, #-16]!
 111         stp     A_l, A_h, [dst, #-16]!
 112
 113 .Ltail15:                               /* copy the last (count & 15) bytes, largest chunk first */
 114         tbz     count, #3, 1f           /* bit 3 clear: no 8-byte chunk */
 115         ldr     tmp1, [src, #-8]!
 116         str     tmp1, [dst, #-8]!
 117 1:
 118         tbz     count, #2, 2f           /* bit 2 clear: no 4-byte chunk */
 119         ldr     tmp1w, [src, #-4]!
 120         str     tmp1w, [dst, #-4]!
 121 2:
 122         tbz     count, #1, 3f           /* bit 1 clear: no 2-byte chunk */
 123         ldrh    tmp1w, [src, #-2]!
 124         strh    tmp1w, [dst, #-2]!
 125 3:
 126         tbz     count, #0, .Lexitfunc   /* bit 0 clear: nothing left */
 127         ldrb    tmp1w, [src, #-1]       /* last byte: plain offset, no writeback */
 128         strb    tmp1w, [dst, #-1]
 129
 130 .Lexitfunc:
 131         ret
 132
 133 .Lcpy_over64:
 134         subs    count, count, #128      /* count -= 128; a negative result means 64..127 bytes remain */
 135         b.ge    .Lcpy_body_large
 136         /*
 137         * Less than 128 bytes to copy, so handle 64 bytes here and then jump
 138         * to the tail.
 139         */
 140         ldp     A_l, A_h, [src, #-16]
 141         stp     A_l, A_h, [dst, #-16]
 142         ldp     B_l, B_h, [src, #-32]
 143         ldp     C_l, C_h, [src, #-48]
 144         stp     B_l, B_h, [dst, #-32]
 145         stp     C_l, C_h, [dst, #-48]
 146         ldp     D_l, D_h, [src, #-64]!  /* writeback: src -= 64 */
 147         stp     D_l, D_h, [dst, #-64]!  /* writeback: dst -= 64 */
 148
 149         tst     count, #0x3f            /* low 6 bits of count = bytes still to copy */
 150         b.ne    .Ltail63
 151         ret
 152
 153         /*
 154         * Critical loop. Start at a new cache line boundary. Assuming
 155         * 64 bytes per line this ensures the entire loop is in one line.
 156         */
 157         .p2align        L1_CACHE_SHIFT
 158 .Lcpy_body_large:
 159         /* pre-load 64 bytes data. */
 160         ldp     A_l, A_h, [src, #-16]
 161         ldp     B_l, B_h, [src, #-32]
 162         ldp     C_l, C_h, [src, #-48]
 163         ldp     D_l, D_h, [src, #-64]!  /* writeback: src -= 64 */
 164 1:
 165         /*
 166         * interlace the load of next 64 bytes data block with store of the last
 167         * loaded 64 bytes data.
 168         */
 169         stp     A_l, A_h, [dst, #-16]
 170         ldp     A_l, A_h, [src, #-16]
 171         stp     B_l, B_h, [dst, #-32]
 172         ldp     B_l, B_h, [src, #-32]
 173         stp     C_l, C_h, [dst, #-48]
 174         ldp     C_l, C_h, [src, #-48]
 175         stp     D_l, D_h, [dst, #-64]!  /* writeback: dst -= 64 */
 176         ldp     D_l, D_h, [src, #-64]!  /* writeback: src -= 64 */
 177         subs    count, count, #64
 178         b.ge    1b                      /* still non-negative: another full 64-byte block to load */
 179         stp     A_l, A_h, [dst, #-16]   /* store the final 64 bytes loaded by the loop */
 180         stp     B_l, B_h, [dst, #-32]
 181         stp     C_l, C_h, [dst, #-48]
 182         stp     D_l, D_h, [dst, #-64]!
 183
 184         tst     count, #0x3f            /* low 6 bits of count = bytes still to copy */
 185         b.ne    .Ltail63
 186         ret
 187 SYM_FUNC_END_PI(memmove)
 188 EXPORT_SYMBOL(memmove)
 189 SYM_FUNC_END_ALIAS(__memmove)
 190 EXPORT_SYMBOL(__memmove)