/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by
 * Linaro. The original code can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>
/*
 * Move a buffer from src to dst (alignment handled by the hardware).
 * If dst is below src or the buffers do not overlap, tail-call memcpy;
 * otherwise copy backwards, starting from the end of the buffers.
 */
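/*
 * Parameters (AAPCS64):
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */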
SYM_FUNC_START_ALIAS(__memmove)
SYM_FUNC_START_WEAK_PI(memmove)
	b.hs	__memcpy			/* No overlap. */
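	/*
	 * dst overlaps src from above: point both registers at the end of
	 * the buffers and copy backwards.
	 */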
	b.lo	.Ltail15	/* Small copy: accesses are probably unaligned. */
	ands	tmp2, src, #15	/* Bytes to reach alignment. */
	sub	count, count, tmp2
	/*
	 * Copy the unaligned offset bytes first so that src becomes
	 * aligned. The cost of these extra instructions is acceptable,
	 * and the following accesses are then based on aligned addresses.
	 */
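	/* Bits 0-2 of tmp2 select a 1-, 2- or 4-byte copy; bit 3 an 8-byte copy. */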
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
	tbz	tmp2, #3, .LSrcAligned
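	/* Bit 3 set: one more 8-byte copy leaves src 16-byte aligned. */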
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
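	/*
	 * tmp1 is 0, 16, 32 or 48; enter the ldp/stp ladder below at the
	 * matching point so that exactly that many bytes are copied.
	 */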
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
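	/*
	 * Tail: copy the remaining 0-15 bytes, 8/4/2/1 bytes at a time,
	 * driven by the low bits of count.
	 */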
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
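	/*
	 * 64 bytes or more: subtracting 128 both tests for the large-copy
	 * path (count >= 128) and pre-biases count for the main loop's
	 * subs/b.ge termination check.
	 */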
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then
	 * jump to the tail.
	 */
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!
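	/* Up to 63 bytes may remain; they are handled by the tail code. */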
	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
	/* Pre-load 64 bytes of data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
	/*
	 * Interleave the loads of the next 64-byte block with the stores of
	 * the previously loaded 64 bytes.
	 */
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
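	/* Loop back while another full 64-byte block remains (count was biased by -128). */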
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
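	/* The last pre-loaded 64 bytes are now stored; hand any remaining 0-63 bytes to the tail. */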
SYM_FUNC_END_PI(memmove)
EXPORT_SYMBOL(memmove)
SYM_FUNC_END_ALIAS(__memmove)
EXPORT_SYMBOL(__memmove)