payloads/libpayload/arch/arm64/memmove.S

   1 /* Copyright (c) 2013, Linaro Limited
   2    All rights reserved.
   3
   4    Redistribution and use in source and binary forms, with or without
   5    modification, are permitted provided that the following conditions are met:
   6        * Redistributions of source code must retain the above copyright
   7          notice, this list of conditions and the following disclaimer.
   8        * Redistributions in binary form must reproduce the above copyright
   9          notice, this list of conditions and the following disclaimer in the
  10          documentation and/or other materials provided with the distribution.
  11        * Neither the name of the Linaro nor the
  12          names of its contributors may be used to endorse or promote products
  13          derived from this software without specific prior written permission.
  14
  15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  16    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  18    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  19    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  20    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   21    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   22    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  23    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  25    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
  26
  27 /* Assumptions:
  28  *
  29  * ARMv8-a, AArch64
  30  * Unaligned accesses
  31  */
  32
   33 .macro def_fn f p2align=0
      /* def_fn F [, P2ALIGN]: start a function named F.  Switches to .text,
       * applies .p2align with P2ALIGN (default 0), declares F global, sets
       * its symbol type to %function and emits the label F.  The macro emits
       * no instructions and no .size directive.  */
   34 .text
   35 .p2align \p2align
   36 .global \f
   37 .type \f, %function
   38 \f:
   39 .endm
  40
   41 /* Parameters and result.  dstin, src and count are read on entry to
       * memmove below; every other name here is a working register.  */
   42 #define dstin   x0
   43 #define src     x1
   44 #define count   x2
      /* Scratch registers.  Each w-name is the 32-bit view of the x-register
       * defined on the line before it.  tmp3/tmp3w are not referenced by
       * memmove below.  */
   45 #define tmp1    x3
   46 #define tmp1w   w3
   47 #define tmp2    x4
   48 #define tmp2w   w4
   49 #define tmp3    x5
   50 #define tmp3w   w5
      /* Running destination pointer; memmove advances this instead of dstin.  */
   51 #define dst     x6
   52
      /* Four low/high register pairs, each moved with one 16-byte ldp/stp.  */
   53 #define A_l     x7
   54 #define A_h     x8
   55 #define B_l     x9
   56 #define B_h     x10
   57 #define C_l     x11
   58 #define C_h     x12
   59 #define D_l     x13
   60 #define D_h     x14
  61
   62 def_fn memmove, 6
              /* memmove: copy COUNT (x2) bytes from SRC (x1) to DSTIN (x0); the
               * two regions may overlap.  The entry point is aligned to 64 bytes
               * (p2align 6).
               *
               * x0 is never written in this routine, so DSTIN is still in x0 at
               * every ret below.  Registers written on the paths handled here:
               * x1-x4, x6-x14 and the condition flags.
               *
               * Dispatch:
               *   DSTIN <  SRC and DSTIN <= SRC - 16  -> branch to memcpy
               *   DSTIN <  SRC, closer than 16 bytes  -> forward copy (.Ldownwards)
               *   DSTIN >= SRC + COUNT                -> branch to memcpy
               *   otherwise                           -> backward copy from the tail
               *
               * memcpy is not defined in this file.  It is entered with a plain
               * conditional branch (x30 is left untouched), so this routine does
               * not regain control afterwards.  */
   63         cmp     dstin, src
   64         b.lo    .Ldownwards     /* DSTIN below SRC (unsigned).  */
   65         add     tmp1, src, count        /* tmp1 = SRC + COUNT.  */
   66         cmp     dstin, tmp1
   67         b.hs    memcpy          /* No overlap.  */
   68
   69         /* Upwards move with potential overlap.
   70          * Need to move from the tail backwards.  SRC and DST point one
   71          * byte beyond the remaining data to move.  */
   72         add     dst, dstin, count
   73         add     src, src, count
   74         cmp     count, #64
   75         b.ge    .Lmov_not_short_up      /* NOTE(review): signed test on COUNT.  */
   76
   77         /* Deal with small moves quickly by dropping straight into the
   78          * exit block.  */
   79 .Ltail63up:
   80         /* Move up to 48 bytes of data.  At this point we only need the
   81          * bottom 6 bits of count to be accurate.  */
   82         ands    tmp1, count, #0x30      /* tmp1 = 0, 16, 32 or 48.  */
   83         b.eq    .Ltail15up
   84         sub     dst, dst, tmp1
   85         sub     src, src, tmp1
   86         cmp     tmp1w, #0x20
   87         b.eq    1f              /* 32 bytes: two pairs.  */
   88         b.lt    2f              /* 16 bytes: one pair.  */
   89         ldp     A_l, A_h, [src, #32]    /* 48 bytes: highest pair first.  */
   90         stp     A_l, A_h, [dst, #32]
   91 1:
   92         ldp     A_l, A_h, [src, #16]
   93         stp     A_l, A_h, [dst, #16]
   94 2:
   95         ldp     A_l, A_h, [src]
   96         stp     A_l, A_h, [dst]
   97 .Ltail15up:
   98         /* Move up to 15 bytes of data.  Does not assume additional data
   99          * being moved.  Bits 3..0 of count select an 8-, 4-, 2- and 1-byte
              * move; each uses pre-decrement writeback except the last.  */
  100         tbz     count, #3, 1f
  101         ldr     tmp1, [src, #-8]!
  102         str     tmp1, [dst, #-8]!
  103 1:
  104         tbz     count, #2, 1f
  105         ldr     tmp1w, [src, #-4]!
  106         str     tmp1w, [dst, #-4]!
  107 1:
  108         tbz     count, #1, 1f
  109         ldrh    tmp1w, [src, #-2]!
  110         strh    tmp1w, [dst, #-2]!
  111 1:
  112         tbz     count, #0, 1f
  113         ldrb    tmp1w, [src, #-1]
  114         strb    tmp1w, [dst, #-1]
  115 1:
  116         ret
  117
  118 .Lmov_not_short_up:
  119         /* We don't much care about the alignment of DST, but we want SRC
  120          * to be 128-bit (16 byte) aligned so that we don't cross cache line
  121          * boundaries on both loads and stores.  */
  122         ands    tmp2, src, #15          /* Bytes to reach alignment.  */
  123         b.eq    2f              /* Already aligned; COUNT is still >= 64.  */
  124         sub     count, count, tmp2
  125         /* Move enough data to reach alignment; unlike memcpy, we have to
  126          * be aware of the overlap, which means we can't move data twice.  */
  127         tbz     tmp2, #3, 1f
  128         ldr     tmp1, [src, #-8]!
  129         str     tmp1, [dst, #-8]!
  130 1:
  131         tbz     tmp2, #2, 1f
  132         ldr     tmp1w, [src, #-4]!
  133         str     tmp1w, [dst, #-4]!
  134 1:
  135         tbz     tmp2, #1, 1f
  136         ldrh    tmp1w, [src, #-2]!
  137         strh    tmp1w, [dst, #-2]!
  138 1:
  139         tbz     tmp2, #0, 1f
  140         ldrb    tmp1w, [src, #-1]!
  141         strb    tmp1w, [dst, #-1]!
  142 1:
  143
  144         /* There may be fewer than 64 bytes to go now.  */
  145         cmp     count, #63
  146         b.le    .Ltail63up
  147 2:
  148         subs    count, count, #128      /* Negative if fewer than 128 remain.  */
  149         b.ge    .Lmov_body_large_up
  150         /* Less than 128 bytes to move, so handle 64 here and then jump
  151          * to the tail.  All four pairs are loaded before any is stored.  */
  152         ldp     A_l, A_h, [src, #-64]!
  153         ldp     B_l, B_h, [src, #16]
  154         ldp     C_l, C_h, [src, #32]
  155         ldp     D_l, D_h, [src, #48]
  156         stp     A_l, A_h, [dst, #-64]!
  157         stp     B_l, B_h, [dst, #16]
  158         stp     C_l, C_h, [dst, #32]
  159         stp     D_l, D_h, [dst, #48]
  160         tst     count, #0x3f    /* Low 6 bits are unaffected by the -128 bias.  */
  161         b.ne    .Ltail63up
  162         ret
  163
  164         /* Critical loop.  Start at a new Icache line boundary.  Assuming
  165          * 64 bytes per line this ensures the entire loop is in one line.  */
  166         .p2align 6
  167 .Lmov_body_large_up:
  168         /* There are at least 128 bytes to move.  The first 64-byte block
              * is loaded here; inside the loop each store of the current block
              * is paired with the load of the next one, so loads stay one block
              * ahead of stores.  */
  169         ldp     A_l, A_h, [src, #-16]
  170         ldp     B_l, B_h, [src, #-32]
  171         ldp     C_l, C_h, [src, #-48]
  172         ldp     D_l, D_h, [src, #-64]!
  173 1:
  174         stp     A_l, A_h, [dst, #-16]
  175         ldp     A_l, A_h, [src, #-16]
  176         stp     B_l, B_h, [dst, #-32]
  177         ldp     B_l, B_h, [src, #-32]
  178         stp     C_l, C_h, [dst, #-48]
  179         ldp     C_l, C_h, [src, #-48]
  180         stp     D_l, D_h, [dst, #-64]!
  181         ldp     D_l, D_h, [src, #-64]!
  182         subs    count, count, #64
  183         b.ge    1b
              /* Drain: store the block that was loaded by the last iteration.  */
  184         stp     A_l, A_h, [dst, #-16]
  185         stp     B_l, B_h, [dst, #-32]
  186         stp     C_l, C_h, [dst, #-48]
  187         stp     D_l, D_h, [dst, #-64]!
  188         tst     count, #0x3f
  189         b.ne    .Ltail63up
  190         ret
  191
  192
  193 .Ldownwards:
  194         /* For a downwards move we can safely use memcpy provided that
  195          * DST is at least 16 bytes below SRC.
              * NOTE(review): SRC - 16 wraps around when SRC < 16; presumably
              * such low addresses never reach this routine - confirm.  */
  196         sub     tmp1, src, #16
  197         cmp     dstin, tmp1
  198         b.ls    memcpy          /* May overlap, but not critically.  */
  199
  200         mov     dst, dstin      /* Preserve DSTIN for return value.  */
  201         cmp     count, #64
  202         b.ge    .Lmov_not_short_down    /* NOTE(review): signed test on COUNT.  */
  203
  204         /* Deal with small moves quickly by dropping straight into the
  205          * exit block.  */
  206 .Ltail63down:
  207         /* Move up to 48 bytes of data.  At this point we only need the
  208          * bottom 6 bits of count to be accurate.  */
  209         ands    tmp1, count, #0x30      /* tmp1 = 0, 16, 32 or 48.  */
  210         b.eq    .Ltail15down
  211         add     dst, dst, tmp1
  212         add     src, src, tmp1
  213         cmp     tmp1w, #0x20
  214         b.eq    1f              /* 32 bytes: two pairs.  */
  215         b.lt    2f              /* 16 bytes: one pair.  */
  216         ldp     A_l, A_h, [src, #-48]   /* 48 bytes: lowest pair first.  */
  217         stp     A_l, A_h, [dst, #-48]
  218 1:
  219         ldp     A_l, A_h, [src, #-32]
  220         stp     A_l, A_h, [dst, #-32]
  221 2:
  222         ldp     A_l, A_h, [src, #-16]
  223         stp     A_l, A_h, [dst, #-16]
  224 .Ltail15down:
  225         /* Move up to 15 bytes of data.  Does not assume additional data
  226          * being moved.  Bits 3..0 of count select an 8-, 4-, 2- and 1-byte
              * move; each uses post-increment writeback except the last.  */
  227         tbz     count, #3, 1f
  228         ldr     tmp1, [src], #8
  229         str     tmp1, [dst], #8
  230 1:
  231         tbz     count, #2, 1f
  232         ldr     tmp1w, [src], #4
  233         str     tmp1w, [dst], #4
  234 1:
  235         tbz     count, #1, 1f
  236         ldrh    tmp1w, [src], #2
  237         strh    tmp1w, [dst], #2
  238 1:
  239         tbz     count, #0, 1f
  240         ldrb    tmp1w, [src]
  241         strb    tmp1w, [dst]
  242 1:
  243         ret
  244
  245 .Lmov_not_short_down:
  246         /* We don't much care about the alignment of DST, but we want SRC
  247          * to be 128-bit (16 byte) aligned so that we don't cross cache line
  248          * boundaries on both loads and stores.  */
  249         neg     tmp2, src
  250         ands    tmp2, tmp2, #15         /* Bytes to reach alignment.  */
  251         b.eq    2f              /* Already aligned; COUNT is still >= 64.  */
  252         sub     count, count, tmp2
  253         /* Move enough data to reach alignment; unlike memcpy, we have to
  254          * be aware of the overlap, which means we can't move data twice.  */
  255         tbz     tmp2, #3, 1f
  256         ldr     tmp1, [src], #8
  257         str     tmp1, [dst], #8
  258 1:
  259         tbz     tmp2, #2, 1f
  260         ldr     tmp1w, [src], #4
  261         str     tmp1w, [dst], #4
  262 1:
  263         tbz     tmp2, #1, 1f
  264         ldrh    tmp1w, [src], #2
  265         strh    tmp1w, [dst], #2
  266 1:
  267         tbz     tmp2, #0, 1f
  268         ldrb    tmp1w, [src], #1
  269         strb    tmp1w, [dst], #1
  270 1:
  271
  272         /* There may be fewer than 64 bytes to go now.  */
  273         cmp     count, #63
  274         b.le    .Ltail63down
  275 2:
  276         subs    count, count, #128      /* Negative if fewer than 128 remain.  */
  277         b.ge    .Lmov_body_large_down
  278         /* Less than 128 bytes to move, so handle 64 here and then jump
  279          * to the tail.  All four pairs are loaded before any is stored.  */
  280         ldp     A_l, A_h, [src]
  281         ldp     B_l, B_h, [src, #16]
  282         ldp     C_l, C_h, [src, #32]
  283         ldp     D_l, D_h, [src, #48]
  284         stp     A_l, A_h, [dst]
  285         stp     B_l, B_h, [dst, #16]
  286         stp     C_l, C_h, [dst, #32]
  287         stp     D_l, D_h, [dst, #48]
  288         tst     count, #0x3f
              /* The two adds below do not set flags, so b.ne still acts on the
               * result of the tst above.  */
  289         add     src, src, #64
  290         add     dst, dst, #64
  291         b.ne    .Ltail63down
  292         ret
  293
  294         /* Critical loop.  Start at a new cache line boundary.  Assuming
  295          * 64 bytes per line this ensures the entire loop is in one line.  */
  296         .p2align 6
  297 .Lmov_body_large_down:
  298         /* There are at least 128 bytes to move.  The first 64-byte block
              * is loaded here; inside the loop each store of the current block
              * is paired with the load of the next one.  DST is biased by -16
              * and SRC by +48 so the loop can use the offsets 16..64 with a
              * single writeback per pointer.  */
  299         ldp     A_l, A_h, [src, #0]
  300         sub     dst, dst, #16           /* Pre-bias.  */
  301         ldp     B_l, B_h, [src, #16]
  302         ldp     C_l, C_h, [src, #32]
  303         ldp     D_l, D_h, [src, #48]!   /* src += 64 - Pre-bias.  */
  304 1:
  305         stp     A_l, A_h, [dst, #16]
  306         ldp     A_l, A_h, [src, #16]
  307         stp     B_l, B_h, [dst, #32]
  308         ldp     B_l, B_h, [src, #32]
  309         stp     C_l, C_h, [dst, #48]
  310         ldp     C_l, C_h, [src, #48]
  311         stp     D_l, D_h, [dst, #64]!
  312         ldp     D_l, D_h, [src, #64]!
  313         subs    count, count, #64
  314         b.ge    1b
              /* Drain: store the block that was loaded by the last iteration,
               * then undo the bias so SRC and DST point just past the data
               * moved so far.  */
  315         stp     A_l, A_h, [dst, #16]
  316         stp     B_l, B_h, [dst, #32]
  317         stp     C_l, C_h, [dst, #48]
  318         stp     D_l, D_h, [dst, #64]
  319         add     src, src, #16
  320         add     dst, dst, #64 + 16
  321         tst     count, #0x3f
  322         b.ne    .Ltail63down
  323         ret
  324         .size memmove, . - memmove