libc/AOR_v20.02/string/arm/memcpy.S

   1 /*
   2  * memcpy - copy memory area
   3  *
   4  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   5  * See https://llvm.org/LICENSE.txt for license information.
   6  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   7  */
   8
   9 /*
  10    This memcpy routine is optimised for Cortex-A15 cores and takes advantage
  11    of VFP or NEON when built with the appropriate flags.
  12
  13    Assumptions:
  14
  15     ARMv6 (ARMv7-a if using Neon)
  16     ARM state
  17     Unaligned accesses
  18
  19  */
  20
  21 #include "../asmdefs.h"
  22
  23         .syntax unified
  24         /* This implementation requires ARM state.  */
  25         .arm
  26
     /* Select the build variant.  FRAME_SIZE is the size in bytes of the
        stack frame that the routine pushes at L(cpy_not_short) and pops
        again on every exit from the bulk-copy paths:
          - tmp2 is always stored at [sp];
          - builds that copy through general-purpose registers additionally
            store B_l/B_h, C_l/C_h and D_l/D_h at [sp, #8], [sp, #16] and
            [sp, #24].  */
  27 #ifdef __ARM_NEON__
  28
     /* NEON build: every bulk path uses d-registers, so only tmp2 needs a
        stack slot.  */
  29         .fpu    neon
  30         .arch   armv7-a
  31 # define FRAME_SIZE     4
  32 # define USE_VFP
  33 # define USE_NEON
  34
  35 #elif !defined (__SOFTFP__)
  36
     /* VFP without NEON: mutually aligned copies use d-registers, but the
        unaligned loop still copies through r4-r9, hence the full frame.  */
  37         .arch   armv6
  38         .fpu    vfpv2
  39 # define FRAME_SIZE     32
  40 # define USE_VFP
  41
  42 #else
     /* Soft-float build: all bulk copies go through general-purpose
        registers.  */
  43         .arch   armv6
  44 # define FRAME_SIZE    32
  45
  46 #endif
  47
  48 /* Old versions of GAS incorrectly implement the NEON align semantics.  */
     /* ALIGN (addr, align) expands to the alignment-qualified address operand
        used by the vst1.8 stores in the unaligned NEON loop.  */
  49 #ifdef BROKEN_ASM_NEON_ALIGN
  50 #define ALIGN(addr, align) addr,:align
  51 #else
  52 #define ALIGN(addr, align) addr:align
  53 #endif
  54
     /* The two constants below feed the computed jumps (add pc, pc, ...):
        in ARM state a read of pc yields the address of the reading
        instruction plus PC_OFFSET, and the jump target is measured from the
        instruction that follows it, INSN_SIZE bytes further on.  */
  55 #define PC_OFFSET       8       /* PC pipeline compensation.  */
  56 #define INSN_SIZE       4       /* Bytes per ARM-state instruction.  */
  57
  58 /* Call parameters.  */
  59 #define dstin   r0              /* Never written by the routine.  */
  60 #define src     r1
  61 #define count   r2
  62
  63 /* Locals.  */
  64 #define tmp1    r3
  65 #define dst     ip              /* Working copy of dstin.  */
  66 #define tmp2    r10             /* Spilled to [sp] before its first use.  */
  67
  68 #ifndef USE_NEON
  69 /* For bulk copies using GP registers.  */
     /* A_l and A_h share registers with count and tmp1, so they are only
        used once the remaining length has been moved into tmp2.  The B, C
        and D pairs are stored to the stack frame before being overwritten
        and reloaded from it afterwards.  */
  70 #define A_l     r2              /* Call-clobbered.  */
  71 #define A_h     r3              /* Call-clobbered.  */
  72 #define B_l     r4
  73 #define B_h     r5
  74 #define C_l     r6
  75 #define C_h     r7
  76 #define D_l     r8
  77 #define D_h     r9
  78 #endif
  79
  80 /* Number of lines ahead to pre-fetch data.  If you change this the code
  81    below will need adjustment to compensate.  */
     /* A line is 64 bytes here: the VFP long-copy loop uses
        prefetch_lines * 64 both as its look-ahead distance and as the
        number of bytes handled per iteration.  */
  82
  83 #define prefetch_lines  5
  84
  85 #ifdef USE_VFP
  86         .macro  cpy_line_vfp vreg, base
     /* Copy one 64-byte line to [dst + base] inside the software-pipelined
        long-copy loop.

        vreg  d-register that already holds the first 8 bytes of this line.
        base  byte offset of the line from dst (and from src).

        Expects d0-d2 to hold bytes 8..31 of the line and src to run 32
        bytes ahead of dst (the caller sets that bias up).  Every store is
        followed by a load that refills the same register, so stores and
        loads stay interleaved.  On exit d0-d2 hold bytes 8..31 of the next
        line, and vreg holds the first 8 bytes of the line prefetch_lines
        lines further on, which stands in for a PLD of that line.  Flags,
        src and dst are not modified.  */
  87         vstr    \vreg, [dst, #\base]
  88         vldr    \vreg, [src, #\base]              /* Bytes 32..39 of this line.  */
  89         vstr    d0, [dst, #\base + 8]
  90         vldr    d0, [src, #\base + 8]
  91         vstr    d1, [dst, #\base + 16]
  92         vldr    d1, [src, #\base + 16]
  93         vstr    d2, [dst, #\base + 24]
  94         vldr    d2, [src, #\base + 24]
  95         vstr    \vreg, [dst, #\base + 32]
     /* Look-ahead load: the - 32 cancels the bias already applied to src.  */
  96         vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]
  97         vstr    d0, [dst, #\base + 40]
  98         vldr    d0, [src, #\base + 40]            /* Next line, bytes 8..15.  */
  99         vstr    d1, [dst, #\base + 48]
 100         vldr    d1, [src, #\base + 48]
 101         vstr    d2, [dst, #\base + 56]
 102         vldr    d2, [src, #\base + 56]
 103         .endm
 104
 105         .macro  cpy_tail_vfp vreg, base
     /* Drain variant of cpy_line_vfp: copy one 64-byte line to [dst + base]
        but issue no look-ahead load, so nothing is read beyond the lines
        that were already started.

        vreg  d-register that already holds the first 8 bytes of this line.
        base  byte offset of the line from dst (and from src).

        Same register and pointer expectations as cpy_line_vfp.  On exit
        d0-d2 hold bytes 8..31 of the next line; vreg is not reloaded after
        its second store, so its contents are no longer needed.  */
 106         vstr    \vreg, [dst, #\base]
 107         vldr    \vreg, [src, #\base]              /* Bytes 32..39 of this line.  */
 108         vstr    d0, [dst, #\base + 8]
 109         vldr    d0, [src, #\base + 8]
 110         vstr    d1, [dst, #\base + 16]
 111         vldr    d1, [src, #\base + 16]
 112         vstr    d2, [dst, #\base + 24]
 113         vldr    d2, [src, #\base + 24]
 114         vstr    \vreg, [dst, #\base + 32]         /* No reload: pipeline is draining.  */
 115         vstr    d0, [dst, #\base + 40]
 116         vldr    d0, [src, #\base + 40]
 117         vstr    d1, [dst, #\base + 48]
 118         vldr    d1, [src, #\base + 48]
 119         vstr    d2, [dst, #\base + 56]
 120         vldr    d2, [src, #\base + 56]
 121         .endm
 122 #endif
 123
 124 ENTRY (__memcpy_arm)
     /* Copy count (r2) bytes from src (r1) to dstin (r0).

        r0 is never written, so the caller gets the destination pointer
        back in it.  r1, r2, r3, ip and the flags are clobbered, as are
        d0-d7 on VFP/NEON builds.  tmp2 (r10) and, on the paths that copy
        through general-purpose registers, r4-r9 are spilled to a
        FRAME_SIZE-byte stack frame and reloaded before returning.

        The byte count is an unsigned quantity, so every test on it uses an
        unsigned condition (hs/lo).  Signed conditions (ge/lt/mi/pl) would
        treat a count of 2^31 or more as negative and either drop into the
        short-copy path or leave the bulk loops early (this is the defect
        reported against glibc as CVE-2020-6096).  For counts below 2^31 the
        unsigned and signed conditions used here are equivalent.  */
 125
 126         mov     dst, dstin      /* Preserve dstin, we need to return it.  */
 127         cmp     count, #64
 128         bhs     L(cpy_not_short)        /* Unsigned: count is a size.  */
 129         /* Deal with small copies quickly by dropping straight into the
 130            exit block.  */
 131
     /* Copy the low six bits' worth of count (0..63 bytes) with no alignment
        assumption.  Also entered from the unaligned bulk path with tmp2
        already restored and the stack frame already popped.  */
 132 L(tail63unaligned):
 133 #ifdef USE_NEON
     /* Computed jump.  With n = count & 0x38 bytes to move as doublewords,
        land n bytes of code before the end of the 7-pair sequence below, so
        that exactly n / 8 load/store pairs execute (each pair is two
        instructions, i.e. 8 bytes of code per 8 bytes copied).  */
 134         and     tmp1, count, #0x38
 135         rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
 136         add     pc, pc, tmp1
 137         vld1.8  {d0}, [src]!    /* 14 words to go.  */
 138         vst1.8  {d0}, [dst]!
 139         vld1.8  {d0}, [src]!    /* 12 words to go.  */
 140         vst1.8  {d0}, [dst]!
 141         vld1.8  {d0}, [src]!    /* 10 words to go.  */
 142         vst1.8  {d0}, [dst]!
 143         vld1.8  {d0}, [src]!    /* 8 words to go.  */
 144         vst1.8  {d0}, [dst]!
 145         vld1.8  {d0}, [src]!    /* 6 words to go.  */
 146         vst1.8  {d0}, [dst]!
 147         vld1.8  {d0}, [src]!    /* 4 words to go.  */
 148         vst1.8  {d0}, [dst]!
 149         vld1.8  {d0}, [src]!    /* 2 words to go.  */
 150         vst1.8  {d0}, [dst]!
 151
 152         tst     count, #4       /* One more word if bit 2 is set.  */
 153         ldrne   tmp1, [src], #4
 154         strne   tmp1, [dst], #4
 155 #else
 156         /* Copy up to 15 full words of data.  May not be aligned.  */
 157         /* Cannot use VFP for unaligned data.  */
     /* src and dst are advanced past the words first; the sequence below
        then addresses them with negative offsets.  */
 158         and     tmp1, count, #0x3c
 159         add     dst, dst, tmp1
 160         add     src, src, tmp1
     /* Every 4 bytes copied cost 8 bytes of code (ldr + str), so the byte
        count is scaled by two: the constant is pre-halved here and the
        whole offset is doubled by the lsl #1 in the jump.  */
 161         rsb     tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
 162         /* Jump directly into the sequence below at the correct offset.  */
 163         add     pc, pc, tmp1, lsl #1
 164
 165         ldr     tmp1, [src, #-60]       /* 15 words to go.  */
 166         str     tmp1, [dst, #-60]
 167
 168         ldr     tmp1, [src, #-56]       /* 14 words to go.  */
 169         str     tmp1, [dst, #-56]
 170         ldr     tmp1, [src, #-52]
 171         str     tmp1, [dst, #-52]
 172
 173         ldr     tmp1, [src, #-48]       /* 12 words to go.  */
 174         str     tmp1, [dst, #-48]
 175         ldr     tmp1, [src, #-44]
 176         str     tmp1, [dst, #-44]
 177
 178         ldr     tmp1, [src, #-40]       /* 10 words to go.  */
 179         str     tmp1, [dst, #-40]
 180         ldr     tmp1, [src, #-36]
 181         str     tmp1, [dst, #-36]
 182
 183         ldr     tmp1, [src, #-32]       /* 8 words to go.  */
 184         str     tmp1, [dst, #-32]
 185         ldr     tmp1, [src, #-28]
 186         str     tmp1, [dst, #-28]
 187
 188         ldr     tmp1, [src, #-24]       /* 6 words to go.  */
 189         str     tmp1, [dst, #-24]
 190         ldr     tmp1, [src, #-20]
 191         str     tmp1, [dst, #-20]
 192
 193         ldr     tmp1, [src, #-16]       /* 4 words to go.  */
 194         str     tmp1, [dst, #-16]
 195         ldr     tmp1, [src, #-12]
 196         str     tmp1, [dst, #-12]
 197
 198         ldr     tmp1, [src, #-8]        /* 2 words to go.  */
 199         str     tmp1, [dst, #-8]
 200         ldr     tmp1, [src, #-4]
 201         str     tmp1, [dst, #-4]
 202 #endif
 203
     /* Last 0..3 bytes.  The shift moves bit 1 of count into C (halfword
        pending) and leaves the result non-zero iff bit 0 was set (byte
        pending).  */
 204         lsls    count, count, #31
 205         ldrhcs  tmp1, [src], #2
 206         ldrbne  src, [src]              /* Src is dead, use as a scratch.  */
 207         strhcs  tmp1, [dst], #2
 208         strbne  src, [dst]
 209         bx      lr
 210
 211 L(cpy_not_short):
 212         /* At least 64 bytes to copy, but don't know the alignment yet.  */
     /* Push the frame; tmp2 lives at [sp] until the matching post-indexed
        ldr on each exit path.  */
 213         str     tmp2, [sp, #-FRAME_SIZE]!
 214         and     tmp2, src, #7
 215         and     tmp1, dst, #7
 216         cmp     tmp1, tmp2
 217         bne     L(cpy_notaligned)
 218
 219 #ifdef USE_VFP
 220         /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
 221            that the FP pipeline is much better at streaming loads and
 222            stores.  This is outside the critical loop.  */
 223         vmov.f32        s0, s0
 224 #endif
 225
 226         /* SRC and DST have the same mutual 64-bit alignment, but we may
 227            still need to pre-copy some bytes to get to natural alignment.
 228            We bring SRC and DST into full 64-bit alignment.  */
     /* tmp2 = (dst & 7) << 29, zero when already aligned.  Negating it puts
        the number of bytes needed to reach alignment (1..7) in the top
        three bits: bit 31 (N) selects a word copy, and after the shift by
        two below C selects a halfword and a non-zero result a byte.  */
 229         lsls    tmp2, dst, #29
 230         beq     1f
 231         rsbs    tmp2, tmp2, #0
 232         sub     count, count, tmp2, lsr #29
 233         ldrmi   tmp1, [src], #4
 234         strmi   tmp1, [dst], #4
 235         lsls    tmp2, tmp2, #2
 236         ldrhcs  tmp1, [src], #2
 237         ldrbne  tmp2, [src], #1
 238         strhcs  tmp1, [dst], #2
 239         strbne  tmp2, [dst], #1
 240
 241 1:
     /* From here on tmp2 = bytes left - 64, so "no borrow" after a
        subtraction means at least one more whole 64-byte block remains.  */
 242         subs    tmp2, count, #64        /* Use tmp2 for count.  */
 243         blo     L(tail63aligned)        /* Unsigned: fewer than 64 left.  */
 244
 245         cmp     tmp2, #512
 246         bhs     L(cpy_body_long)        /* Unsigned compare.  */
 247
 248 L(cpy_body_medium):                     /* Count in tmp2.  */
 249 #ifdef USE_VFP
     /* 64 bytes per iteration through d0/d1.  The flags set by the subs
        survive to the branch because vldr, vstr and add do not write
        them.  */
 250 1:
 251         vldr    d0, [src, #0]
 252         subs    tmp2, tmp2, #64
 253         vldr    d1, [src, #8]
 254         vstr    d0, [dst, #0]
 255         vldr    d0, [src, #16]
 256         vstr    d1, [dst, #8]
 257         vldr    d1, [src, #24]
 258         vstr    d0, [dst, #16]
 259         vldr    d0, [src, #32]
 260         vstr    d1, [dst, #24]
 261         vldr    d1, [src, #40]
 262         vstr    d0, [dst, #32]
 263         vldr    d0, [src, #48]
 264         vstr    d1, [dst, #40]
 265         vldr    d1, [src, #56]
 266         vstr    d0, [dst, #48]
 267         add     src, src, #64
 268         vstr    d1, [dst, #56]
 269         add     dst, dst, #64
 270         bhs     1b                      /* Unsigned: another block left.  */
 271         tst     tmp2, #0x3f
 272         beq     L(done)
 273
 274 L(tail63aligned):                       /* Count in tmp2.  */
     /* Same computed jump as the NEON form of L(tail63unaligned), but using
        negative offsets from pre-advanced pointers: n = tmp2 & 0x38 bytes
        are moved by the last n / 8 vldr/vstr pairs.  */
 275         and     tmp1, tmp2, #0x38
 276         add     dst, dst, tmp1
 277         add     src, src, tmp1
 278         rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
 279         add     pc, pc, tmp1
 280
 281         vldr    d0, [src, #-56] /* 14 words to go.  */
 282         vstr    d0, [dst, #-56]
 283         vldr    d0, [src, #-48] /* 12 words to go.  */
 284         vstr    d0, [dst, #-48]
 285         vldr    d0, [src, #-40] /* 10 words to go.  */
 286         vstr    d0, [dst, #-40]
 287         vldr    d0, [src, #-32] /* 8 words to go.  */
 288         vstr    d0, [dst, #-32]
 289         vldr    d0, [src, #-24] /* 6 words to go.  */
 290         vstr    d0, [dst, #-24]
 291         vldr    d0, [src, #-16] /* 4 words to go.  */
 292         vstr    d0, [dst, #-16]
 293         vldr    d0, [src, #-8]  /* 2 words to go.  */
 294         vstr    d0, [dst, #-8]
 295 #else
     /* GP-register version.  The pointers are biased by -8 so that the last
        ldrd/strd of each iteration can advance them with writeback.  A_l
        overwrites count, which is why the length lives in tmp2.  */
 296         sub     src, src, #8
 297         sub     dst, dst, #8
 298 1:
 299         ldrd    A_l, A_h, [src, #8]
 300         strd    A_l, A_h, [dst, #8]
 301         ldrd    A_l, A_h, [src, #16]
 302         strd    A_l, A_h, [dst, #16]
 303         ldrd    A_l, A_h, [src, #24]
 304         strd    A_l, A_h, [dst, #24]
 305         ldrd    A_l, A_h, [src, #32]
 306         strd    A_l, A_h, [dst, #32]
 307         ldrd    A_l, A_h, [src, #40]
 308         strd    A_l, A_h, [dst, #40]
 309         ldrd    A_l, A_h, [src, #48]
 310         strd    A_l, A_h, [dst, #48]
 311         ldrd    A_l, A_h, [src, #56]
 312         strd    A_l, A_h, [dst, #56]
 313         ldrd    A_l, A_h, [src, #64]!
 314         strd    A_l, A_h, [dst, #64]!
 315         subs    tmp2, tmp2, #64
 316         bhs     1b                      /* Unsigned: another block left.  */
 317         tst     tmp2, #0x3f
 318         bne     1f
 319         ldr     tmp2,[sp], #FRAME_SIZE
 320         bx      lr
 321 1:
     /* Undo the -8 bias before falling into the tail.  */
 322         add     src, src, #8
 323         add     dst, dst, #8
 324
 325 L(tail63aligned):                       /* Count in tmp2.  */
 326         /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
 327            we know that the src and dest are 64-bit aligned so we can use
 328            LDRD/STRD to improve efficiency.  */
 329         /* TMP2 is now negative, but we don't care about that.  The bottom
 330            six bits still tell us how many bytes are left to copy.  */
 331
 332         and     tmp1, tmp2, #0x38
 333         add     dst, dst, tmp1
 334         add     src, src, tmp1
 335         rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
 336         add     pc, pc, tmp1
 337         ldrd    A_l, A_h, [src, #-56]   /* 14 words to go.  */
 338         strd    A_l, A_h, [dst, #-56]
 339         ldrd    A_l, A_h, [src, #-48]   /* 12 words to go.  */
 340         strd    A_l, A_h, [dst, #-48]
 341         ldrd    A_l, A_h, [src, #-40]   /* 10 words to go.  */
 342         strd    A_l, A_h, [dst, #-40]
 343         ldrd    A_l, A_h, [src, #-32]   /* 8 words to go.  */
 344         strd    A_l, A_h, [dst, #-32]
 345         ldrd    A_l, A_h, [src, #-24]   /* 6 words to go.  */
 346         strd    A_l, A_h, [dst, #-24]
 347         ldrd    A_l, A_h, [src, #-16]   /* 4 words to go.  */
 348         strd    A_l, A_h, [dst, #-16]
 349         ldrd    A_l, A_h, [src, #-8]    /* 2 words to go.  */
 350         strd    A_l, A_h, [dst, #-8]
 351
 352 #endif
     /* Remaining 0..7 bytes: word if bit 2, halfword if bit 1, byte if
        bit 0 of the residual count.  */
 353         tst     tmp2, #4
 354         ldrne   tmp1, [src], #4
 355         strne   tmp1, [dst], #4
 356         lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead. */
 357         ldrhcs  tmp1, [src], #2
 358         ldrbne  tmp2, [src]
 359         strhcs  tmp1, [dst], #2
 360         strbne  tmp2, [dst]
 361
 362 L(done):
     /* Restore tmp2 and pop the frame.  */
 363         ldr     tmp2, [sp], #FRAME_SIZE
 364         bx      lr
 365
 366 L(cpy_body_long):                       /* Count in tmp2.  */
 367
 368         /* Long copy.  We know that there's at least (prefetch_lines * 64)
 369            bytes to go.  */
 370 #ifdef USE_VFP
 371         /* Don't use PLD.  Instead, read some data in advance of the current
 372            copy position into a register.  This should act like a PLD
 373            operation but we won't have to repeat the transfer.  */
 374
     /* Prime the pipeline: d3-d7 take the first doubleword of each of the
        next prefetch_lines lines, d0-d2 take bytes 8..31 of the first line,
        and src is moved 32 bytes ahead of dst as the copy macros expect.  */
 375         vldr    d3, [src, #0]
 376         vldr    d4, [src, #64]
 377         vldr    d5, [src, #128]
 378         vldr    d6, [src, #192]
 379         vldr    d7, [src, #256]
 380
 381         vldr    d0, [src, #8]
 382         vldr    d1, [src, #16]
 383         vldr    d2, [src, #24]
 384         add     src, src, #32
 385
     /* Bias tmp2 by two blocks of prefetch_lines lines: the loop reads up to
        prefetch_lines lines ahead of what it stores, and the drain code at
        2: always copies one further block.  */
 386         subs    tmp2, tmp2, #prefetch_lines * 64 * 2
 387         blo     2f                      /* Unsigned: too short to loop.  */
 388 1:
 389         cpy_line_vfp    d3, 0
 390         cpy_line_vfp    d4, 64
 391         cpy_line_vfp    d5, 128
 392         add     dst, dst, #3 * 64
 393         add     src, src, #3 * 64
 394         cpy_line_vfp    d6, 0
 395         cpy_line_vfp    d7, 64
 396         add     dst, dst, #2 * 64
 397         add     src, src, #2 * 64
 398         subs    tmp2, tmp2, #prefetch_lines * 64
 399         bhs     1b                      /* Unsigned: another block left.  */
 400
 401 2:
     /* Drain: copy the prefetch_lines lines already started, without
        reading ahead any further.  */
 402         cpy_tail_vfp    d3, 0
 403         cpy_tail_vfp    d4, 64
 404         cpy_tail_vfp    d5, 128
 405         add     src, src, #3 * 64
 406         add     dst, dst, #3 * 64
 407         cpy_tail_vfp    d6, 0
     /* Last line, expanded by hand: it must not load the start of a
        following line, and it removes the 32-byte src bias (src advances by
        96 here, dst by 128 for the final two lines).  */
 408         vstr    d7, [dst, #64]
 409         vldr    d7, [src, #64]
 410         vstr    d0, [dst, #64 + 8]
 411         vldr    d0, [src, #64 + 8]
 412         vstr    d1, [dst, #64 + 16]
 413         vldr    d1, [src, #64 + 16]
 414         vstr    d2, [dst, #64 + 24]
 415         vldr    d2, [src, #64 + 24]
 416         vstr    d7, [dst, #64 + 32]
 417         add     src, src, #96
 418         vstr    d0, [dst, #64 + 40]
 419         vstr    d1, [dst, #64 + 48]
 420         vstr    d2, [dst, #64 + 56]
 421         add     dst, dst, #128
     /* Back to the medium-loop convention: tmp2 = bytes left - 64.  */
 422         add     tmp2, tmp2, #prefetch_lines * 64
 423         b       L(cpy_body_medium)
 424 #else
 425         /* Long copy.  Use an SMS style loop to maximize the I/O
 426            bandwidth of the core.  We don't have enough spare registers
 427            to synthesise prefetching, so use PLD operations.  */
 428         /* Pre-bias src and dst.  */
 429         sub     src, src, #8
 430         sub     dst, dst, #8
 431         pld     [src, #8]
 432         pld     [src, #72]
 433         subs    tmp2, tmp2, #64
 434         pld     [src, #136]
     /* Each of B, C and D is stored to its frame slot immediately before
        the load that overwrites it.  */
 435         ldrd    A_l, A_h, [src, #8]
 436         strd    B_l, B_h, [sp, #8]
 437         ldrd    B_l, B_h, [src, #16]
 438         strd    C_l, C_h, [sp, #16]
 439         ldrd    C_l, C_h, [src, #24]
 440         strd    D_l, D_h, [sp, #24]
 441         pld     [src, #200]
 442         ldrd    D_l, D_h, [src, #32]!
 443         b       1f
 444         .p2align        6
 445 2:
 446         pld     [src, #232]
 447         strd    A_l, A_h, [dst, #40]
 448         ldrd    A_l, A_h, [src, #40]
 449         strd    B_l, B_h, [dst, #48]
 450         ldrd    B_l, B_h, [src, #48]
 451         strd    C_l, C_h, [dst, #56]
 452         ldrd    C_l, C_h, [src, #56]
 453         strd    D_l, D_h, [dst, #64]!
 454         ldrd    D_l, D_h, [src, #64]!
 455         subs    tmp2, tmp2, #64
 456 1:
 457         strd    A_l, A_h, [dst, #8]
 458         ldrd    A_l, A_h, [src, #8]
 459         strd    B_l, B_h, [dst, #16]
 460         ldrd    B_l, B_h, [src, #16]
 461         strd    C_l, C_h, [dst, #24]
 462         ldrd    C_l, C_h, [src, #24]
 463         strd    D_l, D_h, [dst, #32]
 464         ldrd    D_l, D_h, [src, #32]
 465         bcs     2b                      /* Carry from the last subs.  */
 466         /* Save the remaining bytes and restore the callee-saved regs.  */
 467         strd    A_l, A_h, [dst, #40]
 468         add     src, src, #40
 469         strd    B_l, B_h, [dst, #48]
 470         ldrd    B_l, B_h, [sp, #8]
 471         strd    C_l, C_h, [dst, #56]
 472         ldrd    C_l, C_h, [sp, #16]
 473         strd    D_l, D_h, [dst, #64]
 474         ldrd    D_l, D_h, [sp, #24]
 475         add     dst, dst, #72
 476         tst     tmp2, #0x3f
 477         bne     L(tail63aligned)
 478         ldr     tmp2, [sp], #FRAME_SIZE
 479         bx      lr
 480 #endif
 481
 482 L(cpy_notaligned):
 483         pld     [src]
 484         pld     [src, #64]
 485         /* There's at least 64 bytes to copy, but there is no mutual
 486            alignment.  */
 487         /* Bring DST to 64-bit alignment.  */
     /* Same bit trick as in the aligned path: after the negate, the top
        three bits of tmp2 hold the number of bytes needed to align dst.  */
 488         lsls    tmp2, dst, #29
 489         pld     [src, #(2 * 64)]
 490         beq     1f
 491         rsbs    tmp2, tmp2, #0
 492         sub     count, count, tmp2, lsr #29
 493         ldrmi   tmp1, [src], #4
 494         strmi   tmp1, [dst], #4
 495         lsls    tmp2, tmp2, #2
 496         ldrbne  tmp1, [src], #1
 497         ldrhcs  tmp2, [src], #2
 498         strbne  tmp1, [dst], #1
 499         strhcs  tmp2, [dst], #2
 500 1:
 501         pld     [src, #(3 * 64)]
 502         subs    count, count, #64
     /* Fewer than 64 bytes left: pop the frame here, because
        L(tail63unaligned) returns with a bare bx lr.  Unsigned test.  */
 503         ldrlo   tmp2, [sp], #FRAME_SIZE
 504         blo     L(tail63unaligned)
 505         pld     [src, #(4 * 64)]
 506
 507 #ifdef USE_NEON
     /* Two 32-byte register groups are kept in flight: each store of a
        group is followed by a reload of the same group.  */
 508         vld1.8  {d0-d3}, [src]!
 509         vld1.8  {d4-d7}, [src]!
 510         subs    count, count, #64
 511         blo     2f                      /* Unsigned: only one block.  */
 512 1:
 513         pld     [src, #(4 * 64)]
 514         vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
 515         vld1.8  {d0-d3}, [src]!
 516         vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
 517         vld1.8  {d4-d7}, [src]!
 518         subs    count, count, #64
 519         bhs     1b                      /* Unsigned: another block left.  */
 520 2:
 521         vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
 522         vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
     /* Z from this ands is still valid at the bne below; the ldr in between
        does not write the flags.  */
 523         ands    count, count, #0x3f
 524 #else
 525         /* Use an SMS style loop to maximize the I/O bandwidth.  */
     /* Loads are single words because src need not be doubleword aligned;
        stores are strd because dst now is.  src is biased by -4 and dst by
        -8 so that the last access of each iteration can use writeback.  */
 526         sub     src, src, #4
 527         sub     dst, dst, #8
 528         subs    tmp2, count, #64        /* Use tmp2 for count.  */
 529         ldr     A_l, [src, #4]
 530         ldr     A_h, [src, #8]
 531         strd    B_l, B_h, [sp, #8]
 532         ldr     B_l, [src, #12]
 533         ldr     B_h, [src, #16]
 534         strd    C_l, C_h, [sp, #16]
 535         ldr     C_l, [src, #20]
 536         ldr     C_h, [src, #24]
 537         strd    D_l, D_h, [sp, #24]
 538         ldr     D_l, [src, #28]
 539         ldr     D_h, [src, #32]!
 540         b       1f
 541         .p2align        6
 542 2:
 543         pld     [src, #(5 * 64) - (32 - 4)]
 544         strd    A_l, A_h, [dst, #40]
 545         ldr     A_l, [src, #36]
 546         ldr     A_h, [src, #40]
 547         strd    B_l, B_h, [dst, #48]
 548         ldr     B_l, [src, #44]
 549         ldr     B_h, [src, #48]
 550         strd    C_l, C_h, [dst, #56]
 551         ldr     C_l, [src, #52]
 552         ldr     C_h, [src, #56]
 553         strd    D_l, D_h, [dst, #64]!
 554         ldr     D_l, [src, #60]
 555         ldr     D_h, [src, #64]!
 556         subs    tmp2, tmp2, #64
 557 1:
 558         strd    A_l, A_h, [dst, #8]
 559         ldr     A_l, [src, #4]
 560         ldr     A_h, [src, #8]
 561         strd    B_l, B_h, [dst, #16]
 562         ldr     B_l, [src, #12]
 563         ldr     B_h, [src, #16]
 564         strd    C_l, C_h, [dst, #24]
 565         ldr     C_l, [src, #20]
 566         ldr     C_h, [src, #24]
 567         strd    D_l, D_h, [dst, #32]
 568         ldr     D_l, [src, #28]
 569         ldr     D_h, [src, #32]
 570         bcs     2b                      /* Carry from the last subs.  */
 571
 572         /* Save the remaining bytes and restore the callee-saved regs.  */
 573         strd    A_l, A_h, [dst, #40]
 574         add     src, src, #36
 575         strd    B_l, B_h, [dst, #48]
 576         ldrd    B_l, B_h, [sp, #8]
 577         strd    C_l, C_h, [dst, #56]
 578         ldrd    C_l, C_h, [sp, #16]
 579         strd    D_l, D_h, [dst, #64]
 580         ldrd    D_l, D_h, [sp, #24]
 581         add     dst, dst, #72
 582         ands    count, tmp2, #0x3f
 583 #endif
     /* Pop the frame first (ldr leaves the flags alone), then finish any
        remaining 1..63 bytes in the unaligned tail.  */
 584         ldr     tmp2, [sp], #FRAME_SIZE
 585         bne     L(tail63unaligned)
 586         bx      lr
 587
 588 END (__memcpy_arm)