common/lib/libc/arch/arm/string/memcpy_arm.S

   1 /*      $NetBSD: memcpy_arm.S,v 1.5 2013/12/02 21:21:33 joerg Exp $     */
   2
   3 /*-
   4  * Copyright (c) 1997 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * This code is derived from software contributed to The NetBSD Foundation
   8  * by Neil A. Carson and Mark Brinicombe
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 #include <machine/asm.h>
  33
  34 /*
  35  * This is one fun bit of code ...
  36  * Some easy listening music is suggested while trying to understand this
  37  * code e.g. Iron Maiden
  38  *
  39  * For anyone attempting to understand it :
  40  *
  41  * The core code is implemented here with simple stubs for memcpy().
  42  *
   43  * All local labels are prefixed with .Lmemcpy_
   44  * The f (forward copy) / b (backwards copy) label naming and the choice
   45  * of copy direction from the source and destination addresses are not
   46  * used by the code visible in this file: memcpy() here only ever copies
   47  * forwards, from the lowest address upwards.
  48  * Separate bits of code are used to deal with the following situations
  49  * for both the forward and backwards copy.
  50  * unaligned source address
  51  * unaligned destination address
  52  * Separate copy routines are used to produce an optimised result for each
  53  * of these cases.
  54  * The copy code will use LDM/STM instructions to copy up to 32 bytes at
  55  * a time where possible.
  56  *
   57  * Note: r12 (aka ip) can be trashed during the function along with
   58  * r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout.
  59  * Additional registers are preserved prior to use i.e. r4, r5 & lr
  60  *
  61  * Apologies for the state of the comments ;-)
  62  */
  63 /* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
   64 ENTRY(memcpy)
   65         /* In: r0 = dst, r1 = src, r2 = len.  Copies forwards only; returns the original dst in r0. */
   66         push    {r0, lr}                /* keep dst for the return value; lr becomes a scratch register */
   67
   68         subs    r2, r2, #4              /* r2 = len - 4 */
   69         blt     .Lmemcpy_l4             /* less than 4 bytes (signed test, so len is assumed < 2^31) */
   70         ands    r12, r0, #3             /* r12 = dst & 3 */
   71         bne     .Lmemcpy_destul         /* oh unaligned destination addr */
   72         ands    r12, r1, #3             /* r12 = src & 3 */
   73         bne     .Lmemcpy_srcul          /* oh unaligned source addr */
   74
   75 .Lmemcpy_t8:
   76         /* We have aligned source and destination; r2 = bytes remaining - 4 */
   77         subs    r2, r2, #8              /* r2 = remaining - 12 */
   78         blt     .Lmemcpy_l12            /* less than 12 bytes (4 from above) */
   79         subs    r2, r2, #0x14           /* r2 = remaining - 32 */
   80         blt     .Lmemcpy_l32            /* less than 32 bytes (12 from above) */
   81         push    {r4}            /* borrow r4 */
   82
   83         /* blat 32 bytes at a time */
   84         /* XXX for really big copies perhaps we should use more registers */
   85 .Lmemcpy_loop32:
   86         ldmia   r1!, {r3, r4, r12, lr}  /* first 16 bytes; r1 and r0 are post-incremented */
   87         stmia   r0!, {r3, r4, r12, lr}
   88         ldmia   r1!, {r3, r4, r12, lr}  /* second 16 bytes */
   89         stmia   r0!, {r3, r4, r12, lr}
   90         subs    r2, r2, #0x20           /* r2 = remaining - 32 */
   91         bge     .Lmemcpy_loop32         /* loop while another 32 bytes remain */
   92
   93         cmn     r2, #0x10               /* flags from r2 + 16: ge iff at least 16 bytes remain */
   94         ldmiage r1!, {r3, r4, r12, lr}  /* blat a remaining 16 bytes */
   95         stmiage r0!, {r3, r4, r12, lr}
   96         subge   r2, r2, #0x10           /* account for them (sub, not subs: flags untouched) */
   97         pop     {r4}            /* return r4 */
   98
   99 .Lmemcpy_l32:
  100         adds    r2, r2, #0x14           /* r2 = remaining - 12; ge iff 12 or more bytes remain */
  101
  102         /* blat 12 bytes at a time */
  103 .Lmemcpy_loop12:
  104         ldmiage r1!, {r3, r12, lr}      /* every instruction here is conditional: skipped when under 12 bytes remain */
  105         stmiage r0!, {r3, r12, lr}
  106         subsge  r2, r2, #0x0c
  107         bge     .Lmemcpy_loop12
  108
  109 .Lmemcpy_l12:
  110         adds    r2, r2, #8              /* r2 = remaining - 4 */
  111         blt     .Lmemcpy_l4             /* fewer than 4 bytes remain */
  112
  113         subs    r2, r2, #4              /* r2 = remaining - 8: lt -> 4..7 bytes left, ge -> 8..11 bytes left */
  114         ldrlt   r3, [r1], #4            /* lt: copy one word */
  115         strlt   r3, [r0], #4
  116         ldmiage r1!, {r3, r12}          /* ge: copy two words */
  117         stmiage r0!, {r3, r12}
  118         subge   r2, r2, #4              /* ge: re-bias so r2 = remaining - 4 on both paths */
  119
  120 .Lmemcpy_l4:
  121         /* less than 4 bytes to go */
  122         adds    r2, r2, #4              /* undo the bias: r2 = bytes remaining; eq when nothing is left */
  123 #ifdef __APCS_26_
  124         ldmiaeq sp!, {r0, pc}^          /* done; NOTE(review): guard is spelled __APCS_26_ with one trailing underscore - confirm that is the intended macro */
  125 #else
  126         popeq   {r0, pc}                /* done: r0 = original dst, return */
  127 #endif
  128         /* copy the crud byte at a time */
  129         cmp     r2, #2                  /* r2 != 0 here: always copy one byte, ge a second, gt a third */
  130         ldrb    r3, [r1], #1
  131         strb    r3, [r0], #1
  132         ldrbge  r3, [r1], #1
  133         strbge  r3, [r0], #1
  134         ldrbgt  r3, [r1], #1
  135         strbgt  r3, [r0], #1
  136         pop     {r0, pc}                /* r0 = original dst, return */
  137
  138         /* erg - unaligned destination */
  139 .Lmemcpy_destul:
  140         rsb     r12, r12, #4            /* r12 = 4 - (dst & 3) = bytes needed to word-align dst (1..3) */
  141         cmp     r12, #2                 /* flags select how many of the byte copies below execute */
  142         /* (only reached after the len >= 4 test at entry passed, so up to 3 byte copies stay within len) */
  143         /* align destination with byte copies */
  144         ldrb    r3, [r1], #1
  145         strb    r3, [r0], #1
  146         ldrbge  r3, [r1], #1
  147         strbge  r3, [r0], #1
  148         ldrbgt  r3, [r1], #1
  149         strbgt  r3, [r0], #1
  150         subs    r2, r2, r12             /* deduct the alignment bytes; r2 = remaining - 4 */
  151         blt     .Lmemcpy_l4             /* less than 4 bytes */
  152
  153         ands    r12, r1, #3             /* r12 = src & 3, as .Lmemcpy_srcul expects */
  154         beq     .Lmemcpy_t8             /* we have an aligned source */
  155
  156         /* erg - unaligned source */
  157         /* This is where it gets nasty ... */
  158 .Lmemcpy_srcul:
  159         bic     r1, r1, #3              /* round src down to a word boundary; r12 still holds src & 3 */
  160         ldr     lr, [r1], #4            /* lr = word containing the first source bytes; r1 -> next word */
  161         cmp     r12, #2
  162         bgt     .Lmemcpy_srcul3         /* src & 3 == 3 */
  163         beq     .Lmemcpy_srcul2         /* src & 3 == 2 */
  164         cmp     r2, #0x0c               /* src & 3 == 1 falls through; r2 = remaining - 4 */
  165         blt     .Lmemcpy_srcul1loop4    /* fewer than 16 bytes: word-at-a-time loop only */
  166         sub     r2, r2, #0x0c           /* r2 = remaining - 16 */
  167         push    {r4, r5}                /* borrow r4, r5 */
  168         /* each stored word merges the 3 leftover bytes of the previous source word with 1 byte of the next */
  169 .Lmemcpy_srcul1loop16:
  170 #ifdef __ARMEB__
  171         mov     r3, lr, lsl #8
  172 #else
  173         mov     r3, lr, lsr #8
  174 #endif
  175         ldmia   r1!, {r4, r5, r12, lr}  /* next four source words; lr carries over into the next pass */
  176 #ifdef __ARMEB__
  177         orr     r3, r3, r4, lsr #24
  178         mov     r4, r4, lsl #8
  179         orr     r4, r4, r5, lsr #24
  180         mov     r5, r5, lsl #8
  181         orr     r5, r5, r12, lsr #24
  182         mov     r12, r12, lsl #8
  183         orr     r12, r12, lr, lsr #24
  184 #else
  185         orr     r3, r3, r4, lsl #24
  186         mov     r4, r4, lsr #8
  187         orr     r4, r4, r5, lsl #24
  188         mov     r5, r5, lsr #8
  189         orr     r5, r5, r12, lsl #24
  190         mov     r12, r12, lsr #8
  191         orr     r12, r12, lr, lsl #24
  192 #endif
  193         stmia   r0!, {r3-r5, r12}       /* store 16 realigned bytes */
  194         subs    r2, r2, #0x10
  195         bge     .Lmemcpy_srcul1loop16
  196         pop     {r4, r5}                /* return r4, r5 */
  197         adds    r2, r2, #0x0c           /* r2 = remaining - 4 again */
  198         blt     .Lmemcpy_srcul1l4       /* fewer than 4 bytes remain */
  199
  200 .Lmemcpy_srcul1loop4:
  201 #ifdef __ARMEB__
  202         mov     r12, lr, lsl #8
  203 #else
  204         mov     r12, lr, lsr #8
  205 #endif
  206         ldr     lr, [r1], #4            /* next source word */
  207 #ifdef __ARMEB__
  208         orr     r12, r12, lr, lsr #24
  209 #else
  210         orr     r12, r12, lr, lsl #24
  211 #endif
  212         str     r12, [r0], #4           /* store one realigned word */
  213         subs    r2, r2, #4
  214         bge     .Lmemcpy_srcul1loop4
  215
  216 .Lmemcpy_srcul1l4:
  217         sub     r1, r1, #3              /* 3 bytes of the word in lr are still uncopied: point r1 back at them */
  218         b       .Lmemcpy_l4
  219
  220 .Lmemcpy_srcul2:
  221         cmp     r2, #0x0c               /* r2 = remaining - 4 */
  222         blt     .Lmemcpy_srcul2loop4    /* fewer than 16 bytes: word-at-a-time loop only */
  223         sub     r2, r2, #0x0c           /* r2 = remaining - 16 */
  224         push    {r4, r5}                /* borrow r4, r5 */
  225         /* each stored word merges the 2 leftover bytes of the previous source word with 2 bytes of the next */
  226 .Lmemcpy_srcul2loop16:
  227 #ifdef __ARMEB__
  228         mov     r3, lr, lsl #16
  229 #else
  230         mov     r3, lr, lsr #16
  231 #endif
  232         ldmia   r1!, {r4, r5, r12, lr}  /* next four source words; lr carries over into the next pass */
  233 #ifdef __ARMEB__
  234         orr     r3, r3, r4, lsr #16
  235         mov     r4, r4, lsl #16
  236         orr     r4, r4, r5, lsr #16
  237         mov     r5, r5, lsl #16
  238         orr     r5, r5, r12, lsr #16
  239         mov     r12, r12, lsl #16
  240         orr     r12, r12, lr, lsr #16
  241 #else
  242         orr     r3, r3, r4, lsl #16
  243         mov     r4, r4, lsr #16
  244         orr     r4, r4, r5, lsl #16
  245         mov     r5, r5, lsr #16
  246         orr     r5, r5, r12, lsl #16
  247         mov     r12, r12, lsr #16
  248         orr     r12, r12, lr, lsl #16
  249 #endif
  250         stmia   r0!, {r3-r5, r12}       /* store 16 realigned bytes */
  251         subs    r2, r2, #0x10
  252         bge     .Lmemcpy_srcul2loop16
  253         pop     {r4, r5}                /* return r4, r5 */
  254         adds    r2, r2, #0x0c           /* r2 = remaining - 4 again */
  255         blt     .Lmemcpy_srcul2l4       /* fewer than 4 bytes remain */
  256
  257 .Lmemcpy_srcul2loop4:
  258 #ifdef __ARMEB__
  259         mov     r12, lr, lsl #16
  260 #else
  261         mov     r12, lr, lsr #16
  262 #endif
  263         ldr     lr, [r1], #4            /* next source word */
  264 #ifdef __ARMEB__
  265         orr     r12, r12, lr, lsr #16
  266 #else
  267         orr     r12, r12, lr, lsl #16
  268 #endif
  269         str     r12, [r0], #4           /* store one realigned word */
  270         subs    r2, r2, #4
  271         bge     .Lmemcpy_srcul2loop4
  272
  273 .Lmemcpy_srcul2l4:
  274         sub     r1, r1, #2              /* 2 bytes of the word in lr are still uncopied: point r1 back at them */
  275         b       .Lmemcpy_l4
  276
  277 .Lmemcpy_srcul3:
  278         cmp     r2, #0x0c               /* r2 = remaining - 4 */
  279         blt     .Lmemcpy_srcul3loop4    /* fewer than 16 bytes: word-at-a-time loop only */
  280         sub     r2, r2, #0x0c           /* r2 = remaining - 16 */
  281         push    {r4, r5}                /* borrow r4, r5 */
  282         /* each stored word merges the 1 leftover byte of the previous source word with 3 bytes of the next */
  283 .Lmemcpy_srcul3loop16:
  284 #ifdef __ARMEB__
  285         mov     r3, lr, lsl #24
  286 #else
  287         mov     r3, lr, lsr #24
  288 #endif
  289         ldmia   r1!, {r4, r5, r12, lr}  /* next four source words; lr carries over into the next pass */
  290 #ifdef __ARMEB__
  291         orr     r3, r3, r4, lsr #8
  292         mov     r4, r4, lsl #24
  293         orr     r4, r4, r5, lsr #8
  294         mov     r5, r5, lsl #24
  295         orr     r5, r5, r12, lsr #8
  296         mov     r12, r12, lsl #24
  297         orr     r12, r12, lr, lsr #8
  298 #else
  299         orr     r3, r3, r4, lsl #8
  300         mov     r4, r4, lsr #24
  301         orr     r4, r4, r5, lsl #8
  302         mov     r5, r5, lsr #24
  303         orr     r5, r5, r12, lsl #8
  304         mov     r12, r12, lsr #24
  305         orr     r12, r12, lr, lsl #8
  306 #endif
  307         stmia   r0!, {r3-r5, r12}       /* store 16 realigned bytes */
  308         subs    r2, r2, #0x10
  309         bge     .Lmemcpy_srcul3loop16
  310         pop     {r4, r5}                /* return r4, r5 */
  311         adds    r2, r2, #0x0c           /* r2 = remaining - 4 again */
  312         blt     .Lmemcpy_srcul3l4       /* fewer than 4 bytes remain */
  313
  314 .Lmemcpy_srcul3loop4:
  315 #ifdef __ARMEB__
  316         mov     r12, lr, lsl #24
  317 #else
  318         mov     r12, lr, lsr #24
  319 #endif
  320         ldr     lr, [r1], #4            /* next source word */
  321 #ifdef __ARMEB__
  322         orr     r12, r12, lr, lsr #8
  323 #else
  324         orr     r12, r12, lr, lsl #8
  325 #endif
  326         str     r12, [r0], #4           /* store one realigned word */
  327         subs    r2, r2, #4
  328         bge     .Lmemcpy_srcul3loop4
  329
  330 .Lmemcpy_srcul3l4:
  331         sub     r1, r1, #1              /* 1 byte of the word in lr is still uncopied: point r1 back at it */
  332         b       .Lmemcpy_l4
  333 END(memcpy)