arch/ia64/lib/memcpy.S

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /*
   3  *
   4  * Optimized version of the standard memcpy() function
   5  *
   6  * Inputs:
   7  *      in0:    destination address
   8  *      in1:    source address
   9  *      in2:    number of bytes to copy
  10  * Output:
  11  *      r8:     destination address (the value passed in in0)
  12  *
  13  * Copyright (C) 2000-2001 Hewlett-Packard Co
  14  *      Stephane Eranian <eranian@hpl.hp.com>
  15  *      David Mosberger-Tang <davidm@hpl.hp.com>
  16  */
  17 #include <asm/asmmacro.h>
  18 #include <asm/export.h>
  19
  20 GLOBAL_ENTRY(memcpy)
  21
  22 #       define MEM_LAT  21              /* latency to memory */
  23
  24 #       define dst      r2              /* running destination pointer (starts as in0) */
  25 #       define src      r3              /* running source pointer (starts as in1) */
  26 #       define retval   r8              /* return value: the original in0 */
  27 #       define saved_pfs r9             /* ar.pfs as returned by the entry alloc */
  28 #       define saved_lc r10             /* ar.lc as it was on entry */
  29 #       define saved_pr r11             /* predicate registers as they were on entry */
  30 #       define cnt      r16             /* loop count or byte count, depending on the path */
  31 #       define src2     r17             /* 8-byte aligned source pointer (.memcpy_long) */
  32 #       define t0       r18             /* t0-t4: scratch */
  33 #       define t1       r19
  34 #       define t2       r20
  35 #       define t3       r21
  36 #       define t4       r22
  37 #       define src_end  r23             /* aligned word holding the last source byte (.memcpy_long) */
  38
  39 #       define N        (MEM_LAT + 4)   /* pipeline stages of the aligned loop (value loaded into ar.ec) */
  40 #       define Nrot     ((N + 7) & ~7)  /* N rounded up to a multiple of 8: rotating registers requested */
  41
  42         /*
  43          * Entry point.  A zero length returns at once; fewer than 16 bytes goes to
  44          * .memcpy_short.  Otherwise, if src, dst and len are all multiples of eight,
  45          * the copy runs in the software-pipelined ld8/st8 loop below with no taken
  46          * branches (other than the loop itself) and a small icache footprint; any
  47          * other size/alignment combination is handed to .memcpy_long.
  48          */
  49         .prologue
  50         .save ar.pfs, saved_pfs
  51         alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot    // frame: 3 inputs, Nrot locals, 0 outputs, Nrot rotating
  52         .save ar.lc, saved_lc
  53         mov saved_lc=ar.lc      // ar.lc is overwritten below; keep the caller's value
  54         or t0=in0,in1           // t0 = dst | src ...
  55         ;;
  56
  57         or t0=t0,in2            // ... | len: low 3 bits are zero iff all three are multiples of 8
  58         .save pr, saved_pr
  59         mov saved_pr=pr         // predicates are overwritten below; keep the caller's values
  60
  61         .body
  62
  63         cmp.eq p6,p0=in2,r0     // zero length?
  64         mov retval=in0          // return dst
  65 (p6)    br.ret.spnt.many rp     // zero length, return immediately
  66         ;;
  67
  68         mov dst=in0             // copy because of rotation
  69         shr.u cnt=in2,3         // number of 8-byte words to copy
  70         mov pr.rot=1<<16        // p16 = 1, every other rotating predicate = 0
  71         ;;
  72
  73         adds cnt=-1,cnt         // br.ctop is repeat/until
  74         cmp.gtu p7,p0=16,in2    // copying less than 16 bytes?
  75         mov ar.ec=N             // epilogue count for the aligned loop
  76         ;;
  77
  78         and t0=0x7,t0           // t0 = (dst | src | len) & 7
  79         mov ar.lc=cnt           // loop count = (len / 8) - 1
  80         ;;
  81         cmp.ne p6,p0=t0,r0      // p6 = dst, src or len is not a multiple of 8
  82
  83         mov src=in1             // copy because of rotation
  84 (p7)    br.cond.spnt.few .memcpy_short  // length test comes first: every copy < 16 bytes is "short"
  85 (p6)    br.cond.spnt.few .memcpy_long
  86         ;;
  87         nop.m   0
  88         ;;
  89         nop.m   0
  90         nop.i   0
  91         ;;
  92         nop.m   0
  93         ;;
  94         .rotr val[N]            // val[0..N-1]: rotating data registers
  95         .rotp p[N]              // p[0..N-1]: rotating stage predicates
  96         .align 32
  97 1: { .mib
  98 (p[0])  ld8 val[0]=[src],8      // first stage: load the next word, src += 8
  99         nop.i 0
 100         brp.loop.imp 1b, 2f
 101 }
 102 2: { .mfb
 103 (p[N-1])st8 [dst]=val[N-1],8    // last stage: store the word loaded N-1 iterations ago, dst += 8
 104         nop.f 0
 105         br.ctop.dptk.few 1b
 106 }
 107         ;;
 108         mov ar.lc=saved_lc      // restore the caller's ar.lc
 109         mov pr=saved_pr,-1      // restore all predicate registers
 110         mov ar.pfs=saved_pfs    // restore the caller's ar.pfs
 111         br.ret.sptk.many rp
 112
 113         /*
 114          * Small (<16 bytes) copies, aligned or not, are done via a simple byte-at-a-time
 115          * copy loop.  This performs relatively poorly on Itanium, but it doesn't
 116          * get used very often (gcc inlines small copies) and due to atomicity
 117          * issues, we want to avoid read-modify-write of entire words.
 118          */
 119         .align 32
 120 .memcpy_short:
 121         adds cnt=-1,in2         // br.ctop is repeat/until
 122         mov ar.ec=MEM_LAT       // epilogue count: this loop has MEM_LAT stages
 123         brp.loop.imp 1f, 2f
 124         ;;
 125         mov ar.lc=cnt           // loop count = len - 1 (replaces the word count set at entry)
 126         ;;
 127         nop.m   0
 128         ;;
 129         nop.m   0
 130         nop.i   0
 131         ;;
 132         nop.m   0
 133         ;;
 134         nop.m   0
 135         ;;
 136         /*
 137          * It is faster to put a stop bit in the loop here because it makes
 138          * the pipeline shorter (and latency is what matters on short copies).
 139          */
 140         .align 32
 141 1: { .mib
 142 (p[0])  ld1 val[0]=[src],1      // first stage: load one byte, src += 1
 143         nop.i 0
 144         brp.loop.imp 1b, 2f
 145 } ;;
 146 2: { .mfb
 147 (p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1        // last stage: store the byte loaded MEM_LAT-1 iterations ago
 148         nop.f 0
 149         br.ctop.dptk.few 1b
 150 } ;;
 151         mov ar.lc=saved_lc      // restore the caller's ar.lc
 152         mov pr=saved_pr,-1      // restore all predicate registers
 153         mov ar.pfs=saved_pfs    // restore the caller's ar.pfs
 154         br.ret.sptk.many rp
 155
 156         /*
 157          * Large (>= 16 bytes) copying is done in a fancy way.  Latency isn't
 158          * an overriding concern here, but throughput is.  We first do
 159          * sub-word copying until the destination is aligned, then we check
 160          * if the source is also aligned.  If so, we do a simple load/store-loop
 161          * until there are fewer than 8 bytes left over and then we do the tail,
 162          * by storing the last few bytes using sub-word copying.  If the source
 163          * is not aligned, we branch off to the non-congruent loop.
 164          *
 165          *   stage:   op:
 166          *         0  ld
 167          *         :
 168          * MEM_LAT+3  shrp
 169          * MEM_LAT+4  st
 170          *
 171          * On Itanium, the pipeline itself runs without stalls.  However,  br.ctop
 172          * seems to introduce an unavoidable bubble in the pipeline so the overall
 173          * latency is 2 cycles/iteration.  This gives us a _copy_ throughput
 174          * of 4 byte/cycle.  Still not bad.
 175          */
 176 #       undef N
 177 #       undef Nrot
 178 #       define N        (MEM_LAT + 5)           /* number of stages */
 179 #       define Nrot     ((N+1 + 2 + 7) & ~7)    /* number of rotating regs */
 180
 181 #define LOG_LOOP_SIZE   6       /* log2 of the byte distance between consecutive COPY loops */
 182
 183 .memcpy_long:
 184         alloc t3=ar.pfs,3,Nrot,0,Nrot   // resize register frame
 185         and t0=-8,src           // t0 = src & ~7
 186         and t2=7,src            // t2 = src & 7
 187         ;;
 188         ld8 t0=[t0]             // t0 = 1st source word
 189         adds src2=7,src         // src2 = (src + 7)
 190         sub t4=r0,dst           // t4 = -dst
 191         ;;
 192         and src2=-8,src2        // src2 = (src + 7) & ~7
 193         shl t2=t2,3             // t2 = 8*(src & 7)
 194         shl t4=t4,3             // t4 = 8*(-dst); bits 3-5 = (-dst & 7) = # of bytes needed to align dst
 195         ;;
 196         ld8 t1=[src2]           // t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
 197         sub t3=64,t2            // t3 = 64-8*(src & 7)
 198         shr.u t0=t0,t2          // drop the bytes of the 1st word that lie below src
 199         ;;
 200         add src_end=src,in2     // src_end = src + len (one past the last source byte)
 201         shl t1=t1,t3            // move the bytes taken from t1 above those kept in t0
 202         mov pr=t4,0x38          // (p5,p4,p3)=(-dst & 7), i.e. # of bytes needed to align dst
 203         ;;
 204         or t0=t0,t1             // t0 = first 8 bytes of the source buffer
 205         mov cnt=r0              // cnt = # of bytes stored while aligning dst
 206         adds src_end=-1,src_end // src_end = address of the last source byte
 207         ;;
 208 (p3)    st1 [dst]=t0,1          // 1 byte if bit 0 of (-dst & 7) is set
 209 (p3)    shr.u t0=t0,8
 210 (p3)    adds cnt=1,cnt
 211         ;;
 212 (p4)    st2 [dst]=t0,2          // 2 bytes if bit 1 of (-dst & 7) is set
 213 (p4)    shr.u t0=t0,16
 214 (p4)    adds cnt=2,cnt
 215         ;;
 216 (p5)    st4 [dst]=t0,4          // 4 bytes if bit 2 of (-dst & 7) is set
 217 (p5)    adds cnt=4,cnt
 218         and src_end=-8,src_end  // src_end = last word of source buffer
 219         ;;
 220
 221         // At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:
 222
 223 1:{     add src=cnt,src                 // make src point to remainder of source buffer
 224         sub cnt=in2,cnt                 // cnt = number of bytes left to copy
 225         mov t4=ip                       // t4 = address of this bundle (label 1)
 226   }     ;;
 227         and src2=-8,src                 // align source pointer
 228         adds t4=.memcpy_loops-1b,t4     // t4 = address of .memcpy_loops
 229         mov ar.ec=N                     // epilogue count for the COPY loops
 230
 231         and t0=7,src                    // t0 = src & 7
 232         shr.u t2=cnt,3                  // t2 = number of 8-byte words left to copy
 233         shl cnt=cnt,3                   // move bits 0-2 to 3-5
 234         ;;
 235
 236         .rotr val[N+1], w[2]
 237         .rotp p[N]
 238
 239         cmp.ne p6,p0=t0,r0              // is src aligned, too?
 240         shl t0=t0,LOG_LOOP_SIZE         // t0 = (src & 7) << LOG_LOOP_SIZE = byte offset of the matching COPY loop
 241         adds t2=-1,t2                   // br.ctop is repeat/until
 242         ;;
 243         add t4=t0,t4                    // t4 = address of the COPY loop for this source misalignment
 244         mov pr=cnt,0x38                 // set (p5,p4,p3) to # of bytes left over after the last full word
 245         mov ar.lc=t2
 246         ;;
 247         nop.m   0
 248         ;;
 249         nop.m   0
 250         nop.i   0
 251         ;;
 252         nop.m   0
 253         ;;
 254 (p6)    ld8 val[1]=[src2],8             // prime the pump...
 255         mov b6=t4
 256         br.sptk.few b6                  // jump to the selected COPY loop
 257         ;;
 258
 259 .memcpy_tail:
 260         // At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
 261         // less than 8) and t0 contains the last few bytes of the src buffer:
 262 (p5)    st4 [dst]=t0,4          // 4 bytes if bit 2 of the remaining count is set
 263 (p5)    shr.u t0=t0,32
 264         mov ar.lc=saved_lc      // restore the caller's ar.lc
 265         ;;
 266 (p4)    st2 [dst]=t0,2          // 2 bytes if bit 1 of the remaining count is set
 267 (p4)    shr.u t0=t0,16
 268         mov ar.pfs=saved_pfs    // restore the caller's ar.pfs
 269         ;;
 270 (p3)    st1 [dst]=t0            // 1 byte if bit 0 of the remaining count is set
 271         mov pr=saved_pr,-1      // restore all predicate registers (after the last use of p3)
 272         br.ret.sptk.many rp
 273
 274 ///////////////////////////////////////////////////////
 275         .align 64
 276         /* COPY(shift,index): ld8/shrp/st8 loop for one source misalignment (shift = bit offset), then builds the final partial word in t0 and branches to .memcpy_tail. */
 277 #define COPY(shift,index)                                                                       \
 278  1: { .mib                                                                                      \
 279         (p[0])          ld8 val[0]=[src2],8;                                                    \
 280         (p[MEM_LAT+3])  shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift;                    \
 281                         brp.loop.imp 1b, 2f                                                     \
 282     };                                                                                          \
 283  2: { .mfb                                                                                      \
 284         (p[MEM_LAT+4])  st8 [dst]=w[1],8;                                                       \
 285                         nop.f 0;                                                                \
 286                         br.ctop.dptk.few 1b;                                                    \
 287     };                                                                                          \
 288                         ;;                                                                      \
 289                         ld8 val[N-1]=[src_end]; /* load last word (may be same as val[N]) */    \
 290                         ;;                                                                      \
 291                         shrp t0=val[N-1],val[N-index],shift;                                    \
 292                         br .memcpy_tail
 293 .memcpy_loops:  // one COPY per value of (src & 7); .memcpy_long branches to .memcpy_loops + ((src & 7) << LOG_LOOP_SIZE), so each expansion is presumably exactly 1 << LOG_LOOP_SIZE bytes -- confirm before editing COPY
 294         COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
 295         COPY(8, 0)      /* src & 7 == 1 */
 296         COPY(16, 0)     /* src & 7 == 2 */
 297         COPY(24, 0)     /* src & 7 == 3 */
 298         COPY(32, 0)     /* src & 7 == 4 */
 299         COPY(40, 0)     /* src & 7 == 5 */
 300         COPY(48, 0)     /* src & 7 == 6 */
 301         COPY(56, 0)     /* src & 7 == 7 */
 302
 303 END(memcpy)
 304 EXPORT_SYMBOL(memcpy)