arch/sh/lib64/copy_user_memcpy.S

   1 ! SPDX-License-Identifier: GPL-2.0
   2 !
   3 ! Fast SH memcpy
   4 !
   5 ! by Toshiyasu Morita (tm@netcom.com)
   6 ! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
   7 ! SH5 code Copyright 2002 SuperH Ltd.
   8 !
   9 ! Entry: ARG0: destination pointer
  10 !        ARG1: source pointer
  11 !        ARG2: byte count
  12 !
  13 ! Exit:  RESULT: destination pointer
  14 !        any other registers in the range r0-r7: trashed
  15 !
  16 ! Notes: Usually one wants to do small reads and write a longword, but
  17 !        unfortunately it is difficult in some cases to concatenate bytes
  18 !        into a longword on the SH, so this does a longword read and small
  19 !        writes.
  20 !
  21 ! This implementation makes two assumptions about how it is called:
  22 !
  23 ! 1.: If the byte count is nonzero, the address of the last byte to be
  24 !     copied is unsigned greater than the address of the first byte to
  25 !     be copied.  This could be easily swapped for a signed comparison,
  26 !     but the algorithm used needs some comparison.
  27 !
  28 ! 2.: When there are two or three bytes in the last word of an 11-or-more
  29 !     bytes memory chunk to be copied, the rest of the word can be read
  30 !     without side effects.
  31 !     This could be easily changed by increasing the minimum size of
  32 !     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
  33 !     however, this would cost a few extra cycles on average.
  34 !     For SHmedia, the assumption is that any quadword can be read in its
  35 !     entirety if at least one byte is included in the copy.
  36
  37 /* Imported into Linux kernel by Richard Curnow.  This is used to implement the
  38    __copy_user function in the general case, so it has to be a distinct
  39    function from intra-kernel memcpy to allow for exception fix-ups in the
  40    event that the user pointer is bad somewhere in the copy (e.g. due to
  41    running off the end of the vma).
  42
  43    Note, this algorithm will be slightly wasteful in the case where the source
  44    and destination pointers are equally aligned, because the stlo/sthi pairs
  45    could then be merged back into single stores.  If there are a lot of cache
  46    misses, this is probably offset by the stall lengths on the preloads.
  47
  48 */
  49
  50 /* NOTE : Prefetches removed and allocos guarded by synco to avoid TAKum03020
  51  * erratum.  The first two prefetches are nop-ed out to avoid upsetting the
  52  * instruction counts used in the jump address calculation.
  53  * */
  54
  55         .section .text..SHmedia32,"ax"
  56         .little
  57         .balign 32
  58         .global copy_user_memcpy
  59         .global copy_user_memcpy_end
             /*
              * copy_user_memcpy: entry point and size dispatch.
              *
              * Registers as used by the code below:
              *   r2  = destination pointer (all stores are relative to r2/r5/r22)
              *   r3  = source pointer
              *   r4  = byte count
              *   r18 = address branched to on exit (loaded into tr1 via ptabs)
              *
              * Counts of 25 or more go to Large.  Smaller counts are handled by
              * a computed jump into a table of fixed-size code slots that
              * starts at L1.
              */
  60 copy_user_memcpy:
  61
             /*
              * Helper macros: each expands to a lo/hi pair of partial accesses
              * covering the 8 bytes (".q", offsets O and O+7) or 4 bytes
              * (".l", offsets O and O+3) that start at P+O.  For the loads the
              * two halves land in D0 and D1 and are combined by the caller
              * with "or".
              */
  62 #define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
  63 #define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
  64 #define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
  65 #define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
  66
             /* Placeholder for a removed prefetch; keeps the instruction count. */
  67         nop ! ld.b r3,0,r63 ! TAKum03020
  68         pta/l Large,tr0
  69         movi 25,r0
             /* count >= 25 (unsigned): take the Large path. */
  70         bgeu/u r4,r0,tr0
             /*
              * Build the slot index from the count.  With r0 = nsb(r4) * 32,
              * the branch offset is (L1 - L0) + (63 - nsb(r4)) * 32 + 1, so
              * the slot is selected by the magnitude of the count.  The slot
              * comments in the table (0, 1, 2-3, 4-7, 8-15, 16-24 bytes, in
              * that order, eight instructions each) correspond to
              * 63 - nsb(r4) = 0..5.
              * NOTE(review): the "+ 1" presumably sets the low (ISA-mode) bit
              * of the branch target - confirm against the SH-5 manual.
              */
  71         nsb r4,r0
  72         shlli r0,5,r0
  73         movi (L1-L0+63*32 + 1) & 0xffff,r1
  74         sub r1, r0, r0
             /* tr0 = L0 + r0, i.e. the selected slot relative to this label. */
  75 L0:     ptrel r0,tr0
             /* r5 = destination end (r2 + count). */
  76         add r2,r4,r5
             /* tr1 = exit address; every small-copy case finishes via tr1. */
  77         ptabs r18,tr1
             /* r6 = source end (r3 + count). */
  78         add r3,r4,r6
             /* Jump into the slot table. */
  79         blink tr0,r63
  80
  81 /* Rearranged to make cut2 safe */
  82         .balign 8
             /*
              * Continuation of the 4..7 byte slot (that slot has no room left
              * for these instructions).  On entry r0 holds the first four
              * source bytes and r6/r7 the two halves of the last four; this
              * finishes storing the first word at r2 and stores the last word
              * so that it ends at r5 (destination end).  The two words may
              * overlap.
              */
  83 L4_7:   /* 4..7 byte memcpy cntd. */
  84         stlo.l r2, 0, r0
  85         or r6, r7, r6
  86         sthi.l r5, -1, r6
  87         stlo.l r5, -4, r6
  88         blink tr1,r63
  89
  90         .balign 8
             /*
              * Start of the slot table targeted by the computed branch at L0.
              * Each slot is exactly eight instructions (32 bytes); slots that
              * need fewer instructions share their space with a "cntd."
              * fragment of another case.  Do not add or remove instructions
              * between L1 and Large without keeping every slot at eight
              * instructions.
              *
              * Slot 0: count == 0, nothing to copy.  The nops pad the slot so
              * that, together with L2_3 below, it is eight instructions long.
              */
  91 L1:     /* 0 byte memcpy */
  92         nop
  93         blink tr1,r63
  94         nop
  95         nop
  96         nop
  97         nop
  98
             /*
              * Tail of the 2-or-3 byte case: r6 holds the last source byte,
              * which is written to the last destination byte (r5 - 1).
              */
  99 L2_3:   /* 2 or 3 byte memcpy cntd. */
 100         st.b r5,-1,r6
 101         blink tr1,r63
 102
             /* Slot 1 (three instructions, followed by L8_15 to fill the slot). */
 103         /* 1 byte memcpy */
 104         ld.b r3,0,r0
 105         st.b r2,0,r0
 106         blink tr1,r63
 107
             /*
              * Tail of the 8..15 byte case: same shape as L4_7 but with
              * quadword accesses.  Finishes the first 8 bytes at r2 and
              * stores the last 8 bytes ending at r5.
              */
 108 L8_15:  /* 8..15 byte memcpy cntd. */
 109         stlo.q r2, 0, r0
 110         or r6, r7, r6
 111         sthi.q r5, -1, r6
 112         stlo.q r5, -8, r6
 113         blink tr1,r63
 114
             /*
              * Slot 2: copies bytes 0 and 1 here, loads the last source byte
              * (r6 - 1) into r6 and lets L2_3 store it.  For a 2-byte copy the
              * last byte is byte 1, which is simply written twice.
              */
 115         /* 2 or 3 byte memcpy */
 116         ld.b r3,0,r0
             /* Placeholder for a removed prefetch; keeps the slot length. */
 117         nop ! ld.b r2,0,r63 ! TAKum03020
 118         ld.b r3,1,r1
 119         st.b r2,0,r0
 120         pta/l L2_3,tr0
 121         ld.b r6,-1,r6
 122         st.b r2,1,r1
 123         blink tr0, r63
 124
             /*
              * Slot 3: loads the first four source bytes (r0|r1) and the last
              * four (r7 = lo part, r6 = hi part), starts the first store and
              * continues at L4_7.
              */
 125         /* 4 .. 7 byte memcpy */
 126         LDUAL (r3, 0, r0, r1)
 127         pta L4_7, tr0
 128         ldlo.l r6, -4, r7
 129         or r0, r1, r0
 130         sthi.l r2, 3, r0
 131         ldhi.l r6, -1, r6
 132         blink tr0, r63
 133
             /* Slot 4: quadword version of the slot above; continues at L8_15. */
 134         /* 8 .. 15 byte memcpy */
 135         LDUAQ (r3, 0, r0, r1)
 136         pta L8_15, tr0
 137         ldlo.q r6, -8, r7
 138         or r0, r1, r0
 139         sthi.q r2, 7, r0
 140         ldhi.q r6, -1, r6
 141         blink tr0, r63
 142
             /*
              * Slot 5 (last slot, so it may exceed eight instructions): copies
              * the first 16 bytes as two unaligned quadwords (r0, r8) and the
              * last 8 bytes (r6|r7) ending at r5; the last quadword may
              * overlap the first two.
              */
 143         /* 16 .. 24 byte memcpy */
 144         LDUAQ (r3, 0, r0, r1)
 145         LDUAQ (r3, 8, r8, r9)
 146         or r0, r1, r0
 147         sthi.q r2, 7, r0
 148         or r8, r9, r8
 149         sthi.q r2, 15, r8
 150         ldlo.q r6, -8, r7
 151         ldhi.q r6, -1, r6
 152         stlo.q r2, 8, r8
 153         stlo.q r2, 0, r0
 154         or r6, r7, r6
 155         sthi.q r5, -1, r6
 156         stlo.q r5, -8, r6
 157         blink tr1,r63
 158
 159 Large:
             /*
              * Copies of 25 bytes or more.
              *
              * Register roles set up here:
              *   r7  = r3 | -8, i.e. (r3 & 7) - 8
              *   r22 = r2 - r7: running destination pointer.  r22 + r6 equals
              *         r3 - r7, the next multiple of 8 above the source
              *         pointer, so the ldx.q loads below use 8-byte-multiple
              *         source addresses.
              *   r6  = r3 - r2 (source minus destination offset)
              *   r5  = r2 + r4 - 16 (destination end minus 16)
              *   r0  = data in flight: loaded one step ahead of its store
              *   tr1 = Loop_ua
              */
 160         ! ld.b r2, 0, r63 ! TAKum03020
 161         pta/l  Loop_ua, tr1
 162         ori r3, -8, r7
 163         sub r2, r7, r22
 164         sub r3, r2, r6
 165         add r2, r4, r5
             /* Head: lo part of the first source quadword. */
 166         ldlo.q r3, 0, r0
 167         addi r5, -16, r5
 168         movi 64+8, r27 ! could subtract r7 from that.
             /* Store the head at the start of the destination. */
 169         stlo.q r2, 0, r0
 170         sthi.q r2, 7, r0
             /* Preload the first full source quadword for the loops. */
 171         ldx.q r22, r6, r0
             /* 72 > count (unsigned): skip the 32-byte loop, use Loop_ua only. */
 172         bgtu/l r27, r4, tr1
 173
             /*
              * Set up the 32-bytes-per-iteration loop:
              *   r27 = r5 - 48: Loop_line repeats while r27 >= r22
              *   r19/r20/r21 = r6 - 24 / - 16 / - 8: source offsets for the
              *         three quadwords preceding the one fetched via r6
              *   r36 = r6 + 64: only referenced by the commented-out prefetch
              *         at the top of Loop_line
              */
 174         addi r5, -48, r27
 175         pta/l Loop_line, tr0
 176         addi r6, 64, r36
 177         addi r6, -24, r19
 178         addi r6, -16, r20
 179         addi r6, -8, r21
 180
             /*
              * Main loop: advances r22 by 32 and copies four quadwords per
              * iteration.  r0 (loaded by the previous step) is stored at
              * r22 - 32; r23/r24/r25 are loaded and stored at r22 - 24,
              * r22 - 16 and r22 - 8; r0 is then reloaded for the next step.
              * Every store is an sthi/stlo pair.  The synco after alloco is
              * the TAKum03020 workaround mentioned in the note at the top of
              * the file.
              */
 181 Loop_line:
 182         ! ldx.q r22, r36, r63 ! TAKum03020
 183         alloco r22, 32
 184         synco
 185         addi r22, 32, r22
 186         ldx.q r22, r19, r23
 187         sthi.q r22, -25, r0
 188         ldx.q r22, r20, r24
 189         ldx.q r22, r21, r25
 190         stlo.q r22, -32, r0
 191         ldx.q r22, r6,  r0
 192         sthi.q r22, -17, r23
 193         sthi.q r22,  -9, r24
 194         sthi.q r22,  -1, r25
 195         stlo.q r22, -24, r23
 196         stlo.q r22, -16, r24
 197         stlo.q r22,  -8, r25
 198         bgeu r27, r22, tr0
 199
             /*
              * One quadword per iteration: advance r22 by 8, store the
              * pending r0 at r22 - 8, load the next quadword.  Repeats (via
              * tr1, set at Large) while r5 > r22.
              */
 200 Loop_ua:
 201         addi r22, 8, r22
 202         sthi.q r22, -1, r0
 203         stlo.q r22, -8, r0
 204         ldx.q r22, r6, r0
 205         bgtu/l r5, r22, tr1
 206
             /*
              * Tail: store the pending quadword at r22, then copy the final
              * 8 source bytes (ending at r3 + r4) to the final 8 destination
              * bytes.  r5 is destination end minus 16, so offsets 8..15 from
              * r5 address that last quadword.  tr1 is re-pointed from
              * Loop_ua to the exit address in r18.
              */
 207         add r3, r4, r7
 208         ldlo.q r7, -8, r1
 209         sthi.q r22, 7, r0
 210         ldhi.q r7, -1, r7
 211         ptabs r18,tr1
 212         stlo.q r22, 0, r0
 213         or r1, r7, r1
 214         sthi.q r5, 15, r1
 215         stlo.q r5, 8, r1
 216         blink tr1, r63
             /* Global end marker for the routine (declared .global above). */
 217 copy_user_memcpy_end:
 218         nop