arch/x86/lib/memmove_64.S

   1 /*
   2  * Normally compiler builtins are used, but sometimes the compiler calls out
   3  * of line code. Based on asm-i386/string.h.
   4  *
   5  * This assembly file is re-written from memmove_64.c file.
   6  *      - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
   7  */
   8 #include <linux/linkage.h>
   9 #include <asm/cpufeatures.h>
  10 #include <asm/alternative-asm.h>
  11
  12 #undef memmove
  13
  14 /*
  15  * Implement memmove(). This can handle overlap between src and dst.
  16  *
  17  * Input:
  18  * rdi: dest
  19  * rsi: src
  20  * rdx: count
  21  *
  22  * Output:
  23  * rax: dest
  24  */
  25 .weak memmove
  26
  27 ENTRY(memmove)
  28 ENTRY(__memmove)
  29
  30         /* Handle more 32 bytes in loop */
  31         mov %rdi, %rax
  32         cmp $0x20, %rdx
  33         jb      1f
  34
  35         /* Decide forward/backward copy mode */
  36         cmp %rdi, %rsi
  37         jge .Lmemmove_begin_forward
  38         mov %rsi, %r8
  39         add %rdx, %r8
  40         cmp %rdi, %r8
  41         jg 2f
  42
  43 .Lmemmove_begin_forward:
  44         ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
  45
  46         /*
  47          * movsq instruction have many startup latency
  48          * so we handle small size by general register.
  49          */
  50         cmp  $680, %rdx
  51         jb      3f
  52         /*
  53          * movsq instruction is only good for aligned case.
  54          */
  55
  56         cmpb %dil, %sil
  57         je 4f
  58 3:
  59         sub $0x20, %rdx
  60         /*
  61          * We gobble 32 bytes forward in each loop.
  62          */
  63 5:
  64         sub $0x20, %rdx
  65         movq 0*8(%rsi), %r11
  66         movq 1*8(%rsi), %r10
  67         movq 2*8(%rsi), %r9
  68         movq 3*8(%rsi), %r8
  69         leaq 4*8(%rsi), %rsi
  70
  71         movq %r11, 0*8(%rdi)
  72         movq %r10, 1*8(%rdi)
  73         movq %r9, 2*8(%rdi)
  74         movq %r8, 3*8(%rdi)
  75         leaq 4*8(%rdi), %rdi
  76         jae 5b
  77         addq $0x20, %rdx
  78         jmp 1f
  79         /*
  80          * Handle data forward by movsq.
  81          */
  82         .p2align 4
  83 4:
  84         movq %rdx, %rcx
  85         movq -8(%rsi, %rdx), %r11
  86         lea -8(%rdi, %rdx), %r10
  87         shrq $3, %rcx
  88         rep movsq
  89         movq %r11, (%r10)
  90         jmp 13f
  91 .Lmemmove_end_forward:
  92
  93         /*
  94          * Handle data backward by movsq.
  95          */
  96         .p2align 4
  97 7:
  98         movq %rdx, %rcx
  99         movq (%rsi), %r11
 100         movq %rdi, %r10
 101         leaq -8(%rsi, %rdx), %rsi
 102         leaq -8(%rdi, %rdx), %rdi
 103         shrq $3, %rcx
 104         std
 105         rep movsq
 106         cld
 107         movq %r11, (%r10)
 108         jmp 13f
 109
 110         /*
 111          * Start to prepare for backward copy.
 112          */
 113         .p2align 4
 114 2:
 115         cmp $680, %rdx
 116         jb 6f
 117         cmp %dil, %sil
 118         je 7b
 119 6:
 120         /*
 121          * Calculate copy position to tail.
 122          */
 123         addq %rdx, %rsi
 124         addq %rdx, %rdi
 125         subq $0x20, %rdx
 126         /*
 127          * We gobble 32 bytes backward in each loop.
 128          */
 129 8:
 130         subq $0x20, %rdx
 131         movq -1*8(%rsi), %r11
 132         movq -2*8(%rsi), %r10
 133         movq -3*8(%rsi), %r9
 134         movq -4*8(%rsi), %r8
 135         leaq -4*8(%rsi), %rsi
 136
 137         movq %r11, -1*8(%rdi)
 138         movq %r10, -2*8(%rdi)
 139         movq %r9, -3*8(%rdi)
 140         movq %r8, -4*8(%rdi)
 141         leaq -4*8(%rdi), %rdi
 142         jae 8b
 143         /*
 144          * Calculate copy position to head.
 145          */
 146         addq $0x20, %rdx
 147         subq %rdx, %rsi
 148         subq %rdx, %rdi
 149 1:
 150         cmpq $16, %rdx
 151         jb 9f
 152         /*
 153          * Move data from 16 bytes to 31 bytes.
 154          */
 155         movq 0*8(%rsi), %r11
 156         movq 1*8(%rsi), %r10
 157         movq -2*8(%rsi, %rdx), %r9
 158         movq -1*8(%rsi, %rdx), %r8
 159         movq %r11, 0*8(%rdi)
 160         movq %r10, 1*8(%rdi)
 161         movq %r9, -2*8(%rdi, %rdx)
 162         movq %r8, -1*8(%rdi, %rdx)
 163         jmp 13f
 164         .p2align 4
 165 9:
 166         cmpq $8, %rdx
 167         jb 10f
 168         /*
 169          * Move data from 8 bytes to 15 bytes.
 170          */
 171         movq 0*8(%rsi), %r11
 172         movq -1*8(%rsi, %rdx), %r10
 173         movq %r11, 0*8(%rdi)
 174         movq %r10, -1*8(%rdi, %rdx)
 175         jmp 13f
 176 10:
 177         cmpq $4, %rdx
 178         jb 11f
 179         /*
 180          * Move data from 4 bytes to 7 bytes.
 181          */
 182         movl (%rsi), %r11d
 183         movl -4(%rsi, %rdx), %r10d
 184         movl %r11d, (%rdi)
 185         movl %r10d, -4(%rdi, %rdx)
 186         jmp 13f
 187 11:
 188         cmp $2, %rdx
 189         jb 12f
 190         /*
 191          * Move data from 2 bytes to 3 bytes.
 192          */
 193         movw (%rsi), %r11w
 194         movw -2(%rsi, %rdx), %r10w
 195         movw %r11w, (%rdi)
 196         movw %r10w, -2(%rdi, %rdx)
 197         jmp 13f
 198 12:
 199         cmp $1, %rdx
 200         jb 13f
 201         /*
 202          * Move data for 1 byte.
 203          */
 204         movb (%rsi), %r11b
 205         movb %r11b, (%rdi)
 206 13:
 207         retq
 208 ENDPROC(__memmove)
 209 ENDPROC(memmove)