arch/x86/lib/memmove_64.S

   1 /*
   2  * Normally compiler builtins are used, but sometimes the compiler calls
   3  * out-of-line code. Based on asm-i386/string.h.
   4  *
   5  * This assembly file is rewritten from the memmove_64.c file.
   6  *      - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
   7  */
   8 #define _STRING_C
   9 #include <linux/linkage.h>
  10 #include <asm/dwarf2.h>
  11 #include <asm/cpufeature.h>
  12 #include <asm/alternative-asm.h>
  13
  14 #undef memmove
  15
  16 /*
  17  * Implement memmove(). This can handle overlap between src and dst.
  18  * May modify rcx, rdx, rsi, rdi, r8-r11 and the flags.
  19  * Input:
  20  * rdi: dest
  21  * rsi: src
  22  * rdx: count (in bytes)
  23  *
  24  * Output:
  25  * rax: dest (copied from rdi on entry)
  26  */
  27 ENTRY(memmove)
  28         CFI_STARTPROC
  29
  30         /* Counts below 32 bytes skip the loops and go straight to the tail code */
  31         mov %rdi, %rax                  /* return value = dest */
  32         cmp $0x20, %rdx
  33         jb      1f
  34
  35         /* Decide forward/backward copy mode */
  36         cmp %rdi, %rsi
  37         jge .Lmemmove_begin_forward     /* src >= dest (signed compare): copy forward */
  38         mov %rsi, %r8
  39         add %rdx, %r8                   /* r8 = src + count */
  40         cmp %rdi, %r8
  41         jg 2f                           /* src < dest < src + count: copy backward */
  42
  43 .Lmemmove_begin_forward:               /* start of the range named by the altinstruction entry below */
  44         /*
  45          * The movsq instruction has a high startup latency,
  46          * so small sizes are handled with general registers.
  47          */
  48         cmp  $680, %rdx
  49         jb      3f                      /* count < 680: use the register loop */
  50         /*
  51          * The movsq instruction is only good for the aligned case.
  52          */
  53
  54         cmpb %dil, %sil                 /* compares only the low byte of src and dest */
  55         je 4f                           /* equal low bytes: use rep movsq */
  56 3:
  57         sub $0x20, %rdx                 /* bias count by -32 for the loop test below */
  58         /*
  59          * Copy 32 bytes forward in each loop iteration.
  60          */
  61 5:
  62         sub $0x20, %rdx                 /* borrows (CF=1) once fewer than 32 bytes follow this chunk */
  63         movq 0*8(%rsi), %r11
  64         movq 1*8(%rsi), %r10
  65         movq 2*8(%rsi), %r9
  66         movq 3*8(%rsi), %r8
  67         leaq 4*8(%rsi), %rsi            /* advance src; lea and mov leave the flags alone */
  68
  69         movq %r11, 0*8(%rdi)
  70         movq %r10, 1*8(%rdi)
  71         movq %r9, 2*8(%rdi)
  72         movq %r8, 3*8(%rdi)
  73         leaq 4*8(%rdi), %rdi            /* advance dest */
  74         jae 5b                          /* loop while the sub at 5: did not borrow */
  75         addq $0x20, %rdx                /* remove the bias: rdx = bytes left (0..31) */
  76         jmp 1f                          /* finish with the tail code */
  77         /*
  78          * Copy forward with rep movsq.
  79          */
  80         .p2align 4
  81 4:
  82         movq %rdx, %rcx
  83         movq -8(%rsi, %rdx), %r11       /* save the last 8 source bytes before copying */
  84         lea -8(%rdi, %rdx), %r10        /* r10 = address of the last 8 dest bytes */
  85         shrq $3, %rcx                   /* rcx = count / 8 (whole qwords) */
  86         rep movsq
  87         movq %r11, (%r10)               /* store the saved tail; covers the count % 8 leftover */
  88         jmp 13f
  89 .Lmemmove_end_forward:                 /* end of the range named by the altinstruction entry below */
  90
  91         /*
  92          * Copy backward with rep movsq.
  93          */
  94         .p2align 4
  95 7:
  96         movq %rdx, %rcx
  97         movq (%rsi), %r11               /* save the first 8 source bytes before copying */
  98         movq %rdi, %r10                 /* r10 = start of dest */
  99         leaq -8(%rsi, %rdx), %rsi       /* rsi = last qword of src */
 100         leaq -8(%rdi, %rdx), %rdi       /* rdi = last qword of dest */
 101         shrq $3, %rcx                   /* rcx = count / 8 (whole qwords) */
 102         std                             /* direction flag set: movsq walks downward */
 103         rep movsq
 104         cld                             /* clear the direction flag again */
 105         movq %r11, (%r10)               /* store the saved head; covers the count % 8 leftover */
 106         jmp 13f
 107
 108         /*
 109          * Backward copy: pick rep movsq (7:) or the register loop (6:).
 110          */
 111         .p2align 4
 112 2:
 113         cmp $680, %rdx
 114         jb 6f                           /* count < 680: use the register loop */
 115         cmp %dil, %sil                  /* compares only the low byte of src and dest */
 116         je 7b                           /* equal low bytes: use rep movsq */
 117 6:
 118         /*
 119          * Move both pointers to the end of their buffers.
 120          */
 121         addq %rdx, %rsi
 122         addq %rdx, %rdi
 123         subq $0x20, %rdx                /* bias count by -32 for the loop test below */
 124         /*
 125          * Copy 32 bytes backward in each loop iteration.
 126          */
 127 8:
 128         subq $0x20, %rdx                /* borrows (CF=1) once fewer than 32 bytes precede this chunk */
 129         movq -1*8(%rsi), %r11
 130         movq -2*8(%rsi), %r10
 131         movq -3*8(%rsi), %r9
 132         movq -4*8(%rsi), %r8
 133         leaq -4*8(%rsi), %rsi           /* step src back; lea and mov leave the flags alone */
 134
 135         movq %r11, -1*8(%rdi)
 136         movq %r10, -2*8(%rdi)
 137         movq %r9, -3*8(%rdi)
 138         movq %r8, -4*8(%rdi)
 139         leaq -4*8(%rdi), %rdi           /* step dest back */
 140         jae 8b                          /* loop while the subq at 8: did not borrow */
 141         /*
 142          * Move both pointers back to the start of the uncopied head.
 143          */
 144         addq $0x20, %rdx                /* remove the bias: rdx = bytes left (0..31) */
 145         subq %rdx, %rsi
 146         subq %rdx, %rdi
 147 1:
 148         cmpq $16, %rdx                  /* tail code: rdx < 32 here, rsi/rdi point at the remaining bytes */
 149         jb 9f
 150         /*
 151          * Move 16 to 31 bytes: load the first and last 16 bytes, then store.
 152          */
 153         movq 0*8(%rsi), %r11
 154         movq 1*8(%rsi), %r10
 155         movq -2*8(%rsi, %rdx), %r9
 156         movq -1*8(%rsi, %rdx), %r8
 157         movq %r11, 0*8(%rdi)            /* all loads are done before the first store */
 158         movq %r10, 1*8(%rdi)
 159         movq %r9, -2*8(%rdi, %rdx)
 160         movq %r8, -1*8(%rdi, %rdx)
 161         jmp 13f
 162         .p2align 4
 163 9:
 164         cmpq $8, %rdx
 165         jb 10f
 166         /*
 167          * Move 8 to 15 bytes: load the first and last 8 bytes, then store.
 168          */
 169         movq 0*8(%rsi), %r11
 170         movq -1*8(%rsi, %rdx), %r10
 171         movq %r11, 0*8(%rdi)
 172         movq %r10, -1*8(%rdi, %rdx)
 173         jmp 13f
 174 10:
 175         cmpq $4, %rdx
 176         jb 11f
 177         /*
 178          * Move 4 to 7 bytes: load the first and last 4 bytes, then store.
 179          */
 180         movl (%rsi), %r11d
 181         movl -4(%rsi, %rdx), %r10d
 182         movl %r11d, (%rdi)
 183         movl %r10d, -4(%rdi, %rdx)
 184         jmp 13f
 185 11:
 186         cmp $2, %rdx
 187         jb 12f
 188         /*
 189          * Move 2 to 3 bytes: load the first and last 2 bytes, then store.
 190          */
 191         movw (%rsi), %r11w
 192         movw -2(%rsi, %rdx), %r10w
 193         movw %r11w, (%rdi)
 194         movw %r10w, -2(%rdi, %rdx)
 195         jmp 13f
 196 12:
 197         cmp $1, %rdx
 198         jb 13f                          /* count is 0: nothing to copy */
 199         /*
 200          * Move the single remaining byte.
 201          */
 202         movb (%rsi), %r11b
 203         movb %r11b, (%rdi)
 204 13:
 205         retq                            /* rax still holds the dest value saved on entry */
 206         CFI_ENDPROC
 207
 208         .section .altinstr_replacement,"ax"
 209 .Lmemmove_begin_forward_efs:
 210         /* Forward copy of rdx bytes with rep movsb; named by the X86_FEATURE_ERMS entry below. */
 211         movq %rdx, %rcx
 212         rep movsb
 213         retq
 214 .Lmemmove_end_forward_efs:
 215         .previous
 216
 217         .section .altinstructions,"a"
 218         altinstruction_entry .Lmemmove_begin_forward,           \
 219                 .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS,   \
 220                 .Lmemmove_end_forward-.Lmemmove_begin_forward,  \
 221                 .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
 222         .previous
 223 ENDPROC(memmove)