arch/x86/lib/memcpy_64.S

   1 /* Copyright 2002 Andi Kleen */
   2
   3 #include <linux/linkage.h>
   4
   5 #include <asm/cpufeature.h>
   6 #include <asm/dwarf2.h>
   7
   8 /*
   9  * memcpy - Copy a memory block.
  10  *
  11  * Input:
  12  *  rdi destination
  13  *  rsi source
  14  *  rdx count
  15  *
  16  * Output:
  17  * rax original destination
  18  */
  19
  20 /*
  21  * memcpy_c() - fast string ops (REP MOVSQ) based variant.
  22  * Copies count / 8 qwords with REP MOVSQ, then count % 8 bytes with REP MOVSB.
  23  * This gets patched over the unrolled variant (below) via the
  24  * alternative instructions framework:
  25  */
  26         .section .altinstr_replacement, "ax", @progbits
  27 .Lmemcpy_c:
  28         movq %rdi, %rax         /* rax = original destination (return value) */
  29         /* Use the full 64-bit count so that sizes of 4GB and above are not truncated. */
  30         movq %rdx, %rcx
  31         shrq $3, %rcx           /* rcx = count / 8: qwords for REP MOVSQ */
  32         andl $7, %edx           /* edx = count % 8: leftover bytes (0..7) */
  33         rep movsq
  34         movl %edx, %ecx         /* 32-bit write zero-extends: rcx = leftover bytes */
  35         rep movsb
  36         ret
  37 .Lmemcpy_e:
  38         .previous
  39
  40 ENTRY(__memcpy)
  41 ENTRY(memcpy)
  42         CFI_STARTPROC
  43         movq %rdi, %rax                 /* return value: original destination */
  44         /*
  45          * Compare the full 64-bit count: a 32-bit compare would mishandle
  46          * counts of 4GB and above.  Fewer than 32 bytes skip the block loops.
  47          */
  48         cmpq $0x20, %rdx
  49         jb .Lhandle_tail
  50
  51         /*
  52          * We check whether memory false dependence could occur,
  53          * then jump to corresponding copy mode.
  54          */
  55         cmp  %dil, %sil                 /* signed compare of the low address bytes only */
  56         jl .Lcopy_backward
  57         subq $0x20, %rdx                /* pre-bias the count; keeps all 64 bits */
  58 .Lcopy_forward_loop:
  59         subq $0x20,     %rdx            /* CF set once < 32 bytes remain after this block */
  60
  61         /*
  62          * Move in blocks of 4x8 bytes:
  63          */
  64         movq 0*8(%rsi), %r8
  65         movq 1*8(%rsi), %r9
  66         movq 2*8(%rsi), %r10
  67         movq 3*8(%rsi), %r11
  68         leaq 4*8(%rsi), %rsi            /* leaq advances the pointer without touching flags */
  69
  70         movq %r8,       0*8(%rdi)
  71         movq %r9,       1*8(%rdi)
  72         movq %r10,      2*8(%rdi)
  73         movq %r11,      3*8(%rdi)
  74         leaq 4*8(%rdi), %rdi
  75         jae  .Lcopy_forward_loop        /* flags still come from the subq at the loop top */
  76         addq $0x20,     %rdx            /* undo the bias: rdx = 0..31 bytes left */
  77         jmp  .Lhandle_tail
  78
  79 .Lcopy_backward:
  80         /*
  81          * Calculate copy position to tail: point rsi/rdi one past the end.
  82          */
  83         addq %rdx,      %rsi
  84         addq %rdx,      %rdi
  85         subq $0x20,     %rdx            /* pre-bias the count, as in the forward path */
  86         /*
  87          * At most 3 ALU operations in one cycle,
  88          * so align the loop to a 16-byte boundary (pads with NOPs).
  89          */
  90         .p2align 4
  91 .Lcopy_backward_loop:
  92         subq $0x20,     %rdx            /* CF set once < 32 bytes remain after this block */
  93         movq -1*8(%rsi),        %r8
  94         movq -2*8(%rsi),        %r9
  95         movq -3*8(%rsi),        %r10
  96         movq -4*8(%rsi),        %r11
  97         leaq -4*8(%rsi),        %rsi
  98         movq %r8,               -1*8(%rdi)
  99         movq %r9,               -2*8(%rdi)
 100         movq %r10,              -3*8(%rdi)
 101         movq %r11,              -4*8(%rdi)
 102         leaq -4*8(%rdi),        %rdi
 103         jae  .Lcopy_backward_loop       /* flags still come from the subq at the loop top */
 104
 105         /*
 106          * Calculate copy position to head: rdx = 0..31 bytes not yet copied.
 107          */
 108         addq $0x20,     %rdx
 109         subq %rdx,      %rsi
 110         subq %rdx,      %rdi
 111 .Lhandle_tail:
 112         cmpq $16,       %rdx            /* here rdx < 32 and rsi/rdi point at the bytes left */
 113         jb   .Lless_16bytes
 114
 115         /*
 116          * Move 16..31 bytes: first and last 16 bytes (may overlap), loads before stores.
 117          */
 118         movq 0*8(%rsi), %r8
 119         movq 1*8(%rsi), %r9
 120         movq -2*8(%rsi, %rdx),  %r10
 121         movq -1*8(%rsi, %rdx),  %r11
 122         movq %r8,       0*8(%rdi)
 123         movq %r9,       1*8(%rdi)
 124         movq %r10,      -2*8(%rdi, %rdx)
 125         movq %r11,      -1*8(%rdi, %rdx)
 126         retq
 127         .p2align 4
 128 .Lless_16bytes:
 129         cmpq $8,        %rdx
 130         jb   .Lless_8bytes
 131         /*
 132          * Move 8..15 bytes: first and last 8 bytes (may overlap), loads before stores.
 133          */
 134         movq 0*8(%rsi), %r8
 135         movq -1*8(%rsi, %rdx),  %r9
 136         movq %r8,       0*8(%rdi)
 137         movq %r9,       -1*8(%rdi, %rdx)
 138         retq
 139         .p2align 4
 140 .Lless_8bytes:
 141         cmpq $4,        %rdx
 142         jb   .Lless_3bytes
 143
 144         /*
 145          * Move 4..7 bytes: first and last 4 bytes (may overlap), loads before stores.
 146          */
 147         movl (%rsi), %ecx
 148         movl -4(%rsi, %rdx), %r8d
 149         movl %ecx, (%rdi)
 150         movl %r8d, -4(%rdi, %rdx)
 151         retq
 152         .p2align 4
 153 .Lless_3bytes:
 154         testl %edx, %edx                /* nothing left to copy? */
 155         jz .Lend
 156         /*
 157          * Move 1..3 bytes, one byte per iteration.
 158          */
 159 .Lloop_1:
 160         movb (%rsi), %r8b
 161         movb %r8b, (%rdi)
 162         incq %rdi
 163         incq %rsi
 164         decl %edx
 165         jnz .Lloop_1
 166
 167 .Lend:
 168         retq
 169         CFI_ENDPROC
 170 ENDPROC(memcpy)
 171 ENDPROC(__memcpy)
 172
 173         /*
 174          * Some CPUs run faster using the string copy instructions.
 175          * It is also a lot simpler. Use this when possible:
 176          */
 177
 178         .section .altinstructions, "a"
 179         .align 8
 180         .quad memcpy                    /* address of the code to be patched */
 181         .quad .Lmemcpy_c                /* address of the replacement code */
 182         .word X86_FEATURE_REP_GOOD      /* CPU feature that selects the replacement */
 183
 184         /*
 185          * Replace only beginning, memcpy is used to apply alternatives,
 186          * so it is silly to overwrite itself with nops - reboot is the
 187          * only outcome...
 188          * Both length bytes below are therefore the size of the replacement.
 189          * NOTE(review): presumably original length, then replacement length
 190          * - confirm against the record layout expected by the patching code.
 191          */
 189         .byte .Lmemcpy_e - .Lmemcpy_c
 190         .byte .Lmemcpy_e - .Lmemcpy_c
 191         .previous