arch/x86/lib/memcpy_64.S

   1 /* Copyright 2002 Andi Kleen */
   2
   3 #include <linux/linkage.h>
   4 #include <asm/cpufeature.h>
   5 #include <asm/dwarf2.h>
   6 #include <asm/alternative-asm.h>
   7
   8 /*
   9  * We build a jump to memcpy_orig by default which gets NOPped out on
  10  * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
  11  * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
  12  * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
  13  */
  14
  15 .weak memcpy
  16
  17 /*
  18  * memcpy - Copy a memory block.
  19  *
  20  * Input:
  21  *  rdi destination
  22  *  rsi source
  23  *  rdx count
  24  *
  25  * Output:
  26  * rax original destination
  27  */
  28 ENTRY(__memcpy)
  29 ENTRY(memcpy)
  30         ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
  31                       "jmp memcpy_erms", X86_FEATURE_ERMS
  32
  33         movq %rdi, %rax
  34         movq %rdx, %rcx
  35         shrq $3, %rcx
  36         andl $7, %edx
  37         rep movsq
  38         movl %edx, %ecx
  39         rep movsb
  40         ret
  41 ENDPROC(memcpy)
  42 ENDPROC(__memcpy)
  43
  44 /*
  45  * memcpy_erms() - enhanced fast string memcpy. This is faster and
  46  * simpler than memcpy. Use memcpy_erms when possible.
  47  */
  48 ENTRY(memcpy_erms)
  49         movq %rdi, %rax
  50         movq %rdx, %rcx
  51         rep movsb
  52         ret
  53 ENDPROC(memcpy_erms)
  54
  55 ENTRY(memcpy_orig)
  56         CFI_STARTPROC
  57         movq %rdi, %rax
  58
  59         cmpq $0x20, %rdx
  60         jb .Lhandle_tail
  61
  62         /*
  63          * We check whether memory false dependence could occur,
  64          * then jump to corresponding copy mode.
  65          */
  66         cmp  %dil, %sil
  67         jl .Lcopy_backward
  68         subq $0x20, %rdx
  69 .Lcopy_forward_loop:
  70         subq $0x20,     %rdx
  71
  72         /*
  73          * Move in blocks of 4x8 bytes:
  74          */
  75         movq 0*8(%rsi), %r8
  76         movq 1*8(%rsi), %r9
  77         movq 2*8(%rsi), %r10
  78         movq 3*8(%rsi), %r11
  79         leaq 4*8(%rsi), %rsi
  80
  81         movq %r8,       0*8(%rdi)
  82         movq %r9,       1*8(%rdi)
  83         movq %r10,      2*8(%rdi)
  84         movq %r11,      3*8(%rdi)
  85         leaq 4*8(%rdi), %rdi
  86         jae  .Lcopy_forward_loop
  87         addl $0x20,     %edx
  88         jmp  .Lhandle_tail
  89
  90 .Lcopy_backward:
  91         /*
  92          * Calculate copy position to tail.
  93          */
  94         addq %rdx,      %rsi
  95         addq %rdx,      %rdi
  96         subq $0x20,     %rdx
  97         /*
  98          * At most 3 ALU operations in one cycle,
  99          * so append NOPS in the same 16 bytes trunk.
 100          */
 101         .p2align 4
 102 .Lcopy_backward_loop:
 103         subq $0x20,     %rdx
 104         movq -1*8(%rsi),        %r8
 105         movq -2*8(%rsi),        %r9
 106         movq -3*8(%rsi),        %r10
 107         movq -4*8(%rsi),        %r11
 108         leaq -4*8(%rsi),        %rsi
 109         movq %r8,               -1*8(%rdi)
 110         movq %r9,               -2*8(%rdi)
 111         movq %r10,              -3*8(%rdi)
 112         movq %r11,              -4*8(%rdi)
 113         leaq -4*8(%rdi),        %rdi
 114         jae  .Lcopy_backward_loop
 115
 116         /*
 117          * Calculate copy position to head.
 118          */
 119         addl $0x20,     %edx
 120         subq %rdx,      %rsi
 121         subq %rdx,      %rdi
 122 .Lhandle_tail:
 123         cmpl $16,       %edx
 124         jb   .Lless_16bytes
 125
 126         /*
 127          * Move data from 16 bytes to 31 bytes.
 128          */
 129         movq 0*8(%rsi), %r8
 130         movq 1*8(%rsi), %r9
 131         movq -2*8(%rsi, %rdx),  %r10
 132         movq -1*8(%rsi, %rdx),  %r11
 133         movq %r8,       0*8(%rdi)
 134         movq %r9,       1*8(%rdi)
 135         movq %r10,      -2*8(%rdi, %rdx)
 136         movq %r11,      -1*8(%rdi, %rdx)
 137         retq
 138         .p2align 4
 139 .Lless_16bytes:
 140         cmpl $8,        %edx
 141         jb   .Lless_8bytes
 142         /*
 143          * Move data from 8 bytes to 15 bytes.
 144          */
 145         movq 0*8(%rsi), %r8
 146         movq -1*8(%rsi, %rdx),  %r9
 147         movq %r8,       0*8(%rdi)
 148         movq %r9,       -1*8(%rdi, %rdx)
 149         retq
 150         .p2align 4
 151 .Lless_8bytes:
 152         cmpl $4,        %edx
 153         jb   .Lless_3bytes
 154
 155         /*
 156          * Move data from 4 bytes to 7 bytes.
 157          */
 158         movl (%rsi), %ecx
 159         movl -4(%rsi, %rdx), %r8d
 160         movl %ecx, (%rdi)
 161         movl %r8d, -4(%rdi, %rdx)
 162         retq
 163         .p2align 4
 164 .Lless_3bytes:
 165         subl $1, %edx
 166         jb .Lend
 167         /*
 168          * Move data from 1 bytes to 3 bytes.
 169          */
 170         movzbl (%rsi), %ecx
 171         jz .Lstore_1byte
 172         movzbq 1(%rsi), %r8
 173         movzbq (%rsi, %rdx), %r9
 174         movb %r8b, 1(%rdi)
 175         movb %r9b, (%rdi, %rdx)
 176 .Lstore_1byte:
 177         movb %cl, (%rdi)
 178
 179 .Lend:
 180         retq
 181         CFI_ENDPROC
 182 ENDPROC(memcpy_orig)