/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs (those which set REP_GOOD). In addition, on
 * CPUs which have the enhanced REP MOVSB/STOSB feature (ERMS), those
 * NOPs are changed to a jmp to memcpy_erms, which does the copy with a
 * single REP MOVSB.
 */
/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS
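	/*
	 * REP_GOOD fallthrough body: copy rdx/8 words with REP MOVSQ,
	 * then the remaining 0..7 bytes with REP MOVSB.
	 */
	movq %rdi, %rax		/* return the original destination */
	movq %rdx, %rcx
	shrq $3, %rcx		/* word count */
	andl $7, %edx		/* byte remainder */
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)
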
/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
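ENTRY(memcpy_erms)
	movq %rdi, %rax		/* return the original destination */
	movq %rdx, %rcx
	rep movsb		/* with ERMS this is fast for any length */
	ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
	movq %rdi, %rax		/* return the original destination */

	cmpq $0x20, %rdx	/* copies under 0x20 bytes go to the tail code */
	jb .Lhandle_tail
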
	/*
	 * We check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
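	cmp  %dil, %sil
	jl   .Lcopy_backward	/* src low byte below dst: copy backward */
	subq $0x20, %rdx	/* bias the count for the loop's jae below */
.Lcopy_forward_loop:
	subq $0x20, %rdx
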
	/*
	 * Move in blocks of 4x8 bytes:
	 */
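	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi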
	jae  .Lcopy_forward_loop
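	addl $0x20, %edx	/* undo the bias: 0..0x1f bytes remain */
	jmp  .Lhandle_tail

.Lcopy_backward: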
	/*
	 * Calculate copy position to tail.
	 */
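	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx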
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
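	.p2align 4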
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae  .Lcopy_backward_loop
	/*
	 * Calculate copy position to head.
	 */
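	addl $0x20, %edx	/* undo the bias: 0..0x1f bytes remain */
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb   .Lless_16bytes
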
	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8	/* head and tail loads may overlap */
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
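	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb   .Lless_8bytes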
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
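	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb   .Lless_3bytes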
	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
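	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb   .Ldone		/* count was zero: nothing to copy */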
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
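	movzbl (%rsi), %ecx
	jz .Lstore_1byte	/* ZF still set from "subl $1, %edx": count was 1 */
	movzbq 1(%rsi), %r8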
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
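.Lstore_1byte:
	movb %cl, (%rdi)

.Ldone:
	retq
ENDPROC(memcpy_orig)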