arch/x86/lib/memset_64.S

   1 /* Copyright 2002 Andi Kleen, SuSE Labs */
   2
   3 #include <linux/linkage.h>
   4 #include <asm/dwarf2.h>
   5
   6 /*
   7  * ISO C memset - set a memory block to a byte value.
   8  *
   9  * rdi   destination
  10  * rsi   value (char)
  11  * rdx   count (bytes)
  12  *
  13  * rax   original destination
  14  */
  /*
   * memset_c - memset variant built on the REP STOS string instructions.
   *
   * In:    rdi = destination, rsi = fill value (low byte used),
   *        rdx = count (bytes)
   * Out:   rax = original destination
   * Clobb: rcx, rdx, rsi, rdi, r8, r9, flags
   *
   * The count is split into whole quadwords (rep stosq) and a 0..7 byte
   * remainder (rep stosb).  The quadword count is derived with 64-bit
   * operations: rep stosq counts in the full %rcx, so computing it with
   * 32-bit moves/shifts would silently truncate lengths of 4GiB and above.
   *
   * No cld is issued here; the code assumes the direction flag is already
   * clear on entry so that STOS advances %rdi upwards.
   */
        ALIGN
memset_c:
        CFI_STARTPROC
        movq %rdi,%r9           /* keep dst for the return value */
        movl %edx,%r8d
        andl $7,%r8d            /* r8d = count % 8, bytes left for stosb */
        movq %rdx,%rcx
        shrq $3,%rcx            /* rcx = count / 8, full 64-bit quantity */
        /* expand byte value  */
        movzbl %sil,%esi
        movabs $0x0101010101010101,%rax
        mulq %rsi               /* with rax, clobbers rdx */
        rep stosq               /* store rcx quadwords of the pattern */
        movl %r8d,%ecx
        rep stosb               /* store the 0..7 trailing bytes */
        movq %r9,%rax           /* return original destination */
        ret
        CFI_ENDPROC
ENDPROC(memset_c)
  34
  /*
   * memset / __memset - fill a memory block using open-coded store loops.
   *
   * In:    rdi = destination, rsi = fill value (low byte used),
   *        rdx = count (bytes)
   * Out:   rax = original destination
   * Clobb: rcx, rdx, rdi, r8, r9, r10, r11, flags
   *
   * Register roles inside the function:
   *   rax  fill byte replicated into all eight byte lanes
   *   r10  saved destination (becomes the return value)
   *   r11  remaining byte count (rdx itself is clobbered by mul)
   *   r9   dst & 7, the misalignment of the destination
   *   rcx  loop counter
   *
   * Flow: an unaligned destination is fixed up with one unaligned 8-byte
   * store, then the bulk is written in 64-byte blocks, then 8-byte words,
   * then single bytes.  The 64-byte block count is kept in the full 64-bit
   * %rcx so that lengths of 4GiB and above are not truncated; the tail
   * counts only depend on the low six bits of the length, so 32-bit
   * operations are sufficient there.
   */
ENTRY(memset)
ENTRY(__memset)
        CFI_STARTPROC
        movq %rdi,%r10          /* r10 = dst, returned in rax at .Lende */
        movq %rdx,%r11          /* r11 = count, survives the mul below */

        /* expand byte value  */
        movzbl %sil,%ecx
        movabs $0x0101010101010101,%rax
        mul    %rcx             /* with rax, clobbers rdx */

        /* align dst */
        movl  %edi,%r9d
        andl  $7,%r9d           /* r9d = dst & 7 */
        jnz  .Lbad_alignment
        CFI_REMEMBER_STATE
.Lafter_bad_alignment:

        /* number of 64-byte blocks, computed on the full 64-bit count */
        movq  %r11,%rcx
        shrq  $6,%rcx
        jz       .Lhandle_tail

        .p2align 4
.Lloop_64:
        decq   %rcx             /* sets ZF for the jnz; mov/lea keep flags */
        movq  %rax,(%rdi)
        movq  %rax,8(%rdi)
        movq  %rax,16(%rdi)
        movq  %rax,24(%rdi)
        movq  %rax,32(%rdi)
        movq  %rax,40(%rdi)
        movq  %rax,48(%rdi)
        movq  %rax,56(%rdi)
        leaq  64(%rdi),%rdi
        jnz    .Lloop_64

        /* Handle tail in loops. The loops should be faster than hard
           to predict jump tables. */
        .p2align 4
.Lhandle_tail:
        movl    %r11d,%ecx
        andl    $63&(~7),%ecx   /* bytes left in whole quadwords (0..56) */
        jz              .Lhandle_7
        shrl    $3,%ecx         /* convert to a quadword count */
        .p2align 4
.Lloop_8:
        decl   %ecx
        movq  %rax,(%rdi)
        leaq  8(%rdi),%rdi
        jnz    .Lloop_8

.Lhandle_7:
        movl    %r11d,%ecx
        andl    $7,%ecx         /* 0..7 trailing bytes */
        jz      .Lende
        .p2align 4
.Lloop_1:
        decl    %ecx
        movb    %al,(%rdi)
        leaq    1(%rdi),%rdi
        jnz     .Lloop_1

.Lende:
        movq    %r10,%rax       /* return original destination */
        ret

        CFI_RESTORE_STATE
.Lbad_alignment:
        cmpq $7,%r11            /* too short for an 8-byte store? */
        jbe     .Lhandle_7      /* yes: write it byte by byte */
        movq %rax,(%rdi)        /* unaligned store */
        movq $8,%r8
        subq %r9,%r8            /* r8 = 8 - (dst & 7), bytes up to alignment */
        addq %r8,%rdi           /* dst is now 8-byte aligned */
        subq %r8,%r11           /* those bytes are already written */
        jmp .Lafter_bad_alignment
.Lfinal:
        CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)
 115
 116         /* Some CPUs run faster using the string instructions.
 117            It is also a lot simpler. Use this when possible */
 118
 119 #include <asm/cpufeature.h>
 120
 /*
  * Alternatives record that redirects memset to memset_c.
  *
  * The replacement is a two-byte short jump (opcode 0xeb + disp8).  Its
  * displacement is written as if the jump sat at the start of memset:
  * target minus the address following the jump, i.e.
  * (memset_c - memset) - <length of the jump>.
  *
  * The .altinstructions entry then lists, in order: the address of the
  * original code (memset), the address of the replacement, the CPU feature
  * X86_FEATURE_REP_GOOD, the length of the original code (.Lfinal - memset)
  * and the length of the replacement.
  * NOTE(review): the field order is presumably dictated by the kernel's
  * alternative-instruction record layout, which is declared outside this
  * file -- confirm before reordering anything here.
  */
        .pushsection .altinstr_replacement,"ax"
.Lmemset_c_jmp:
        .byte 0xeb                              /* jmp <disp8> */
        .byte (memset_c - memset) - (.Lmemset_c_jmp_end - .Lmemset_c_jmp)   /* offset */
.Lmemset_c_jmp_end:
        .popsection
        .pushsection .altinstructions,"a"
        .balign 8
        .quad memset                            /* code to be patched */
        .quad .Lmemset_c_jmp                    /* replacement bytes */
        .byte X86_FEATURE_REP_GOOD
        .byte .Lfinal - memset                  /* length of original code */
        .byte .Lmemset_c_jmp_end - .Lmemset_c_jmp   /* length of replacement */
        .popsection