arch/x86/include/asm/xor_avx.h

   1 #ifndef _ASM_X86_XOR_AVX_H
   2 #define _ASM_X86_XOR_AVX_H
   3
   4 /*
   5  * Optimized RAID-5 checksumming functions for AVX
   6  *
   7  * Copyright (C) 2012 Intel Corporation
   8  * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
   9  *
  10  * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
  11  *
  12  * This program is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU General Public License
  14  * as published by the Free Software Foundation; version 2
  15  * of the License.
  16  */
  17
  18 #ifdef CONFIG_AS_AVX
  19
  20 #include <linux/compiler.h>
  21 #include <asm/fpu/api.h>
  22
  /*
   * Unrolling helpers shared by the xor_avx_N() routines.
   *
   * BLOCK4(i) expands to four consecutive BLOCK(offset, reg) invocations with
   * offsets 32 * i, 32 * (i + 1), 32 * (i + 2) and 32 * (i + 3), paired with
   * register indices 0..3.  BLOCK16() chains BLOCK4() for i = 0, 4, 8 and 12,
   * i.e. sixteen BLOCK() invocations at offsets 0, 32, ..., 480.
   *
   * BLOCK itself is not defined here; each user defines it right before
   * expanding BLOCK16().  No separator is emitted between the invocations, so
   * BLOCK has to expand to a complete statement.
   *
   * NOTE(review): "i" is substituted without parentheses, so only simple
   * constants should be passed to BLOCK4().
   */
  23 #define BLOCK4(i) \
  24                 BLOCK(32 * i, 0) \
  25                 BLOCK(32 * (i + 1), 1) \
  26                 BLOCK(32 * (i + 2), 2) \
  27                 BLOCK(32 * (i + 3), 3)
  28
  29 #define BLOCK16() \
  30                 BLOCK4(0) \
  31                 BLOCK4(4) \
  32                 BLOCK4(8) \
  33                 BLOCK4(12)
  34
  /*
   * xor_avx_2 - XOR one source buffer into a destination buffer using AVX.
   * @bytes: length in bytes; only the bytes >> 9 complete 512-byte chunks are
   *         processed, a remainder (bytes % 512) is left untouched
   * @p0:    destination buffer, also the first XOR operand
   * @p1:    source buffer
   *
   * Effect per processed chunk: p0 ^= p1.
   *
   * The ymm register use is bracketed by kernel_fpu_begin()/kernel_fpu_end().
   * NOTE(review): vmovdqa is the aligned move form, so both buffers are
   * presumably required to be 32-byte aligned; nothing here checks that.
   */
  35 static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
  36 {
             /* Number of complete 512-byte chunks. */
  37         unsigned long lines = bytes >> 9;
  38
  39         kernel_fpu_begin();
  40
  41         while (lines--) {
                     /*
                      * BLOCK(i, reg): load the data at byte offset i of p1 into
                      * ymm<reg>, XOR it with the same offset of p0 and store
                      * the result back to p0.  i is a byte offset, hence the
                      * division by sizeof(*pN) to index unsigned long elements.
                      * The trailing ';' is needed because BLOCK4()/BLOCK16()
                      * put no separator between invocations.
                      */
  42 #undef BLOCK
  43 #define BLOCK(i, reg) \
  44 do { \
  45         asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
  46         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm"  #reg : : \
  47                 "m" (p0[i / sizeof(*p0)])); \
  48         asm volatile("vmovdqa %%ymm" #reg ", %0" : \
  49                 "=m" (p0[i / sizeof(*p0)])); \
  50 } while (0);
  51
                     /* Sixteen BLOCKs at offsets 0..480 cover one 512-byte chunk. */
  52                 BLOCK16()
  53
                     /* Step both buffers forward by 512 bytes (byte arithmetic). */
  54                 p0 = (unsigned long *)((uintptr_t)p0 + 512);
  55                 p1 = (unsigned long *)((uintptr_t)p1 + 512);
  56         }
  57
  58         kernel_fpu_end();
  59 }
  60
  /*
   * xor_avx_3 - XOR two source buffers into a destination buffer using AVX.
   * @bytes: length in bytes; only the bytes >> 9 complete 512-byte chunks are
   *         processed, a remainder (bytes % 512) is left untouched
   * @p0:    destination buffer, also one of the XOR operands
   * @p1:    first source buffer
   * @p2:    second source buffer
   *
   * Effect per processed chunk: p0 ^= p1 ^ p2.
   *
   * The ymm register use is bracketed by kernel_fpu_begin()/kernel_fpu_end().
   * NOTE(review): vmovdqa is the aligned move form, so p0 and p2 are
   * presumably required to be 32-byte aligned; nothing here checks that.
   */
  61 static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
  62         unsigned long *p2)
  63 {
             /* Number of complete 512-byte chunks. */
  64         unsigned long lines = bytes >> 9;
  65
  66         kernel_fpu_begin();
  67
  68         while (lines--) {
                     /*
                      * BLOCK(i, reg): load the data at byte offset i of p2 into
                      * ymm<reg>, XOR in the same offset of p1 and then of p0,
                      * and store the result back to p0.  i is a byte offset,
                      * hence the division by sizeof(*pN).
                      */
  69 #undef BLOCK
  70 #define BLOCK(i, reg) \
  71 do { \
  72         asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
  73         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
  74                 "m" (p1[i / sizeof(*p1)])); \
  75         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
  76                 "m" (p0[i / sizeof(*p0)])); \
  77         asm volatile("vmovdqa %%ymm" #reg ", %0" : \
  78                 "=m" (p0[i / sizeof(*p0)])); \
  79 } while (0);
  80
                     /* Sixteen BLOCKs at offsets 0..480 cover one 512-byte chunk. */
  81                 BLOCK16()
  82
                     /* Step all buffers forward by 512 bytes (byte arithmetic). */
  83                 p0 = (unsigned long *)((uintptr_t)p0 + 512);
  84                 p1 = (unsigned long *)((uintptr_t)p1 + 512);
  85                 p2 = (unsigned long *)((uintptr_t)p2 + 512);
  86         }
  87
  88         kernel_fpu_end();
  89 }
  90
  /*
   * xor_avx_4 - XOR three source buffers into a destination buffer using AVX.
   * @bytes: length in bytes; only the bytes >> 9 complete 512-byte chunks are
   *         processed, a remainder (bytes % 512) is left untouched
   * @p0:    destination buffer, also one of the XOR operands
   * @p1:    first source buffer
   * @p2:    second source buffer
   * @p3:    third source buffer
   *
   * Effect per processed chunk: p0 ^= p1 ^ p2 ^ p3.
   *
   * The ymm register use is bracketed by kernel_fpu_begin()/kernel_fpu_end().
   * NOTE(review): vmovdqa is the aligned move form, so p0 and p3 are
   * presumably required to be 32-byte aligned; nothing here checks that.
   */
  91 static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
  92         unsigned long *p2, unsigned long *p3)
  93 {
             /* Number of complete 512-byte chunks. */
  94         unsigned long lines = bytes >> 9;
  95
  96         kernel_fpu_begin();
  97
  98         while (lines--) {
                     /*
                      * BLOCK(i, reg): load the data at byte offset i of p3 into
                      * ymm<reg>, XOR in the same offset of p2, p1 and p0 in
                      * turn, and store the result back to p0.  i is a byte
                      * offset, hence the division by sizeof(*pN).
                      */
  99 #undef BLOCK
 100 #define BLOCK(i, reg) \
 101 do { \
 102         asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
 103         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 104                 "m" (p2[i / sizeof(*p2)])); \
 105         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 106                 "m" (p1[i / sizeof(*p1)])); \
 107         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 108                 "m" (p0[i / sizeof(*p0)])); \
 109         asm volatile("vmovdqa %%ymm" #reg ", %0" : \
 110                 "=m" (p0[i / sizeof(*p0)])); \
 111 } while (0);
 112
                     /*
                      * Sixteen BLOCKs at offsets 0..480 cover one 512-byte chunk.
                      * Each BLOCK expansion already ends in ';', so no extra
                      * terminator is written here.
                      */
 113                 BLOCK16()
 114
                     /* Step all buffers forward by 512 bytes (byte arithmetic). */
 115                 p0 = (unsigned long *)((uintptr_t)p0 + 512);
 116                 p1 = (unsigned long *)((uintptr_t)p1 + 512);
 117                 p2 = (unsigned long *)((uintptr_t)p2 + 512);
 118                 p3 = (unsigned long *)((uintptr_t)p3 + 512);
 119         }
 120
 121         kernel_fpu_end();
 122 }
 123
 /*
  * xor_avx_5 - XOR four source buffers into a destination buffer using AVX.
  * @bytes: length in bytes; only the bytes >> 9 complete 512-byte chunks are
  *         processed, a remainder (bytes % 512) is left untouched
  * @p0:    destination buffer, also one of the XOR operands
  * @p1:    first source buffer
  * @p2:    second source buffer
  * @p3:    third source buffer
  * @p4:    fourth source buffer
  *
  * Effect per processed chunk: p0 ^= p1 ^ p2 ^ p3 ^ p4.
  *
  * The ymm register use is bracketed by kernel_fpu_begin()/kernel_fpu_end().
  * NOTE(review): vmovdqa is the aligned move form, so p0 and p4 are
  * presumably required to be 32-byte aligned; nothing here checks that.
  */
 124 static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
 125         unsigned long *p2, unsigned long *p3, unsigned long *p4)
 126 {
             /* Number of complete 512-byte chunks. */
 127         unsigned long lines = bytes >> 9;
 128
 129         kernel_fpu_begin();
 130
 131         while (lines--) {
                     /*
                      * BLOCK(i, reg): load the data at byte offset i of p4 into
                      * ymm<reg>, XOR in the same offset of p3, p2, p1 and p0 in
                      * turn, and store the result back to p0.  i is a byte
                      * offset, hence the division by sizeof(*pN).
                      */
 132 #undef BLOCK
 133 #define BLOCK(i, reg) \
 134 do { \
 135         asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
 136         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 137                 "m" (p3[i / sizeof(*p3)])); \
 138         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 139                 "m" (p2[i / sizeof(*p2)])); \
 140         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 141                 "m" (p1[i / sizeof(*p1)])); \
 142         asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
 143                 "m" (p0[i / sizeof(*p0)])); \
 144         asm volatile("vmovdqa %%ymm" #reg ", %0" : \
 145                 "=m" (p0[i / sizeof(*p0)])); \
 146 } while (0);
 147
                     /* Sixteen BLOCKs at offsets 0..480 cover one 512-byte chunk. */
 148                 BLOCK16()
 149
                     /* Step all buffers forward by 512 bytes (byte arithmetic). */
 150                 p0 = (unsigned long *)((uintptr_t)p0 + 512);
 151                 p1 = (unsigned long *)((uintptr_t)p1 + 512);
 152                 p2 = (unsigned long *)((uintptr_t)p2 + 512);
 153                 p3 = (unsigned long *)((uintptr_t)p3 + 512);
 154                 p4 = (unsigned long *)((uintptr_t)p4 + 512);
 155         }
 156
 157         kernel_fpu_end();
 158 }
 159
 /*
  * Template describing the AVX implementation: its name ("avx") and the
  * handlers for 2, 3, 4 and 5 buffers.  struct xor_block_template is declared
  * outside this header.  This object is what AVX_XOR_SPEED and AVX_SELECT
  * refer to.
  */
 160 static struct xor_block_template xor_block_avx = {
 161         .name = "avx",
 162         .do_2 = xor_avx_2,
 163         .do_3 = xor_avx_3,
 164         .do_4 = xor_avx_4,
 165         .do_5 = xor_avx_5,
 166 };
 167
 /*
  * AVX_XOR_SPEED - statement macro: when both cpu_has_avx and cpu_has_osxsave
  * are true, pass the AVX template to xor_speed() (defined elsewhere;
  * presumably it benchmarks the template - confirm against its definition).
  * Wrapped in do { } while (0) so that "AVX_XOR_SPEED;" is a single statement.
  */
 168 #define AVX_XOR_SPEED \
 169 do { \
 170         if (cpu_has_avx && cpu_has_osxsave) \
 171                 xor_speed(&xor_block_avx); \
 172 } while (0)
 173
 /*
  * AVX_SELECT(FASTEST) - expression macro: evaluates to &xor_block_avx when
  * both cpu_has_avx and cpu_has_osxsave are true, otherwise to FASTEST.
  */
 174 #define AVX_SELECT(FASTEST) \
 175         (cpu_has_avx && cpu_has_osxsave ? &xor_block_avx : FASTEST)
 176
 177 #else
 178
 /*
  * Fallbacks used when CONFIG_AS_AVX is not defined: AVX_XOR_SPEED does
  * nothing and AVX_SELECT() simply yields its argument.
  *
  * NOTE(review): this AVX_XOR_SPEED expands to an empty compound statement
  * rather than the do { } while (0) form of the CONFIG_AS_AVX variant, so
  * "AVX_XOR_SPEED;" becomes "{};" and would not parse as the body of an
  * unbraced if followed by else.
  */
 179 #define AVX_XOR_SPEED {}
 180
 181 #define AVX_SELECT(FASTEST) (FASTEST)
 182
 183 #endif
 184 #endif