lib/raid6/recov_neon_inner.c

   1 /*
   2  * Copyright (C) 2012 Intel Corporation
   3  * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License
   7  * as published by the Free Software Foundation; version 2
   8  * of the License.
   9  */
  10
  11 #include <arm_neon.h>
  12
  13 #ifdef CONFIG_ARM
  14 /*
  15  * AArch32 does not provide this intrinsic natively because it does not
  16  * implement the underlying instruction. AArch32 only provides a 64-bit
  17  * wide vtbl.8 instruction, so use that instead.
  18  */
  19 static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
  20 {
  21         union {
  22                 uint8x16_t      val;
  23                 uint8x8x2_t     pair;
  24         } __a = { a };
  25
  26         return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
  27                            vtbl2_u8(__a.pair, vget_high_u8(b)));
  28 }
  29 #endif
  30
  31 void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
  32                               uint8_t *dq, const uint8_t *pbmul,
  33                               const uint8_t *qmul)
  34 {
  35         uint8x16_t pm0 = vld1q_u8(pbmul);
  36         uint8x16_t pm1 = vld1q_u8(pbmul + 16);
  37         uint8x16_t qm0 = vld1q_u8(qmul);
  38         uint8x16_t qm1 = vld1q_u8(qmul + 16);
  39         uint8x16_t x0f = vdupq_n_u8(0x0f);
  40
  41         /*
  42          * while ( bytes-- ) {
  43          *      uint8_t px, qx, db;
  44          *
  45          *      px    = *p ^ *dp;
  46          *      qx    = qmul[*q ^ *dq];
  47          *      *dq++ = db = pbmul[px] ^ qx;
  48          *      *dp++ = db ^ px;
  49          *      p++; q++;
  50          * }
  51          */
  52
  53         while (bytes) {
  54                 uint8x16_t vx, vy, px, qx, db;
  55
  56                 px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
  57                 vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
  58
  59                 vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
  60                 vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
  61                 vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
  62                 qx = veorq_u8(vx, vy);
  63
  64                 vy = (uint8x16_t)vshrq_n_s16((int16x8_t)px, 4);
  65                 vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f));
  66                 vy = vqtbl1q_u8(pm1, vandq_u8(vy, x0f));
  67                 vx = veorq_u8(vx, vy);
  68                 db = veorq_u8(vx, qx);
  69
  70                 vst1q_u8(dq, db);
  71                 vst1q_u8(dp, veorq_u8(db, px));
  72
  73                 bytes -= 16;
  74                 p += 16;
  75                 q += 16;
  76                 dp += 16;
  77                 dq += 16;
  78         }
  79 }
  80
  81 void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
  82                               const uint8_t *qmul)
  83 {
  84         uint8x16_t qm0 = vld1q_u8(qmul);
  85         uint8x16_t qm1 = vld1q_u8(qmul + 16);
  86         uint8x16_t x0f = vdupq_n_u8(0x0f);
  87
  88         /*
  89          * while (bytes--) {
  90          *      *p++ ^= *dq = qmul[*q ^ *dq];
  91          *      q++; dq++;
  92          * }
  93          */
  94
  95         while (bytes) {
  96                 uint8x16_t vx, vy;
  97
  98                 vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
  99
 100                 vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
 101                 vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
 102                 vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
 103                 vx = veorq_u8(vx, vy);
 104                 vy = veorq_u8(vx, vld1q_u8(p));
 105
 106                 vst1q_u8(dq, vx);
 107                 vst1q_u8(p, vy);
 108
 109                 bytes -= 16;
 110                 p += 16;
 111                 q += 16;
 112                 dq += 16;
 113         }
 114 }