2 * Copyright (C) 2012 Intel Corporation
3 * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
13 static const uint8x16_t x0f
= {
14 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
15 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
/*
 * AArch32 does not provide this intrinsic natively because it does not
 * implement the underlying instruction. AArch32 only provides a 64-bit
 * wide vtbl.8 instruction, so use that instead.
 */
24 static uint8x16_t
vqtbl1q_u8(uint8x16_t a
, uint8x16_t b
)
31 return vcombine_u8(vtbl2_u8(__a
.pair
, vget_low_u8(b
)),
32 vtbl2_u8(__a
.pair
, vget_high_u8(b
)));
36 void __raid6_2data_recov_neon(int bytes
, uint8_t *p
, uint8_t *q
, uint8_t *dp
,
37 uint8_t *dq
, const uint8_t *pbmul
,
40 uint8x16_t pm0
= vld1q_u8(pbmul
);
41 uint8x16_t pm1
= vld1q_u8(pbmul
+ 16);
42 uint8x16_t qm0
= vld1q_u8(qmul
);
43 uint8x16_t qm1
= vld1q_u8(qmul
+ 16);
50 * qx = qmul[*q ^ *dq];
51 * *dq++ = db = pbmul[px] ^ qx;
58 uint8x16_t vx
, vy
, px
, qx
, db
;
60 px
= veorq_u8(vld1q_u8(p
), vld1q_u8(dp
));
61 vx
= veorq_u8(vld1q_u8(q
), vld1q_u8(dq
));
63 vy
= (uint8x16_t
)vshrq_n_s16((int16x8_t
)vx
, 4);
64 vx
= vqtbl1q_u8(qm0
, vandq_u8(vx
, x0f
));
65 vy
= vqtbl1q_u8(qm1
, vandq_u8(vy
, x0f
));
66 qx
= veorq_u8(vx
, vy
);
68 vy
= (uint8x16_t
)vshrq_n_s16((int16x8_t
)px
, 4);
69 vx
= vqtbl1q_u8(pm0
, vandq_u8(px
, x0f
));
70 vy
= vqtbl1q_u8(pm1
, vandq_u8(vy
, x0f
));
71 vx
= veorq_u8(vx
, vy
);
72 db
= veorq_u8(vx
, qx
);
75 vst1q_u8(dp
, veorq_u8(db
, px
));
85 void __raid6_datap_recov_neon(int bytes
, uint8_t *p
, uint8_t *q
, uint8_t *dq
,
88 uint8x16_t qm0
= vld1q_u8(qmul
);
89 uint8x16_t qm1
= vld1q_u8(qmul
+ 16);
93 * *p++ ^= *dq = qmul[*q ^ *dq];
101 vx
= veorq_u8(vld1q_u8(q
), vld1q_u8(dq
));
103 vy
= (uint8x16_t
)vshrq_n_s16((int16x8_t
)vx
, 4);
104 vx
= vqtbl1q_u8(qm0
, vandq_u8(vx
, x0f
));
105 vy
= vqtbl1q_u8(qm1
, vandq_u8(vy
, x0f
));
106 vx
= veorq_u8(vx
, vy
);
107 vy
= veorq_u8(vx
, vld1q_u8(p
));