1 /* AesOpt.c -- Intel's AES
2 2017-06-08 : Igor Pavlov : Public domain */
8 #ifdef MY_CPU_X86_OR_AMD64
9 #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729)
16 #include <wmmintrin.h>
/*
  AesCbc_Encode_Intel (__m128i variant)

  Walks numBlocks 128-bit blocks starting at data, one block per iteration,
  and runs each through the AES-NI encrypt-round intrinsics.

    p         - parameter area addressed in 128-bit units:
                  the first UInt32 at (p + 1) is a round-pair count,
                  p[2] is XORed into the state before the rounds,
                  p + 3 is where the round-key pointer w starts.
    data      - blocks to process; advanced by one block per iteration.
    numBlocks - number of blocks; 0 means the loop body never runs.

  NOTE(review): the declaration / initial value of m, the opener of the
  round loop, any update of w inside that loop, and the stores of the
  result are not visible in this excerpt - confirm against the full file.
*/
18 void MY_FAST_CALL
AesCbc_Encode_Intel(__m128i
*p
, __m128i
*data
, size_t numBlocks
)
/* one iteration per input block */
21 for (; numBlocks
!= 0; numBlocks
--, data
++)
/* loop counter: the UInt32 stored at (p + 1), minus one */
23 UInt32 numRounds2
= *(const UInt32
*)(p
+ 1) - 1;
24 const __m128i
*w
= p
+ 3;
/* fold the current block into the running state m, then XOR in p[2] */
25 m
= _mm_xor_si128(m
, *data
);
26 m
= _mm_xor_si128(m
, p
[2]);
/* two encrypt rounds per pass, keyed by w[0] and w[1] */
29 m
= _mm_aesenc_si128(m
, w
[0]);
30 m
= _mm_aesenc_si128(m
, w
[1]);
/* repeats until the pre-decremented counter reaches zero */
33 while (--numRounds2
!= 0);
/* closing pair: one regular round, then the last-round intrinsic */
34 m
= _mm_aesenc_si128(m
, w
[0]);
35 m
= _mm_aesenclast_si128(m
, w
[1]);
/*
  Round helper macros.

  AES_OP_W(op, n) opens a braced scope and loads round key w[n] into a const
  local t; it relies on a pointer named w being in scope at the expansion
  site. NOTE(review): the remaining continuation lines of AES_OP_W are not
  visible in this excerpt - presumably they apply op to each parallel state
  with t; confirm against the full file.

  AES_DEC / AES_DEC_LAST / AES_ENC / AES_ENC_LAST instantiate AES_OP_W with
  _mm_aesdec_si128, _mm_aesdeclast_si128, _mm_aesenc_si128 and
  _mm_aesenclast_si128 respectively.
*/
43 #define AES_OP_W(op, n) { \
44 const __m128i t = w[n]; \
50 #define AES_DEC(n) AES_OP_W(_mm_aesdec_si128, n)
51 #define AES_DEC_LAST(n) AES_OP_W(_mm_aesdeclast_si128, n)
52 #define AES_ENC(n) AES_OP_W(_mm_aesenc_si128, n)
53 #define AES_ENC_LAST(n) AES_OP_W(_mm_aesenclast_si128, n)
/*
  AesCbc_Decode_Intel (__m128i variant)

  Processes numBlocks 128-bit blocks at data in two phases:
    1. a batched loop that consumes NUM_WAYS blocks per iteration
       (states m0, m1, m2 built from data[0..2]);
    2. a tail loop that handles the remaining blocks one at a time.

    p         - parameter area addressed in 128-bit units; the first UInt32
                at (p + 1) is read as numRounds2, and the key pointer w is
                set to p + numRounds2 * 2.
    data      - blocks to process; the batched loop writes data[0..2]
                in place.
    numBlocks - number of blocks.

  After the rounds, each state is XORed with iv, and iv is replaced by the
  block that was in data[] before it is overwritten - the order of those
  three statements per block matters.

  NOTE(review): the definition of NUM_WAYS, the declarations of iv / m0..m2,
  the round-loop openers and bodies, and the tail loop's stores are not
  visible in this excerpt - confirm against the full file.
*/
55 void MY_FAST_CALL
AesCbc_Decode_Intel(__m128i
*p
, __m128i
*data
, size_t numBlocks
)
/* batched phase: NUM_WAYS blocks per iteration */
58 for (; numBlocks
>= NUM_WAYS
; numBlocks
-= NUM_WAYS
, data
+= NUM_WAYS
)
60 UInt32 numRounds2
= *(const UInt32
*)(p
+ 1);
/* w is positioned numRounds2 * 2 key slots past p */
61 const __m128i
*w
= p
+ numRounds2
* 2;
/* initial key XOR: the same key w[2] is applied to all three blocks */
64 const __m128i t
= w
[2];
65 m0
= _mm_xor_si128(t
, data
[0]);
66 m1
= _mm_xor_si128(t
, data
[1]);
67 m2
= _mm_xor_si128(t
, data
[2]);
/* repeats until the pre-decremented counter reaches zero */
76 while (--numRounds2
!= 0);
/* per block: XOR state with iv, remember the old data block as the next iv,
   then overwrite the data block with the result */
82 t
= _mm_xor_si128(m0
, iv
); iv
= data
[0]; data
[0] = t
;
83 t
= _mm_xor_si128(m1
, iv
); iv
= data
[1]; data
[1] = t
;
84 t
= _mm_xor_si128(m2
, iv
); iv
= data
[2]; data
[2] = t
;
/* tail phase: leftover blocks, one per iteration */
87 for (; numBlocks
!= 0; numBlocks
--, data
++)
89 UInt32 numRounds2
= *(const UInt32
*)(p
+ 1);
90 const __m128i
*w
= p
+ numRounds2
* 2;
91 __m128i m
= _mm_xor_si128(w
[2], *data
);
/* two decrypt rounds per pass, keyed by w[1] then w[0] */
95 m
= _mm_aesdec_si128(m
, w
[1]);
96 m
= _mm_aesdec_si128(m
, w
[0]);
99 while (--numRounds2
!= 0);
/* closing pair: one regular round, then the last-round intrinsic */
100 m
= _mm_aesdec_si128(m
, w
[1]);
101 m
= _mm_aesdeclast_si128(m
, w
[0]);
/* XOR the round output with iv */
103 m
= _mm_xor_si128(m
, iv
);
/*
  AesCtr_Code_Intel (__m128i variant)

  XORs numBlocks 128-bit blocks at data, in place, with values produced by
  running an incrementing counter through the AES-NI encrypt rounds.
  Work is split into a batched loop (NUM_WAYS blocks per iteration, states
  m0, m1, m2) and a tail loop that handles one block at a time.

    p         - parameter area addressed in 128-bit units; the first UInt32
                at (p + 1), minus one, is the round-loop counter, and p[2]
                is XORed with the counter before the rounds.
    data      - blocks to transform in place.
    numBlocks - number of blocks.

  The counter is advanced with _mm_add_epi64 and the constant `one`, whose
  low 64-bit element is 1 and high element is 0, so only the low 64-bit
  element changes and nothing carries into the high element.

  NOTE(review): the declarations of ctr / one / m0..m2 / m, the definition
  of NUM_WAYS, the round-loop openers and bodies, any advance of w, and any
  write-back of ctr are not visible in this excerpt - confirm against the
  full file. `.m128i_u64` is a compiler-specific member of __m128i.
*/
110 void MY_FAST_CALL
AesCtr_Code_Intel(__m128i
*p
, __m128i
*data
, size_t numBlocks
)
/* increment constant: low 64-bit element = 1, high 64-bit element = 0 */
114 one
.m128i_u64
[0] = 1;
115 one
.m128i_u64
[1] = 0;
/* batched phase: NUM_WAYS blocks per iteration */
116 for (; numBlocks
>= NUM_WAYS
; numBlocks
-= NUM_WAYS
, data
+= NUM_WAYS
)
118 UInt32 numRounds2
= *(const UInt32
*)(p
+ 1) - 1;
119 const __m128i
*w
= p
;
/* w == p here, so t is p[2]; each state is (incremented counter) XOR t */
122 const __m128i t
= w
[2];
123 ctr
= _mm_add_epi64(ctr
, one
); m0
= _mm_xor_si128(ctr
, t
);
124 ctr
= _mm_add_epi64(ctr
, one
); m1
= _mm_xor_si128(ctr
, t
);
125 ctr
= _mm_add_epi64(ctr
, one
); m2
= _mm_xor_si128(ctr
, t
);
/* repeats until the pre-decremented counter reaches zero */
134 while (--numRounds2
!= 0);
/* combine each state with its data block, in place */
137 data
[0] = _mm_xor_si128(data
[0], m0
);
138 data
[1] = _mm_xor_si128(data
[1], m1
);
139 data
[2] = _mm_xor_si128(data
[2], m2
);
/* tail phase: leftover blocks, one per iteration */
141 for (; numBlocks
!= 0; numBlocks
--, data
++)
143 UInt32 numRounds2
= *(const UInt32
*)(p
+ 1) - 1;
144 const __m128i
*w
= p
;
/* counter is incremented before use, then XORed with p[2] */
146 ctr
= _mm_add_epi64(ctr
, one
);
147 m
= _mm_xor_si128(ctr
, p
[2]);
/* two encrypt rounds per pass, keyed by w[0] and w[1] */
151 m
= _mm_aesenc_si128(m
, w
[0]);
152 m
= _mm_aesenc_si128(m
, w
[1]);
155 while (--numRounds2
!= 0);
/* closing pair: one regular round, then the last-round intrinsic */
156 m
= _mm_aesenc_si128(m
, w
[0]);
157 m
= _mm_aesenclast_si128(m
, w
[1]);
/* XOR the result into the data block in place */
158 *data
= _mm_xor_si128(*data
, m
);
/*
  Prototypes for AesCbc_Encode, AesCbc_Decode and AesCtr_Code.
  Each takes a UInt32 pointer (ivAes), a Byte pointer (data) and a block
  count, and returns nothing. Their definitions are not in this excerpt.
  The UInt32/Byte *_Intel functions that follow call them directly.
*/
165 void MY_FAST_CALL
AesCbc_Encode(UInt32
*ivAes
, Byte
*data
, size_t numBlocks
);
166 void MY_FAST_CALL
AesCbc_Decode(UInt32
*ivAes
, Byte
*data
, size_t numBlocks
);
167 void MY_FAST_CALL
AesCtr_Code(UInt32
*ivAes
, Byte
*data
, size_t numBlocks
);
/*
  AesCbc_Encode_Intel (UInt32 / Byte variant)

  Forwards p, data and numBlocks unchanged to AesCbc_Encode.
  NOTE(review): presumably this is the variant compiled when the intrinsic
  implementation is not available - the guarding preprocessor lines are not
  visible in this excerpt; confirm.
*/
169 void MY_FAST_CALL
AesCbc_Encode_Intel(UInt32
*p
, Byte
*data
, size_t numBlocks
)
171 AesCbc_Encode(p
, data
, numBlocks
);
/*
  AesCbc_Decode_Intel (UInt32 / Byte variant)

  Forwards p, data and numBlocks unchanged to AesCbc_Decode.
  NOTE(review): presumably this is the variant compiled when the intrinsic
  implementation is not available - the guarding preprocessor lines are not
  visible in this excerpt; confirm.
*/
174 void MY_FAST_CALL
AesCbc_Decode_Intel(UInt32
*p
, Byte
*data
, size_t numBlocks
)
176 AesCbc_Decode(p
, data
, numBlocks
);
/*
  AesCtr_Code_Intel (UInt32 / Byte variant)

  Forwards p, data and numBlocks unchanged to AesCtr_Code.
  NOTE(review): presumably this is the variant compiled when the intrinsic
  implementation is not available - the guarding preprocessor lines are not
  visible in this excerpt; confirm.
*/
179 void MY_FAST_CALL
AesCtr_Code_Intel(UInt32
*p
, Byte
*data
, size_t numBlocks
)
181 AesCtr_Code(p
, data
, numBlocks
);