4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2009 Intel Corporation
24 * All Rights Reserved.
27 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
28 * Use is subject to license terms.
32 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
33 * instructions. This file contains an accelerated
34 * Galois Field Multiplication implementation.
36 * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
37 * carry-less multiplication. More information about PCLMULQDQ can be
39 * http://software.intel.com/en-us/articles/
40 * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
45 * ====================================================================
46 * OpenSolaris OS modifications
48 * This source originates as file galois_hash_asm.c from
49 * Intel Corporation dated September 21, 2009.
51 * This OpenSolaris version has these major changes from the original source:
53 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
54 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
55 * definition for lint.
57 * 2. Formatted code, added comments, and added #includes and #defines.
59 * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
60 * calling kpreempt_disable() and kpreempt_enable().
61 * If the TS bit is not set, Save and restore %xmm registers at the beginning
 * and end of function calls (%xmm* registers are not saved and restored
 * during kernel thread preemption).
65 * 4. Removed code to perform hashing. This is already done with C macro
66 * GHASH in gcm.c. For better performance, this removed code should be
67 * reintegrated in the future to replace the C GHASH macro.
69 * 5. Added code to byte swap 16-byte input and output.
71 * 6. Folded in comments from the original C source with embedded assembly
74 * 7. Renamed function and reordered parameters to match OpenSolaris:
76 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
77 * unsigned char *d, int length)
78 * OpenSolaris OS interface:
79 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
80 * ====================================================================
85 #include <sys/asm_linkage.h>
86 #include <sys/controlregs.h>
88 #include <sys/machprivregs.h>
93 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is,
94 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
95 * uses it to pass P2 to syscall.
96 * This also occurs with the STTS macro, but we don't care if
97 * P2 (%rsi) is modified just before function exit.
98 * The CLTS and STTS macros push and pop P1 (%rdi) already.
/*
 * Clear CR0.TS without losing P2.  Under i86xpv the CLTS macro calls
 * HYPERVISOR_fpu_taskswitch(), which clobbers %rsi (see note above),
 * so %rsi is preserved around it; bare-metal CLTS needs no protection.
 */
#if defined(__xpv)
#define	PROTECTED_CLTS \
	push	%rsi; \
	CLTS; \
	pop	%rsi
#else
#define	PROTECTED_CLTS \
	CLTS
#endif	/* __xpv */
/*
 * If CR0_TS is not set, align the stack (with push %rbp) and save
 * %xmm0 - %xmm10 on the stack (they are clobbered below and are not
 * preserved across kernel thread preemption); otherwise the FPU state
 * is not live, so just clear CR0_TS.
 * Pairs with SET_TS_OR_POP_XMM_REGISTERS, which tests the same tmpreg.
 */
#define	CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \
	push	%rbp; \
	mov	%rsp, %rbp; \
	movq	%cr0, tmpreg; \
	testq	$CR0_TS, tmpreg; \
	jnz	1f; \
	and	$-XMM_ALIGN, %rsp; \
	sub	$[XMM_SIZE * 11], %rsp; \
	movaps	%xmm0, 160(%rsp); \
	movaps	%xmm1, 144(%rsp); \
	movaps	%xmm2, 128(%rsp); \
	movaps	%xmm3, 112(%rsp); \
	movaps	%xmm4, 96(%rsp); \
	movaps	%xmm5, 80(%rsp); \
	movaps	%xmm6, 64(%rsp); \
	movaps	%xmm7, 48(%rsp); \
	movaps	%xmm8, 32(%rsp); \
	movaps	%xmm9, 16(%rsp); \
	movaps	%xmm10, (%rsp); \
	jmp	2f; \
1: \
	PROTECTED_CLTS; \
2:
/*
 * If CR0_TS was not set above, pop %xmm0 - %xmm10 off the stack
 * (reverse order of the save in CLEAR_TS_OR_PUSH_XMM_REGISTERS),
 * otherwise set CR0_TS again.  Restores the frame set up by the
 * matching CLEAR_TS_OR_PUSH_XMM_REGISTERS in either case.
 */
#define	SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \
	testq	$CR0_TS, tmpreg; \
	jnz	1f; \
	movaps	(%rsp), %xmm10; \
	movaps	16(%rsp), %xmm9; \
	movaps	32(%rsp), %xmm8; \
	movaps	48(%rsp), %xmm7; \
	movaps	64(%rsp), %xmm6; \
	movaps	80(%rsp), %xmm5; \
	movaps	96(%rsp), %xmm4; \
	movaps	112(%rsp), %xmm3; \
	movaps	128(%rsp), %xmm2; \
	movaps	144(%rsp), %xmm1; \
	movaps	160(%rsp), %xmm0; \
	jmp	2f; \
1: \
	STTS(tmpreg); \
2: \
	mov	%rbp, %rsp; \
	pop	%rbp
/*
 * Non-kernel builds: CR0.TS management and %xmm save/restore are
 * unnecessary (userland FPU state is preserved by the OS), so these
 * macros compile to nothing.
 * NOTE(review): assumes this is the !_KERNEL branch of a conditional
 * whose #if/#else lines are not visible here -- confirm.
 */
#define	PROTECTED_CLTS
#define	CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg)
#define	SET_TS_OR_POP_XMM_REGISTERS(tmpreg)
/*
 * Use this mask to byte-swap a 16-byte integer with the pshufb instruction:
 *
 * static uint8_t byte_swap16_mask[] = {
 *	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
 *
 * The label below is referenced RIP-relative by gcm_mul_pclmulqdq and
 * must be XMM-aligned: it is loaded with movaps.
 */
	.text
	.align	XMM_ALIGN
.Lbyte_swap16_mask:
	.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
185 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
187 * Perform a carry-less multiplication (that is, use XOR instead of the
188 * multiply operator) on P1 and P2 and place the result in P3.
190 * Byte swap the input and the output.
 * Note: x_in, y, and res all point to a block of 16-byte numbers
 * (an array of two 64-bit integers).
195 * Note2: For kernel code, caller is responsible for ensuring
196 * kpreempt_disable() has been called. This is because %xmm registers are
197 * not saved/restored. Clear and set the CR0.TS bit on entry and exit,
198 * respectively, if TS is set on entry. Otherwise, if TS is not set,
199 * save and restore %xmm registers on the stack.
201 * Note3: Original Intel definition:
202 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
203 * unsigned char *d, int length)
205 * Note4: Register/parameter mapping:
207 * Parameter 1: %rcx (copied to %xmm0) hk or x_in
208 * Parameter 2: %rdx (copied to %xmm1) s or y
209 * Parameter 3: %rdi (result) d or res
211 * Parameter 1: %rdi (copied to %xmm0) x_in
212 * Parameter 2: %rsi (copied to %xmm1) y
213 * Parameter 3: %rdx (result) res
/*
 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 *
 * Carry-less GF(2^128) multiplication for GHASH: multiply the 16-byte
 * blocks *x_in (%rdi) and *y (%rsi) and store the reduced 16-byte
 * product at *res (%rdx).  Input and output are byte-swapped around
 * the computation (GCM is big-endian bit-reflected).
 *
 * Clobbers: %rax, %r10, %xmm0-%xmm10 (saved/restored or TS-managed by
 * the CLEAR/SET macros in kernel builds), flags.
 */
	ENTRY_NP(gcm_mul_pclmulqdq)
	CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10)

	//
	// Copy Parameters
	//
	movdqu	(%rdi), %xmm0	// P1 (x_in)
	movdqu	(%rsi), %xmm1	// P2 (y)

	//
	// Byte swap 16-byte input
	//
	lea	.Lbyte_swap16_mask(%rip), %rax
	movaps	(%rax), %xmm10	// %xmm10 = byte-swap mask (kept live)
	pshufb	%xmm10, %xmm0
	pshufb	%xmm10, %xmm1

	//
	// Multiply with the hash key: schoolbook 64x64 -> 128-bit
	// carry-less multiplies of the four quadword pairs.
	//
	movdqu	%xmm0, %xmm3
	pclmulqdq $0, %xmm1, %xmm3	// xmm3 holds a0*b0
	movdqu	%xmm0, %xmm4
	pclmulqdq $16, %xmm1, %xmm4	// xmm4 holds a0*b1
	movdqu	%xmm0, %xmm5
	pclmulqdq $1, %xmm1, %xmm5	// xmm5 holds a1*b0
	movdqu	%xmm0, %xmm6
	pclmulqdq $17, %xmm1, %xmm6	// xmm6 holds a1*b1

	pxor	%xmm5, %xmm4	// xmm4 holds a0*b1 + a1*b0

	movdqu	%xmm4, %xmm5	// move the contents of xmm4 to xmm5
	psrldq	$8, %xmm4	// shift xmm4 64 bits to the right
	pslldq	$8, %xmm5	// shift xmm5 64 bits to the left
	pxor	%xmm5, %xmm3	// fold middle term into the low half
	pxor	%xmm4, %xmm6	// Register pair <xmm6:xmm3> holds the result
				// of the carry-less multiplication of
				// xmm0 by xmm1

	//
	// We shift the result of the multiplication by one bit position
	// to the left to cope for the fact that the bits are reversed.
	//
	movdqu	%xmm3, %xmm7
	movdqu	%xmm6, %xmm8
	pslld	$1, %xmm3
	pslld	$1, %xmm6
	psrld	$31, %xmm7	// capture per-dword carry bits
	psrld	$31, %xmm8
	movdqu	%xmm7, %xmm9
	pslldq	$4, %xmm8	// move carries up one dword
	pslldq	$4, %xmm7
	psrldq	$12, %xmm9	// carry out of low half into high half
	por	%xmm7, %xmm3
	por	%xmm8, %xmm6
	por	%xmm9, %xmm6

	//
	// First phase of the reduction (modulo x^128 + x^7 + x^2 + x + 1)
	//
	// Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
	// independently.
	//
	movdqu	%xmm3, %xmm7
	movdqu	%xmm3, %xmm8
	movdqu	%xmm3, %xmm9
	pslld	$31, %xmm7	// packed left shift << 31
	pslld	$30, %xmm8	// packed left shift << 30
	pslld	$25, %xmm9	// packed left shift << 25
	pxor	%xmm8, %xmm7	// xor the shifted versions
	pxor	%xmm9, %xmm7
	movdqu	%xmm7, %xmm8
	pslldq	$12, %xmm7
	psrldq	$4, %xmm8	// xmm8 carries into the second phase
	pxor	%xmm7, %xmm3	// first phase of the reduction complete

	//
	// Second phase of the reduction
	//
	// Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
	// shift operations.
	//
	movdqu	%xmm3, %xmm2
	movdqu	%xmm3, %xmm4
	movdqu	%xmm3, %xmm5
	psrld	$1, %xmm2	// packed right shift >> 1
	psrld	$2, %xmm4	// packed right shift >> 2
	psrld	$7, %xmm5	// packed right shift >> 7
	pxor	%xmm4, %xmm2	// xor the shifted versions
	pxor	%xmm5, %xmm2
	pxor	%xmm8, %xmm2	// fold in carry saved from the first phase
	pxor	%xmm2, %xmm3
	pxor	%xmm3, %xmm6	// the result is in xmm6

	//
	// Byte swap 16-byte result
	//
	pshufb	%xmm10, %xmm6	// %xmm10 has the swap mask

	//
	// Store the result
	//
	movdqu	%xmm6, (%rdx)	// P3 (res)

	//
	// Cleanup and Return
	//
	SET_TS_OR_POP_XMM_REGISTERS(%r10)
	ret
	SET_SIZE(gcm_mul_pclmulqdq)