usr/src/common/crypto/modes/amd64/gcm_intel.s
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009 Intel Corporation
 * All Rights Reserved.
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
 * instructions.  This file contains an accelerated
 * Galois Field Multiplication implementation.
 *
 * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
 * carry-less multiplication.  More information about PCLMULQDQ can be
 * found at:
 * http://software.intel.com/en-us/articles/
 * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
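 *
 * For reference, one PCLMULQDQ instruction computes a 64x64 -> 128-bit
 * carry-less product: shifted copies of one operand are XORed, rather
 * than added, into the result.  A plain-C sketch of the same operation
 * (illustrative only, assuming <stdint.h> types; not part of this file):
 *
 *	void
 *	clmul_64x64(uint64_t x, uint64_t y, uint64_t r[2])
 *	{
 *		r[0] = r[1] = 0;
 *		for (int i = 0; i < 64; i++) {
 *			if ((y >> i) & 1) {
 *				r[0] ^= x << i;		// low 64 bits
 *				if (i > 0)
 *					r[1] ^= x >> (64 - i);	// high bits
 *			}
 *		}
 *	}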
 *
 * ====================================================================
 * OpenSolaris OS modifications
 *
 * This source originates as file galois_hash_asm.c from
 * Intel Corporation dated September 21, 2009.
 *
 * This OpenSolaris version has these major changes from the original source:
 *
 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
 * definition for lint.
 *
 * 2. Formatted code, added comments, and added #includes and #defines.
 *
 * 3. If bit CR0.TS is set, clear the TS bit after calling
 * kpreempt_disable() and set it again before calling kpreempt_enable().
 * If the TS bit is not set, save and restore the %xmm registers at the
 * beginning and end of function calls (the %xmm* registers are not saved
 * and restored during kernel thread preemption).
 *
 * 4. Removed code to perform hashing.  This is already done with C macro
 * GHASH in gcm.c.  For better performance, this removed code should be
 * reintegrated in the future to replace the C GHASH macro.
 *
 * 5. Added code to byte swap 16-byte input and output.
 *
 * 6. Folded in comments from the original C source with embedded assembly
 * (SB_w_shift_xor.c).
 *
 * 7. Renamed function and reordered parameters to match OpenSolaris:
 * Intel interface:
 *	void galois_hash_asm(unsigned char *hk, unsigned char *s,
 *	    unsigned char *d, int length)
 * OpenSolaris OS interface:
 *	void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 * ====================================================================
 */

#if defined(lint) || defined(__lint)

#include <sys/types.h>

/* ARGSUSED */
void
gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
}

#else	/* lint */

#include <sys/asm_linkage.h>
#include <sys/controlregs.h>
#ifdef _KERNEL
#include <sys/machprivregs.h>
#endif

#ifdef _KERNEL
/*
 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv.  That is,
 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
 * uses it to pass P2 to syscall.
 * This also occurs with the STTS macro, but we don't care if
 * P2 (%rsi) is modified just before function exit.
 * The CLTS and STTS macros push and pop P1 (%rdi) already.
 */
#ifdef __xpv
#define	PROTECTED_CLTS \
	push	%rsi; \
	CLTS; \
	pop	%rsi
#else
#define	PROTECTED_CLTS \
	CLTS
#endif	/* __xpv */
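
/*
 * Background: CR0.TS ("task switched") causes the next FPU/SIMD instruction
 * to trap with #NM and is used for lazy FPU context switching.  If TS is
 * set on entry, the current thread has no live FPU state, so the %xmm
 * registers need not be preserved; TS is simply cleared here and set again
 * on exit.  If TS is clear, the thread may have live FPU state, so the
 * macros below save and restore %xmm0 - %xmm10 on the stack instead.
 */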

/*
 * If CR0_TS is not set, align stack (with push %rbp) and push
 * %xmm0 - %xmm10 on stack; otherwise clear CR0_TS.
 */
#define	CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \
	push	%rbp; \
	mov	%rsp, %rbp; \
	movq	%cr0, tmpreg; \
	testq	$CR0_TS, tmpreg; \
	jnz	1f; \
	and	$-XMM_ALIGN, %rsp; \
	sub	$[XMM_SIZE * 11], %rsp; \
	movaps	%xmm0, 160(%rsp); \
	movaps	%xmm1, 144(%rsp); \
	movaps	%xmm2, 128(%rsp); \
	movaps	%xmm3, 112(%rsp); \
	movaps	%xmm4, 96(%rsp); \
	movaps	%xmm5, 80(%rsp); \
	movaps	%xmm6, 64(%rsp); \
	movaps	%xmm7, 48(%rsp); \
	movaps	%xmm8, 32(%rsp); \
	movaps	%xmm9, 16(%rsp); \
	movaps	%xmm10, (%rsp); \
	jmp	2f; \
1: \
	PROTECTED_CLTS; \
2:

/*
 * If CR0_TS was not set above, pop %xmm0 - %xmm10 off stack;
 * otherwise set CR0_TS.
 */
#define	SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \
	testq	$CR0_TS, tmpreg; \
	jnz	1f; \
	movaps	(%rsp), %xmm10; \
	movaps	16(%rsp), %xmm9; \
	movaps	32(%rsp), %xmm8; \
	movaps	48(%rsp), %xmm7; \
	movaps	64(%rsp), %xmm6; \
	movaps	80(%rsp), %xmm5; \
	movaps	96(%rsp), %xmm4; \
	movaps	112(%rsp), %xmm3; \
	movaps	128(%rsp), %xmm2; \
	movaps	144(%rsp), %xmm1; \
	movaps	160(%rsp), %xmm0; \
	jmp	2f; \
1: \
	STTS(tmpreg); \
2: \
	mov	%rbp, %rsp; \
	pop	%rbp

#else
#define	PROTECTED_CLTS
#define	CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg)
#define	SET_TS_OR_POP_XMM_REGISTERS(tmpreg)
#endif	/* _KERNEL */

/*
 * Use this mask to byte-swap a 16-byte integer with the pshufb instruction.
 */

// static uint8_t byte_swap16_mask[] = {
//	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };

	.text
	.align	XMM_ALIGN
.Lbyte_swap16_mask:
	.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
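
/*
 * pshufb with the mask above reverses the byte order of a 16-byte value.
 * A plain-C model of the per-byte shuffle (illustrative only, assuming
 * <stdint.h> types; not part of this file):
 *
 *	// dst[i] = src[mask[i] & 0x0f], or 0 if bit 7 of mask[i] is set
 *	void
 *	pshufb_model(uint8_t dst[16], const uint8_t src[16],
 *	    const uint8_t mask[16])
 *	{
 *		for (int i = 0; i < 16; i++)
 *			dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
 *	}
 */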

/*
 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 *
 * Perform a carry-less multiplication (that is, use XOR instead of the
 * multiply operator) on P1 and P2 and place the result in P3.
 *
 * Byte swap the input and the output.
 *
 * Note: x_in, y, and res all point to a block of 16-byte numbers
 * (an array of two 64-bit integers).
 *
 * Note2: For kernel code, the caller is responsible for ensuring
 * kpreempt_disable() has been called.  This is because %xmm registers are
 * not saved/restored.  Clear and set the CR0.TS bit on entry and exit,
 * respectively, if TS is set on entry.  Otherwise, if TS is not set,
 * save and restore %xmm registers on the stack.
 *
 * Note3: Original Intel definition:
 *	void galois_hash_asm(unsigned char *hk, unsigned char *s,
 *	    unsigned char *d, int length)
 *
 * Note4: Register/parameter mapping:
 * Intel:
 *	Parameter 1: %rcx (copied to %xmm0)	hk or x_in
 *	Parameter 2: %rdx (copied to %xmm1)	s or y
 *	Parameter 3: %rdi (result)		d or res
 * OpenSolaris:
 *	Parameter 1: %rdi (copied to %xmm0)	x_in
 *	Parameter 2: %rsi (copied to %xmm1)	y
 *	Parameter 3: %rdx (result)		res
 */
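
/*
 * A hypothetical C caller (illustrative sketch only; in the kernel the
 * call must run between kpreempt_disable() and kpreempt_enable(), as
 * noted above):
 *
 *	uint64_t x[2], h[2], res[2];
 *	// ... fill x and h with the operands, then:
 *	gcm_mul_pclmulqdq(x, h, res);	// res = x * h in GF(2^128)
 */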

ENTRY_NP(gcm_mul_pclmulqdq)
	CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10)

	//
	// Copy Parameters
	//
	movdqu	(%rdi), %xmm0	// P1
	movdqu	(%rsi), %xmm1	// P2

	//
	// Byte swap 16-byte input
	//
	lea	.Lbyte_swap16_mask(%rip), %rax
	movaps	(%rax), %xmm10
	pshufb	%xmm10, %xmm0
	pshufb	%xmm10, %xmm1

	//
	// Multiply with the hash key
	//
	movdqu	%xmm0, %xmm3
	pclmulqdq $0, %xmm1, %xmm3	// xmm3 holds a0*b0

	movdqu	%xmm0, %xmm4
	pclmulqdq $16, %xmm1, %xmm4	// xmm4 holds a0*b1

	movdqu	%xmm0, %xmm5
	pclmulqdq $1, %xmm1, %xmm5	// xmm5 holds a1*b0
	movdqu	%xmm0, %xmm6
	pclmulqdq $17, %xmm1, %xmm6	// xmm6 holds a1*b1

	pxor	%xmm5, %xmm4	// xmm4 holds a0*b1 + a1*b0

	movdqu	%xmm4, %xmm5	// move the contents of xmm4 to xmm5
	psrldq	$8, %xmm4	// shift xmm4 by 64 bits to the right
	pslldq	$8, %xmm5	// shift xmm5 by 64 bits to the left
	pxor	%xmm5, %xmm3
	pxor	%xmm4, %xmm6	// Register pair <xmm6:xmm3> holds the result
				// of the carry-less multiplication of
				// xmm0 by xmm1.
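
	// In polynomial (carry-less) terms, with a = a1*2^64 + a0 and
	// b = b1*2^64 + b0:
	//	a*b = a1*b1*2^128 + (a0*b1 + a1*b0)*2^64 + a0*b0,
	// where every "+" is an XOR.  The shifts and XORs above assemble
	// exactly this into the register pair <xmm6:xmm3>.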

	// We shift the result of the multiplication by one bit position
	// to the left to cope with the fact that the bits are reversed.
	movdqu	%xmm3, %xmm7
	movdqu	%xmm6, %xmm8
	pslld	$1, %xmm3
	pslld	$1, %xmm6
	psrld	$31, %xmm7
	psrld	$31, %xmm8
	movdqu	%xmm7, %xmm9
	pslldq	$4, %xmm8
	pslldq	$4, %xmm7
	psrldq	$12, %xmm9
	por	%xmm7, %xmm3
	por	%xmm8, %xmm6
	por	%xmm9, %xmm6
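
	// Note: pslld shifts each 32-bit lane independently, so the bit
	// shifted out of each lane (captured above with psrld $31) must be
	// ORed into the next lane up (pslldq $4 moves it one lane left);
	// xmm9 (psrldq $12) carries the top bit of xmm3 across into the
	// low bit of xmm6.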

	//
	// First phase of the reduction
	//
	// Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
	// independently.
	movdqu	%xmm3, %xmm7
	movdqu	%xmm3, %xmm8
	movdqu	%xmm3, %xmm9
	pslld	$31, %xmm7	// packed left shift by 31
	pslld	$30, %xmm8	// packed left shift by 30
	pslld	$25, %xmm9	// packed left shift by 25
	pxor	%xmm8, %xmm7	// xor the shifted versions
	pxor	%xmm9, %xmm7
	movdqu	%xmm7, %xmm8
	pslldq	$12, %xmm7
	psrldq	$4, %xmm8
	pxor	%xmm7, %xmm3	// first phase of the reduction complete
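
	// The two reduction phases fold the 256-bit product modulo the
	// GHASH polynomial g(x) = x^128 + x^7 + x^2 + x + 1 (operating on
	// the bit-reflected representation).  The left shifts by 31, 30,
	// and 25 above are 32 - 1, 32 - 2, and 32 - 7, the per-32-bit-lane
	// complements of the right shifts by 1, 2, and 7 in the second
	// phase below.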

	//
	// Second phase of the reduction
	//
	// Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
	// shift operations.
	movdqu	%xmm3, %xmm2
	movdqu	%xmm3, %xmm4
	movdqu	%xmm3, %xmm5
	psrld	$1, %xmm2	// packed right shift by 1
	psrld	$2, %xmm4	// packed right shift by 2
	psrld	$7, %xmm5	// packed right shift by 7
	pxor	%xmm4, %xmm2	// xor the shifted versions
	pxor	%xmm5, %xmm2
	pxor	%xmm8, %xmm2
	pxor	%xmm2, %xmm3
	pxor	%xmm3, %xmm6	// the result is in xmm6

	//
	// Byte swap 16-byte result
	//
	pshufb	%xmm10, %xmm6	// %xmm10 has the swap mask

	//
	// Store the result
	//
	movdqu	%xmm6, (%rdx)	// P3

	//
	// Cleanup and Return
	//
	SET_TS_OR_POP_XMM_REGISTERS(%r10)
	ret
	SET_SIZE(gcm_mul_pclmulqdq)

#endif	/* lint || __lint */