4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2009 Intel Corporation
24 * All Rights Reserved.
27 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
28 * Use is subject to license terms.
32 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
33 * instructions. This file contains an accelerated
34 * Galois Field Multiplication implementation.
36 * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
37 * carry-less multiplication. More information about PCLMULQDQ can be
39 * http://software.intel.com/en-us/articles/
40 * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
45 * ====================================================================
46 * OpenSolaris OS modifications
48 * This source originates as file galois_hash_asm.c from
49 * Intel Corporation dated September 21, 2009.
51 * This OpenSolaris version has these major changes from the original source:
53 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
54 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
55 * definition for lint.
57 * 2. Formatted code, added comments, and added #includes and #defines.
59 * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
60 * calling kpreempt_disable() and kpreempt_enable().
61 * If the TS bit is not set, Save and restore %xmm registers at the beginning
 * and end of function calls (%xmm* registers are not saved and restored
 * during kernel thread preemption).
65 * 4. Removed code to perform hashing. This is already done with C macro
66 * GHASH in gcm.c. For better performance, this removed code should be
67 * reintegrated in the future to replace the C GHASH macro.
69 * 5. Added code to byte swap 16-byte input and output.
71 * 6. Folded in comments from the original C source with embedded assembly
74 * 7. Renamed function and reordered parameters to match OpenSolaris:
76 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
77 * unsigned char *d, int length)
78 * OpenSolaris OS interface:
79 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
80 * ====================================================================
85 #include <sys/asm_linkage.h>
86 #include <sys/controlregs.h>
88 #include <sys/machprivregs.h>
93 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is,
94 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
95 * uses it to pass P2 to syscall.
96 * This also occurs with the STTS macro, but we don't care if
97 * P2 (%rsi) is modified just before function exit.
98 * The CLTS and STTS macros push and pop P1 (%rdi) already.
/*
 * Clear CR0.TS without losing P2.  Under i86xpv the CLTS macro calls
 * HYPERVISOR_fpu_taskswitch(), which clobbers %rsi (see note above),
 * so %rsi is preserved around it; bare-metal CLTS needs no protection.
 */
#if defined(__xpv)
#define	PROTECTED_CLTS \
	push	%rsi; \
	CLTS; \
	pop	%rsi
#else
#define	PROTECTED_CLTS \
	CLTS
#endif	/* __xpv */
/*
 * If CR0_TS is not set, align the stack (with push %rbp) and save
 * %xmm0 - %xmm10 on the stack (they are clobbered below and are not
 * preserved across kernel thread preemption); otherwise the FPU state
 * is not live, so just clear CR0_TS.
 * Pairs with SET_TS_OR_POP_XMM_REGISTERS, which tests the same tmpreg.
 */
#define	CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \
	push	%rbp; \
	mov	%rsp, %rbp; \
	movq	%cr0, tmpreg; \
	testq	$CR0_TS, tmpreg; \
	jnz	1f; \
	and	$-XMM_ALIGN, %rsp; \
	sub	$[XMM_SIZE * 11], %rsp; \
	movaps	%xmm0, 160(%rsp); \
	movaps	%xmm1, 144(%rsp); \
	movaps	%xmm2, 128(%rsp); \
	movaps	%xmm3, 112(%rsp); \
	movaps	%xmm4, 96(%rsp); \
	movaps	%xmm5, 80(%rsp); \
	movaps	%xmm6, 64(%rsp); \
	movaps	%xmm7, 48(%rsp); \
	movaps	%xmm8, 32(%rsp); \
	movaps	%xmm9, 16(%rsp); \
	movaps	%xmm10, (%rsp); \
	jmp	2f; \
1: \
	PROTECTED_CLTS; \
2:
/*
 * If CR0_TS was not set above, pop %xmm0 - %xmm10 off the stack
 * (reverse order of the save in CLEAR_TS_OR_PUSH_XMM_REGISTERS),
 * otherwise set CR0_TS again.  Restores the frame set up by the
 * matching CLEAR_TS_OR_PUSH_XMM_REGISTERS in either case.
 */
#define	SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \
	testq	$CR0_TS, tmpreg; \
	jnz	1f; \
	movaps	(%rsp), %xmm10; \
	movaps	16(%rsp), %xmm9; \
	movaps	32(%rsp), %xmm8; \
	movaps	48(%rsp), %xmm7; \
	movaps	64(%rsp), %xmm6; \
	movaps	80(%rsp), %xmm5; \
	movaps	96(%rsp), %xmm4; \
	movaps	112(%rsp), %xmm3; \
	movaps	128(%rsp), %xmm2; \
	movaps	144(%rsp), %xmm1; \
	movaps	160(%rsp), %xmm0; \
	jmp	2f; \
1: \
	STTS(tmpreg); \
2: \
	mov	%rbp, %rsp; \
	pop	%rbp
/*
 * Non-kernel builds: CR0.TS management and %xmm save/restore are
 * unnecessary (userland FPU state is preserved by the OS), so these
 * macros compile to nothing.
 * NOTE(review): assumes this is the !_KERNEL branch of a conditional
 * whose #if/#else lines are not visible here -- confirm.
 */
#define	PROTECTED_CLTS
#define	CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg)
#define	SET_TS_OR_POP_XMM_REGISTERS(tmpreg)
/*
 * Use this mask to byte-swap a 16-byte integer with the pshufb instruction:
 *
 * static uint8_t byte_swap16_mask[] = {
 *	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
 *
 * The label below is referenced RIP-relative by gcm_mul_pclmulqdq and
 * must be XMM-aligned: it is loaded with movaps.
 */
	.text
	.align	XMM_ALIGN
.Lbyte_swap16_mask:
	.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
185 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
187 * Perform a carry-less multiplication (that is, use XOR instead of the
188 * multiply operator) on P1 and P2 and place the result in P3.
190 * Byte swap the input and the output.
 * Note: x_in, y, and res all point to a block of 16-byte numbers
 * (an array of two 64-bit integers).
195 * Note2: For kernel code, caller is responsible for ensuring
196 * kpreempt_disable() has been called. This is because %xmm registers are
197 * not saved/restored. Clear and set the CR0.TS bit on entry and exit,
198 * respectively, if TS is set on entry. Otherwise, if TS is not set,
199 * save and restore %xmm registers on the stack.
201 * Note3: Original Intel definition:
202 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
203 * unsigned char *d, int length)
205 * Note4: Register/parameter mapping:
207 * Parameter 1: %rcx (copied to %xmm0) hk or x_in
208 * Parameter 2: %rdx (copied to %xmm1) s or y
209 * Parameter 3: %rdi (result) d or res
211 * Parameter 1: %rdi (copied to %xmm0) x_in
212 * Parameter 2: %rsi (copied to %xmm1) y
213 * Parameter 3: %rdx (result) res
/*
 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 *
 * Carry-less GF(2^128) multiplication for GHASH: multiply the 16-byte
 * blocks *x_in (%rdi) and *y (%rsi) and store the reduced 16-byte
 * product at *res (%rdx).  Input and output are byte-swapped around
 * the computation (GCM is big-endian bit-reflected).
 *
 * Clobbers: %rax, %r10, %xmm0-%xmm10 (saved/restored or TS-managed by
 * the CLEAR/SET macros in kernel builds), flags.
 */
	ENTRY_NP(gcm_mul_pclmulqdq)
	CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10)

	//
	// Copy Parameters
	//
	movdqu	(%rdi), %xmm0	// P1 (x_in)
	movdqu	(%rsi), %xmm1	// P2 (y)

	//
	// Byte swap 16-byte input
	//
	lea	.Lbyte_swap16_mask(%rip), %rax
	movaps	(%rax), %xmm10	// %xmm10 = byte-swap mask (kept live)
	pshufb	%xmm10, %xmm0
	pshufb	%xmm10, %xmm1

	//
	// Multiply with the hash key: schoolbook 64x64 -> 128-bit
	// carry-less multiplies of the four quadword pairs.
	//
	movdqu	%xmm0, %xmm3
	pclmulqdq $0, %xmm1, %xmm3	// xmm3 holds a0*b0
	movdqu	%xmm0, %xmm4
	pclmulqdq $16, %xmm1, %xmm4	// xmm4 holds a0*b1
	movdqu	%xmm0, %xmm5
	pclmulqdq $1, %xmm1, %xmm5	// xmm5 holds a1*b0
	movdqu	%xmm0, %xmm6
	pclmulqdq $17, %xmm1, %xmm6	// xmm6 holds a1*b1

	pxor	%xmm5, %xmm4	// xmm4 holds a0*b1 + a1*b0

	movdqu	%xmm4, %xmm5	// move the contents of xmm4 to xmm5
	psrldq	$8, %xmm4	// shift xmm4 64 bits to the right
	pslldq	$8, %xmm5	// shift xmm5 64 bits to the left
	pxor	%xmm5, %xmm3	// fold middle term into the low half
	pxor	%xmm4, %xmm6	// Register pair <xmm6:xmm3> holds the result
				// of the carry-less multiplication of
				// xmm0 by xmm1

	//
	// We shift the result of the multiplication by one bit position
	// to the left to cope for the fact that the bits are reversed.
	//
	movdqu	%xmm3, %xmm7
	movdqu	%xmm6, %xmm8
	pslld	$1, %xmm3
	pslld	$1, %xmm6
	psrld	$31, %xmm7	// capture per-dword carry bits
	psrld	$31, %xmm8
	movdqu	%xmm7, %xmm9
	pslldq	$4, %xmm8	// move carries up one dword
	pslldq	$4, %xmm7
	psrldq	$12, %xmm9	// carry out of low half into high half
	por	%xmm7, %xmm3
	por	%xmm8, %xmm6
	por	%xmm9, %xmm6

	//
	// First phase of the reduction (modulo x^128 + x^7 + x^2 + x + 1)
	//
	// Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
	// independently.
	//
	movdqu	%xmm3, %xmm7
	movdqu	%xmm3, %xmm8
	movdqu	%xmm3, %xmm9
	pslld	$31, %xmm7	// packed left shift << 31
	pslld	$30, %xmm8	// packed left shift << 30
	pslld	$25, %xmm9	// packed left shift << 25
	pxor	%xmm8, %xmm7	// xor the shifted versions
	pxor	%xmm9, %xmm7
	movdqu	%xmm7, %xmm8
	pslldq	$12, %xmm7
	psrldq	$4, %xmm8	// xmm8 carries into the second phase
	pxor	%xmm7, %xmm3	// first phase of the reduction complete

	//
	// Second phase of the reduction
	//
	// Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
	// shift operations.
	//
	movdqu	%xmm3, %xmm2
	movdqu	%xmm3, %xmm4
	movdqu	%xmm3, %xmm5
	psrld	$1, %xmm2	// packed right shift >> 1
	psrld	$2, %xmm4	// packed right shift >> 2
	psrld	$7, %xmm5	// packed right shift >> 7
	pxor	%xmm4, %xmm2	// xor the shifted versions
	pxor	%xmm5, %xmm2
	pxor	%xmm8, %xmm2	// fold in carry saved from the first phase
	pxor	%xmm2, %xmm3
	pxor	%xmm3, %xmm6	// the result is in xmm6

	//
	// Byte swap 16-byte result
	//
	pshufb	%xmm10, %xmm6	// %xmm10 has the swap mask

	//
	// Store the result
	//
	movdqu	%xmm6, (%rdx)	// P3 (res)

	//
	// Cleanup and Return
	//
	SET_TS_OR_POP_XMM_REGISTERS(%r10)
	ret
	SET_SIZE(gcm_mul_pclmulqdq)