sys/arch/x86/include/cpu_extended_state.h

   1 /*      $NetBSD: cpu_extended_state.h,v 1.9 2014/02/25 22:16:52 dsl Exp $       */
   2
   3 #ifndef _X86_CPU_EXTENDED_STATE_H_
   4 #define _X86_CPU_EXTENDED_STATE_H_
   5
   6 /*
   7  * This file contains definitions of structures that match the memory
   8  * layouts used by x86 processors to save floating point registers and other
   9  * extended cpu state.
  10  * This includes registers (etc) used by SSE/SSE2/SSE3/SSSE3/SSE4 and
  11  * the later AVX instructions.
  12  * The definitions are such that any future 'extended state' should
  13  * be handled (provided the kernel doesn't need to know the actual contents).
  14  *
  15  * The actual structures the cpu accesses must be aligned to 16 for
  16  * FXSAVE and 64 for XSAVE. The types aren't aligned because copies
  17  * do not need extra alignment.
  18  *
  19  * The slightly different layout saved by the i387 fsave is also defined.
  20  * This is only normally written by pre Pentium II type cpus that don't
  21  * support the fxsave instruction.
  22  *
  23  * Associated save instructions:
  24  * FNSAVE:  Saves x87 state in 108 bytes (original i387 layout).
  25  *          Then reinitialises the fpu.
  26  * FSAVE:   Encodes to FWAIT followed by FNSAVE.
  27  * FXSAVE:  Saves the x87 state and XMM (aka SSE) registers to the
  28  *          first 448 (max) bytes of a 512 byte area.
  29  *          This layout does not match that written by FNSAVE.
  30  * XSAVE:   Uses the same layout for the x87 and XMM registers,
  31  *          followed by a 64byte header and separate save areas
  32  *          for additional extended cpu state.
  33  *          The x87 state is always saved, the others conditionally.
  34  * XSAVEOPT: As XSAVE but (IIRC) only writes the register blocks
  35  *          that have been modified.
  36  */
  37
  38 #ifdef __lint__
  39 /* Lint has different packing rules and doesn't understand __aligned(), so the size checks are stubbed out to an always-true assertion there */
  40 #define __CTASSERT_NOLINT(x) __CTASSERT(1)
  41 #else
  42 #define __CTASSERT_NOLINT(x) __CTASSERT(x)
  43 #endif /* __lint__ */
  44
  45 /*
  46  * Layout for code/data pointers relating to FP exceptions.
  47  * Marked 'packed' because they aren't always 64bit aligned.
  48  * Since the x86 cpu supports misaligned accesses it isn't
  49  * worth avoiding the 'packed' attribute.
  50  */
  51 union fp_addr {
  52         uint64_t fa_64; /* Linear address for 64bit systems */
  53         struct {        /* 32bit view of the same 8 bytes */
  54                 uint32_t fa_off;        /* linear address for 32 bit */
  55                 uint16_t fa_seg;        /* code/data (etc) segment */
  56                 uint16_t fa_opcode;     /* last opcode (sometimes) */
  57         } fa_32;
  58 } __packed __aligned(4);        /* 8 bytes, only 4 byte alignment requested */
  59
  60 /* The x87 registers are 80 bits */
  61 struct fpacc87 {
  62         uint64_t        f87_mantissa;   /* mantissa (64 bits) */
  63         uint16_t        f87_exp_sign;   /* exponent and sign (16 bits) */
  64 } __packed __aligned(2);        /* exactly 10 bytes, no tail padding */
  65
  66 /* The x87 registers padded out to 16 bytes for fxsave */
  67 struct fpaccfx {
  68         struct fpacc87 r __aligned(16); /* 10 bytes of data; the alignment rounds each slot up to 16 */
  69 };
  70
  71 /* The SSE/SSE2 registers are 128 bits */
  72 struct xmmreg {
  73         uint8_t xmm_bytes[16];  /* raw contents of one 128 bit register */
  74 };
  75
  76 /* The AVX registers are 256 bits, but the low bits are the xmmregs */
  77 struct ymmreg {
  78         uint8_t ymm_bytes[16];  /* upper 128 bits only; the lower half is held in the matching xmmreg */
  79 };
  80
  81 /*
  82  * Floating point unit registers (fsave instruction).
  83  * The s87_ac[] and fx_87_ac[] are relative to the stack top.
  84  * The 'tag word' contains 2 bits per register and refers to
  85  * absolute register numbers.
  86  * The cpu sets the tag values 0b01 (zero) and 0b10 (special) when a value
  87  * is loaded. The software need only set 0b00 (used) and 0b11 (unused).
  88  * The fxsave 'Abridged tag word' is inverted.
  89  */
  90 struct save87 {
  91         uint16_t        s87_cw __aligned(4);    /* control word (16bits, in a 4 byte slot) */
  92         uint16_t        s87_sw __aligned(4);    /* status word (16bits, in a 4 byte slot) */
  93         uint16_t        s87_tw __aligned(4);    /* tag word (16bits, in a 4 byte slot) */
  94         union fp_addr   s87_ip;         /* floating point instruction pointer */
  95 #define s87_opcode s87_ip.fa_32.fa_opcode       /* opcode last executed (11bits), stored inside s87_ip */
  96         union fp_addr   s87_dp;         /* floating operand offset */
  97         struct fpacc87  s87_ac[8];      /* accumulator contents, 0-7 (10 bytes each) */
  98 };
  99 __CTASSERT_NOLINT(sizeof (struct save87) == 108);       /* 3 * 4 + 2 * 8 + 8 * 10 */
 100
 101 /* FPU/MMX/SSE/SSE2 context */
 102 struct fxsave {
 103 /*0*/   uint16_t        fx_cw;          /* FPU Control Word */
 104         uint16_t        fx_sw;          /* FPU Status Word */
 105         uint8_t         fx_tw;          /* FPU Tag Word (abridged); one implicit padding byte follows */
 106         uint16_t        fx_opcode;      /* FPU Opcode (offset 6) */
 107         union fp_addr   fx_ip;          /* FPU Instruction Pointer (offset 8) */
 108 /*16*/  union fp_addr   fx_dp;          /* FPU Data pointer */
 109         uint32_t        fx_mxcsr;       /* MXCSR Register State */
 110         uint32_t        fx_mxcsr_mask;  /* MXCSR mask; see __INITIAL_MXCSR_MASK__ */
 111         struct fpaccfx  fx_87_ac[8];    /* 8 x87 registers, 16 bytes each (offset 32) */
 112         struct xmmreg   fx_xmm[16];     /* XMM regs (8 in 32bit modes) (offset 160) */
 113         uint8_t         fx_rsvd[48];    /* reserved (offset 416) */
 114         uint8_t         fx_kernel[48];  /* Not written by the hardware (offset 464) */
 115 } __aligned(16);
 116 __CTASSERT_NOLINT(sizeof (struct fxsave) == 512);       /* 32 + 128 + 256 + 48 + 48 */
 117
 118 /* The end of the fxsave buffer can be used by the operating system */
 119 struct fxsave_os {
 120         uint8_t         fxo_fxsave[512 - 48];   /* skips the part of struct fxsave before fx_kernel (464 bytes) */
 121         /* 48 bytes available, NB copied to/from userspace */
 122         uint16_t        fxo_dflt_cw;    /* Control word for signal handlers */
 123 };
 124
 125 /*
 126  * For XSAVE a 64byte header follows the fxsave data.
 127  * Currently it only contains one field of which only 3 bits are defined.
 128  * Some other parts must be zero - zero it all.
 129  *
 130  * The xsh_xstate_bv bits match those of XCR0:
 131  *   XCR0_X87        0x00000001      x87 FPU/MMX state
 132  *   XCR0_SSE        0x00000002      SSE state
 133  *   XCR0_AVX        0x00000004      AVX state (ymmn registers)
 134  *
 135  * The offsets and sizes of any save areas can be found by reading
 136  * the correct control registers.
 137  */
 138
 139 struct xsave_header {
 140         uint64_t        xsh_fxsave[64]; /* 512 bytes of fxsave area, to align in the union */
 141         uint64_t        xsh_xstate_bv;  /* bitmap of saved sub structures (XCR0_* bits) */
 142         uint64_t        xsh_rsrvd[2];   /* must be zero */
 143         uint64_t        xsh_reserved[5];/* best if zero */
 144 };
 145 __CTASSERT(sizeof (struct xsave_header) == 512 + 64);   /* fxsave area + 64 byte header */
 146
 147 /*
 148  * The ymm save area actually follows the xsave_header.
 149  */
 150 struct xsave_ymm {
 151         struct ymmreg   xs_ymm[16];     /* High bits of YMM registers, 16 bytes per register */
 152 };
 153 __CTASSERT(sizeof (struct xsave_ymm) == 256);   /* 16 registers * 16 bytes */
 154
 155 /*
 156  * The following union is placed at the end of the pcb.
 157  * It is defined this way to separate the definitions and to
 158  * minimise the number of union/struct selectors.
 159  * NB: Some userspace stuff (eg firefox) uses it to parse ucontext.
 160  */
 161 union savefpu {
 162         struct save87           sv_87;          /* i387 fsave layout (108 bytes) */
 163         struct fxsave           sv_xmm;         /* fxsave layout (512 bytes) */
 164 #ifdef _KERNEL  /* kernel-only views; sv_xsave_hdr grows the union to 576 bytes */
 165         struct fxsave_os        sv_os;          /* fxsave area with the OS-owned tail */
 166         struct xsave_header     sv_xsave_hdr;   /* fxsave area followed by the xsave header */
 167 #endif /* _KERNEL */
 168 };
 169
 170 /*
 171  * 80387 control and status word bits
 172  *
 173  * The only reference I can find to bits 0x40 and 0x80 in the control word
 174  * is for the Weitek 1167/3167.
 175  * I (dsl) can't find why the default word has 0x40 set.
 176  *
 177  * A stack error is signalled as an INVOP that also sets STACK_FAULT
 178  * (other INVOP do not clear STACK_FAULT).
 179  */
 180 /* Interrupt masks (set masks interrupt) and status bits */
 181 #define EN_SW_INVOP             0x0001  /* Invalid operation */
 182 #define EN_SW_DENORM            0x0002  /* Denormalized operand */
 183 #define EN_SW_ZERODIV           0x0004  /* Divide by zero */
 184 #define EN_SW_OVERFLOW          0x0008  /* Overflow */
 185 #define EN_SW_UNDERFLOW         0x0010  /* Underflow */
 186 #define EN_SW_PRECLOSS          0x0020  /* Loss of precision */
 187 /* Status word bits (reserved in control word) */
 188 #define EN_SW_STACK_FAULT       0x0040  /* Stack under/overflow */
 189 #define EN_SW_ERROR_SUMMARY     0x0080  /* Unmasked error has occurred */
 190 /* Control bits (badly named) */
 191 #define EN_SW_CTL_PREC          0x0300  /* Precision control (field mask) */
 192 #define EN_SW_PREC_24           0x0000  /* Single precision */
 193 #define EN_SW_PREC_53           0x0200  /* Double precision */
 194 #define EN_SW_PREC_64           0x0300  /* Extended precision */
 195 #define EN_SW_CTL_ROUND         0x0c00  /* Rounding control (field mask) */
 196 #define EN_SW_ROUND_EVEN        0x0000  /* Round to nearest even */
 197 #define EN_SW_ROUND_DOWN        0x0400  /* Round towards minus infinity */
 198 #define EN_SW_ROUND_UP          0x0800  /* Round towards plus infinity */
 199 #define EN_SW_ROUND_ZERO        0x0c00  /* Round towards zero (truncates) */
 200 #define EN_SW_CTL_INF           0x1000  /* Infinity control, not used */
 201
 202 /*
 203  * The standard x87 control word from finit is 0x37F, giving:
 204  *      round to nearest
 205  *      64-bit precision
 206  *      all exceptions masked.
 207  *
 208  * NetBSD used to select:
 209  *      round to nearest
 210  *      53-bit precision
 211  *      all exceptions masked.
 212  * Stating: 64-bit precision often gives bad results with high level
 213  * languages because it makes the results of calculations depend on whether
 214  * intermediate values are stored in memory or in FPU registers.
 215  * Also some 'pathological divisions' give an error in the LSB because
 216  * the value is first rounded up when the 64bit mantissa is generated,
 217  * and then again when it is truncated to 53 bits.
 218  *
 219  * However the C language explicitly allows the extra precision.
 220  *
 221  * The iBCS control word has underflow, overflow, zero divide, and invalid
 222  * operation exceptions unmasked.  But that causes an unexpected exception
 223  * in the test program 'paranoia' and makes denormals useless (DBL_MIN / 2
 224  * underflows).  It doesn't make a lot of sense to trap underflow without
 225  * trapping denormals.
 226  */
 227 #define __INITIAL_NPXCW__       0x037f  /* finit default: 64-bit precision, round to nearest, all exceptions masked */
 228 /* Modern NetBSD uses the default control word. */
 229 #define __NetBSD_NPXCW__        __INITIAL_NPXCW__
 230 /* NetBSD before 6.99.26 forced IEEE double precision (53-bit), all exceptions masked. */
 231 #define __NetBSD_COMPAT_NPXCW__ 0x127f
 232 /* FreeBSD leaves invalid operation, zero divide and overflow unmasked (53-bit precision). */
 233 #define __FreeBSD_NPXCW__       0x1272
 234 /* iBCS2 goes a bit further and leaves the underflow exception unmasked as well. */
 235 #define __iBCS2_NPXCW__         0x0262
 236 /* Linux just uses the default control word. */
 237 #define __Linux_NPXCW__         __INITIAL_NPXCW__
 238 /* SVR4 uses the same control word as iBCS2. */
 239 #define __SVR4_NPXCW__          0x0262
 240
 241 /*
 242  * The default MXCSR value at reset is 0x1f80, IA-32 Instruction
 243  * Set Reference, pg. 3-369.
 244  *
 245  * The low 6 bits of the mxcsr are the fp status bits (same order as x87).
 246  * Bit 6 is 'denormals are zero' (speeds up calculations).
 247  * Bits 7-12 are the interrupt mask bits (same order, 1 to mask).
 248  * Bits 13 and 14 are rounding control.
 249  * Bit 15 is 'flush to zero' - affects underflow.
 250  * Bits 16-31 must be zero.
 251  */
 252 #define __INITIAL_MXCSR__       0x1f80  /* bits 7-12 set: all six exceptions masked, round to nearest */
 253 #define __INITIAL_MXCSR_MASK__  0xffbf  /* low 16 bits except bit 6 ('denormals are zero') */
 254
 255 #endif /* _X86_CPU_EXTENDED_STATE_H_ */