// Written in the D programming language.
/**
 * Builtin SIMD intrinsics
 *
 * Copyright: Copyright Digital Mars 2012-2020
 * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   $(HTTP digitalmars.com, Walter Bright)
 * Source:    $(DRUNTIMESRC core/_simd.d)
 */

module core.simd;
/*******************************
 * Create a vector type.
 *
 * Parameters:
 *      T = one of double[2], float[4], void[16], byte[16], ubyte[16],
 *          short[8], ushort[8], int[4], uint[4], long[2], ulong[2].
 *          For 256 bit vectors,
 *          one of double[4], float[8], void[32], byte[32], ubyte[32],
 *          short[16], ushort[16], int[8], uint[8], long[4], ulong[4]
 */
template Vector(T)
{
    /* __vector is compiler magic, hide it behind a template.
     * The compiler will reject T's that don't work.
     */
    alias __vector(T) Vector;
}
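/* A minimal usage sketch (not part of the original source): Vector is
 * normally used through the aliases below, but can be instantiated directly.
 * The element type and values here are arbitrary illustrations.
 */
static if (is(Vector!(float[4])))
unittest
{
    alias F4 = Vector!(float[4]);      // the float4 alias below names this same type
    F4 v = [1.0f, 2.0f, 3.0f, 4.0f];
    v += v;                            // element-wise add
    assert(v.array[2] == 6.0f);
}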
static if (is(Vector!(void[8])))    alias Vector!(void[8])    void8;      ///
static if (is(Vector!(double[1])))  alias Vector!(double[1])  double1;    ///
static if (is(Vector!(float[2])))   alias Vector!(float[2])   float2;     ///
static if (is(Vector!(byte[8])))    alias Vector!(byte[8])    byte8;      ///
static if (is(Vector!(ubyte[8])))   alias Vector!(ubyte[8])   ubyte8;     ///
static if (is(Vector!(short[4])))   alias Vector!(short[4])   short4;     ///
static if (is(Vector!(ushort[4])))  alias Vector!(ushort[4])  ushort4;    ///
static if (is(Vector!(int[2])))     alias Vector!(int[2])     int2;       ///
static if (is(Vector!(uint[2])))    alias Vector!(uint[2])    uint2;      ///
static if (is(Vector!(long[1])))    alias Vector!(long[1])    long1;      ///
static if (is(Vector!(ulong[1])))   alias Vector!(ulong[1])   ulong1;     ///

static if (is(Vector!(void[16])))   alias Vector!(void[16])   void16;     ///
static if (is(Vector!(double[2])))  alias Vector!(double[2])  double2;    ///
static if (is(Vector!(float[4])))   alias Vector!(float[4])   float4;     ///
static if (is(Vector!(byte[16])))   alias Vector!(byte[16])   byte16;     ///
static if (is(Vector!(ubyte[16])))  alias Vector!(ubyte[16])  ubyte16;    ///
static if (is(Vector!(short[8])))   alias Vector!(short[8])   short8;     ///
static if (is(Vector!(ushort[8])))  alias Vector!(ushort[8])  ushort8;    ///
static if (is(Vector!(int[4])))     alias Vector!(int[4])     int4;       ///
static if (is(Vector!(uint[4])))    alias Vector!(uint[4])    uint4;      ///
static if (is(Vector!(long[2])))    alias Vector!(long[2])    long2;      ///
static if (is(Vector!(ulong[2])))   alias Vector!(ulong[2])   ulong2;     ///

static if (is(Vector!(void[32])))   alias Vector!(void[32])   void32;     ///
static if (is(Vector!(double[4])))  alias Vector!(double[4])  double4;    ///
static if (is(Vector!(float[8])))   alias Vector!(float[8])   float8;     ///
static if (is(Vector!(byte[32])))   alias Vector!(byte[32])   byte32;     ///
static if (is(Vector!(ubyte[32])))  alias Vector!(ubyte[32])  ubyte32;    ///
static if (is(Vector!(short[16])))  alias Vector!(short[16])  short16;    ///
static if (is(Vector!(ushort[16]))) alias Vector!(ushort[16]) ushort16;   ///
static if (is(Vector!(int[8])))     alias Vector!(int[8])     int8;       ///
static if (is(Vector!(uint[8])))    alias Vector!(uint[8])    uint8;      ///
static if (is(Vector!(long[4])))    alias Vector!(long[4])    long4;      ///
static if (is(Vector!(ulong[4])))   alias Vector!(ulong[4])   ulong4;     ///

static if (is(Vector!(void[64])))   alias Vector!(void[64])   void64;     ///
static if (is(Vector!(double[8])))  alias Vector!(double[8])  double8;    ///
static if (is(Vector!(float[16])))  alias Vector!(float[16])  float16;    ///
static if (is(Vector!(byte[64])))   alias Vector!(byte[64])   byte64;     ///
static if (is(Vector!(ubyte[64])))  alias Vector!(ubyte[64])  ubyte64;    ///
static if (is(Vector!(short[32])))  alias Vector!(short[32])  short32;    ///
static if (is(Vector!(ushort[32]))) alias Vector!(ushort[32]) ushort32;   ///
static if (is(Vector!(int[16])))    alias Vector!(int[16])    int16;      ///
static if (is(Vector!(uint[16])))   alias Vector!(uint[16])   uint16;     ///
static if (is(Vector!(long[8])))    alias Vector!(long[8])    long8;      ///
static if (is(Vector!(ulong[8])))   alias Vector!(ulong[8])   ulong8;     ///
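/* A usage sketch (not part of the original source): the aliases above behave
 * like fixed-length value types supporting element-wise arithmetic and
 * scalar broadcast. The numbers are arbitrary illustrations.
 */
static if (is(Vector!(int[4])))
unittest
{
    int4 a = [1, 2, 3, 4];
    int4 b = 10;               // broadcast: every lane becomes 10
    int4 c = a + b;            // element-wise add, typically one SIMD instruction
    assert(c.array == [11, 12, 13, 14]);
}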
/** XMM opcodes that conform to the following:
 *
 *      opcode xmm1, xmm2/mem
 *
 * and do not have side effects (i.e. do not write to memory).
 */
enum XMM : int
{
    // Use STO and LOD instead of MOV to distinguish the direction
    // (Destination is first operand, Source is second operand)
    STOSS    = 0xF30F11,        /// MOVSS xmm1/m32, xmm2
    STOSD    = 0xF20F11,        /// MOVSD xmm1/m64, xmm2
    STOAPS   = 0x000F29,        /// MOVAPS xmm2/m128, xmm1
    STOAPD   = 0x660F29,        /// MOVAPD xmm2/m128, xmm1
    STODQA   = 0x660F7F,        /// MOVDQA xmm2/m128, xmm1
    STOD     = 0x660F7E,        /// MOVD reg/mem64, xmm   66 0F 7E /r
    STOQ     = 0x660FD6,        /// MOVQ xmm2/m64, xmm1

    LODSS    = 0xF30F10,        /// MOVSS xmm1, xmm2/m32
    LODSD    = 0xF20F10,        /// MOVSD xmm1, xmm2/m64
    LODAPS   = 0x000F28,        /// MOVAPS xmm1, xmm2/m128
    LODAPD   = 0x660F28,        /// MOVAPD xmm1, xmm2/m128
    LODDQA   = 0x660F6F,        /// MOVDQA xmm1, xmm2/m128
    LODD     = 0x660F6E,        /// MOVD xmm, reg/mem64   66 0F 6E /r
    LODQ     = 0xF30F7E,        /// MOVQ xmm1, xmm2/m64

    LODDQU   = 0xF30F6F,        /// MOVDQU xmm1, xmm2/mem128   F3 0F 6F /r
    STODQU   = 0xF30F7F,        /// MOVDQU xmm1/mem128, xmm2   F3 0F 7F /r
    MOVDQ2Q  = 0xF20FD6,        /// MOVDQ2Q mmx, xmm           F2 0F D6 /r
    MOVHLPS  = 0x0F12,          /// MOVHLPS xmm1, xmm2         0F 12 /r
    LODHPD   = 0x660F16,        /// MOVHPD xmm1, m64
    STOHPD   = 0x660F17,        /// MOVHPD mem64, xmm1         66 0F 17 /r
    LODHPS   = 0x0F16,          /// MOVHPS xmm1, m64
    STOHPS   = 0x0F17,          /// MOVHPS m64, xmm1
    MOVLHPS  = 0x0F16,          /// MOVLHPS xmm1, xmm2
    LODLPD   = 0x660F12,        /// MOVLPD xmm1, m64
    STOLPD   = 0x660F13,        /// MOVLPD m64, xmm1
    LODLPS   = 0x0F12,          /// MOVLPS xmm1, m64
    STOLPS   = 0x0F13,          /// MOVLPS m64, xmm1
    MOVMSKPD = 0x660F50,        /// MOVMSKPD reg, xmm
    MOVMSKPS = 0x0F50,          /// MOVMSKPS reg, xmm
    MOVNTDQ  = 0x660FE7,        /// MOVNTDQ m128, xmm1
    MOVNTI   = 0x0FC3,          /// MOVNTI m32, r32
    MOVNTPD  = 0x660F2B,        /// MOVNTPD m128, xmm1
    MOVNTPS  = 0x0F2B,          /// MOVNTPS m128, xmm1
    MOVNTQ   = 0x0FE7,          /// MOVNTQ m64, mm
    MOVQ2DQ  = 0xF30FD6,        /// MOVQ2DQ xmm, mmx
    LODUPD   = 0x660F10,        /// MOVUPD xmm1, xmm2/m128
    STOUPD   = 0x660F11,        /// MOVUPD xmm2/m128, xmm1
    LODUPS   = 0x0F10,          /// MOVUPS xmm1, xmm2/m128
    STOUPS   = 0x0F11,          /// MOVUPS xmm2/m128, xmm1
    PUNPCKHBW = 0x660F68,
    PUNPCKHDQ = 0x660F6A,
    PUNPCKHWD = 0x660F69,
    PUNPCKLBW = 0x660F60,
    PUNPCKLDQ = 0x660F62,
    PUNPCKLWD = 0x660F61,
    CVTTPD2PI  = 0x660F2C,
    CVTTPD2DQ  = 0x660FE6,
    CVTTPS2DQ  = 0xF30F5B,
    CVTTSD2SI  = 0xF20F2C,
    CVTTSS2SI  = 0xF30F2C,
    MASKMOVDQU = 0x660FF7,
    //PMOVMSKB = 0x660FD7,
    PUNPCKHQDQ = 0x660F6D,
    PUNPCKLQDQ = 0x660F6C,
    // SSE3 Pentium 4 (Prescott)

    // SSSE3
    PALIGNR   = 0x660F3A0F,
    PHADDSW   = 0x660F3803,
    PMADDUBSW = 0x660F3804,
    PMULHRSW  = 0x660F380B,
    PHSUBSW   = 0x660F3807,
    // SSE4.1
    BLENDPD   = 0x660F3A0D,
    BLENDPS   = 0x660F3A0C,
    BLENDVPD  = 0x660F3815,
    BLENDVPS  = 0x660F3814,
    EXTRACTPS = 0x660F3A17,
    INSERTPS  = 0x660F3A21,
    MPSADBW   = 0x660F3A42,
    PBLENDVB  = 0x660F3810,
    PBLENDW   = 0x660F3A0E,
    MOVNTDQA   = 0x660F382A,
    PACKUSDW   = 0x660F382B,
    PCMPEQQ    = 0x660F3829,
    PHMINPOSUW = 0x660F3841,
    PMOVSXBW = 0x660F3820,
    PMOVSXBD = 0x660F3821,
    PMOVSXBQ = 0x660F3822,
    PMOVSXWD = 0x660F3823,
    PMOVSXWQ = 0x660F3824,
    PMOVSXDQ = 0x660F3825,
    PMOVZXBW = 0x660F3830,
    PMOVZXBD = 0x660F3831,
    PMOVZXBQ = 0x660F3832,
    PMOVZXWD = 0x660F3833,
    PMOVZXWQ = 0x660F3834,
    PMOVZXDQ = 0x660F3835,
    ROUNDPD = 0x660F3A09,
    ROUNDPS = 0x660F3A08,
    ROUNDSD = 0x660F3A0B,
    ROUNDSS = 0x660F3A0A,
    // SSE4.2
    PCMPESTRI = 0x660F3A61,
    PCMPESTRM = 0x660F3A60,
    PCMPISTRI = 0x660F3A63,
    PCMPISTRM = 0x660F3A62,
    PCMPGTQ   = 0x660F3837,
    // EXTRQ, INSERTQ, MOVNTSD, MOVNTSS

    // POPCNT and LZCNT (have their own CPUID bits)
}
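/* A sketch (not part of the original source) of how the XMM enum values are
 * laid out: each constant packs the instruction's mandatory prefix and opcode
 * bytes into an int, so the compiler can emit them directly. LODDQA is chosen
 * here purely for illustration.
 */
unittest
{
    // MOVDQA xmm1, xmm2/m128 is encoded as 66 0F 6F /r, and
    // XMM.LODDQA stores exactly those bytes as the integer 0x660F6F.
    static assert((XMM.LODDQA >> 16 & 0xFF) == 0x66);  // operand-size prefix
    static assert((XMM.LODDQA >>  8 & 0xFF) == 0x0F);  // two-byte opcode escape
    static assert((XMM.LODDQA       & 0xFF) == 0x6F);  // opcode byte
}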
/****
 * Generate a two-operand instruction with XMM 128-bit operands.
 *
 * This is a compiler magic function - it doesn't behave like
 * regular D functions.
 *
 * Parameters:
 *      opcode = any of the XMM opcodes; it must be a compile time constant
 *      op1 = first operand
 *      op2 = second operand
 * Returns:
 *      result of opcode
 * Example:
 * ---
 * import core.simd;
 * import core.stdc.stdio;
 *
 * void main()
 * {
 *     float4 A = [2.34f, -70000.0f, 0.00001f, 345.5f];
 *     float4 R = A;
 *     R = cast(float4) __simd(XMM.RCPSS, R, A);
 *     printf("%g %g %g %g\n", R.array[0], R.array[1], R.array[2], R.array[3]);
 * }
 * ---
 * Prints `0.427368 -70000 1e-05 345.5`.
 * The two-operand form is necessary for `XMM.RCPSS` because the result of the
 * instruction contains elements of both operands.
 * Example:
 * ---
 * double[2] A = [56.0, -75.0];
 * double2 R = cast(double2) __simd(XMM.LODUPD, *cast(double2*) A.ptr);
 * ---
 * The cast to `double2*` is necessary because the type of `*A.ptr` is `double`.
 */
pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2);
///
unittest
{
    float4 a;
    a = cast(float4)__simd(XMM.PXOR, a, a);   // PXOR a,a zeroes every lane
}
/****
 * Unary SIMD instructions.
 */
pure @safe void16 __simd(XMM opcode, void16 op1);
pure @safe void16 __simd(XMM opcode, double d);   ///
pure @safe void16 __simd(XMM opcode, float f);    ///
///
unittest
{
    float4 a;
    a = cast(float4)__simd(XMM.LODSS, a);
}
/****
 * For instructions:
 * CMPPD, CMPSS, CMPSD, CMPPS,
 * PSHUFD, PSHUFHW, PSHUFLW,
 * BLENDPD, BLENDPS, DPPD, DPPS,
 * ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS
 *
 * Parameters:
 *      opcode = any of the above XMM opcodes; it must be a compile time constant
 *      op1 = first operand
 *      op2 = second operand
 *      imm8 = third operand; must be a compile time constant
 * Returns:
 *      result of opcode
 */
pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2, ubyte imm8);
///
unittest
{
    float4 a;
    a = cast(float4)__simd(XMM.CMPPD, a, a, 0x7A);
}
/****
 * For instructions with the imm8 version:
 * PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW
 *
 * Parameters:
 *      opcode = any of the XMM opcodes; it must be a compile time constant
 *      op1 = first operand
 *      imm8 = second operand; must be a compile time constant
 * Returns:
 *      result of opcode
 */
pure @safe void16 __simd_ib(XMM opcode, void16 op1, ubyte imm8);
///
unittest
{
    float4 a;
    a = cast(float4) __simd_ib(XMM.PSRLQ, a, 0x7A);
}
/*****
 * For "store" operations of the form:
 *      op1 op= op2
 * Returns:
 *      op2
 * These cannot be marked as pure, as semantic() doesn't check them.
 */
@safe void16 __simd_sto(XMM opcode, void16 op1, void16 op2);
@safe void16 __simd_sto(XMM opcode, double op1, void16 op2);   ///
@safe void16 __simd_sto(XMM opcode, float op1, void16 op2);    ///
@safe void16 __simd_sto(XMM opcode, void16 op1, long op2);     ///
///
unittest
{
    void16 a;
    float f = 1;
    double d = 1;

    cast(void)__simd_sto(XMM.STOUPS, a, a);
    cast(void)__simd_sto(XMM.STOUPS, f, a);
    cast(void)__simd_sto(XMM.STOUPS, d, a);
}
/* The following use overloading to ensure correct typing.
 * Compile with inlining on for best performance.
 */
pure @safe short8 pcmpeq()(short8 v1, short8 v2)
{
    return cast(short8)__simd(XMM.PCMPEQW, v1, v2);
}

pure @safe ushort8 pcmpeq()(ushort8 v1, ushort8 v2)
{
    return cast(ushort8)__simd(XMM.PCMPEQW, v1, v2);
}
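/* A usage sketch (not part of the original source): PCMPEQW compares word
 * lanes for equality, producing all-ones (0xFFFF) in lanes that match and
 * zero in lanes that don't. The input values are arbitrary illustrations.
 */
static if (is(Vector!(short[8])))
unittest
{
    short8 a = [1, 2, 3, 4, 5, 6, 7, 8];
    short8 b = [1, 0, 3, 0, 5, 0, 7, 0];
    short8 r = pcmpeq(a, b);
    assert(r.array[0] == cast(short) 0xFFFF);  // equal lane: all bits set
    assert(r.array[1] == 0);                   // unequal lane: all bits clear
}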
/*********************
 * Emit prefetch instruction.
 *
 * Parameters:
 *      address = address to be prefetched
 *      writeFetch = true for write fetch, false for read fetch
 *      locality = 0..3 (0 meaning least local, 3 meaning most local)
 * Note:
 *      The Intel mappings are:
 *      $(TABLE
 *      $(THEAD writeFetch, locality, Instruction)
 *      $(TROW false, 0, prefetchnta)
 *      $(TROW false, 1, prefetcht2)
 *      $(TROW false, 2, prefetcht1)
 *      $(TROW false, 3, prefetcht0)
 *      $(TROW true, 0, prefetchw)
 *      $(TROW true, 1, prefetchw)
 *      $(TROW true, 2, prefetchw)
 *      $(TROW true, 3, prefetchw)
 *      )
 */
void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
{
    static if (writeFetch)
        __prefetch(address, 4);
    else static if (locality < 4)
        __prefetch(address, 3 - locality);
    else
        static assert(0, "0..3 expected for locality");
}
private void __prefetch(const(void*) address, ubyte encoding);
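/* A usage sketch (not part of the original source): prefetch the next chunk
 * of an array while summing the current one. The chunk size of 16 floats
 * (64 bytes, one typical cache line) is an assumption, not something the
 * API prescribes.
 */
unittest
{
    float[256] data = 0;
    float sum = 0;
    foreach (i; 0 .. data.length)
    {
        if (i % 16 == 0 && i + 16 < data.length)
            prefetch!(false, 3)(&data[i + 16]);  // read fetch, most local
        sum += data[i];
    }
}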
/*************************************
 * Load unaligned vector from address.
 * This is a compiler intrinsic.
 *
 * Parameters:
 *      p = pointer to vector
 * Returns:
 *      vector
 */
V loadUnaligned(V)(const V* p)
    if (is(V == void16) ||
        is(V == byte16) ||
        is(V == ubyte16) ||
        is(V == short8) ||
        is(V == ushort8) ||
        is(V == int4) ||
        is(V == uint4) ||
        is(V == long2) ||
        is(V == ulong2) ||
        is(V == double2) ||
        is(V == float4))
{
    pragma(inline, true);
    static if (is(V == double2))
        return cast(V)__simd(XMM.LODUPD, *cast(const void16*)p);
    else static if (is(V == float4))
        return cast(V)__simd(XMM.LODUPS, *cast(const void16*)p);
    else
        return cast(V)__simd(XMM.LODDQU, *cast(const void16*)p);
}
@system unittest
{
    // Memory to load into the vector:
    // Should have enough data to test all 16-byte alignments, and still
    // have room for a 16-byte vector
    ubyte[32] data;
    foreach (i; 0 .. data.length)
    {
        data[i] = cast(ubyte) i;
    }

    // to test all alignments from 1 ~ 16
    foreach (i; 1 .. 17)
    {
        ubyte* d = &data[i];

        void testLoad(T)()
        {
            T v = loadUnaligned(cast(T*) d);

            // check that the data was loaded correctly
            ubyte* ptrToV = cast(ubyte*) &v;
            foreach (j; 0 .. T.sizeof)
            {
                assert(ptrToV[j] == d[j]);
            }
        }

        // exercise representative element types
        static if (is(Vector!(void[16])))  testLoad!void16();
        static if (is(Vector!(double[2]))) testLoad!double2();
        static if (is(Vector!(float[4])))  testLoad!float4();
    }
}
/*************************************
 * Store vector to unaligned address.
 * This is a compiler intrinsic.
 *
 * Parameters:
 *      p = pointer to vector
 *      value = value to store
 * Returns:
 *      value
 */
V storeUnaligned(V)(V* p, V value)
    if (is(V == void16) ||
        is(V == byte16) ||
        is(V == ubyte16) ||
        is(V == short8) ||
        is(V == ushort8) ||
        is(V == int4) ||
        is(V == uint4) ||
        is(V == long2) ||
        is(V == ulong2) ||
        is(V == double2) ||
        is(V == float4))
{
    pragma(inline, true);
    static if (is(V == double2))
        return cast(V)__simd_sto(XMM.STOUPD, *cast(void16*)p, value);
    else static if (is(V == float4))
        return cast(V)__simd_sto(XMM.STOUPS, *cast(void16*)p, value);
    else
        return cast(V)__simd_sto(XMM.STODQU, *cast(void16*)p, value);
}
@system unittest
{
    // Memory to store the vector to:
    // Should have enough data to test all 16-byte alignments, and still
    // have room for a 16-byte vector
    ubyte[32] data;

    // to test all alignments from 1 ~ 16
    foreach (i; 1 .. 17)
    {
        ubyte* d = &data[i];

        void testStore(T)()
        {
            // populate `v` with data
            T v;
            ubyte* ptrToV = cast(ubyte*) &v;
            foreach (j; 0 .. T.sizeof)
            {
                ptrToV[j] = cast(ubyte) j;
            }

            // store `v` to location pointed to by `d`
            storeUnaligned(cast(T*) d, v);

            // check that the data was stored correctly
            foreach (j; 0 .. T.sizeof)
            {
                assert(ptrToV[j] == d[j]);
            }
        }

        // exercise representative element types
        static if (is(Vector!(void[16])))  testStore!void16();
        static if (is(Vector!(double[2]))) testStore!double2();
        static if (is(Vector!(float[4])))  testStore!float4();
    }
}
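/* A round-trip sketch (not part of the original source) combining the two
 * intrinsics: load a vector from an address that is generally not 16-byte
 * aligned, operate on it, and store it back. The one-element offset is an
 * arbitrary misalignment chosen for illustration.
 */
@system unittest
{
    static if (is(Vector!(float[4])))
    {
        float[8] buf = [1, 2, 3, 4, 5, 6, 7, 8];
        auto p = cast(float4*) (buf.ptr + 1);  // generally not 16-byte aligned

        float4 v = loadUnaligned(p);   // v = [2, 3, 4, 5]
        v += v;                        // double each lane
        storeUnaligned(p, v);          // write back: buf[1 .. 5] are doubled
        assert(buf[1] == 4 && buf[4] == 10);
    }
}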