// Written in the D programming language.
/**
 * Builtin SIMD intrinsics
 *
 * Copyright: Copyright Digital Mars 2012-2020
 * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   $(HTTP digitalmars.com, Walter Bright)
 * Source:    $(DRUNTIMESRC core/_simd.d)
 */

module core.simd;
/*******************************
 * Create a vector type.
 *
 * Parameters:
 *      T = one of double[2], float[4], void[16], byte[16], ubyte[16],
 *          short[8], ushort[8], int[4], uint[4], long[2], ulong[2].
 *          For 256 bit vectors,
 *          one of double[4], float[8], void[32], byte[32], ubyte[32],
 *          short[16], ushort[16], int[8], uint[8], long[4], ulong[4]
 */
template Vector(T)
{
    /* __vector is compiler magic, hide it behind a template.
     * The compiler will reject T's that don't work.
     */
    alias __vector(T) Vector;
}
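/* A minimal usage sketch (not part of the original source): Vector is
 * normally used through the aliases below, but can be instantiated directly.
 * The element type and values here are arbitrary illustrations.
 */
static if (is(Vector!(float[4])))
unittest
{
    alias F4 = Vector!(float[4]);      // the float4 alias below names this same type
    F4 v = [1.0f, 2.0f, 3.0f, 4.0f];
    v += v;                            // element-wise add
    assert(v.array[2] == 6.0f);
}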
static if (is(Vector!(void[8])))    alias Vector!(void[8])    void8;      ///
static if (is(Vector!(double[1])))  alias Vector!(double[1])  double1;    ///
static if (is(Vector!(float[2])))   alias Vector!(float[2])   float2;     ///
static if (is(Vector!(byte[8])))    alias Vector!(byte[8])    byte8;      ///
static if (is(Vector!(ubyte[8])))   alias Vector!(ubyte[8])   ubyte8;     ///
static if (is(Vector!(short[4])))   alias Vector!(short[4])   short4;     ///
static if (is(Vector!(ushort[4])))  alias Vector!(ushort[4])  ushort4;    ///
static if (is(Vector!(int[2])))     alias Vector!(int[2])     int2;       ///
static if (is(Vector!(uint[2])))    alias Vector!(uint[2])    uint2;      ///
static if (is(Vector!(long[1])))    alias Vector!(long[1])    long1;      ///
static if (is(Vector!(ulong[1])))   alias Vector!(ulong[1])   ulong1;     ///

static if (is(Vector!(void[16])))   alias Vector!(void[16])   void16;     ///
static if (is(Vector!(double[2])))  alias Vector!(double[2])  double2;    ///
static if (is(Vector!(float[4])))   alias Vector!(float[4])   float4;     ///
static if (is(Vector!(byte[16])))   alias Vector!(byte[16])   byte16;     ///
static if (is(Vector!(ubyte[16])))  alias Vector!(ubyte[16])  ubyte16;    ///
static if (is(Vector!(short[8])))   alias Vector!(short[8])   short8;     ///
static if (is(Vector!(ushort[8])))  alias Vector!(ushort[8])  ushort8;    ///
static if (is(Vector!(int[4])))     alias Vector!(int[4])     int4;       ///
static if (is(Vector!(uint[4])))    alias Vector!(uint[4])    uint4;      ///
static if (is(Vector!(long[2])))    alias Vector!(long[2])    long2;      ///
static if (is(Vector!(ulong[2])))   alias Vector!(ulong[2])   ulong2;     ///

static if (is(Vector!(void[32])))   alias Vector!(void[32])   void32;     ///
static if (is(Vector!(double[4])))  alias Vector!(double[4])  double4;    ///
static if (is(Vector!(float[8])))   alias Vector!(float[8])   float8;     ///
static if (is(Vector!(byte[32])))   alias Vector!(byte[32])   byte32;     ///
static if (is(Vector!(ubyte[32])))  alias Vector!(ubyte[32])  ubyte32;    ///
static if (is(Vector!(short[16])))  alias Vector!(short[16])  short16;    ///
static if (is(Vector!(ushort[16]))) alias Vector!(ushort[16]) ushort16;   ///
static if (is(Vector!(int[8])))     alias Vector!(int[8])     int8;       ///
static if (is(Vector!(uint[8])))    alias Vector!(uint[8])    uint8;      ///
static if (is(Vector!(long[4])))    alias Vector!(long[4])    long4;      ///
static if (is(Vector!(ulong[4])))   alias Vector!(ulong[4])   ulong4;     ///

static if (is(Vector!(void[64])))   alias Vector!(void[64])   void64;     ///
static if (is(Vector!(double[8])))  alias Vector!(double[8])  double8;    ///
static if (is(Vector!(float[16])))  alias Vector!(float[16])  float16;    ///
static if (is(Vector!(byte[64])))   alias Vector!(byte[64])   byte64;     ///
static if (is(Vector!(ubyte[64])))  alias Vector!(ubyte[64])  ubyte64;    ///
static if (is(Vector!(short[32])))  alias Vector!(short[32])  short32;    ///
static if (is(Vector!(ushort[32]))) alias Vector!(ushort[32]) ushort32;   ///
static if (is(Vector!(int[16])))    alias Vector!(int[16])    int16;      ///
static if (is(Vector!(uint[16])))   alias Vector!(uint[16])   uint16;     ///
static if (is(Vector!(long[8])))    alias Vector!(long[8])    long8;      ///
static if (is(Vector!(ulong[8])))   alias Vector!(ulong[8])   ulong8;     ///
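/* A usage sketch (not part of the original source): the aliases above behave
 * like fixed-length value types supporting element-wise arithmetic and
 * scalar broadcast. The numbers are arbitrary illustrations.
 */
static if (is(Vector!(int[4])))
unittest
{
    int4 a = [1, 2, 3, 4];
    int4 b = 10;               // broadcast: every lane becomes 10
    int4 c = a + b;            // element-wise add, typically one SIMD instruction
    assert(c.array == [11, 12, 13, 14]);
}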
/** XMM opcodes that conform to the following:
 *
 *      opcode xmm1, xmm2/mem
 *
 * and do not have side effects (i.e. do not write to memory).
 */
enum XMM : int
{
    // Use STO and LOD instead of MOV to distinguish the direction
    // (Destination is first operand, Source is second operand)
    STOSS    = 0xF30F11,        /// MOVSS xmm1/m32, xmm2
    STOSD    = 0xF20F11,        /// MOVSD xmm1/m64, xmm2
    STOAPS   = 0x000F29,        /// MOVAPS xmm2/m128, xmm1
    STOAPD   = 0x660F29,        /// MOVAPD xmm2/m128, xmm1
    STODQA   = 0x660F7F,        /// MOVDQA xmm2/m128, xmm1
    STOD     = 0x660F7E,        /// MOVD reg/mem64, xmm   66 0F 7E /r
    STOQ     = 0x660FD6,        /// MOVQ xmm2/m64, xmm1

    LODSS    = 0xF30F10,        /// MOVSS xmm1, xmm2/m32
    LODSD    = 0xF20F10,        /// MOVSD xmm1, xmm2/m64
    LODAPS   = 0x000F28,        /// MOVAPS xmm1, xmm2/m128
    LODAPD   = 0x660F28,        /// MOVAPD xmm1, xmm2/m128
    LODDQA   = 0x660F6F,        /// MOVDQA xmm1, xmm2/m128
    LODD     = 0x660F6E,        /// MOVD xmm, reg/mem64   66 0F 6E /r
    LODQ     = 0xF30F7E,        /// MOVQ xmm1, xmm2/m64

    LODDQU   = 0xF30F6F,        /// MOVDQU xmm1, xmm2/mem128   F3 0F 6F /r
    STODQU   = 0xF30F7F,        /// MOVDQU xmm1/mem128, xmm2   F3 0F 7F /r
    MOVDQ2Q  = 0xF20FD6,        /// MOVDQ2Q mmx, xmm           F2 0F D6 /r
    MOVHLPS  = 0x0F12,          /// MOVHLPS xmm1, xmm2         0F 12 /r
    LODHPD   = 0x660F16,        /// MOVHPD xmm1, m64
    STOHPD   = 0x660F17,        /// MOVHPD mem64, xmm1         66 0F 17 /r
    LODHPS   = 0x0F16,          /// MOVHPS xmm1, m64
    STOHPS   = 0x0F17,          /// MOVHPS m64, xmm1
    MOVLHPS  = 0x0F16,          /// MOVLHPS xmm1, xmm2
    LODLPD   = 0x660F12,        /// MOVLPD xmm1, m64
    STOLPD   = 0x660F13,        /// MOVLPD m64, xmm1
    LODLPS   = 0x0F12,          /// MOVLPS xmm1, m64
    STOLPS   = 0x0F13,          /// MOVLPS m64, xmm1
    MOVMSKPD = 0x660F50,        /// MOVMSKPD reg, xmm
    MOVMSKPS = 0x0F50,          /// MOVMSKPS reg, xmm
    MOVNTDQ  = 0x660FE7,        /// MOVNTDQ m128, xmm1
    MOVNTI   = 0x0FC3,          /// MOVNTI m32, r32
    MOVNTPD  = 0x660F2B,        /// MOVNTPD m128, xmm1
    MOVNTPS  = 0x0F2B,          /// MOVNTPS m128, xmm1
    MOVNTQ   = 0x0FE7,          /// MOVNTQ m64, mm
    MOVQ2DQ  = 0xF30FD6,        /// MOVQ2DQ xmm, mmx
    LODUPD   = 0x660F10,        /// MOVUPD xmm1, xmm2/m128
    STOUPD   = 0x660F11,        /// MOVUPD xmm2/m128, xmm1
    LODUPS   = 0x0F10,          /// MOVUPS xmm1, xmm2/m128
    STOUPS   = 0x0F11,          /// MOVUPS xmm2/m128, xmm1
    PUNPCKHBW = 0x660F68,
    PUNPCKHDQ = 0x660F6A,
    PUNPCKHWD = 0x660F69,
    PUNPCKLBW = 0x660F60,
    PUNPCKLDQ = 0x660F62,
    PUNPCKLWD = 0x660F61,
    CVTTPD2PI  = 0x660F2C,
    CVTTPD2DQ  = 0x660FE6,
    CVTTPS2DQ  = 0xF30F5B,
    CVTTSD2SI  = 0xF20F2C,
    CVTTSS2SI  = 0xF30F2C,
    MASKMOVDQU = 0x660FF7,
    //PMOVMSKB = 0x660FD7,
    PUNPCKHQDQ = 0x660F6D,
    PUNPCKLQDQ = 0x660F6C,
    // SSE3 Pentium 4 (Prescott)

    // SSSE3
    PALIGNR   = 0x660F3A0F,
    PHADDSW   = 0x660F3803,
    PMADDUBSW = 0x660F3804,
    PMULHRSW  = 0x660F380B,
    PHSUBSW   = 0x660F3807,
    // SSE4.1
    BLENDPD   = 0x660F3A0D,
    BLENDPS   = 0x660F3A0C,
    BLENDVPD  = 0x660F3815,
    BLENDVPS  = 0x660F3814,
    EXTRACTPS = 0x660F3A17,
    INSERTPS  = 0x660F3A21,
    MPSADBW   = 0x660F3A42,
    PBLENDVB  = 0x660F3810,
    PBLENDW   = 0x660F3A0E,
    MOVNTDQA   = 0x660F382A,
    PACKUSDW   = 0x660F382B,
    PCMPEQQ    = 0x660F3829,
    PHMINPOSUW = 0x660F3841,
    PMOVSXBW = 0x660F3820,
    PMOVSXBD = 0x660F3821,
    PMOVSXBQ = 0x660F3822,
    PMOVSXWD = 0x660F3823,
    PMOVSXWQ = 0x660F3824,
    PMOVSXDQ = 0x660F3825,
    PMOVZXBW = 0x660F3830,
    PMOVZXBD = 0x660F3831,
    PMOVZXBQ = 0x660F3832,
    PMOVZXWD = 0x660F3833,
    PMOVZXWQ = 0x660F3834,
    PMOVZXDQ = 0x660F3835,
    ROUNDPD = 0x660F3A09,
    ROUNDPS = 0x660F3A08,
    ROUNDSD = 0x660F3A0B,
    ROUNDSS = 0x660F3A0A,
    // SSE4.2
    PCMPESTRI = 0x660F3A61,
    PCMPESTRM = 0x660F3A60,
    PCMPISTRI = 0x660F3A63,
    PCMPISTRM = 0x660F3A62,
    PCMPGTQ   = 0x660F3837,
    // EXTRQ, INSERTQ, MOVNTSD, MOVNTSS

    // POPCNT and LZCNT (have their own CPUID bits)
}
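/* A sketch (not part of the original source) of how the XMM enum values are
 * laid out: each constant packs the instruction's mandatory prefix and opcode
 * bytes into an int, so the compiler can emit them directly. LODDQA is chosen
 * here purely for illustration.
 */
unittest
{
    // MOVDQA xmm1, xmm2/m128 is encoded as 66 0F 6F /r, and
    // XMM.LODDQA stores exactly those bytes as the integer 0x660F6F.
    static assert((XMM.LODDQA >> 16 & 0xFF) == 0x66);  // operand-size prefix
    static assert((XMM.LODDQA >>  8 & 0xFF) == 0x0F);  // two-byte opcode escape
    static assert((XMM.LODDQA       & 0xFF) == 0x6F);  // opcode byte
}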
/****
 * Generate a two-operand instruction with XMM 128-bit operands.
 *
 * This is a compiler magic function - it doesn't behave like
 * regular D functions.
 *
 * Parameters:
 *      opcode = any of the XMM opcodes; it must be a compile time constant
 *      op1 = first operand
 *      op2 = second operand
 * Returns:
 *      result of opcode
 * Example:
 * ---
 * import core.simd;
 * import core.stdc.stdio;
 *
 * void main()
 * {
 *     float4 A = [2.34f, -70000.0f, 0.00001f, 345.5f];
 *     float4 R = A;
 *     R = cast(float4) __simd(XMM.RCPSS, R, A);
 *     printf("%g %g %g %g\n", R.array[0], R.array[1], R.array[2], R.array[3]);
 * }
 * ---
 * Prints `0.427368 -70000 1e-05 345.5`.
 * The two-operand form is necessary for `XMM.RCPSS` because the result of the
 * instruction contains elements of both operands.
 * Example:
 * ---
 * double[2] A = [56.0, -75.0];
 * double2 R = cast(double2) __simd(XMM.LODUPD, *cast(double2*) A.ptr);
 * ---
 * The cast to `double2*` is necessary because the type of `*A.ptr` is `double`.
 */
pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2);
///
unittest
{
    float4 a;
    a = cast(float4)__simd(XMM.PXOR, a, a);   // PXOR a,a zeroes every lane
}
/****
 * Unary SIMD instructions.
 */
pure @safe void16 __simd(XMM opcode, void16 op1);
pure @safe void16 __simd(XMM opcode, double d);   ///
pure @safe void16 __simd(XMM opcode, float f);    ///
///
unittest
{
    float4 a;
    a = cast(float4)__simd(XMM.LODSS, a);
}
/****
 * For instructions:
 * CMPPD, CMPSS, CMPSD, CMPPS,
 * PSHUFD, PSHUFHW, PSHUFLW,
 * BLENDPD, BLENDPS, DPPD, DPPS,
 * ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS
 *
 * Parameters:
 *      opcode = any of the above XMM opcodes; it must be a compile time constant
 *      op1 = first operand
 *      op2 = second operand
 *      imm8 = third operand; must be a compile time constant
 * Returns:
 *      result of opcode
 */
pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2, ubyte imm8);
///
unittest
{
    float4 a;
    a = cast(float4)__simd(XMM.CMPPD, a, a, 0x7A);
}
/****
 * For instructions with the imm8 version:
 * PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW
 *
 * Parameters:
 *      opcode = any of the XMM opcodes; it must be a compile time constant
 *      op1 = first operand
 *      imm8 = second operand; must be a compile time constant
 * Returns:
 *      result of opcode
 */
pure @safe void16 __simd_ib(XMM opcode, void16 op1, ubyte imm8);
///
unittest
{
    float4 a;
    a = cast(float4) __simd_ib(XMM.PSRLQ, a, 0x7A);
}
/*****
 * For "store" operations of the form:
 *      op1 op= op2
 * Returns:
 *      op2
 * These cannot be marked as pure, as semantic() doesn't check them.
 */
@safe void16 __simd_sto(XMM opcode, void16 op1, void16 op2);
@safe void16 __simd_sto(XMM opcode, double op1, void16 op2);   ///
@safe void16 __simd_sto(XMM opcode, float op1, void16 op2);    ///
@safe void16 __simd_sto(XMM opcode, void16 op1, long op2);     ///
///
unittest
{
    void16 a;
    float f = 1;
    double d = 1;

    cast(void)__simd_sto(XMM.STOUPS, a, a);
    cast(void)__simd_sto(XMM.STOUPS, f, a);
    cast(void)__simd_sto(XMM.STOUPS, d, a);
}
/* The following use overloading to ensure correct typing.
 * Compile with inlining on for best performance.
 */
pure @safe short8 pcmpeq()(short8 v1, short8 v2)
{
    return cast(short8)__simd(XMM.PCMPEQW, v1, v2);
}

pure @safe ushort8 pcmpeq()(ushort8 v1, ushort8 v2)
{
    return cast(ushort8)__simd(XMM.PCMPEQW, v1, v2);
}
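/* A usage sketch (not part of the original source): PCMPEQW compares word
 * lanes for equality, producing all-ones (0xFFFF) in lanes that match and
 * zero in lanes that don't. The input values are arbitrary illustrations.
 */
static if (is(Vector!(short[8])))
unittest
{
    short8 a = [1, 2, 3, 4, 5, 6, 7, 8];
    short8 b = [1, 0, 3, 0, 5, 0, 7, 0];
    short8 r = pcmpeq(a, b);
    assert(r.array[0] == cast(short) 0xFFFF);  // equal lane: all bits set
    assert(r.array[1] == 0);                   // unequal lane: all bits clear
}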
/*********************
 * Emit prefetch instruction.
 *
 * Parameters:
 *      address = address to be prefetched
 *      writeFetch = true for write fetch, false for read fetch
 *      locality = 0..3 (0 meaning least local, 3 meaning most local)
 * Note:
 *      The Intel mappings are:
 *      $(TABLE
 *      $(THEAD writeFetch, locality, Instruction)
 *      $(TROW false, 0, prefetchnta)
 *      $(TROW false, 1, prefetcht2)
 *      $(TROW false, 2, prefetcht1)
 *      $(TROW false, 3, prefetcht0)
 *      $(TROW true, 0, prefetchw)
 *      $(TROW true, 1, prefetchw)
 *      $(TROW true, 2, prefetchw)
 *      $(TROW true, 3, prefetchw)
 *      )
 */
void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
{
    static if (writeFetch)
        __prefetch(address, 4);
    else static if (locality < 4)
        __prefetch(address, 3 - locality);
    else
        static assert(0, "0..3 expected for locality");
}
private void __prefetch(const(void*) address, ubyte encoding);
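/* A usage sketch (not part of the original source): prefetch the next chunk
 * of an array while summing the current one. The chunk size of 16 floats
 * (64 bytes, one typical cache line) is an assumption, not something the
 * API prescribes.
 */
unittest
{
    float[256] data = 0;
    float sum = 0;
    foreach (i; 0 .. data.length)
    {
        if (i % 16 == 0 && i + 16 < data.length)
            prefetch!(false, 3)(&data[i + 16]);  // read fetch, most local
        sum += data[i];
    }
}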
/*************************************
 * Load unaligned vector from address.
 * This is a compiler intrinsic.
 *
 * Parameters:
 *      p = pointer to vector
 * Returns:
 *      vector
 */
V loadUnaligned(V)(const V* p)
    if (is(V == void16) ||
        is(V == byte16) ||
        is(V == ubyte16) ||
        is(V == short8) ||
        is(V == ushort8) ||
        is(V == int4) ||
        is(V == uint4) ||
        is(V == long2) ||
        is(V == ulong2) ||
        is(V == double2) ||
        is(V == float4))
{
    pragma(inline, true);
    static if (is(V == double2))
        return cast(V)__simd(XMM.LODUPD, *cast(const void16*)p);
    else static if (is(V == float4))
        return cast(V)__simd(XMM.LODUPS, *cast(const void16*)p);
    else
        return cast(V)__simd(XMM.LODDQU, *cast(const void16*)p);
}
@system unittest
{
    // Memory to load into the vector:
    // Should have enough data to test all 16-byte alignments, and still
    // have room for a 16-byte vector
    ubyte[32] data;
    foreach (i; 0 .. data.length)
    {
        data[i] = cast(ubyte) i;
    }

    // to test all alignments from 1 ~ 16
    foreach (i; 1 .. 17)
    {
        ubyte* d = &data[i];

        void testLoad(T)()
        {
            T v = loadUnaligned(cast(T*) d);

            // check that the data was loaded correctly
            ubyte* ptrToV = cast(ubyte*) &v;
            foreach (j; 0 .. T.sizeof)
            {
                assert(ptrToV[j] == d[j]);
            }
        }

        // exercise representative element types
        static if (is(Vector!(void[16])))  testLoad!void16();
        static if (is(Vector!(double[2]))) testLoad!double2();
        static if (is(Vector!(float[4])))  testLoad!float4();
    }
}
/*************************************
 * Store vector to unaligned address.
 * This is a compiler intrinsic.
 *
 * Parameters:
 *      p = pointer to vector
 *      value = value to store
 * Returns:
 *      value
 */
V storeUnaligned(V)(V* p, V value)
    if (is(V == void16) ||
        is(V == byte16) ||
        is(V == ubyte16) ||
        is(V == short8) ||
        is(V == ushort8) ||
        is(V == int4) ||
        is(V == uint4) ||
        is(V == long2) ||
        is(V == ulong2) ||
        is(V == double2) ||
        is(V == float4))
{
    pragma(inline, true);
    static if (is(V == double2))
        return cast(V)__simd_sto(XMM.STOUPD, *cast(void16*)p, value);
    else static if (is(V == float4))
        return cast(V)__simd_sto(XMM.STOUPS, *cast(void16*)p, value);
    else
        return cast(V)__simd_sto(XMM.STODQU, *cast(void16*)p, value);
}
@system unittest
{
    // Memory to store the vector to:
    // Should have enough data to test all 16-byte alignments, and still
    // have room for a 16-byte vector
    ubyte[32] data;

    // to test all alignments from 1 ~ 16
    foreach (i; 1 .. 17)
    {
        ubyte* d = &data[i];

        void testStore(T)()
        {
            // populate `v` with data
            T v;
            ubyte* ptrToV = cast(ubyte*) &v;
            foreach (j; 0 .. T.sizeof)
            {
                ptrToV[j] = cast(ubyte) j;
            }

            // store `v` to location pointed to by `d`
            storeUnaligned(cast(T*) d, v);

            // check that the data was stored correctly
            foreach (j; 0 .. T.sizeof)
            {
                assert(ptrToV[j] == d[j]);
            }
        }

        // exercise representative element types
        static if (is(Vector!(void[16])))  testStore!void16();
        static if (is(Vector!(double[2]))) testStore!double2();
        static if (is(Vector!(float[4])))  testStore!float4();
    }
}
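/* A round-trip sketch (not part of the original source) combining the two
 * intrinsics: load a vector from an address that is generally not 16-byte
 * aligned, operate on it, and store it back. The one-element offset is an
 * arbitrary misalignment chosen for illustration.
 */
@system unittest
{
    static if (is(Vector!(float[4])))
    {
        float[8] buf = [1, 2, 3, 4, 5, 6, 7, 8];
        auto p = cast(float4*) (buf.ptr + 1);  // generally not 16-byte aligned

        float4 v = loadUnaligned(p);   // v = [2, 3, 4, 5]
        v += v;                        // double each lane
        storeUnaligned(p, v);          // write back: buf[1 .. 5] are doubled
        assert(buf[1] == 4 && buf[4] == 10);
    }
}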