1 // This file is part of the ustl library, an STL implementation.
3 // Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
4 // This file is free software, distributed under the MIT License.
7 /// \brief SIMD-type algorithms, with hardware acceleration, if available.
9 /// All algorithms are container-based because iterator syntax is just too
10 /// damn verbose and because the specializations need to be able to tell
11 /// how many elements are in the container in order to choose proper SIMD
12 /// instruction set (e.g., 4 floats select SSE, while 2 floats select 3dNow!)
13 /// Specializations are only for the tuple template because the container
14 /// must be of a fixed and compile-time-known size for the compiler to be
15 /// able to choose the specialization.
18 #ifndef SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
19 #define SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
29 //----------------------------------------------------------------------
31 //----------------------------------------------------------------------
33 /// Applies \p op to each element in \p op1.
/// \p op1 is taken by mutable reference and is iterated with the foreach
/// macro using a Ctr::iterator named \c i; \p op is passed by value.
/// NOTE(review): the statement executed for each element is not visible in
/// this listing, so whether the functor's return value is stored back into
/// the element should be confirmed against the full source.
34 template <typename Ctr
, typename UnaryOperation
>
35 inline void packop (Ctr
& op1
, UnaryOperation op
)
37 foreach (typename
Ctr::iterator
, i
, op1
)
41 /// Applies \p op to each element in \p op1 and \p op2 and stores in \p op2.
/// The loop runs over every element of \p op2 while a read iterator over
/// \p op1 advances in lockstep; the assert requires \p op1 to be at least as
/// long as \p op2 so that the read iterator stays in range.
/// NOTE(review): the per-element statement is not visible in this listing;
/// the argument order handed to \p op should be confirmed against the full
/// source (it matters for non-commutative operations).
42 template <typename Ctr
, typename BinaryOperation
>
43 inline void packop (const Ctr
& op1
, Ctr
& op2
, BinaryOperation op
)
45 assert (op2
.size() <= op1
.size());
46 typename
Ctr::const_iterator
i1 (op1
.begin());
47 typename
Ctr::iterator
i2 (op2
.begin());
48 for (; i2
!= op2
.end(); ++i1
, ++i2
)
52 /// Applies \p op to corresponding elements in \p op1 and \p op2 and stores in \p result.
/// The assert requires both \p op2 and \p result to hold at least as many
/// elements as \p op1. The visible code first copies \p op1 into \p result
/// with passign.
/// NOTE(review): the statement that follows the copy is not visible in this
/// listing; presumably it combines \p op2 into \p result in place - confirm.
53 template <typename Ctr
, typename BinaryOperation
>
54 inline void packop (const Ctr
& op1
, const Ctr
& op2
, Ctr
& result
, BinaryOperation op
)
56 assert (op1
.size() <= op2
.size() && op1
.size() <= result
.size());
57 passign (op1
, result
);
61 /// Copies \p op1 into \p result.
/// The assert requires \p result to hold at least as many elements as
/// \p op1. A write iterator \c d starts at result.begin() and the foreach
/// macro walks \p op1 with a const_iterator named \c s.
/// NOTE(review): the per-element statement is not visible in this listing;
/// presumably it assigns through \c d and advances it - confirm.
62 template <typename Ctr
>
63 inline void passign (const Ctr
& op1
, Ctr
& result
)
65 assert (op1
.size() <= result
.size());
66 typename
Ctr::iterator
d (result
.begin());
67 foreach (typename
Ctr::const_iterator
, s
, op1
)
71 /// Copies \p result.size() elements from \p op1 to \p result.
/// \p op1 is a bare iterator, so no size check is possible here: the caller
/// must make sure at least result.size() elements are readable from it.
/// Ctr appears in a non-deduced context in the first parameter and is
/// therefore deduced from \p result alone.
/// NOTE(review): the per-element statement is not visible in this listing.
72 template <typename Ctr
>
73 inline void ipassign (typename
Ctr::const_iterator op1
, Ctr
& result
)
75 foreach (typename
Ctr::iterator
, d
, result
)
/// Element-wise conversion from \p op1 into \p op2 using the functor \p f.
/// The two containers may be of different types. The loop covers every
/// element of \p op1 while a write iterator over \p op2 advances in lockstep;
/// the assert requires \p op2 to have room for all of \p op1.
/// NOTE(review): the per-element statement is not visible in this listing;
/// presumably it stores f applied to the source element - confirm.
79 template <typename Ctr1
, typename Ctr2
, typename ConvertFunction
>
80 inline void pconvert (const Ctr1
& op1
, Ctr2
& op2
, ConvertFunction f
)
82 assert (op1
.size() <= op2
.size());
83 typename
Ctr1::const_iterator
i1 (op1
.begin());
84 typename
Ctr2::iterator
i2 (op2
.begin());
85 for (; i1
!= op1
.end(); ++i1
, ++i2
)
89 // Functionoids for SIMD operations, like saturation arithmetic, shifts, etc.
/// fpadds: saturating add. Yields numeric_limits<T>::max() when
/// b > max() - a, otherwise a + b.
/// NOTE(review): only the upper bound is clamped, and for a signed T with a
/// negative \c a the guard expression max() - a itself overflows.
90 STD_BINARY_FUNCTOR (fpadds
, T
, ((b
> numeric_limits
<T
>::max() - a
) ? numeric_limits
<T
>::max() : a
+ b
))
/// fpsubs: saturating subtract. Yields numeric_limits<T>::min() when
/// a < min() + b, otherwise a - b.
/// NOTE(review): only the lower bound is clamped, and for a signed T with a
/// negative \c b the guard expression min() + b itself overflows.
91 STD_BINARY_FUNCTOR (fpsubs
, T
, ((a
< numeric_limits
<T
>::min() + b
) ? numeric_limits
<T
>::min() : a
- b
))
/// fpshl / fpshr: shift \c a left / right by \c b.
92 STD_BINARY_FUNCTOR (fpshl
, T
, (a
<< b
))
93 STD_BINARY_FUNCTOR (fpshr
, T
, (a
>> b
))
/// fpmin / fpmax: smaller / larger of the two operands.
94 STD_BINARY_FUNCTOR (fpmin
, T
, (min (a
, b
)))
95 STD_BINARY_FUNCTOR (fpmax
, T
, (max (a
, b
)))
/// fpavg: average computed as (a + b + 1) / 2; the +1 rounds halves up for
/// non-negative integer operands.
/// NOTE(review): the intermediate sum can overflow when T is at least as
/// wide as int.
96 STD_BINARY_FUNCTOR (fpavg
, T
, ((a
+ b
+ 1) / 2))
/// fcast: conversion functor that yields D(a).
97 STD_CONVERSION_FUNCTOR (fcast
, (D(a
)))
/// Unary math functors. fpreciprocal yields 1 / a. Every other functor in
/// this group calls reset_mmx() first (through the comma operator) and then
/// returns the math-library result converted to T.
/// NOTE(review): reset_mmx() is defined elsewhere; presumably it leaves MMX
/// state so that the floating-point call is safe - confirm.
99 STD_UNARY_FUNCTOR (fpreciprocal
,T
, (1 / a
))
100 STD_UNARY_FUNCTOR (fpsqrt
, T
, (reset_mmx(), T (sqrt (a
))))
101 STD_UNARY_FUNCTOR (fprecipsqrt
, T
, (reset_mmx(), 1 / T(sqrt (a
))))
102 STD_UNARY_FUNCTOR (fsin
, T
, (reset_mmx(), T (sin (a
))))
103 STD_UNARY_FUNCTOR (fcos
, T
, (reset_mmx(), T (cos (a
))))
104 STD_UNARY_FUNCTOR (ftan
, T
, (reset_mmx(), T (tan (a
))))
/// fround: rounding conversion functor; calls reset_mmx(), then converts the
/// result of rintf (first variant) or rint (second variant) to D.
/// NOTE(review): two definitions of fround appear back to back here; the
/// preprocessor conditional that presumably selects one of them is not
/// visible in this listing.
106 STD_CONVERSION_FUNCTOR (fround
, (reset_mmx(), D(rintf(a
))))
108 STD_CONVERSION_FUNCTOR (fround
, (reset_mmx(), D(rint(a
))))
/// double -> int32_t rounding always goes through rint.
110 template <> inline int32_t fround
<double,int32_t>::operator()(const double& a
) const { reset_mmx(); return (int32_t(rint(a
))); }
/// Floating-point averages omit the +1 rounding bias of the generic fpavg.
112 template <> inline float fpavg
<float>::operator()(const float& a
, const float& b
) const { return ((a
+ b
) / 2); }
113 template <> inline double fpavg
<double>::operator()(const double& a
, const double& b
) const { return ((a
+ b
) / 2); }
// Generators for the public wrapper functions. Each macro defines a function
// template called `name`:
//   SIMD_PACKEDOP1  name (op1)               -> packop (op1, operation<value_t>())
//   SIMD_PACKEDOP2  name (op1, op2)          -> packop (op1, op2, operation<value_t>())
//   SIMD_PACKEDOP3  name (op1, op2, result)  -> packop (op1, op2, result, operation<value_t>())
//   SIMD_SINGLEOP1  T name (T op)            -> scalar form (body not visible here)
//   SIMD_CONVERTOP  name (op1, op2)          -> pconvert (op1, op2, operation<value1_t, value2_t>())
// value_t / value1_t / value2_t are the containers' value_type.
// NOTE(review): the lines holding the braces of each macro body are not
// visible in this listing.
115 #define SIMD_PACKEDOP1(name, operation) \
116 template <typename Ctr> \
117 inline void name (Ctr& op1) \
119 typedef typename Ctr::value_type value_t; \
120 packop (op1, operation<value_t>()); \
122 #define SIMD_PACKEDOP2(name, operation) \
123 template <typename Ctr> \
124 inline void name (const Ctr& op1, Ctr& op2) \
126 typedef typename Ctr::value_type value_t; \
127 packop (op1, op2, operation<value_t>()); \
129 #define SIMD_PACKEDOP3(name, operation) \
130 template <typename Ctr> \
131 inline void name (const Ctr& op1, const Ctr& op2, Ctr& result) \
133 typedef typename Ctr::value_type value_t; \
134 packop (op1, op2, result, operation<value_t>()); \
136 #define SIMD_SINGLEOP1(name, operation) \
137 template <typename T> \
138 inline T name (T op) \
143 #define SIMD_CONVERTOP(name, operation) \
144 template <typename Ctr1, typename Ctr2> \
145 inline void name (const Ctr1& op1, Ctr2& op2) \
147 typedef typename Ctr1::value_type value1_t; \
148 typedef typename Ctr2::value_type value2_t; \
149 pconvert (op1, op2, operation<value1_t, value2_t>());\
152 SIMD_PACKEDOP2 (padd
, plus
)
153 SIMD_PACKEDOP2 (psub
, minus
)
154 SIMD_PACKEDOP2 (pmul
, multiplies
)
155 SIMD_PACKEDOP2 (pdiv
, divides
)
156 SIMD_PACKEDOP2 (pand
, bitwise_and
)
157 SIMD_PACKEDOP2 (por
, bitwise_or
)
158 SIMD_PACKEDOP2 (pxor
, bitwise_xor
)
159 SIMD_PACKEDOP2 (pshl
, fpshl
)
160 SIMD_PACKEDOP2 (pshr
, fpshr
)
161 SIMD_PACKEDOP2 (psubs
, fpsubs
)
162 SIMD_PACKEDOP2 (pmin
, fpmin
)
163 SIMD_PACKEDOP2 (pmax
, fpmax
)
164 SIMD_PACKEDOP2 (pavg
, fpavg
)
166 SIMD_PACKEDOP3 (padd
, plus
)
167 SIMD_PACKEDOP3 (psub
, minus
)
168 SIMD_PACKEDOP3 (pmul
, multiplies
)
169 SIMD_PACKEDOP3 (pdiv
, divides
)
170 SIMD_PACKEDOP3 (pand
, bitwise_and
)
171 SIMD_PACKEDOP3 (por
, bitwise_or
)
172 SIMD_PACKEDOP3 (pxor
, bitwise_xor
)
173 SIMD_PACKEDOP3 (pshl
, fpshl
)
174 SIMD_PACKEDOP3 (pshr
, fpshr
)
175 SIMD_PACKEDOP3 (padds
, fpadds
)
176 SIMD_PACKEDOP3 (psubs
, fpsubs
)
177 SIMD_PACKEDOP3 (pmin
, fpmin
)
178 SIMD_PACKEDOP3 (pmax
, fpmax
)
179 SIMD_PACKEDOP3 (pavg
, fpavg
)
182 SIMD_PACKEDOP1 (precip
, fpreciprocal
)
183 SIMD_PACKEDOP1 (psqrt
, fpsqrt
)
184 SIMD_PACKEDOP1 (precipsqrt
, fprecipsqrt
)
185 SIMD_PACKEDOP1 (psin
, fsin
)
186 SIMD_PACKEDOP1 (pcos
, fcos
)
187 SIMD_PACKEDOP1 (ptan
, ftan
)
189 SIMD_SINGLEOP1 (srecip
, fpreciprocal
)
190 SIMD_SINGLEOP1 (ssqrt
, fpsqrt
)
191 SIMD_SINGLEOP1 (srecipsqrt
, fprecipsqrt
)
192 SIMD_SINGLEOP1 (ssin
, fsin
)
193 SIMD_SINGLEOP1 (scos
, fcos
)
194 SIMD_SINGLEOP1 (stan
, ftan
)
196 SIMD_CONVERTOP (pround
, fround
)
198 template <typename T
> inline int32_t sround (T op
) { fround
<T
,int32_t> obj
; return (obj (op
)); }
// The wrapper-generator macros are local to this header; drop them once all
// wrappers have been instantiated.
// NOTE(review): no #undef for SIMD_CONVERTOP is visible in this listing.
201 #undef SIMD_SINGLEOP1
202 #undef SIMD_PACKEDOP3
203 #undef SIMD_PACKEDOP2
204 #undef SIMD_PACKEDOP1
206 //----------------------------------------------------------------------
207 // Vector types to cast tuple data to
208 //----------------------------------------------------------------------
// VECTOR_ATTRIBUTE(mode,vs) expands to GCC's vector_size(vs) attribute when
// vector extensions are available on GCC 4 or later, and to nothing in the
// second definition, in which case the typedefs below are plain scalar
// aliases. The `mode` argument is not used by either visible expansion.
// NOTE(review): the #else / #endif separating the two definitions are not
// visible in this listing.
// Typedef names encode lane count and lane kind (e.g. v4sf_t is declared
// from float with a 16-byte vector size).
210 #if HAVE_VECTOR_EXTENSIONS && __GNUC__ >= 4
211 #define VECTOR_ATTRIBUTE(mode,vs) __attribute__((vector_size(vs)))
213 #define VECTOR_ATTRIBUTE(mode,vs)
215 typedef uint8_t v8qi_t
VECTOR_ATTRIBUTE (V8QI
,8);
216 typedef uint16_t v4hi_t
VECTOR_ATTRIBUTE (V4HI
,8);
217 typedef uint16_t v8hi_t
VECTOR_ATTRIBUTE (V8HI
,16);
218 typedef uint32_t v2si_t
VECTOR_ATTRIBUTE (V2SI
,8);
219 typedef uint32_t v4si_t
VECTOR_ATTRIBUTE (V4SI
,16);
221 typedef uint64_t v1di_t
VECTOR_ATTRIBUTE (V1DI
,8);
223 typedef float v2sf_t
VECTOR_ATTRIBUTE (V2SF
,8);
224 typedef float v4sf_t
VECTOR_ATTRIBUTE (V4SF
,16);
225 typedef double v2df_t
VECTOR_ATTRIBUTE (V2DF
,16);
226 #undef VECTOR_ATTRIBUTE
// Inline-asm operand shorthands for element n of the input (oin) and output
// (oout) tuples: RI/RO are memory input operands ("m"), WI/WO are write-only
// memory output operands ("=m").
228 #define SIMDA_RI(n) "m"(oin[n])
229 #define SIMDA_RO(n) "m"(oout[n])
230 #define SIMDA_WI(n) "=m"(oin[n])
231 #define SIMDA_WO(n) "=m"(oout[n])
233 //----------------------------------------------------------------------
234 // Hardware accelerated specializations
235 //----------------------------------------------------------------------
// Signature generators for the hardware specializations. Each macro expands
// to the head of a packop / passign / ipassign / pconvert definition over
// tuple<n,type>, with parameters named oin and oout - the names that the asm
// operand lists in this file refer to. The functor parameter is left
// unnamed: only its type takes part in selecting the definition.
// NOTE(review): one line of each macro (between the #define line and the
// signature) is not visible in this listing.
237 #define SIMD_PKOP2_SPEC(n, type, optype) \
239 inline void packop (const tuple<n,type>& oin, tuple<n,type>& oout, optype<type>)
240 #define SIMD_PASSIGN_SPEC(n, type) \
242 inline void passign (const tuple<n,type>& oin, tuple<n,type>& oout)
243 #define SIMD_IPASSIGN_SPEC(n, type) \
245 inline void ipassign (tuple<n,type>::const_iterator oin, tuple<n,type>& oout)
246 #define SIMD_CONVERT_SPEC(n, type1, type2, optype) \
248 inline void pconvert (const tuple<n,type1>& oin, tuple<n,type2>& oout, optype<type1,type2>)
// MMX body generators.
//
// STD_MMX_ARGS / DBL_MMX_ARGS are written to follow a ':' in the asm
// statement, so their own leading ':' closes an empty output list. All
// operands are therefore memory *inputs*:
//   STD: %0 = oout[0], %1 = oin[0]
//   DBL: %0 = oout[0], %1 = oout[2], %2 = oin[0], %3 = oin[2]
// The stores to %0 / %1 are covered only by the "memory" clobber.
//
// *_PKOP2_SPEC   load oout into mm0 (and mm1), apply `instruction` with oin
//                as the source operand, store back to oout.
// *_PASSIGN_SPEC / *_IPASSIGN_SPEC   copy oin to oout through mm0 (and mm1).
// The DBL variants process two 64-bit halves, at element indices 0 and 2.
// Every generated body ends with reset_mmx().
251 #define STD_MMX_ARGS : "m"(oout[0]), "m"(oin[0]) : "mm0", "st", "memory"
252 #define DBL_MMX_ARGS : "m"(oout[0]), "m"(oout[2]), "m"(oin[0]), "m"(oin[2]) : "mm0", "mm1", "st", "st(1)", "memory"
253 #define MMX_PKOP2_SPEC(n,type,optype,instruction) \
254 SIMD_PKOP2_SPEC(n,type,optype) \
255 { asm ("movq %0, %%mm0\n\t" #instruction " %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
256 #define MMX_DBL_PKOP2_SPEC(n,type,optype,instruction) \
257 SIMD_PKOP2_SPEC(n,type,optype) \
258 { asm ("movq %0, %%mm0\n\tmovq %1, %%mm1\n\t" #instruction " %2, %%mm0\n\t" #instruction " %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
259 #define MMX_PASSIGN_SPEC(n,type) \
260 SIMD_PASSIGN_SPEC(n,type) \
261 { asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
262 #define MMX_DBL_PASSIGN_SPEC(n,type) \
263 SIMD_PASSIGN_SPEC(n,type) \
264 { asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
265 #define MMX_IPASSIGN_SPEC(n,type) \
266 SIMD_IPASSIGN_SPEC(n,type) \
267 { asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
268 #define MMX_DBL_IPASSIGN_SPEC(n,type) \
269 SIMD_IPASSIGN_SPEC(n,type) \
270 { asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
272 MMX_PASSIGN_SPEC(8,uint8_t)
273 MMX_PKOP2_SPEC(8,uint8_t,plus
,paddb
)
274 MMX_PKOP2_SPEC(8,uint8_t,minus
,psubb
)
275 MMX_PKOP2_SPEC(8,uint8_t,bitwise_and
,pand
)
276 MMX_PKOP2_SPEC(8,uint8_t,bitwise_or
,por
)
277 MMX_PKOP2_SPEC(8,uint8_t,bitwise_xor
,pxor
)
278 MMX_PKOP2_SPEC(8,uint8_t,fpadds
,paddusb
)
279 MMX_PKOP2_SPEC(8,uint8_t,fpsubs
,psubusb
)
281 MMX_PASSIGN_SPEC(8,int8_t)
282 MMX_PKOP2_SPEC(8,int8_t,plus
,paddb
)
283 MMX_PKOP2_SPEC(8,int8_t,minus
,psubb
)
284 MMX_PKOP2_SPEC(8,int8_t,bitwise_and
,pand
)
285 MMX_PKOP2_SPEC(8,int8_t,bitwise_or
,por
)
286 MMX_PKOP2_SPEC(8,int8_t,bitwise_xor
,pxor
)
287 MMX_PKOP2_SPEC(8,int8_t,fpadds
,paddsb
)
288 MMX_PKOP2_SPEC(8,int8_t,fpsubs
,psubsb
)
290 MMX_PASSIGN_SPEC(4,uint16_t)
291 MMX_PKOP2_SPEC(4,uint16_t,plus
,paddw
)
292 MMX_PKOP2_SPEC(4,uint16_t,minus
,psubw
)
293 MMX_PKOP2_SPEC(4,uint16_t,bitwise_and
,pand
)
294 MMX_PKOP2_SPEC(4,uint16_t,bitwise_or
,por
)
295 MMX_PKOP2_SPEC(4,uint16_t,bitwise_xor
,pxor
)
296 /// \todo psllw does not work like other operations, it uses the first element for shift count.
297 //MMX_PKOP2_SPEC(4,uint16_t,fpshl,psllw)
298 //MMX_PKOP2_SPEC(4,uint16_t,fpshr,psrlw)
299 MMX_PKOP2_SPEC(4,uint16_t,fpadds
,paddusw
)
300 MMX_PKOP2_SPEC(4,uint16_t,fpsubs
,psubusw
)
302 MMX_PASSIGN_SPEC(4,int16_t)
303 MMX_PKOP2_SPEC(4,int16_t,plus
,paddw
)
304 MMX_PKOP2_SPEC(4,int16_t,minus
,psubw
)
305 MMX_PKOP2_SPEC(4,int16_t,bitwise_and
,pand
)
306 MMX_PKOP2_SPEC(4,int16_t,bitwise_or
,por
)
307 MMX_PKOP2_SPEC(4,int16_t,bitwise_xor
,pxor
)
308 //MMX_PKOP2_SPEC(4,int16_t,fpshl,psllw)
309 //MMX_PKOP2_SPEC(4,int16_t,fpshr,psrlw)
310 MMX_PKOP2_SPEC(4,int16_t,fpadds
,paddsw
)
311 MMX_PKOP2_SPEC(4,int16_t,fpsubs
,psubsw
)
313 MMX_PASSIGN_SPEC(2,uint32_t)
314 MMX_PKOP2_SPEC(2,uint32_t,plus
,paddd
)
315 MMX_PKOP2_SPEC(2,uint32_t,minus
,psubd
)
316 MMX_PKOP2_SPEC(2,uint32_t,bitwise_and
,pand
)
317 MMX_PKOP2_SPEC(2,uint32_t,bitwise_or
,por
)
318 MMX_PKOP2_SPEC(2,uint32_t,bitwise_xor
,pxor
)
319 //MMX_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
320 //MMX_PKOP2_SPEC(2,uint32_t,fpshr,psrld)
322 MMX_PASSIGN_SPEC(2,int32_t)
323 MMX_PKOP2_SPEC(2,int32_t,plus
,paddd
)
324 MMX_PKOP2_SPEC(2,int32_t,minus
,psubd
)
325 MMX_PKOP2_SPEC(2,int32_t,bitwise_and
,pand
)
326 MMX_PKOP2_SPEC(2,int32_t,bitwise_or
,por
)
327 MMX_PKOP2_SPEC(2,int32_t,bitwise_xor
,pxor
)
328 //MMX_PKOP2_SPEC(2,int32_t,fpshl,pslld)
329 //MMX_PKOP2_SPEC(2,int32_t,fpshr,psrld)
331 MMX_DBL_PKOP2_SPEC(4,uint32_t,plus
,paddd
)
332 MMX_DBL_PKOP2_SPEC(4,uint32_t,minus
,psubd
)
333 MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_and
,pand
)
334 MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_or
,por
)
335 MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_xor
,pxor
)
336 //MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
337 //MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshr,psrld)
339 MMX_DBL_PKOP2_SPEC(4,int32_t,plus
,paddd
)
340 MMX_DBL_PKOP2_SPEC(4,int32_t,minus
,psubd
)
341 MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_and
,pand
)
342 MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_or
,por
)
343 MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_xor
,pxor
)
344 //MMX_DBL_PKOP2_SPEC(2,int32_t,fpshl,pslld)
345 //MMX_DBL_PKOP2_SPEC(2,int32_t,fpshr,psrld)
#if CPU_HAS_SSE || CPU_HAS_3DNOW
// Packed average, minimum and maximum from the extended MMX instruction set.
MMX_PKOP2_SPEC(8,uint8_t,fpavg,pavgb)
MMX_PKOP2_SPEC(4,uint16_t,fpavg,pavgw)
/// \todo pavgb and pavgw average unsigned lanes only, so they give wrong
/// results for negative signed inputs. Leaving the signed specializations
/// out lets those tuples use the generic element-wise packop instead.
//MMX_PKOP2_SPEC(8,int8_t,fpavg,pavgb)
//MMX_PKOP2_SPEC(4,int16_t,fpavg,pavgw)
MMX_PKOP2_SPEC(8,uint8_t,fpmin,pminub)
MMX_PKOP2_SPEC(8,uint8_t,fpmax,pmaxub)
MMX_PKOP2_SPEC(4,int16_t,fpmax,pmaxsw)
MMX_PKOP2_SPEC(4,int16_t,fpmin,pminsw)
#endif // CPU_HAS_SSE || CPU_HAS_3DNOW
359 MMX_PASSIGN_SPEC(2,float)
360 MMX_PKOP2_SPEC(2,float,plus
,pfadd
)
361 MMX_PKOP2_SPEC(2,float,minus
,pfsub
)
362 MMX_PKOP2_SPEC(2,float,multiplies
,pfmul
)
363 MMX_PKOP2_SPEC(2,float,fpmin
,pfmin
)
364 MMX_PKOP2_SPEC(2,float,fpmax
,pfmax
)
366 MMX_DBL_PKOP2_SPEC(4,float,plus
,pfadd
)
367 MMX_DBL_PKOP2_SPEC(4,float,minus
,pfsub
)
368 MMX_DBL_PKOP2_SPEC(4,float,multiplies
,pfmul
)
369 MMX_DBL_PKOP2_SPEC(4,float,fpmin
,pfmin
)
370 MMX_DBL_PKOP2_SPEC(4,float,fpmax
,pfmax
)
372 #endif // CPU_HAS_3DNOW
// Iterator-source copies (ipassign) for the 64-bit tuple shapes.
374 MMX_IPASSIGN_SPEC(8,uint8_t)
375 MMX_IPASSIGN_SPEC(4,uint16_t)
376 MMX_IPASSIGN_SPEC(2,uint32_t)
377 MMX_IPASSIGN_SPEC(2,float)
// 128-bit tuple copies done as two 64-bit MMX moves.
// NOTE(review): the same tuple shapes also get SSE passign/ipassign
// definitions later in this file; a preprocessor guard separating the two
// sets is presumably on lines not visible in this listing - confirm.
380 MMX_DBL_PASSIGN_SPEC(4,float)
381 MMX_DBL_PASSIGN_SPEC(4,uint32_t)
382 MMX_DBL_PASSIGN_SPEC(4,int32_t)
383 MMX_DBL_IPASSIGN_SPEC(4,float)
384 MMX_DBL_IPASSIGN_SPEC(4,uint32_t)
385 MMX_DBL_IPASSIGN_SPEC(4,int32_t)
// NOTE(review): no #undef for the MMX_DBL_* generators, STD_MMX_ARGS or
// DBL_MMX_ARGS is visible in this listing.
388 #undef MMX_IPASSIGN_SPEC
389 #undef MMX_PASSIGN_SPEC
390 #undef MMX_PKOP2_SPEC
392 #endif // CPU_HAS_MMX
// Operand and clobber list shared by the SSE body generators below. It is
// written to follow a ':' in the asm statement, so its own leading ':' closes
// an empty output list: %0 is oout[0] and %1 is oin[0], both memory inputs,
// and the stores to %0 are covered by the "memory" clobber.
// xmm1 is listed because SSE_PKOP2_SPEC stages its second operand there;
// without it the compiler may keep a live value in xmm1 across the asm.
#define STD_SSE_ARGS : "m"(oout[0]), "m"(oin[0]) : "xmm0", "xmm1", "memory"

// packop body: load oout into xmm0 and oin into xmm1 (unaligned loads),
// apply `instruction` with xmm0 as destination, store xmm0 back to oout.
#define SSE_PKOP2_SPEC(n,type,optype,instruction)	\
SIMD_PKOP2_SPEC(n,type,optype)		\
{ asm ("movups %0, %%xmm0\n\tmovups %1, %%xmm1\n\t" #instruction " %%xmm1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}

// passign body: copy 16 bytes from oin to oout through xmm0.
#define SSE_PASSIGN_SPEC(n,type)		\
SIMD_PASSIGN_SPEC(n,type)		\
{ asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}

// ipassign body: same copy, with oin supplied as an iterator.
#define SSE_IPASSIGN_SPEC(n,type)		\
SIMD_IPASSIGN_SPEC(n,type)		\
{ asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
405 SSE_PASSIGN_SPEC(4,float)
406 SSE_PASSIGN_SPEC(4,int32_t)
407 SSE_PASSIGN_SPEC(4,uint32_t)
408 SSE_PKOP2_SPEC(4,float,plus
,addps
)
409 SSE_PKOP2_SPEC(4,float,minus
,subps
)
410 SSE_PKOP2_SPEC(4,float,multiplies
,mulps
)
411 SSE_PKOP2_SPEC(4,float,divides
,divps
)
412 SSE_PKOP2_SPEC(4,float,bitwise_and
,andps
)
413 SSE_PKOP2_SPEC(4,float,bitwise_or
,orps
)
414 SSE_PKOP2_SPEC(4,float,bitwise_xor
,xorps
)
415 SSE_PKOP2_SPEC(4,float,fpmax
,maxps
)
416 SSE_PKOP2_SPEC(4,float,fpmin
,minps
)
// pconvert for fround, 4 x float -> 4 x int32_t. The visible part converts
// two float pairs with cvtps2pi into mm0 and mm1.
// NOTE(review): the remainder of this asm statement (stores, operand and
// clobber lists) and the end of the function are not visible in this listing.
418 SIMD_CONVERT_SPEC(4,float,int32_t,fround
) {
419 asm ("cvtps2pi %2, %%mm0\n\t"
420 "cvtps2pi %3, %%mm1\n\t"
// pconvert for fround, 4 x int32_t -> 4 x float. Operands: %0 = oout[0],
// %1 = oin[0], %2 = oin[2], all memory inputs. The pair at oin[2] is
// converted into the low half of xmm0, shufps $0x4E swaps the two 64-bit
// halves, then the pair at oin[0] is converted into the low half.
// NOTE(review): the instruction that stores xmm0 to %0 is on a line not
// visible in this listing.
426 SIMD_CONVERT_SPEC(4,int32_t,float,fround
) {
427 asm ("cvtpi2ps %2, %%xmm0\n\t"
428 "shufps $0x4E,%%xmm0,%%xmm0\n\t"
429 "cvtpi2ps %1, %%xmm0\n\t"
431 :: "m"(oout
[0]), "m"(oin
[0]), "m"(oin
[2]) : "xmm0", "memory");
// Scalar float -> int32_t rounding through SSE: loads \p a into xmm0 and
// converts it with cvtss2si into the register output rv.
// NOTE(review): the declaration of rv and the return statement are not
// visible in this listing.
433 template <> inline int32_t fround
<float,int32_t>::operator()(const float& a
) const {
435 asm ("movss %1, %%xmm0\n\t"
436 "cvtss2si %%xmm0, %0"
437 : "=r"(rv
) : "m"(a
) : "xmm0" );
// Scalar float -> uint32_t rounding using the same instruction sequence.
// NOTE(review): cvtss2si is a signed conversion, so inputs of 2^31 and above
// do not fit and yield the integer-indefinite value 0x80000000 - confirm
// whether callers depend on the upper half of the uint32_t range.
// NOTE(review): the `register` storage class is deprecated since C++11 and
// removed in C++17.
440 template <> inline uint32_t fround
<float,uint32_t>::operator()(const float& a
) const {
441 register uint32_t rv
;
442 asm ("movss %1, %%xmm0\n\t"
443 "cvtss2si %%xmm0, %0"
444 : "=r"(rv
) : "m"(a
) : "xmm0" );
// Iterator-source 128-bit copies through an SSE register.
448 SSE_IPASSIGN_SPEC(4,float)
449 SSE_IPASSIGN_SPEC(4,int32_t)
450 SSE_IPASSIGN_SPEC(4,uint32_t)
// The SSE body generators are local to this section.
452 #undef SSE_IPASSIGN_SPEC
453 #undef SSE_PASSIGN_SPEC
454 #undef SSE_PKOP2_SPEC
456 #endif // CPU_HAS_SSE
// NOTE(review): no macro named SIMD_PACKEDOP_SPEC is defined anywhere in the
// visible lines (the signature generators are named SIMD_PKOP2_SPEC,
// SIMD_PASSIGN_SPEC, SIMD_IPASSIGN_SPEC and SIMD_CONVERT_SPEC) - confirm
// whether this #undef targets the intended name.
462 #undef SIMD_PACKEDOP_SPEC