[clang-format] Fix a bug in aligning comments above PPDirective (#72791)
[llvm-project.git] / clang / lib / Headers / ppc_wrappers / tmmintrin.h
blob92f08676d2dfade7391127a0612c845fbffd32bf
/*===---- tmmintrin.h - Implementation of SSSE3 intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */
13 #ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
23 #endif
25 #ifndef TMMINTRIN_H_
26 #define TMMINTRIN_H_
28 #if defined(__powerpc64__) && \
29 (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
31 #include <altivec.h>
33 /* We need definitions from the SSE header files. */
34 #include <pmmintrin.h>
36 extern __inline __m128i
37 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
38 _mm_abs_epi16(__m128i __A) {
39 return (__m128i)vec_abs((__v8hi)__A);
42 extern __inline __m128i
43 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
44 _mm_abs_epi32(__m128i __A) {
45 return (__m128i)vec_abs((__v4si)__A);
48 extern __inline __m128i
49 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
50 _mm_abs_epi8(__m128i __A) {
51 return (__m128i)vec_abs((__v16qi)__A);
54 extern __inline __m64
55 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
56 _mm_abs_pi16(__m64 __A) {
57 __v8hi __B = (__v8hi)(__v2du){__A, __A};
58 return (__m64)((__v2du)vec_abs(__B))[0];
61 extern __inline __m64
62 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
63 _mm_abs_pi32(__m64 __A) {
64 __v4si __B = (__v4si)(__v2du){__A, __A};
65 return (__m64)((__v2du)vec_abs(__B))[0];
68 extern __inline __m64
69 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
70 _mm_abs_pi8(__m64 __A) {
71 __v16qi __B = (__v16qi)(__v2du){__A, __A};
72 return (__m64)((__v2du)vec_abs(__B))[0];
75 extern __inline __m128i
76 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
77 _mm_alignr_epi8(__m128i __A, __m128i __B, const unsigned int __count) {
78 if (__builtin_constant_p(__count) && __count < 16) {
79 #ifdef __LITTLE_ENDIAN__
80 __A = (__m128i)vec_reve((__v16qu)__A);
81 __B = (__m128i)vec_reve((__v16qu)__B);
82 #endif
83 __A = (__m128i)vec_sld((__v16qu)__B, (__v16qu)__A, __count);
84 #ifdef __LITTLE_ENDIAN__
85 __A = (__m128i)vec_reve((__v16qu)__A);
86 #endif
87 return __A;
90 if (__count == 0)
91 return __B;
93 if (__count >= 16) {
94 if (__count >= 32) {
95 const __v16qu __zero = {0};
96 return (__m128i)__zero;
97 } else {
98 const __v16qu __shift = vec_splats((unsigned char)((__count - 16) * 8));
99 #ifdef __LITTLE_ENDIAN__
100 return (__m128i)vec_sro((__v16qu)__A, __shift);
101 #else
102 return (__m128i)vec_slo((__v16qu)__A, __shift);
103 #endif
105 } else {
106 const __v16qu __shiftA = vec_splats((unsigned char)((16 - __count) * 8));
107 const __v16qu __shiftB = vec_splats((unsigned char)(__count * 8));
108 #ifdef __LITTLE_ENDIAN__
109 __A = (__m128i)vec_slo((__v16qu)__A, __shiftA);
110 __B = (__m128i)vec_sro((__v16qu)__B, __shiftB);
111 #else
112 __A = (__m128i)vec_sro((__v16qu)__A, __shiftA);
113 __B = (__m128i)vec_slo((__v16qu)__B, __shiftB);
114 #endif
115 return (__m128i)vec_or((__v16qu)__A, (__v16qu)__B);
119 extern __inline __m64
120 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
121 _mm_alignr_pi8(__m64 __A, __m64 __B, unsigned int __count) {
122 if (__count < 16) {
123 __v2du __C = {__B, __A};
124 #ifdef __LITTLE_ENDIAN__
125 const __v4su __shift = {__count << 3, 0, 0, 0};
126 __C = (__v2du)vec_sro((__v16qu)__C, (__v16qu)__shift);
127 #else
128 const __v4su __shift = {0, 0, 0, __count << 3};
129 __C = (__v2du)vec_slo((__v16qu)__C, (__v16qu)__shift);
130 #endif
131 return (__m64)__C[0];
132 } else {
133 const __m64 __zero = {0};
134 return __zero;
138 extern __inline __m128i
139 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
140 _mm_hadd_epi16(__m128i __A, __m128i __B) {
141 const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
142 16, 17, 20, 21, 24, 25, 28, 29};
143 const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
144 18, 19, 22, 23, 26, 27, 30, 31};
145 __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
146 __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
147 return (__m128i)vec_add(__C, __D);
150 extern __inline __m128i
151 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
152 _mm_hadd_epi32(__m128i __A, __m128i __B) {
153 const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11,
154 16, 17, 18, 19, 24, 25, 26, 27};
155 const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15,
156 20, 21, 22, 23, 28, 29, 30, 31};
157 __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
158 __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
159 return (__m128i)vec_add(__C, __D);
162 extern __inline __m64
163 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
164 _mm_hadd_pi16(__m64 __A, __m64 __B) {
165 __v8hi __C = (__v8hi)(__v2du){__A, __B};
166 const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
167 const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
168 __v8hi __D = vec_perm(__C, __C, __Q);
169 __C = vec_perm(__C, __C, __P);
170 __C = vec_add(__C, __D);
171 return (__m64)((__v2du)__C)[1];
174 extern __inline __m64
175 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
176 _mm_hadd_pi32(__m64 __A, __m64 __B) {
177 __v4si __C = (__v4si)(__v2du){__A, __B};
178 const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
179 const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
180 __v4si __D = vec_perm(__C, __C, __Q);
181 __C = vec_perm(__C, __C, __P);
182 __C = vec_add(__C, __D);
183 return (__m64)((__v2du)__C)[1];
186 extern __inline __m128i
187 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
188 _mm_hadds_epi16(__m128i __A, __m128i __B) {
189 __v4si __C = {0}, __D = {0};
190 __C = vec_sum4s((__v8hi)__A, __C);
191 __D = vec_sum4s((__v8hi)__B, __D);
192 __C = (__v4si)vec_packs(__C, __D);
193 return (__m128i)__C;
196 extern __inline __m64
197 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
198 _mm_hadds_pi16(__m64 __A, __m64 __B) {
199 const __v4si __zero = {0};
200 __v8hi __C = (__v8hi)(__v2du){__A, __B};
201 __v4si __D = vec_sum4s(__C, __zero);
202 __C = vec_packs(__D, __D);
203 return (__m64)((__v2du)__C)[1];
206 extern __inline __m128i
207 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
208 _mm_hsub_epi16(__m128i __A, __m128i __B) {
209 const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
210 16, 17, 20, 21, 24, 25, 28, 29};
211 const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
212 18, 19, 22, 23, 26, 27, 30, 31};
213 __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
214 __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
215 return (__m128i)vec_sub(__C, __D);
218 extern __inline __m128i
219 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
220 _mm_hsub_epi32(__m128i __A, __m128i __B) {
221 const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11,
222 16, 17, 18, 19, 24, 25, 26, 27};
223 const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15,
224 20, 21, 22, 23, 28, 29, 30, 31};
225 __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
226 __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
227 return (__m128i)vec_sub(__C, __D);
230 extern __inline __m64
231 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
232 _mm_hsub_pi16(__m64 __A, __m64 __B) {
233 const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
234 const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
235 __v8hi __C = (__v8hi)(__v2du){__A, __B};
236 __v8hi __D = vec_perm(__C, __C, __Q);
237 __C = vec_perm(__C, __C, __P);
238 __C = vec_sub(__C, __D);
239 return (__m64)((__v2du)__C)[1];
242 extern __inline __m64
243 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
244 _mm_hsub_pi32(__m64 __A, __m64 __B) {
245 const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
246 const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
247 __v4si __C = (__v4si)(__v2du){__A, __B};
248 __v4si __D = vec_perm(__C, __C, __Q);
249 __C = vec_perm(__C, __C, __P);
250 __C = vec_sub(__C, __D);
251 return (__m64)((__v2du)__C)[1];
254 extern __inline __m128i
255 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
256 _mm_hsubs_epi16(__m128i __A, __m128i __B) {
257 const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
258 16, 17, 20, 21, 24, 25, 28, 29};
259 const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
260 18, 19, 22, 23, 26, 27, 30, 31};
261 __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
262 __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
263 return (__m128i)vec_subs(__C, __D);
266 extern __inline __m64
267 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
268 _mm_hsubs_pi16(__m64 __A, __m64 __B) {
269 const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
270 const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
271 __v8hi __C = (__v8hi)(__v2du){__A, __B};
272 __v8hi __D = vec_perm(__C, __C, __P);
273 __v8hi __E = vec_perm(__C, __C, __Q);
274 __C = vec_subs(__D, __E);
275 return (__m64)((__v2du)__C)[1];
278 extern __inline __m128i
279 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
280 _mm_shuffle_epi8(__m128i __A, __m128i __B) {
281 const __v16qi __zero = {0};
282 __vector __bool char __select = vec_cmplt((__v16qi)__B, __zero);
283 __v16qi __C = vec_perm((__v16qi)__A, (__v16qi)__A, (__v16qu)__B);
284 return (__m128i)vec_sel(__C, __zero, __select);
287 extern __inline __m64
288 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
289 _mm_shuffle_pi8(__m64 __A, __m64 __B) {
290 const __v16qi __zero = {0};
291 __v16qi __C = (__v16qi)(__v2du){__A, __A};
292 __v16qi __D = (__v16qi)(__v2du){__B, __B};
293 __vector __bool char __select = vec_cmplt((__v16qi)__D, __zero);
294 __C = vec_perm((__v16qi)__C, (__v16qi)__C, (__v16qu)__D);
295 __C = vec_sel(__C, __zero, __select);
296 return (__m64)((__v2du)(__C))[0];
#ifdef _ARCH_PWR8
/* Applies the sign of each 8-bit lane of __B to the matching lane of __A.
   A per-lane factor is built from two compares against zero (the
   less-than mask reinterpreted as signed, plus the negated greater-than
   mask) and __A is multiplied by it.  Only compiled when _ARCH_PWR8 is
   defined. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi8(__m128i __A, __m128i __B) {
  const __v16qi __zero = {0};
  const __v16qi __ctrl = (__v16qi)__B;
  const __v16qi __neg_part = (__v16qi)vec_cmplt(__ctrl, __zero);
  const __v16qi __pos_part =
      (__v16qi)vec_neg((__v16qi)vec_cmpgt(__ctrl, __zero));
  const __v16qi __factor = vec_add(__neg_part, __pos_part);
  return (__m128i)vec_mul((__v16qi)__A, __factor);
}
#endif
#ifdef _ARCH_PWR8
/* Applies the sign of each 16-bit lane of __B to the matching lane of __A.
   A per-lane factor is built from two compares against zero (the
   less-than mask reinterpreted as signed, plus the negated greater-than
   mask) and __A is multiplied by it.  Only compiled when _ARCH_PWR8 is
   defined. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi16(__m128i __A, __m128i __B) {
  const __v8hi __zero = {0};
  const __v8hi __ctrl = (__v8hi)__B;
  const __v8hi __neg_part = (__v8hi)vec_cmplt(__ctrl, __zero);
  const __v8hi __pos_part = (__v8hi)vec_neg((__v8hi)vec_cmpgt(__ctrl, __zero));
  const __v8hi __factor = vec_add(__neg_part, __pos_part);
  return (__m128i)vec_mul((__v8hi)__A, __factor);
}
#endif
#ifdef _ARCH_PWR8
/* Applies the sign of each 32-bit lane of __B to the matching lane of __A.
   A per-lane factor is built from two compares against zero (the
   less-than mask reinterpreted as signed, plus the negated greater-than
   mask) and __A is multiplied by it.  Only compiled when _ARCH_PWR8 is
   defined. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi32(__m128i __A, __m128i __B) {
  const __v4si __zero = {0};
  const __v4si __ctrl = (__v4si)__B;
  const __v4si __neg_part = (__v4si)vec_cmplt(__ctrl, __zero);
  const __v4si __pos_part = (__v4si)vec_neg((__v4si)vec_cmpgt(__ctrl, __zero));
  const __v4si __factor = vec_add(__neg_part, __pos_part);
  return (__m128i)vec_mul((__v4si)__A, __factor);
}
#endif
#ifdef _ARCH_PWR8
/* 64-bit wrapper around _mm_sign_epi8: __A and __B are each replicated
   into both doublewords of a vector, the 128-bit routine is applied, and
   doubleword 0 of its result is returned.  Only compiled when _ARCH_PWR8
   is defined. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi8(__m64 __A, __m64 __B) {
  __v16qi __C = (__v16qi)(__v2du){__A, __A};
  __v16qi __D = (__v16qi)(__v2du){__B, __B};
  __C = (__v16qi)_mm_sign_epi8((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif
#ifdef _ARCH_PWR8
/* 64-bit wrapper around _mm_sign_epi16: __A and __B are each replicated
   into both doublewords of a vector, the 128-bit routine is applied, and
   doubleword 0 of its result is returned.  Only compiled when _ARCH_PWR8
   is defined. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi16(__m64 __A, __m64 __B) {
  __v8hi __C = (__v8hi)(__v2du){__A, __A};
  __v8hi __D = (__v8hi)(__v2du){__B, __B};
  __C = (__v8hi)_mm_sign_epi16((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif
#ifdef _ARCH_PWR8
/* 64-bit wrapper around _mm_sign_epi32: __A and __B are each replicated
   into both doublewords of a vector, the 128-bit routine is applied, and
   doubleword 0 of its result is returned.  Only compiled when _ARCH_PWR8
   is defined. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi32(__m64 __A, __m64 __B) {
  __v4si __C = (__v4si)(__v2du){__A, __A};
  __v4si __D = (__v4si)(__v2du){__B, __B};
  __C = (__v4si)_mm_sign_epi32((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif
372 extern __inline __m128i
373 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
374 _mm_maddubs_epi16(__m128i __A, __m128i __B) {
375 __v8hi __unsigned = vec_splats((signed short)0x00ff);
376 __v8hi __C = vec_and(vec_unpackh((__v16qi)__A), __unsigned);
377 __v8hi __D = vec_and(vec_unpackl((__v16qi)__A), __unsigned);
378 __v8hi __E = vec_unpackh((__v16qi)__B);
379 __v8hi __F = vec_unpackl((__v16qi)__B);
380 __C = vec_mul(__C, __E);
381 __D = vec_mul(__D, __F);
382 const __v16qu __odds = {0, 1, 4, 5, 8, 9, 12, 13,
383 16, 17, 20, 21, 24, 25, 28, 29};
384 const __v16qu __evens = {2, 3, 6, 7, 10, 11, 14, 15,
385 18, 19, 22, 23, 26, 27, 30, 31};
386 __E = vec_perm(__C, __D, __odds);
387 __F = vec_perm(__C, __D, __evens);
388 return (__m128i)vec_adds(__E, __F);
391 extern __inline __m64
392 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
393 _mm_maddubs_pi16(__m64 __A, __m64 __B) {
394 __v8hi __C = (__v8hi)(__v2du){__A, __A};
395 __C = vec_unpackl((__v16qi)__C);
396 const __v8hi __unsigned = vec_splats((signed short)0x00ff);
397 __C = vec_and(__C, __unsigned);
398 __v8hi __D = (__v8hi)(__v2du){__B, __B};
399 __D = vec_unpackl((__v16qi)__D);
400 __D = vec_mul(__C, __D);
401 const __v16qu __odds = {0, 1, 4, 5, 8, 9, 12, 13,
402 16, 17, 20, 21, 24, 25, 28, 29};
403 const __v16qu __evens = {2, 3, 6, 7, 10, 11, 14, 15,
404 18, 19, 22, 23, 26, 27, 30, 31};
405 __C = vec_perm(__D, __D, __odds);
406 __D = vec_perm(__D, __D, __evens);
407 __C = vec_adds(__C, __D);
408 return (__m64)((__v2du)(__C))[0];
411 extern __inline __m128i
412 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
413 _mm_mulhrs_epi16(__m128i __A, __m128i __B) {
414 __v4si __C = vec_unpackh((__v8hi)__A);
415 __v4si __D = vec_unpackh((__v8hi)__B);
416 __C = vec_mul(__C, __D);
417 __D = vec_unpackl((__v8hi)__A);
418 __v4si __E = vec_unpackl((__v8hi)__B);
419 __D = vec_mul(__D, __E);
420 const __v4su __shift = vec_splats((unsigned int)14);
421 __C = vec_sr(__C, __shift);
422 __D = vec_sr(__D, __shift);
423 const __v4si __ones = vec_splats((signed int)1);
424 __C = vec_add(__C, __ones);
425 __C = vec_sr(__C, (__v4su)__ones);
426 __D = vec_add(__D, __ones);
427 __D = vec_sr(__D, (__v4su)__ones);
428 return (__m128i)vec_pack(__C, __D);
431 extern __inline __m64
432 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
433 _mm_mulhrs_pi16(__m64 __A, __m64 __B) {
434 __v4si __C = (__v4si)(__v2du){__A, __A};
435 __C = vec_unpackh((__v8hi)__C);
436 __v4si __D = (__v4si)(__v2du){__B, __B};
437 __D = vec_unpackh((__v8hi)__D);
438 __C = vec_mul(__C, __D);
439 const __v4su __shift = vec_splats((unsigned int)14);
440 __C = vec_sr(__C, __shift);
441 const __v4si __ones = vec_splats((signed int)1);
442 __C = vec_add(__C, __ones);
443 __C = vec_sr(__C, (__v4su)__ones);
444 __v8hi __E = vec_pack(__C, __D);
445 return (__m64)((__v2du)(__E))[0];
448 #else
449 #include_next <tmmintrin.h>
450 #endif /* defined(__powerpc64__) && \
451 * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
453 #endif /* TMMINTRIN_H_ */