; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1OR2,X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1OR2,X86-AVX2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX512,X86-AVX512VL
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX512,X86-AVX512FP16
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX512,X86-AVX512VLDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X64,X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1OR2,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1OR2,X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512FP16
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512VLDQ
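
; Vector fabs lowers to a bitwise AND that clears the sign bit of each
; element. The 0x7FFF... masks have an all-ones exponent field, so the
; autogenerated constant comments below print them as NaN. Depending on the
; subtarget, the mask comes from the constant pool, a register broadcast, or
; an embedded-broadcast memory operand ({1toN}).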
;
; 128-bit Vectors
;

define <2 x double> @fabs_v2f64(<2 x double> %p) nounwind {
; X86-SSE-LABEL: fabs_v2f64:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX1OR2-LABEL: fabs_v2f64:
; X86-AVX1OR2:       # %bb.0:
; X86-AVX1OR2-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1OR2-NEXT:    retl
;
; X86-AVX512VL-LABEL: fabs_v2f64:
; X86-AVX512VL:       # %bb.0:
; X86-AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm0, %xmm0
; X86-AVX512VL-NEXT:    retl
;
; X86-AVX512FP16-LABEL: fabs_v2f64:
; X86-AVX512FP16:       # %bb.0:
; X86-AVX512FP16-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm0, %xmm0
; X86-AVX512FP16-NEXT:    retl
;
; X86-AVX512VLDQ-LABEL: fabs_v2f64:
; X86-AVX512VLDQ:       # %bb.0:
; X86-AVX512VLDQ-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm0, %xmm0
; X86-AVX512VLDQ-NEXT:    retl
;
; X64-SSE-LABEL: fabs_v2f64:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX1OR2-LABEL: fabs_v2f64:
; X64-AVX1OR2:       # %bb.0:
; X64-AVX1OR2-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX1OR2-NEXT:    retq
;
; X64-AVX512VL-LABEL: fabs_v2f64:
; X64-AVX512VL:       # %bb.0:
; X64-AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; X64-AVX512VL-NEXT:    retq
;
; X64-AVX512FP16-LABEL: fabs_v2f64:
; X64-AVX512FP16:       # %bb.0:
; X64-AVX512FP16-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; X64-AVX512FP16-NEXT:    retq
;
; X64-AVX512VLDQ-LABEL: fabs_v2f64:
; X64-AVX512VLDQ:       # %bb.0:
; X64-AVX512VLDQ-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; X64-AVX512VLDQ-NEXT:    retq
  %t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
  ret <2 x double> %t
}
declare <2 x double> @llvm.fabs.v2f64(<2 x double> %p)

define <4 x float> @fabs_v4f32(<4 x float> %p) nounwind {
; X86-SSE-LABEL: fabs_v4f32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX1-LABEL: fabs_v4f32:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: fabs_v4f32:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; X86-AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT:    retl
;
; X86-AVX512VL-LABEL: fabs_v4f32:
; X86-AVX512VL:       # %bb.0:
; X86-AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
; X86-AVX512VL-NEXT:    retl
;
; X86-AVX512FP16-LABEL: fabs_v4f32:
; X86-AVX512FP16:       # %bb.0:
; X86-AVX512FP16-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
; X86-AVX512FP16-NEXT:    retl
;
; X86-AVX512VLDQ-LABEL: fabs_v4f32:
; X86-AVX512VLDQ:       # %bb.0:
; X86-AVX512VLDQ-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
; X86-AVX512VLDQ-NEXT:    retl
;
; X64-SSE-LABEL: fabs_v4f32:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: fabs_v4f32:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: fabs_v4f32:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; X64-AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512VL-LABEL: fabs_v4f32:
; X64-AVX512VL:       # %bb.0:
; X64-AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-AVX512VL-NEXT:    retq
;
; X64-AVX512FP16-LABEL: fabs_v4f32:
; X64-AVX512FP16:       # %bb.0:
; X64-AVX512FP16-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-AVX512FP16-NEXT:    retq
;
; X64-AVX512VLDQ-LABEL: fabs_v4f32:
; X64-AVX512VLDQ:       # %bb.0:
; X64-AVX512VLDQ-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-AVX512VLDQ-NEXT:    retq
  %t = call <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
  ret <4 x float> %t
}
declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p)

define <8 x half> @fabs_v8f16(ptr %p) nounwind {
; X86-SSE-LABEL: fabs_v8f16:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movaps (%eax), %xmm0
; X86-SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX1-LABEL: fabs_v8f16:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT:    vmovaps (%eax), %xmm0
; X86-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: fabs_v8f16:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X86-AVX2-NEXT:    vpand (%eax), %xmm0, %xmm0
; X86-AVX2-NEXT:    retl
;
; X86-AVX512-LABEL: fabs_v8f16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X86-AVX512-NEXT:    vpand (%eax), %xmm0, %xmm0
; X86-AVX512-NEXT:    retl
;
; X64-SSE-LABEL: fabs_v8f16:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm0
; X64-SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: fabs_v8f16:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: fabs_v8f16:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X64-AVX2-NEXT:    vpand (%rdi), %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: fabs_v8f16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X64-AVX512-NEXT:    vpand (%rdi), %xmm0, %xmm0
; X64-AVX512-NEXT:    retq
  %v = load <8 x half>, ptr %p, align 16
  %nnv = call <8 x half> @llvm.fabs.v8f16(<8 x half> %v)
  ret <8 x half> %nnv
}
declare <8 x half> @llvm.fabs.v8f16(<8 x half> %p)

;
; 256-bit Vectors
;

define <4 x double> @fabs_v4f64(<4 x double> %p) nounwind {
; X86-SSE-LABEL: fabs_v4f64:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movaps {{.*#+}} xmm2 = [NaN,NaN]
; X86-SSE-NEXT:    andps %xmm2, %xmm0
; X86-SSE-NEXT:    andps %xmm2, %xmm1
; X86-SSE-NEXT:    retl
;
; X86-AVX1-LABEL: fabs_v4f64:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: fabs_v4f64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN]
; X86-AVX2-NEXT:    vandps %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
;
; X86-AVX512VL-LABEL: fabs_v4f64:
; X86-AVX512VL:       # %bb.0:
; X86-AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %ymm0, %ymm0
; X86-AVX512VL-NEXT:    retl
;
; X86-AVX512FP16-LABEL: fabs_v4f64:
; X86-AVX512FP16:       # %bb.0:
; X86-AVX512FP16-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %ymm0, %ymm0
; X86-AVX512FP16-NEXT:    retl
;
; X86-AVX512VLDQ-LABEL: fabs_v4f64:
; X86-AVX512VLDQ:       # %bb.0:
; X86-AVX512VLDQ-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %ymm0, %ymm0
; X86-AVX512VLDQ-NEXT:    retl
;
; X64-SSE-LABEL: fabs_v4f64:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps {{.*#+}} xmm2 = [NaN,NaN]
; X64-SSE-NEXT:    andps %xmm2, %xmm0
; X64-SSE-NEXT:    andps %xmm2, %xmm1
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: fabs_v4f64:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: fabs_v4f64:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN]
; X64-AVX2-NEXT:    vandps %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512VL-LABEL: fabs_v4f64:
; X64-AVX512VL:       # %bb.0:
; X64-AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; X64-AVX512VL-NEXT:    retq
;
; X64-AVX512FP16-LABEL: fabs_v4f64:
; X64-AVX512FP16:       # %bb.0:
; X64-AVX512FP16-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; X64-AVX512FP16-NEXT:    retq
;
; X64-AVX512VLDQ-LABEL: fabs_v4f64:
; X64-AVX512VLDQ:       # %bb.0:
; X64-AVX512VLDQ-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; X64-AVX512VLDQ-NEXT:    retq
  %t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
  ret <4 x double> %t
}
declare <4 x double> @llvm.fabs.v4f64(<4 x double> %p)

define <8 x float> @fabs_v8f32(<8 x float> %p) nounwind {
; X86-SSE-LABEL: fabs_v8f32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movaps {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; X86-SSE-NEXT:    andps %xmm2, %xmm0
; X86-SSE-NEXT:    andps %xmm2, %xmm1
; X86-SSE-NEXT:    retl
;
; X86-AVX1-LABEL: fabs_v8f32:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: fabs_v8f32:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X86-AVX2-NEXT:    vandps %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
;
; X86-AVX512VL-LABEL: fabs_v8f32:
; X86-AVX512VL:       # %bb.0:
; X86-AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
; X86-AVX512VL-NEXT:    retl
;
; X86-AVX512FP16-LABEL: fabs_v8f32:
; X86-AVX512FP16:       # %bb.0:
; X86-AVX512FP16-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
; X86-AVX512FP16-NEXT:    retl
;
; X86-AVX512VLDQ-LABEL: fabs_v8f32:
; X86-AVX512VLDQ:       # %bb.0:
; X86-AVX512VLDQ-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
; X86-AVX512VLDQ-NEXT:    retl
;
; X64-SSE-LABEL: fabs_v8f32:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; X64-SSE-NEXT:    andps %xmm2, %xmm0
; X64-SSE-NEXT:    andps %xmm2, %xmm1
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: fabs_v8f32:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: fabs_v8f32:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X64-AVX2-NEXT:    vandps %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512VL-LABEL: fabs_v8f32:
; X64-AVX512VL:       # %bb.0:
; X64-AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; X64-AVX512VL-NEXT:    retq
;
; X64-AVX512FP16-LABEL: fabs_v8f32:
; X64-AVX512FP16:       # %bb.0:
; X64-AVX512FP16-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; X64-AVX512FP16-NEXT:    retq
;
; X64-AVX512VLDQ-LABEL: fabs_v8f32:
; X64-AVX512VLDQ:       # %bb.0:
; X64-AVX512VLDQ-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; X64-AVX512VLDQ-NEXT:    retq
  %t = call <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
  ret <8 x float> %t
}
declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)

define <16 x half> @fabs_v16f16(ptr %p) nounwind {
; X86-SSE-LABEL: fabs_v16f16:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X86-SSE-NEXT:    movaps (%eax), %xmm0
; X86-SSE-NEXT:    andps %xmm1, %xmm0
; X86-SSE-NEXT:    andps 16(%eax), %xmm1
; X86-SSE-NEXT:    retl
;
; X86-AVX1-LABEL: fabs_v16f16:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: fabs_v16f16:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X86-AVX2-NEXT:    vpand (%eax), %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
;
; X86-AVX512-LABEL: fabs_v16f16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X86-AVX512-NEXT:    vpand (%eax), %ymm0, %ymm0
; X86-AVX512-NEXT:    retl
;
; X64-SSE-LABEL: fabs_v16f16:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X64-SSE-NEXT:    movaps (%rdi), %xmm0
; X64-SSE-NEXT:    andps %xmm1, %xmm0
; X64-SSE-NEXT:    andps 16(%rdi), %xmm1
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: fabs_v16f16:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: fabs_v16f16:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X64-AVX2-NEXT:    vpand (%rdi), %ymm0, %ymm0
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: fabs_v16f16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X64-AVX512-NEXT:    vpand (%rdi), %ymm0, %ymm0
; X64-AVX512-NEXT:    retq
  %v = load <16 x half>, ptr %p, align 32
  %nnv = call <16 x half> @llvm.fabs.v16f16(<16 x half> %v)
  ret <16 x half> %nnv
}
declare <16 x half> @llvm.fabs.v16f16(<16 x half> %p)

;
; 512-bit Vectors
;

define <8 x double> @fabs_v8f64(<8 x double> %p) nounwind {
; X86-SSE-LABEL: fabs_v8f64:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %ebp
; X86-SSE-NEXT:    movl %esp, %ebp
; X86-SSE-NEXT:    andl $-16, %esp
; X86-SSE-NEXT:    subl $16, %esp
; X86-SSE-NEXT:    movaps {{.*#+}} xmm3 = [NaN,NaN]
; X86-SSE-NEXT:    andps %xmm3, %xmm0
; X86-SSE-NEXT:    andps %xmm3, %xmm1
; X86-SSE-NEXT:    andps %xmm3, %xmm2
; X86-SSE-NEXT:    andps 8(%ebp), %xmm3
; X86-SSE-NEXT:    movl %ebp, %esp
; X86-SSE-NEXT:    popl %ebp
; X86-SSE-NEXT:    retl
;
; X86-AVX1OR2-LABEL: fabs_v8f64:
; X86-AVX1OR2:       # %bb.0:
; X86-AVX1OR2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN]
; X86-AVX1OR2-NEXT:    vandps %ymm2, %ymm0, %ymm0
; X86-AVX1OR2-NEXT:    vandps %ymm2, %ymm1, %ymm1
; X86-AVX1OR2-NEXT:    retl
;
; X86-AVX512VL-LABEL: fabs_v8f64:
; X86-AVX512VL:       # %bb.0:
; X86-AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm0, %zmm0
; X86-AVX512VL-NEXT:    retl
;
; X86-AVX512FP16-LABEL: fabs_v8f64:
; X86-AVX512FP16:       # %bb.0:
; X86-AVX512FP16-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm0, %zmm0
; X86-AVX512FP16-NEXT:    retl
;
; X86-AVX512VLDQ-LABEL: fabs_v8f64:
; X86-AVX512VLDQ:       # %bb.0:
; X86-AVX512VLDQ-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm0, %zmm0
; X86-AVX512VLDQ-NEXT:    retl
;
; X64-SSE-LABEL: fabs_v8f64:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps {{.*#+}} xmm4 = [NaN,NaN]
; X64-SSE-NEXT:    andps %xmm4, %xmm0
; X64-SSE-NEXT:    andps %xmm4, %xmm1
; X64-SSE-NEXT:    andps %xmm4, %xmm2
; X64-SSE-NEXT:    andps %xmm4, %xmm3
; X64-SSE-NEXT:    retq
;
; X64-AVX1OR2-LABEL: fabs_v8f64:
; X64-AVX1OR2:       # %bb.0:
; X64-AVX1OR2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN]
; X64-AVX1OR2-NEXT:    vandps %ymm2, %ymm0, %ymm0
; X64-AVX1OR2-NEXT:    vandps %ymm2, %ymm1, %ymm1
; X64-AVX1OR2-NEXT:    retq
;
; X64-AVX512VL-LABEL: fabs_v8f64:
; X64-AVX512VL:       # %bb.0:
; X64-AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; X64-AVX512VL-NEXT:    retq
;
; X64-AVX512FP16-LABEL: fabs_v8f64:
; X64-AVX512FP16:       # %bb.0:
; X64-AVX512FP16-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; X64-AVX512FP16-NEXT:    retq
;
; X64-AVX512VLDQ-LABEL: fabs_v8f64:
; X64-AVX512VLDQ:       # %bb.0:
; X64-AVX512VLDQ-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; X64-AVX512VLDQ-NEXT:    retq
  %t = call <8 x double> @llvm.fabs.v8f64(<8 x double> %p)
  ret <8 x double> %t
}
declare <8 x double> @llvm.fabs.v8f64(<8 x double> %p)

define <16 x float> @fabs_v16f32(<16 x float> %p) nounwind {
; X86-SSE-LABEL: fabs_v16f32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %ebp
; X86-SSE-NEXT:    movl %esp, %ebp
; X86-SSE-NEXT:    andl $-16, %esp
; X86-SSE-NEXT:    subl $16, %esp
; X86-SSE-NEXT:    movaps {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN]
; X86-SSE-NEXT:    andps %xmm3, %xmm0
; X86-SSE-NEXT:    andps %xmm3, %xmm1
; X86-SSE-NEXT:    andps %xmm3, %xmm2
; X86-SSE-NEXT:    andps 8(%ebp), %xmm3
; X86-SSE-NEXT:    movl %ebp, %esp
; X86-SSE-NEXT:    popl %ebp
; X86-SSE-NEXT:    retl
;
; X86-AVX1OR2-LABEL: fabs_v16f32:
; X86-AVX1OR2:       # %bb.0:
; X86-AVX1OR2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X86-AVX1OR2-NEXT:    vandps %ymm2, %ymm0, %ymm0
; X86-AVX1OR2-NEXT:    vandps %ymm2, %ymm1, %ymm1
; X86-AVX1OR2-NEXT:    retl
;
; X86-AVX512VL-LABEL: fabs_v16f32:
; X86-AVX512VL:       # %bb.0:
; X86-AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; X86-AVX512VL-NEXT:    retl
;
; X86-AVX512FP16-LABEL: fabs_v16f32:
; X86-AVX512FP16:       # %bb.0:
; X86-AVX512FP16-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; X86-AVX512FP16-NEXT:    retl
;
; X86-AVX512VLDQ-LABEL: fabs_v16f32:
; X86-AVX512VLDQ:       # %bb.0:
; X86-AVX512VLDQ-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; X86-AVX512VLDQ-NEXT:    retl
;
; X64-SSE-LABEL: fabs_v16f32:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN]
; X64-SSE-NEXT:    andps %xmm4, %xmm0
; X64-SSE-NEXT:    andps %xmm4, %xmm1
; X64-SSE-NEXT:    andps %xmm4, %xmm2
; X64-SSE-NEXT:    andps %xmm4, %xmm3
; X64-SSE-NEXT:    retq
;
; X64-AVX1OR2-LABEL: fabs_v16f32:
; X64-AVX1OR2:       # %bb.0:
; X64-AVX1OR2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X64-AVX1OR2-NEXT:    vandps %ymm2, %ymm0, %ymm0
; X64-AVX1OR2-NEXT:    vandps %ymm2, %ymm1, %ymm1
; X64-AVX1OR2-NEXT:    retq
;
; X64-AVX512VL-LABEL: fabs_v16f32:
; X64-AVX512VL:       # %bb.0:
; X64-AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-AVX512VL-NEXT:    retq
;
; X64-AVX512FP16-LABEL: fabs_v16f32:
; X64-AVX512FP16:       # %bb.0:
; X64-AVX512FP16-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-AVX512FP16-NEXT:    retq
;
; X64-AVX512VLDQ-LABEL: fabs_v16f32:
; X64-AVX512VLDQ:       # %bb.0:
; X64-AVX512VLDQ-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-AVX512VLDQ-NEXT:    retq
  %t = call <16 x float> @llvm.fabs.v16f32(<16 x float> %p)
  ret <16 x float> %t
}
declare <16 x float> @llvm.fabs.v16f32(<16 x float> %p)

define <32 x half> @fabs_v32f16(ptr %p) nounwind {
; X86-SSE-LABEL: fabs_v32f16:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movaps {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X86-SSE-NEXT:    movaps (%eax), %xmm0
; X86-SSE-NEXT:    andps %xmm3, %xmm0
; X86-SSE-NEXT:    movaps 16(%eax), %xmm1
; X86-SSE-NEXT:    andps %xmm3, %xmm1
; X86-SSE-NEXT:    movaps 32(%eax), %xmm2
; X86-SSE-NEXT:    andps %xmm3, %xmm2
; X86-SSE-NEXT:    andps 48(%eax), %xmm3
; X86-SSE-NEXT:    retl
;
; X86-AVX1-LABEL: fabs_v32f16:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT:    vbroadcastss {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X86-AVX1-NEXT:    vandps (%eax), %ymm1, %ymm0
; X86-AVX1-NEXT:    vandps 32(%eax), %ymm1, %ymm1
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: fabs_v32f16:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X86-AVX2-NEXT:    vpand (%eax), %ymm1, %ymm0
; X86-AVX2-NEXT:    vpand 32(%eax), %ymm1, %ymm1
; X86-AVX2-NEXT:    retl
;
; X86-AVX512VL-LABEL: fabs_v32f16:
; X86-AVX512VL:       # %bb.0:
; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512VL-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X86-AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512VL-NEXT:    vpandq (%eax), %zmm0, %zmm0
; X86-AVX512VL-NEXT:    retl
;
; X86-AVX512FP16-LABEL: fabs_v32f16:
; X86-AVX512FP16:       # %bb.0:
; X86-AVX512FP16-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512FP16-NEXT:    vpbroadcastw {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X86-AVX512FP16-NEXT:    vpandq (%eax), %zmm0, %zmm0
; X86-AVX512FP16-NEXT:    retl
;
; X86-AVX512VLDQ-LABEL: fabs_v32f16:
; X86-AVX512VLDQ:       # %bb.0:
; X86-AVX512VLDQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512VLDQ-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X86-AVX512VLDQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512VLDQ-NEXT:    vpandq (%eax), %zmm0, %zmm0
; X86-AVX512VLDQ-NEXT:    retl
;
; X64-SSE-LABEL: fabs_v32f16:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X64-SSE-NEXT:    movaps (%rdi), %xmm0
; X64-SSE-NEXT:    andps %xmm3, %xmm0
; X64-SSE-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE-NEXT:    andps %xmm3, %xmm1
; X64-SSE-NEXT:    movaps 32(%rdi), %xmm2
; X64-SSE-NEXT:    andps %xmm3, %xmm2
; X64-SSE-NEXT:    andps 48(%rdi), %xmm3
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: fabs_v32f16:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vbroadcastss {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X64-AVX1-NEXT:    vandps (%rdi), %ymm1, %ymm0
; X64-AVX1-NEXT:    vandps 32(%rdi), %ymm1, %ymm1
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: fabs_v32f16:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X64-AVX2-NEXT:    vpand (%rdi), %ymm1, %ymm0
; X64-AVX2-NEXT:    vpand 32(%rdi), %ymm1, %ymm1
; X64-AVX2-NEXT:    retq
;
; X64-AVX512VL-LABEL: fabs_v32f16:
; X64-AVX512VL:       # %bb.0:
; X64-AVX512VL-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X64-AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512VL-NEXT:    vpandq (%rdi), %zmm0, %zmm0
; X64-AVX512VL-NEXT:    retq
;
; X64-AVX512FP16-LABEL: fabs_v32f16:
; X64-AVX512FP16:       # %bb.0:
; X64-AVX512FP16-NEXT:    vpbroadcastw {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X64-AVX512FP16-NEXT:    vpandq (%rdi), %zmm0, %zmm0
; X64-AVX512FP16-NEXT:    retq
;
; X64-AVX512VLDQ-LABEL: fabs_v32f16:
; X64-AVX512VLDQ:       # %bb.0:
; X64-AVX512VLDQ-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; X64-AVX512VLDQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512VLDQ-NEXT:    vpandq (%rdi), %zmm0, %zmm0
; X64-AVX512VLDQ-NEXT:    retq
  %v = load <32 x half>, ptr %p, align 64
  %nnv = call <32 x half> @llvm.fabs.v32f16(<32 x half> %v)
  ret <32 x half> %nnv
}
declare <32 x half> @llvm.fabs.v32f16(<32 x half> %p)

; PR20354: when generating code for a vector fabs op,
; make sure that we're only turning off the sign bit of each float value.
; No constant pool loads or vector ops are needed for the fabs of a
; bitcasted integer constant; we should just return an integer constant
; that has the sign bits turned off.
;
; So instead of something like this:
;    movabsq (constant pool load of mask for sign bits)
;    vmovq   (move from integer register to vector/fp register)
;    vandps  (mask off sign bits)
;    vmovq   (move vector/fp register back to integer return register)
;
; We should generate:
;    mov     (put constant value in return register)
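;
; As a concrete check of the folding arithmetic (values taken from
; fabs_v2f32_1 below): clearing bit 31 of each 32-bit lane of
; 0xFFFFFFFF00000000 gives 0x7FFFFFFF00000000, which is exactly the
; immediate the X64 codegen is expected to materialize with a single mov.
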
define i64 @fabs_v2f32_1() nounwind {
; X86-LABEL: fabs_v2f32_1:
; X86:       # %bb.0:
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
; X86-NEXT:    retl
;
; X64-LABEL: fabs_v2f32_1:
; X64:       # %bb.0:
; X64-NEXT:    movabsq $9223372032559808512, %rax # imm = 0x7FFFFFFF00000000
; X64-NEXT:    retq
  %bitcast = bitcast i64 18446744069414584320 to <2 x float> ; 0xFFFF_FFFF_0000_0000
  %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
  %ret = bitcast <2 x float> %fabs to i64
  ret i64 %ret
}

define i64 @fabs_v2f32_2() nounwind {
; X86-LABEL: fabs_v2f32_2:
; X86:       # %bb.0:
; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    retl
;
; X64-LABEL: fabs_v2f32_2:
; X64:       # %bb.0:
; X64-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
; X64-NEXT:    retq
  %bitcast = bitcast i64 4294967295 to <2 x float> ; 0x0000_0000_FFFF_FFFF
  %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
  %ret = bitcast <2 x float> %fabs to i64
  ret i64 %ret
}

declare <2 x float> @llvm.fabs.v2f32(<2 x float> %p)

; PR70947 - remove duplicate xmm/ymm constant loads
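; The expectation in the AVX checks below is that the sign-bit mask is
; materialized once as a ymm broadcast and its low xmm half is reused for
; the trailing 128-bit fabs, rather than loading the same constant twice.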
define void @PR70947(ptr %src, ptr %dst) nounwind {
; X86-SSE-LABEL: PR70947:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movups (%ecx), %xmm0
; X86-SSE-NEXT:    movups 32(%ecx), %xmm1
; X86-SSE-NEXT:    movaps {{.*#+}} xmm2 = [NaN,NaN]
; X86-SSE-NEXT:    andps %xmm2, %xmm0
; X86-SSE-NEXT:    andps %xmm2, %xmm1
; X86-SSE-NEXT:    movups %xmm0, (%eax)
; X86-SSE-NEXT:    movups %xmm1, 16(%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: PR70947:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN]
; X86-AVX-NEXT:    vandps (%ecx), %ymm0, %ymm1
; X86-AVX-NEXT:    vandps 32(%ecx), %xmm0, %xmm0
; X86-AVX-NEXT:    vmovups %ymm1, (%eax)
; X86-AVX-NEXT:    vmovups %xmm0, 16(%eax)
; X86-AVX-NEXT:    vzeroupper
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: PR70947:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movups (%rdi), %xmm0
; X64-SSE-NEXT:    movups 32(%rdi), %xmm1
; X64-SSE-NEXT:    movaps {{.*#+}} xmm2 = [NaN,NaN]
; X64-SSE-NEXT:    andps %xmm2, %xmm0
; X64-SSE-NEXT:    andps %xmm2, %xmm1
; X64-SSE-NEXT:    movups %xmm0, (%rsi)
; X64-SSE-NEXT:    movups %xmm1, 16(%rsi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: PR70947:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN]
; X64-AVX-NEXT:    vandps (%rdi), %ymm0, %ymm1
; X64-AVX-NEXT:    vandps 32(%rdi), %xmm0, %xmm0
; X64-AVX-NEXT:    vmovups %ymm1, (%rsi)
; X64-AVX-NEXT:    vmovups %xmm0, 16(%rsi)
; X64-AVX-NEXT:    vzeroupper
; X64-AVX-NEXT:    retq
  %src4 = getelementptr inbounds double, ptr %src, i64 4
  %dst4 = getelementptr inbounds i32, ptr %dst, i64 4
  %ld0 = load <4 x double>, ptr %src, align 8
  %ld4 = load <2 x double>, ptr %src4, align 8
  %fabs0 = tail call <4 x double> @llvm.fabs.v4f64(<4 x double> %ld0)
  %fabs4 = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %ld4)
  store <4 x double> %fabs0, ptr %dst, align 4
  store <2 x double> %fabs4, ptr %dst4, align 4
  ret void
}