; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-freebsd -mattr=+avx | FileCheck %s --check-prefixes=AVX1
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512F
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=AVX512FP16
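
; llvm.round.* rounds halfway cases away from zero, a mode x86 has no direct
; instruction for. With SSE4.1 and later the lowering below computes
; trunc(x + copysign(value-just-below-0.5, x)) using round*/vrndscale* with
; immediate 11 (round toward zero, suppress exceptions); plain SSE2 falls back
; to the roundf/round libcalls.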
define half @round_f16(half %h) {
; SSE2-LABEL: round_f16:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: callq __extendhfsf2@PLT
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: callq __truncsfhf2@PLT
; SSE2-NEXT: popq %rax
; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE41-LABEL: round_f16:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pushq %rax
; SSE41-NEXT: .cfi_def_cfa_offset 16
; SSE41-NEXT: callq __extendhfsf2@PLT
; SSE41-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT: andps %xmm0, %xmm1
; SSE41-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: xorps %xmm0, %xmm0
; SSE41-NEXT: roundss $11, %xmm1, %xmm0
; SSE41-NEXT: callq __truncsfhf2@PLT
; SSE41-NEXT: popq %rax
; SSE41-NEXT: .cfi_def_cfa_offset 8
; SSE41-NEXT: retq
;
; AVX1-LABEL: round_f16:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: pushq %rax
; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX1-NEXT: vorps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: popq %rax
; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: retq
;
; AVX512F-LABEL: round_f16:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpextrw $0, %xmm0, %eax
; AVX512F-NEXT: movzwl %ax, %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512F-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
; AVX512F-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512FP16-LABEL: round_f16:
; AVX512FP16: ## %bb.0: ## %entry
; AVX512FP16-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX512FP16-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1]
; AVX512FP16-NEXT: vpternlogq $248, %xmm1, %xmm0, %xmm2
; AVX512FP16-NEXT: vaddsh %xmm2, %xmm0, %xmm0
; AVX512FP16-NEXT: vrndscalesh $11, %xmm0, %xmm0, %xmm0
; AVX512FP16-NEXT: retq
entry:
  %a = call half @llvm.round.f16(half %h)
  ret half %a
}

define float @round_f32(float %x) {
; SSE2-LABEL: round_f32:
; SSE2: # %bb.0:
; SSE2-NEXT: jmp roundf@PLT # TAILCALL
;
; SSE41-LABEL: round_f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT: andps %xmm0, %xmm1
; SSE41-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: xorps %xmm0, %xmm0
; SSE41-NEXT: roundss $11, %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: round_f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX1-NEXT: vorps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: round_f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512F-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
; AVX512F-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512FP16-LABEL: round_f32:
; AVX512FP16: ## %bb.0:
; AVX512FP16-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512FP16-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
; AVX512FP16-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512FP16-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; AVX512FP16-NEXT: retq
  %a = call float @llvm.round.f32(float %x)
  ret float %a
}

define double @round_f64(double %x) {
; SSE2-LABEL: round_f64:
; SSE2: # %bb.0:
; SSE2-NEXT: jmp round@PLT # TAILCALL
;
; SSE41-LABEL: round_f64:
; SSE41: # %bb.0:
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
; SSE41-NEXT: andpd %xmm0, %xmm1
; SSE41-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: addsd %xmm0, %xmm1
; SSE41-NEXT: xorps %xmm0, %xmm0
; SSE41-NEXT: roundsd $11, %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: round_f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vorpd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: round_f64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
; AVX512F-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512FP16-LABEL: round_f64:
; AVX512FP16: ## %bb.0:
; AVX512FP16-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512FP16-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
; AVX512FP16-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512FP16-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; AVX512FP16-NEXT: retq
  %a = call double @llvm.round.f64(double %x)
  ret double %a
}

define <4 x float> @round_v4f32(<4 x float> %x) {
; SSE2-LABEL: round_v4f32:
; SSE2: # %bb.0:
; SSE2-NEXT: subq $56, %rsp
; SSE2-NEXT: .cfi_def_cfa_offset 64
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: # xmm1 = xmm1[0],mem[0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: addq $56, %rsp
; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE41-LABEL: round_v4f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT: andps %xmm0, %xmm1
; SSE41-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: roundps $11, %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: round_v4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vroundps $11, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: round_v4f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512F-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
; AVX512F-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vroundps $11, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512FP16-LABEL: round_v4f32:
; AVX512FP16: ## %bb.0:
; AVX512FP16-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512FP16-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
; AVX512FP16-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512FP16-NEXT: vroundps $11, %xmm0, %xmm0
; AVX512FP16-NEXT: retq
  %a = call <4 x float> @llvm.round.v4f32(<4 x float> %x)
  ret <4 x float> %a
}

define <2 x double> @round_v2f64(<2 x double> %x) {
; SSE2-LABEL: round_v2f64:
; SSE2: # %bb.0:
; SSE2-NEXT: subq $40, %rsp
; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: callq round@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq round@PLT
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: addq $40, %rsp
; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE41-LABEL: round_v2f64:
; SSE41: # %bb.0:
; SSE41-NEXT: movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
; SSE41-NEXT: andpd %xmm0, %xmm1
; SSE41-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: addpd %xmm0, %xmm1
; SSE41-NEXT: roundpd $11, %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: round_v2f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vroundpd $11, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: round_v2f64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
; AVX512F-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vroundpd $11, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512FP16-LABEL: round_v2f64:
; AVX512FP16: ## %bb.0:
; AVX512FP16-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512FP16-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
; AVX512FP16-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512FP16-NEXT: vroundpd $11, %xmm0, %xmm0
; AVX512FP16-NEXT: retq
  %a = call <2 x double> @llvm.round.v2f64(<2 x double> %x)
  ret <2 x double> %a
}

define <8 x float> @round_v8f32(<8 x float> %x) {
; SSE2-LABEL: round_v8f32:
; SSE2: # %bb.0:
; SSE2-NEXT: subq $72, %rsp
; SSE2-NEXT: .cfi_def_cfa_offset 80
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: # xmm1 = xmm1[0],mem[0]
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: # xmm1 = xmm1[0],mem[0]
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: addq $72, %rsp
; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE41-LABEL: round_v8f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT: movaps %xmm0, %xmm3
; SSE41-NEXT: andps %xmm2, %xmm3
; SSE41-NEXT: movaps {{.*#+}} xmm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; SSE41-NEXT: orps %xmm4, %xmm3
; SSE41-NEXT: addps %xmm0, %xmm3
; SSE41-NEXT: roundps $11, %xmm3, %xmm0
; SSE41-NEXT: andps %xmm1, %xmm2
; SSE41-NEXT: orps %xmm4, %xmm2
; SSE41-NEXT: addps %xmm1, %xmm2
; SSE41-NEXT: roundps $11, %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: round_v8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vroundps $11, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: round_v8f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512F-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vroundps $11, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512FP16-LABEL: round_v8f32:
; AVX512FP16: ## %bb.0:
; AVX512FP16-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512FP16-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
; AVX512FP16-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX512FP16-NEXT: vroundps $11, %ymm0, %ymm0
; AVX512FP16-NEXT: retq
  %a = call <8 x float> @llvm.round.v8f32(<8 x float> %x)
  ret <8 x float> %a
}

define <4 x double> @round_v4f64(<4 x double> %x) {
; SSE2-LABEL: round_v4f64:
; SSE2: # %bb.0:
; SSE2-NEXT: subq $56, %rsp
; SSE2-NEXT: .cfi_def_cfa_offset 64
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: callq round@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq round@PLT
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq round@PLT
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq round@PLT
; SSE2-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: addq $56, %rsp
; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE41-LABEL: round_v4f64:
; SSE41: # %bb.0:
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
; SSE41-NEXT: movapd %xmm0, %xmm3
; SSE41-NEXT: andpd %xmm2, %xmm3
; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4.9999999999999994E-1,4.9999999999999994E-1]
; SSE41-NEXT: orpd %xmm4, %xmm3
; SSE41-NEXT: addpd %xmm0, %xmm3
; SSE41-NEXT: roundpd $11, %xmm3, %xmm0
; SSE41-NEXT: andpd %xmm1, %xmm2
; SSE41-NEXT: orpd %xmm4, %xmm2
; SSE41-NEXT: addpd %xmm1, %xmm2
; SSE41-NEXT: roundpd $11, %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: round_v4f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX1-NEXT: vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: round_v4f64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1
; AVX512F-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vroundpd $11, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512FP16-LABEL: round_v4f64:
; AVX512FP16: ## %bb.0:
; AVX512FP16-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512FP16-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1
; AVX512FP16-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX512FP16-NEXT: vroundpd $11, %ymm0, %ymm0
; AVX512FP16-NEXT: retq
  %a = call <4 x double> @llvm.round.v4f64(<4 x double> %x)
  ret <4 x double> %a
}

define <16 x float> @round_v16f32(<16 x float> %x) {
; SSE2-LABEL: round_v16f32:
; SSE2: # %bb.0:
; SSE2-NEXT: subq $104, %rsp
; SSE2-NEXT: .cfi_def_cfa_offset 112
; SSE2-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: # xmm1 = xmm1[0],mem[0]
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: # xmm1 = xmm1[0],mem[0]
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: # xmm1 = xmm1[0],mem[0]
; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT: callq roundf@PLT
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE2-NEXT: # xmm3 = xmm3[0],mem[0]
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: addq $104, %rsp
; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE41-LABEL: round_v16f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT: movaps %xmm0, %xmm5
; SSE41-NEXT: andps %xmm4, %xmm5
; SSE41-NEXT: movaps {{.*#+}} xmm6 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; SSE41-NEXT: orps %xmm6, %xmm5
; SSE41-NEXT: addps %xmm0, %xmm5
; SSE41-NEXT: roundps $11, %xmm5, %xmm0
; SSE41-NEXT: movaps %xmm1, %xmm5
; SSE41-NEXT: andps %xmm4, %xmm5
; SSE41-NEXT: orps %xmm6, %xmm5
; SSE41-NEXT: addps %xmm1, %xmm5
; SSE41-NEXT: roundps $11, %xmm5, %xmm1
; SSE41-NEXT: movaps %xmm2, %xmm5
; SSE41-NEXT: andps %xmm4, %xmm5
; SSE41-NEXT: orps %xmm6, %xmm5
; SSE41-NEXT: addps %xmm2, %xmm5
; SSE41-NEXT: roundps $11, %xmm5, %xmm2
; SSE41-NEXT: andps %xmm3, %xmm4
; SSE41-NEXT: orps %xmm6, %xmm4
; SSE41-NEXT: addps %xmm3, %xmm4
; SSE41-NEXT: roundps $11, %xmm4, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: round_v16f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm3
; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX1-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vaddps %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vroundps $11, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm2
; AVX1-NEXT: vorps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vroundps $11, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX512F-LABEL: round_v16f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512F-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1
; AVX512F-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vrndscaleps $11, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512FP16-LABEL: round_v16f32:
; AVX512FP16: ## %bb.0:
; AVX512FP16-NEXT: vpbroadcastd {{.*#+}} zmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512FP16-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1
; AVX512FP16-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512FP16-NEXT: vrndscaleps $11, %zmm0, %zmm0
; AVX512FP16-NEXT: retq
  %a = call <16 x float> @llvm.round.v16f32(<16 x float> %x)
  ret <16 x float> %a
}

define <8 x double> @round_v8f64(<8 x double> %x) {
; SSE2-LABEL: round_v8f64:
; SSE2: # %bb.0:
; SSE2-NEXT: subq $88, %rsp
; SSE2-NEXT: .cfi_def_cfa_offset 96
; SSE2-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: callq round@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq round@PLT
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq round@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq round@PLT
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq round@PLT
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq round@PLT
; SSE2-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq round@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: callq round@PLT
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: addq $88, %rsp
; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE41-LABEL: round_v8f64:
; SSE41: # %bb.0:
; SSE41-NEXT: movapd {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0]
; SSE41-NEXT: movapd %xmm0, %xmm5
; SSE41-NEXT: andpd %xmm4, %xmm5
; SSE41-NEXT: movapd {{.*#+}} xmm6 = [4.9999999999999994E-1,4.9999999999999994E-1]
; SSE41-NEXT: orpd %xmm6, %xmm5
; SSE41-NEXT: addpd %xmm0, %xmm5
; SSE41-NEXT: roundpd $11, %xmm5, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm5
; SSE41-NEXT: andpd %xmm4, %xmm5
; SSE41-NEXT: orpd %xmm6, %xmm5
; SSE41-NEXT: addpd %xmm1, %xmm5
; SSE41-NEXT: roundpd $11, %xmm5, %xmm1
; SSE41-NEXT: movapd %xmm2, %xmm5
; SSE41-NEXT: andpd %xmm4, %xmm5
; SSE41-NEXT: orpd %xmm6, %xmm5
; SSE41-NEXT: addpd %xmm2, %xmm5
; SSE41-NEXT: roundpd $11, %xmm5, %xmm2
; SSE41-NEXT: andpd %xmm3, %xmm4
; SSE41-NEXT: orpd %xmm6, %xmm4
; SSE41-NEXT: addpd %xmm3, %xmm4
; SSE41-NEXT: roundpd $11, %xmm4, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: round_v8f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm3
; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
; AVX1-NEXT: vorpd %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0
; AVX1-NEXT: vandpd %ymm2, %ymm1, %ymm2
; AVX1-NEXT: vorpd %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vroundpd $11, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX512F-LABEL: round_v8f64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
; AVX512F-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vrndscalepd $11, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512FP16-LABEL: round_v8f64:
; AVX512FP16: ## %bb.0:
; AVX512FP16-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
; AVX512FP16-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
; AVX512FP16-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512FP16-NEXT: vrndscalepd $11, %zmm0, %zmm0
; AVX512FP16-NEXT: retq
  %a = call <8 x double> @llvm.round.v8f64(<8 x double> %x)
  ret <8 x double> %a
}

declare half @llvm.round.f16(half)
declare float @llvm.round.f32(float)
declare double @llvm.round.f64(double)
declare <4 x float> @llvm.round.v4f32(<4 x float>)
declare <2 x double> @llvm.round.v2f64(<2 x double>)
declare <8 x float> @llvm.round.v8f32(<8 x float>)
declare <4 x double> @llvm.round.v4f64(<4 x double>)
declare <16 x float> @llvm.round.v16f32(<16 x float>)
declare <8 x double> @llvm.round.v8f64(<8 x double>)