1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefixes=SSE,SSE-LINUX
3 ; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefixes=SSE,SSE-WIN
4 ; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx -mcpu=corei7-avx | FileCheck %s --check-prefixes=AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512vl -mcpu=skx | FileCheck %s --check-prefixes=AVX,AVX512VL
; t1 loads a float through %x and extends it to double (no optsize).
; The checks expect the load to stay a separate movss/vmovss that fills xmm0,
; followed by a register-to-register cvtss2sd/vcvtss2sd, i.e. the load is not
; folded into the convert as a memory operand.
; NOTE(review): presumably this keeps the convert from depending on the
; previous contents of xmm0 -- confirm against the X86 backend.
7 define dso_local double @t1(float* nocapture %x) nounwind readonly ssp {
9 ; SSE: # %bb.0: # %entry
10 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
11 ; SSE-NEXT: cvtss2sd %xmm0, %xmm0
15 ; AVX: # %bb.0: # %entry
16 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
17 ; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
21 %0 = load float, float* %x, align 4
22 %1 = fpext float %0 to double
; t2 loads a double through %x and truncates it to float, with optsize set.
; Here the checks expect the load to be folded into the convert:
; cvtsd2ss/vcvtsd2ss takes its source straight from memory
; ((%rdi) on Linux, (%rcx) on the win32 triples).
26 define dso_local float @t2(double* nocapture %x) nounwind readonly ssp optsize {
27 ; SSE-LINUX-LABEL: t2:
28 ; SSE-LINUX: # %bb.0: # %entry
29 ; SSE-LINUX-NEXT: cvtsd2ss (%rdi), %xmm0
30 ; SSE-LINUX-NEXT: retq
33 ; SSE-WIN: # %bb.0: # %entry
34 ; SSE-WIN-NEXT: cvtsd2ss (%rcx), %xmm0
38 ; AVX: # %bb.0: # %entry
39 ; AVX-NEXT: vcvtsd2ss (%rcx), %xmm0, %xmm0
42 %0 = load double, double* %x, align 8
43 %1 = fptrunc double %0 to float
; squirtf loads a float through %x and takes its square root via
; @llvm.sqrt.f32 (no optsize).
; The checks expect a separate movss/vmovss load into xmm0 followed by a
; register-to-register sqrtss/vsqrtss, not a sqrt with a memory operand.
47 define dso_local float @squirtf(float* %x) nounwind {
49 ; SSE: # %bb.0: # %entry
50 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
51 ; SSE-NEXT: sqrtss %xmm0, %xmm0
55 ; AVX: # %bb.0: # %entry
56 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
57 ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
60 %z = load float, float* %x
61 %t = call float @llvm.sqrt.f32(float %z)
; squirt is the double-precision counterpart of squirtf: it loads a double
; through %x and calls @llvm.sqrt.f64 (no optsize).
; The checks expect a separate movsd/vmovsd load into xmm0 followed by a
; register-to-register sqrtsd/vsqrtsd.
65 define dso_local double @squirt(double* %x) nounwind {
67 ; SSE: # %bb.0: # %entry
68 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
69 ; SSE-NEXT: sqrtsd %xmm0, %xmm0
73 ; AVX: # %bb.0: # %entry
74 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
75 ; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
78 %z = load double, double* %x
79 %t = call double @llvm.sqrt.f64(double %z)
; squirtf_size has the same body as squirtf but carries the optsize attribute.
; With optsize the checks expect the load to be folded: sqrtss/vsqrtss reads
; its operand directly from memory ((%rdi) on Linux, (%rcx) on win32).
83 define dso_local float @squirtf_size(float* %x) nounwind optsize {
84 ; SSE-LINUX-LABEL: squirtf_size:
85 ; SSE-LINUX: # %bb.0: # %entry
86 ; SSE-LINUX-NEXT: sqrtss (%rdi), %xmm0
87 ; SSE-LINUX-NEXT: retq
89 ; SSE-WIN-LABEL: squirtf_size:
90 ; SSE-WIN: # %bb.0: # %entry
91 ; SSE-WIN-NEXT: sqrtss (%rcx), %xmm0
94 ; AVX-LABEL: squirtf_size:
95 ; AVX: # %bb.0: # %entry
96 ; AVX-NEXT: vsqrtss (%rcx), %xmm0, %xmm0
99 %z = load float, float* %x
100 %t = call float @llvm.sqrt.f32(float %z)
; squirt_size has the same body as squirt but carries the optsize attribute.
; With optsize the checks expect the load to be folded: sqrtsd/vsqrtsd reads
; its operand directly from memory ((%rdi) on Linux, (%rcx) on win32).
104 define dso_local double @squirt_size(double* %x) nounwind optsize {
105 ; SSE-LINUX-LABEL: squirt_size:
106 ; SSE-LINUX: # %bb.0: # %entry
107 ; SSE-LINUX-NEXT: sqrtsd (%rdi), %xmm0
108 ; SSE-LINUX-NEXT: retq
110 ; SSE-WIN-LABEL: squirt_size:
111 ; SSE-WIN: # %bb.0: # %entry
112 ; SSE-WIN-NEXT: sqrtsd (%rcx), %xmm0
115 ; AVX-LABEL: squirt_size:
116 ; AVX: # %bb.0: # %entry
117 ; AVX-NEXT: vsqrtsd (%rcx), %xmm0, %xmm0
120 %z = load double, double* %x
121 %t = call double @llvm.sqrt.f64(double %z)
; Intrinsic declarations for the square-root calls made by the squirt* tests.
125 declare float @llvm.sqrt.f32(float)
126 declare double @llvm.sqrt.f64(double)
128 ; This loop contains two cvtsi2ss instructions that update the same xmm
129 ; register. Verify that the break false dependency fix pass breaks those
130 ; dependencies by inserting xorps instructions.
; Expected codegen for loopdep1, as encoded in the checks below:
;  - SSE runs: inside the loop each cvtsi2ss is immediately preceded by an
;    xorps that zeroes its destination (%xmm2, then %xmm3).
;  - AVX runs: no xor appears inside the loop; vcvtsi2ss takes %xmm4 (AVX1 run)
;    or %xmm3 (AVX512VL run) as its register source operand, and the loop body
;    never writes that register.
132 define dso_local float @loopdep1(i32 %m) nounwind uwtable readnone ssp {
133 ; SSE-LINUX-LABEL: loopdep1:
134 ; SSE-LINUX: # %bb.0: # %entry
135 ; SSE-LINUX-NEXT: testl %edi, %edi
136 ; SSE-LINUX-NEXT: je .LBB6_1
137 ; SSE-LINUX-NEXT: # %bb.2: # %for.body.preheader
138 ; SSE-LINUX-NEXT: movl $1, %eax
139 ; SSE-LINUX-NEXT: xorps %xmm0, %xmm0
140 ; SSE-LINUX-NEXT: xorps %xmm1, %xmm1
141 ; SSE-LINUX-NEXT: .p2align 4, 0x90
142 ; SSE-LINUX-NEXT: .LBB6_3: # %for.body
143 ; SSE-LINUX-NEXT: # =>This Inner Loop Header: Depth=1
144 ; SSE-LINUX-NEXT: xorps %xmm2, %xmm2
145 ; SSE-LINUX-NEXT: cvtsi2ss %eax, %xmm2
146 ; SSE-LINUX-NEXT: xorps %xmm3, %xmm3
147 ; SSE-LINUX-NEXT: cvtsi2ss %edi, %xmm3
148 ; SSE-LINUX-NEXT: addss %xmm2, %xmm0
149 ; SSE-LINUX-NEXT: addss %xmm3, %xmm1
150 ; SSE-LINUX-NEXT: incl %eax
151 ; SSE-LINUX-NEXT: decl %edi
152 ; SSE-LINUX-NEXT: jne .LBB6_3
153 ; SSE-LINUX-NEXT: # %bb.4: # %for.end
154 ; SSE-LINUX-NEXT: subss %xmm1, %xmm0
155 ; SSE-LINUX-NEXT: retq
156 ; SSE-LINUX-NEXT: .LBB6_1:
157 ; SSE-LINUX-NEXT: xorps %xmm0, %xmm0
158 ; SSE-LINUX-NEXT: xorps %xmm1, %xmm1
159 ; SSE-LINUX-NEXT: subss %xmm1, %xmm0
160 ; SSE-LINUX-NEXT: retq
162 ; SSE-WIN-LABEL: loopdep1:
163 ; SSE-WIN: # %bb.0: # %entry
164 ; SSE-WIN-NEXT: testl %ecx, %ecx
165 ; SSE-WIN-NEXT: je .LBB6_1
166 ; SSE-WIN-NEXT: # %bb.2: # %for.body.preheader
167 ; SSE-WIN-NEXT: movl $1, %eax
168 ; SSE-WIN-NEXT: xorps %xmm0, %xmm0
169 ; SSE-WIN-NEXT: xorps %xmm1, %xmm1
170 ; SSE-WIN-NEXT: .p2align 4, 0x90
171 ; SSE-WIN-NEXT: .LBB6_3: # %for.body
172 ; SSE-WIN-NEXT: # =>This Inner Loop Header: Depth=1
173 ; SSE-WIN-NEXT: xorps %xmm2, %xmm2
174 ; SSE-WIN-NEXT: cvtsi2ss %eax, %xmm2
175 ; SSE-WIN-NEXT: xorps %xmm3, %xmm3
176 ; SSE-WIN-NEXT: cvtsi2ss %ecx, %xmm3
177 ; SSE-WIN-NEXT: addss %xmm2, %xmm0
178 ; SSE-WIN-NEXT: addss %xmm3, %xmm1
179 ; SSE-WIN-NEXT: incl %eax
180 ; SSE-WIN-NEXT: decl %ecx
181 ; SSE-WIN-NEXT: jne .LBB6_3
182 ; SSE-WIN-NEXT: # %bb.4: # %for.end
183 ; SSE-WIN-NEXT: subss %xmm1, %xmm0
185 ; SSE-WIN-NEXT: .LBB6_1:
186 ; SSE-WIN-NEXT: xorps %xmm0, %xmm0
187 ; SSE-WIN-NEXT: xorps %xmm1, %xmm1
188 ; SSE-WIN-NEXT: subss %xmm1, %xmm0
191 ; AVX1-LABEL: loopdep1:
192 ; AVX1: # %bb.0: # %entry
193 ; AVX1-NEXT: testl %ecx, %ecx
194 ; AVX1-NEXT: je .LBB6_1
195 ; AVX1-NEXT: # %bb.2: # %for.body.preheader
196 ; AVX1-NEXT: movl $1, %eax
197 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
198 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
199 ; AVX1-NEXT: .p2align 4, 0x90
200 ; AVX1-NEXT: .LBB6_3: # %for.body
201 ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
202 ; AVX1-NEXT: vcvtsi2ss %eax, %xmm4, %xmm2
203 ; AVX1-NEXT: vcvtsi2ss %ecx, %xmm4, %xmm3
204 ; AVX1-NEXT: vaddss %xmm2, %xmm0, %xmm0
205 ; AVX1-NEXT: vaddss %xmm3, %xmm1, %xmm1
206 ; AVX1-NEXT: incl %eax
207 ; AVX1-NEXT: decl %ecx
208 ; AVX1-NEXT: jne .LBB6_3
209 ; AVX1-NEXT: # %bb.4: # %for.end
210 ; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0
212 ; AVX1-NEXT: .LBB6_1:
213 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
214 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
215 ; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0
218 ; AVX512VL-LABEL: loopdep1:
219 ; AVX512VL: # %bb.0: # %entry
220 ; AVX512VL-NEXT: testl %ecx, %ecx
221 ; AVX512VL-NEXT: je .LBB6_1
222 ; AVX512VL-NEXT: # %bb.2: # %for.body.preheader
223 ; AVX512VL-NEXT: movl $1, %eax
224 ; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
225 ; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
226 ; AVX512VL-NEXT: .p2align 4, 0x90
227 ; AVX512VL-NEXT: .LBB6_3: # %for.body
228 ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1
229 ; AVX512VL-NEXT: vcvtsi2ss %eax, %xmm3, %xmm2
230 ; AVX512VL-NEXT: vaddss %xmm2, %xmm0, %xmm0
231 ; AVX512VL-NEXT: vcvtsi2ss %ecx, %xmm3, %xmm2
232 ; AVX512VL-NEXT: vaddss %xmm2, %xmm1, %xmm1
233 ; AVX512VL-NEXT: incl %eax
234 ; AVX512VL-NEXT: decl %ecx
235 ; AVX512VL-NEXT: jne .LBB6_3
236 ; AVX512VL-NEXT: # %bb.4: # %for.end
237 ; AVX512VL-NEXT: vsubss %xmm1, %xmm0, %xmm0
238 ; AVX512VL-NEXT: retq
239 ; AVX512VL-NEXT: .LBB6_1:
240 ; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
241 ; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
242 ; AVX512VL-NEXT: vsubss %xmm1, %xmm0, %xmm0
243 ; AVX512VL-NEXT: retq
; Skip the loop entirely when %m is zero.
245 %tobool3 = icmp eq i32 %m, 0
246 br i1 %tobool3, label %for.end, label %for.body
248 for.body: ; preds = %entry, %for.body
249 %m.addr.07 = phi i32 [ %dec, %for.body ], [ %m, %entry ]
250 %s1.06 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ]
251 %s2.05 = phi float [ %add2, %for.body ], [ 0.000000e+00, %entry ]
252 %n.04 = phi i32 [ %inc, %for.body ], [ 1, %entry ]
; Two independent int-to-float conversions per iteration: %n.04 counts up
; from 1 and %m.addr.07 counts down from %m; each feeds its own float sum.
253 %conv = sitofp i32 %n.04 to float
254 %add = fadd float %s1.06, %conv
255 %conv1 = sitofp i32 %m.addr.07 to float
256 %add2 = fadd float %s2.05, %conv1
257 %inc = add nsw i32 %n.04, 1
258 %dec = add nsw i32 %m.addr.07, -1
259 %tobool = icmp eq i32 %dec, 0
260 br i1 %tobool, label %for.end, label %for.body
262 for.end: ; preds = %for.body, %entry
; Both sums are 0.0 when the loop was skipped; the result is their difference.
263 %s1.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
264 %s2.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2, %for.body ]
265 %sub = fsub float %s1.0.lcssa, %s2.0.lcssa
269 ; rdar:15221834 False AVX register dependencies cause 5x slowdown on
270 ; flops-6. Make sure the unused register read by vcvtsi2sd is zeroed
271 ; to avoid cyclic dependence on a write to the same register in a
272 ; previous iteration.
; Expected codegen for loopdep2, as encoded in the checks below:
;  - SSE runs: xorps zeroes %xmm0 right before cvtsi2sd writes it.
;  - AVX runs: vxorps zeroes %xmm1, which vcvtsi2sd then takes as its register
;    source operand while writing %xmm0.
;  - The converted value is spilled to the stack before the inline asm and
;    reloaded after it, since the asm's clobber list names every xmm register.
;  - On the win32 triples the prologue/epilogue save and restore xmm6-xmm15.
274 define i64 @loopdep2(i64* nocapture %x, double* nocapture %y) nounwind {
275 ; SSE-LINUX-LABEL: loopdep2:
276 ; SSE-LINUX: # %bb.0: # %entry
277 ; SSE-LINUX-NEXT: movq (%rdi), %rax
278 ; SSE-LINUX-NEXT: movl $1, %ecx
279 ; SSE-LINUX-NEXT: .p2align 4, 0x90
280 ; SSE-LINUX-NEXT: .LBB7_1: # %loop
281 ; SSE-LINUX-NEXT: # =>This Inner Loop Header: Depth=1
282 ; SSE-LINUX-NEXT: xorps %xmm0, %xmm0
283 ; SSE-LINUX-NEXT: cvtsi2sd %rcx, %xmm0
284 ; SSE-LINUX-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
285 ; SSE-LINUX-NEXT: #APP
286 ; SSE-LINUX-NEXT: #NO_APP
287 ; SSE-LINUX-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
288 ; SSE-LINUX-NEXT: # xmm0 = mem[0],zero
289 ; SSE-LINUX-NEXT: addsd (%rsi), %xmm0
290 ; SSE-LINUX-NEXT: cvttsd2si %xmm0, %rdx
291 ; SSE-LINUX-NEXT: addq %rdx, %rax
292 ; SSE-LINUX-NEXT: incq %rcx
293 ; SSE-LINUX-NEXT: cmpq $156250000, %rcx # imm = 0x9502F90
294 ; SSE-LINUX-NEXT: jne .LBB7_1
295 ; SSE-LINUX-NEXT: # %bb.2: # %ret
296 ; SSE-LINUX-NEXT: retq
298 ; SSE-WIN-LABEL: loopdep2:
299 ; SSE-WIN: # %bb.0: # %entry
300 ; SSE-WIN-NEXT: subq $184, %rsp
301 ; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
302 ; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
303 ; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
304 ; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
305 ; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
306 ; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
307 ; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
308 ; SSE-WIN-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
309 ; SSE-WIN-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
310 ; SSE-WIN-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
311 ; SSE-WIN-NEXT: movq (%rcx), %rax
312 ; SSE-WIN-NEXT: movl $1, %r8d
313 ; SSE-WIN-NEXT: .p2align 4, 0x90
314 ; SSE-WIN-NEXT: .LBB7_1: # %loop
315 ; SSE-WIN-NEXT: # =>This Inner Loop Header: Depth=1
316 ; SSE-WIN-NEXT: xorps %xmm0, %xmm0
317 ; SSE-WIN-NEXT: cvtsi2sd %r8, %xmm0
318 ; SSE-WIN-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
320 ; SSE-WIN-NEXT: #NO_APP
321 ; SSE-WIN-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
322 ; SSE-WIN-NEXT: # xmm0 = mem[0],zero
323 ; SSE-WIN-NEXT: addsd (%rdx), %xmm0
324 ; SSE-WIN-NEXT: cvttsd2si %xmm0, %rcx
325 ; SSE-WIN-NEXT: addq %rcx, %rax
326 ; SSE-WIN-NEXT: incq %r8
327 ; SSE-WIN-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90
328 ; SSE-WIN-NEXT: jne .LBB7_1
329 ; SSE-WIN-NEXT: # %bb.2: # %ret
330 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
331 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
332 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
333 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
334 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
335 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
336 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
337 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
338 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
339 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
340 ; SSE-WIN-NEXT: addq $184, %rsp
343 ; AVX-LABEL: loopdep2:
344 ; AVX: # %bb.0: # %entry
345 ; AVX-NEXT: subq $184, %rsp
346 ; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
347 ; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
348 ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
349 ; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
350 ; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
351 ; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
352 ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
353 ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
354 ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
355 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
356 ; AVX-NEXT: movq (%rcx), %rax
357 ; AVX-NEXT: movl $1, %r8d
358 ; AVX-NEXT: .p2align 4, 0x90
359 ; AVX-NEXT: .LBB7_1: # %loop
360 ; AVX-NEXT: # =>This Inner Loop Header: Depth=1
361 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
362 ; AVX-NEXT: vcvtsi2sd %r8, %xmm1, %xmm0
363 ; AVX-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
366 ; AVX-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
367 ; AVX-NEXT: # xmm0 = mem[0],zero
368 ; AVX-NEXT: vaddsd (%rdx), %xmm0, %xmm0
369 ; AVX-NEXT: vcvttsd2si %xmm0, %rcx
370 ; AVX-NEXT: addq %rcx, %rax
372 ; AVX-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90
373 ; AVX-NEXT: jne .LBB7_1
374 ; AVX-NEXT: # %bb.2: # %ret
375 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
376 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
377 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
378 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
379 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
380 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
381 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
382 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
383 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
384 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
385 ; AVX-NEXT: addq $184, %rsp
; The running sum %s1 starts from the i64 loaded through %x.
388 %vx = load i64, i64* %x
391 %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
392 %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
; i64 -> double conversion under test.
393 %fi = sitofp i64 %i to double
; Empty asm whose clobber list names xmm0-xmm31 (plus dirflag/fpsr/flags);
; %fi is live across it.
394 tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
395 %vy = load double, double* %y
396 %fipy = fadd double %fi, %vy
397 %iipy = fptosi double %fipy to i64
398 %s2 = add i64 %s1, %iipy
399 %inc = add nsw i64 %i, 1
; Leave the loop once the incremented counter reaches 156250000.
400 %exitcond = icmp eq i64 %inc, 156250000
401 br i1 %exitcond, label %ret, label %loop
406 ; This loop contains a cvtsi2sd instruction that has a loop-carried
407 ; false dependency on an xmm register that is modified by other scalar
408 ; instructions that follow it in the loop. Additionally, the source of the
409 ; convert is a memory operand. Verify that the break-false-dependency pass
410 ; breaks this dependency by inserting an xor before the convert.
; Arrays used by @loopdep3: @v holds the i32 inputs that get converted to
; double, @x, @y and @z hold the double factors, and @w receives the products.
411 @x = common dso_local global [1024 x double] zeroinitializer, align 16
412 @y = common dso_local global [1024 x double] zeroinitializer, align 16
413 @z = common dso_local global [1024 x double] zeroinitializer, align 16
414 @w = common dso_local global [1024 x double] zeroinitializer, align 16
415 @v = common dso_local global [1024 x i32] zeroinitializer, align 16
; Expected codegen for loopdep3, as encoded in the checks below: in the inner
; loop the convert reads its i32 source directly from memory
; (cvtsi2sdl/vcvtsi2sdl with a memory operand) and is immediately preceded by
; an xorps/vxorps that zeroes %xmm0, the register the convert writes.
; On the win32 triples the prologue/epilogue also save and restore %rsi and
; xmm6-xmm15.
417 define dso_local void @loopdep3() {
418 ; SSE-LINUX-LABEL: loopdep3:
419 ; SSE-LINUX: # %bb.0: # %entry
420 ; SSE-LINUX-NEXT: xorl %eax, %eax
421 ; SSE-LINUX-NEXT: .p2align 4, 0x90
422 ; SSE-LINUX-NEXT: .LBB8_1: # %for.cond1.preheader
423 ; SSE-LINUX-NEXT: # =>This Loop Header: Depth=1
424 ; SSE-LINUX-NEXT: # Child Loop BB8_2 Depth 2
425 ; SSE-LINUX-NEXT: movq $-4096, %rcx # imm = 0xF000
426 ; SSE-LINUX-NEXT: .p2align 4, 0x90
427 ; SSE-LINUX-NEXT: .LBB8_2: # %for.body3
428 ; SSE-LINUX-NEXT: # Parent Loop BB8_1 Depth=1
429 ; SSE-LINUX-NEXT: # => This Inner Loop Header: Depth=2
430 ; SSE-LINUX-NEXT: xorps %xmm0, %xmm0
431 ; SSE-LINUX-NEXT: cvtsi2sdl v+4096(%rcx), %xmm0
432 ; SSE-LINUX-NEXT: mulsd x+8192(%rcx,%rcx), %xmm0
433 ; SSE-LINUX-NEXT: mulsd y+8192(%rcx,%rcx), %xmm0
434 ; SSE-LINUX-NEXT: mulsd z+8192(%rcx,%rcx), %xmm0
435 ; SSE-LINUX-NEXT: movsd %xmm0, w+8192(%rcx,%rcx)
436 ; SSE-LINUX-NEXT: #APP
437 ; SSE-LINUX-NEXT: #NO_APP
438 ; SSE-LINUX-NEXT: addq $4, %rcx
439 ; SSE-LINUX-NEXT: jne .LBB8_2
440 ; SSE-LINUX-NEXT: # %bb.3: # %for.inc14
441 ; SSE-LINUX-NEXT: # in Loop: Header=BB8_1 Depth=1
442 ; SSE-LINUX-NEXT: incl %eax
443 ; SSE-LINUX-NEXT: cmpl $100000, %eax # imm = 0x186A0
444 ; SSE-LINUX-NEXT: jne .LBB8_1
445 ; SSE-LINUX-NEXT: # %bb.4: # %for.end16
446 ; SSE-LINUX-NEXT: retq
448 ; SSE-WIN-LABEL: loopdep3:
449 ; SSE-WIN: # %bb.0: # %entry
450 ; SSE-WIN-NEXT: pushq %rsi
451 ; SSE-WIN-NEXT: .seh_pushreg %rsi
452 ; SSE-WIN-NEXT: subq $160, %rsp
453 ; SSE-WIN-NEXT: .seh_stackalloc 160
454 ; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
455 ; SSE-WIN-NEXT: .seh_savexmm %xmm15, 144
456 ; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
457 ; SSE-WIN-NEXT: .seh_savexmm %xmm14, 128
458 ; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
459 ; SSE-WIN-NEXT: .seh_savexmm %xmm13, 112
460 ; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
461 ; SSE-WIN-NEXT: .seh_savexmm %xmm12, 96
462 ; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
463 ; SSE-WIN-NEXT: .seh_savexmm %xmm11, 80
464 ; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
465 ; SSE-WIN-NEXT: .seh_savexmm %xmm10, 64
466 ; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
467 ; SSE-WIN-NEXT: .seh_savexmm %xmm9, 48
468 ; SSE-WIN-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
469 ; SSE-WIN-NEXT: .seh_savexmm %xmm8, 32
470 ; SSE-WIN-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
471 ; SSE-WIN-NEXT: .seh_savexmm %xmm7, 16
472 ; SSE-WIN-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill
473 ; SSE-WIN-NEXT: .seh_savexmm %xmm6, 0
474 ; SSE-WIN-NEXT: .seh_endprologue
475 ; SSE-WIN-NEXT: xorl %r9d, %r9d
476 ; SSE-WIN-NEXT: leaq v(%rip), %r8
477 ; SSE-WIN-NEXT: leaq x(%rip), %r10
478 ; SSE-WIN-NEXT: leaq y(%rip), %r11
479 ; SSE-WIN-NEXT: leaq z(%rip), %rax
480 ; SSE-WIN-NEXT: leaq w(%rip), %rdx
481 ; SSE-WIN-NEXT: .p2align 4, 0x90
482 ; SSE-WIN-NEXT: .LBB8_1: # %for.cond1.preheader
483 ; SSE-WIN-NEXT: # =>This Loop Header: Depth=1
484 ; SSE-WIN-NEXT: # Child Loop BB8_2 Depth 2
485 ; SSE-WIN-NEXT: movq %r8, %rcx
486 ; SSE-WIN-NEXT: xorl %esi, %esi
487 ; SSE-WIN-NEXT: .p2align 4, 0x90
488 ; SSE-WIN-NEXT: .LBB8_2: # %for.body3
489 ; SSE-WIN-NEXT: # Parent Loop BB8_1 Depth=1
490 ; SSE-WIN-NEXT: # => This Inner Loop Header: Depth=2
491 ; SSE-WIN-NEXT: xorps %xmm0, %xmm0
492 ; SSE-WIN-NEXT: cvtsi2sdl (%rcx), %xmm0
493 ; SSE-WIN-NEXT: mulsd (%rsi,%r10), %xmm0
494 ; SSE-WIN-NEXT: mulsd (%rsi,%r11), %xmm0
495 ; SSE-WIN-NEXT: mulsd (%rsi,%rax), %xmm0
496 ; SSE-WIN-NEXT: movsd %xmm0, (%rsi,%rdx)
498 ; SSE-WIN-NEXT: #NO_APP
499 ; SSE-WIN-NEXT: addq $8, %rsi
500 ; SSE-WIN-NEXT: addq $4, %rcx
501 ; SSE-WIN-NEXT: cmpq $8192, %rsi # imm = 0x2000
502 ; SSE-WIN-NEXT: jne .LBB8_2
503 ; SSE-WIN-NEXT: # %bb.3: # %for.inc14
504 ; SSE-WIN-NEXT: # in Loop: Header=BB8_1 Depth=1
505 ; SSE-WIN-NEXT: incl %r9d
506 ; SSE-WIN-NEXT: cmpl $100000, %r9d # imm = 0x186A0
507 ; SSE-WIN-NEXT: jne .LBB8_1
508 ; SSE-WIN-NEXT: # %bb.4: # %for.end16
509 ; SSE-WIN-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload
510 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
511 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
512 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
513 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
514 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
515 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
516 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
517 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
518 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
519 ; SSE-WIN-NEXT: addq $160, %rsp
520 ; SSE-WIN-NEXT: popq %rsi
522 ; SSE-WIN-NEXT: .seh_endproc
524 ; AVX-LABEL: loopdep3:
525 ; AVX: # %bb.0: # %entry
526 ; AVX-NEXT: pushq %rsi
527 ; AVX-NEXT: .seh_pushreg %rsi
528 ; AVX-NEXT: subq $160, %rsp
529 ; AVX-NEXT: .seh_stackalloc 160
530 ; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
531 ; AVX-NEXT: .seh_savexmm %xmm15, 144
532 ; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
533 ; AVX-NEXT: .seh_savexmm %xmm14, 128
534 ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
535 ; AVX-NEXT: .seh_savexmm %xmm13, 112
536 ; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
537 ; AVX-NEXT: .seh_savexmm %xmm12, 96
538 ; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
539 ; AVX-NEXT: .seh_savexmm %xmm11, 80
540 ; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
541 ; AVX-NEXT: .seh_savexmm %xmm10, 64
542 ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
543 ; AVX-NEXT: .seh_savexmm %xmm9, 48
544 ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
545 ; AVX-NEXT: .seh_savexmm %xmm8, 32
546 ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
547 ; AVX-NEXT: .seh_savexmm %xmm7, 16
548 ; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
549 ; AVX-NEXT: .seh_savexmm %xmm6, 0
550 ; AVX-NEXT: .seh_endprologue
551 ; AVX-NEXT: xorl %r9d, %r9d
552 ; AVX-NEXT: leaq v(%rip), %r8
553 ; AVX-NEXT: leaq x(%rip), %r10
554 ; AVX-NEXT: leaq y(%rip), %r11
555 ; AVX-NEXT: leaq z(%rip), %rax
556 ; AVX-NEXT: leaq w(%rip), %rdx
557 ; AVX-NEXT: .p2align 4, 0x90
558 ; AVX-NEXT: .LBB8_1: # %for.cond1.preheader
559 ; AVX-NEXT: # =>This Loop Header: Depth=1
560 ; AVX-NEXT: # Child Loop BB8_2 Depth 2
561 ; AVX-NEXT: movq %r8, %rcx
562 ; AVX-NEXT: xorl %esi, %esi
563 ; AVX-NEXT: .p2align 4, 0x90
564 ; AVX-NEXT: .LBB8_2: # %for.body3
565 ; AVX-NEXT: # Parent Loop BB8_1 Depth=1
566 ; AVX-NEXT: # => This Inner Loop Header: Depth=2
567 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
568 ; AVX-NEXT: vcvtsi2sdl (%rcx), %xmm0, %xmm0
569 ; AVX-NEXT: vmulsd (%rsi,%r10), %xmm0, %xmm0
570 ; AVX-NEXT: vmulsd (%rsi,%r11), %xmm0, %xmm0
571 ; AVX-NEXT: vmulsd (%rsi,%rax), %xmm0, %xmm0
572 ; AVX-NEXT: vmovsd %xmm0, (%rsi,%rdx)
575 ; AVX-NEXT: addq $8, %rsi
576 ; AVX-NEXT: addq $4, %rcx
577 ; AVX-NEXT: cmpq $8192, %rsi # imm = 0x2000
578 ; AVX-NEXT: jne .LBB8_2
579 ; AVX-NEXT: # %bb.3: # %for.inc14
580 ; AVX-NEXT: # in Loop: Header=BB8_1 Depth=1
581 ; AVX-NEXT: incl %r9d
582 ; AVX-NEXT: cmpl $100000, %r9d # imm = 0x186A0
583 ; AVX-NEXT: jne .LBB8_1
584 ; AVX-NEXT: # %bb.4: # %for.end16
585 ; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload
586 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
587 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
588 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
589 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
590 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
591 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
592 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
593 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
594 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
595 ; AVX-NEXT: addq $160, %rsp
596 ; AVX-NEXT: popq %rsi
598 ; AVX-NEXT: .seh_endproc
600 br label %for.cond1.preheader
602 for.cond1.preheader: ; preds = %for.inc14, %entry
; Outer loop counter: starts at 0 and the loop exits once the increment
; reaches 100000.
603 %i.025 = phi i32 [ 0, %entry ], [ %inc15, %for.inc14 ]
; Inner loop over %indvars.iv = 0..1023:
;   w[i] = (double)v[i] * x[i] * y[i] * z[i]
607 %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
608 %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @v, i64 0, i64 %indvars.iv
609 %0 = load i32, i32* %arrayidx, align 4
; i32 -> double conversion under test; its operand comes from the load above.
610 %conv = sitofp i32 %0 to double
611 %arrayidx5 = getelementptr inbounds [1024 x double], [1024 x double]* @x, i64 0, i64 %indvars.iv
612 %1 = load double, double* %arrayidx5, align 8
613 %mul = fmul double %conv, %1
614 %arrayidx7 = getelementptr inbounds [1024 x double], [1024 x double]* @y, i64 0, i64 %indvars.iv
615 %2 = load double, double* %arrayidx7, align 8
616 %mul8 = fmul double %mul, %2
617 %arrayidx10 = getelementptr inbounds [1024 x double], [1024 x double]* @z, i64 0, i64 %indvars.iv
618 %3 = load double, double* %arrayidx10, align 8
619 %mul11 = fmul double %mul8, %3
620 %arrayidx13 = getelementptr inbounds [1024 x double], [1024 x double]* @w, i64 0, i64 %indvars.iv
621 store double %mul11, double* %arrayidx13, align 8
622 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
623 %exitcond = icmp eq i64 %indvars.iv.next, 1024
; Empty asm whose clobber list names xmm0-xmm31 (plus dirflag/fpsr/flags),
; executed once per inner-loop iteration.
624 tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
625 br i1 %exitcond, label %for.inc14, label %for.body3
627 for.inc14: ; preds = %for.body3
628 %inc15 = add nsw i32 %i.025, 1
629 %exitcond26 = icmp eq i32 %inc15, 100000
630 br i1 %exitcond26, label %for.end16, label %for.cond1.preheader
632 for.end16: ; preds = %for.inc14
; inlineasmdep converts %arg to double after a series of empty inline-asm
; statements whose clobber lists together name every xmm register.
; The checks expect %xmm0 to be zeroed with xorps/vxorps immediately before
; cvtsi2sd/vcvtsi2sd writes it. On the win32 triples the prologue/epilogue
; also save and restore xmm6-xmm15.
637 define dso_local double @inlineasmdep(i64 %arg) {
638 ; SSE-LINUX-LABEL: inlineasmdep:
639 ; SSE-LINUX: # %bb.0: # %top
640 ; SSE-LINUX-NEXT: #APP
641 ; SSE-LINUX-NEXT: #NO_APP
642 ; SSE-LINUX-NEXT: #APP
643 ; SSE-LINUX-NEXT: #NO_APP
644 ; SSE-LINUX-NEXT: #APP
645 ; SSE-LINUX-NEXT: #NO_APP
646 ; SSE-LINUX-NEXT: #APP
647 ; SSE-LINUX-NEXT: #NO_APP
648 ; SSE-LINUX-NEXT: #APP
649 ; SSE-LINUX-NEXT: #NO_APP
650 ; SSE-LINUX-NEXT: #APP
651 ; SSE-LINUX-NEXT: #NO_APP
652 ; SSE-LINUX-NEXT: #APP
653 ; SSE-LINUX-NEXT: #NO_APP
654 ; SSE-LINUX-NEXT: #APP
655 ; SSE-LINUX-NEXT: #NO_APP
656 ; SSE-LINUX-NEXT: xorps %xmm0, %xmm0
657 ; SSE-LINUX-NEXT: cvtsi2sd %rdi, %xmm0
658 ; SSE-LINUX-NEXT: retq
660 ; SSE-WIN-LABEL: inlineasmdep:
661 ; SSE-WIN: # %bb.0: # %top
662 ; SSE-WIN-NEXT: subq $168, %rsp
663 ; SSE-WIN-NEXT: .seh_stackalloc 168
664 ; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
665 ; SSE-WIN-NEXT: .seh_savexmm %xmm15, 144
666 ; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
667 ; SSE-WIN-NEXT: .seh_savexmm %xmm14, 128
668 ; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
669 ; SSE-WIN-NEXT: .seh_savexmm %xmm13, 112
670 ; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
671 ; SSE-WIN-NEXT: .seh_savexmm %xmm12, 96
672 ; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
673 ; SSE-WIN-NEXT: .seh_savexmm %xmm11, 80
674 ; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
675 ; SSE-WIN-NEXT: .seh_savexmm %xmm10, 64
676 ; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
677 ; SSE-WIN-NEXT: .seh_savexmm %xmm9, 48
678 ; SSE-WIN-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
679 ; SSE-WIN-NEXT: .seh_savexmm %xmm8, 32
680 ; SSE-WIN-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
681 ; SSE-WIN-NEXT: .seh_savexmm %xmm7, 16
682 ; SSE-WIN-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill
683 ; SSE-WIN-NEXT: .seh_savexmm %xmm6, 0
684 ; SSE-WIN-NEXT: .seh_endprologue
686 ; SSE-WIN-NEXT: #NO_APP
688 ; SSE-WIN-NEXT: #NO_APP
690 ; SSE-WIN-NEXT: #NO_APP
692 ; SSE-WIN-NEXT: #NO_APP
694 ; SSE-WIN-NEXT: #NO_APP
696 ; SSE-WIN-NEXT: #NO_APP
698 ; SSE-WIN-NEXT: #NO_APP
700 ; SSE-WIN-NEXT: #NO_APP
701 ; SSE-WIN-NEXT: xorps %xmm0, %xmm0
702 ; SSE-WIN-NEXT: cvtsi2sd %rcx, %xmm0
703 ; SSE-WIN-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload
704 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
705 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
706 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
707 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
708 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
709 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
710 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
711 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
712 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
713 ; SSE-WIN-NEXT: addq $168, %rsp
715 ; SSE-WIN-NEXT: .seh_endproc
717 ; AVX-LABEL: inlineasmdep:
718 ; AVX: # %bb.0: # %top
719 ; AVX-NEXT: subq $168, %rsp
720 ; AVX-NEXT: .seh_stackalloc 168
721 ; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
722 ; AVX-NEXT: .seh_savexmm %xmm15, 144
723 ; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
724 ; AVX-NEXT: .seh_savexmm %xmm14, 128
725 ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
726 ; AVX-NEXT: .seh_savexmm %xmm13, 112
727 ; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
728 ; AVX-NEXT: .seh_savexmm %xmm12, 96
729 ; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
730 ; AVX-NEXT: .seh_savexmm %xmm11, 80
731 ; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
732 ; AVX-NEXT: .seh_savexmm %xmm10, 64
733 ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
734 ; AVX-NEXT: .seh_savexmm %xmm9, 48
735 ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
736 ; AVX-NEXT: .seh_savexmm %xmm8, 32
737 ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
738 ; AVX-NEXT: .seh_savexmm %xmm7, 16
739 ; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
740 ; AVX-NEXT: .seh_savexmm %xmm6, 0
741 ; AVX-NEXT: .seh_endprologue
758 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
759 ; AVX-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0
760 ; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload
761 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
762 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
763 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
764 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
765 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
766 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
767 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
768 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
769 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
770 ; AVX-NEXT: addq $168, %rsp
772 ; AVX-NEXT: .seh_endproc
; Eight empty asm statements; each clobbers four xmm registers, and together
; their clobber lists cover xmm0-xmm31.
774 tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
775 tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"()
776 tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
777 tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
778 tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"()
779 tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"()
780 tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"()
781 tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
; i64 -> double conversion under test.
782 %tmp1 = sitofp i64 %arg to double
786 ; Make sure we are making a smart choice regarding undef registers and
787 ; hiding the false dependency behind a true dependency
; @truedeps converts its float argument to double (fpext) after inline asm has
; clobbered every xmm register. In each expected sequence below the argument
; is spilled before the asm, reloaded into xmm0, and converted with xmm0 as
; both source and destination; no zeroing xor is expected ahead of the convert.
788 define dso_local double @truedeps(float %arg) {
789 ; SSE-LINUX-LABEL: truedeps:
790 ; SSE-LINUX: # %bb.0: # %top
791 ; SSE-LINUX-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
792 ; SSE-LINUX-NEXT: #APP
793 ; SSE-LINUX-NEXT: #NO_APP
794 ; SSE-LINUX-NEXT: #APP
795 ; SSE-LINUX-NEXT: #NO_APP
796 ; SSE-LINUX-NEXT: #APP
797 ; SSE-LINUX-NEXT: #NO_APP
798 ; SSE-LINUX-NEXT: #APP
799 ; SSE-LINUX-NEXT: #NO_APP
800 ; SSE-LINUX-NEXT: #APP
801 ; SSE-LINUX-NEXT: #NO_APP
802 ; SSE-LINUX-NEXT: #APP
803 ; SSE-LINUX-NEXT: #NO_APP
804 ; SSE-LINUX-NEXT: #APP
805 ; SSE-LINUX-NEXT: #NO_APP
806 ; SSE-LINUX-NEXT: #APP
807 ; SSE-LINUX-NEXT: #NO_APP
808 ; SSE-LINUX-NEXT: #APP
809 ; SSE-LINUX-NEXT: #NO_APP
810 ; SSE-LINUX-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
811 ; SSE-LINUX-NEXT: # xmm0 = mem[0],zero,zero,zero
812 ; SSE-LINUX-NEXT: cvtss2sd %xmm0, %xmm0
813 ; SSE-LINUX-NEXT: retq
; The two expected sequences that follow are for the win32 triple (see the RUN
; lines at the top of the file): xmm6-xmm15 are saved in the prologue, each
; save annotated with .seh_savexmm, and reloaded before the stack is released.
815 ; SSE-WIN-LABEL: truedeps:
816 ; SSE-WIN: # %bb.0: # %top
817 ; SSE-WIN-NEXT: subq $184, %rsp
818 ; SSE-WIN-NEXT: .seh_stackalloc 184
819 ; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
820 ; SSE-WIN-NEXT: .seh_savexmm %xmm15, 160
821 ; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
822 ; SSE-WIN-NEXT: .seh_savexmm %xmm14, 144
823 ; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
824 ; SSE-WIN-NEXT: .seh_savexmm %xmm13, 128
825 ; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
826 ; SSE-WIN-NEXT: .seh_savexmm %xmm12, 112
827 ; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
828 ; SSE-WIN-NEXT: .seh_savexmm %xmm11, 96
829 ; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
830 ; SSE-WIN-NEXT: .seh_savexmm %xmm10, 80
831 ; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
832 ; SSE-WIN-NEXT: .seh_savexmm %xmm9, 64
833 ; SSE-WIN-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
834 ; SSE-WIN-NEXT: .seh_savexmm %xmm8, 48
835 ; SSE-WIN-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
836 ; SSE-WIN-NEXT: .seh_savexmm %xmm7, 32
837 ; SSE-WIN-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
838 ; SSE-WIN-NEXT: .seh_savexmm %xmm6, 16
839 ; SSE-WIN-NEXT: .seh_endprologue
840 ; SSE-WIN-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
842 ; SSE-WIN-NEXT: #NO_APP
844 ; SSE-WIN-NEXT: #NO_APP
846 ; SSE-WIN-NEXT: #NO_APP
848 ; SSE-WIN-NEXT: #NO_APP
850 ; SSE-WIN-NEXT: #NO_APP
852 ; SSE-WIN-NEXT: #NO_APP
854 ; SSE-WIN-NEXT: #NO_APP
856 ; SSE-WIN-NEXT: #NO_APP
858 ; SSE-WIN-NEXT: #NO_APP
859 ; SSE-WIN-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
860 ; SSE-WIN-NEXT: # xmm0 = mem[0],zero,zero,zero
861 ; SSE-WIN-NEXT: cvtss2sd %xmm0, %xmm0
862 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
863 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
864 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
865 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
866 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
867 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
868 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
869 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
870 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
871 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
872 ; SSE-WIN-NEXT: addq $184, %rsp
874 ; SSE-WIN-NEXT: .seh_endproc
876 ; AVX-LABEL: truedeps:
877 ; AVX: # %bb.0: # %top
878 ; AVX-NEXT: subq $184, %rsp
879 ; AVX-NEXT: .seh_stackalloc 184
880 ; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
881 ; AVX-NEXT: .seh_savexmm %xmm15, 160
882 ; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
883 ; AVX-NEXT: .seh_savexmm %xmm14, 144
884 ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
885 ; AVX-NEXT: .seh_savexmm %xmm13, 128
886 ; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
887 ; AVX-NEXT: .seh_savexmm %xmm12, 112
888 ; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
889 ; AVX-NEXT: .seh_savexmm %xmm11, 96
890 ; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
891 ; AVX-NEXT: .seh_savexmm %xmm10, 80
892 ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
893 ; AVX-NEXT: .seh_savexmm %xmm9, 64
894 ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
895 ; AVX-NEXT: .seh_savexmm %xmm8, 48
896 ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
897 ; AVX-NEXT: .seh_savexmm %xmm7, 32
898 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
899 ; AVX-NEXT: .seh_savexmm %xmm6, 16
900 ; AVX-NEXT: .seh_endprologue
901 ; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
920 ; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
921 ; AVX-NEXT: # xmm0 = mem[0],zero,zero,zero
922 ; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
923 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
924 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
925 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
926 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
927 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
928 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
929 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
930 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
931 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
932 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
933 ; AVX-NEXT: addq $184, %rsp
935 ; AVX-NEXT: .seh_endproc
; The nine asm statements below have an empty asm string; only their clobber
; lists matter. Together the lists cover xmm0 through xmm31, starting with
; xmm6 on its own.
937 tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"()
938 tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
939 tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm7},~{dirflag},~{fpsr},~{flags}"()
940 tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
941 tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
942 tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"()
943 tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"()
944 tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"()
945 tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
; The conversion under test: its input is the (reloaded) argument itself.
946 %tmp1 = fpext float %arg to double
950 ; Make sure we are making a smart choice regarding undef registers and
951 ; choosing the register with the highest clearance
; @clearence converts its i64 argument to double (sitofp) after inline asm has
; clobbered every xmm register. The SSE sequences expect xmm0 to be zeroed with
; xorps and then used as the cvtsi2sd destination. The AVX sequence expects
; xmm6 to be zeroed and used as the pass-through source operand of vcvtsi2sd,
; with the result written to xmm0.
952 define dso_local double @clearence(i64 %arg) {
953 ; SSE-LINUX-LABEL: clearence:
954 ; SSE-LINUX: # %bb.0: # %top
955 ; SSE-LINUX-NEXT: #APP
956 ; SSE-LINUX-NEXT: #NO_APP
957 ; SSE-LINUX-NEXT: #APP
958 ; SSE-LINUX-NEXT: #NO_APP
959 ; SSE-LINUX-NEXT: #APP
960 ; SSE-LINUX-NEXT: #NO_APP
961 ; SSE-LINUX-NEXT: #APP
962 ; SSE-LINUX-NEXT: #NO_APP
963 ; SSE-LINUX-NEXT: #APP
964 ; SSE-LINUX-NEXT: #NO_APP
965 ; SSE-LINUX-NEXT: #APP
966 ; SSE-LINUX-NEXT: #NO_APP
967 ; SSE-LINUX-NEXT: #APP
968 ; SSE-LINUX-NEXT: #NO_APP
969 ; SSE-LINUX-NEXT: #APP
970 ; SSE-LINUX-NEXT: #NO_APP
971 ; SSE-LINUX-NEXT: #APP
972 ; SSE-LINUX-NEXT: #NO_APP
973 ; SSE-LINUX-NEXT: xorps %xmm0, %xmm0
974 ; SSE-LINUX-NEXT: cvtsi2sd %rdi, %xmm0
975 ; SSE-LINUX-NEXT: retq
; The two expected sequences that follow are for the win32 triple (see the RUN
; lines at the top of the file): xmm6-xmm15 are saved in the prologue, each
; save annotated with .seh_savexmm, and reloaded before the stack is released.
977 ; SSE-WIN-LABEL: clearence:
978 ; SSE-WIN: # %bb.0: # %top
979 ; SSE-WIN-NEXT: subq $168, %rsp
980 ; SSE-WIN-NEXT: .seh_stackalloc 168
981 ; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
982 ; SSE-WIN-NEXT: .seh_savexmm %xmm15, 144
983 ; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
984 ; SSE-WIN-NEXT: .seh_savexmm %xmm14, 128
985 ; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
986 ; SSE-WIN-NEXT: .seh_savexmm %xmm13, 112
987 ; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
988 ; SSE-WIN-NEXT: .seh_savexmm %xmm12, 96
989 ; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
990 ; SSE-WIN-NEXT: .seh_savexmm %xmm11, 80
991 ; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
992 ; SSE-WIN-NEXT: .seh_savexmm %xmm10, 64
993 ; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
994 ; SSE-WIN-NEXT: .seh_savexmm %xmm9, 48
995 ; SSE-WIN-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
996 ; SSE-WIN-NEXT: .seh_savexmm %xmm8, 32
997 ; SSE-WIN-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
998 ; SSE-WIN-NEXT: .seh_savexmm %xmm7, 16
999 ; SSE-WIN-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill
1000 ; SSE-WIN-NEXT: .seh_savexmm %xmm6, 0
1001 ; SSE-WIN-NEXT: .seh_endprologue
1002 ; SSE-WIN-NEXT: #APP
1003 ; SSE-WIN-NEXT: #NO_APP
1004 ; SSE-WIN-NEXT: #APP
1005 ; SSE-WIN-NEXT: #NO_APP
1006 ; SSE-WIN-NEXT: #APP
1007 ; SSE-WIN-NEXT: #NO_APP
1008 ; SSE-WIN-NEXT: #APP
1009 ; SSE-WIN-NEXT: #NO_APP
1010 ; SSE-WIN-NEXT: #APP
1011 ; SSE-WIN-NEXT: #NO_APP
1012 ; SSE-WIN-NEXT: #APP
1013 ; SSE-WIN-NEXT: #NO_APP
1014 ; SSE-WIN-NEXT: #APP
1015 ; SSE-WIN-NEXT: #NO_APP
1016 ; SSE-WIN-NEXT: #APP
1017 ; SSE-WIN-NEXT: #NO_APP
1018 ; SSE-WIN-NEXT: #APP
1019 ; SSE-WIN-NEXT: #NO_APP
1020 ; SSE-WIN-NEXT: xorps %xmm0, %xmm0
1021 ; SSE-WIN-NEXT: cvtsi2sd %rcx, %xmm0
1022 ; SSE-WIN-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload
1023 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1024 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1025 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1026 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1027 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
1028 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
1029 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
1030 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1031 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1032 ; SSE-WIN-NEXT: addq $168, %rsp
1033 ; SSE-WIN-NEXT: retq
1034 ; SSE-WIN-NEXT: .seh_endproc
1036 ; AVX-LABEL: clearence:
1037 ; AVX: # %bb.0: # %top
1038 ; AVX-NEXT: subq $168, %rsp
1039 ; AVX-NEXT: .seh_stackalloc 168
1040 ; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1041 ; AVX-NEXT: .seh_savexmm %xmm15, 144
1042 ; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1043 ; AVX-NEXT: .seh_savexmm %xmm14, 128
1044 ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1045 ; AVX-NEXT: .seh_savexmm %xmm13, 112
1046 ; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1047 ; AVX-NEXT: .seh_savexmm %xmm12, 96
1048 ; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1049 ; AVX-NEXT: .seh_savexmm %xmm11, 80
1050 ; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1051 ; AVX-NEXT: .seh_savexmm %xmm10, 64
1052 ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1053 ; AVX-NEXT: .seh_savexmm %xmm9, 48
1054 ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1055 ; AVX-NEXT: .seh_savexmm %xmm8, 32
1056 ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1057 ; AVX-NEXT: .seh_savexmm %xmm7, 16
1058 ; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
1059 ; AVX-NEXT: .seh_savexmm %xmm6, 0
1060 ; AVX-NEXT: .seh_endprologue
1079 ; AVX-NEXT: vxorps %xmm6, %xmm6, %xmm6
1080 ; AVX-NEXT: vcvtsi2sd %rcx, %xmm6, %xmm0
1081 ; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload
1082 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1083 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1084 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1085 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1086 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
1087 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
1088 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
1089 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1090 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1091 ; AVX-NEXT: addq $168, %rsp
1093 ; AVX-NEXT: .seh_endproc
; The nine asm statements below have an empty asm string; only their clobber
; lists matter. xmm6 is clobbered alone by the first statement; the remaining
; statements cover the rest of xmm0-xmm31.
; NOTE(review): xmm6 being the earliest-clobbered register is presumably why
; the AVX sequence picks it as the undef operand - confirm against the pass
; that performs the selection.
1095 tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"()
1096 tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
1097 tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm7},~{dirflag},~{fpsr},~{flags}"()
1098 tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
1099 tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
1100 tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"()
1101 tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"()
1102 tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"()
1103 tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
; The conversion under test: an integer source, so the xmm input operand of
; the convert instruction carries no real value.
1104 %tmp1 = sitofp i64 %arg to double
1108 ; Make sure we are making a smart choice regarding undef registers in order to
1109 ; avoid a cyclic dependence on a write to the same register in a previous
1110 ; iteration, especially when we cannot zero out the undef register because it
; @loopclearence runs a counted loop: %i starts at 1 and the loop exits once
; %i + 1 equals 156250000. Each iteration converts %i to double, adds the
; double loaded from %y, converts the sum back to i64 and accumulates it into
; a running total seeded with the i64 loaded from %x. Inline asm inside the
; loop clobbers most xmm registers between the sitofp and the fadd.
; The SSE sequences expect an xorps of xmm4 ahead of every cvtsi2sd; the AVX
; sequence expects vcvtsi2sd to read xmm5 and write xmm4 with no zeroing.
1112 define i64 @loopclearence(i64* nocapture %x, double* nocapture %y) nounwind {
1113 ; SSE-LINUX-LABEL: loopclearence:
1114 ; SSE-LINUX: # %bb.0: # %entry
1115 ; SSE-LINUX-NEXT: movq (%rdi), %rax
1116 ; SSE-LINUX-NEXT: movl $1, %ecx
1117 ; SSE-LINUX-NEXT: .p2align 4, 0x90
1118 ; SSE-LINUX-NEXT: .LBB12_1: # %loop
1119 ; SSE-LINUX-NEXT: # =>This Inner Loop Header: Depth=1
1120 ; SSE-LINUX-NEXT: xorps %xmm4, %xmm4
1121 ; SSE-LINUX-NEXT: cvtsi2sd %rcx, %xmm4
1122 ; SSE-LINUX-NEXT: #APP
1123 ; SSE-LINUX-NEXT: #NO_APP
1124 ; SSE-LINUX-NEXT: #APP
1125 ; SSE-LINUX-NEXT: #NO_APP
1126 ; SSE-LINUX-NEXT: #APP
1127 ; SSE-LINUX-NEXT: #NO_APP
1128 ; SSE-LINUX-NEXT: #APP
1129 ; SSE-LINUX-NEXT: #NO_APP
1130 ; SSE-LINUX-NEXT: #APP
1131 ; SSE-LINUX-NEXT: #NO_APP
1132 ; SSE-LINUX-NEXT: #APP
1133 ; SSE-LINUX-NEXT: #NO_APP
1134 ; SSE-LINUX-NEXT: #APP
1135 ; SSE-LINUX-NEXT: #NO_APP
1136 ; SSE-LINUX-NEXT: addsd (%rsi), %xmm4
1137 ; SSE-LINUX-NEXT: cvttsd2si %xmm4, %rdx
1138 ; SSE-LINUX-NEXT: addq %rdx, %rax
1139 ; SSE-LINUX-NEXT: incq %rcx
1140 ; SSE-LINUX-NEXT: cmpq $156250000, %rcx # imm = 0x9502F90
1141 ; SSE-LINUX-NEXT: jne .LBB12_1
1142 ; SSE-LINUX-NEXT: # %bb.2: # %ret
1143 ; SSE-LINUX-NEXT: retq
; The two expected sequences that follow are for the win32 triple (see the RUN
; lines at the top of the file): xmm8-xmm15 are spilled on entry and reloaded
; after the loop. No .seh directives are expected for this function.
1145 ; SSE-WIN-LABEL: loopclearence:
1146 ; SSE-WIN: # %bb.0: # %entry
1147 ; SSE-WIN-NEXT: subq $136, %rsp
1148 ; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1149 ; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1150 ; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1151 ; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1152 ; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1153 ; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1154 ; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1155 ; SSE-WIN-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill
1156 ; SSE-WIN-NEXT: movq (%rcx), %rax
1157 ; SSE-WIN-NEXT: movl $1, %r8d
1158 ; SSE-WIN-NEXT: .p2align 4, 0x90
1159 ; SSE-WIN-NEXT: .LBB12_1: # %loop
1160 ; SSE-WIN-NEXT: # =>This Inner Loop Header: Depth=1
1161 ; SSE-WIN-NEXT: xorps %xmm4, %xmm4
1162 ; SSE-WIN-NEXT: cvtsi2sd %r8, %xmm4
1163 ; SSE-WIN-NEXT: #APP
1164 ; SSE-WIN-NEXT: #NO_APP
1165 ; SSE-WIN-NEXT: #APP
1166 ; SSE-WIN-NEXT: #NO_APP
1167 ; SSE-WIN-NEXT: #APP
1168 ; SSE-WIN-NEXT: #NO_APP
1169 ; SSE-WIN-NEXT: #APP
1170 ; SSE-WIN-NEXT: #NO_APP
1171 ; SSE-WIN-NEXT: #APP
1172 ; SSE-WIN-NEXT: #NO_APP
1173 ; SSE-WIN-NEXT: #APP
1174 ; SSE-WIN-NEXT: #NO_APP
1175 ; SSE-WIN-NEXT: #APP
1176 ; SSE-WIN-NEXT: #NO_APP
1177 ; SSE-WIN-NEXT: addsd (%rdx), %xmm4
1178 ; SSE-WIN-NEXT: cvttsd2si %xmm4, %rcx
1179 ; SSE-WIN-NEXT: addq %rcx, %rax
1180 ; SSE-WIN-NEXT: incq %r8
1181 ; SSE-WIN-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90
1182 ; SSE-WIN-NEXT: jne .LBB12_1
1183 ; SSE-WIN-NEXT: # %bb.2: # %ret
1184 ; SSE-WIN-NEXT: movaps (%rsp), %xmm8 # 16-byte Reload
1185 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1186 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1187 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
1188 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
1189 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
1190 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1191 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1192 ; SSE-WIN-NEXT: addq $136, %rsp
1193 ; SSE-WIN-NEXT: retq
1195 ; AVX-LABEL: loopclearence:
1196 ; AVX: # %bb.0: # %entry
1197 ; AVX-NEXT: subq $136, %rsp
1198 ; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1199 ; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1200 ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1201 ; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1202 ; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1203 ; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1204 ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1205 ; AVX-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill
1206 ; AVX-NEXT: movq (%rcx), %rax
1207 ; AVX-NEXT: movl $1, %r8d
1208 ; AVX-NEXT: .p2align 4, 0x90
1209 ; AVX-NEXT: .LBB12_1: # %loop
1210 ; AVX-NEXT: # =>This Inner Loop Header: Depth=1
1211 ; AVX-NEXT: vcvtsi2sd %r8, %xmm5, %xmm4
1226 ; AVX-NEXT: vaddsd (%rdx), %xmm4, %xmm0
1227 ; AVX-NEXT: vcvttsd2si %xmm0, %rcx
1228 ; AVX-NEXT: addq %rcx, %rax
1229 ; AVX-NEXT: incq %r8
1230 ; AVX-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90
1231 ; AVX-NEXT: jne .LBB12_1
1232 ; AVX-NEXT: # %bb.2: # %ret
1233 ; AVX-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
1234 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1235 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1236 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
1237 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
1238 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
1239 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1240 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1241 ; AVX-NEXT: addq $136, %rsp
; Initial value of the running total.
1244 %vx = load i64, i64* %x
; Loop-carried values: the induction variable and the running total.
1247 %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
1248 %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
; The conversion under test; it is repeated on every iteration.
1249 %fi = sitofp i64 %i to double
; The seven asm statements below have an empty asm string; only their clobber
; lists matter. They cover xmm0-xmm3 and xmm8-xmm31, leaving xmm4-xmm7
; unclobbered.
1250 tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
1251 tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
1252 tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
1253 tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"()
1254 tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"()
1255 tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"()
1256 tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
; Consume the converted value after the clobbers, then fold it into the total.
1257 %vy = load double, double* %y
1258 %fipy = fadd double %fi, %vy
1259 %iipy = fptosi double %fipy to i64
1260 %s2 = add i64 %s1, %iipy
; Advance the induction variable and leave the loop once it reaches the bound.
1261 %inc = add nsw i64 %i, 1
1262 %exitcond = icmp eq i64 %inc, 156250000
1263 br i1 %exitcond, label %ret, label %loop
1268 ; Make sure we are making a smart choice regarding undef registers even for more
1269 ; complicated loop structures. This example is the inner loop from
1270 ; julia> a = falses(10000); a[1:4:end] = true
1271 ; julia> linspace(1.0,2.0,10000)[a]
1272 define dso_local void @loopclearance2(double* nocapture %y, i64* %x, double %c1, double %c2, double %c3, double %c4, i64 %size) {
1273 ; SSE-LINUX-LABEL: loopclearance2:
1274 ; SSE-LINUX: # %bb.0: # %entry
1275 ; SSE-LINUX-NEXT: #APP
1276 ; SSE-LINUX-NEXT: #NO_APP
1277 ; SSE-LINUX-NEXT: #APP
1278 ; SSE-LINUX-NEXT: #NO_APP
1279 ; SSE-LINUX-NEXT: #APP
1280 ; SSE-LINUX-NEXT: #NO_APP
1281 ; SSE-LINUX-NEXT: #APP
1282 ; SSE-LINUX-NEXT: #NO_APP
1283 ; SSE-LINUX-NEXT: #APP
1284 ; SSE-LINUX-NEXT: #NO_APP
1285 ; SSE-LINUX-NEXT: #APP
1286 ; SSE-LINUX-NEXT: #NO_APP
1287 ; SSE-LINUX-NEXT: #APP
1288 ; SSE-LINUX-NEXT: #NO_APP
1289 ; SSE-LINUX-NEXT: movl $1, %r8d
1290 ; SSE-LINUX-NEXT: xorl %ecx, %ecx
1291 ; SSE-LINUX-NEXT: .p2align 4, 0x90
1292 ; SSE-LINUX-NEXT: .LBB13_1: # %inner_loop
1293 ; SSE-LINUX-NEXT: # =>This Inner Loop Header: Depth=1
1294 ; SSE-LINUX-NEXT: movq %rcx, %rax
1295 ; SSE-LINUX-NEXT: shrq $6, %rcx
1296 ; SSE-LINUX-NEXT: movq (%rsi,%rcx,8), %rcx
1297 ; SSE-LINUX-NEXT: btq %rax, %rcx
1298 ; SSE-LINUX-NEXT: leaq 1(%rax), %rcx
1299 ; SSE-LINUX-NEXT: jae .LBB13_1
1300 ; SSE-LINUX-NEXT: # %bb.2: # %loop_end
1301 ; SSE-LINUX-NEXT: # in Loop: Header=BB13_1 Depth=1
1302 ; SSE-LINUX-NEXT: leaq 1(%r8), %r9
1303 ; SSE-LINUX-NEXT: xorps %xmm4, %xmm4
1304 ; SSE-LINUX-NEXT: cvtsi2sd %r9, %xmm4
1305 ; SSE-LINUX-NEXT: movapd %xmm0, %xmm5
1306 ; SSE-LINUX-NEXT: subsd %xmm4, %xmm5
1307 ; SSE-LINUX-NEXT: mulsd %xmm1, %xmm5
1308 ; SSE-LINUX-NEXT: leaq -1(%rcx), %rax
1309 ; SSE-LINUX-NEXT: xorps %xmm4, %xmm4
1310 ; SSE-LINUX-NEXT: cvtsi2sd %rax, %xmm4
1311 ; SSE-LINUX-NEXT: mulsd %xmm2, %xmm4
1312 ; SSE-LINUX-NEXT: addsd %xmm5, %xmm4
1313 ; SSE-LINUX-NEXT: divsd %xmm3, %xmm4
1314 ; SSE-LINUX-NEXT: movsd %xmm4, -8(%rdi,%r8,8)
1315 ; SSE-LINUX-NEXT: movq %r9, %r8
1316 ; SSE-LINUX-NEXT: cmpq %r9, %rdx
1317 ; SSE-LINUX-NEXT: jge .LBB13_1
1318 ; SSE-LINUX-NEXT: # %bb.3: # %loopdone
1319 ; SSE-LINUX-NEXT: retq
1321 ; SSE-WIN-LABEL: loopclearance2:
1322 ; SSE-WIN: # %bb.0: # %entry
1323 ; SSE-WIN-NEXT: subq $152, %rsp
1324 ; SSE-WIN-NEXT: .seh_stackalloc 152
1325 ; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1326 ; SSE-WIN-NEXT: .seh_savexmm %xmm15, 128
1327 ; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1328 ; SSE-WIN-NEXT: .seh_savexmm %xmm14, 112
1329 ; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1330 ; SSE-WIN-NEXT: .seh_savexmm %xmm13, 96
1331 ; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1332 ; SSE-WIN-NEXT: .seh_savexmm %xmm12, 80
1333 ; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1334 ; SSE-WIN-NEXT: .seh_savexmm %xmm11, 64
1335 ; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1336 ; SSE-WIN-NEXT: .seh_savexmm %xmm10, 48
1337 ; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1338 ; SSE-WIN-NEXT: .seh_savexmm %xmm9, 32
1339 ; SSE-WIN-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1340 ; SSE-WIN-NEXT: .seh_savexmm %xmm8, 16
1341 ; SSE-WIN-NEXT: movaps %xmm7, (%rsp) # 16-byte Spill
1342 ; SSE-WIN-NEXT: .seh_savexmm %xmm7, 0
1343 ; SSE-WIN-NEXT: .seh_endprologue
1344 ; SSE-WIN-NEXT: movq {{[0-9]+}}(%rsp), %r8
1345 ; SSE-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1346 ; SSE-WIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
1347 ; SSE-WIN-NEXT: #APP
1348 ; SSE-WIN-NEXT: #NO_APP
1349 ; SSE-WIN-NEXT: #APP
1350 ; SSE-WIN-NEXT: #NO_APP
1351 ; SSE-WIN-NEXT: #APP
1352 ; SSE-WIN-NEXT: #NO_APP
1353 ; SSE-WIN-NEXT: #APP
1354 ; SSE-WIN-NEXT: #NO_APP
1355 ; SSE-WIN-NEXT: #APP
1356 ; SSE-WIN-NEXT: #NO_APP
1357 ; SSE-WIN-NEXT: #APP
1358 ; SSE-WIN-NEXT: #NO_APP
1359 ; SSE-WIN-NEXT: #APP
1360 ; SSE-WIN-NEXT: #NO_APP
1361 ; SSE-WIN-NEXT: movl $1, %r9d
1362 ; SSE-WIN-NEXT: xorl %r11d, %r11d
1363 ; SSE-WIN-NEXT: .p2align 4, 0x90
1364 ; SSE-WIN-NEXT: .LBB13_1: # %inner_loop
1365 ; SSE-WIN-NEXT: # =>This Inner Loop Header: Depth=1
1366 ; SSE-WIN-NEXT: movq %r11, %r10
1367 ; SSE-WIN-NEXT: movq %r11, %rax
1368 ; SSE-WIN-NEXT: shrq $6, %rax
1369 ; SSE-WIN-NEXT: movq (%rdx,%rax,8), %rax
1370 ; SSE-WIN-NEXT: btq %r11, %rax
1371 ; SSE-WIN-NEXT: leaq 1(%r11), %r11
1372 ; SSE-WIN-NEXT: jae .LBB13_1
1373 ; SSE-WIN-NEXT: # %bb.2: # %loop_end
1374 ; SSE-WIN-NEXT: # in Loop: Header=BB13_1 Depth=1
1375 ; SSE-WIN-NEXT: leaq 1(%r9), %r10
1376 ; SSE-WIN-NEXT: xorps %xmm4, %xmm4
1377 ; SSE-WIN-NEXT: cvtsi2sd %r10, %xmm4
1378 ; SSE-WIN-NEXT: movapd %xmm2, %xmm5
1379 ; SSE-WIN-NEXT: subsd %xmm4, %xmm5
1380 ; SSE-WIN-NEXT: mulsd %xmm3, %xmm5
1381 ; SSE-WIN-NEXT: leaq -1(%r11), %rax
1382 ; SSE-WIN-NEXT: xorps %xmm4, %xmm4
1383 ; SSE-WIN-NEXT: cvtsi2sd %rax, %xmm4
1384 ; SSE-WIN-NEXT: mulsd %xmm1, %xmm4
1385 ; SSE-WIN-NEXT: addsd %xmm5, %xmm4
1386 ; SSE-WIN-NEXT: divsd %xmm0, %xmm4
1387 ; SSE-WIN-NEXT: movsd %xmm4, -8(%rcx,%r9,8)
1388 ; SSE-WIN-NEXT: movq %r10, %r9
1389 ; SSE-WIN-NEXT: cmpq %r10, %r8
1390 ; SSE-WIN-NEXT: jge .LBB13_1
1391 ; SSE-WIN-NEXT: # %bb.3: # %loopdone
1392 ; SSE-WIN-NEXT: movaps (%rsp), %xmm7 # 16-byte Reload
1393 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1394 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1395 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1396 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
1397 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
1398 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
1399 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1400 ; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1401 ; SSE-WIN-NEXT: addq $152, %rsp
1402 ; SSE-WIN-NEXT: retq
1403 ; SSE-WIN-NEXT: .seh_endproc
1405 ; AVX1-LABEL: loopclearance2:
1406 ; AVX1: # %bb.0: # %entry
1407 ; AVX1-NEXT: subq $152, %rsp
1408 ; AVX1-NEXT: .seh_stackalloc 152
1409 ; AVX1-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1410 ; AVX1-NEXT: .seh_savexmm %xmm15, 128
1411 ; AVX1-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1412 ; AVX1-NEXT: .seh_savexmm %xmm14, 112
1413 ; AVX1-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1414 ; AVX1-NEXT: .seh_savexmm %xmm13, 96
1415 ; AVX1-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1416 ; AVX1-NEXT: .seh_savexmm %xmm12, 80
1417 ; AVX1-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1418 ; AVX1-NEXT: .seh_savexmm %xmm11, 64
1419 ; AVX1-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1420 ; AVX1-NEXT: .seh_savexmm %xmm10, 48
1421 ; AVX1-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1422 ; AVX1-NEXT: .seh_savexmm %xmm9, 32
1423 ; AVX1-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1424 ; AVX1-NEXT: .seh_savexmm %xmm8, 16
1425 ; AVX1-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill
1426 ; AVX1-NEXT: .seh_savexmm %xmm7, 0
1427 ; AVX1-NEXT: .seh_endprologue
1428 ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r8
1429 ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1430 ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1432 ; AVX1-NEXT: #NO_APP
1434 ; AVX1-NEXT: #NO_APP
1436 ; AVX1-NEXT: #NO_APP
1438 ; AVX1-NEXT: #NO_APP
1440 ; AVX1-NEXT: #NO_APP
1442 ; AVX1-NEXT: #NO_APP
1444 ; AVX1-NEXT: #NO_APP
1445 ; AVX1-NEXT: movl $1, %r9d
1446 ; AVX1-NEXT: xorl %r11d, %r11d
1447 ; AVX1-NEXT: .p2align 4, 0x90
1448 ; AVX1-NEXT: .LBB13_1: # %inner_loop
1449 ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
1450 ; AVX1-NEXT: movq %r11, %r10
1451 ; AVX1-NEXT: movq %r11, %rax
1452 ; AVX1-NEXT: shrq $6, %rax
1453 ; AVX1-NEXT: movq (%rdx,%rax,8), %rax
1454 ; AVX1-NEXT: btq %r11, %rax
1455 ; AVX1-NEXT: leaq 1(%r11), %r11
1456 ; AVX1-NEXT: jae .LBB13_1
1457 ; AVX1-NEXT: # %bb.2: # %loop_end
1458 ; AVX1-NEXT: # in Loop: Header=BB13_1 Depth=1
1459 ; AVX1-NEXT: leaq 1(%r9), %r10
1460 ; AVX1-NEXT: vcvtsi2sd %r10, %xmm6, %xmm4
1461 ; AVX1-NEXT: vsubsd %xmm4, %xmm2, %xmm4
1462 ; AVX1-NEXT: vmulsd %xmm3, %xmm4, %xmm4
1463 ; AVX1-NEXT: leaq -1(%r11), %rax
1464 ; AVX1-NEXT: vcvtsi2sd %rax, %xmm6, %xmm5
1465 ; AVX1-NEXT: vmulsd %xmm1, %xmm5, %xmm5
1466 ; AVX1-NEXT: vaddsd %xmm5, %xmm4, %xmm4
1467 ; AVX1-NEXT: vdivsd %xmm0, %xmm4, %xmm4
1468 ; AVX1-NEXT: vmovsd %xmm4, -8(%rcx,%r9,8)
1469 ; AVX1-NEXT: movq %r10, %r9
1470 ; AVX1-NEXT: cmpq %r10, %r8
1471 ; AVX1-NEXT: jge .LBB13_1
1472 ; AVX1-NEXT: # %bb.3: # %loopdone
1473 ; AVX1-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload
1474 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1475 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1476 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1477 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
1478 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
1479 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
1480 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1481 ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1482 ; AVX1-NEXT: addq $152, %rsp
1484 ; AVX1-NEXT: .seh_endproc
1486 ; AVX512VL-LABEL: loopclearance2:
1487 ; AVX512VL: # %bb.0: # %entry
1488 ; AVX512VL-NEXT: subq $152, %rsp
1489 ; AVX512VL-NEXT: .seh_stackalloc 152
1490 ; AVX512VL-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1491 ; AVX512VL-NEXT: .seh_savexmm %xmm15, 128
1492 ; AVX512VL-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1493 ; AVX512VL-NEXT: .seh_savexmm %xmm14, 112
1494 ; AVX512VL-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1495 ; AVX512VL-NEXT: .seh_savexmm %xmm13, 96
1496 ; AVX512VL-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1497 ; AVX512VL-NEXT: .seh_savexmm %xmm12, 80
1498 ; AVX512VL-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1499 ; AVX512VL-NEXT: .seh_savexmm %xmm11, 64
1500 ; AVX512VL-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1501 ; AVX512VL-NEXT: .seh_savexmm %xmm10, 48
1502 ; AVX512VL-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1503 ; AVX512VL-NEXT: .seh_savexmm %xmm9, 32
1504 ; AVX512VL-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1505 ; AVX512VL-NEXT: .seh_savexmm %xmm8, 16
1506 ; AVX512VL-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill
1507 ; AVX512VL-NEXT: .seh_savexmm %xmm7, 0
1508 ; AVX512VL-NEXT: .seh_endprologue
1509 ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1510 ; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r8
1511 ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1512 ; AVX512VL-NEXT: #APP
1513 ; AVX512VL-NEXT: #NO_APP
1514 ; AVX512VL-NEXT: #APP
1515 ; AVX512VL-NEXT: #NO_APP
1516 ; AVX512VL-NEXT: #APP
1517 ; AVX512VL-NEXT: #NO_APP
1518 ; AVX512VL-NEXT: #APP
1519 ; AVX512VL-NEXT: #NO_APP
1520 ; AVX512VL-NEXT: #APP
1521 ; AVX512VL-NEXT: #NO_APP
1522 ; AVX512VL-NEXT: #APP
1523 ; AVX512VL-NEXT: #NO_APP
1524 ; AVX512VL-NEXT: #APP
1525 ; AVX512VL-NEXT: #NO_APP
1526 ; AVX512VL-NEXT: movl $1, %r9d
1527 ; AVX512VL-NEXT: xorl %r11d, %r11d
1528 ; AVX512VL-NEXT: .p2align 4, 0x90
1529 ; AVX512VL-NEXT: .LBB13_1: # %inner_loop
1530 ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1
1531 ; AVX512VL-NEXT: movq %r11, %r10
1532 ; AVX512VL-NEXT: movq %r11, %rax
1533 ; AVX512VL-NEXT: shrq $6, %rax
1534 ; AVX512VL-NEXT: movq (%rdx,%rax,8), %rax
1535 ; AVX512VL-NEXT: btq %r11, %rax
1536 ; AVX512VL-NEXT: leaq 1(%r11), %r11
1537 ; AVX512VL-NEXT: jae .LBB13_1
1538 ; AVX512VL-NEXT: # %bb.2: # %loop_end
1539 ; AVX512VL-NEXT: # in Loop: Header=BB13_1 Depth=1
1540 ; AVX512VL-NEXT: leaq 1(%r9), %r10
1541 ; AVX512VL-NEXT: vcvtsi2sd %r10, %xmm6, %xmm4
1542 ; AVX512VL-NEXT: vsubsd %xmm4, %xmm2, %xmm4
1543 ; AVX512VL-NEXT: vmulsd %xmm3, %xmm4, %xmm4
1544 ; AVX512VL-NEXT: leaq -1(%r11), %rax
1545 ; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm6, %xmm5
1546 ; AVX512VL-NEXT: vmulsd %xmm1, %xmm5, %xmm5
1547 ; AVX512VL-NEXT: vaddsd %xmm5, %xmm4, %xmm4
1548 ; AVX512VL-NEXT: vdivsd %xmm0, %xmm4, %xmm4
1549 ; AVX512VL-NEXT: vmovsd %xmm4, -8(%rcx,%r9,8)
1550 ; AVX512VL-NEXT: movq %r10, %r9
1551 ; AVX512VL-NEXT: cmpq %r10, %r8
1552 ; AVX512VL-NEXT: jge .LBB13_1
1553 ; AVX512VL-NEXT: # %bb.3: # %loopdone
1554 ; AVX512VL-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload
1555 ; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1556 ; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1557 ; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1558 ; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
1559 ; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
1560 ; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
1561 ; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1562 ; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1563 ; AVX512VL-NEXT: addq $152, %rsp
1564 ; AVX512VL-NEXT: retq
1565 ; AVX512VL-NEXT: .seh_endproc
1567 tail call void asm sideeffect "", "~{xmm7},~{dirflag},~{fpsr},~{flags}"()
1568 tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
1569 tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
1570 tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"()
1571 tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"()
1572 tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"()
1573 tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
1577 %phi_i = phi i64 [ 1, %entry ], [ %nexti, %loop_end ]
1578 %phi_j = phi i64 [ 1, %entry ], [ %nextj, %loop_end ]
1579 %phi_k = phi i64 [ 0, %entry ], [ %nextk, %loop_end ]
1580 br label %inner_loop
1583 %phi = phi i64 [ %phi_k, %loop ], [ %nextk, %inner_loop ]
1584 %idx = lshr i64 %phi, 6
1585 %inputptr = getelementptr i64, i64* %x, i64 %idx
1586 %input = load i64, i64* %inputptr, align 8
1587 %masked = and i64 %phi, 63
1588 %shiftedmasked = shl i64 1, %masked
1589 %maskedinput = and i64 %input, %shiftedmasked
1590 %cmp = icmp eq i64 %maskedinput, 0
1591 %nextk = add i64 %phi, 1
1592 br i1 %cmp, label %inner_loop, label %loop_end
1595 %nexti = add i64 %phi_i, 1
1596 %nextj = add i64 %phi_j, 1
1597 ; Register use, plus the inline asm above clobbering xmm7-xmm15 (and
1598 ; xmm16-xmm31), basically forces xmm6 here as the only reasonable choice. The
1599 ; primary thing we care about is that it's not one of the registers used in the loop (e.g. not the output reg here)
1600 %nexti_f = sitofp i64 %nexti to double
1601 %sub = fsub double %c1, %nexti_f
1602 %mul = fmul double %sub, %c2
1603 %phi_f = sitofp i64 %phi to double
1604 %mul2 = fmul double %phi_f, %c3
1605 %add2 = fadd double %mul, %mul2
1606 %div = fdiv double %add2, %c4
1607 %prev_j = add i64 %phi_j, -1
1608 %outptr = getelementptr double, double* %y, i64 %prev_j
1609 store double %div, double* %outptr, align 8
1610 %done = icmp slt i64 %size, %nexti
1611 br i1 %done, label %loopdone, label %loop