1 ; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs -disable-block-placement < %s | FileCheck -allow-deprecated-dag-overlap %s
3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4 ;; This test checks that the x86-cmov-converter optimization transforms CMOV
5 ;; instructions into branches when it is profitable.
6 ;; There are 5 cases below:
7 ;; 1. CmovInCriticalPath:
8 ;; CMOV depends on the condition and it is in the hot path.
9 ;;      Thus, it is worth transforming.
11 ;; 2. CmovNotInCriticalPath:
12 ;; Similar test like in (1), just that CMOV is not in the hot path.
13 ;;      Thus, it is not worth transforming.
16 ;; Maximum calculation algorithm that is looking for the max index,
17 ;; calculating CMOV value is cheaper than calculating CMOV condition.
18 ;;      Thus, it is worth transforming.
21 ;; Maximum calculation algorithm that is looking for the max value,
22 ;; calculating CMOV value is not cheaper than calculating CMOV condition.
23 ;;      Thus, it is not worth transforming.
26 ;; Usually, binary search CMOV is not predicted.
27 ;;      Thus, it is not worth transforming.
29 ;; Test was created using the following command line:
30 ;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
32 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
33 ;;void CmovInHotPath(int n, int a, int b, int *c, int *d) {
34 ;; for (int i = 0; i < n; i++) {
38 ;; c[i] = (c[i] + 1) * t;
43 ;;void CmovNotInHotPath(int n, int a, int b, int *c, int *d) {
44 ;; for (int i = 0; i < n; i++) {
54 ;;int MaxIndex(int n, int *a) {
56 ;; for (int i = 1; i < n; i++) {
64 ;;int MaxValue(int n, int *a) {
66 ;; for (int i = 1; i < n; i++) {
73 ;;typedef struct Node Node;
80 ;;unsigned BinarySearch(unsigned Mask, Node *Curr, Node *Next) {
81 ;; while (Curr->Val > Next->Val) {
83 ;; if (Mask & (0x1 << Curr->Val))
84 ;; Next = Curr->Right;
92 ;;void SmallGainPerLoop(int n, int a, int b, int *c, int *d) {
93 ;; for (int i = 0; i < n; i++) {
100 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Binary-tree node used by @BinarySearch: { i32 Val, Node* Right, Node* Left }
; (field order per the getelementptr indices used in @BinarySearch below).
102 %struct.Node = type { i32, %struct.Node*, %struct.Node* }
104 ; CHECK-LABEL: CmovInHotPath
; Case 1 (CmovInCriticalPath): the select result %. feeds %mul7, which is
; stored every iteration — the CMOV sits on the loop's critical path, so the
; converter is expected to replace it with a branch.
108 define void @CmovInHotPath(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture readnone %d) #0 {
110 %cmp14 = icmp sgt i32 %n, 0
111 br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup
113 for.body.preheader: ; preds = %entry
114 %wide.trip.count = zext i32 %n to i64
117 for.cond.cleanup: ; preds = %for.body, %entry
120 for.body: ; preds = %for.body.preheader, %for.body
121 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
122 %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
123 %0 = load i32, i32* %arrayidx, align 4
124 %add = add nsw i32 %0, 1
125 %mul = mul nsw i32 %0, %a
126 %cmp3 = icmp sgt i32 %mul, %b
; This select becomes a CMOV; its result is consumed by the multiply below.
127 %. = select i1 %cmp3, i32 10, i32 %add
128 %mul7 = mul nsw i32 %., %add
129 store i32 %mul7, i32* %arrayidx, align 4
130 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
131 %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
132 br i1 %exitcond, label %for.cond.cleanup, label %for.body
135 ; CHECK-LABEL: CmovNotInHotPath
; Case 2 (CmovNotInCriticalPath): the select result is only stored, and the
; loop's cost is dominated by the sdiv — the CMOV is off the critical path,
; so the converter is expected to leave it as a CMOV.
138 define void @CmovNotInHotPath(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture %d) #0 {
140 %cmp18 = icmp sgt i32 %n, 0
141 br i1 %cmp18, label %for.body.preheader, label %for.cond.cleanup
143 for.body.preheader: ; preds = %entry
144 %wide.trip.count = zext i32 %n to i64
147 for.cond.cleanup: ; preds = %for.body, %entry
150 for.body: ; preds = %for.body.preheader, %for.body
151 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
152 %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
153 %0 = load i32, i32* %arrayidx, align 4
154 %mul = mul nsw i32 %0, %a
155 %cmp3 = icmp sgt i32 %mul, %b
; Select result goes straight to memory; nothing downstream depends on it.
156 %. = select i1 %cmp3, i32 10, i32 %0
157 store i32 %., i32* %arrayidx, align 4
158 %arrayidx7 = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
159 %1 = load i32, i32* %arrayidx7, align 4
160 %div = sdiv i32 %1, %b
161 store i32 %div, i32* %arrayidx7, align 4
162 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
163 %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
164 br i1 %exitcond, label %for.cond.cleanup, label %for.body
167 ; CHECK-LABEL: MaxIndex
; Case 3 (MaxIndex): the select picks between two cheap values (the current
; index vs. the running best index), while the condition needs two loads and
; a compare — value is cheaper than condition, so conversion is expected.
171 define i32 @MaxIndex(i32 %n, i32* nocapture readonly %a) #0 {
173 %cmp14 = icmp sgt i32 %n, 1
174 br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup
176 for.body.preheader: ; preds = %entry
177 %wide.trip.count = zext i32 %n to i64
180 for.cond.cleanup: ; preds = %for.body, %entry
181 %t.0.lcssa = phi i32 [ 0, %entry ], [ %i.0.t.0, %for.body ]
184 for.body: ; preds = %for.body.preheader, %for.body
185 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %for.body.preheader ]
186 %t.015 = phi i32 [ %i.0.t.0, %for.body ], [ 0, %for.body.preheader ]
187 %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
188 %0 = load i32, i32* %arrayidx, align 4
189 %idxprom1 = sext i32 %t.015 to i64
190 %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %idxprom1
191 %1 = load i32, i32* %arrayidx2, align 4
192 %cmp3 = icmp sgt i32 %0, %1
193 %2 = trunc i64 %indvars.iv to i32
; Keep the index of the larger element seen so far.
194 %i.0.t.0 = select i1 %cmp3, i32 %2, i32 %t.015
195 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
196 %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
197 br i1 %exitcond, label %for.cond.cleanup, label %for.body
200 ; CHECK-LABEL: MaxValue
; Case 4 (MaxValue): the select's inputs (loaded value vs. running max) are
; the same operands as the condition's — computing the value is not cheaper
; than computing the condition, so the CMOV is expected to stay.
204 define i32 @MaxValue(i32 %n, i32* nocapture readonly %a) #0 {
206 %0 = load i32, i32* %a, align 4
207 %cmp13 = icmp sgt i32 %n, 1
208 br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup
210 for.body.preheader: ; preds = %entry
211 %wide.trip.count = zext i32 %n to i64
214 for.cond.cleanup: ; preds = %for.body, %entry
215 %t.0.lcssa = phi i32 [ %0, %entry ], [ %.t.0, %for.body ]
218 for.body: ; preds = %for.body.preheader, %for.body
219 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %for.body.preheader ]
220 %t.014 = phi i32 [ %.t.0, %for.body ], [ %0, %for.body.preheader ]
221 %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
222 %1 = load i32, i32* %arrayidx1, align 4
223 %cmp2 = icmp sgt i32 %1, %t.014
; Keep the larger of the current element and the running maximum.
224 %.t.0 = select i1 %cmp2, i32 %1, i32 %t.014
225 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
226 %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
227 br i1 %exitcond, label %for.cond.cleanup, label %for.body
230 ; CHECK-LABEL: BinarySearch
; Case 5 (BinarySearch): tree descent where the branch direction depends on a
; mask bit — typically unpredictable, so converting the CMOV to a branch is
; not expected to pay off.
233 define i32 @BinarySearch(i32 %Mask, %struct.Node* nocapture readonly %Curr, %struct.Node* nocapture readonly %Next) #0 {
235 %Val8 = getelementptr inbounds %struct.Node, %struct.Node* %Curr, i64 0, i32 0
236 %0 = load i32, i32* %Val8, align 8
237 %Val19 = getelementptr inbounds %struct.Node, %struct.Node* %Next, i64 0, i32 0
238 %1 = load i32, i32* %Val19, align 8
239 %cmp10 = icmp ugt i32 %0, %1
240 br i1 %cmp10, label %while.body, label %while.end
242 while.body: ; preds = %entry, %while.body
243 %2 = phi i32 [ %4, %while.body ], [ %1, %entry ]
244 %Next.addr.011 = phi %struct.Node* [ %3, %while.body ], [ %Next, %entry ]
246 %and = and i32 %shl, %Mask
247 %tobool = icmp eq i32 %and, 0
248 %Left = getelementptr inbounds %struct.Node, %struct.Node* %Next.addr.011, i64 0, i32 2
249 %Right = getelementptr inbounds %struct.Node, %struct.Node* %Next.addr.011, i64 0, i32 1
; Select the child pointer (Left or Right) based on the mask test; this is
; the CMOV under test.
250 %Left.sink = select i1 %tobool, %struct.Node** %Left, %struct.Node** %Right
251 %3 = load %struct.Node*, %struct.Node** %Left.sink, align 8
252 %Val1 = getelementptr inbounds %struct.Node, %struct.Node* %3, i64 0, i32 0
253 %4 = load i32, i32* %Val1, align 8
254 %cmp = icmp ugt i32 %2, %4
255 br i1 %cmp, label %while.body, label %while.end
257 while.end: ; preds = %while.body, %entry
258 %.lcssa = phi i32 [ %0, %entry ], [ %2, %while.body ]
262 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
263 ;; The following test checks that x86-cmov-converter optimization transforms
264 ;; CMOV instructions into branches correctly.
268 ;; v1 = CMOVgt t1, f1, cond
269 ;; v2 = CMOVle s1, f2, cond
271 ;; Where: t1 = 11, f1 = 22, f2 = a
273 ;; After CMOV transformation
274 ;; -------------------------
283 ;; %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
284 ;; %v2 = phi[%f1, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
285 ;; ; true-value with false-value
286 ;; ; Phi instruction cannot use
287 ;; ; previous Phi instruction result
288 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
290 ; CHECK-LABEL: Transform
292 ; CHECK: divl [[a:%[0-9a-z]*]]
293 ; CHECK: movl $11, [[s1:%[0-9a-z]*]]
294 ; CHECK: movl [[a]], [[s2:%[0-9a-z]*]]
295 ; CHECK: cmpl [[a]], %edx
296 ; CHECK: ja [[SinkBB:.*]]
297 ; CHECK: [[FalseBB:.*]]:
298 ; CHECK: movl $22, [[s1]]
299 ; CHECK: movl $22, [[s2]]
; Checks phi construction after CMOV-group conversion: %s1 and %s2 use
; opposite condition codes (ugt vs. ule), so the converter must swap
; true/false values for one of them when building the sink-block phis
; (a phi cannot use the result of a preceding phi in the same block).
303 define void @Transform(i32 *%arr, i32 *%arr2, i32 %a, i32 %b, i32 %c, i32 %n) #0 {
; NOTE(review): `0 ugt %n` is always false, so while.body is statically
; unreachable from entry — presumably intentional for this codegen test;
; confirm against the upstream test before changing.
305 %cmp10 = icmp ugt i32 0, %n
306 br i1 %cmp10, label %while.body, label %while.end
308 while.body: ; preds = %entry, %while.body
309 %i = phi i32 [ %i_inc, %while.body ], [ 0, %entry ]
310 %arr_i = getelementptr inbounds i32, i32* %arr, i32 %i
311 %x = load i32, i32* %arr_i, align 4
312 %div = udiv i32 %x, %a
; Two selects guarded by complementary conditions on the same %div/%a pair.
313 %cond = icmp ugt i32 %div, %a
314 %condOpp = icmp ule i32 %div, %a
315 %s1 = select i1 %cond, i32 11, i32 22
316 %s2 = select i1 %condOpp, i32 %s1, i32 %a
317 %sum = urem i32 %s1, %s2
318 store i32 %sum, i32* %arr_i, align 4
319 %i_inc = add i32 %i, 1
320 %cmp = icmp ugt i32 %i_inc, %n
321 br i1 %cmp, label %while.body, label %while.end
323 while.end: ; preds = %while.body, %entry
327 ; Test that we always will convert a cmov with a memory operand into a branch,
328 ; even outside of a loop.
; A single CMOV whose false operand is a load: always converted to a branch,
; even with no enclosing loop, so the load only executes on the taken path.
329 define i32 @test_cmov_memoperand(i32 %a, i32 %b, i32 %x, i32* %y) #0 {
330 ; CHECK-LABEL: test_cmov_memoperand:
332 %cond = icmp ugt i32 %a, %b
333 ; CHECK: movl %edx, %eax
335 %load = load i32, i32* %y
336 %z = select i1 %cond, i32 %x, i32 %load
338 ; CHECK: ja [[FALSE_BB:.*]]
339 ; CHECK: movl (%rcx), %eax
340 ; CHECK: [[FALSE_BB]]:
344 ; Test that we can convert a group of cmovs where only one has a memory
; Three selects on the same condition form one CMOV group; only %z2 has a
; memory (load) operand, but the whole group is converted together.
346 define i32 @test_cmov_memoperand_in_group(i32 %a, i32 %b, i32 %x, i32* %y.ptr) #0 {
347 ; CHECK-LABEL: test_cmov_memoperand_in_group:
349 %cond = icmp ugt i32 %a, %b
350 ; CHECK: movl %edx, %eax
352 %y = load i32, i32* %y.ptr
353 %z1 = select i1 %cond, i32 %x, i32 %a
354 %z2 = select i1 %cond, i32 %x, i32 %y
355 %z3 = select i1 %cond, i32 %x, i32 %b
357 ; CHECK: ja [[FALSE_BB:.*]]
358 ; CHECK-DAG: movl %{{.*}}, %[[R1:.*]]
359 ; CHECK-DAG: movl (%r{{..}}), %[[R2:.*]]
360 ; CHECK-DAG: movl %{{.*}} %eax
361 ; CHECK: [[FALSE_BB]]:
; Sum the three results so none of the selects is dead.
371 %s1 = add i32 %z1, %z2
372 %s2 = add i32 %s1, %z3
376 ; Same as before but with operands reversed in the select with a load.
; Same group shape as above but with true/false operands swapped (the load
; is the TRUE value), so the fall-through branch is jbe instead of ja.
377 define i32 @test_cmov_memoperand_in_group2(i32 %a, i32 %b, i32 %x, i32* %y.ptr) #0 {
378 ; CHECK-LABEL: test_cmov_memoperand_in_group2:
380 %cond = icmp ugt i32 %a, %b
381 ; CHECK: movl %edx, %eax
383 %y = load i32, i32* %y.ptr
384 %z2 = select i1 %cond, i32 %a, i32 %x
385 %z1 = select i1 %cond, i32 %y, i32 %x
386 %z3 = select i1 %cond, i32 %b, i32 %x
388 ; CHECK: jbe [[FALSE_BB:.*]]
389 ; CHECK-DAG: movl %{{.*}}, %[[R1:.*]]
390 ; CHECK-DAG: movl (%r{{..}}), %[[R2:.*]]
391 ; CHECK-DAG: movl %{{.*}} %eax
392 ; CHECK: [[FALSE_BB]]:
; Sum the three results so none of the selects is dead.
402 %s1 = add i32 %z1, %z2
403 %s2 = add i32 %s1, %z3
407 ; Test that we don't convert a group of cmovs with conflicting directions of
; Negative test: the two selects put their loads on OPPOSITE sides of the
; condition (%y1 is the false value of %z1, %y2 the true value of %z2), so
; the group has conflicting directions and must NOT be converted.
409 define i32 @test_cmov_memoperand_conflicting_dir(i32 %a, i32 %b, i32 %x, i32* %y1.ptr, i32* %y2.ptr) #0 {
410 ; CHECK-LABEL: test_cmov_memoperand_conflicting_dir:
412 %cond = icmp ugt i32 %a, %b
414 %y1 = load i32, i32* %y1.ptr
415 %y2 = load i32, i32* %y2.ptr
416 %z1 = select i1 %cond, i32 %x, i32 %y1
417 %z2 = select i1 %cond, i32 %y2, i32 %x
420 %s1 = add i32 %z1, %z2
424 ; Test that we can convert a group of cmovs where only one has a memory
425 ; operand and where that memory operand's registers come from a prior cmov in
; The pointer select %p feeds the address of the load used by the second
; select %z — the memory operand's register comes from a prior cmov in the
; same group, and the group is still convertible.
427 define i32 @test_cmov_memoperand_in_group_reuse_for_addr(i32 %a, i32 %b, i32* %x, i32* %y) #0 {
428 ; CHECK-LABEL: test_cmov_memoperand_in_group_reuse_for_addr:
430 %cond = icmp ugt i32 %a, %b
431 ; CHECK: movl %edi, %eax
433 %p = select i1 %cond, i32* %x, i32* %y
434 %load = load i32, i32* %p
435 %z = select i1 %cond, i32 %a, i32 %load
437 ; CHECK: ja [[FALSE_BB:.*]]
438 ; CHECK: movl (%r{{..}}), %eax
439 ; CHECK: [[FALSE_BB]]:
444 ; Test that we can convert a group of two cmovs with memory operands where one
445 ; uses the result of the other as part of the address.
; Two memory-operand cmovs where the first load's result (%load1) is a select
; input and the selected pointer %p is then dereferenced for the second cmov.
446 define i32 @test_cmov_memoperand_in_group_reuse_for_addr2(i32 %a, i32 %b, i32* %x, i32** %y) #0 {
447 ; CHECK-LABEL: test_cmov_memoperand_in_group_reuse_for_addr2:
449 %cond = icmp ugt i32 %a, %b
450 ; CHECK: movl %edi, %eax
452 %load1 = load i32*, i32** %y
453 %p = select i1 %cond, i32* %x, i32* %load1
454 %load2 = load i32, i32* %p
455 %z = select i1 %cond, i32 %a, i32 %load2
457 ; CHECK: ja [[FALSE_BB:.*]]
458 ; CHECK: movq (%r{{..}}), %[[R1:.*]]
459 ; CHECK: movl (%[[R1]]), %eax
460 ; CHECK: [[FALSE_BB]]:
465 ; Test that we can convert a group of cmovs where only one has a memory
466 ; operand and where that memory operand's registers come from a prior cmov and
467 ; where that cmov gets *its* input from a prior cmov in the group.
; Chained pointer selects: %p2 takes its false input from the prior select
; %p, and the load through %p2 feeds the final select %r — a cmov whose
; address registers come from a cmov that itself consumes a cmov result.
468 define i32 @test_cmov_memoperand_in_group_reuse_for_addr3(i32 %a, i32 %b, i32* %x, i32* %y, i32* %z) #0 {
469 ; CHECK-LABEL: test_cmov_memoperand_in_group_reuse_for_addr3:
471 %cond = icmp ugt i32 %a, %b
472 ; CHECK: movl %edi, %eax
474 %p = select i1 %cond, i32* %x, i32* %y
475 %p2 = select i1 %cond, i32* %z, i32* %p
476 %load = load i32, i32* %p2
477 %r = select i1 %cond, i32 %a, i32 %load
479 ; CHECK: ja [[FALSE_BB:.*]]
480 ; CHECK: movl (%r{{..}}), %eax
481 ; CHECK: [[FALSE_BB]]:
; External pointer pair bounding the buffer written by @test_memoperand_loop.
486 @begin = external global i32*
487 @end = external global i32*
; In-loop wrap-around buffer writer: each iteration bumps a pointer by two
; i32s and, when it passes @end, resets it (via select/CMOV) back to @begin.
; Both selects have pointer/memory-derived operands; the expected
; branch-converted assembly is pinned exactly by the CHECK-NEXT block below.
489 define void @test_memoperand_loop(i32 %data) #0 {
490 ; CHECK-LABEL: test_memoperand_loop:
491 ; CHECK: # %bb.0: # %entry
492 ; CHECK-NEXT: movq begin@GOTPCREL(%rip), %r8
493 ; CHECK-NEXT: movq (%r8), %rax
494 ; CHECK-NEXT: movq end@GOTPCREL(%rip), %rcx
495 ; CHECK-NEXT: movq (%rcx), %rdx
496 ; CHECK-NEXT: xorl %esi, %esi
497 ; CHECK-NEXT: movq %rax, %rcx
499 %begin = load i32*, i32** @begin, align 8
500 %end = load i32*, i32** @end, align 8
503 ; CHECK-NEXT: .LBB13_1: # %loop.body
504 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
505 ; CHECK-NEXT: addq $8, %rcx
506 ; CHECK-NEXT: cmpq %rdx, %rcx
507 ; CHECK-NEXT: ja .LBB13_3
508 ; CHECK-NEXT: # %bb.2: # %loop.body
509 ; CHECK-NEXT: # in Loop: Header=BB13_1 Depth=1
510 ; CHECK-NEXT: movq (%r8), %rcx
511 ; CHECK-NEXT: .LBB13_3: # %loop.body
512 ; CHECK-NEXT: # in Loop: Header=BB13_1 Depth=1
513 ; CHECK-NEXT: movl %edi, (%rcx)
514 ; CHECK-NEXT: addq $8, %rcx
515 ; CHECK-NEXT: cmpq %rdx, %rcx
516 ; CHECK-NEXT: ja .LBB13_5
517 ; CHECK-NEXT: # %bb.4: # %loop.body
518 ; CHECK-NEXT: # in Loop: Header=BB13_1 Depth=1
519 ; CHECK-NEXT: movq %rax, %rcx
520 ; CHECK-NEXT: .LBB13_5: # %loop.body
521 ; CHECK-NEXT: # in Loop: Header=BB13_1 Depth=1
522 ; CHECK-NEXT: movl %edi, (%rcx)
523 ; CHECK-NEXT: addl $1, %esi
524 ; CHECK-NEXT: cmpl $1024, %esi # imm = 0x400
525 ; CHECK-NEXT: jl .LBB13_1
527 %phi.iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.body ]
528 %phi.ptr = phi i32* [ %begin, %entry ], [ %dst2, %loop.body ]
529 %gep1 = getelementptr inbounds i32, i32 *%phi.ptr, i64 2
530 %cmp1 = icmp ugt i32* %gep1, %end
; First wrap-around: reset to a freshly reloaded @begin if past the end.
531 %begin_dup = load i32*, i32** @begin, align 8
532 %dst1 = select i1 %cmp1, i32* %gep1, i32* %begin_dup
533 store i32 %data, i32 *%dst1, align 4
534 %gep2 = getelementptr inbounds i32, i32 *%dst1, i64 2
535 %cmp2 = icmp ugt i32* %gep2, %end
; Second wrap-around: reset to the %begin loaded in the entry block.
536 %dst2 = select i1 %cmp2, i32* %gep2, i32* %begin
537 store i32 %data, i32 *%dst2, align 4
538 %iv.next = add i32 %phi.iv, 1
539 %cond = icmp slt i32 %iv.next, 1024
540 br i1 %cond, label %loop.body, label %exit
542 ; CHECK-NEXT: # %bb.6: # %exit
; Shared attribute group: pins the target CPU so the pass's profitability
; model (and the checked assembly) is stable across hosts.
548 attributes #0 = {"target-cpu"="x86-64"}