1 ; RUN: opt -S -mtriple=amdgcn-- -mcpu=bonaire -loop-reduce < %s | FileCheck -check-prefix=OPT %s
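3 ; Test that loop strength reduction honors the maximum constant addressing offset of each
4 ; address space: an offset at the limit stays on the in-loop address, while limit + 1 is hoisted into the preheader.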
; Pointer widths per address space are fixed here: p1 (address space 1) is
; 64 bits and p3 (address space 3) is 32 bits, which matches the i64 and i32
; GEP indices used by the addrspace(1) and addrspace(3) tests in this file.
6 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
8 ; OPT-LABEL: @test_global_addressing_loop_uniform_index_max_offset_i32(
10 ; OPT: %lsr.iv2 = phi i8 addrspace(1)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
11 ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(1)* %lsr.iv2, i64 4095
12 ; OPT: load i8, i8 addrspace(1)* %scevgep4, align 1
; Kernel under test: for each i in [0, %n), adds the sign-extended byte at
; %arg1[i + 4095] (address space 1) to the i32 at %arg0[i] and stores the sum
; back. The FileCheck expectations preceding this function require the 4095
; to remain a constant GEP offset applied to the pointer induction variable,
; whose phi starts at the unmodified %arg1.
; Judging by the test name, 4095 is presumably the largest offset the target
; can fold for this address space -- TODO confirm against the backend.
13 define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
; Guard: the loop is only entered when %n is strictly positive.
15 %tmp = icmp sgt i32 %n, 0
16 br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
18 .lr.ph.preheader: ; preds = %bb
21 ._crit_edge.loopexit: ; preds = %.lr.ph
24 ._crit_edge: ; preds = %._crit_edge.loopexit, %bb
27 .lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
28 %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
; Byte index = loop counter + the constant offset being tested.
29 %tmp1 = add nuw nsw i64 %indvars.iv, 4095
30 %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %tmp1
31 %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1
32 %tmp4 = sext i8 %tmp3 to i32
; Read-modify-write of the i32 element at the un-offset index.
33 %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %indvars.iv
34 %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
35 %tmp7 = add nsw i32 %tmp6, %tmp4
36 store i32 %tmp7, i32 addrspace(1)* %tmp5, align 4
; Advance the counter; the exit test compares its low 32 bits with %n.
37 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
38 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
39 %exitcond = icmp eq i32 %lftr.wideiv, %n
40 br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
43 ; OPT-LABEL: @test_global_addressing_loop_uniform_index_max_offset_p1_i32(
44 ; OPT: {{^}}.lr.ph.preheader:
45 ; OPT: %scevgep2 = getelementptr i8, i8 addrspace(1)* %arg1, i64 4096
46 ; OPT: br label %.lr.ph
49 ; OPT: %lsr.iv3 = phi i8 addrspace(1)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
50 ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(1)* %lsr.iv3, i64 1
; Kernel under test: same loop as the 4095 variant, but the byte read from
; %arg1 (address space 1) uses a constant offset of 4096. The FileCheck
; expectations preceding this function require the 4096 to be added to %arg1
; once in the preheader, with the pointer induction variable starting from
; that sum and advancing by 1 per iteration.
; The "_p1" suffix presumably means "max offset plus one", i.e. the first
; offset that can no longer be folded -- TODO confirm against the backend.
51 define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
; Guard: the loop is only entered when %n is strictly positive.
53 %tmp = icmp sgt i32 %n, 0
54 br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
56 .lr.ph.preheader: ; preds = %bb
59 ._crit_edge.loopexit: ; preds = %.lr.ph
62 ._crit_edge: ; preds = %._crit_edge.loopexit, %bb
65 .lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
66 %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
; Byte index = loop counter + the constant offset being tested.
67 %tmp1 = add nuw nsw i64 %indvars.iv, 4096
68 %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %tmp1
69 %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1
70 %tmp4 = sext i8 %tmp3 to i32
; Read-modify-write of the i32 element at the un-offset index.
71 %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %indvars.iv
72 %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
73 %tmp7 = add nsw i32 %tmp6, %tmp4
74 store i32 %tmp7, i32 addrspace(1)* %tmp5, align 4
; Advance the counter; the exit test compares its low 32 bits with %n.
75 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
76 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
77 %exitcond = icmp eq i32 %lftr.wideiv, %n
78 br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
81 ; OPT-LABEL: @test_local_addressing_loop_uniform_index_max_offset_i32(
83 ; OPT: %lsr.iv2 = phi i8 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
84 ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(3)* %lsr.iv2, i32 65535
85 ; OPT: %tmp4 = load i8, i8 addrspace(3)* %scevgep4, align 1
; Kernel under test: for each i in [0, %n), adds the sign-extended byte at
; %arg1[i + 65535] (address space 3) to the i32 at %arg0[i] (address space 1)
; and stores the sum back. The FileCheck expectations preceding this function
; require the 65535 to remain a constant i32 GEP offset applied to the pointer
; induction variable, whose phi starts at the unmodified %arg1, and they also
; pin the load's result name %tmp4.
; Judging by the test name, 65535 is presumably the largest offset the target
; can fold for this address space -- TODO confirm against the backend.
86 define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
; Guard: the loop is only entered when %n is strictly positive.
88 %tmp = icmp sgt i32 %n, 0
89 br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
91 .lr.ph.preheader: ; preds = %bb
94 ._crit_edge.loopexit: ; preds = %.lr.ph
97 ._crit_edge: ; preds = %._crit_edge.loopexit, %bb
100 .lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
101 %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
; Byte index = loop counter + the constant offset being tested, narrowed to
; i32 before it indexes the addrspace(3) pointer.
102 %tmp1 = add nuw nsw i64 %indvars.iv, 65535
103 %tmp2 = trunc i64 %tmp1 to i32
104 %tmp3 = getelementptr inbounds i8, i8 addrspace(3)* %arg1, i32 %tmp2
105 %tmp4 = load i8, i8 addrspace(3)* %tmp3, align 1
106 %tmp5 = sext i8 %tmp4 to i32
; Read-modify-write of the i32 element at the un-offset i64 index.
107 %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %indvars.iv
108 %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
109 %tmp8 = add nsw i32 %tmp7, %tmp5
110 store i32 %tmp8, i32 addrspace(1)* %tmp6, align 4
; Advance the counter; the exit test compares its low 32 bits with %n.
111 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
112 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
113 %exitcond = icmp eq i32 %lftr.wideiv, %n
114 br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
117 ; OPT-LABEL: @test_local_addressing_loop_uniform_index_max_offset_p1_i32(
118 ; OPT: {{^}}.lr.ph.preheader:
119 ; OPT: %scevgep2 = getelementptr i8, i8 addrspace(3)* %arg1, i32 65536
120 ; OPT: br label %.lr.ph
123 ; OPT: %lsr.iv3 = phi i8 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
124 ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(3)* %lsr.iv3, i32 1
; Kernel under test: same loop as the 65535 variant, but the byte read from
; %arg1 (address space 3) uses a constant offset of 65536. The FileCheck
; expectations preceding this function require the 65536 to be added to %arg1
; once in the preheader, with the pointer induction variable starting from
; that sum and advancing by 1 per iteration.
; The "_p1" suffix presumably means "max offset plus one", i.e. the first
; offset that can no longer be folded -- TODO confirm against the backend.
125 define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
; Guard: the loop is only entered when %n is strictly positive.
127 %tmp = icmp sgt i32 %n, 0
128 br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
130 .lr.ph.preheader: ; preds = %bb
133 ._crit_edge.loopexit: ; preds = %.lr.ph
134 br label %._crit_edge
136 ._crit_edge: ; preds = %._crit_edge.loopexit, %bb
139 .lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
140 %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
; Byte index = loop counter + the constant offset being tested, narrowed to
; i32 before it indexes the addrspace(3) pointer.
141 %tmp1 = add nuw nsw i64 %indvars.iv, 65536
142 %tmp2 = trunc i64 %tmp1 to i32
143 %tmp3 = getelementptr inbounds i8, i8 addrspace(3)* %arg1, i32 %tmp2
144 %tmp4 = load i8, i8 addrspace(3)* %tmp3, align 1
145 %tmp5 = sext i8 %tmp4 to i32
; Read-modify-write of the i32 element at the un-offset i64 index.
146 %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %indvars.iv
147 %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
148 %tmp8 = add nsw i32 %tmp7, %tmp5
149 store i32 %tmp8, i32 addrspace(1)* %tmp6, align 4
; Advance the counter; the exit test compares its low 32 bits with %n.
150 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
151 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
152 %exitcond = icmp eq i32 %lftr.wideiv, %n
153 br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
; Attribute group #0, referenced by each kernel definition in this file.
; NOTE(review): "target-cpu" here is "hawaii" while the opt command line at the
; top of the file passes -mcpu=bonaire -- confirm the mismatch is intentional.
156 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hawaii" "unsafe-fp-math"="false" "use-soft-float"="false" }