; RUN: opt -S -mtriple=amdgcn-- -mcpu=bonaire -loop-reduce < %s | FileCheck -check-prefix=OPT %s

; Test that loops with different maximum offsets for different address
; spaces are correctly handled.

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
; OPT-LABEL: @test_global_addressing_loop_uniform_index_max_offset_i32(
; OPT: .lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(1) %arg1, i64 4095
; OPT: br label %.lr.ph
; OPT: %lsr.iv3 = phi ptr addrspace(1) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: load i8, ptr addrspace(1) %lsr.iv3, align 1
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(1) %lsr.iv3, i64 1
; Global (addrspace 1) access with a uniform constant byte offset of 4095.
; The test name says this is the maximum immediate offset for this address
; space on this subtarget (NOTE(review): presumably the CI/bonaire global
; addressing-mode limit -- confirm against the target's addressing-mode
; query), so LSR is expected to form a single pointer IV with the offset
; materialized once in the preheader, as the OPT checks above require.
define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(1) noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  ; Loop-invariant 4095 byte offset from %arg1, added to the induction variable.
  %tmp1 = add nuw nsw i64 %indvars.iv, 4095
  %tmp2 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %tmp1
  %tmp3 = load i8, ptr addrspace(1) %tmp2, align 1
  %tmp4 = sext i8 %tmp3 to i32
  ; Read-modify-write of %arg0[i] with the loaded byte.
  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %indvars.iv
  %tmp6 = load i32, ptr addrspace(1) %tmp5, align 4
  %tmp7 = add nsw i32 %tmp6, %tmp4
  store i32 %tmp7, ptr addrspace(1) %tmp5, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}
; OPT-LABEL: @test_global_addressing_loop_uniform_index_max_offset_p1_i32(
; OPT: {{^}}.lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(1) %arg1, i64 4096
; OPT: br label %.lr.ph
; OPT: %lsr.iv3 = phi ptr addrspace(1) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(1) %lsr.iv3, i64 1
; Same as the previous test, but with offset 4096 = max + 1 (per the "_p1"
; test name), i.e. one past the immediate-offset limit for global addressing,
; so the offset can no longer be folded into the load's addressing mode and
; must be added to the base in the preheader, as the OPT checks above require.
define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(1) noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  ; Loop-invariant 4096 byte offset (one past the max immediate offset).
  %tmp1 = add nuw nsw i64 %indvars.iv, 4096
  %tmp2 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %tmp1
  %tmp3 = load i8, ptr addrspace(1) %tmp2, align 1
  %tmp4 = sext i8 %tmp3 to i32
  ; Read-modify-write of %arg0[i] with the loaded byte.
  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %indvars.iv
  %tmp6 = load i32, ptr addrspace(1) %tmp5, align 4
  %tmp7 = add nsw i32 %tmp6, %tmp4
  store i32 %tmp7, ptr addrspace(1) %tmp5, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}
; OPT-LABEL: @test_local_addressing_loop_uniform_index_max_offset_i32(
; OPT: .lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(3) %arg1, i32 65535
; OPT: br label %.lr.ph
; OPT: %lsr.iv3 = phi ptr addrspace(3) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %tmp4 = load i8, ptr addrspace(3) %lsr.iv3, align 1
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv3, i32 1
; Local (addrspace 3, LDS) access with a uniform constant byte offset of
; 65535.  Per the test name this is the maximum immediate offset for local
; addressing (NOTE(review): presumably the 16-bit DS-instruction offset limit
; -- confirm against the target's addressing-mode query), exercising a
; different limit than the global-addressing tests above.  Note the i64
; index is truncated to i32 because p3 pointers are 32-bit per the
; datalayout.
define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(3) noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  ; Loop-invariant 65535 byte offset from %arg1; index narrowed to the
  ; 32-bit local address space.
  %tmp1 = add nuw nsw i64 %indvars.iv, 65535
  %tmp2 = trunc i64 %tmp1 to i32
  %tmp3 = getelementptr inbounds i8, ptr addrspace(3) %arg1, i32 %tmp2
  %tmp4 = load i8, ptr addrspace(3) %tmp3, align 1
  %tmp5 = sext i8 %tmp4 to i32
  ; Read-modify-write of %arg0[i] with the loaded byte.
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %indvars.iv
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = add nsw i32 %tmp7, %tmp5
  store i32 %tmp8, ptr addrspace(1) %tmp6, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}
; OPT-LABEL: @test_local_addressing_loop_uniform_index_max_offset_p1_i32(
; OPT: {{^}}.lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(3) %arg1, i32 65536
; OPT: br label %.lr.ph
; OPT: %lsr.iv3 = phi ptr addrspace(3) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv3, i32 1
; Same as the previous local-addressing test, but with offset 65536 = max + 1
; (per the "_p1" test name), i.e. one past the immediate-offset limit for
; local addressing, so the offset must be added to the base in the
; preheader, as the OPT checks above require.
define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(3) noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %bb
  br label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
  ret void

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  ; Loop-invariant 65536 byte offset (one past the max immediate offset);
  ; index narrowed to the 32-bit local address space.
  %tmp1 = add nuw nsw i64 %indvars.iv, 65536
  %tmp2 = trunc i64 %tmp1 to i32
  %tmp3 = getelementptr inbounds i8, ptr addrspace(3) %arg1, i32 %tmp2
  %tmp4 = load i8, ptr addrspace(3) %tmp3, align 1
  %tmp5 = sext i8 %tmp4 to i32
  ; Read-modify-write of %arg0[i] with the loaded byte.
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg0, i64 %indvars.iv
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  %tmp8 = add nsw i32 %tmp7, %tmp5
  store i32 %tmp8, ptr addrspace(1) %tmp6, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}
attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hawaii" "unsafe-fp-math"="false" "use-soft-float"="false" }