1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GCN %s
; copy_flat: element-wise copy loop over pointers that carry no addrspace
; qualifier. Each iteration loads one <4 x i32> from %s and stores it to %d;
; the loop is skipped when %n is zero.
; Expected output: the source base in s[2:3] is advanced by 0xb0 before the
; loop, the flat load then uses offset:-176, and s_prefetch_data is issued on
; s[2:3] with offset 0x0 - i.e. the prefetch address is 176 bytes past the
; address being loaded in the same iteration.
4 define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
5 ; GCN-LABEL: copy_flat:
6 ; GCN: ; %bb.0: ; %entry
7 ; GCN-NEXT: s_load_b32 s4, s[2:3], 0x34
8 ; GCN-NEXT: s_wait_kmcnt 0x0
9 ; GCN-NEXT: s_cmp_eq_u32 s4, 0
10 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3
11 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader
12 ; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
13 ; GCN-NEXT: s_wait_kmcnt 0x0
14 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
15 ; GCN-NEXT: .LBB0_2: ; %for.body
16 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
17 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
18 ; GCN-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
19 ; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
20 ; GCN-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
21 ; GCN-NEXT: s_add_co_i32 s4, s4, -1
22 ; GCN-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176
23 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
24 ; GCN-NEXT: s_cmp_lg_u32 s4, 0
25 ; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
26 ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
27 ; GCN-NEXT: flat_store_b128 v[4:5], v[0:3]
28 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2
29 ; GCN-NEXT: .LBB0_3: ; %for.end
; Guard: branch straight to for.end when the trip count %n is zero.
32 %cmp6.not = icmp eq i32 %n, 0
33 br i1 %cmp6.not, label %for.end, label %for.body
; Loop body: %i.07 is the element index, starting at 0 and incremented by 1
; until it equals %n.
35 for.body: ; preds = %entry, %for.body
36 %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
37 %idxprom = zext i32 %i.07 to i64
38 %arrayidx = getelementptr inbounds <4 x i32>, ptr %s, i64 %idxprom
39 %ld = load <4 x i32>, ptr %arrayidx, align 4
40 %arrayidx2 = getelementptr inbounds <4 x i32>, ptr %d, i64 %idxprom
41 store <4 x i32> %ld, ptr %arrayidx2, align 4
42 %inc = add nuw i32 %i.07, 1
43 %exitcond.not = icmp eq i32 %inc, %n
44 br i1 %exitcond.not, label %for.end, label %for.body
46 for.end: ; preds = %for.body, %entry
; copy_global: the same <4 x i32> copy loop with both pointers in
; addrspace(1). The loop is skipped when %n is zero.
; Expected output: the source base in s[2:3] is advanced by 0xb0 before the
; loop, the global load uses offset:-176, and s_prefetch_data is issued on
; s[2:3] with offset 0x0, so the prefetch address is 176 bytes past the
; address loaded in that iteration.
50 define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) {
51 ; GCN-LABEL: copy_global:
52 ; GCN: ; %bb.0: ; %entry
53 ; GCN-NEXT: s_load_b32 s4, s[2:3], 0x34
54 ; GCN-NEXT: s_wait_kmcnt 0x0
55 ; GCN-NEXT: s_cmp_eq_u32 s4, 0
56 ; GCN-NEXT: s_cbranch_scc1 .LBB1_3
57 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader
58 ; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
59 ; GCN-NEXT: v_mov_b32_e32 v0, 0
60 ; GCN-NEXT: s_wait_kmcnt 0x0
61 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
62 ; GCN-NEXT: .LBB1_2: ; %for.body
63 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
64 ; GCN-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176
65 ; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
66 ; GCN-NEXT: s_add_co_i32 s4, s4, -1
67 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
68 ; GCN-NEXT: s_cmp_lg_u32 s4, 0
69 ; GCN-NEXT: s_wait_loadcnt 0x0
70 ; GCN-NEXT: global_store_b128 v0, v[1:4], s[0:1]
71 ; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
72 ; GCN-NEXT: s_cbranch_scc1 .LBB1_2
73 ; GCN-NEXT: .LBB1_3: ; %for.end
75 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; Guard: branch straight to for.end when the trip count %n is zero.
78 %cmp6.not = icmp eq i32 %n, 0
79 br i1 %cmp6.not, label %for.end, label %for.body
; Loop body: %i.07 is the element index, starting at 0 and incremented by 1
; until it equals %n.
81 for.body: ; preds = %entry, %for.body
82 %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
83 %idxprom = zext i32 %i.07 to i64
84 %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s, i64 %idxprom
85 %ld = load <4 x i32>, ptr addrspace(1) %arrayidx, align 4
86 %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom
87 store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
88 %inc = add nuw i32 %i.07, 1
89 %exitcond.not = icmp eq i32 %inc, %n
90 br i1 %exitcond.not, label %for.end, label %for.body
92 for.end: ; preds = %for.body, %entry
; copy_constant: the same <4 x i32> copy loop, reading from an addrspace(4)
; source and writing to an addrspace(1) destination. The loop is skipped when
; %n is zero.
; Expected output: the source is read with a scalar s_load_b128 at offset 0x0
; and s_prefetch_data is issued on the same base s[2:3] with offset 0xb0.
; Unlike the biased-base form, the source base is not adjusted before the
; loop here; the 0xb0 distance is encoded in the prefetch instruction itself.
96 define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) {
97 ; GCN-LABEL: copy_constant:
98 ; GCN: ; %bb.0: ; %entry
99 ; GCN-NEXT: s_load_b32 s4, s[2:3], 0x34
100 ; GCN-NEXT: s_wait_kmcnt 0x0
101 ; GCN-NEXT: s_cmp_eq_u32 s4, 0
102 ; GCN-NEXT: s_cbranch_scc1 .LBB2_3
103 ; GCN-NEXT: ; %bb.1: ; %for.body.preheader
104 ; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
105 ; GCN-NEXT: v_mov_b32_e32 v0, 0
106 ; GCN-NEXT: .LBB2_2: ; %for.body
107 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
108 ; GCN-NEXT: s_wait_kmcnt 0x0
109 ; GCN-NEXT: s_load_b128 s[8:11], s[2:3], 0x0
110 ; GCN-NEXT: s_prefetch_data s[2:3], 0xb0, null, 0
111 ; GCN-NEXT: s_add_co_i32 s4, s4, -1
112 ; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
113 ; GCN-NEXT: s_cmp_lg_u32 s4, 0
114 ; GCN-NEXT: s_wait_kmcnt 0x0
115 ; GCN-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
116 ; GCN-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
117 ; GCN-NEXT: global_store_b128 v0, v[1:4], s[0:1]
118 ; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
119 ; GCN-NEXT: s_cbranch_scc1 .LBB2_2
120 ; GCN-NEXT: .LBB2_3: ; %for.end
122 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; Guard: branch straight to for.end when the trip count %n is zero.
125 %cmp6.not = icmp eq i32 %n, 0
126 br i1 %cmp6.not, label %for.end, label %for.body
; Loop body: %i.07 is the element index, starting at 0 and incremented by 1
; until it equals %n.
128 for.body: ; preds = %entry, %for.body
129 %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
130 %idxprom = zext i32 %i.07 to i64
131 %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(4) %s, i64 %idxprom
132 %ld = load <4 x i32>, ptr addrspace(4) %arrayidx, align 4
133 %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom
134 store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
135 %inc = add nuw i32 %i.07, 1
136 %exitcond.not = icmp eq i32 %inc, %n
137 br i1 %exitcond.not, label %for.end, label %for.body
139 for.end: ; preds = %for.body, %entry
; copy_local: the same <4 x i32> copy loop with both pointers in
; addrspace(3). The loop is skipped when %n is zero.
; Expected output: the loop consists of ds_load_2addr_b32 / ds_store_2addr_b32
; pairs plus scalar bookkeeping, and the checked sequence contains no
; s_prefetch_data instruction.
; NOTE(review): presumably this is the negative case showing that the loop
; prefetch option leaves addrspace(3) accesses alone - confirm against the
; pass implementation.
143 define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) {
144 ; GCN-LABEL: copy_local:
145 ; GCN: ; %bb.0: ; %entry
146 ; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
147 ; GCN-NEXT: s_wait_kmcnt 0x0
148 ; GCN-NEXT: s_cmp_eq_u32 s2, 0
149 ; GCN-NEXT: s_cbranch_scc1 .LBB3_2
150 ; GCN-NEXT: .LBB3_1: ; %for.body
151 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
152 ; GCN-NEXT: v_mov_b32_e32 v2, s1
153 ; GCN-NEXT: v_mov_b32_e32 v4, s0
154 ; GCN-NEXT: s_add_co_i32 s2, s2, -1
155 ; GCN-NEXT: s_add_co_i32 s0, s0, 16
156 ; GCN-NEXT: s_add_co_i32 s1, s1, 16
157 ; GCN-NEXT: ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
158 ; GCN-NEXT: ds_load_2addr_b32 v[2:3], v2 offset1:1
159 ; GCN-NEXT: s_cmp_lg_u32 s2, 0
160 ; GCN-NEXT: s_wait_dscnt 0x1
161 ; GCN-NEXT: ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
162 ; GCN-NEXT: s_wait_dscnt 0x1
163 ; GCN-NEXT: ds_store_2addr_b32 v4, v2, v3 offset1:1
164 ; GCN-NEXT: s_cbranch_scc1 .LBB3_1
165 ; GCN-NEXT: .LBB3_2: ; %for.end
; Guard: branch straight to for.end when the trip count %n is zero.
168 %cmp6.not = icmp eq i32 %n, 0
169 br i1 %cmp6.not, label %for.end, label %for.body
; Loop body: %i.07 is the element index, starting at 0 and incremented by 1
; until it equals %n.
171 for.body: ; preds = %entry, %for.body
172 %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
173 %idxprom = zext i32 %i.07 to i64
174 %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(3) %s, i64 %idxprom
175 %ld = load <4 x i32>, ptr addrspace(3) %arrayidx, align 4
176 %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(3) %d, i64 %idxprom
177 store <4 x i32> %ld, ptr addrspace(3) %arrayidx2, align 4
178 %inc = add nuw i32 %i.07, 1
179 %exitcond.not = icmp eq i32 %inc, %n
180 br i1 %exitcond.not, label %for.end, label %for.body
182 for.end: ; preds = %for.body, %entry