; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s
; RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=CHECK-O0

; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
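;
; In each case the expected pattern is a "waterfall" loop around the buffer
; load: v_readfirstlane_b32 copies the first active lane's rsrc dwords into
; SGPRs, the u64 compares plus s_and_saveexec_b64 restrict exec to the lanes
; whose rsrc matches those SGPRs, the load executes for just those lanes, and
; s_xor_b64 exec / s_cbranch_execnz repeat the loop until every lane has been
; serviced, after which the saved exec mask is restored.
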
; CHECK-LABEL: mubuf_vgpr
; CHECK: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: [[LOOPBB:BB[0-9]+_[0-9]+]]:
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
; CHECK: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
; CHECK: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
; CHECK: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
; CHECK: s_and_saveexec_b64 [[CMP]], [[CMP]]
; CHECK: s_waitcnt vmcnt(0)
; CHECK: buffer_load_format_x [[RES:v[0-9]+]], v4, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; CHECK: s_xor_b64 exec, exec, [[CMP]]
; CHECK: s_cbranch_execnz [[LOOPBB]]
; CHECK: s_mov_b64 exec, [[SAVEEXEC]]
; CHECK: v_mov_b32_e32 v0, [[RES]]
define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
  %call = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %i, i32 %c, i32 0, i1 zeroext false, i1 zeroext false) #1
  ret float %call
}

; CHECK-LABEL: mubuf_vgpr_adjacent_in_block

; CHECK: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
; CHECK: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
; CHECK: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
; CHECK: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
; CHECK: s_and_saveexec_b64 [[CMP]], [[CMP]]
; CHECK: s_waitcnt vmcnt(0)
; CHECK: buffer_load_format_x [[RES0:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; CHECK: s_xor_b64 exec, exec, [[CMP]]
; CHECK: s_cbranch_execnz [[LOOPBB0]]

; CHECK: s_mov_b64 exec, [[SAVEEXEC]]
; FIXME: redundant s_mov
; CHECK: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec

; CHECK: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v4
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v5
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v6
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v7
; CHECK: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[4:5]
; CHECK: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
; CHECK: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
; CHECK: s_and_saveexec_b64 [[CMP]], [[CMP]]
; CHECK: s_waitcnt vmcnt(0)
; CHECK: buffer_load_format_x [[RES1:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; CHECK: s_xor_b64 exec, exec, [[CMP]]
; CHECK: s_cbranch_execnz [[LOOPBB1]]

; CHECK: s_mov_b64 exec, [[SAVEEXEC]]
; CHECK-DAG: global_store_dword v[9:10], [[RES0]], off
; CHECK-DAG: global_store_dword v[11:12], [[RES1]], off

define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, float addrspace(1)* %out0, float addrspace(1)* %out1) #0 {
entry:
  %val0 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %i, i32 %c, i32 0, i1 zeroext false, i1 zeroext false) #1
  %val1 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %j, i32 %c, i32 0, i1 zeroext false, i1 zeroext false) #1
  store volatile float %val0, float addrspace(1)* %out0
  store volatile float %val1, float addrspace(1)* %out1
  ret void
}

; CHECK-LABEL: mubuf_vgpr_outside_entry

; CHECK-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4
; CHECK-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec

; CHECK: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
; CHECK: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
; CHECK: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
; CHECK: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
; CHECK: s_and_saveexec_b64 [[CMP]], [[CMP]]
; CHECK: s_waitcnt vmcnt(0)
; CHECK: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; CHECK: s_xor_b64 exec, exec, [[CMP]]
; CHECK: s_cbranch_execnz [[LOOPBB0]]

; CHECK: s_mov_b64 exec, [[SAVEEXEC]]
; CHECK: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]]

; CHECK: BB{{[0-9]+_[0-9]+}}:
; CHECK-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4
; CHECK-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec

; CHECK: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v4
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v5
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v6
; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v7
; CHECK: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[4:5]
; CHECK: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
; CHECK: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
; CHECK: s_and_saveexec_b64 [[CMP]], [[CMP]]
; CHECK: s_waitcnt vmcnt(0)
; CHECK: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
; CHECK: s_xor_b64 exec, exec, [[CMP]]
; CHECK: s_cbranch_execnz [[LOOPBB1]]

; CHECK: s_mov_b64 exec, [[SAVEEXEC]]

; CHECK: [[TERMBB]]:
; CHECK: global_store_dword v[11:12], [[RES]], off

; Confirm spills do not occur between the XOR and branch that terminate the
; waterfall loop BBs.
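; (The CHECK-O0-NEXT directives on the s_cbranch_execnz lines below are what
; enforce this adjacency: a reload placed between the s_xor_b64 and the branch
; would execute inside the loop with the already-updated exec mask.)
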
; CHECK-O0-LABEL: mubuf_vgpr_outside_entry

; CHECK-O0-DAG: s_mov_b32 [[IDX_S:s[0-9]+]], s4
; CHECK-O0-DAG: v_mov_b32_e32 [[IDX_V:v[0-9]+]], [[IDX_S]]
; CHECK-O0-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
; CHECK-O0-DAG: buffer_store_dword [[IDX_V]], off, s[0:3], s5 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill

; CHECK-O0: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
; CHECK-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], {{.*}} ; 4-byte Folded Reload
; CHECK-O0: s_waitcnt vmcnt(0)
; CHECK-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], {{.*}} ; 4-byte Folded Reload
; CHECK-O0: s_waitcnt vmcnt(0)
; CHECK-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], {{.*}} ; 4-byte Folded Reload
; CHECK-O0: s_waitcnt vmcnt(0)
; CHECK-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], {{.*}} ; 4-byte Folded Reload
; CHECK-O0: s_waitcnt vmcnt(0)
; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP0:[0-9]+]], v[[VRSRC0]]
; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]]
; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP2:[0-9]+]], v[[VRSRC2]]
; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP3:[0-9]+]], v[[VRSRC3]]
; CHECK-O0-DAG: s_mov_b32 s[[SRSRC0:[0-9]+]], s[[SRSRCTMP0]]
; CHECK-O0-DAG: s_mov_b32 s[[SRSRC1:[0-9]+]], s[[SRSRCTMP1]]
; CHECK-O0-DAG: s_mov_b32 s[[SRSRC2:[0-9]+]], s[[SRSRCTMP2]]
; CHECK-O0-DAG: s_mov_b32 s[[SRSRC3:[0-9]+]], s[[SRSRCTMP3]]
; CHECK-O0: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}}
; CHECK-O0: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
; CHECK-O0: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
; CHECK-O0: s_and_saveexec_b64 [[CMP]], [[CMP]]
; CHECK-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s[0:3], s5 offset:[[IDX_OFF]] ; 4-byte Folded Reload
; CHECK-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, {{.*}} idxen
; CHECK-O0: s_waitcnt vmcnt(0)
; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s5 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill
; CHECK-O0: s_xor_b64 exec, exec, [[CMP]]
; CHECK-O0-NEXT: s_cbranch_execnz [[LOOPBB0]]

; CHECK-O0: s_mov_b64 exec, [[SAVEEXEC]]
; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s5 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload
; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s5 offset:[[RES_OFF:[0-9]+]] ; 4-byte Folded Spill
; CHECK-O0: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]]

; CHECK-O0: BB{{[0-9]+_[0-9]+}}:
; CHECK-O0-DAG: s_mov_b64 s{{\[}}[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]{{\]}}, exec
; CHECK-O0-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s5 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill
; CHECK-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]]
; CHECK-O0: v_writelane_b32 [[VSAVEEXEC]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]]

; CHECK-O0: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
; CHECK-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], {{.*}} ; 4-byte Folded Reload
; CHECK-O0: s_waitcnt vmcnt(0)
; CHECK-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], {{.*}} ; 4-byte Folded Reload
; CHECK-O0: s_waitcnt vmcnt(0)
; CHECK-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], {{.*}} ; 4-byte Folded Reload
; CHECK-O0: s_waitcnt vmcnt(0)
; CHECK-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], {{.*}} ; 4-byte Folded Reload
; CHECK-O0: s_waitcnt vmcnt(0)
; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP0:[0-9]+]], v[[VRSRC0]]
; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]]
; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP2:[0-9]+]], v[[VRSRC2]]
; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP3:[0-9]+]], v[[VRSRC3]]
; CHECK-O0-DAG: s_mov_b32 s[[SRSRC0:[0-9]+]], s[[SRSRCTMP0]]
; CHECK-O0-DAG: s_mov_b32 s[[SRSRC1:[0-9]+]], s[[SRSRCTMP1]]
; CHECK-O0-DAG: s_mov_b32 s[[SRSRC2:[0-9]+]], s[[SRSRCTMP2]]
; CHECK-O0-DAG: s_mov_b32 s[[SRSRC3:[0-9]+]], s[[SRSRCTMP3]]
; CHECK-O0: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}}
; CHECK-O0: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
; CHECK-O0: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
; CHECK-O0: s_and_saveexec_b64 [[CMP]], [[CMP]]
; CHECK-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s[0:3], s5 offset:[[IDX_OFF]] ; 4-byte Folded Reload
; CHECK-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, {{.*}} idxen
; CHECK-O0: s_waitcnt vmcnt(0)
; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s5 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill
; CHECK-O0: s_xor_b64 exec, exec, [[CMP]]
; CHECK-O0-NEXT: s_cbranch_execnz [[LOOPBB1]]

; CHECK-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]]
; CHECK-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]]
; CHECK-O0: s_mov_b64 exec, s{{\[}}[[SAVEEXEC0]]:[[SAVEEXEC1]]{{\]}}
; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s5 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload
; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s5 offset:[[RES_OFF]] ; 4-byte Folded Spill

; CHECK-O0: [[TERMBB]]:
; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s5 offset:[[RES_OFF]] ; 4-byte Folded Reload
; CHECK-O0: global_store_dword v[{{[0-9]+:[0-9]+}}], [[RES]], off

define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, float addrspace(1)* %in, float addrspace(1)* %out) #0 {
entry:
  %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" ()
  %val0 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %i, i32 %live.out.reg, i32 0, i1 zeroext false, i1 zeroext false) #1
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %cmp = icmp eq i32 %idx, 0
  br i1 %cmp, label %bb1, label %bb2

bb1:
  %val1 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %j, i32 %live.out.reg, i32 0, i1 zeroext false, i1 zeroext false) #1
  br label %bb2

bb2:
  %val = phi float [ %val0, %entry ], [ %val1, %bb1 ]
  store volatile float %val, float addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }