1 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2 ; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=amdgpu-regbankselect -regbankselect-fast -o - %s | FileCheck %s -check-prefix=GFX7
3 ; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=amdgpu-regbankselect -regbankselect-greedy -o - %s | FileCheck %s -check-prefix=GFX7
4 ; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -simplify-mir -stop-after=amdgpu-regbankselect -o - %s | FileCheck %s -check-prefix=GFX12
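; Tests register bank selection (stopping after amdgpu-regbankselect) of
; @llvm.amdgcn.s.buffer.load with uniform (SGPR) and divergent (VGPR) offsets
; on GFX7 and GFX12, as exercised by the functions below.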
7 define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
8 ; GFX7-LABEL: name: s_buffer_load_i32
9 ; GFX7: bb.1 (%ir-block.0):
10 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
12 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
13 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
14 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
15 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
16 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
17 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
18 ; GFX7-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s32))
19 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32)
20 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
21 ; GFX7-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
22 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
24 ; GFX12-LABEL: name: s_buffer_load_i32
25 ; GFX12: bb.1 (%ir-block.0):
26 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
28 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
29 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
30 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
31 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
32 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
33 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
34 ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s32))
35 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32)
36 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
37 ; GFX12-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
38 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
  %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret i32 %val
}
43 define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
44 ; GFX7-LABEL: name: s_buffer_load_v2i32
45 ; GFX7: bb.1 (%ir-block.0):
46 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
48 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
49 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
50 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
51 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
52 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
53 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
54 ; GFX7-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s64))
55 ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>)
56 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
57 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
58 ; GFX7-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
59 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
60 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
61 ; GFX7-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
62 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
64 ; GFX12-LABEL: name: s_buffer_load_v2i32
65 ; GFX12: bb.1 (%ir-block.0):
66 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
68 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
69 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
70 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
71 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
72 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
73 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
74 ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s64))
75 ; GFX12-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>)
76 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
77 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
78 ; GFX12-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
79 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
80 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
81 ; GFX12-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
82 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
  %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <2 x i32> %val
}
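; GFX7 widens the <3 x i32> result to a <4 x s32> load and ignores the extra
; element; GFX12 loads <3 x s32> directly, as the checks below show.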
87 define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
88 ; GFX7-LABEL: name: s_buffer_load_v3i32
89 ; GFX7: bb.1 (%ir-block.0):
90 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
92 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
93 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
94 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
95 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
96 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
97 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
98 ; GFX7-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s96), align 16)
99 ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>)
100 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
101 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
102 ; GFX7-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
103 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
104 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
105 ; GFX7-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
106 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
107 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
108 ; GFX7-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
109 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2
111 ; GFX12-LABEL: name: s_buffer_load_v3i32
112 ; GFX12: bb.1 (%ir-block.0):
113 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
115 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
116 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
117 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
118 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
119 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
120 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
121 ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<3 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s96), align 16)
122 ; GFX12-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<3 x s32>)
123 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
124 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
125 ; GFX12-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
126 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
127 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
128 ; GFX12-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
129 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
130 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
131 ; GFX12-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
132 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2
  %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <3 x i32> %val
}
137 define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
138 ; GFX7-LABEL: name: s_buffer_load_v8i32
139 ; GFX7: bb.1 (%ir-block.0):
140 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
142 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
143 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
144 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
145 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
146 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
147 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
148 ; GFX7-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s256))
149 ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>)
150 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
151 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
152 ; GFX7-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
153 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
154 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
155 ; GFX7-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
156 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
157 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
158 ; GFX7-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
159 ; GFX7-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
160 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32)
161 ; GFX7-NEXT: $sgpr3 = COPY [[INTRINSIC_CONVERGENT3]](s32)
162 ; GFX7-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
163 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
164 ; GFX7-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT4]](s32)
165 ; GFX7-NEXT: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
166 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32)
167 ; GFX7-NEXT: $sgpr5 = COPY [[INTRINSIC_CONVERGENT5]](s32)
168 ; GFX7-NEXT: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
169 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32)
170 ; GFX7-NEXT: $sgpr6 = COPY [[INTRINSIC_CONVERGENT6]](s32)
171 ; GFX7-NEXT: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32)
172 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
173 ; GFX7-NEXT: $sgpr7 = COPY [[INTRINSIC_CONVERGENT7]](s32)
174 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7
176 ; GFX12-LABEL: name: s_buffer_load_v8i32
177 ; GFX12: bb.1 (%ir-block.0):
178 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
180 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
181 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
182 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
183 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
184 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
185 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
186 ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s256))
187 ; GFX12-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>)
188 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
189 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
190 ; GFX12-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
191 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
192 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
193 ; GFX12-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
194 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
195 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
196 ; GFX12-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
197 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
198 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32)
199 ; GFX12-NEXT: $sgpr3 = COPY [[INTRINSIC_CONVERGENT3]](s32)
200 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
201 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
202 ; GFX12-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT4]](s32)
203 ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
204 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32)
205 ; GFX12-NEXT: $sgpr5 = COPY [[INTRINSIC_CONVERGENT5]](s32)
206 ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
207 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32)
208 ; GFX12-NEXT: $sgpr6 = COPY [[INTRINSIC_CONVERGENT6]](s32)
209 ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32)
210 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
211 ; GFX12-NEXT: $sgpr7 = COPY [[INTRINSIC_CONVERGENT7]](s32)
212 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7
  %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <8 x i32> %val
}
217 define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
218 ; GFX7-LABEL: name: s_buffer_load_v16i32
219 ; GFX7: bb.1 (%ir-block.0):
220 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
222 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
223 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
224 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
225 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
226 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
227 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
228 ; GFX7-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s512))
229 ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>)
230 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
231 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
232 ; GFX7-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
233 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
234 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
235 ; GFX7-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
236 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
237 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
238 ; GFX7-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
239 ; GFX7-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
240 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32)
241 ; GFX7-NEXT: $sgpr3 = COPY [[INTRINSIC_CONVERGENT3]](s32)
242 ; GFX7-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
243 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
244 ; GFX7-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT4]](s32)
245 ; GFX7-NEXT: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
246 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32)
247 ; GFX7-NEXT: $sgpr5 = COPY [[INTRINSIC_CONVERGENT5]](s32)
248 ; GFX7-NEXT: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
249 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32)
250 ; GFX7-NEXT: $sgpr6 = COPY [[INTRINSIC_CONVERGENT6]](s32)
251 ; GFX7-NEXT: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32)
252 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
253 ; GFX7-NEXT: $sgpr7 = COPY [[INTRINSIC_CONVERGENT7]](s32)
254 ; GFX7-NEXT: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[UV8]](s32)
255 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT8:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY13]](s32)
256 ; GFX7-NEXT: $sgpr8 = COPY [[INTRINSIC_CONVERGENT8]](s32)
257 ; GFX7-NEXT: [[COPY14:%[0-9]+]]:vgpr(s32) = COPY [[UV9]](s32)
258 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT9:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY14]](s32)
259 ; GFX7-NEXT: $sgpr9 = COPY [[INTRINSIC_CONVERGENT9]](s32)
260 ; GFX7-NEXT: [[COPY15:%[0-9]+]]:vgpr(s32) = COPY [[UV10]](s32)
261 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT10:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY15]](s32)
262 ; GFX7-NEXT: $sgpr10 = COPY [[INTRINSIC_CONVERGENT10]](s32)
263 ; GFX7-NEXT: [[COPY16:%[0-9]+]]:vgpr(s32) = COPY [[UV11]](s32)
264 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT11:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY16]](s32)
265 ; GFX7-NEXT: $sgpr11 = COPY [[INTRINSIC_CONVERGENT11]](s32)
266 ; GFX7-NEXT: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[UV12]](s32)
267 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT12:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY17]](s32)
268 ; GFX7-NEXT: $sgpr12 = COPY [[INTRINSIC_CONVERGENT12]](s32)
269 ; GFX7-NEXT: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[UV13]](s32)
270 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT13:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY18]](s32)
271 ; GFX7-NEXT: $sgpr13 = COPY [[INTRINSIC_CONVERGENT13]](s32)
272 ; GFX7-NEXT: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[UV14]](s32)
273 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT14:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY19]](s32)
274 ; GFX7-NEXT: $sgpr14 = COPY [[INTRINSIC_CONVERGENT14]](s32)
275 ; GFX7-NEXT: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[UV15]](s32)
276 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY20]](s32)
277 ; GFX7-NEXT: $sgpr15 = COPY [[INTRINSIC_CONVERGENT15]](s32)
278 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15
280 ; GFX12-LABEL: name: s_buffer_load_v16i32
281 ; GFX12: bb.1 (%ir-block.0):
282 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
284 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
285 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
286 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
287 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
288 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
289 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
290 ; GFX12-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s512))
291 ; GFX12-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>)
292 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
293 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
294 ; GFX12-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
295 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
296 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
297 ; GFX12-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
298 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
299 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
300 ; GFX12-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
301 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
302 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32)
303 ; GFX12-NEXT: $sgpr3 = COPY [[INTRINSIC_CONVERGENT3]](s32)
304 ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
305 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
306 ; GFX12-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT4]](s32)
307 ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
308 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32)
309 ; GFX12-NEXT: $sgpr5 = COPY [[INTRINSIC_CONVERGENT5]](s32)
310 ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
311 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32)
312 ; GFX12-NEXT: $sgpr6 = COPY [[INTRINSIC_CONVERGENT6]](s32)
313 ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32)
314 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
315 ; GFX12-NEXT: $sgpr7 = COPY [[INTRINSIC_CONVERGENT7]](s32)
316 ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[UV8]](s32)
317 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT8:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY13]](s32)
318 ; GFX12-NEXT: $sgpr8 = COPY [[INTRINSIC_CONVERGENT8]](s32)
319 ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr(s32) = COPY [[UV9]](s32)
320 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT9:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY14]](s32)
321 ; GFX12-NEXT: $sgpr9 = COPY [[INTRINSIC_CONVERGENT9]](s32)
322 ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr(s32) = COPY [[UV10]](s32)
323 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT10:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY15]](s32)
324 ; GFX12-NEXT: $sgpr10 = COPY [[INTRINSIC_CONVERGENT10]](s32)
325 ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vgpr(s32) = COPY [[UV11]](s32)
326 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT11:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY16]](s32)
327 ; GFX12-NEXT: $sgpr11 = COPY [[INTRINSIC_CONVERGENT11]](s32)
328 ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[UV12]](s32)
329 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT12:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY17]](s32)
330 ; GFX12-NEXT: $sgpr12 = COPY [[INTRINSIC_CONVERGENT12]](s32)
331 ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[UV13]](s32)
332 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT13:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY18]](s32)
333 ; GFX12-NEXT: $sgpr13 = COPY [[INTRINSIC_CONVERGENT13]](s32)
334 ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[UV14]](s32)
335 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT14:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY19]](s32)
336 ; GFX12-NEXT: $sgpr14 = COPY [[INTRINSIC_CONVERGENT14]](s32)
337 ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[UV15]](s32)
338 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY20]](s32)
339 ; GFX12-NEXT: $sgpr15 = COPY [[INTRINSIC_CONVERGENT15]](s32)
340 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15
  %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <16 x i32> %val
}
345 ; Check cases that need to be converted to MUBUF due to the offset being a VGPR.
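; S_BUFFER_LOAD only accepts a uniform (SGPR) offset, so regbankselect rewrites
; these cases to G_AMDGPU_BUFFER_LOAD with the divergent offset as the VGPR
; voffset operand and a zero SGPR soffset.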
346 define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
347 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset
348 ; GFX7: bb.1 (%ir-block.0):
349 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
351 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
352 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
353 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
354 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
355 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
356 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
357 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
358 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
359 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32))
360 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
361 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
363 ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset
364 ; GFX12: bb.1 (%ir-block.0):
365 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
367 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
368 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
369 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
370 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
371 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
372 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
373 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
374 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
375 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32))
376 ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
377 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret float %val
}
382 define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
383 ; GFX7-LABEL: name: s_buffer_load_v2f32_vgpr_offset
384 ; GFX7: bb.1 (%ir-block.0):
385 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
387 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
388 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
389 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
390 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
391 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
392 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
393 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
394 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
395 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s64), align 4)
396 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<2 x s32>)
397 ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
398 ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
399 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
401 ; GFX12-LABEL: name: s_buffer_load_v2f32_vgpr_offset
402 ; GFX12: bb.1 (%ir-block.0):
403 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
405 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
406 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
407 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
408 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
409 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
410 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
411 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
412 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
413 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s64), align 4)
414 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<2 x s32>)
415 ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
416 ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
417 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
  %val = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <2 x float> %val
}
422 define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
423 ; GFX7-LABEL: name: s_buffer_load_v3f32_vgpr_offset
424 ; GFX7: bb.1 (%ir-block.0):
425 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
427 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
428 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
429 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
430 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
431 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
432 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
433 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
434 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
435 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
436 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>)
437 ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
438 ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
439 ; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32)
440 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
442 ; GFX12-LABEL: name: s_buffer_load_v3f32_vgpr_offset
443 ; GFX12: bb.1 (%ir-block.0):
444 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
446 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
447 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
448 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
449 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
450 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
451 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
452 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
453 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
454 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<3 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s96), align 4)
455 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<3 x s32>)
456 ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
457 ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
458 ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
459 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
  %val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <3 x float> %val
}
464 define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
465 ; GFX7-LABEL: name: s_buffer_load_v4f32_vgpr_offset
466 ; GFX7: bb.1 (%ir-block.0):
467 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
469 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
470 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
471 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
472 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
473 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
474 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
475 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
476 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
477 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
478 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>)
479 ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
480 ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
481 ; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32)
482 ; GFX7-NEXT: $vgpr3 = COPY [[UV3]](s32)
483 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
485 ; GFX12-LABEL: name: s_buffer_load_v4f32_vgpr_offset
486 ; GFX12: bb.1 (%ir-block.0):
487 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
489 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
490 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
491 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
492 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
493 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
494 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
495 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
496 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
497 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
498 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>)
499 ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
500 ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
501 ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
502 ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
503 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
  %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <4 x float> %val
}
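; Results wider than 128 bits are split into multiple 16-byte buffer loads at
; increasing immediate offsets and recombined with G_CONCAT_VECTORS.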
508 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
509 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_offset
510 ; GFX7: bb.1 (%ir-block.0):
511 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
513 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
514 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
515 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
516 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
517 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
518 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
519 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
520 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
521 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
522 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
523 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
524 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
525 ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
526 ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
527 ; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32)
528 ; GFX7-NEXT: $vgpr3 = COPY [[UV3]](s32)
529 ; GFX7-NEXT: $vgpr4 = COPY [[UV4]](s32)
530 ; GFX7-NEXT: $vgpr5 = COPY [[UV5]](s32)
531 ; GFX7-NEXT: $vgpr6 = COPY [[UV6]](s32)
532 ; GFX7-NEXT: $vgpr7 = COPY [[UV7]](s32)
533 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
535 ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset
536 ; GFX12: bb.1 (%ir-block.0):
537 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
539 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
540 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
541 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
542 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
543 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
544 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
545 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
546 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
547 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
548 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
549 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
550 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
551 ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
552 ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
553 ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
554 ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
555 ; GFX12-NEXT: $vgpr4 = COPY [[UV4]](s32)
556 ; GFX12-NEXT: $vgpr5 = COPY [[UV5]](s32)
557 ; GFX12-NEXT: $vgpr6 = COPY [[UV6]](s32)
558 ; GFX12-NEXT: $vgpr7 = COPY [[UV7]](s32)
559 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
  %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <8 x float> %val
}
564 define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
565 ; GFX7-LABEL: name: s_buffer_load_v16f32_vgpr_offset
566 ; GFX7: bb.1 (%ir-block.0):
567 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
569 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
570 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
571 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
572 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
573 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
574 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
575 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
576 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
577 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
578 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
579 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
580 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
581 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
582 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
583 ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
584 ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
585 ; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32)
586 ; GFX7-NEXT: $vgpr3 = COPY [[UV3]](s32)
587 ; GFX7-NEXT: $vgpr4 = COPY [[UV4]](s32)
588 ; GFX7-NEXT: $vgpr5 = COPY [[UV5]](s32)
589 ; GFX7-NEXT: $vgpr6 = COPY [[UV6]](s32)
590 ; GFX7-NEXT: $vgpr7 = COPY [[UV7]](s32)
591 ; GFX7-NEXT: $vgpr8 = COPY [[UV8]](s32)
592 ; GFX7-NEXT: $vgpr9 = COPY [[UV9]](s32)
593 ; GFX7-NEXT: $vgpr10 = COPY [[UV10]](s32)
594 ; GFX7-NEXT: $vgpr11 = COPY [[UV11]](s32)
595 ; GFX7-NEXT: $vgpr12 = COPY [[UV12]](s32)
596 ; GFX7-NEXT: $vgpr13 = COPY [[UV13]](s32)
597 ; GFX7-NEXT: $vgpr14 = COPY [[UV14]](s32)
598 ; GFX7-NEXT: $vgpr15 = COPY [[UV15]](s32)
599 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
601 ; GFX12-LABEL: name: s_buffer_load_v16f32_vgpr_offset
602 ; GFX12: bb.1 (%ir-block.0):
603 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
605 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
606 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
607 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
608 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
609 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
610 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
611 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
612 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
613 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
614 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
615 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
616 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
617 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
618 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
619 ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
620 ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
621 ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
622 ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
623 ; GFX12-NEXT: $vgpr4 = COPY [[UV4]](s32)
624 ; GFX12-NEXT: $vgpr5 = COPY [[UV5]](s32)
625 ; GFX12-NEXT: $vgpr6 = COPY [[UV6]](s32)
626 ; GFX12-NEXT: $vgpr7 = COPY [[UV7]](s32)
627 ; GFX12-NEXT: $vgpr8 = COPY [[UV8]](s32)
628 ; GFX12-NEXT: $vgpr9 = COPY [[UV9]](s32)
629 ; GFX12-NEXT: $vgpr10 = COPY [[UV10]](s32)
630 ; GFX12-NEXT: $vgpr11 = COPY [[UV11]](s32)
631 ; GFX12-NEXT: $vgpr12 = COPY [[UV12]](s32)
632 ; GFX12-NEXT: $vgpr13 = COPY [[UV13]](s32)
633 ; GFX12-NEXT: $vgpr14 = COPY [[UV14]](s32)
634 ; GFX12-NEXT: $vgpr15 = COPY [[UV15]](s32)
635 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
636 %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
  ret <16 x float> %val
}
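; GFX7 rounds the i96 load up to s128 and truncates the result; GFX12 can load
; s96 directly, as the checks below show.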
640 define amdgpu_ps void @s_buffer_load_i96_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
641 ; GFX7-LABEL: name: s_buffer_load_i96_vgpr_offset
642 ; GFX7: bb.1 (%ir-block.0):
643 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
645 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
646 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
647 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
648 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
649 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
650 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
651 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
652 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
653 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
654 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
655 ; GFX7-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s96) = G_TRUNC [[AMDGPU_BUFFER_LOAD]](s128)
656 ; GFX7-NEXT: G_STORE [[TRUNC]](s96), [[DEF]](p1) :: (store (s96) into `ptr addrspace(1) undef`, align 8, addrspace 1)
657 ; GFX7-NEXT: S_ENDPGM 0
659 ; GFX12-LABEL: name: s_buffer_load_i96_vgpr_offset
660 ; GFX12: bb.1 (%ir-block.0):
661 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
663 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
664 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
665 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
666 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
667 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
668 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
669 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
670 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
671 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
672 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s96) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s96), align 4)
673 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
674 ; GFX12-NEXT: G_STORE [[AMDGPU_BUFFER_LOAD]](s96), [[COPY5]](p1) :: (store (s96) into `ptr addrspace(1) undef`, align 8, addrspace 1)
675 ; GFX12-NEXT: S_ENDPGM 0
676 %val = call i96 @llvm.amdgcn.s.buffer.load.i96(<4 x i32> %rsrc, i32 %soffset, i32 0)
  store i96 %val, ptr addrspace(1) undef
  ret void
}
681 ; Test split of a wide scalar
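; The s256 result is loaded as two s128 halves, reassembled with G_MERGE_VALUES,
; and then split again for the 128-bit global stores.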
682 define amdgpu_ps void @s_buffer_load_i256_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
683 ; GFX7-LABEL: name: s_buffer_load_i256_vgpr_offset
684 ; GFX7: bb.1 (%ir-block.0):
685 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
687 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
688 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
689 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
690 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
691 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
692 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
693 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
694 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
695 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
696 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
697 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
698 ; GFX7-NEXT: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128)
699 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s256)
700 ; GFX7-NEXT: G_STORE [[UV]](s128), [[DEF]](p1) :: (store (s128) into `ptr addrspace(1) undef`, align 8, addrspace 1)
701 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
702 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
703 ; GFX7-NEXT: G_STORE [[UV1]](s128), [[PTR_ADD]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 16, align 8, addrspace 1)
704 ; GFX7-NEXT: S_ENDPGM 0
706 ; GFX12-LABEL: name: s_buffer_load_i256_vgpr_offset
707 ; GFX12: bb.1 (%ir-block.0):
708 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
710 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
711 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
712 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
713 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
714 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
715 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
716 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
717 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
718 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
719 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
720 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
721 ; GFX12-NEXT: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128)
722 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s256)
723 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
724 ; GFX12-NEXT: G_STORE [[UV]](s128), [[COPY5]](p1) :: (store (s128) into `ptr addrspace(1) undef`, align 8, addrspace 1)
725 ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
726 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
727 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1)
728 ; GFX12-NEXT: G_STORE [[UV1]](s128), [[COPY6]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 16, align 8, addrspace 1)
729 ; GFX12-NEXT: S_ENDPGM 0
730 %val = call i256 @llvm.amdgcn.s.buffer.load.i256(<4 x i32> %rsrc, i32 %soffset, i32 0)
731 store i256 %val, ptr addrspace(1) undef
732 ret void
733 }
735 ; Test split of a wide scalar
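; Same idea as the i256 case, but the 512-bit result needs four s128 loads at
; immediate offsets 0, 16, 32 and 48 before being merged into an s512.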
736 define amdgpu_ps void @s_buffer_load_i512_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
737 ; GFX7-LABEL: name: s_buffer_load_i512_vgpr_offset
738 ; GFX7: bb.1 (%ir-block.0):
739 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
741 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
742 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
743 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
744 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
745 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
746 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
747 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
748 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
749 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
750 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
751 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
752 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
753 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
754 ; GFX7-NEXT: [[MV:%[0-9]+]]:vgpr(s512) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128), [[AMDGPU_BUFFER_LOAD2]](s128), [[AMDGPU_BUFFER_LOAD3]](s128)
755 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128), [[UV2:%[0-9]+]]:vgpr(s128), [[UV3:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s512)
756 ; GFX7-NEXT: G_STORE [[UV]](s128), [[DEF]](p1) :: (store (s128) into `ptr addrspace(1) undef`, align 8, addrspace 1)
757 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
758 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
759 ; GFX7-NEXT: G_STORE [[UV1]](s128), [[PTR_ADD]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 16, align 8, addrspace 1)
760 ; GFX7-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
761 ; GFX7-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
762 ; GFX7-NEXT: G_STORE [[UV2]](s128), [[PTR_ADD1]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 32, align 8, addrspace 1)
763 ; GFX7-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
764 ; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
765 ; GFX7-NEXT: G_STORE [[UV3]](s128), [[PTR_ADD2]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 48, align 8, addrspace 1)
766 ; GFX7-NEXT: S_ENDPGM 0
768 ; GFX12-LABEL: name: s_buffer_load_i512_vgpr_offset
769 ; GFX12: bb.1 (%ir-block.0):
770 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
772 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
773 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
774 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
775 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
776 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
777 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
778 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
779 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
780 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
781 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
782 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
783 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
784 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
785 ; GFX12-NEXT: [[MV:%[0-9]+]]:vgpr(s512) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128), [[AMDGPU_BUFFER_LOAD2]](s128), [[AMDGPU_BUFFER_LOAD3]](s128)
786 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128), [[UV2:%[0-9]+]]:vgpr(s128), [[UV3:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s512)
787 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
788 ; GFX12-NEXT: G_STORE [[UV]](s128), [[COPY5]](p1) :: (store (s128) into `ptr addrspace(1) undef`, align 8, addrspace 1)
789 ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
790 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
791 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1)
792 ; GFX12-NEXT: G_STORE [[UV1]](s128), [[COPY6]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 16, align 8, addrspace 1)
793 ; GFX12-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
794 ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
795 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD1]](p1)
796 ; GFX12-NEXT: G_STORE [[UV2]](s128), [[COPY7]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 32, align 8, addrspace 1)
797 ; GFX12-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
798 ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
799 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD2]](p1)
800 ; GFX12-NEXT: G_STORE [[UV3]](s128), [[COPY8]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 48, align 8, addrspace 1)
801 ; GFX12-NEXT: S_ENDPGM 0
802 %val = call i512 @llvm.amdgcn.s.buffer.load.i512(<4 x i32> %rsrc, i32 %soffset, i32 0)
803 store i512 %val, ptr addrspace(1) undef
804 ret void
805 }
807 ; Test split of a vector with 16-bit elements
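; A <16 x s16> result (256 bits) is likewise split into two <8 x s16> buffer
; loads at offsets 0 and 16 and reassembled with G_CONCAT_VECTORS.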
808 define amdgpu_ps void @s_buffer_load_v16i16_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
809 ; GFX7-LABEL: name: s_buffer_load_v16i16_vgpr_offset
810 ; GFX7: bb.1 (%ir-block.0):
811 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
813 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
814 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
815 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
816 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
817 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
818 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
819 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
820 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
821 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
822 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
823 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
824 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>)
825 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>)
826 ; GFX7-NEXT: G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef`, align 32, addrspace 1)
827 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
828 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
829 ; GFX7-NEXT: G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1)
830 ; GFX7-NEXT: S_ENDPGM 0
832 ; GFX12-LABEL: name: s_buffer_load_v16i16_vgpr_offset
833 ; GFX12: bb.1 (%ir-block.0):
834 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
836 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
837 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
838 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
839 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
840 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
841 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
842 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
843 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
844 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
845 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
846 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
847 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>)
848 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>)
849 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
850 ; GFX12-NEXT: G_STORE [[UV]](<8 x s16>), [[COPY5]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef`, align 32, addrspace 1)
851 ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
852 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
853 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1)
854 ; GFX12-NEXT: G_STORE [[UV1]](<8 x s16>), [[COPY6]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1)
855 ; GFX12-NEXT: S_ENDPGM 0
856 %val = call <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32> %rsrc, i32 %soffset, i32 0)
857 store <16 x i16> %val, ptr addrspace(1) undef
858 ret void
859 }
861 ; Test split of a vector with 16-bit elements
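; The <32 x s16> case (512 bits) needs four <8 x s16> loads at offsets 0, 16,
; 32 and 48.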
862 define amdgpu_ps void @s_buffer_load_v32i16_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
863 ; GFX7-LABEL: name: s_buffer_load_v32i16_vgpr_offset
864 ; GFX7: bb.1 (%ir-block.0):
865 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
867 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
868 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
869 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
870 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
871 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
872 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
873 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
874 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
875 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
876 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
877 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
878 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
879 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
880 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<32 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>), [[AMDGPU_BUFFER_LOAD2]](<8 x s16>), [[AMDGPU_BUFFER_LOAD3]](<8 x s16>)
881 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>), [[UV2:%[0-9]+]]:vgpr(<8 x s16>), [[UV3:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<32 x s16>)
882 ; GFX7-NEXT: G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef`, align 64, addrspace 1)
883 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
884 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
885 ; GFX7-NEXT: G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 16, basealign 64, addrspace 1)
886 ; GFX7-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
887 ; GFX7-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
888 ; GFX7-NEXT: G_STORE [[UV2]](<8 x s16>), [[PTR_ADD1]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 32, align 32, basealign 64, addrspace 1)
889 ; GFX7-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
890 ; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
891 ; GFX7-NEXT: G_STORE [[UV3]](<8 x s16>), [[PTR_ADD2]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1)
892 ; GFX7-NEXT: S_ENDPGM 0
894 ; GFX12-LABEL: name: s_buffer_load_v32i16_vgpr_offset
895 ; GFX12: bb.1 (%ir-block.0):
896 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
898 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
899 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
900 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
901 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
902 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
903 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
904 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
905 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
906 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
907 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
908 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
909 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
910 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
911 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<32 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>), [[AMDGPU_BUFFER_LOAD2]](<8 x s16>), [[AMDGPU_BUFFER_LOAD3]](<8 x s16>)
912 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>), [[UV2:%[0-9]+]]:vgpr(<8 x s16>), [[UV3:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<32 x s16>)
913 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
914 ; GFX12-NEXT: G_STORE [[UV]](<8 x s16>), [[COPY5]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef`, align 64, addrspace 1)
915 ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
916 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
917 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1)
918 ; GFX12-NEXT: G_STORE [[UV1]](<8 x s16>), [[COPY6]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 16, basealign 64, addrspace 1)
919 ; GFX12-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
920 ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
921 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD1]](p1)
922 ; GFX12-NEXT: G_STORE [[UV2]](<8 x s16>), [[COPY7]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 32, align 32, basealign 64, addrspace 1)
923 ; GFX12-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
924 ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
925 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD2]](p1)
926 ; GFX12-NEXT: G_STORE [[UV3]](<8 x s16>), [[COPY8]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1)
927 ; GFX12-NEXT: S_ENDPGM 0
928 %val = call <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32> %rsrc, i32 %soffset, i32 0)
929 store <32 x i16> %val, ptr addrspace(1) undef
930 ret void
931 }
933 ; Test split of a vector with 64-bit elements
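; A <4 x s64> result is split into two <2 x s64> loads at offsets 0 and 16.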
934 define amdgpu_ps void @s_buffer_load_v4i64_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
935 ; GFX7-LABEL: name: s_buffer_load_v4i64_vgpr_offset
936 ; GFX7: bb.1 (%ir-block.0):
937 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
939 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
940 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
941 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
942 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
943 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
944 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
945 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
946 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
947 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
948 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
949 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
950 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>)
951 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>)
952 ; GFX7-NEXT: G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef`, align 32, addrspace 1)
953 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
954 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
955 ; GFX7-NEXT: G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1)
956 ; GFX7-NEXT: S_ENDPGM 0
958 ; GFX12-LABEL: name: s_buffer_load_v4i64_vgpr_offset
959 ; GFX12: bb.1 (%ir-block.0):
960 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
962 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
963 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
964 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
965 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
966 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
967 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
968 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
969 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
970 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
971 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
972 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
973 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>)
974 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>)
975 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
976 ; GFX12-NEXT: G_STORE [[UV]](<2 x s64>), [[COPY5]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef`, align 32, addrspace 1)
977 ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
978 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
979 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1)
980 ; GFX12-NEXT: G_STORE [[UV1]](<2 x s64>), [[COPY6]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1)
981 ; GFX12-NEXT: S_ENDPGM 0
982 %val = call <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32> %rsrc, i32 %soffset, i32 0)
983 store <4 x i64> %val, ptr addrspace(1) undef
984 ret void
985 }
987 ; Test split of a vector with 64-bit elements
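; A <8 x s64> result needs four <2 x s64> loads at offsets 0, 16, 32 and 48.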
988 define amdgpu_ps void @s_buffer_load_v8i64_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
989 ; GFX7-LABEL: name: s_buffer_load_v8i64_vgpr_offset
990 ; GFX7: bb.1 (%ir-block.0):
991 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
993 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
994 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
995 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
996 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
997 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
998 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
999 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
1000 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1001 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1002 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1003 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1004 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
1005 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
1006 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>), [[AMDGPU_BUFFER_LOAD2]](<2 x s64>), [[AMDGPU_BUFFER_LOAD3]](<2 x s64>)
1007 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>), [[UV2:%[0-9]+]]:vgpr(<2 x s64>), [[UV3:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>)
1008 ; GFX7-NEXT: G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef`, align 64, addrspace 1)
1009 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
1010 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
1011 ; GFX7-NEXT: G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 16, basealign 64, addrspace 1)
1012 ; GFX7-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
1013 ; GFX7-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
1014 ; GFX7-NEXT: G_STORE [[UV2]](<2 x s64>), [[PTR_ADD1]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 32, align 32, basealign 64, addrspace 1)
1015 ; GFX7-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
1016 ; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
1017 ; GFX7-NEXT: G_STORE [[UV3]](<2 x s64>), [[PTR_ADD2]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1)
1018 ; GFX7-NEXT: S_ENDPGM 0
1020 ; GFX12-LABEL: name: s_buffer_load_v8i64_vgpr_offset
1021 ; GFX12: bb.1 (%ir-block.0):
1022 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1023 ; GFX12-NEXT: {{ $}}
1024 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1025 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1026 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1027 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1028 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1029 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1030 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
1031 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1032 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1033 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1034 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1035 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
1036 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
1037 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>), [[AMDGPU_BUFFER_LOAD2]](<2 x s64>), [[AMDGPU_BUFFER_LOAD3]](<2 x s64>)
1038 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>), [[UV2:%[0-9]+]]:vgpr(<2 x s64>), [[UV3:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>)
1039 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
1040 ; GFX12-NEXT: G_STORE [[UV]](<2 x s64>), [[COPY5]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef`, align 64, addrspace 1)
1041 ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
1042 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
1043 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1)
1044 ; GFX12-NEXT: G_STORE [[UV1]](<2 x s64>), [[COPY6]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 16, basealign 64, addrspace 1)
1045 ; GFX12-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
1046 ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
1047 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD1]](p1)
1048 ; GFX12-NEXT: G_STORE [[UV2]](<2 x s64>), [[COPY7]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 32, align 32, basealign 64, addrspace 1)
1049 ; GFX12-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
1050 ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
1051 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD2]](p1)
1052 ; GFX12-NEXT: G_STORE [[UV3]](<2 x s64>), [[COPY8]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1)
1053 ; GFX12-NEXT: S_ENDPGM 0
1054 %val = call <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32> %rsrc, i32 %soffset, i32 0)
1055 store <8 x i64> %val, ptr addrspace(1) undef
1056 ret void
1057 }
1059 ; Test split of a vector with 64-bit pointer elements
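; Pointer vectors follow the same pattern: <4 x p1> is split into two <2 x p1>
; loads at offsets 0 and 16.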
1060 define amdgpu_ps void @s_buffer_load_v4p1_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
1061 ; GFX7-LABEL: name: s_buffer_load_v4p1_vgpr_offset
1062 ; GFX7: bb.1 (%ir-block.0):
1063 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1065 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1066 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1067 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1068 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1069 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1070 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1071 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
1072 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1073 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1074 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1075 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1076 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>)
1077 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x p1>)
1078 ; GFX7-NEXT: G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef`, align 32, addrspace 1)
1079 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
1080 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
1081 ; GFX7-NEXT: G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1)
1082 ; GFX7-NEXT: S_ENDPGM 0
1084 ; GFX12-LABEL: name: s_buffer_load_v4p1_vgpr_offset
1085 ; GFX12: bb.1 (%ir-block.0):
1086 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1087 ; GFX12-NEXT: {{ $}}
1088 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1089 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1090 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1091 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1092 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1093 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1094 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
1095 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1096 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1097 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1098 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1099 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>)
1100 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x p1>)
1101 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
1102 ; GFX12-NEXT: G_STORE [[UV]](<2 x p1>), [[COPY5]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef`, align 32, addrspace 1)
1103 ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
1104 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
1105 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1)
1106 ; GFX12-NEXT: G_STORE [[UV1]](<2 x p1>), [[COPY6]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1)
1107 ; GFX12-NEXT: S_ENDPGM 0
1108 %val = call <4 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v4p1(<4 x i32> %rsrc, i32 %soffset, i32 0)
1109 store <4 x ptr addrspace(1)> %val, ptr addrspace(1) undef
1110 ret void
1111 }
1113 ; Test split of a vector with 64-bit pointer elements
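; <8 x p1> needs four <2 x p1> loads at offsets 0, 16, 32 and 48.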
1114 define amdgpu_ps void @s_buffer_load_v8p1_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
1115 ; GFX7-LABEL: name: s_buffer_load_v8p1_vgpr_offset
1116 ; GFX7: bb.1 (%ir-block.0):
1117 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1119 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1120 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1121 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1122 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1123 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1124 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1125 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
1126 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1127 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1128 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1129 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1130 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
1131 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
1132 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>), [[AMDGPU_BUFFER_LOAD2]](<2 x p1>), [[AMDGPU_BUFFER_LOAD3]](<2 x p1>)
1133 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>), [[UV2:%[0-9]+]]:vgpr(<2 x p1>), [[UV3:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x p1>)
1134 ; GFX7-NEXT: G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef`, align 64, addrspace 1)
1135 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
1136 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
1137 ; GFX7-NEXT: G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 16, basealign 64, addrspace 1)
1138 ; GFX7-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
1139 ; GFX7-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
1140 ; GFX7-NEXT: G_STORE [[UV2]](<2 x p1>), [[PTR_ADD1]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 32, align 32, basealign 64, addrspace 1)
1141 ; GFX7-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
1142 ; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
1143 ; GFX7-NEXT: G_STORE [[UV3]](<2 x p1>), [[PTR_ADD2]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1)
1144 ; GFX7-NEXT: S_ENDPGM 0
1146 ; GFX12-LABEL: name: s_buffer_load_v8p1_vgpr_offset
1147 ; GFX12: bb.1 (%ir-block.0):
1148 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1149 ; GFX12-NEXT: {{ $}}
1150 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1151 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1152 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1153 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1154 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1155 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1156 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
1157 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1158 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1159 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1160 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1161 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
1162 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
1163 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>), [[AMDGPU_BUFFER_LOAD2]](<2 x p1>), [[AMDGPU_BUFFER_LOAD3]](<2 x p1>)
1164 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>), [[UV2:%[0-9]+]]:vgpr(<2 x p1>), [[UV3:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x p1>)
1165 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
1166 ; GFX12-NEXT: G_STORE [[UV]](<2 x p1>), [[COPY5]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef`, align 64, addrspace 1)
1167 ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
1168 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
1169 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD]](p1)
1170 ; GFX12-NEXT: G_STORE [[UV1]](<2 x p1>), [[COPY6]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 16, basealign 64, addrspace 1)
1171 ; GFX12-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
1172 ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
1173 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD1]](p1)
1174 ; GFX12-NEXT: G_STORE [[UV2]](<2 x p1>), [[COPY7]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 32, align 32, basealign 64, addrspace 1)
1175 ; GFX12-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
1176 ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
1177 ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr(p1) = COPY [[PTR_ADD2]](p1)
1178 ; GFX12-NEXT: G_STORE [[UV3]](<2 x p1>), [[COPY8]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1)
1179 ; GFX12-NEXT: S_ENDPGM 0
1180 %val = call <8 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v8p1(<4 x i32> %rsrc, i32 %soffset, i32 0)
1181 store <8 x ptr addrspace(1)> %val, ptr addrspace(1) undef
1182 ret void
1183 }
1185 define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg %rsrc, i32 %soffset.base) {
1186 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092
1187 ; GFX7: bb.1 (%ir-block.0):
1188 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1190 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1191 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1192 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1193 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1194 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1195 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1196 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092
1197 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1198 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1199 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1200 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1201 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s32))
1202 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1203 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
1205 ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092
1206 ; GFX12: bb.1 (%ir-block.0):
1207 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1208 ; GFX12-NEXT: {{ $}}
1209 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1210 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1211 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1212 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1213 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1214 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1215 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092
1216 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1217 ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1218 ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1219 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1220 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s32))
1221 ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1222 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
1223 %soffset = add i32 %soffset.base, 4092
1224 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1225 ret float %val
1226 }
1228 define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg %rsrc, i32 %soffset.base) {
1229 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095
1230 ; GFX7: bb.1 (%ir-block.0):
1231 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1233 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1234 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1235 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1236 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1237 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1238 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1239 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
1240 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1241 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1242 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1243 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1244 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load (s32))
1245 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1246 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
1248 ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095
1249 ; GFX12: bb.1 (%ir-block.0):
1250 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1251 ; GFX12-NEXT: {{ $}}
1252 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1253 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1254 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1255 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1256 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1257 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1258 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
1259 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1260 ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1261 ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1262 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1263 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load (s32))
1264 ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1265 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
1266 %soffset = add i32 %soffset.base, 4095
1267 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1268 ret float %val
1269 }
1271 define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg %rsrc, i32 %soffset.base) {
1272 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096
1273 ; GFX7: bb.1 (%ir-block.0):
1274 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1276 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1277 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1278 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1279 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1280 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1281 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1282 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
1283 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1284 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1285 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1286 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32))
1287 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1288 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
1290 ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096
1291 ; GFX12: bb.1 (%ir-block.0):
1292 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1293 ; GFX12-NEXT: {{ $}}
1294 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1295 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1296 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1297 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1298 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1299 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1300 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
1301 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1302 ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1303 ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1304 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1305 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4096, 0, 0 :: (dereferenceable invariant load (s32))
1306 ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1307 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
1308 %soffset = add i32 %soffset.base, 4096
1309 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1310 ret float %val
1311 }
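; Taken together, the three tests above show the GFX7 folding cut-off for this
; pattern: an added constant of up to 4095 ends up in the instruction's
; immediate offset field, while 4096 is instead kept in the SGPR offset operand
; with an immediate of 0. GFX12 still folds 4096 into the immediate.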
1313 ; Make sure the base offset is added to each split load.
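; With a base offset of 4064, the two split s128 loads below are expected at
; immediate offsets 4064 and 4064 + 16 = 4080, both still within the foldable
; range seen above.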
1314 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32> inreg %rsrc, i32 %soffset.base) {
1315 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4064
1316 ; GFX7: bb.1 (%ir-block.0):
1317 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1319 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1320 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1321 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1322 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1323 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1324 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1325 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
1326 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1327 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1328 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1329 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1330 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1331 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1332 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
1333 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
1334 ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
1335 ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
1336 ; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32)
1337 ; GFX7-NEXT: $vgpr3 = COPY [[UV3]](s32)
1338 ; GFX7-NEXT: $vgpr4 = COPY [[UV4]](s32)
1339 ; GFX7-NEXT: $vgpr5 = COPY [[UV5]](s32)
1340 ; GFX7-NEXT: $vgpr6 = COPY [[UV6]](s32)
1341 ; GFX7-NEXT: $vgpr7 = COPY [[UV7]](s32)
1342 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
1344 ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4064
1345 ; GFX12: bb.1 (%ir-block.0):
1346 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1347 ; GFX12-NEXT: {{ $}}
1348 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1349 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1350 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1351 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1352 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1353 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1354 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
1355 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1356 ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1357 ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1358 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1359 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1360 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1361 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
1362 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
1363 ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
1364 ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
1365 ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
1366 ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
1367 ; GFX12-NEXT: $vgpr4 = COPY [[UV4]](s32)
1368 ; GFX12-NEXT: $vgpr5 = COPY [[UV5]](s32)
1369 ; GFX12-NEXT: $vgpr6 = COPY [[UV6]](s32)
1370 ; GFX12-NEXT: $vgpr7 = COPY [[UV7]](s32)
1371 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
1372 %soffset = add i32 %soffset.base, 4064
1373 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1374 ret <8 x float> %val
1375 }
1377 ; Make sure the maximum offset isn't exceeded when splitting this load.
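; With a base of 4068 the checks expect GFX7 to keep the constant in the SGPR
; offset operand and use immediates 0 and 16, whereas GFX12 folds the base and
; uses immediates 4068 and 4068 + 16 = 4084.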
1378 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32> inreg %rsrc, i32 %soffset.base) {
1379 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4068
1380 ; GFX7: bb.1 (%ir-block.0):
1381 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1383 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1384 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1385 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1386 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1387 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1388 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1389 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068
1390 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1391 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1392 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1393 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1394 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1395 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
1396 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
1397 ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
1398 ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
1399 ; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32)
1400 ; GFX7-NEXT: $vgpr3 = COPY [[UV3]](s32)
1401 ; GFX7-NEXT: $vgpr4 = COPY [[UV4]](s32)
1402 ; GFX7-NEXT: $vgpr5 = COPY [[UV5]](s32)
1403 ; GFX7-NEXT: $vgpr6 = COPY [[UV6]](s32)
1404 ; GFX7-NEXT: $vgpr7 = COPY [[UV7]](s32)
1405 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
1407 ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4068
1408 ; GFX12: bb.1 (%ir-block.0):
1409 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1410 ; GFX12-NEXT: {{ $}}
1411 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1412 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1413 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1414 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1415 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1416 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1417 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068
1418 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1419 ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1420 ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1421 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1422 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4068, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1423 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4084, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1424 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
1425 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
1426 ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
1427 ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
1428 ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
1429 ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
1430 ; GFX12-NEXT: $vgpr4 = COPY [[UV4]](s32)
1431 ; GFX12-NEXT: $vgpr5 = COPY [[UV5]](s32)
1432 ; GFX12-NEXT: $vgpr6 = COPY [[UV6]](s32)
1433 ; GFX12-NEXT: $vgpr7 = COPY [[UV7]](s32)
1434 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
1435 %soffset = add i32 %soffset.base, 4068
1436 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1437 ret <8 x float> %val
1438 }
1440 define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i32> inreg %rsrc, i32 %soffset.base) {
1441 ; GFX7-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4032
1442 ; GFX7: bb.1 (%ir-block.0):
1443 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1445 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1446 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1447 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1448 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1449 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1450 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1451 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4032
1452 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1453 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1454 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1455 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1456 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1457 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4048, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1458 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
1459 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
1460 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
1461 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
1462 ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
1463 ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
1464 ; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32)
1465 ; GFX7-NEXT: $vgpr3 = COPY [[UV3]](s32)
1466 ; GFX7-NEXT: $vgpr4 = COPY [[UV4]](s32)
1467 ; GFX7-NEXT: $vgpr5 = COPY [[UV5]](s32)
1468 ; GFX7-NEXT: $vgpr6 = COPY [[UV6]](s32)
1469 ; GFX7-NEXT: $vgpr7 = COPY [[UV7]](s32)
1470 ; GFX7-NEXT: $vgpr8 = COPY [[UV8]](s32)
1471 ; GFX7-NEXT: $vgpr9 = COPY [[UV9]](s32)
1472 ; GFX7-NEXT: $vgpr10 = COPY [[UV10]](s32)
1473 ; GFX7-NEXT: $vgpr11 = COPY [[UV11]](s32)
1474 ; GFX7-NEXT: $vgpr12 = COPY [[UV12]](s32)
1475 ; GFX7-NEXT: $vgpr13 = COPY [[UV13]](s32)
1476 ; GFX7-NEXT: $vgpr14 = COPY [[UV14]](s32)
1477 ; GFX7-NEXT: $vgpr15 = COPY [[UV15]](s32)
1478 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
1480 ; GFX12-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4032
1481 ; GFX12: bb.1 (%ir-block.0):
1482 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1483 ; GFX12-NEXT: {{ $}}
1484 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1485 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1486 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1487 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1488 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1489 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1490 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4032
1491 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1492 ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1493 ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1494 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1495 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1496 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4048, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1497 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
1498 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
1499 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
1500 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
1501 ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
1502 ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
1503 ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
1504 ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
1505 ; GFX12-NEXT: $vgpr4 = COPY [[UV4]](s32)
1506 ; GFX12-NEXT: $vgpr5 = COPY [[UV5]](s32)
1507 ; GFX12-NEXT: $vgpr6 = COPY [[UV6]](s32)
1508 ; GFX12-NEXT: $vgpr7 = COPY [[UV7]](s32)
1509 ; GFX12-NEXT: $vgpr8 = COPY [[UV8]](s32)
1510 ; GFX12-NEXT: $vgpr9 = COPY [[UV9]](s32)
1511 ; GFX12-NEXT: $vgpr10 = COPY [[UV10]](s32)
1512 ; GFX12-NEXT: $vgpr11 = COPY [[UV11]](s32)
1513 ; GFX12-NEXT: $vgpr12 = COPY [[UV12]](s32)
1514 ; GFX12-NEXT: $vgpr13 = COPY [[UV13]](s32)
1515 ; GFX12-NEXT: $vgpr14 = COPY [[UV14]](s32)
1516 ; GFX12-NEXT: $vgpr15 = COPY [[UV15]](s32)
1517 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
1518 %soffset = add i32 %soffset.base, 4032
1519 %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1520 ret <16 x float> %val
1521 }
1523 define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i32> inreg %rsrc, i32 %soffset.base) {
1524 ; GFX7-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4036
1525 ; GFX7: bb.1 (%ir-block.0):
1526 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1528 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1529 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1530 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1531 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1532 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1533 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1534 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4036
1535 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1536 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1537 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1538 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1539 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1540 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
1541 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
1542 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
1543 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
1544 ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
1545 ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
1546 ; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32)
1547 ; GFX7-NEXT: $vgpr3 = COPY [[UV3]](s32)
1548 ; GFX7-NEXT: $vgpr4 = COPY [[UV4]](s32)
1549 ; GFX7-NEXT: $vgpr5 = COPY [[UV5]](s32)
1550 ; GFX7-NEXT: $vgpr6 = COPY [[UV6]](s32)
1551 ; GFX7-NEXT: $vgpr7 = COPY [[UV7]](s32)
1552 ; GFX7-NEXT: $vgpr8 = COPY [[UV8]](s32)
1553 ; GFX7-NEXT: $vgpr9 = COPY [[UV9]](s32)
1554 ; GFX7-NEXT: $vgpr10 = COPY [[UV10]](s32)
1555 ; GFX7-NEXT: $vgpr11 = COPY [[UV11]](s32)
1556 ; GFX7-NEXT: $vgpr12 = COPY [[UV12]](s32)
1557 ; GFX7-NEXT: $vgpr13 = COPY [[UV13]](s32)
1558 ; GFX7-NEXT: $vgpr14 = COPY [[UV14]](s32)
1559 ; GFX7-NEXT: $vgpr15 = COPY [[UV15]](s32)
1560 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
1562 ; GFX12-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4036
1563 ; GFX12: bb.1 (%ir-block.0):
1564 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
1565 ; GFX12-NEXT: {{ $}}
1566 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1567 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
1568 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
1569 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
1570 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1571 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1572 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4036
1573 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1574 ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1575 ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1576 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1577 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4036, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1578 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4052, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1579 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4068, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
1580 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4084, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
1581 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
1582 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
1583 ; GFX12-NEXT: $vgpr0 = COPY [[UV]](s32)
1584 ; GFX12-NEXT: $vgpr1 = COPY [[UV1]](s32)
1585 ; GFX12-NEXT: $vgpr2 = COPY [[UV2]](s32)
1586 ; GFX12-NEXT: $vgpr3 = COPY [[UV3]](s32)
1587 ; GFX12-NEXT: $vgpr4 = COPY [[UV4]](s32)
1588 ; GFX12-NEXT: $vgpr5 = COPY [[UV5]](s32)
1589 ; GFX12-NEXT: $vgpr6 = COPY [[UV6]](s32)
1590 ; GFX12-NEXT: $vgpr7 = COPY [[UV7]](s32)
1591 ; GFX12-NEXT: $vgpr8 = COPY [[UV8]](s32)
1592 ; GFX12-NEXT: $vgpr9 = COPY [[UV9]](s32)
1593 ; GFX12-NEXT: $vgpr10 = COPY [[UV10]](s32)
1594 ; GFX12-NEXT: $vgpr11 = COPY [[UV11]](s32)
1595 ; GFX12-NEXT: $vgpr12 = COPY [[UV12]](s32)
1596 ; GFX12-NEXT: $vgpr13 = COPY [[UV13]](s32)
1597 ; GFX12-NEXT: $vgpr14 = COPY [[UV14]](s32)
1598 ; GFX12-NEXT: $vgpr15 = COPY [[UV15]](s32)
1599 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
1600 %soffset = add i32 %soffset.base, 4036
1601 %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1602 ret <16 x float> %val
1603 }
1605 ; Waterfall loop due to resource being VGPR
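; The descriptor arrives in VGPRs, so regbankselect builds a waterfall loop: each
; iteration readfirstlanes the four resource dwords, compares them with the per-lane
; values, masks exec down to the matching lanes, performs the load, and repeats until
; every lane has been serviced.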
1606 define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %soffset) {
1607 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_rsrc
1608 ; GFX7: bb.1 (%ir-block.0):
1609 ; GFX7-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
1611 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1612 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1613 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1614 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1615 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1616 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1617 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32)
1618 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1619 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1620 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
1621 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
1624 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3
1625 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1626 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
1627 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
1628 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
1629 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
1630 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1631 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1632 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
1633 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
1634 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
1635 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
1636 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
1637 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
1640 ; GFX7-NEXT: successors: %bb.4, %bb.2
1642 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32))
1643 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1644 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
1647 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
1650 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1651 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
1653 ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc
1654 ; GFX12: bb.1 (%ir-block.0):
1655 ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
1656 ; GFX12-NEXT: {{ $}}
1657 ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1658 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1659 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1660 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1661 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1662 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1663 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32)
1664 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1665 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1666 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
1667 ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
1668 ; GFX12-NEXT: {{ $}}
1670 ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %14, %bb.3
1671 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1672 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
1673 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
1674 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
1675 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
1676 ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1677 ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1678 ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
1679 ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
1680 ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
1681 ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
1682 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
1683 ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
1684 ; GFX12-NEXT: {{ $}}
1686 ; GFX12-NEXT: successors: %bb.4, %bb.2
1687 ; GFX12-NEXT: {{ $}}
1688 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32))
1689 ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
1690 ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
1691 ; GFX12-NEXT: {{ $}}
1693 ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
1694 ; GFX12-NEXT: {{ $}}
1696 ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1697 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
1698 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1699 ret float %val
1700 }
1702 ; Use the offset inside the waterfall loop
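; The added 4092 still fits the immediate field, so the checks expect it folded into
; the load inside the loop, with the scalar base carried in the soffset operand.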
1703 define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %rsrc, i32 inreg %soffset.base) {
1704 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4092
1705 ; GFX7: bb.1 (%ir-block.0):
1706 ; GFX7-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
1708 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1709 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1710 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1711 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1712 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1713 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1714 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092
1715 ; GFX7-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
1716 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1717 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1718 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
1719 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
1722 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %16, %bb.3
1723 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1724 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
1725 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
1726 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
1727 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
1728 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1729 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1730 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
1731 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
1732 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
1733 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
1734 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
1735 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
1738 ; GFX7-NEXT: successors: %bb.4, %bb.2
1740 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load (s32))
1741 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1742 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
1745 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
1748 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1749 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
1751 ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4092
1752 ; GFX12: bb.1 (%ir-block.0):
1753 ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
1754 ; GFX12-NEXT: {{ $}}
1755 ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1756 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1757 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1758 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1759 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1760 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1761 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092
1762 ; GFX12-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
1763 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1764 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1765 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
1766 ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
1767 ; GFX12-NEXT: {{ $}}
1769 ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %15, %bb.3
1770 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1771 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
1772 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
1773 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
1774 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
1775 ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1776 ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1777 ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
1778 ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
1779 ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
1780 ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
1781 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
1782 ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
1783 ; GFX12-NEXT: {{ $}}
1785 ; GFX12-NEXT: successors: %bb.4, %bb.2
1786 ; GFX12-NEXT: {{ $}}
1787 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load (s32))
1788 ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
1789 ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
1790 ; GFX12-NEXT: {{ $}}
1792 ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
1793 ; GFX12-NEXT: {{ $}}
1795 ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1796 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
1797 %soffset = add i32 %soffset.base, 4092
1798 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1799 ret float %val
1800 }
1802 ; Scalar offset exceeds MUBUF limit, keep add out of the loop
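; 4096 does not fit GFX7's 12-bit MUBUF immediate (maximum 4095), so the G_ADD stays
; ahead of the loop and the sum is passed as the voffset. GFX12 can still encode 4096
; in the offset field and keeps the base in the soffset operand.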
1803 define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %rsrc, i32 inreg %soffset.base) {
1804 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4096
1805 ; GFX7: bb.1 (%ir-block.0):
1806 ; GFX7-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
1808 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1809 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1810 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1811 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1812 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1813 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1814 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
1815 ; GFX7-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
1816 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
1817 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1818 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1819 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
1820 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
1823 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %17, %bb.3
1824 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1825 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
1826 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
1827 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
1828 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
1829 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1830 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1831 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
1832 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
1833 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
1834 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
1835 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
1836 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
1839 ; GFX7-NEXT: successors: %bb.4, %bb.2
1841 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32))
1842 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1843 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
1846 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
1849 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1850 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
1852 ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4096
1853 ; GFX12: bb.1 (%ir-block.0):
1854 ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
1855 ; GFX12-NEXT: {{ $}}
1856 ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1857 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1858 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1859 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1860 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1861 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1862 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
1863 ; GFX12-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
1864 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1865 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1866 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
1867 ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
1868 ; GFX12-NEXT: {{ $}}
1870 ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %15, %bb.3
1871 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1872 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
1873 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
1874 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
1875 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
1876 ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1877 ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1878 ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
1879 ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
1880 ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
1881 ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
1882 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
1883 ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
1884 ; GFX12-NEXT: {{ $}}
1886 ; GFX12-NEXT: successors: %bb.4, %bb.2
1887 ; GFX12-NEXT: {{ $}}
1888 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4096, 0, 0 :: (dereferenceable invariant load (s32))
1889 ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
1890 ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
1891 ; GFX12-NEXT: {{ $}}
1893 ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
1894 ; GFX12-NEXT: {{ $}}
1896 ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1897 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
1898 %soffset = add i32 %soffset.base, 4096
1899 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1900 ret float %val
1901 }
1903 ; Waterfall loop, but constant offset
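; 4095 is the largest value the immediate field can hold, so the checks expect it
; encoded directly, with zero voffset and soffset.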
1904 define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) {
1905 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4095
1906 ; GFX7: bb.1 (%ir-block.0):
1907 ; GFX7-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
1909 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1910 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1911 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1912 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1913 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1914 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
1915 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1916 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1917 ; GFX7-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1918 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
1919 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
1922 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3
1923 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1924 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
1925 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
1926 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
1927 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
1928 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1929 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1930 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
1931 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
1932 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
1933 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
1934 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
1935 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
1938 ; GFX7-NEXT: successors: %bb.4, %bb.2
1940 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1)
1941 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1942 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
1945 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
1948 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1949 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
1951 ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4095
1952 ; GFX12: bb.1 (%ir-block.0):
1953 ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
1954 ; GFX12-NEXT: {{ $}}
1955 ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1956 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1957 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1958 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1959 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1960 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
1961 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1962 ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1963 ; GFX12-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1964 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
1965 ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
1966 ; GFX12-NEXT: {{ $}}
1968 ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %14, %bb.3
1969 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1970 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
1971 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
1972 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
1973 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
1974 ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1975 ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1976 ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
1977 ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
1978 ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
1979 ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
1980 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
1981 ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
1982 ; GFX12-NEXT: {{ $}}
1984 ; GFX12-NEXT: successors: %bb.4, %bb.2
1985 ; GFX12-NEXT: {{ $}}
1986 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1)
1987 ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
1988 ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
1989 ; GFX12-NEXT: {{ $}}
1991 ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
1992 ; GFX12-NEXT: {{ $}}
1994 ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1995 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
1996 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 0)
1997 ret float %val
1998 }
2000 ; Waterfall loop, but constant offset
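; 4096 overflows GFX7's immediate, so the constant is materialized in a VGPR and used
; as the voffset with a zero immediate; GFX12 still encodes 4096 in the offset field.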
2001 define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) {
2002 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4096
2003 ; GFX7: bb.1 (%ir-block.0):
2004 ; GFX7-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
2006 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2007 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
2008 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
2009 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
2010 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2011 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
2012 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
2013 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
2014 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2015 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
2016 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
2019 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3
2020 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2021 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
2022 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
2023 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
2024 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
2025 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
2026 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2027 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
2028 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
2029 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
2030 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
2031 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
2032 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
2035 ; GFX7-NEXT: successors: %bb.4, %bb.2
2037 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32))
2038 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
2039 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
2042 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
2045 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
2046 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
2048 ; GFX12-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4096
2049 ; GFX12: bb.1 (%ir-block.0):
2050 ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
2051 ; GFX12-NEXT: {{ $}}
2052 ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2053 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
2054 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
2055 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
2056 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2057 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
2058 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2059 ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
2060 ; GFX12-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2061 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
2062 ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
2063 ; GFX12-NEXT: {{ $}}
2065 ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %14, %bb.3
2066 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2067 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
2068 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
2069 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
2070 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
2071 ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
2072 ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2073 ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
2074 ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
2075 ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
2076 ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
2077 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
2078 ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
2079 ; GFX12-NEXT: {{ $}}
2081 ; GFX12-NEXT: successors: %bb.4, %bb.2
2082 ; GFX12-NEXT: {{ $}}
2083 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4096, 0, 0 :: (dereferenceable invariant load (s32) from unknown-address + 4096)
2084 ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
2085 ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
2086 ; GFX12-NEXT: {{ $}}
2088 ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
2089 ; GFX12-NEXT: {{ $}}
2091 ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
2092 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
2093 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 0)
2094 ret float %val
2095 }
2097 ; Need a waterfall loop, but the offset is scalar.
2098 ; Make sure the base offset is added to each split load.
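; Inside the loop both halves keep the scalar base in the soffset operand, with
; immediates 4064 and 4080 selecting the two 16-byte halves.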
2099 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %rsrc, i32 inreg %soffset.base) {
2100 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4064
2101 ; GFX7: bb.1 (%ir-block.0):
2102 ; GFX7-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
2104 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2105 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
2106 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
2107 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
2108 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2109 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
2110 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
2111 ; GFX7-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
2112 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2113 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2114 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
2115 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
2118 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
2119 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2120 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
2121 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
2122 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
2123 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
2124 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
2125 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2126 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
2127 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
2128 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
2129 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
2130 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
2131 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
2134 ; GFX7-NEXT: successors: %bb.4, %bb.2
2136 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2137 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2138 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
2139 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
2142 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
2145 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
2146 ; GFX7-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
2147 ; GFX7-NEXT: $vgpr0 = COPY [[UV8]](s32)
2148 ; GFX7-NEXT: $vgpr1 = COPY [[UV9]](s32)
2149 ; GFX7-NEXT: $vgpr2 = COPY [[UV10]](s32)
2150 ; GFX7-NEXT: $vgpr3 = COPY [[UV11]](s32)
2151 ; GFX7-NEXT: $vgpr4 = COPY [[UV12]](s32)
2152 ; GFX7-NEXT: $vgpr5 = COPY [[UV13]](s32)
2153 ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
2154 ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
2155 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
2157 ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4064
2158 ; GFX12: bb.1 (%ir-block.0):
2159 ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
2160 ; GFX12-NEXT: {{ $}}
2161 ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2162 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
2163 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
2164 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
2165 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2166 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
2167 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
2168 ; GFX12-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
2169 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2170 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2171 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
2172 ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
2173 ; GFX12-NEXT: {{ $}}
2175 ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
2176 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2177 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
2178 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
2179 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
2180 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
2181 ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
2182 ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2183 ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
2184 ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
2185 ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
2186 ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
2187 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
2188 ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
2189 ; GFX12-NEXT: {{ $}}
2191 ; GFX12-NEXT: successors: %bb.4, %bb.2
2192 ; GFX12-NEXT: {{ $}}
2193 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2194 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2195 ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
2196 ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
2197 ; GFX12-NEXT: {{ $}}
2199 ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
2200 ; GFX12-NEXT: {{ $}}
2202 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
2203 ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
2204 ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32)
2205 ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32)
2206 ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32)
2207 ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32)
2208 ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32)
2209 ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32)
2210 ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32)
2211 ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32)
2212 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
2213 %soffset = add i32 %soffset.base, 4064
2214 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
2215 ret <8 x float> %val
2216 }
2218 ; Need a waterfall loop, but the offset is scalar.
2219 ; Make sure the maximum offset isn't exceeded when splitting this load.
2220 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %rsrc, i32 inreg %soffset.base) {
2221 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4068
2222 ; GFX7: bb.1 (%ir-block.0):
2223 ; GFX7-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
2225 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2226 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
2227 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
2228 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
2229 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2230 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
2231 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068
2232 ; GFX7-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
2233 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
2234 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
2235 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2236 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
2237 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
2240 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3
2241 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2242 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
2243 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
2244 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
2245 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
2246 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
2247 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2248 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
2249 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
2250 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
2251 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
2252 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
2253 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
2256 ; GFX7-NEXT: successors: %bb.4, %bb.2
2258 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2259 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2260 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
2261 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
2264 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
2267 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
2268 ; GFX7-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
2269 ; GFX7-NEXT: $vgpr0 = COPY [[UV8]](s32)
2270 ; GFX7-NEXT: $vgpr1 = COPY [[UV9]](s32)
2271 ; GFX7-NEXT: $vgpr2 = COPY [[UV10]](s32)
2272 ; GFX7-NEXT: $vgpr3 = COPY [[UV11]](s32)
2273 ; GFX7-NEXT: $vgpr4 = COPY [[UV12]](s32)
2274 ; GFX7-NEXT: $vgpr5 = COPY [[UV13]](s32)
2275 ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
2276 ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
2277 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
2279 ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4068
2280 ; GFX12: bb.1 (%ir-block.0):
2281 ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
2282 ; GFX12-NEXT: {{ $}}
2283 ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2284 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
2285 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
2286 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
2287 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2288 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
2289 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068
2290 ; GFX12-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
2291 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2292 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2293 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
2294 ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
2295 ; GFX12-NEXT: {{ $}}
2297 ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
2298 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2299 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
2300 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
2301 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
2302 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
2303 ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
2304 ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2305 ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
2306 ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
2307 ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
2308 ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
2309 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
2310 ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
2311 ; GFX12-NEXT: {{ $}}
2313 ; GFX12-NEXT: successors: %bb.4, %bb.2
2314 ; GFX12-NEXT: {{ $}}
2315 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4068, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2316 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4084, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2317 ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
2318 ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
2319 ; GFX12-NEXT: {{ $}}
2321 ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
2322 ; GFX12-NEXT: {{ $}}
2324 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
2325 ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
2326 ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32)
2327 ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32)
2328 ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32)
2329 ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32)
2330 ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32)
2331 ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32)
2332 ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32)
2333 ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32)
2334 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
2335 %soffset = add i32 %soffset.base, 4068
2336 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
2337 ret <8 x float> %val
2338 }
2340 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %rsrc, i32 inreg %soffset.base) {
2341 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4096
2342 ; GFX7: bb.1 (%ir-block.0):
2343 ; GFX7-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
2345 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2346 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
2347 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
2348 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
2349 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2350 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
2351 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
2352 ; GFX7-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
2353 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
2354 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
2355 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2356 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
2357 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
2360 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3
2361 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2362 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
2363 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
2364 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
2365 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
2366 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
2367 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2368 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
2369 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
2370 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
2371 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
2372 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
2373 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
2376 ; GFX7-NEXT: successors: %bb.4, %bb.2
2378 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2379 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2380 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
2381 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
2384 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
2387 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
2388 ; GFX7-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
2389 ; GFX7-NEXT: $vgpr0 = COPY [[UV8]](s32)
2390 ; GFX7-NEXT: $vgpr1 = COPY [[UV9]](s32)
2391 ; GFX7-NEXT: $vgpr2 = COPY [[UV10]](s32)
2392 ; GFX7-NEXT: $vgpr3 = COPY [[UV11]](s32)
2393 ; GFX7-NEXT: $vgpr4 = COPY [[UV12]](s32)
2394 ; GFX7-NEXT: $vgpr5 = COPY [[UV13]](s32)
2395 ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
2396 ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
2397 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
2399 ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4096
2400 ; GFX12: bb.1 (%ir-block.0):
2401 ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
2402 ; GFX12-NEXT: {{ $}}
2403 ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2404 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
2405 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
2406 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
2407 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2408 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
2409 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
2410 ; GFX12-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
2411 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2412 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2413 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
2414 ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
2415 ; GFX12-NEXT: {{ $}}
2417 ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
2418 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2419 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
2420 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
2421 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
2422 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
2423 ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
2424 ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2425 ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
2426 ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
2427 ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
2428 ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
2429 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
2430 ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
2431 ; GFX12-NEXT: {{ $}}
2433 ; GFX12-NEXT: successors: %bb.4, %bb.2
2434 ; GFX12-NEXT: {{ $}}
2435 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4096, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2436 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4112, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2437 ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
2438 ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
2439 ; GFX12-NEXT: {{ $}}
2441 ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
2442 ; GFX12-NEXT: {{ $}}
2444 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
2445 ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
2446 ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32)
2447 ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32)
2448 ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32)
2449 ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32)
2450 ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32)
2451 ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32)
2452 ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32)
2453 ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32)
2454 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
2455 %soffset = add i32 %soffset.base, 4096
2456 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
2457 ret <8 x float> %val
2458 }
2460 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000(<4 x i32> %rsrc, i32 %offset.base) {
2461 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
2462 ; GFX7: bb.1 (%ir-block.0):
2463 ; GFX7-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
2465 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2466 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
2467 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
2468 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
2469 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2470 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
2471 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5000
2472 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
2473 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
2474 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2475 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
2476 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
2479 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
2480 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2481 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
2482 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
2483 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
2484 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
2485 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
2486 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2487 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
2488 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
2489 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
2490 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
2491 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
2492 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
2495 ; GFX7-NEXT: successors: %bb.4, %bb.2
2497 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2498 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2499 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
2500 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
2503 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
2506 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
2507 ; GFX7-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
2508 ; GFX7-NEXT: $vgpr0 = COPY [[UV8]](s32)
2509 ; GFX7-NEXT: $vgpr1 = COPY [[UV9]](s32)
2510 ; GFX7-NEXT: $vgpr2 = COPY [[UV10]](s32)
2511 ; GFX7-NEXT: $vgpr3 = COPY [[UV11]](s32)
2512 ; GFX7-NEXT: $vgpr4 = COPY [[UV12]](s32)
2513 ; GFX7-NEXT: $vgpr5 = COPY [[UV13]](s32)
2514 ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
2515 ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
2516 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
2518 ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
2519 ; GFX12: bb.1 (%ir-block.0):
2520 ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
2521 ; GFX12-NEXT: {{ $}}
2522 ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2523 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
2524 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
2525 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
2526 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2527 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
2528 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5000
2529 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
2530 ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
2531 ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
2532 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2533 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
2534 ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
2535 ; GFX12-NEXT: {{ $}}
2537 ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
2538 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2539 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
2540 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
2541 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
2542 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
2543 ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
2544 ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2545 ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
2546 ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
2547 ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
2548 ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
2549 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
2550 ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
2551 ; GFX12-NEXT: {{ $}}
2553 ; GFX12-NEXT: successors: %bb.4, %bb.2
2554 ; GFX12-NEXT: {{ $}}
2555 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 5000, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2556 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 5016, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2557 ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
2558 ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
2559 ; GFX12-NEXT: {{ $}}
2561 ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
2562 ; GFX12-NEXT: {{ $}}
2564 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
2565 ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
2566 ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32)
2567 ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32)
2568 ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32)
2569 ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32)
2570 ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32)
2571 ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32)
2572 ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32)
2573 ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32)
2574 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
2575 %soffset = add i32 %offset.base, 5000
2576 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
2577 ret <8 x float> %val
2578 }
2580 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076(<4 x i32> %rsrc, i32 %offset.base) {
2581 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
2582 ; GFX7: bb.1 (%ir-block.0):
2583 ; GFX7-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
2585 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2586 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
2587 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
2588 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
2589 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2590 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
2591 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4076
2592 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
2593 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
2594 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2595 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
2596 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
2599 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
2600 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2601 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
2602 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
2603 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
2604 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
2605 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
2606 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2607 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
2608 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
2609 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
2610 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
2611 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
2612 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
2615 ; GFX7-NEXT: successors: %bb.4, %bb.2
2617 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2618 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2619 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
2620 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
2623 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
2626 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
2627 ; GFX7-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
2628 ; GFX7-NEXT: $vgpr0 = COPY [[UV8]](s32)
2629 ; GFX7-NEXT: $vgpr1 = COPY [[UV9]](s32)
2630 ; GFX7-NEXT: $vgpr2 = COPY [[UV10]](s32)
2631 ; GFX7-NEXT: $vgpr3 = COPY [[UV11]](s32)
2632 ; GFX7-NEXT: $vgpr4 = COPY [[UV12]](s32)
2633 ; GFX7-NEXT: $vgpr5 = COPY [[UV13]](s32)
2634 ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
2635 ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
2636 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
2638 ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
2639 ; GFX12: bb.1 (%ir-block.0):
2640 ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
2641 ; GFX12-NEXT: {{ $}}
2642 ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2643 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
2644 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
2645 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
2646 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2647 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
2648 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4076
2649 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
2650 ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
2651 ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
2652 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2653 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
2654 ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
2655 ; GFX12-NEXT: {{ $}}
2657 ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
2658 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2659 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
2660 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
2661 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
2662 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
2663 ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
2664 ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2665 ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
2666 ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
2667 ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
2668 ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
2669 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
2670 ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
2671 ; GFX12-NEXT: {{ $}}
2673 ; GFX12-NEXT: successors: %bb.4, %bb.2
2674 ; GFX12-NEXT: {{ $}}
2675 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4076, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2676 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2677 ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
2678 ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
2679 ; GFX12-NEXT: {{ $}}
2681 ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
2682 ; GFX12-NEXT: {{ $}}
2684 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
2685 ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
2686 ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32)
2687 ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32)
2688 ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32)
2689 ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32)
2690 ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32)
2691 ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32)
2692 ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32)
2693 ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32)
2694 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
2695 %soffset = add i32 %offset.base, 4076
2696 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
2697 ret <8 x float> %val
2698 }
2700 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080(<4 x i32> %rsrc, i32 %offset.base) {
2701 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
2702 ; GFX7: bb.1 (%ir-block.0):
2703 ; GFX7-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
2705 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2706 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
2707 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
2708 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
2709 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2710 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
2711 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4080
2712 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
2713 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
2714 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2715 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
2716 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
2719 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
2720 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2721 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
2722 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
2723 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
2724 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
2725 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
2726 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2727 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
2728 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
2729 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
2730 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
2731 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
2732 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
2735 ; GFX7-NEXT: successors: %bb.4, %bb.2
2737 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2738 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2739 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
2740 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
2743 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
2746 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
2747 ; GFX7-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
2748 ; GFX7-NEXT: $vgpr0 = COPY [[UV8]](s32)
2749 ; GFX7-NEXT: $vgpr1 = COPY [[UV9]](s32)
2750 ; GFX7-NEXT: $vgpr2 = COPY [[UV10]](s32)
2751 ; GFX7-NEXT: $vgpr3 = COPY [[UV11]](s32)
2752 ; GFX7-NEXT: $vgpr4 = COPY [[UV12]](s32)
2753 ; GFX7-NEXT: $vgpr5 = COPY [[UV13]](s32)
2754 ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
2755 ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
2756 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
2758 ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
2759 ; GFX12: bb.1 (%ir-block.0):
2760 ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
2761 ; GFX12-NEXT: {{ $}}
2762 ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2763 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
2764 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
2765 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
2766 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2767 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
2768 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4080
2769 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
2770 ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
2771 ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
2772 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2773 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
2774 ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
2775 ; GFX12-NEXT: {{ $}}
2777 ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
2778 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2779 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
2780 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
2781 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
2782 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
2783 ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
2784 ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2785 ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
2786 ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
2787 ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
2788 ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
2789 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
2790 ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
2791 ; GFX12-NEXT: {{ $}}
2793 ; GFX12-NEXT: successors: %bb.4, %bb.2
2794 ; GFX12-NEXT: {{ $}}
2795 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2796 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4096, 0, 0 :: (dereferenceable invariant load (s128), align 4)
2797 ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
2798 ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
2799 ; GFX12-NEXT: {{ $}}
2801 ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
2802 ; GFX12-NEXT: {{ $}}
2804 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
2805 ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
2806 ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32)
2807 ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32)
2808 ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32)
2809 ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32)
2810 ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32)
2811 ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32)
2812 ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32)
2813 ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32)
2814 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
2815 %soffset = add i32 %offset.base, 4080
2816 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
2817 ret <8 x float> %val
2818 }
2820 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064(<4 x i32> %rsrc, i32 %offset.base) {
2821 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064
2822 ; GFX7: bb.1 (%ir-block.0):
2823 ; GFX7-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
2825 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2826 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
2827 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
2828 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
2829 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2830 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
2831 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2832 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
2833 ; GFX7-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2834 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
2835 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
2838 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
2839 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2840 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
2841 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
2842 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
2843 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
2844 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
2845 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2846 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
2847 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
2848 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
2849 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
2850 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
2851 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
2854 ; GFX7-NEXT: successors: %bb.4, %bb.2
2856 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4)
2857 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4)
2858 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
2859 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
2862 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
2865 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
2866 ; GFX7-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
2867 ; GFX7-NEXT: $vgpr0 = COPY [[UV8]](s32)
2868 ; GFX7-NEXT: $vgpr1 = COPY [[UV9]](s32)
2869 ; GFX7-NEXT: $vgpr2 = COPY [[UV10]](s32)
2870 ; GFX7-NEXT: $vgpr3 = COPY [[UV11]](s32)
2871 ; GFX7-NEXT: $vgpr4 = COPY [[UV12]](s32)
2872 ; GFX7-NEXT: $vgpr5 = COPY [[UV13]](s32)
2873 ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
2874 ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
2875 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
2877 ; GFX12-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064
2878 ; GFX12: bb.1 (%ir-block.0):
2879 ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
2880 ; GFX12-NEXT: {{ $}}
2881 ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2882 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
2883 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
2884 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
2885 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2886 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
2887 ; GFX12-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2888 ; GFX12-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
2889 ; GFX12-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2890 ; GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
2891 ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
2892 ; GFX12-NEXT: {{ $}}
2894 ; GFX12-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF]], %bb.1, %25, %bb.3
2895 ; GFX12-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2896 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
2897 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
2898 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
2899 ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
2900 ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
2901 ; GFX12-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
2902 ; GFX12-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
2903 ; GFX12-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
2904 ; GFX12-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
2905 ; GFX12-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
2906 ; GFX12-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
2907 ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[INTRINSIC_CONVERGENT]](s32), implicit-def $exec, implicit-def $scc, implicit $exec
2908 ; GFX12-NEXT: {{ $}}
2910 ; GFX12-NEXT: successors: %bb.4, %bb.2
2911 ; GFX12-NEXT: {{ $}}
2912 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4)
2913 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4)
2914 ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
2915 ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
2916 ; GFX12-NEXT: {{ $}}
2918 ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
2919 ; GFX12-NEXT: {{ $}}
2921 ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
2922 ; GFX12-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
2923 ; GFX12-NEXT: $vgpr0 = COPY [[UV8]](s32)
2924 ; GFX12-NEXT: $vgpr1 = COPY [[UV9]](s32)
2925 ; GFX12-NEXT: $vgpr2 = COPY [[UV10]](s32)
2926 ; GFX12-NEXT: $vgpr3 = COPY [[UV11]](s32)
2927 ; GFX12-NEXT: $vgpr4 = COPY [[UV12]](s32)
2928 ; GFX12-NEXT: $vgpr5 = COPY [[UV13]](s32)
2929 ; GFX12-NEXT: $vgpr6 = COPY [[UV14]](s32)
2930 ; GFX12-NEXT: $vgpr7 = COPY [[UV15]](s32)
2931 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
2932 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 4064, i32 0)
2933 ret <8 x float> %val
2934 }
2936 define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
2937 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr
2938 ; GFX7: bb.1 (%ir-block.0):
2939 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
2941 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
2942 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
2943 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
2944 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
2945 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2946 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2947 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
2948 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
2949 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
2950 ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2951 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32))
2952 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
2953 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
2955 ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr
2956 ; GFX12: bb.1 (%ir-block.0):
2957 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
2958 ; GFX12-NEXT: {{ $}}
2959 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
2960 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
2961 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
2962 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
2963 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2964 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2965 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
2966 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
2967 ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
2968 ; GFX12-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2969 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32))
2970 ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
2971 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
2972 %offset = add i32 %offset.v, %offset.s
2973 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
2974 ret float %val
2975 }
2977 define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
2978 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr
2979 ; GFX7: bb.1 (%ir-block.0):
2980 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
2982 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
2983 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
2984 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
2985 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
2986 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
2987 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
2988 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
2989 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
2990 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
2991 ; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
2992 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32))
2993 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
2994 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
2996 ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr
2997 ; GFX12: bb.1 (%ir-block.0):
2998 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
2999 ; GFX12-NEXT: {{ $}}
3000 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
3001 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
3002 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
3003 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
3004 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
3005 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
3006 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
3007 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
3008 ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
3009 ; GFX12-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
3010 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32))
3011 ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
3012 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
3013 %offset = add i32 %offset.s, %offset.v
3014 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
3015 ret float %val
3016 }
3018 define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
3019 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm
3020 ; GFX7: bb.1 (%ir-block.0):
3021 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
3023 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
3024 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
3025 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
3026 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
3027 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
3028 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
3029 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
3030 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
3031 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
3032 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
3033 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
3034 ; GFX7-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
3035 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
3036 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
3037 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
3038 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
3039 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
3041 ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm
3042 ; GFX12: bb.1 (%ir-block.0):
3043 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
3044 ; GFX12-NEXT: {{ $}}
3045 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
3046 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
3047 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
3048 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
3049 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
3050 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
3051 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
3052 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
3053 ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
3054 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
3055 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
3056 ; GFX12-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
3057 ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
3058 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
3059 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
3060 ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
3061 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
3062 %offset.base = add i32 %offset.v, %offset.s
3063 %offset = add i32 %offset.base, 1024
3064 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
3065 ret float %val
3066 }
3068 define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
3069 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm
3070 ; GFX7: bb.1 (%ir-block.0):
3071 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
3073 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
3074 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
3075 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
3076 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
3077 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
3078 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
3079 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
3080 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
3081 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
3082 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
3083 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
3084 ; GFX7-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
3085 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
3086 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
3087 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
3088 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
3089 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
3091 ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm
3092 ; GFX12: bb.1 (%ir-block.0):
3093 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
3094 ; GFX12-NEXT: {{ $}}
3095 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
3096 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
3097 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
3098 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
3099 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
3100 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
3101 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
3102 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
3103 ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
3104 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
3105 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
3106 ; GFX12-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
3107 ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
3108 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
3109 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
3110 ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
3111 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
3112 %offset.base = add i32 %offset.s, %offset.v
3113 %offset = add i32 %offset.base, 1024
3114 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
3115 ret float %val
3116 }
3118 ; TODO: Ideally this would be reassociated to fold.
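; The TODO above appears to ask for rewriting (%offset.s + 1024) + %offset.v into the
; association already used by @s_buffer_load_f32_offset_add_sgpr_vgpr_imm, where the
; constant term is the outermost addend and so is the easiest one to fold into the
; buffer instruction's immediate offset field. A sketch of that IR shape, for
; illustration only (not exercised or checked by this test):
;
;   %offset.base = add i32 %offset.s, %offset.v
;   %offset = add i32 %offset.base, 1024
;   %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)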
3119 define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
3120 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr
3121 ; GFX7: bb.1 (%ir-block.0):
3122 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
3124 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
3125 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
3126 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
3127 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
3128 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
3129 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
3130 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
3131 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
3132 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
3133 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
3134 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
3135 ; GFX7-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
3136 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
3137 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
3138 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
3139 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
3140 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
3142 ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr
3143 ; GFX12: bb.1 (%ir-block.0):
3144 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
3145 ; GFX12-NEXT: {{ $}}
3146 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
3147 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
3148 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
3149 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
3150 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
3151 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
3152 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
3153 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
3154 ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
3155 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
3156 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
3157 ; GFX12-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
3158 ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
3159 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
3160 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
3161 ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
3162 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
3163 %offset.base = add i32 %offset.s, 1024
3164 %offset = add i32 %offset.base, %offset.v
3165 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
3166 ret float %val
3167 }
3169 define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
3170 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr
3171 ; GFX7: bb.1 (%ir-block.0):
3172 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
3174 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
3175 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
3176 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
3177 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
3178 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
3179 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
3180 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
3181 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
3182 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
3183 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
3184 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
3185 ; GFX7-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
3186 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
3187 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
3188 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
3189 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
3190 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
3192 ; GFX12-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr
3193 ; GFX12: bb.1 (%ir-block.0):
3194 ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
3195 ; GFX12-NEXT: {{ $}}
3196 ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
3197 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
3198 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
3199 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
3200 ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
3201 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
3202 ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
3203 ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
3204 ; GFX12-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
3205 ; GFX12-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
3206 ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
3207 ; GFX12-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
3208 ; GFX12-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
3209 ; GFX12-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
3210 ; GFX12-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
3211 ; GFX12-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
3212 ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
3213 %offset.base = add i32 %offset.v, 1024
3214 %offset = add i32 %offset.base, %offset.s
3215 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
3216 ret float %val
3217 }
3219 declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg)
3220 declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg)
3221 declare <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32>, i32, i32 immarg)
3222 declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32 immarg)
3223 declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32 immarg)
3224 declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32 immarg)
3226 declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg)
3227 declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32 immarg)
3228 declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32>, i32, i32 immarg)
3229 declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32 immarg)
3230 declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32>, i32, i32 immarg)
3231 declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32>, i32, i32 immarg)
3233 declare i96 @llvm.amdgcn.s.buffer.load.i96(<4 x i32>, i32, i32 immarg)
3234 declare i256 @llvm.amdgcn.s.buffer.load.i256(<4 x i32>, i32, i32 immarg)
3235 declare i512 @llvm.amdgcn.s.buffer.load.i512(<4 x i32>, i32, i32 immarg)
3237 declare <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32>, i32, i32 immarg)
3238 declare <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32>, i32, i32 immarg)
3240 declare <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32>, i32, i32 immarg)
3241 declare <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32>, i32, i32 immarg)
3243 declare <4 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v4p1(<4 x i32>, i32, i32 immarg)
3244 declare <8 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v8p1(<4 x i32>, i32, i32 immarg)