1 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2 ; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=amdgpu-regbankselect -regbankselect-fast -o - %s | FileCheck %s -check-prefix=GFX7
3 ; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=amdgpu-regbankselect -regbankselect-greedy -o - %s | FileCheck %s -check-prefix=GFX7
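; All-SGPR cases: the resource and soffset are uniform, so the intrinsic stays a scalar
; G_AMDGPU_S_BUFFER_LOAD with the result on the SGPR bank.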
6 define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
7 ; GFX7-LABEL: name: s_buffer_load_i32
8 ; GFX7: bb.1 (%ir-block.0):
9 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
11 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
12 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
13 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
14 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
15 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
16 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
17 ; GFX7-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s32))
18 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32)
19 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
20 ; GFX7-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
21 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
22 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
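; Multi-element results are unmerged and each element is returned in its own SGPR
; through llvm.amdgcn.readfirstlane.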
26 define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
27 ; GFX7-LABEL: name: s_buffer_load_v2i32
28 ; GFX7: bb.1 (%ir-block.0):
29 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
31 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
32 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
33 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
34 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
35 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
36 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
37 ; GFX7-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s64), align 4)
38 ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>)
39 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
40 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
41 ; GFX7-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
42 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
43 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
44 ; GFX7-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
45 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
46 %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
50 define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
51 ; GFX7-LABEL: name: s_buffer_load_v3i32
52 ; GFX7: bb.1 (%ir-block.0):
53 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
55 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
56 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
57 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
58 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
59 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
60 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
61 ; GFX7-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s96), align 4)
62 ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>)
63 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
64 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
65 ; GFX7-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
66 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
67 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
68 ; GFX7-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
69 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
70 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
71 ; GFX7-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
72 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2
73 %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
77 define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
78 ; GFX7-LABEL: name: s_buffer_load_v8i32
79 ; GFX7: bb.1 (%ir-block.0):
80 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
82 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
83 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
84 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
85 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
86 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
87 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
88 ; GFX7-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s256), align 4)
89 ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>)
90 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
91 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
92 ; GFX7-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
93 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
94 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
95 ; GFX7-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
96 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
97 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
98 ; GFX7-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
99 ; GFX7-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
100 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32)
101 ; GFX7-NEXT: $sgpr3 = COPY [[INTRINSIC_CONVERGENT3]](s32)
102 ; GFX7-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
103 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
104 ; GFX7-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT4]](s32)
105 ; GFX7-NEXT: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
106 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32)
107 ; GFX7-NEXT: $sgpr5 = COPY [[INTRINSIC_CONVERGENT5]](s32)
108 ; GFX7-NEXT: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
109 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32)
110 ; GFX7-NEXT: $sgpr6 = COPY [[INTRINSIC_CONVERGENT6]](s32)
111 ; GFX7-NEXT: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32)
112 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
113 ; GFX7-NEXT: $sgpr7 = COPY [[INTRINSIC_CONVERGENT7]](s32)
114 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7
115 %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
119 define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
120 ; GFX7-LABEL: name: s_buffer_load_v16i32
121 ; GFX7: bb.1 (%ir-block.0):
122 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6
124 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
125 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
126 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
127 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
128 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
129 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
130 ; GFX7-NEXT: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load (s512), align 4)
131 ; GFX7-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>)
132 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
133 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
134 ; GFX7-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
135 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
136 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
137 ; GFX7-NEXT: $sgpr1 = COPY [[INTRINSIC_CONVERGENT1]](s32)
138 ; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
139 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
140 ; GFX7-NEXT: $sgpr2 = COPY [[INTRINSIC_CONVERGENT2]](s32)
141 ; GFX7-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
142 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32)
143 ; GFX7-NEXT: $sgpr3 = COPY [[INTRINSIC_CONVERGENT3]](s32)
144 ; GFX7-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
145 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
146 ; GFX7-NEXT: $sgpr4 = COPY [[INTRINSIC_CONVERGENT4]](s32)
147 ; GFX7-NEXT: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
148 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32)
149 ; GFX7-NEXT: $sgpr5 = COPY [[INTRINSIC_CONVERGENT5]](s32)
150 ; GFX7-NEXT: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
151 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32)
152 ; GFX7-NEXT: $sgpr6 = COPY [[INTRINSIC_CONVERGENT6]](s32)
153 ; GFX7-NEXT: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32)
154 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
155 ; GFX7-NEXT: $sgpr7 = COPY [[INTRINSIC_CONVERGENT7]](s32)
156 ; GFX7-NEXT: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[UV8]](s32)
157 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT8:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY13]](s32)
158 ; GFX7-NEXT: $sgpr8 = COPY [[INTRINSIC_CONVERGENT8]](s32)
159 ; GFX7-NEXT: [[COPY14:%[0-9]+]]:vgpr(s32) = COPY [[UV9]](s32)
160 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT9:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY14]](s32)
161 ; GFX7-NEXT: $sgpr9 = COPY [[INTRINSIC_CONVERGENT9]](s32)
162 ; GFX7-NEXT: [[COPY15:%[0-9]+]]:vgpr(s32) = COPY [[UV10]](s32)
163 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT10:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY15]](s32)
164 ; GFX7-NEXT: $sgpr10 = COPY [[INTRINSIC_CONVERGENT10]](s32)
165 ; GFX7-NEXT: [[COPY16:%[0-9]+]]:vgpr(s32) = COPY [[UV11]](s32)
166 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT11:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY16]](s32)
167 ; GFX7-NEXT: $sgpr11 = COPY [[INTRINSIC_CONVERGENT11]](s32)
168 ; GFX7-NEXT: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[UV12]](s32)
169 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT12:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY17]](s32)
170 ; GFX7-NEXT: $sgpr12 = COPY [[INTRINSIC_CONVERGENT12]](s32)
171 ; GFX7-NEXT: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[UV13]](s32)
172 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT13:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY18]](s32)
173 ; GFX7-NEXT: $sgpr13 = COPY [[INTRINSIC_CONVERGENT13]](s32)
174 ; GFX7-NEXT: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[UV14]](s32)
175 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT14:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY19]](s32)
176 ; GFX7-NEXT: $sgpr14 = COPY [[INTRINSIC_CONVERGENT14]](s32)
177 ; GFX7-NEXT: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[UV15]](s32)
178 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[COPY20]](s32)
179 ; GFX7-NEXT: $sgpr15 = COPY [[INTRINSIC_CONVERGENT15]](s32)
180 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15
181 %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
185 ; Check cases that need to be converted to MUBUF due to the offset being a VGPR.
186 define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
187 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset
188 ; GFX7: bb.1 (%ir-block.0):
189 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
191 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
192 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
193 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
194 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
195 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
196 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
197 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
198 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
199 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32))
200 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
201 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
202 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
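; With a VGPR offset the result lands on the VGPR bank, so vector results are returned
; directly in VGPRs.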
206 define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
207 ; GFX7-LABEL: name: s_buffer_load_v2f32_vgpr_offset
208 ; GFX7: bb.1 (%ir-block.0):
209 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
211 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
212 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
213 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
214 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
215 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
216 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
217 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
218 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
219 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s64), align 4)
220 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<2 x s32>)
221 ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
222 ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
223 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
224 %val = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
228 define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
229 ; GFX7-LABEL: name: s_buffer_load_v3f32_vgpr_offset
230 ; GFX7: bb.1 (%ir-block.0):
231 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
233 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
234 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
235 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
236 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
237 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
238 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
239 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
240 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
241 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
242 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>)
243 ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
244 ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
245 ; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32)
246 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
247 %val = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
251 define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
252 ; GFX7-LABEL: name: s_buffer_load_v4f32_vgpr_offset
253 ; GFX7: bb.1 (%ir-block.0):
254 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
256 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
257 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
258 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
259 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
260 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
261 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
262 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
263 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
264 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
265 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>)
266 ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
267 ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
268 ; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32)
269 ; GFX7-NEXT: $vgpr3 = COPY [[UV3]](s32)
270 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
271 %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
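; Results wider than 128 bits are split into multiple 128-bit buffer loads and concatenated.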
275 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
276 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_offset
277 ; GFX7: bb.1 (%ir-block.0):
278 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
280 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
281 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
282 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
283 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
284 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
285 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
286 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
287 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
288 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
289 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
290 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
291 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
292 ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
293 ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
294 ; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32)
295 ; GFX7-NEXT: $vgpr3 = COPY [[UV3]](s32)
296 ; GFX7-NEXT: $vgpr4 = COPY [[UV4]](s32)
297 ; GFX7-NEXT: $vgpr5 = COPY [[UV5]](s32)
298 ; GFX7-NEXT: $vgpr6 = COPY [[UV6]](s32)
299 ; GFX7-NEXT: $vgpr7 = COPY [[UV7]](s32)
300 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
301 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
305 define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
306 ; GFX7-LABEL: name: s_buffer_load_v16f32_vgpr_offset
307 ; GFX7: bb.1 (%ir-block.0):
308 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
310 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
311 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
312 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
313 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
314 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
315 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
316 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
317 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
318 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
319 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
320 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
321 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
322 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
323 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
324 ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
325 ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
326 ; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32)
327 ; GFX7-NEXT: $vgpr3 = COPY [[UV3]](s32)
328 ; GFX7-NEXT: $vgpr4 = COPY [[UV4]](s32)
329 ; GFX7-NEXT: $vgpr5 = COPY [[UV5]](s32)
330 ; GFX7-NEXT: $vgpr6 = COPY [[UV6]](s32)
331 ; GFX7-NEXT: $vgpr7 = COPY [[UV7]](s32)
332 ; GFX7-NEXT: $vgpr8 = COPY [[UV8]](s32)
333 ; GFX7-NEXT: $vgpr9 = COPY [[UV9]](s32)
334 ; GFX7-NEXT: $vgpr10 = COPY [[UV10]](s32)
335 ; GFX7-NEXT: $vgpr11 = COPY [[UV11]](s32)
336 ; GFX7-NEXT: $vgpr12 = COPY [[UV12]](s32)
337 ; GFX7-NEXT: $vgpr13 = COPY [[UV13]](s32)
338 ; GFX7-NEXT: $vgpr14 = COPY [[UV14]](s32)
339 ; GFX7-NEXT: $vgpr15 = COPY [[UV15]](s32)
340 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
341 %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
342 ret <16 x float> %val
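; Test an i96 scalar: the value is loaded as s128 and truncated to s96 before the store.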
345 define amdgpu_ps void @s_buffer_load_i96_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
346 ; GFX7-LABEL: name: s_buffer_load_i96_vgpr_offset
347 ; GFX7: bb.1 (%ir-block.0):
348 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
350 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
351 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
352 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
353 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
354 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
355 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
356 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
357 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
358 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
359 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
360 ; GFX7-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s96) = G_TRUNC [[AMDGPU_BUFFER_LOAD]](s128)
361 ; GFX7-NEXT: G_STORE [[TRUNC]](s96), [[DEF]](p1) :: (store (s96) into `ptr addrspace(1) undef`, align 8, addrspace 1)
362 ; GFX7-NEXT: S_ENDPGM 0
363 %val = call i96 @llvm.amdgcn.s.buffer.load.i96(<4 x i32> %rsrc, i32 %soffset, i32 0)
364 store i96 %val, ptr addrspace(1) undef
368 ; Test split of a wide scalar
369 define amdgpu_ps void @s_buffer_load_i256_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
370 ; GFX7-LABEL: name: s_buffer_load_i256_vgpr_offset
371 ; GFX7: bb.1 (%ir-block.0):
372 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
374 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
375 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
376 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
377 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
378 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
379 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
380 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
381 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
382 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
383 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
384 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
385 ; GFX7-NEXT: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128)
386 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s256)
387 ; GFX7-NEXT: G_STORE [[UV]](s128), [[DEF]](p1) :: (store (s128) into `ptr addrspace(1) undef`, align 8, addrspace 1)
388 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
389 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
390 ; GFX7-NEXT: G_STORE [[UV1]](s128), [[PTR_ADD]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 16, align 8, addrspace 1)
391 ; GFX7-NEXT: S_ENDPGM 0
392 %val = call i256 @llvm.amdgcn.s.buffer.load.i256(<4 x i32> %rsrc, i32 %soffset, i32 0)
393 store i256 %val, ptr addrspace(1) undef
397 ; Test split of a wide scalar
398 define amdgpu_ps void @s_buffer_load_i512_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
399 ; GFX7-LABEL: name: s_buffer_load_i512_vgpr_offset
400 ; GFX7: bb.1 (%ir-block.0):
401 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
403 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
404 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
405 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
406 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
407 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
408 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
409 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
410 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
411 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
412 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
413 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
414 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
415 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(s128) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
416 ; GFX7-NEXT: [[MV:%[0-9]+]]:vgpr(s512) = G_MERGE_VALUES [[AMDGPU_BUFFER_LOAD]](s128), [[AMDGPU_BUFFER_LOAD1]](s128), [[AMDGPU_BUFFER_LOAD2]](s128), [[AMDGPU_BUFFER_LOAD3]](s128)
417 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s128), [[UV1:%[0-9]+]]:vgpr(s128), [[UV2:%[0-9]+]]:vgpr(s128), [[UV3:%[0-9]+]]:vgpr(s128) = G_UNMERGE_VALUES [[MV]](s512)
418 ; GFX7-NEXT: G_STORE [[UV]](s128), [[DEF]](p1) :: (store (s128) into `ptr addrspace(1) undef`, align 8, addrspace 1)
419 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
420 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
421 ; GFX7-NEXT: G_STORE [[UV1]](s128), [[PTR_ADD]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 16, align 8, addrspace 1)
422 ; GFX7-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
423 ; GFX7-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
424 ; GFX7-NEXT: G_STORE [[UV2]](s128), [[PTR_ADD1]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 32, align 8, addrspace 1)
425 ; GFX7-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
426 ; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
427 ; GFX7-NEXT: G_STORE [[UV3]](s128), [[PTR_ADD2]](p1) :: (store (s128) into `ptr addrspace(1) undef` + 48, align 8, addrspace 1)
428 ; GFX7-NEXT: S_ENDPGM 0
429 %val = call i512 @llvm.amdgcn.s.buffer.load.i512(<4 x i32> %rsrc, i32 %soffset, i32 0)
430 store i512 %val, ptr addrspace(1) undef
434 ; Test split of a vector with 16-bit elements
435 define amdgpu_ps void @s_buffer_load_v16i16_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
436 ; GFX7-LABEL: name: s_buffer_load_v16i16_vgpr_offset
437 ; GFX7: bb.1 (%ir-block.0):
438 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
440 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
441 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
442 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
443 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
444 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
445 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
446 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
447 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
448 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
449 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
450 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
451 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>)
452 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s16>)
453 ; GFX7-NEXT: G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef`, align 32, addrspace 1)
454 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
455 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
456 ; GFX7-NEXT: G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1)
457 ; GFX7-NEXT: S_ENDPGM 0
458 %val = call <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32> %rsrc, i32 %soffset, i32 0)
459 store <16 x i16> %val, ptr addrspace(1) undef
463 ; Test split of a vector with 16-bit elements
464 define amdgpu_ps void @s_buffer_load_v32i16_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
465 ; GFX7-LABEL: name: s_buffer_load_v32i16_vgpr_offset
466 ; GFX7: bb.1 (%ir-block.0):
467 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
469 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
470 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
471 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
472 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
473 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
474 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
475 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
476 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
477 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
478 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
479 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
480 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
481 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<8 x s16>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
482 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<32 x s16>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<8 x s16>), [[AMDGPU_BUFFER_LOAD1]](<8 x s16>), [[AMDGPU_BUFFER_LOAD2]](<8 x s16>), [[AMDGPU_BUFFER_LOAD3]](<8 x s16>)
483 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(<8 x s16>), [[UV1:%[0-9]+]]:vgpr(<8 x s16>), [[UV2:%[0-9]+]]:vgpr(<8 x s16>), [[UV3:%[0-9]+]]:vgpr(<8 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<32 x s16>)
484 ; GFX7-NEXT: G_STORE [[UV]](<8 x s16>), [[DEF]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef`, align 64, addrspace 1)
485 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
486 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
487 ; GFX7-NEXT: G_STORE [[UV1]](<8 x s16>), [[PTR_ADD]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 16, basealign 64, addrspace 1)
488 ; GFX7-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
489 ; GFX7-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
490 ; GFX7-NEXT: G_STORE [[UV2]](<8 x s16>), [[PTR_ADD1]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 32, align 32, basealign 64, addrspace 1)
491 ; GFX7-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
492 ; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
493 ; GFX7-NEXT: G_STORE [[UV3]](<8 x s16>), [[PTR_ADD2]](p1) :: (store (<8 x s16>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1)
494 ; GFX7-NEXT: S_ENDPGM 0
495 %val = call <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32> %rsrc, i32 %soffset, i32 0)
496 store <32 x i16> %val, ptr addrspace(1) undef
500 ; Test split of a vector with 64-bit elements
501 define amdgpu_ps void @s_buffer_load_v4i64_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
502 ; GFX7-LABEL: name: s_buffer_load_v4i64_vgpr_offset
503 ; GFX7: bb.1 (%ir-block.0):
504 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
506 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
507 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
508 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
509 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
510 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
511 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
512 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
513 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
514 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
515 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
516 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
517 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>)
518 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s64>)
519 ; GFX7-NEXT: G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef`, align 32, addrspace 1)
520 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
521 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
522 ; GFX7-NEXT: G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1)
523 ; GFX7-NEXT: S_ENDPGM 0
524 %val = call <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32> %rsrc, i32 %soffset, i32 0)
525 store <4 x i64> %val, ptr addrspace(1) undef
529 ; Test split of a vector with 64-bit elements
530 define amdgpu_ps void @s_buffer_load_v8i64_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
531 ; GFX7-LABEL: name: s_buffer_load_v8i64_vgpr_offset
532 ; GFX7: bb.1 (%ir-block.0):
533 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
535 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
536 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
537 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
538 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
539 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
540 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
541 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
542 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
543 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
544 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
545 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
546 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
547 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
548 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x s64>), [[AMDGPU_BUFFER_LOAD1]](<2 x s64>), [[AMDGPU_BUFFER_LOAD2]](<2 x s64>), [[AMDGPU_BUFFER_LOAD3]](<2 x s64>)
549 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s64>), [[UV1:%[0-9]+]]:vgpr(<2 x s64>), [[UV2:%[0-9]+]]:vgpr(<2 x s64>), [[UV3:%[0-9]+]]:vgpr(<2 x s64>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s64>)
550 ; GFX7-NEXT: G_STORE [[UV]](<2 x s64>), [[DEF]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef`, align 64, addrspace 1)
551 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
552 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
553 ; GFX7-NEXT: G_STORE [[UV1]](<2 x s64>), [[PTR_ADD]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 16, basealign 64, addrspace 1)
554 ; GFX7-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
555 ; GFX7-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
556 ; GFX7-NEXT: G_STORE [[UV2]](<2 x s64>), [[PTR_ADD1]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 32, align 32, basealign 64, addrspace 1)
557 ; GFX7-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
558 ; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
559 ; GFX7-NEXT: G_STORE [[UV3]](<2 x s64>), [[PTR_ADD2]](p1) :: (store (<2 x s64>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1)
560 ; GFX7-NEXT: S_ENDPGM 0
561 %val = call <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32> %rsrc, i32 %soffset, i32 0)
562 store <8 x i64> %val, ptr addrspace(1) undef
566 ; Test split of a vector with 64-bit pointer elements
567 define amdgpu_ps void @s_buffer_load_v4p1_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
568 ; GFX7-LABEL: name: s_buffer_load_v4p1_vgpr_offset
569 ; GFX7: bb.1 (%ir-block.0):
570 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
572 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
573 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
574 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
575 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
576 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
577 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
578 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
579 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
580 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
581 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
582 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
583 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>)
584 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x p1>)
585 ; GFX7-NEXT: G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef`, align 32, addrspace 1)
586 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
587 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
588 ; GFX7-NEXT: G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 16, basealign 32, addrspace 1)
589 ; GFX7-NEXT: S_ENDPGM 0
590 %val = call <4 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v4p1(<4 x i32> %rsrc, i32 %soffset, i32 0)
591 store <4 x ptr addrspace(1)> %val, ptr addrspace(1) undef
595 ; Test split of a vector with 64-bit pointer elements
596 define amdgpu_ps void @s_buffer_load_v8p1_vgpr_offset(<4 x i32> inreg %rsrc, i32 %soffset) {
597 ; GFX7-LABEL: name: s_buffer_load_v8p1_vgpr_offset
598 ; GFX7: bb.1 (%ir-block.0):
599 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
601 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
602 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
603 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
604 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
605 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
606 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
607 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
608 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
609 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
610 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
611 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
612 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
613 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<2 x p1>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
614 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x p1>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<2 x p1>), [[AMDGPU_BUFFER_LOAD1]](<2 x p1>), [[AMDGPU_BUFFER_LOAD2]](<2 x p1>), [[AMDGPU_BUFFER_LOAD3]](<2 x p1>)
615 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x p1>), [[UV1:%[0-9]+]]:vgpr(<2 x p1>), [[UV2:%[0-9]+]]:vgpr(<2 x p1>), [[UV3:%[0-9]+]]:vgpr(<2 x p1>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x p1>)
616 ; GFX7-NEXT: G_STORE [[UV]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef`, align 64, addrspace 1)
617 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 16
618 ; GFX7-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C2]](s64)
619 ; GFX7-NEXT: G_STORE [[UV1]](<2 x p1>), [[PTR_ADD]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 16, basealign 64, addrspace 1)
620 ; GFX7-NEXT: [[C3:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 32
621 ; GFX7-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C3]](s64)
622 ; GFX7-NEXT: G_STORE [[UV2]](<2 x p1>), [[PTR_ADD1]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 32, align 32, basealign 64, addrspace 1)
623 ; GFX7-NEXT: [[C4:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 48
624 ; GFX7-NEXT: [[PTR_ADD2:%[0-9]+]]:sgpr(p1) = G_PTR_ADD [[DEF]], [[C4]](s64)
625 ; GFX7-NEXT: G_STORE [[UV3]](<2 x p1>), [[PTR_ADD2]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef` + 48, basealign 64, addrspace 1)
626 ; GFX7-NEXT: S_ENDPGM 0
627 %val = call <8 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v8p1(<4 x i32> %rsrc, i32 %soffset, i32 0)
628 store <8 x ptr addrspace(1)> %val, ptr addrspace(1) undef
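; Test constant offsets added to the VGPR offset. Offsets that fit the MUBUF immediate
; field (4092, 4095) are folded into the instruction; 4096 is not.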
632 define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg %rsrc, i32 %soffset.base) {
633 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092
634 ; GFX7: bb.1 (%ir-block.0):
635 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
637 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
638 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
639 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
640 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
641 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
642 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
643 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092
644 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
645 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
646 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
647 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
648 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4092, 0, 0 :: (dereferenceable invariant load (s32))
649 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
650 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
651 %soffset = add i32 %soffset.base, 4092
652 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
656 define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg %rsrc, i32 %soffset.base) {
657 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095
658 ; GFX7: bb.1 (%ir-block.0):
659 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
661 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
662 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
663 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
664 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
665 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
666 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
667 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
668 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
669 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
670 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
671 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
672 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4095, 0, 0 :: (dereferenceable invariant load (s32))
673 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
674 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
675 %soffset = add i32 %soffset.base, 4095
676 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
680 define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg %rsrc, i32 %soffset.base) {
681 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096
682 ; GFX7: bb.1 (%ir-block.0):
683 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
685 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
686 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
687 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
688 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
689 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
690 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
691 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
692 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
693 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
694 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
695 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32))
696 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
697 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
698 %soffset = add i32 %soffset.base, 4096
699 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
703 ; Make sure the base offset is added to each split load.
704 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32> inreg %rsrc, i32 %soffset.base) {
705 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4064
706 ; GFX7: bb.1 (%ir-block.0):
707 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
709 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
710 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
711 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
712 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
713 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
714 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
715 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
716 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
717 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
718 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
719 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
720 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4)
721 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4)
722 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
723 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
724 ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
725 ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
726 ; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32)
727 ; GFX7-NEXT: $vgpr3 = COPY [[UV3]](s32)
728 ; GFX7-NEXT: $vgpr4 = COPY [[UV4]](s32)
729 ; GFX7-NEXT: $vgpr5 = COPY [[UV5]](s32)
730 ; GFX7-NEXT: $vgpr6 = COPY [[UV6]](s32)
731 ; GFX7-NEXT: $vgpr7 = COPY [[UV7]](s32)
732 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
733 %soffset = add i32 %soffset.base, 4064
734 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
735 ret <8 x float> %val
736 }
739 ; Make sure the maximum offset isn't exceeded when splitting this load.
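; Expected: the 4068 constant is kept in the soffset operand and the two halves use immediate offsets 0 and 16.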
739 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32> inreg %rsrc, i32 %soffset.base) {
740 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_offset_add_4068
741 ; GFX7: bb.1 (%ir-block.0):
742 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
744 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
745 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
746 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
747 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
748 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
749 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
750 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068
751 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
752 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
753 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
754 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
755 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
756 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
757 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
758 ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
759 ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
760 ; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32)
761 ; GFX7-NEXT: $vgpr3 = COPY [[UV3]](s32)
762 ; GFX7-NEXT: $vgpr4 = COPY [[UV4]](s32)
763 ; GFX7-NEXT: $vgpr5 = COPY [[UV5]](s32)
764 ; GFX7-NEXT: $vgpr6 = COPY [[UV6]](s32)
765 ; GFX7-NEXT: $vgpr7 = COPY [[UV7]](s32)
766 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
767 %soffset = add i32 %soffset.base, 4068
768 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
769 ret <8 x float> %val
770 }
772 define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i32> inreg %rsrc, i32 %soffset.base) {
773 ; GFX7-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4032
774 ; GFX7: bb.1 (%ir-block.0):
775 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
777 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
778 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
779 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
780 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
781 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
782 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
783 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4032
784 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
785 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
786 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
787 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
788 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4032, 0, 0 :: (dereferenceable invariant load (s128), align 4)
789 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4048, 0, 0 :: (dereferenceable invariant load (s128), align 4)
790 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
791 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
792 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
793 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
794 ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
795 ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
796 ; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32)
797 ; GFX7-NEXT: $vgpr3 = COPY [[UV3]](s32)
798 ; GFX7-NEXT: $vgpr4 = COPY [[UV4]](s32)
799 ; GFX7-NEXT: $vgpr5 = COPY [[UV5]](s32)
800 ; GFX7-NEXT: $vgpr6 = COPY [[UV6]](s32)
801 ; GFX7-NEXT: $vgpr7 = COPY [[UV7]](s32)
802 ; GFX7-NEXT: $vgpr8 = COPY [[UV8]](s32)
803 ; GFX7-NEXT: $vgpr9 = COPY [[UV9]](s32)
804 ; GFX7-NEXT: $vgpr10 = COPY [[UV10]](s32)
805 ; GFX7-NEXT: $vgpr11 = COPY [[UV11]](s32)
806 ; GFX7-NEXT: $vgpr12 = COPY [[UV12]](s32)
807 ; GFX7-NEXT: $vgpr13 = COPY [[UV13]](s32)
808 ; GFX7-NEXT: $vgpr14 = COPY [[UV14]](s32)
809 ; GFX7-NEXT: $vgpr15 = COPY [[UV15]](s32)
810 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
811 %soffset = add i32 %soffset.base, 4032
812 %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
813 ret <16 x float> %val
814 }
816 define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i32> inreg %rsrc, i32 %soffset.base) {
817 ; GFX7-LABEL: name: s_buffer_load_v16f32_vgpr_offset_add_4036
818 ; GFX7: bb.1 (%ir-block.0):
819 ; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0
821 ; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
822 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
823 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
824 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
825 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
826 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
827 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4036
828 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
829 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
830 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
831 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
832 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
833 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 32, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 16, align 4)
834 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 48, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 48, align 4)
835 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>), [[AMDGPU_BUFFER_LOAD2]](<4 x s32>), [[AMDGPU_BUFFER_LOAD3]](<4 x s32>)
836 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s32>)
837 ; GFX7-NEXT: $vgpr0 = COPY [[UV]](s32)
838 ; GFX7-NEXT: $vgpr1 = COPY [[UV1]](s32)
839 ; GFX7-NEXT: $vgpr2 = COPY [[UV2]](s32)
840 ; GFX7-NEXT: $vgpr3 = COPY [[UV3]](s32)
841 ; GFX7-NEXT: $vgpr4 = COPY [[UV4]](s32)
842 ; GFX7-NEXT: $vgpr5 = COPY [[UV5]](s32)
843 ; GFX7-NEXT: $vgpr6 = COPY [[UV6]](s32)
844 ; GFX7-NEXT: $vgpr7 = COPY [[UV7]](s32)
845 ; GFX7-NEXT: $vgpr8 = COPY [[UV8]](s32)
846 ; GFX7-NEXT: $vgpr9 = COPY [[UV9]](s32)
847 ; GFX7-NEXT: $vgpr10 = COPY [[UV10]](s32)
848 ; GFX7-NEXT: $vgpr11 = COPY [[UV11]](s32)
849 ; GFX7-NEXT: $vgpr12 = COPY [[UV12]](s32)
850 ; GFX7-NEXT: $vgpr13 = COPY [[UV13]](s32)
851 ; GFX7-NEXT: $vgpr14 = COPY [[UV14]](s32)
852 ; GFX7-NEXT: $vgpr15 = COPY [[UV15]](s32)
853 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
854 %soffset = add i32 %soffset.base, 4036
855 %val = call <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
856 ret <16 x float> %val
857 }
859 ; Waterfall loop due to resource being VGPR
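; Expected: each element of the resource is uniformized with V_READFIRSTLANE_B32, rebuilt as an SGPR descriptor, and the load executes under a ballot/S_AND_SAVEEXEC_B64 waterfall loop.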
860 define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %soffset) {
861 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_rsrc
862 ; GFX7: bb.1 (%ir-block.0):
863 ; GFX7-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
865 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
866 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
867 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
868 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
869 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
870 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
871 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32)
872 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
873 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
874 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
875 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
878 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3
879 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
880 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
881 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
882 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
883 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
884 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
885 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
886 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
887 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
888 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
889 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
890 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
891 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
894 ; GFX7-NEXT: successors: %bb.4, %bb.2
896 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY5]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s32))
897 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
898 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
901 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
904 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
905 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
906 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
907 ret float %val
908 }
910 ; Use the offset inside the waterfall loop
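; Expected: the 4092 constant folds into the load's immediate offset inside the loop, with the SGPR base in the soffset operand.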
911 define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %rsrc, i32 inreg %soffset.base) {
912 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4092
913 ; GFX7: bb.1 (%ir-block.0):
914 ; GFX7-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
916 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
917 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
918 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
919 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
920 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
921 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
922 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092
923 ; GFX7-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
924 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
925 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
926 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
927 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
930 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %16, %bb.3
931 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
932 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
933 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
934 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
935 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
936 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
937 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
938 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
939 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
940 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
941 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
942 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
943 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
946 ; GFX7-NEXT: successors: %bb.4, %bb.2
948 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4092, 0, 0 :: (dereferenceable invariant load (s32))
949 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
950 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
953 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
956 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
957 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
958 %soffset = add i32 %soffset.base, 4092
959 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
960 ret float %val
961 }
963 ; Scalar offset exceeds the MUBUF limit, so keep the add out of the loop
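; Expected: the sum is computed by a single G_ADD before the loop and passed through a VGPR as the voffset, since 4096 does not fit in the MUBUF immediate field.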
964 define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %rsrc, i32 inreg %soffset.base) {
965 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_rsrc_soffset_add_4096
966 ; GFX7: bb.1 (%ir-block.0):
967 ; GFX7-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
969 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
970 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
971 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
972 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
973 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
974 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
975 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
976 ; GFX7-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
977 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
978 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
979 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
980 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
981 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
984 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %17, %bb.3
985 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
986 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
987 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
988 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
989 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
990 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
991 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
992 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
993 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
994 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
995 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
996 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
997 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
1000 ; GFX7-NEXT: successors: %bb.4, %bb.2
1002 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32))
1003 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1004 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
1007 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
1010 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1011 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
1012 %soffset = add i32 %soffset.base, 4096
1013 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1014 ret float %val
1015 }
1017 ; Waterfall loop, but constant offset
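; Expected: the constant 4095 still fits the MUBUF immediate field, so it becomes the instruction's immediate offset inside the loop.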
1018 define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc) {
1019 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4095
1020 ; GFX7: bb.1 (%ir-block.0):
1021 ; GFX7-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
1023 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1024 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1025 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1026 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1027 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1028 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
1029 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1030 ; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1031 ; GFX7-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1032 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
1033 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
1036 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3
1037 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1038 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
1039 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
1040 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
1041 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
1042 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1043 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1044 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
1045 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
1046 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
1047 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
1048 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
1049 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
1052 ; GFX7-NEXT: successors: %bb.4, %bb.2
1054 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4095, 0, 0 :: (dereferenceable invariant load (s32) from unknown-address + 4095, align 1)
1055 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1056 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
1059 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
1062 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1063 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
1064 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 0)
1065 ret float %val
1066 }
1068 ; Waterfall loop, but constant offset
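; Expected: 4096 exceeds the immediate field, so the constant is materialized in a VGPR and used as the voffset.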
1069 define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc) {
1070 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_rsrc_offset_4096
1071 ; GFX7: bb.1 (%ir-block.0):
1072 ; GFX7-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
1074 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1075 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1076 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1077 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1078 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1079 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
1080 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1081 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1082 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1083 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
1084 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
1087 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %15, %bb.3
1088 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1089 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
1090 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
1091 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
1092 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
1093 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1094 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1095 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
1096 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
1097 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
1098 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
1099 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
1100 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
1103 ; GFX7-NEXT: successors: %bb.4, %bb.2
1105 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY4]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s32))
1106 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1107 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
1110 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
1113 ; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
1114 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
1115 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 0)
1116 ret float %val
1117 }
1119 ; Need a waterfall loop, but the offset is scalar.
1120 ; Make sure the base offset is added to each split load.
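; Expected: the split halves keep immediate offsets 4064 and 4080 inside the loop, with the scalar base in the soffset operand.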
1121 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %rsrc, i32 inreg %soffset.base) {
1122 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4064
1123 ; GFX7: bb.1 (%ir-block.0):
1124 ; GFX7-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
1126 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1127 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1128 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1129 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1130 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1131 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1132 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
1133 ; GFX7-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
1134 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1135 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1136 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
1137 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
1140 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
1141 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1142 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
1143 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
1144 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
1145 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
1146 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1147 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1148 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
1149 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
1150 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
1151 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
1152 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
1153 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
1156 ; GFX7-NEXT: successors: %bb.4, %bb.2
1158 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4064, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1159 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[C1]], [[COPY4]], 4080, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1160 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1161 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
1164 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
1167 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
1168 ; GFX7-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
1169 ; GFX7-NEXT: $vgpr0 = COPY [[UV8]](s32)
1170 ; GFX7-NEXT: $vgpr1 = COPY [[UV9]](s32)
1171 ; GFX7-NEXT: $vgpr2 = COPY [[UV10]](s32)
1172 ; GFX7-NEXT: $vgpr3 = COPY [[UV11]](s32)
1173 ; GFX7-NEXT: $vgpr4 = COPY [[UV12]](s32)
1174 ; GFX7-NEXT: $vgpr5 = COPY [[UV13]](s32)
1175 ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
1176 ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
1177 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
1178 %soffset = add i32 %soffset.base, 4064
1179 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1180 ret <8 x float> %val
1181 }
1183 ; Need a waterfall loop, but the offset is scalar.
1184 ; Make sure the maximum offset isn't exceeded when splitting this load.
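; Expected: the added offset is moved to a VGPR and used as the voffset, and the halves use immediate offsets 0 and 16.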
1185 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %rsrc, i32 inreg %soffset.base) {
1186 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4068
1187 ; GFX7: bb.1 (%ir-block.0):
1188 ; GFX7-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
1190 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1191 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1192 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1193 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1194 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1195 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1196 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068
1197 ; GFX7-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
1198 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
1199 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1200 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1201 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
1202 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
1205 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3
1206 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1207 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
1208 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
1209 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
1210 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
1211 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1212 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1213 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
1214 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
1215 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
1216 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
1217 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
1218 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
1221 ; GFX7-NEXT: successors: %bb.4, %bb.2
1223 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1224 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1225 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1226 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
1229 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
1232 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
1233 ; GFX7-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
1234 ; GFX7-NEXT: $vgpr0 = COPY [[UV8]](s32)
1235 ; GFX7-NEXT: $vgpr1 = COPY [[UV9]](s32)
1236 ; GFX7-NEXT: $vgpr2 = COPY [[UV10]](s32)
1237 ; GFX7-NEXT: $vgpr3 = COPY [[UV11]](s32)
1238 ; GFX7-NEXT: $vgpr4 = COPY [[UV12]](s32)
1239 ; GFX7-NEXT: $vgpr5 = COPY [[UV13]](s32)
1240 ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
1241 ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
1242 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
1243 %soffset = add i32 %soffset.base, 4068
1244 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1245 ret <8 x float> %val
1246 }
1248 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %rsrc, i32 inreg %soffset.base) {
1249 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_rsrc_add_4096
1250 ; GFX7: bb.1 (%ir-block.0):
1251 ; GFX7-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3
1253 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1254 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1255 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1256 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1257 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1258 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
1259 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
1260 ; GFX7-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
1261 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
1262 ; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
1263 ; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1264 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
1265 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
1268 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %27, %bb.3
1269 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1270 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
1271 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
1272 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
1273 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
1274 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1275 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1276 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
1277 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
1278 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
1279 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
1280 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
1281 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
1284 ; GFX7-NEXT: successors: %bb.4, %bb.2
1286 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1287 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C2]](s32), [[COPY5]], [[C1]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1288 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1289 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
1292 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
1295 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
1296 ; GFX7-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
1297 ; GFX7-NEXT: $vgpr0 = COPY [[UV8]](s32)
1298 ; GFX7-NEXT: $vgpr1 = COPY [[UV9]](s32)
1299 ; GFX7-NEXT: $vgpr2 = COPY [[UV10]](s32)
1300 ; GFX7-NEXT: $vgpr3 = COPY [[UV11]](s32)
1301 ; GFX7-NEXT: $vgpr4 = COPY [[UV12]](s32)
1302 ; GFX7-NEXT: $vgpr5 = COPY [[UV13]](s32)
1303 ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
1304 ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
1305 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
1306 %soffset = add i32 %soffset.base, 4096
1307 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1308 ret <8 x float> %val
1309 }
1311 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000(<4 x i32> %rsrc, i32 %offset.base) {
1312 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
1313 ; GFX7: bb.1 (%ir-block.0):
1314 ; GFX7-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
1316 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1317 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1318 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1319 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1320 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1321 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
1322 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5000
1323 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1324 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1325 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1326 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
1327 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
1330 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
1331 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1332 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
1333 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
1334 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
1335 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
1336 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1337 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1338 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
1339 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
1340 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
1341 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
1342 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
1343 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
1346 ; GFX7-NEXT: successors: %bb.4, %bb.2
1348 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1349 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1350 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1351 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
1354 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
1357 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
1358 ; GFX7-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
1359 ; GFX7-NEXT: $vgpr0 = COPY [[UV8]](s32)
1360 ; GFX7-NEXT: $vgpr1 = COPY [[UV9]](s32)
1361 ; GFX7-NEXT: $vgpr2 = COPY [[UV10]](s32)
1362 ; GFX7-NEXT: $vgpr3 = COPY [[UV11]](s32)
1363 ; GFX7-NEXT: $vgpr4 = COPY [[UV12]](s32)
1364 ; GFX7-NEXT: $vgpr5 = COPY [[UV13]](s32)
1365 ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
1366 ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
1367 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
1368 %soffset = add i32 %offset.base, 5000
1369 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1370 ret <8 x float> %val
1371 }
1373 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076(<4 x i32> %rsrc, i32 %offset.base) {
1374 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
1375 ; GFX7: bb.1 (%ir-block.0):
1376 ; GFX7-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
1378 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1379 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1380 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1381 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1382 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1383 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
1384 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4076
1385 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1386 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1387 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1388 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
1389 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
1392 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
1393 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1394 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
1395 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
1396 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
1397 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
1398 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1399 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1400 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
1401 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
1402 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
1403 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
1404 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
1405 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
1408 ; GFX7-NEXT: successors: %bb.4, %bb.2
1410 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1411 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1412 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1413 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
1416 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
1419 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
1420 ; GFX7-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
1421 ; GFX7-NEXT: $vgpr0 = COPY [[UV8]](s32)
1422 ; GFX7-NEXT: $vgpr1 = COPY [[UV9]](s32)
1423 ; GFX7-NEXT: $vgpr2 = COPY [[UV10]](s32)
1424 ; GFX7-NEXT: $vgpr3 = COPY [[UV11]](s32)
1425 ; GFX7-NEXT: $vgpr4 = COPY [[UV12]](s32)
1426 ; GFX7-NEXT: $vgpr5 = COPY [[UV13]](s32)
1427 ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
1428 ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
1429 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
1430 %soffset = add i32 %offset.base, 4076
1431 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1432 ret <8 x float> %val
1433 }
1435 define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080(<4 x i32> %rsrc, i32 %offset.base) {
1436 ; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
1437 ; GFX7: bb.1 (%ir-block.0):
1438 ; GFX7-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
1440 ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
1441 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
1442 ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
1443 ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
1444 ; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
1445 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
1446 ; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4080
1447 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
1448 ; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
1449 ; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
1450 ; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
1451 ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
1454 ; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
1455 ; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1456 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
1457 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
1458 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
1459 ; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
1460 ; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
1461 ; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
1462 ; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
1463 ; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
1464 ; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
1465 ; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
1466 ; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
1467 ; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
1470 ; GFX7-NEXT: successors: %bb.4, %bb.2
1472 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1473 ; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 16, 0, 0 :: (dereferenceable invariant load (s128), align 4)
1474 ; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
1475 ; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
1478 ; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
1481 ; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
1482 ; GFX7-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
1483 ; GFX7-NEXT: $vgpr0 = COPY [[UV8]](s32)
1484 ; GFX7-NEXT: $vgpr1 = COPY [[UV9]](s32)
1485 ; GFX7-NEXT: $vgpr2 = COPY [[UV10]](s32)
1486 ; GFX7-NEXT: $vgpr3 = COPY [[UV11]](s32)
1487 ; GFX7-NEXT: $vgpr4 = COPY [[UV12]](s32)
1488 ; GFX7-NEXT: $vgpr5 = COPY [[UV13]](s32)
1489 ; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
1490 ; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
1491 ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
1492 %soffset = add i32 %offset.base, 4080
1493 %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 %soffset, i32 0)
1494 ret <8 x float> %val
1495 }
define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064(<4 x i32> %rsrc, i32 %offset.base) {
; GFX7-LABEL: name: s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4064
; GFX7: bb.1 (%ir-block.0):
; GFX7-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
; GFX7-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; GFX7-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
; GFX7-NEXT: [[C3:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; GFX7-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX7-NEXT: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %26, %bb.3
; GFX7-NEXT: [[UV:%[0-9]+]]:vgpr_32(s32), [[UV1:%[0-9]+]]:vgpr_32(s32), [[UV2:%[0-9]+]]:vgpr_32(s32), [[UV3:%[0-9]+]]:vgpr_32(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX7-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV]](s32), implicit $exec
; GFX7-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV1]](s32), implicit $exec
; GFX7-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV2]](s32), implicit $exec
; GFX7-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32(s32) = V_READFIRSTLANE_B32 [[UV3]](s32), implicit $exec
; GFX7-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
; GFX7-NEXT: [[UV4:%[0-9]+]]:vgpr(s64), [[UV5:%[0-9]+]]:vgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
; GFX7-NEXT: [[UV6:%[0-9]+]]:sgpr(s64), [[UV7:%[0-9]+]]:sgpr(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>)
; GFX7-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV6]](s64), [[UV4]]
; GFX7-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV7]](s64), [[UV5]]
; GFX7-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]]
; GFX7-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_64_xexec(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AND]](s1)
; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[INTRINSIC_CONVERGENT]](s64), implicit-def $exec, implicit-def $scc, implicit $exec
; GFX7-NEXT: successors: %bb.4, %bb.2
; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4064, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4)
; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR1]](<4 x s32>), [[C3]](s32), [[C1]], [[C2]], 4080, 0, 0 :: (dereferenceable invariant load (s128) from unknown-address + 4064, align 4)
; GFX7-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; GFX7-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
; GFX7-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]]
; GFX7-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[AMDGPU_BUFFER_LOAD]](<4 x s32>), [[AMDGPU_BUFFER_LOAD1]](<4 x s32>)
; GFX7-NEXT: [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s32>)
; GFX7-NEXT: $vgpr0 = COPY [[UV8]](s32)
; GFX7-NEXT: $vgpr1 = COPY [[UV9]](s32)
; GFX7-NEXT: $vgpr2 = COPY [[UV10]](s32)
; GFX7-NEXT: $vgpr3 = COPY [[UV11]](s32)
; GFX7-NEXT: $vgpr4 = COPY [[UV12]](s32)
; GFX7-NEXT: $vgpr5 = COPY [[UV13]](s32)
; GFX7-NEXT: $vgpr6 = COPY [[UV14]](s32)
; GFX7-NEXT: $vgpr7 = COPY [[UV15]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
  %val = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc, i32 4064, i32 0)
  ret <8 x float> %val
}

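; With a VGPR + SGPR offset the add result is not needed by the load itself: the VGPR
; term is used as the voffset operand and the SGPR term as the soffset operand, so no
; waterfall loop is required.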
define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
; GFX7-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr
; GFX7: bb.1 (%ir-block.0):
; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; GFX7-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32))
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %offset = add i32 %offset.v, %offset.s
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret float %val
}

define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
; GFX7-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr
; GFX7: bb.1 (%ir-block.0):
; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; GFX7-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
; GFX7-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32))
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %offset = add i32 %offset.s, %offset.v
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret float %val
}

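; When a constant is added on top of the VGPR + SGPR sum, the 1024 is folded into the
; instruction's immediate offset and the VGPR sum is kept as the register offset.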
define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
; GFX7-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm
; GFX7: bb.1 (%ir-block.0):
; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; GFX7-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; GFX7-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %offset.base = add i32 %offset.v, %offset.s
  %offset = add i32 %offset.base, 1024
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret float %val
}

define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
; GFX7-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm
; GFX7: bb.1 (%ir-block.0):
; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; GFX7-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; GFX7-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %offset.base = add i32 %offset.s, %offset.v
  %offset = add i32 %offset.base, 1024
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret float %val
}

; TODO: Ideally this would be reassociated to fold.
define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
; GFX7-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr
; GFX7: bb.1 (%ir-block.0):
; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; GFX7-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; GFX7-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %offset.base = add i32 %offset.s, 1024
  %offset = add i32 %offset.base, %offset.v
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret float %val
}

define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inreg %rsrc, i32 %offset.v, i32 inreg %offset.s) {
; GFX7-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr
; GFX7: bb.1 (%ir-block.0):
; GFX7-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
; GFX7-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
; GFX7-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
; GFX7-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
; GFX7-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
; GFX7-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; GFX7-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
; GFX7-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
; GFX7-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
; GFX7-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; GFX7-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]]
; GFX7-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
; GFX7-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
; GFX7-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32))
; GFX7-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
  %offset.base = add i32 %offset.v, 1024
  %offset = add i32 %offset.base, %offset.s
  %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %offset, i32 0)
  ret float %val
}

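; Declarations for the llvm.amdgcn.s.buffer.load overloads exercised by the tests above.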
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg)
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg)
declare <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32>, i32, i32 immarg)
declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32 immarg)
declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32 immarg)
declare <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32>, i32, i32 immarg)

declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg)
declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32 immarg)
declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32>, i32, i32 immarg)
declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32 immarg)
declare <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32>, i32, i32 immarg)
declare <16 x float> @llvm.amdgcn.s.buffer.load.v16f32(<4 x i32>, i32, i32 immarg)

declare i96 @llvm.amdgcn.s.buffer.load.i96(<4 x i32>, i32, i32 immarg)
declare i256 @llvm.amdgcn.s.buffer.load.i256(<4 x i32>, i32, i32 immarg)
declare i512 @llvm.amdgcn.s.buffer.load.i512(<4 x i32>, i32, i32 immarg)

declare <16 x i16> @llvm.amdgcn.s.buffer.load.v16i16(<4 x i32>, i32, i32 immarg)
declare <32 x i16> @llvm.amdgcn.s.buffer.load.v32i16(<4 x i32>, i32, i32 immarg)

declare <4 x i64> @llvm.amdgcn.s.buffer.load.v4i64(<4 x i32>, i32, i32 immarg)
declare <8 x i64> @llvm.amdgcn.s.buffer.load.v8i64(<4 x i32>, i32, i32 immarg)

declare <4 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v4p1(<4 x i32>, i32, i32 immarg)
declare <8 x ptr addrspace(1)> @llvm.amdgcn.s.buffer.load.v8p1(<4 x i32>, i32, i32 immarg)