Bump version to 19.1.0-rc3
[llvm-project.git] / llvm / test / CodeGen / AMDGPU / GlobalISel / load-constant.96.ll
blob6bb104311a4d8e189871fe936f39de00210f046a
1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-UNALIGNED %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-NOUNALIGNED %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s
5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-NOUNALIGNED %s
6 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s
7 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-NOUNALIGNED %s
9 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
; v_load_constant_v3i32_align1 - loads a <3 x i32> through a ptr addrspace(4)
; argument with align 1 and returns it.
; The +unaligned-access-mode runs expect a single 96-bit load
; (global_load_b96, global_load_dwordx3 or buffer_load_dwordx3). The
; -unaligned-access-mode runs and the tahiti run expect twelve single-byte
; loads that are recombined into three dwords with shifts and ORs.
11 define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
12 ; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
13 ; GFX12-UNALIGNED:       ; %bb.0:
14 ; GFX12-UNALIGNED-NEXT:    s_wait_loadcnt_dscnt 0x0
15 ; GFX12-UNALIGNED-NEXT:    s_wait_expcnt 0x0
16 ; GFX12-UNALIGNED-NEXT:    s_wait_samplecnt 0x0
17 ; GFX12-UNALIGNED-NEXT:    s_wait_bvhcnt 0x0
18 ; GFX12-UNALIGNED-NEXT:    s_wait_kmcnt 0x0
19 ; GFX12-UNALIGNED-NEXT:    global_load_b96 v[0:2], v[0:1], off
20 ; GFX12-UNALIGNED-NEXT:    s_wait_loadcnt 0x0
21 ; GFX12-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
23 ; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
24 ; GFX12-NOUNALIGNED:       ; %bb.0:
25 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt_dscnt 0x0
26 ; GFX12-NOUNALIGNED-NEXT:    s_wait_expcnt 0x0
27 ; GFX12-NOUNALIGNED-NEXT:    s_wait_samplecnt 0x0
28 ; GFX12-NOUNALIGNED-NEXT:    s_wait_bvhcnt 0x0
29 ; GFX12-NOUNALIGNED-NEXT:    s_wait_kmcnt 0x0
30 ; GFX12-NOUNALIGNED-NEXT:    s_clause 0xb
31 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v2, v[0:1], off
32 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v3, v[0:1], off offset:1
33 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v4, v[0:1], off offset:2
34 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v5, v[0:1], off offset:3
35 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v6, v[0:1], off offset:4
36 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v7, v[0:1], off offset:5
37 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v8, v[0:1], off offset:6
38 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v9, v[0:1], off offset:7
39 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v10, v[0:1], off offset:8
40 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v11, v[0:1], off offset:9
41 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v12, v[0:1], off offset:11
42 ; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v0, v[0:1], off offset:10
43 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0xa
44 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v1, v3, 8, v2
45 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x9
46 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
47 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x8
48 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
49 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x6
50 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v4, v7, 8, v6
51 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x5
52 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
53 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x4
54 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 24, v9
55 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x2
56 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v7, v11, 8, v10
57 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x1
58 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v8, 24, v12
59 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x0
60 ; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
61 ; GFX12-NOUNALIGNED-NEXT:    v_or3_b32 v0, v2, v3, v1
62 ; GFX12-NOUNALIGNED-NEXT:    v_or3_b32 v1, v5, v6, v4
63 ; GFX12-NOUNALIGNED-NEXT:    s_delay_alu instid0(VALU_DEP_3)
64 ; GFX12-NOUNALIGNED-NEXT:    v_or3_b32 v2, v8, v9, v7
65 ; GFX12-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
67 ; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
68 ; GFX9-UNALIGNED:       ; %bb.0:
69 ; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
70 ; GFX9-UNALIGNED-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
71 ; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
72 ; GFX9-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
74 ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
75 ; GFX9-NOUNALIGNED:       ; %bb.0:
76 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
77 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v2, v[0:1], off
78 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v3, v[0:1], off offset:1
79 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v4, v[0:1], off offset:2
80 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v5, v[0:1], off offset:3
81 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v6, v[0:1], off offset:4
82 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v7, v[0:1], off offset:5
83 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v8, v[0:1], off offset:6
84 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v9, v[0:1], off offset:7
85 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v10, v[0:1], off offset:8
86 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v11, v[0:1], off offset:9
87 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v12, v[0:1], off offset:11
88 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v13, v[0:1], off offset:10
89 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
90 ; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 8, v2
91 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
92 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
93 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
94 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 24, v5
95 ; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v0, v1, v2, v0
96 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
97 ; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v3, v7, 8, v6
98 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
99 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 16, v8
100 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
101 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 24, v9
102 ; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v1, v4, v5, v3
103 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
104 ; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v6, v11, 8, v10
105 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
106 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 24, v12
107 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
108 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
109 ; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v2, v7, v8, v6
110 ; GFX9-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
112 ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
113 ; GFX7-UNALIGNED:       ; %bb.0:
114 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
115 ; GFX7-UNALIGNED-NEXT:    s_mov_b32 s6, 0
116 ; GFX7-UNALIGNED-NEXT:    s_mov_b32 s7, 0xf000
117 ; GFX7-UNALIGNED-NEXT:    s_mov_b64 s[4:5], 0
118 ; GFX7-UNALIGNED-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
119 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
120 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
122 ; GFX7-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
123 ; GFX7-NOUNALIGNED:       ; %bb.0:
124 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
125 ; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s6, 0
126 ; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s7, 0xf000
127 ; GFX7-NOUNALIGNED-NEXT:    s_mov_b64 s[4:5], 0
128 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1
129 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3
130 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
131 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:5
132 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:7
133 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:6
134 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:9
135 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v9, v[0:1], s[4:7], 0 addr64 offset:11
136 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v10, v[0:1], s[4:7], 0 addr64 offset:10
137 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64
138 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:4
139 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:8
140 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(11)
141 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
142 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
143 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
144 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
145 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
146 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
147 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
148 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(7)
149 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
150 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
151 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
152 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
153 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 8, v8
154 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
155 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v8, 24, v9
156 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
157 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
158 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
159 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v1, v11
160 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v2, v3
161 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
162 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v3, v4, v12
163 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v4, v5, v6
164 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
165 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v5, v7, v0
166 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v6, v8, v9
167 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v2, v1
168 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v4, v3
169 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v6, v5
170 ; GFX7-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
172 ; GFX6-LABEL: v_load_constant_v3i32_align1:
173 ; GFX6:       ; %bb.0:
174 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
175 ; GFX6-NEXT:    s_mov_b32 s6, 0
176 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
177 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
178 ; GFX6-NEXT:    buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1
179 ; GFX6-NEXT:    buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3
180 ; GFX6-NEXT:    buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
181 ; GFX6-NEXT:    buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:5
182 ; GFX6-NEXT:    buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:7
183 ; GFX6-NEXT:    buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:6
184 ; GFX6-NEXT:    buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:9
185 ; GFX6-NEXT:    buffer_load_ubyte v9, v[0:1], s[4:7], 0 addr64 offset:11
186 ; GFX6-NEXT:    buffer_load_ubyte v10, v[0:1], s[4:7], 0 addr64 offset:10
187 ; GFX6-NEXT:    buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64
188 ; GFX6-NEXT:    buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:4
189 ; GFX6-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:8
190 ; GFX6-NEXT:    s_waitcnt vmcnt(11)
191 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
192 ; GFX6-NEXT:    s_waitcnt vmcnt(10)
193 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
194 ; GFX6-NEXT:    s_waitcnt vmcnt(9)
195 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
196 ; GFX6-NEXT:    s_waitcnt vmcnt(8)
197 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
198 ; GFX6-NEXT:    s_waitcnt vmcnt(7)
199 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
200 ; GFX6-NEXT:    s_waitcnt vmcnt(6)
201 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
202 ; GFX6-NEXT:    s_waitcnt vmcnt(5)
203 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 8, v8
204 ; GFX6-NEXT:    s_waitcnt vmcnt(4)
205 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 24, v9
206 ; GFX6-NEXT:    s_waitcnt vmcnt(3)
207 ; GFX6-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
208 ; GFX6-NEXT:    s_waitcnt vmcnt(2)
209 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v11
210 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
211 ; GFX6-NEXT:    s_waitcnt vmcnt(1)
212 ; GFX6-NEXT:    v_or_b32_e32 v3, v4, v12
213 ; GFX6-NEXT:    v_or_b32_e32 v4, v5, v6
214 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
215 ; GFX6-NEXT:    v_or_b32_e32 v5, v7, v0
216 ; GFX6-NEXT:    v_or_b32_e32 v6, v8, v9
217 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v1
218 ; GFX6-NEXT:    v_or_b32_e32 v1, v4, v3
219 ; GFX6-NEXT:    v_or_b32_e32 v2, v6, v5
220 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
221   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 1
222   ret <3 x i32> %load
; v_load_constant_v3i32_align2 - loads a <3 x i32> through a ptr addrspace(4)
; argument with align 2 and returns it.
; The +unaligned-access-mode runs expect a single 96-bit load. The
; -unaligned-access-mode runs and the tahiti run expect six 16-bit loads
; (global_load_u16, global_load_ushort or buffer_load_ushort), with each pair
; merged into one dword by a shift of 16 and an OR.
225 define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
226 ; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
227 ; GFX12-UNALIGNED:       ; %bb.0:
228 ; GFX12-UNALIGNED-NEXT:    s_wait_loadcnt_dscnt 0x0
229 ; GFX12-UNALIGNED-NEXT:    s_wait_expcnt 0x0
230 ; GFX12-UNALIGNED-NEXT:    s_wait_samplecnt 0x0
231 ; GFX12-UNALIGNED-NEXT:    s_wait_bvhcnt 0x0
232 ; GFX12-UNALIGNED-NEXT:    s_wait_kmcnt 0x0
233 ; GFX12-UNALIGNED-NEXT:    global_load_b96 v[0:2], v[0:1], off
234 ; GFX12-UNALIGNED-NEXT:    s_wait_loadcnt 0x0
235 ; GFX12-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
237 ; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
238 ; GFX12-NOUNALIGNED:       ; %bb.0:
239 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt_dscnt 0x0
240 ; GFX12-NOUNALIGNED-NEXT:    s_wait_expcnt 0x0
241 ; GFX12-NOUNALIGNED-NEXT:    s_wait_samplecnt 0x0
242 ; GFX12-NOUNALIGNED-NEXT:    s_wait_bvhcnt 0x0
243 ; GFX12-NOUNALIGNED-NEXT:    s_wait_kmcnt 0x0
244 ; GFX12-NOUNALIGNED-NEXT:    s_clause 0x5
245 ; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v2, v[0:1], off
246 ; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v3, v[0:1], off offset:2
247 ; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v4, v[0:1], off offset:4
248 ; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v5, v[0:1], off offset:6
249 ; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v6, v[0:1], off offset:8
250 ; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v7, v[0:1], off offset:10
251 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x4
252 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
253 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x2
254 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
255 ; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x0
256 ; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
257 ; GFX12-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
259 ; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
260 ; GFX9-UNALIGNED:       ; %bb.0:
261 ; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
262 ; GFX9-UNALIGNED-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
263 ; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
264 ; GFX9-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
266 ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
267 ; GFX9-NOUNALIGNED:       ; %bb.0:
268 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
269 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v2, v[0:1], off
270 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v3, v[0:1], off offset:2
271 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v4, v[0:1], off offset:4
272 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v5, v[0:1], off offset:6
273 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v6, v[0:1], off offset:8
274 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v7, v[0:1], off offset:10
275 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
276 ; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
277 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
278 ; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
279 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
280 ; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
281 ; GFX9-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
283 ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
284 ; GFX7-UNALIGNED:       ; %bb.0:
285 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
286 ; GFX7-UNALIGNED-NEXT:    s_mov_b32 s6, 0
287 ; GFX7-UNALIGNED-NEXT:    s_mov_b32 s7, 0xf000
288 ; GFX7-UNALIGNED-NEXT:    s_mov_b64 s[4:5], 0
289 ; GFX7-UNALIGNED-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
290 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
291 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
293 ; GFX7-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
294 ; GFX7-NOUNALIGNED:       ; %bb.0:
295 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
296 ; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s6, 0
297 ; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s7, 0xf000
298 ; GFX7-NOUNALIGNED-NEXT:    s_mov_b64 s[4:5], 0
299 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2
300 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:6
301 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10
302 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64
303 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:4
304 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v7, v[0:1], s[4:7], 0 addr64 offset:8
305 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
306 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
307 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
308 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
309 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
310 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
311 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
312 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v5
313 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
314 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v1, v6
315 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
316 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v2, v7
317 ; GFX7-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
319 ; GFX6-LABEL: v_load_constant_v3i32_align2:
320 ; GFX6:       ; %bb.0:
321 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
322 ; GFX6-NEXT:    s_mov_b32 s6, 0
323 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
324 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
325 ; GFX6-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2
326 ; GFX6-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:6
327 ; GFX6-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10
328 ; GFX6-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64
329 ; GFX6-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:4
330 ; GFX6-NEXT:    buffer_load_ushort v7, v[0:1], s[4:7], 0 addr64 offset:8
331 ; GFX6-NEXT:    s_waitcnt vmcnt(5)
332 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
333 ; GFX6-NEXT:    s_waitcnt vmcnt(4)
334 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
335 ; GFX6-NEXT:    s_waitcnt vmcnt(3)
336 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
337 ; GFX6-NEXT:    s_waitcnt vmcnt(2)
338 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v5
339 ; GFX6-NEXT:    s_waitcnt vmcnt(1)
340 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v6
341 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
342 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v7
343 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
344   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 2
345   ret <3 x i32> %load
; v_load_constant_v3i32_align4 - loads a <3 x i32> through a ptr addrspace(4)
; argument with align 4 and returns it.
; The gfx1200, gfx900 and hawaii runs share one expectation each, whatever the
; unaligned-access-mode setting, namely a single 96-bit load. The tahiti run
; expects a buffer_load_dwordx2 plus a buffer_load_dword at offset 8.
348 define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
349 ; GFX12-LABEL: v_load_constant_v3i32_align4:
350 ; GFX12:       ; %bb.0:
351 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
352 ; GFX12-NEXT:    s_wait_expcnt 0x0
353 ; GFX12-NEXT:    s_wait_samplecnt 0x0
354 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
355 ; GFX12-NEXT:    s_wait_kmcnt 0x0
356 ; GFX12-NEXT:    global_load_b96 v[0:2], v[0:1], off
357 ; GFX12-NEXT:    s_wait_loadcnt 0x0
358 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
360 ; GFX9-LABEL: v_load_constant_v3i32_align4:
361 ; GFX9:       ; %bb.0:
362 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
363 ; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
364 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
365 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
367 ; GFX7-LABEL: v_load_constant_v3i32_align4:
368 ; GFX7:       ; %bb.0:
369 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
370 ; GFX7-NEXT:    s_mov_b32 s6, 0
371 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
372 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
373 ; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
374 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
375 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
377 ; GFX6-LABEL: v_load_constant_v3i32_align4:
378 ; GFX6:       ; %bb.0:
379 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
380 ; GFX6-NEXT:    s_mov_b32 s6, 0
381 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
382 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
383 ; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
384 ; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8
385 ; GFX6-NEXT:    s_waitcnt vmcnt(1)
386 ; GFX6-NEXT:    v_mov_b32_e32 v0, v3
387 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
388 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
389 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
390   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 4
391   ret <3 x i32> %load
; v_load_constant_i96_align8 - loads an i96 through a ptr addrspace(4)
; argument with align 8 and returns it.
; The gfx1200, gfx900 and hawaii runs expect a single 96-bit load into v[0:2].
; The tahiti run expects a buffer_load_dwordx2 plus a buffer_load_dword at
; offset 8, followed by moves into v0 and v1.
394 define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
395 ; GFX12-LABEL: v_load_constant_i96_align8:
396 ; GFX12:       ; %bb.0:
397 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
398 ; GFX12-NEXT:    s_wait_expcnt 0x0
399 ; GFX12-NEXT:    s_wait_samplecnt 0x0
400 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
401 ; GFX12-NEXT:    s_wait_kmcnt 0x0
402 ; GFX12-NEXT:    global_load_b96 v[0:2], v[0:1], off
403 ; GFX12-NEXT:    s_wait_loadcnt 0x0
404 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
406 ; GFX9-LABEL: v_load_constant_i96_align8:
407 ; GFX9:       ; %bb.0:
408 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
409 ; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
410 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
411 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
413 ; GFX7-LABEL: v_load_constant_i96_align8:
414 ; GFX7:       ; %bb.0:
415 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
416 ; GFX7-NEXT:    s_mov_b32 s6, 0
417 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
418 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
419 ; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
420 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
421 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
423 ; GFX6-LABEL: v_load_constant_i96_align8:
424 ; GFX6:       ; %bb.0:
425 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
426 ; GFX6-NEXT:    s_mov_b32 s6, 0
427 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
428 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
429 ; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
430 ; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8
431 ; GFX6-NEXT:    s_waitcnt vmcnt(1)
432 ; GFX6-NEXT:    v_mov_b32_e32 v0, v3
433 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
434 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
435 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
436   %load = load i96, ptr addrspace(4) %ptr, align 8
437   ret i96 %load
; v_load_constant_v3i32_align8 - loads a <3 x i32> through a ptr addrspace(4)
; argument with align 8 and returns it.
; The gfx1200, gfx900 and hawaii runs expect a single 96-bit load into v[0:2].
; The tahiti run expects a buffer_load_dwordx2 plus a buffer_load_dword at
; offset 8, followed by moves into v0 and v1.
440 define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
441 ; GFX12-LABEL: v_load_constant_v3i32_align8:
442 ; GFX12:       ; %bb.0:
443 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
444 ; GFX12-NEXT:    s_wait_expcnt 0x0
445 ; GFX12-NEXT:    s_wait_samplecnt 0x0
446 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
447 ; GFX12-NEXT:    s_wait_kmcnt 0x0
448 ; GFX12-NEXT:    global_load_b96 v[0:2], v[0:1], off
449 ; GFX12-NEXT:    s_wait_loadcnt 0x0
450 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
452 ; GFX9-LABEL: v_load_constant_v3i32_align8:
453 ; GFX9:       ; %bb.0:
454 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
455 ; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
456 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
457 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
459 ; GFX7-LABEL: v_load_constant_v3i32_align8:
460 ; GFX7:       ; %bb.0:
461 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
462 ; GFX7-NEXT:    s_mov_b32 s6, 0
463 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
464 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
465 ; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
466 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
467 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
469 ; GFX6-LABEL: v_load_constant_v3i32_align8:
470 ; GFX6:       ; %bb.0:
471 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
472 ; GFX6-NEXT:    s_mov_b32 s6, 0
473 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
474 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
475 ; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
476 ; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8
477 ; GFX6-NEXT:    s_waitcnt vmcnt(1)
478 ; GFX6-NEXT:    v_mov_b32_e32 v0, v3
479 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
480 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
481 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
482   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 8
483   ret <3 x i32> %load
; v_load_constant_v6i16_align8 - loads a <6 x i16> through a ptr addrspace(4)
; argument with align 8 and returns it.
; The gfx1200 and gfx900 runs expect only a single 96-bit load into v[0:2].
; The hawaii and tahiti runs additionally expect right shifts by 16 and moves
; that write v0 through v5 from the loaded dwords.
486 define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
487 ; GFX12-LABEL: v_load_constant_v6i16_align8:
488 ; GFX12:       ; %bb.0:
489 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
490 ; GFX12-NEXT:    s_wait_expcnt 0x0
491 ; GFX12-NEXT:    s_wait_samplecnt 0x0
492 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
493 ; GFX12-NEXT:    s_wait_kmcnt 0x0
494 ; GFX12-NEXT:    global_load_b96 v[0:2], v[0:1], off
495 ; GFX12-NEXT:    s_wait_loadcnt 0x0
496 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
498 ; GFX9-LABEL: v_load_constant_v6i16_align8:
499 ; GFX9:       ; %bb.0:
500 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
501 ; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
502 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
503 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
505 ; GFX7-LABEL: v_load_constant_v6i16_align8:
506 ; GFX7:       ; %bb.0:
507 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
508 ; GFX7-NEXT:    s_mov_b32 s6, 0
509 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
510 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
511 ; GFX7-NEXT:    buffer_load_dwordx3 v[6:8], v[0:1], s[4:7], 0 addr64
512 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
513 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
514 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
515 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
516 ; GFX7-NEXT:    v_mov_b32_e32 v0, v6
517 ; GFX7-NEXT:    v_mov_b32_e32 v2, v7
518 ; GFX7-NEXT:    v_mov_b32_e32 v4, v8
519 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
521 ; GFX6-LABEL: v_load_constant_v6i16_align8:
522 ; GFX6:       ; %bb.0:
523 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
524 ; GFX6-NEXT:    s_mov_b32 s6, 0
525 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
526 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
527 ; GFX6-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
528 ; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8
529 ; GFX6-NEXT:    s_waitcnt vmcnt(1)
530 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
531 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
532 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
533 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
534 ; GFX6-NEXT:    v_mov_b32_e32 v0, v6
535 ; GFX6-NEXT:    v_mov_b32_e32 v2, v7
536 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
537   %load = load <6 x i16>, ptr addrspace(4) %ptr, align 8
538   ret <6 x i16> %load
; v_load_constant_v12i8_align8 - loads a <12 x i8> through a ptr addrspace(4)
; argument with align 8 and returns it.
; Every run expects the loaded dwords to be split with right shifts by 8, 16
; and 24 plus moves, so that v0 through v11 are written. The gfx1200, gfx900
; and hawaii runs expect one 96-bit load; the tahiti run expects a
; buffer_load_dwordx2 plus a buffer_load_dword at offset 8.
541 define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
542 ; GFX12-LABEL: v_load_constant_v12i8_align8:
543 ; GFX12:       ; %bb.0:
544 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
545 ; GFX12-NEXT:    s_wait_expcnt 0x0
546 ; GFX12-NEXT:    s_wait_samplecnt 0x0
547 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
548 ; GFX12-NEXT:    s_wait_kmcnt 0x0
549 ; GFX12-NEXT:    global_load_b96 v[0:2], v[0:1], off
550 ; GFX12-NEXT:    s_wait_loadcnt 0x0
551 ; GFX12-NEXT:    v_lshrrev_b32_e32 v13, 8, v0
552 ; GFX12-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
553 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
554 ; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
555 ; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
556 ; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
557 ; GFX12-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
558 ; GFX12-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
559 ; GFX12-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
560 ; GFX12-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v13
561 ; GFX12-NEXT:    v_mov_b32_e32 v8, v2
562 ; GFX12-NEXT:    v_mov_b32_e32 v2, v12
563 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
565 ; GFX9-LABEL: v_load_constant_v12i8_align8:
566 ; GFX9:       ; %bb.0:
567 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
568 ; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
569 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
570 ; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v0
571 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
572 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
573 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
574 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
575 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
576 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
577 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
578 ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
579 ; GFX9-NEXT:    v_mov_b32_e32 v4, v1
580 ; GFX9-NEXT:    v_mov_b32_e32 v8, v2
581 ; GFX9-NEXT:    v_mov_b32_e32 v1, v13
582 ; GFX9-NEXT:    v_mov_b32_e32 v2, v12
583 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
585 ; GFX7-LABEL: v_load_constant_v12i8_align8:
586 ; GFX7:       ; %bb.0:
587 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
588 ; GFX7-NEXT:    s_mov_b32 s6, 0
589 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
590 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
591 ; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
592 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
593 ; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 8, v0
594 ; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
595 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
596 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
597 ; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
598 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
599 ; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
600 ; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
601 ; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
602 ; GFX7-NEXT:    v_mov_b32_e32 v4, v1
603 ; GFX7-NEXT:    v_mov_b32_e32 v8, v2
604 ; GFX7-NEXT:    v_mov_b32_e32 v1, v13
605 ; GFX7-NEXT:    v_mov_b32_e32 v2, v12
606 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
608 ; GFX6-LABEL: v_load_constant_v12i8_align8:
609 ; GFX6:       ; %bb.0:
610 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
611 ; GFX6-NEXT:    s_mov_b32 s6, 0
612 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
613 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
614 ; GFX6-NEXT:    buffer_load_dwordx2 v[12:13], v[0:1], s[4:7], 0 addr64
615 ; GFX6-NEXT:    buffer_load_dword v8, v[0:1], s[4:7], 0 addr64 offset:8
616 ; GFX6-NEXT:    s_waitcnt vmcnt(1)
617 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 8, v12
618 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v12
619 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 24, v12
620 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 8, v13
621 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v13
622 ; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 24, v13
623 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
624 ; GFX6-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
625 ; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
626 ; GFX6-NEXT:    v_lshrrev_b32_e32 v11, 24, v8
627 ; GFX6-NEXT:    v_mov_b32_e32 v0, v12
628 ; GFX6-NEXT:    v_mov_b32_e32 v4, v13
629 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
630   %load = load <12 x i8>, ptr addrspace(4) %ptr, align 8
631   ret <12 x i8> %load
; v_load_constant_v3i32_align16 - loads a <3 x i32> through a ptr addrspace(4)
; argument with align 16 and returns it.
; The gfx1200, gfx900 and hawaii runs expect a single 96-bit load into v[0:2].
; The tahiti run expects a single buffer_load_dwordx4 into v[0:3].
634 define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) {
635 ; GFX12-LABEL: v_load_constant_v3i32_align16:
636 ; GFX12:       ; %bb.0:
637 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
638 ; GFX12-NEXT:    s_wait_expcnt 0x0
639 ; GFX12-NEXT:    s_wait_samplecnt 0x0
640 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
641 ; GFX12-NEXT:    s_wait_kmcnt 0x0
642 ; GFX12-NEXT:    global_load_b96 v[0:2], v[0:1], off
643 ; GFX12-NEXT:    s_wait_loadcnt 0x0
644 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
646 ; GFX9-LABEL: v_load_constant_v3i32_align16:
647 ; GFX9:       ; %bb.0:
648 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
649 ; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
650 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
651 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
653 ; GFX7-LABEL: v_load_constant_v3i32_align16:
654 ; GFX7:       ; %bb.0:
655 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
656 ; GFX7-NEXT:    s_mov_b32 s6, 0
657 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
658 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
659 ; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
660 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
661 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
663 ; GFX6-LABEL: v_load_constant_v3i32_align16:
664 ; GFX6:       ; %bb.0:
665 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
666 ; GFX6-NEXT:    s_mov_b32 s6, 0
667 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
668 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
669 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
670 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
671 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
672   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 16
673   ret <3 x i32> %load
; Shader (amdgpu_ps) variant: <3 x i32> constant-address-space load with
; align 1 through an inreg pointer; the checks leave the result in s0-s2.
; Expected with +unaligned-access-mode: a single 96-bit vector-memory load plus
; three v_readfirstlane_b32. Expected with -unaligned-access-mode, and for the
; GFX6 run: twelve single-byte loads recombined with shifts and ORs (scalar
; s_load_u8 on GFX12, ubyte vector loads on the older targets).
676 define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg %ptr) {
677 ; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
678 ; GFX12-UNALIGNED:       ; %bb.0:
679 ; GFX12-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
680 ; GFX12-UNALIGNED-NEXT:    global_load_b96 v[0:2], v0, s[0:1]
681 ; GFX12-UNALIGNED-NEXT:    s_wait_loadcnt 0x0
682 ; GFX12-UNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
683 ; GFX12-UNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
684 ; GFX12-UNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
685 ; GFX12-UNALIGNED-NEXT:    ; return to shader part epilog
687 ; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
688 ; GFX12-NOUNALIGNED:       ; %bb.0:
689 ; GFX12-NOUNALIGNED-NEXT:    s_clause 0xb
690 ; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s2, s[0:1], 0x1
691 ; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s3, s[0:1], 0x3
692 ; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s4, s[0:1], 0x2
693 ; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s5, s[0:1], 0x5
694 ; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s6, s[0:1], 0x7
695 ; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s7, s[0:1], 0x6
696 ; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s8, s[0:1], 0x9
697 ; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s9, s[0:1], 0xb
698 ; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s10, s[0:1], 0x0
699 ; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s11, s[0:1], 0x4
700 ; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s12, s[0:1], 0xa
701 ; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s1, s[0:1], 0x8
702 ; GFX12-NOUNALIGNED-NEXT:    s_wait_kmcnt 0x0
703 ; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s0, s2, 8
704 ; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s2, s3, 24
705 ; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s3, s4, 16
706 ; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s4, s5, 8
707 ; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s2, s2, s3
708 ; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s5, s6, 24
709 ; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s6, s7, 16
710 ; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s7, s8, 8
711 ; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s0, s0, s10
712 ; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s8, s9, 24
713 ; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s0, s2, s0
714 ; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s2, s12, 16
715 ; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s3, s4, s11
716 ; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s4, s5, s6
717 ; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s5, s7, s1
718 ; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s2, s8, s2
719 ; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s1, s4, s3
720 ; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s2, s2, s5
721 ; GFX12-NOUNALIGNED-NEXT:    ; return to shader part epilog
723 ; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
724 ; GFX9-UNALIGNED:       ; %bb.0:
725 ; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
726 ; GFX9-UNALIGNED-NEXT:    global_load_dwordx3 v[0:2], v0, s[0:1]
727 ; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
728 ; GFX9-UNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
729 ; GFX9-UNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
730 ; GFX9-UNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
731 ; GFX9-UNALIGNED-NEXT:    ; return to shader part epilog
733 ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
734 ; GFX9-NOUNALIGNED:       ; %bb.0:
735 ; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
736 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v1, v0, s[0:1]
737 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:1
738 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:2
739 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:3
740 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:4
741 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:5
742 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:6
743 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:7
744 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:8
745 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:9
746 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:11
747 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:10
748 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
749 ; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v0, v2, 8, v1
750 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
751 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
752 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
753 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
754 ; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v0, v1, v2, v0
755 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
756 ; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
757 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
758 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
759 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
760 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 24, v8
761 ; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v1, v4, v5, v3
762 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
763 ; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v6, v10, 8, v9
764 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
765 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 24, v11
766 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
767 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
768 ; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v2, v7, v8, v6
769 ; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
770 ; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
771 ; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
772 ; GFX9-NOUNALIGNED-NEXT:    ; return to shader part epilog
774 ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
775 ; GFX7-UNALIGNED:       ; %bb.0:
776 ; GFX7-UNALIGNED-NEXT:    s_mov_b32 s2, -1
777 ; GFX7-UNALIGNED-NEXT:    s_mov_b32 s3, 0xf000
778 ; GFX7-UNALIGNED-NEXT:    buffer_load_dwordx3 v[0:2], off, s[0:3], 0
779 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
780 ; GFX7-UNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
781 ; GFX7-UNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
782 ; GFX7-UNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
783 ; GFX7-UNALIGNED-NEXT:    ; return to shader part epilog
785 ; GFX7-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
786 ; GFX7-NOUNALIGNED:       ; %bb.0:
787 ; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s2, -1
788 ; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s3, 0xf000
789 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:1
790 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0 offset:3
791 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v2, off, s[0:3], 0 offset:2
792 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v3, off, s[0:3], 0 offset:5
793 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v4, off, s[0:3], 0 offset:7
794 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v5, off, s[0:3], 0 offset:6
795 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v6, off, s[0:3], 0 offset:9
796 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v7, off, s[0:3], 0 offset:11
797 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v8, off, s[0:3], 0 offset:10
798 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v9, off, s[0:3], 0
799 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v10, off, s[0:3], 0 offset:4
800 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v11, off, s[0:3], 0 offset:8
801 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(11)
802 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
803 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
804 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
805 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
806 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
807 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
808 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
809 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(7)
810 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
811 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
812 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
813 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
814 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
815 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
816 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
817 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
818 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
819 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
820 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v9
821 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v1, v2
822 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
823 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v3, v10
824 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v3, v4, v5
825 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
826 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v4, v6, v11
827 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v5, v7, v8
828 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v1, v0
829 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v3, v2
830 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v5, v4
831 ; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
832 ; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
833 ; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
834 ; GFX7-NOUNALIGNED-NEXT:    ; return to shader part epilog
836 ; GFX6-LABEL: s_load_constant_v3i32_align1:
837 ; GFX6:       ; %bb.0:
838 ; GFX6-NEXT:    s_mov_b32 s2, -1
839 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
840 ; GFX6-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:1
841 ; GFX6-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0 offset:3
842 ; GFX6-NEXT:    buffer_load_ubyte v2, off, s[0:3], 0 offset:2
843 ; GFX6-NEXT:    buffer_load_ubyte v3, off, s[0:3], 0 offset:5
844 ; GFX6-NEXT:    buffer_load_ubyte v4, off, s[0:3], 0 offset:7
845 ; GFX6-NEXT:    buffer_load_ubyte v5, off, s[0:3], 0 offset:6
846 ; GFX6-NEXT:    buffer_load_ubyte v6, off, s[0:3], 0 offset:9
847 ; GFX6-NEXT:    buffer_load_ubyte v7, off, s[0:3], 0 offset:11
848 ; GFX6-NEXT:    buffer_load_ubyte v8, off, s[0:3], 0 offset:10
849 ; GFX6-NEXT:    buffer_load_ubyte v9, off, s[0:3], 0
850 ; GFX6-NEXT:    buffer_load_ubyte v10, off, s[0:3], 0 offset:4
851 ; GFX6-NEXT:    buffer_load_ubyte v11, off, s[0:3], 0 offset:8
852 ; GFX6-NEXT:    s_waitcnt vmcnt(11)
853 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
854 ; GFX6-NEXT:    s_waitcnt vmcnt(10)
855 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
856 ; GFX6-NEXT:    s_waitcnt vmcnt(9)
857 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
858 ; GFX6-NEXT:    s_waitcnt vmcnt(8)
859 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
860 ; GFX6-NEXT:    s_waitcnt vmcnt(7)
861 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
862 ; GFX6-NEXT:    s_waitcnt vmcnt(6)
863 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
864 ; GFX6-NEXT:    s_waitcnt vmcnt(5)
865 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
866 ; GFX6-NEXT:    s_waitcnt vmcnt(4)
867 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
868 ; GFX6-NEXT:    s_waitcnt vmcnt(3)
869 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
870 ; GFX6-NEXT:    s_waitcnt vmcnt(2)
871 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v9
872 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
873 ; GFX6-NEXT:    s_waitcnt vmcnt(1)
874 ; GFX6-NEXT:    v_or_b32_e32 v2, v3, v10
875 ; GFX6-NEXT:    v_or_b32_e32 v3, v4, v5
876 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
877 ; GFX6-NEXT:    v_or_b32_e32 v4, v6, v11
878 ; GFX6-NEXT:    v_or_b32_e32 v5, v7, v8
879 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
880 ; GFX6-NEXT:    v_or_b32_e32 v1, v3, v2
881 ; GFX6-NEXT:    v_or_b32_e32 v2, v5, v4
882 ; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
883 ; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
884 ; GFX6-NEXT:    v_readfirstlane_b32 s2, v2
885 ; GFX6-NEXT:    ; return to shader part epilog
886   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 1
887   ret <3 x i32> %load
; Shader (amdgpu_ps) variant: <3 x i32> constant-address-space load with
; align 2 through an inreg pointer; the checks leave the result in s0-s2.
; Expected with +unaligned-access-mode: a single 96-bit vector-memory load plus
; three v_readfirstlane_b32. Expected with -unaligned-access-mode, and for the
; GFX6 run: six 16-bit loads, each pair merged with a 16-bit shift and an OR
; (scalar s_load_u16 on GFX12, ushort vector loads on the older targets).
890 define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg %ptr) {
891 ; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
892 ; GFX12-UNALIGNED:       ; %bb.0:
893 ; GFX12-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
894 ; GFX12-UNALIGNED-NEXT:    global_load_b96 v[0:2], v0, s[0:1]
895 ; GFX12-UNALIGNED-NEXT:    s_wait_loadcnt 0x0
896 ; GFX12-UNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
897 ; GFX12-UNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
898 ; GFX12-UNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
899 ; GFX12-UNALIGNED-NEXT:    ; return to shader part epilog
901 ; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
902 ; GFX12-NOUNALIGNED:       ; %bb.0:
903 ; GFX12-NOUNALIGNED-NEXT:    s_clause 0x5
904 ; GFX12-NOUNALIGNED-NEXT:    s_load_u16 s2, s[0:1], 0x2
905 ; GFX12-NOUNALIGNED-NEXT:    s_load_u16 s3, s[0:1], 0x6
906 ; GFX12-NOUNALIGNED-NEXT:    s_load_u16 s4, s[0:1], 0xa
907 ; GFX12-NOUNALIGNED-NEXT:    s_load_u16 s5, s[0:1], 0x0
908 ; GFX12-NOUNALIGNED-NEXT:    s_load_u16 s6, s[0:1], 0x4
909 ; GFX12-NOUNALIGNED-NEXT:    s_load_u16 s7, s[0:1], 0x8
910 ; GFX12-NOUNALIGNED-NEXT:    s_wait_kmcnt 0x0
911 ; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s0, s2, 16
912 ; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s1, s3, 16
913 ; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s2, s4, 16
914 ; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s0, s0, s5
915 ; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s1, s1, s6
916 ; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s2, s2, s7
917 ; GFX12-NOUNALIGNED-NEXT:    ; return to shader part epilog
919 ; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
920 ; GFX9-UNALIGNED:       ; %bb.0:
921 ; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
922 ; GFX9-UNALIGNED-NEXT:    global_load_dwordx3 v[0:2], v0, s[0:1]
923 ; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
924 ; GFX9-UNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
925 ; GFX9-UNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
926 ; GFX9-UNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
927 ; GFX9-UNALIGNED-NEXT:    ; return to shader part epilog
929 ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
930 ; GFX9-NOUNALIGNED:       ; %bb.0:
931 ; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
932 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v1, v0, s[0:1]
933 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2
934 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v3, v0, s[0:1] offset:4
935 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v4, v0, s[0:1] offset:6
936 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v5, v0, s[0:1] offset:8
937 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v6, v0, s[0:1] offset:10
938 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
939 ; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
940 ; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
941 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
942 ; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
943 ; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
944 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
945 ; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
946 ; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
947 ; GFX9-NOUNALIGNED-NEXT:    ; return to shader part epilog
949 ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
950 ; GFX7-UNALIGNED:       ; %bb.0:
951 ; GFX7-UNALIGNED-NEXT:    s_mov_b32 s2, -1
952 ; GFX7-UNALIGNED-NEXT:    s_mov_b32 s3, 0xf000
953 ; GFX7-UNALIGNED-NEXT:    buffer_load_dwordx3 v[0:2], off, s[0:3], 0
954 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
955 ; GFX7-UNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
956 ; GFX7-UNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
957 ; GFX7-UNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
958 ; GFX7-UNALIGNED-NEXT:    ; return to shader part epilog
960 ; GFX7-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
961 ; GFX7-NOUNALIGNED:       ; %bb.0:
962 ; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s2, -1
963 ; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s3, 0xf000
964 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:2
965 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:6
966 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v2, off, s[0:3], 0 offset:10
967 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v3, off, s[0:3], 0
968 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 offset:4
969 ; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v5, off, s[0:3], 0 offset:8
970 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
971 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
972 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
973 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
974 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
975 ; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
976 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
977 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v3
978 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
979 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v1, v4
980 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
981 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v2, v5
982 ; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
983 ; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
984 ; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
985 ; GFX7-NOUNALIGNED-NEXT:    ; return to shader part epilog
987 ; GFX6-LABEL: s_load_constant_v3i32_align2:
988 ; GFX6:       ; %bb.0:
989 ; GFX6-NEXT:    s_mov_b32 s2, -1
990 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
991 ; GFX6-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:2
992 ; GFX6-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:6
993 ; GFX6-NEXT:    buffer_load_ushort v2, off, s[0:3], 0 offset:10
994 ; GFX6-NEXT:    buffer_load_ushort v3, off, s[0:3], 0
995 ; GFX6-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 offset:4
996 ; GFX6-NEXT:    buffer_load_ushort v5, off, s[0:3], 0 offset:8
997 ; GFX6-NEXT:    s_waitcnt vmcnt(5)
998 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
999 ; GFX6-NEXT:    s_waitcnt vmcnt(4)
1000 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1001 ; GFX6-NEXT:    s_waitcnt vmcnt(3)
1002 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1003 ; GFX6-NEXT:    s_waitcnt vmcnt(2)
1004 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
1005 ; GFX6-NEXT:    s_waitcnt vmcnt(1)
1006 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
1007 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
1008 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v5
1009 ; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
1010 ; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
1011 ; GFX6-NEXT:    v_readfirstlane_b32 s2, v2
1012 ; GFX6-NEXT:    ; return to shader part epilog
1013   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 2
1014   ret <3 x i32> %load
; Shader (amdgpu_ps) variant with align 4 through an inreg pointer.
; Expected: the GFX12 run uses a single scalar s_load_b96; the GFX9, GFX7 and
; GFX6 runs split the load into s_load_dwordx2 + s_load_dword and then copy
; the first two dwords into s0/s1.
; NOTE(review): the third-dword offset is printed as 0x8 for GFX9 but 0x2 for
; GFX7/GFX6 - presumably a byte vs. dword offset encoding; confirm against the
; ISA documentation.
1017 define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg %ptr) {
1018 ; GFX12-LABEL: s_load_constant_v3i32_align4:
1019 ; GFX12:       ; %bb.0:
1020 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
1021 ; GFX12-NEXT:    s_wait_kmcnt 0x0
1022 ; GFX12-NEXT:    ; return to shader part epilog
1024 ; GFX9-LABEL: s_load_constant_v3i32_align4:
1025 ; GFX9:       ; %bb.0:
1026 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1027 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x8
1028 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1029 ; GFX9-NEXT:    s_mov_b32 s0, s4
1030 ; GFX9-NEXT:    s_mov_b32 s1, s5
1031 ; GFX9-NEXT:    ; return to shader part epilog
1033 ; GFX7-LABEL: s_load_constant_v3i32_align4:
1034 ; GFX7:       ; %bb.0:
1035 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1036 ; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x2
1037 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1038 ; GFX7-NEXT:    s_mov_b32 s0, s4
1039 ; GFX7-NEXT:    s_mov_b32 s1, s5
1040 ; GFX7-NEXT:    ; return to shader part epilog
1042 ; GFX6-LABEL: s_load_constant_v3i32_align4:
1043 ; GFX6:       ; %bb.0:
1044 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1045 ; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x2
1046 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1047 ; GFX6-NEXT:    s_mov_b32 s0, s4
1048 ; GFX6-NEXT:    s_mov_b32 s1, s5
1049 ; GFX6-NEXT:    ; return to shader part epilog
1050   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 4
1051   ret <3 x i32> %load
; Shader (amdgpu_ps) variant loading a scalar i96 (rather than a vector) with
; align 8 through an inreg pointer.
; Expected: the GFX12 run uses a single scalar s_load_b96; the GFX9, GFX7 and
; GFX6 runs split the load into s_load_dwordx2 + s_load_dword and then copy
; the first two dwords into s0/s1.
1054 define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
1055 ; GFX12-LABEL: s_load_constant_i96_align8:
1056 ; GFX12:       ; %bb.0:
1057 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
1058 ; GFX12-NEXT:    s_wait_kmcnt 0x0
1059 ; GFX12-NEXT:    ; return to shader part epilog
1061 ; GFX9-LABEL: s_load_constant_i96_align8:
1062 ; GFX9:       ; %bb.0:
1063 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1064 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x8
1065 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1066 ; GFX9-NEXT:    s_mov_b32 s0, s4
1067 ; GFX9-NEXT:    s_mov_b32 s1, s5
1068 ; GFX9-NEXT:    ; return to shader part epilog
1070 ; GFX7-LABEL: s_load_constant_i96_align8:
1071 ; GFX7:       ; %bb.0:
1072 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1073 ; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x2
1074 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1075 ; GFX7-NEXT:    s_mov_b32 s0, s4
1076 ; GFX7-NEXT:    s_mov_b32 s1, s5
1077 ; GFX7-NEXT:    ; return to shader part epilog
1079 ; GFX6-LABEL: s_load_constant_i96_align8:
1080 ; GFX6:       ; %bb.0:
1081 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1082 ; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x2
1083 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1084 ; GFX6-NEXT:    s_mov_b32 s0, s4
1085 ; GFX6-NEXT:    s_mov_b32 s1, s5
1086 ; GFX6-NEXT:    ; return to shader part epilog
1087   %load = load i96, ptr addrspace(4) %ptr, align 8
1088   ret i96 %load
; Shader (amdgpu_ps) variant: <3 x i32> load with align 8 through an inreg
; pointer.
; Expected: the GFX12 run uses a single scalar s_load_b96; the GFX9, GFX7 and
; GFX6 runs split the load into s_load_dwordx2 + s_load_dword and then copy
; the first two dwords into s0/s1.
1091 define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg %ptr) {
1092 ; GFX12-LABEL: s_load_constant_v3i32_align8:
1093 ; GFX12:       ; %bb.0:
1094 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
1095 ; GFX12-NEXT:    s_wait_kmcnt 0x0
1096 ; GFX12-NEXT:    ; return to shader part epilog
1098 ; GFX9-LABEL: s_load_constant_v3i32_align8:
1099 ; GFX9:       ; %bb.0:
1100 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1101 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x8
1102 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1103 ; GFX9-NEXT:    s_mov_b32 s0, s4
1104 ; GFX9-NEXT:    s_mov_b32 s1, s5
1105 ; GFX9-NEXT:    ; return to shader part epilog
1107 ; GFX7-LABEL: s_load_constant_v3i32_align8:
1108 ; GFX7:       ; %bb.0:
1109 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1110 ; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x2
1111 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1112 ; GFX7-NEXT:    s_mov_b32 s0, s4
1113 ; GFX7-NEXT:    s_mov_b32 s1, s5
1114 ; GFX7-NEXT:    ; return to shader part epilog
1116 ; GFX6-LABEL: s_load_constant_v3i32_align8:
1117 ; GFX6:       ; %bb.0:
1118 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1119 ; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x2
1120 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1121 ; GFX6-NEXT:    s_mov_b32 s0, s4
1122 ; GFX6-NEXT:    s_mov_b32 s1, s5
1123 ; GFX6-NEXT:    ; return to shader part epilog
1124   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 8
1125   ret <3 x i32> %load
; Shader (amdgpu_ps) variant: loads <6 x i16> with align 8 through an inreg
; pointer and bitcasts the value to <3 x i32> before returning it.
; Expected: the GFX12 run uses a single scalar s_load_b96; the GFX9, GFX7 and
; GFX6 runs split the load into s_load_dwordx2 + s_load_dword and then copy
; the first two dwords into s0/s1. No extra instructions are expected for the
; bitcast.
1128 define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg %ptr) {
1129 ; GFX12-LABEL: s_load_constant_v6i16_align8:
1130 ; GFX12:       ; %bb.0:
1131 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
1132 ; GFX12-NEXT:    s_wait_kmcnt 0x0
1133 ; GFX12-NEXT:    ; return to shader part epilog
1135 ; GFX9-LABEL: s_load_constant_v6i16_align8:
1136 ; GFX9:       ; %bb.0:
1137 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1138 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x8
1139 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1140 ; GFX9-NEXT:    s_mov_b32 s0, s4
1141 ; GFX9-NEXT:    s_mov_b32 s1, s5
1142 ; GFX9-NEXT:    ; return to shader part epilog
1144 ; GFX7-LABEL: s_load_constant_v6i16_align8:
1145 ; GFX7:       ; %bb.0:
1146 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1147 ; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x2
1148 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1149 ; GFX7-NEXT:    s_mov_b32 s0, s4
1150 ; GFX7-NEXT:    s_mov_b32 s1, s5
1151 ; GFX7-NEXT:    ; return to shader part epilog
1153 ; GFX6-LABEL: s_load_constant_v6i16_align8:
1154 ; GFX6:       ; %bb.0:
1155 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1156 ; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x2
1157 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1158 ; GFX6-NEXT:    s_mov_b32 s0, s4
1159 ; GFX6-NEXT:    s_mov_b32 s1, s5
1160 ; GFX6-NEXT:    ; return to shader part epilog
1161   %load = load <6 x i16>, ptr addrspace(4) %ptr, align 8
1162   %cast = bitcast <6 x i16> %load to <3 x i32>
1163   ret <3 x i32> %cast
; Shader (amdgpu_ps) variant: loads <12 x i8> with align 8 through an inreg
; pointer and returns the twelve bytes as separate elements.
; Expected: three dwords are loaded (one s_load_b96 on GFX12; s_load_dwordx2 +
; s_load_dword on the GFX9, GFX7 and GFX6 runs), then each dword is shifted
; right by 8, 16 and 24 with s_lshr_b32 so that s0-s11 each receive one
; element; s_mov_b32 copies place the unshifted dwords.
1166 define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg %ptr) {
1167 ; GFX12-LABEL: s_load_constant_v12i8_align8:
1168 ; GFX12:       ; %bb.0:
1169 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
1170 ; GFX12-NEXT:    s_wait_kmcnt 0x0
1171 ; GFX12-NEXT:    s_lshr_b32 s13, s0, 8
1172 ; GFX12-NEXT:    s_lshr_b32 s12, s0, 16
1173 ; GFX12-NEXT:    s_lshr_b32 s3, s0, 24
1174 ; GFX12-NEXT:    s_lshr_b32 s5, s1, 8
1175 ; GFX12-NEXT:    s_lshr_b32 s6, s1, 16
1176 ; GFX12-NEXT:    s_lshr_b32 s7, s1, 24
1177 ; GFX12-NEXT:    s_lshr_b32 s9, s2, 8
1178 ; GFX12-NEXT:    s_lshr_b32 s10, s2, 16
1179 ; GFX12-NEXT:    s_lshr_b32 s11, s2, 24
1180 ; GFX12-NEXT:    s_mov_b32 s4, s1
1181 ; GFX12-NEXT:    s_mov_b32 s8, s2
1182 ; GFX12-NEXT:    s_mov_b32 s1, s13
1183 ; GFX12-NEXT:    s_mov_b32 s2, s12
1184 ; GFX12-NEXT:    ; return to shader part epilog
1186 ; GFX9-LABEL: s_load_constant_v12i8_align8:
1187 ; GFX9:       ; %bb.0:
1188 ; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x0
1189 ; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x8
1190 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1191 ; GFX9-NEXT:    s_lshr_b32 s1, s12, 8
1192 ; GFX9-NEXT:    s_lshr_b32 s2, s12, 16
1193 ; GFX9-NEXT:    s_lshr_b32 s3, s12, 24
1194 ; GFX9-NEXT:    s_lshr_b32 s5, s13, 8
1195 ; GFX9-NEXT:    s_lshr_b32 s6, s13, 16
1196 ; GFX9-NEXT:    s_lshr_b32 s7, s13, 24
1197 ; GFX9-NEXT:    s_lshr_b32 s9, s8, 8
1198 ; GFX9-NEXT:    s_lshr_b32 s10, s8, 16
1199 ; GFX9-NEXT:    s_lshr_b32 s11, s8, 24
1200 ; GFX9-NEXT:    s_mov_b32 s0, s12
1201 ; GFX9-NEXT:    s_mov_b32 s4, s13
1202 ; GFX9-NEXT:    ; return to shader part epilog
1204 ; GFX7-LABEL: s_load_constant_v12i8_align8:
1205 ; GFX7:       ; %bb.0:
1206 ; GFX7-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x0
1207 ; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x2
1208 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1209 ; GFX7-NEXT:    s_lshr_b32 s1, s12, 8
1210 ; GFX7-NEXT:    s_lshr_b32 s2, s12, 16
1211 ; GFX7-NEXT:    s_lshr_b32 s3, s12, 24
1212 ; GFX7-NEXT:    s_lshr_b32 s5, s13, 8
1213 ; GFX7-NEXT:    s_lshr_b32 s6, s13, 16
1214 ; GFX7-NEXT:    s_lshr_b32 s7, s13, 24
1215 ; GFX7-NEXT:    s_lshr_b32 s9, s8, 8
1216 ; GFX7-NEXT:    s_lshr_b32 s10, s8, 16
1217 ; GFX7-NEXT:    s_lshr_b32 s11, s8, 24
1218 ; GFX7-NEXT:    s_mov_b32 s0, s12
1219 ; GFX7-NEXT:    s_mov_b32 s4, s13
1220 ; GFX7-NEXT:    ; return to shader part epilog
1222 ; GFX6-LABEL: s_load_constant_v12i8_align8:
1223 ; GFX6:       ; %bb.0:
1224 ; GFX6-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x0
1225 ; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x2
1226 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1227 ; GFX6-NEXT:    s_lshr_b32 s1, s12, 8
1228 ; GFX6-NEXT:    s_lshr_b32 s2, s12, 16
1229 ; GFX6-NEXT:    s_lshr_b32 s3, s12, 24
1230 ; GFX6-NEXT:    s_lshr_b32 s5, s13, 8
1231 ; GFX6-NEXT:    s_lshr_b32 s6, s13, 16
1232 ; GFX6-NEXT:    s_lshr_b32 s7, s13, 24
1233 ; GFX6-NEXT:    s_lshr_b32 s9, s8, 8
1234 ; GFX6-NEXT:    s_lshr_b32 s10, s8, 16
1235 ; GFX6-NEXT:    s_lshr_b32 s11, s8, 24
1236 ; GFX6-NEXT:    s_mov_b32 s0, s12
1237 ; GFX6-NEXT:    s_mov_b32 s4, s13
1238 ; GFX6-NEXT:    ; return to shader part epilog
1239   %load = load <12 x i8>, ptr addrspace(4) %ptr, align 8
1240   ret <12 x i8> %load
; Shader (amdgpu_ps) variant: <3 x i32> load with align 16 through an inreg
; pointer.
; Expected: the GFX12 run uses a single scalar s_load_b96; the GFX9, GFX7 and
; GFX6 runs share the common GCN prefix and expect one s_load_dwordx4 into
; s[0:3], with no split and no register copies.
1243 define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align16(ptr addrspace(4) inreg %ptr) {
1244 ; GFX12-LABEL: s_load_constant_v3i32_align16:
1245 ; GFX12:       ; %bb.0:
1246 ; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
1247 ; GFX12-NEXT:    s_wait_kmcnt 0x0
1248 ; GFX12-NEXT:    ; return to shader part epilog
1250 ; GCN-LABEL: s_load_constant_v3i32_align16:
1251 ; GCN:       ; %bb.0:
1252 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
1253 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1254 ; GCN-NEXT:    ; return to shader part epilog
1255   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 16
1256   ret <3 x i32> %load