1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
2 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -o - -passes=amdgpu-lower-kernel-arguments %s | FileCheck -check-prefixes=GCN,HSA %s
3 ; RUN: opt -mtriple=amdgcn-- -S -o - -passes=amdgpu-lower-kernel-arguments %s | FileCheck -check-prefixes=GCN,MESA %s
5 target datalayout = "A5"
7 define amdgpu_kernel void @kern_noargs() {
8 ; GCN-LABEL: @kern_noargs(
14 define amdgpu_kernel void @kern_i8(i8 %arg) #0 {
15 ; HSA-LABEL: @kern_i8(
16 ; HSA-NEXT: [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
17 ; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I8_KERNARG_SEGMENT]], i64 0
18 ; HSA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
19 ; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
20 ; HSA-NEXT: store i8 [[TMP2]], ptr addrspace(1) undef, align 1
23 ; MESA-LABEL: @kern_i8(
24 ; MESA-NEXT: [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
25 ; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I8_KERNARG_SEGMENT]], i64 36
26 ; MESA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
27 ; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
28 ; MESA-NEXT: store i8 [[TMP2]], ptr addrspace(1) undef, align 1
31 store i8 %arg, ptr addrspace(1) undef, align 1
35 define amdgpu_kernel void @kern_i16(i16 %arg) #0 {
36 ; HSA-LABEL: @kern_i16(
37 ; HSA-NEXT: [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
38 ; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I16_KERNARG_SEGMENT]], i64 0
39 ; HSA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
40 ; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
41 ; HSA-NEXT: store i16 [[TMP2]], ptr addrspace(1) undef, align 1
44 ; MESA-LABEL: @kern_i16(
45 ; MESA-NEXT: [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
46 ; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I16_KERNARG_SEGMENT]], i64 36
47 ; MESA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
48 ; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
49 ; MESA-NEXT: store i16 [[TMP2]], ptr addrspace(1) undef, align 1
52 store i16 %arg, ptr addrspace(1) undef, align 1
56 define amdgpu_kernel void @kern_f16(half %arg) #0 {
57 ; HSA-LABEL: @kern_f16(
58 ; HSA-NEXT: [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
59 ; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_F16_KERNARG_SEGMENT]], i64 0
60 ; HSA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
61 ; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
62 ; HSA-NEXT: [[ARG_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
63 ; HSA-NEXT: store half [[ARG_LOAD]], ptr addrspace(1) undef, align 1
66 ; MESA-LABEL: @kern_f16(
67 ; MESA-NEXT: [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
68 ; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_F16_KERNARG_SEGMENT]], i64 36
69 ; MESA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
70 ; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
71 ; MESA-NEXT: [[ARG_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
72 ; MESA-NEXT: store half [[ARG_LOAD]], ptr addrspace(1) undef, align 1
75 store half %arg, ptr addrspace(1) undef, align 1
79 define amdgpu_kernel void @kern_zeroext_i8(i8 zeroext %arg) #0 {
80 ; HSA-LABEL: @kern_zeroext_i8(
81 ; HSA-NEXT: [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
82 ; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 0
83 ; HSA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
84 ; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
85 ; HSA-NEXT: store i8 [[TMP2]], ptr addrspace(1) undef, align 1
88 ; MESA-LABEL: @kern_zeroext_i8(
89 ; MESA-NEXT: [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
90 ; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 36
91 ; MESA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
92 ; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
93 ; MESA-NEXT: store i8 [[TMP2]], ptr addrspace(1) undef, align 1
96 store i8 %arg, ptr addrspace(1) undef, align 1
100 define amdgpu_kernel void @kern_zeroext_i16(i16 zeroext %arg) #0 {
101 ; HSA-LABEL: @kern_zeroext_i16(
102 ; HSA-NEXT: [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
103 ; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 0
104 ; HSA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
105 ; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
106 ; HSA-NEXT: store i16 [[TMP2]], ptr addrspace(1) undef, align 1
109 ; MESA-LABEL: @kern_zeroext_i16(
110 ; MESA-NEXT: [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
111 ; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 36
112 ; MESA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
113 ; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
114 ; MESA-NEXT: store i16 [[TMP2]], ptr addrspace(1) undef, align 1
115 ; MESA-NEXT: ret void
117 store i16 %arg, ptr addrspace(1) undef, align 1
121 define amdgpu_kernel void @kern_signext_i8(i8 signext %arg) #0 {
122 ; HSA-LABEL: @kern_signext_i8(
123 ; HSA-NEXT: [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
124 ; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 0
125 ; HSA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
126 ; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
127 ; HSA-NEXT: store i8 [[TMP2]], ptr addrspace(1) undef, align 1
130 ; MESA-LABEL: @kern_signext_i8(
131 ; MESA-NEXT: [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
132 ; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 36
133 ; MESA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
134 ; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
135 ; MESA-NEXT: store i8 [[TMP2]], ptr addrspace(1) undef, align 1
136 ; MESA-NEXT: ret void
138 store i8 %arg, ptr addrspace(1) undef, align 1
142 define amdgpu_kernel void @kern_signext_i16(i16 signext %arg) #0 {
143 ; HSA-LABEL: @kern_signext_i16(
144 ; HSA-NEXT: [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
145 ; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 0
146 ; HSA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
147 ; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
148 ; HSA-NEXT: store i16 [[TMP2]], ptr addrspace(1) undef, align 1
151 ; MESA-LABEL: @kern_signext_i16(
152 ; MESA-NEXT: [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
153 ; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 36
154 ; MESA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
155 ; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
156 ; MESA-NEXT: store i16 [[TMP2]], ptr addrspace(1) undef, align 1
157 ; MESA-NEXT: ret void
159 store i16 %arg, ptr addrspace(1) undef, align 1
163 define amdgpu_kernel void @kern_i8_i8(i8 %arg0, i8 %arg1) {
164 ; HSA-LABEL: @kern_i8_i8(
165 ; HSA-NEXT: [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
166 ; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0
167 ; HSA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
168 ; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
169 ; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0
170 ; HSA-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
171 ; HSA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
172 ; HSA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
173 ; HSA-NEXT: store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
174 ; HSA-NEXT: store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
177 ; MESA-LABEL: @kern_i8_i8(
178 ; MESA-NEXT: [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
179 ; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36
180 ; MESA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
181 ; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
182 ; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36
183 ; MESA-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
184 ; MESA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
185 ; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
186 ; MESA-NEXT: store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
187 ; MESA-NEXT: store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
188 ; MESA-NEXT: ret void
190 store volatile i8 %arg0, ptr addrspace(1) undef, align 1
191 store volatile i8 %arg1, ptr addrspace(1) undef, align 1
195 define amdgpu_kernel void @kern_v3i8(<3 x i8> %arg) {
196 ; HSA-LABEL: @kern_v3i8(
197 ; HSA-NEXT: [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
198 ; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V3I8_KERNARG_SEGMENT]], i64 0
199 ; HSA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
200 ; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
201 ; HSA-NEXT: [[ARG_LOAD:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
202 ; HSA-NEXT: store <3 x i8> [[ARG_LOAD]], ptr addrspace(1) undef, align 4
205 ; MESA-LABEL: @kern_v3i8(
206 ; MESA-NEXT: [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
207 ; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V3I8_KERNARG_SEGMENT]], i64 36
208 ; MESA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
209 ; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
210 ; MESA-NEXT: [[ARG_LOAD:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
211 ; MESA-NEXT: store <3 x i8> [[ARG_LOAD]], ptr addrspace(1) undef, align 4
212 ; MESA-NEXT: ret void
214 store <3 x i8> %arg, ptr addrspace(1) undef, align 4
218 define amdgpu_kernel void @kern_i24(i24 %arg0) {
219 ; HSA-LABEL: @kern_i24(
220 ; HSA-NEXT: [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
221 ; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I24_KERNARG_SEGMENT]], i64 0
222 ; HSA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
223 ; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
224 ; HSA-NEXT: store i24 [[TMP2]], ptr addrspace(1) undef, align 4
227 ; MESA-LABEL: @kern_i24(
228 ; MESA-NEXT: [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
229 ; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I24_KERNARG_SEGMENT]], i64 36
230 ; MESA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
231 ; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
232 ; MESA-NEXT: store i24 [[TMP2]], ptr addrspace(1) undef, align 4
233 ; MESA-NEXT: ret void
235 store i24 %arg0, ptr addrspace(1) undef
239 define amdgpu_kernel void @kern_i32(i32 %arg0) {
240 ; HSA-LABEL: @kern_i32(
241 ; HSA-NEXT: [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
242 ; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I32_KERNARG_SEGMENT]], i64 0
243 ; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
244 ; HSA-NEXT: store i32 [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
247 ; MESA-LABEL: @kern_i32(
248 ; MESA-NEXT: [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
249 ; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I32_KERNARG_SEGMENT]], i64 36
250 ; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
251 ; MESA-NEXT: store i32 [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
252 ; MESA-NEXT: ret void
254 store i32 %arg0, ptr addrspace(1) undef
258 define amdgpu_kernel void @kern_f32(float %arg0) {
259 ; HSA-LABEL: @kern_f32(
260 ; HSA-NEXT: [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
261 ; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_F32_KERNARG_SEGMENT]], i64 0
262 ; HSA-NEXT: [[ARG0_LOAD:%.*]] = load float, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
263 ; HSA-NEXT: store float [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
266 ; MESA-LABEL: @kern_f32(
267 ; MESA-NEXT: [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
268 ; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_F32_KERNARG_SEGMENT]], i64 36
269 ; MESA-NEXT: [[ARG0_LOAD:%.*]] = load float, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
270 ; MESA-NEXT: store float [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
271 ; MESA-NEXT: ret void
273 store float %arg0, ptr addrspace(1) undef
277 define amdgpu_kernel void @kern_v3i32(<3 x i32> %arg0) {
278 ; HSA-LABEL: @kern_v3i32(
279 ; HSA-NEXT: [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
280 ; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V3I32_KERNARG_SEGMENT]], i64 0
281 ; HSA-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
282 ; HSA-NEXT: [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
283 ; HSA-NEXT: store <3 x i32> [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
286 ; MESA-LABEL: @kern_v3i32(
287 ; MESA-NEXT: [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
288 ; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V3I32_KERNARG_SEGMENT]], i64 36
289 ; MESA-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
290 ; MESA-NEXT: [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
291 ; MESA-NEXT: store <3 x i32> [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
292 ; MESA-NEXT: ret void
294 store <3 x i32> %arg0, ptr addrspace(1) undef, align 4
298 define amdgpu_kernel void @kern_v8i32(<8 x i32> %arg) #0 {
299 ; HSA-LABEL: @kern_v8i32(
300 ; HSA-NEXT: [[KERN_V8I32_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(88) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
301 ; HSA-NEXT: [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V8I32_KERNARG_SEGMENT]], i64 0
302 ; HSA-NEXT: [[ARG_LOAD:%.*]] = load <8 x i32>, ptr addrspace(4) [[ARG_KERNARG_OFFSET]], align 16, !invariant.load !0
303 ; HSA-NEXT: store <8 x i32> [[ARG_LOAD]], ptr addrspace(1) undef, align 32
306 ; MESA-LABEL: @kern_v8i32(
307 ; MESA-NEXT: [[KERN_V8I32_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(88) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
308 ; MESA-NEXT: [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V8I32_KERNARG_SEGMENT]], i64 36
309 ; MESA-NEXT: [[ARG_LOAD:%.*]] = load <8 x i32>, ptr addrspace(4) [[ARG_KERNARG_OFFSET]], align 4, !invariant.load !0
310 ; MESA-NEXT: store <8 x i32> [[ARG_LOAD]], ptr addrspace(1) undef, align 32
311 ; MESA-NEXT: ret void
313 store <8 x i32> %arg, ptr addrspace(1) undef
317 define amdgpu_kernel void @kern_v8i64(<8 x i64> %arg) #0 {
318 ; HSA-LABEL: @kern_v8i64(
319 ; HSA-NEXT: [[KERN_V8I64_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(120) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
320 ; HSA-NEXT: [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V8I64_KERNARG_SEGMENT]], i64 0
321 ; HSA-NEXT: [[ARG_LOAD:%.*]] = load <8 x i64>, ptr addrspace(4) [[ARG_KERNARG_OFFSET]], align 16, !invariant.load !0
322 ; HSA-NEXT: store <8 x i64> [[ARG_LOAD]], ptr addrspace(1) undef, align 64
325 ; MESA-LABEL: @kern_v8i64(
326 ; MESA-NEXT: [[KERN_V8I64_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(120) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
327 ; MESA-NEXT: [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V8I64_KERNARG_SEGMENT]], i64 36
328 ; MESA-NEXT: [[ARG_LOAD:%.*]] = load <8 x i64>, ptr addrspace(4) [[ARG_KERNARG_OFFSET]], align 4, !invariant.load !0
329 ; MESA-NEXT: store <8 x i64> [[ARG_LOAD]], ptr addrspace(1) undef, align 64
330 ; MESA-NEXT: ret void
332 store <8 x i64> %arg, ptr addrspace(1) undef
336 define amdgpu_kernel void @kern_v16i64(<16 x i64> %arg) #0 {
337 ; HSA-LABEL: @kern_v16i64(
338 ; HSA-NEXT: [[KERN_V16I64_KERNARG_SEGMENT:%.*]] = call nonnull align 128 dereferenceable(184) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
339 ; HSA-NEXT: [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V16I64_KERNARG_SEGMENT]], i64 0
340 ; HSA-NEXT: [[ARG_LOAD:%.*]] = load <16 x i64>, ptr addrspace(4) [[ARG_KERNARG_OFFSET]], align 16, !invariant.load !0
341 ; HSA-NEXT: store <16 x i64> [[ARG_LOAD]], ptr addrspace(1) undef, align 128
344 ; MESA-LABEL: @kern_v16i64(
345 ; MESA-NEXT: [[KERN_V16I64_KERNARG_SEGMENT:%.*]] = call nonnull align 128 dereferenceable(184) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
346 ; MESA-NEXT: [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V16I64_KERNARG_SEGMENT]], i64 36
347 ; MESA-NEXT: [[ARG_LOAD:%.*]] = load <16 x i64>, ptr addrspace(4) [[ARG_KERNARG_OFFSET]], align 4, !invariant.load !0
348 ; MESA-NEXT: store <16 x i64> [[ARG_LOAD]], ptr addrspace(1) undef, align 128
349 ; MESA-NEXT: ret void
351 store <16 x i64> %arg, ptr addrspace(1) undef
355 define amdgpu_kernel void @kern_i32_v3i32(i32 %arg0, <3 x i32> %arg1) {
356 ; HSA-LABEL: @kern_i32_v3i32(
357 ; HSA-NEXT: [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(88) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
358 ; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 0
359 ; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
360 ; HSA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 16
361 ; HSA-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load !0
362 ; HSA-NEXT: [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
363 ; HSA-NEXT: store i32 [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
364 ; HSA-NEXT: store <3 x i32> [[ARG1_LOAD]], ptr addrspace(1) undef, align 4
367 ; MESA-LABEL: @kern_i32_v3i32(
368 ; MESA-NEXT: [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(88) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
369 ; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 36
370 ; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
371 ; MESA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 52
372 ; MESA-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 4, !invariant.load !0
373 ; MESA-NEXT: [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
374 ; MESA-NEXT: store i32 [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
375 ; MESA-NEXT: store <3 x i32> [[ARG1_LOAD]], ptr addrspace(1) undef, align 4
376 ; MESA-NEXT: ret void
378 store i32 %arg0, ptr addrspace(1) undef
379 store <3 x i32> %arg1, ptr addrspace(1) undef, align 4
383 %struct.a = type { i32, i8, [4 x i8] }
384 %struct.b.packed = type { i8, i32, [3 x i16], <2 x double> }
386 define amdgpu_kernel void @kern_struct_a(%struct.a %arg0) {
387 ; HSA-LABEL: @kern_struct_a(
388 ; HSA-NEXT: [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
389 ; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 0
390 ; HSA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_A:%.*]], ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
391 ; HSA-NEXT: store [[STRUCT_A]] [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
394 ; MESA-LABEL: @kern_struct_a(
395 ; MESA-NEXT: [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
396 ; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 36
397 ; MESA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_A:%.*]], ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
398 ; MESA-NEXT: store [[STRUCT_A]] [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
399 ; MESA-NEXT: ret void
401 store %struct.a %arg0, ptr addrspace(1) undef
405 define amdgpu_kernel void @kern_struct_b_packed(%struct.b.packed %arg0) #0 {
406 ; HSA-LABEL: @kern_struct_b_packed(
407 ; HSA-NEXT: [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(88) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
408 ; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 0
409 ; HSA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED:%.*]], ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
410 ; HSA-NEXT: store [[STRUCT_B_PACKED]] [[ARG0_LOAD]], ptr addrspace(1) undef, align 16
413 ; MESA-LABEL: @kern_struct_b_packed(
414 ; MESA-NEXT: [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(88) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
415 ; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 36
416 ; MESA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED:%.*]], ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
417 ; MESA-NEXT: store [[STRUCT_B_PACKED]] [[ARG0_LOAD]], ptr addrspace(1) undef, align 16
418 ; MESA-NEXT: ret void
420 store %struct.b.packed %arg0, ptr addrspace(1) undef
424 define amdgpu_kernel void @kern_implicit_arg_num_bytes(i32 %arg0) #1 {
425 ; HSA-LABEL: @kern_implicit_arg_num_bytes(
426 ; HSA-NEXT: [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
427 ; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 0
428 ; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
429 ; HSA-NEXT: store i32 [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
432 ; MESA-LABEL: @kern_implicit_arg_num_bytes(
433 ; MESA-NEXT: [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
434 ; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 36
435 ; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
436 ; MESA-NEXT: store i32 [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
437 ; MESA-NEXT: ret void
439 store i32 %arg0, ptr addrspace(1) undef
443 define amdgpu_kernel void @kernel_implicitarg_no_struct_align(<16 x i32>, i32 %arg1) #1 {
444 ; HSA-LABEL: @kernel_implicitarg_no_struct_align(
445 ; HSA-NEXT: [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(112) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
446 ; HSA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT]], i64 64
447 ; HSA-NEXT: [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load !0
448 ; HSA-NEXT: store i32 [[ARG1_LOAD]], ptr addrspace(1) undef, align 4
451 ; MESA-LABEL: @kernel_implicitarg_no_struct_align(
452 ; MESA-NEXT: [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(108) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
453 ; MESA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT]], i64 100
454 ; MESA-NEXT: [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 4, !invariant.load !0
455 ; MESA-NEXT: store i32 [[ARG1_LOAD]], ptr addrspace(1) undef, align 4
456 ; MESA-NEXT: ret void
458 store i32 %arg1, ptr addrspace(1) undef
462 define amdgpu_kernel void @kern_lds_ptr(ptr addrspace(3) %lds) #0 {
463 ; HSA-LABEL: @kern_lds_ptr(
464 ; HSA-NEXT: [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
465 ; HSA-NEXT: [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 0
466 ; HSA-NEXT: [[LDS_LOAD:%.*]] = load ptr addrspace(3), ptr addrspace(4) [[LDS_KERNARG_OFFSET]], align 16, !invariant.load !0
467 ; HSA-NEXT: store i32 0, ptr addrspace(3) [[LDS_LOAD]], align 4
470 ; MESA-LABEL: @kern_lds_ptr(
471 ; MESA-NEXT: [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
472 ; MESA-NEXT: [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 36
473 ; MESA-NEXT: [[LDS_LOAD:%.*]] = load ptr addrspace(3), ptr addrspace(4) [[LDS_KERNARG_OFFSET]], align 4, !invariant.load !0
474 ; MESA-NEXT: store i32 0, ptr addrspace(3) [[LDS_LOAD]], align 4
475 ; MESA-NEXT: ret void
477 store i32 0, ptr addrspace(3) %lds, align 4
481 define amdgpu_kernel void @kern_lds_ptr_si(ptr addrspace(3) %lds) #2 {
482 ; GCN-LABEL: @kern_lds_ptr_si(
483 ; GCN-NEXT: [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
484 ; GCN-NEXT: store i32 0, ptr addrspace(3) [[LDS:%.*]], align 4
487 store i32 0, ptr addrspace(3) %lds, align 4
; Two i8 args packed into the first kernarg dword: the pass loads the whole
; i32 for each arg (offset 0 on HSA, 36 on MESA) and extracts the byte with
; lshr (8 for %arg1) + trunc, rather than emitting sub-dword loads.
491 define amdgpu_kernel void @kern_realign_i8_i8(i8 %arg0, i8 %arg1) #0 {
492 ; HSA-LABEL: @kern_realign_i8_i8(
493 ; HSA-NEXT:    [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
494 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0
495 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
496 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
497 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0
498 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
499 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
500 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
501 ; HSA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
502 ; HSA-NEXT:    store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
505 ; MESA-LABEL: @kern_realign_i8_i8(
506 ; MESA-NEXT:    [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
507 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36
508 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
509 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
510 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36
511 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
512 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
513 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
514 ; MESA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
515 ; MESA-NEXT:    store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
516 ; MESA-NEXT:    ret void
518   store volatile i8 %arg0, ptr addrspace(1) undef
519   store volatile i8 %arg1, ptr addrspace(1) undef
; Three i8 args in one dword: each is recovered from an i32 load of the same
; dword with shift amounts 0, 8 and 16 followed by trunc to i8.
523 define amdgpu_kernel void @kern_realign_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2) #0 {
524 ; HSA-LABEL: @kern_realign_i8_i8_i8(
525 ; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
526 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
527 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
528 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
529 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
530 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
531 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
532 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
533 ; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
534 ; HSA-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
535 ; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
536 ; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
537 ; HSA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
538 ; HSA-NEXT:    store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
539 ; HSA-NEXT:    store volatile i8 [[TMP8]], ptr addrspace(1) undef, align 1
542 ; MESA-LABEL: @kern_realign_i8_i8_i8(
543 ; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
544 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
545 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
546 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
547 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
548 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
549 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
550 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
551 ; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
552 ; MESA-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
553 ; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
554 ; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
555 ; MESA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
556 ; MESA-NEXT:    store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
557 ; MESA-NEXT:    store volatile i8 [[TMP8]], ptr addrspace(1) undef, align 1
558 ; MESA-NEXT:    ret void
560   store volatile i8 %arg0, ptr addrspace(1) undef
561   store volatile i8 %arg1, ptr addrspace(1) undef
562   store volatile i8 %arg2, ptr addrspace(1) undef
; Four i8 args completely filling one dword: byte extraction uses shift
; amounts 0, 8, 16 and 24 from repeated i32 loads of the same dword.
566 define amdgpu_kernel void @kern_realign_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) #0 {
567 ; HSA-LABEL: @kern_realign_i8_i8_i8_i8(
568 ; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
569 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
570 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
571 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
572 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
573 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
574 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
575 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
576 ; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
577 ; HSA-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
578 ; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
579 ; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
580 ; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
581 ; HSA-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(4) [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
582 ; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
583 ; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
584 ; HSA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
585 ; HSA-NEXT:    store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
586 ; HSA-NEXT:    store volatile i8 [[TMP8]], ptr addrspace(1) undef, align 1
587 ; HSA-NEXT:    store volatile i8 [[TMP11]], ptr addrspace(1) undef, align 1
590 ; MESA-LABEL: @kern_realign_i8_i8_i8_i8(
591 ; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
592 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
593 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
594 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
595 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
596 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
597 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
598 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
599 ; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
600 ; MESA-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
601 ; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
602 ; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
603 ; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
604 ; MESA-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(4) [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
605 ; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
606 ; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
607 ; MESA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
608 ; MESA-NEXT:    store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
609 ; MESA-NEXT:    store volatile i8 [[TMP8]], ptr addrspace(1) undef, align 1
610 ; MESA-NEXT:    store volatile i8 [[TMP11]], ptr addrspace(1) undef, align 1
611 ; MESA-NEXT:    ret void
613   store volatile i8 %arg0, ptr addrspace(1) undef
614   store volatile i8 %arg1, ptr addrspace(1) undef
615   store volatile i8 %arg2, ptr addrspace(1) undef
616   store volatile i8 %arg3, ptr addrspace(1) undef
; i8 followed by <3 x i8>: the vector is placed in the next dword (offset 4
; HSA / 40 MESA), loaded as i32, truncated to i24 and bitcast to <3 x i8>.
620 define amdgpu_kernel void @kern_realign_i8_v3i8(i8 %arg0, <3 x i8> %arg1) #0 {
621 ; HSA-LABEL: @kern_realign_i8_v3i8(
622 ; HSA-NEXT:    [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
623 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 0
624 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
625 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
626 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 4
627 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
628 ; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24
629 ; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8>
630 ; HSA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
631 ; HSA-NEXT:    store volatile <3 x i8> [[ARG1_LOAD]], ptr addrspace(1) undef, align 4
634 ; MESA-LABEL: @kern_realign_i8_v3i8(
635 ; MESA-NEXT:    [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
636 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 36
637 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
638 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
639 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 40
640 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load !0
641 ; MESA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24
642 ; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8>
643 ; MESA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
644 ; MESA-NEXT:    store volatile <3 x i8> [[ARG1_LOAD]], ptr addrspace(1) undef, align 4
645 ; MESA-NEXT:    ret void
647   store volatile i8 %arg0, ptr addrspace(1) undef
648   store volatile <3 x i8> %arg1, ptr addrspace(1) undef
; i8 then i16 sharing one dword: the i16 sits at byte offset 2, so it is
; recovered with lshr 16 + trunc to i16 from an i32 load of the same dword.
652 define amdgpu_kernel void @kern_realign_i8_i16(i8 %arg0, i16 %arg1) #0 {
653 ; HSA-LABEL: @kern_realign_i8_i16(
654 ; HSA-NEXT:    [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
655 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0
656 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
657 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
658 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0
659 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
660 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
661 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
662 ; HSA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
663 ; HSA-NEXT:    store volatile i16 [[TMP5]], ptr addrspace(1) undef, align 2
666 ; MESA-LABEL: @kern_realign_i8_i16(
667 ; MESA-NEXT:    [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
668 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36
669 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
670 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
671 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36
672 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
673 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
674 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
675 ; MESA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
676 ; MESA-NEXT:    store volatile i16 [[TMP5]], ptr addrspace(1) undef, align 2
677 ; MESA-NEXT:    ret void
679   store volatile i8 %arg0, ptr addrspace(1) undef
680   store volatile i16 %arg1, ptr addrspace(1) undef
; Two i1 args: each occupies its own byte within the first dword, mirroring
; the i8 case — i32 load, lshr 8 for %arg1, then trunc to i1.
684 define amdgpu_kernel void @kern_realign_i1_i1(i1 %arg0, i1 %arg1) #0 {
685 ; HSA-LABEL: @kern_realign_i1_i1(
686 ; HSA-NEXT:    [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
687 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0
688 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
689 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
690 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0
691 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
692 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
693 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
694 ; HSA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
695 ; HSA-NEXT:    store volatile i1 [[TMP5]], ptr addrspace(1) undef, align 1
698 ; MESA-LABEL: @kern_realign_i1_i1(
699 ; MESA-NEXT:    [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
700 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36
701 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
702 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
703 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36
704 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
705 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
706 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
707 ; MESA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
708 ; MESA-NEXT:    store volatile i1 [[TMP5]], ptr addrspace(1) undef, align 1
709 ; MESA-NEXT:    ret void
711   store volatile i1 %arg0, ptr addrspace(1) undef
712   store volatile i1 %arg1, ptr addrspace(1) undef
; Three i1 args, one byte each within the first dword: extracted via shift
; amounts 0, 8 and 16 followed by trunc to i1.
716 define amdgpu_kernel void @kern_realign_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2) #0 {
717 ; HSA-LABEL: @kern_realign_i1_i1_i1(
718 ; HSA-NEXT:    [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
719 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
720 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
721 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
722 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
723 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
724 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
725 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
726 ; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
727 ; HSA-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
728 ; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
729 ; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
730 ; HSA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
731 ; HSA-NEXT:    store volatile i1 [[TMP5]], ptr addrspace(1) undef, align 1
732 ; HSA-NEXT:    store volatile i1 [[TMP8]], ptr addrspace(1) undef, align 1
735 ; MESA-LABEL: @kern_realign_i1_i1_i1(
736 ; MESA-NEXT:    [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
737 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
738 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
739 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
740 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
741 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
742 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
743 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
744 ; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
745 ; MESA-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
746 ; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
747 ; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
748 ; MESA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
749 ; MESA-NEXT:    store volatile i1 [[TMP5]], ptr addrspace(1) undef, align 1
750 ; MESA-NEXT:    store volatile i1 [[TMP8]], ptr addrspace(1) undef, align 1
751 ; MESA-NEXT:    ret void
753   store volatile i1 %arg0, ptr addrspace(1) undef
754   store volatile i1 %arg1, ptr addrspace(1) undef
755   store volatile i1 %arg2, ptr addrspace(1) undef
; Four i1 args, one byte each, filling the first dword: extracted with shift
; amounts 0, 8, 16 and 24 followed by trunc to i1.
759 define amdgpu_kernel void @kern_realign_i1_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3) #0 {
760 ; HSA-LABEL: @kern_realign_i1_i1_i1_i1(
761 ; HSA-NEXT:    [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
762 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
763 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
764 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
765 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
766 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
767 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
768 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
769 ; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
770 ; HSA-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
771 ; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
772 ; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
773 ; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
774 ; HSA-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(4) [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
775 ; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
776 ; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i1
777 ; HSA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
778 ; HSA-NEXT:    store volatile i1 [[TMP5]], ptr addrspace(1) undef, align 1
779 ; HSA-NEXT:    store volatile i1 [[TMP8]], ptr addrspace(1) undef, align 1
780 ; HSA-NEXT:    store volatile i1 [[TMP11]], ptr addrspace(1) undef, align 1
783 ; MESA-LABEL: @kern_realign_i1_i1_i1_i1(
784 ; MESA-NEXT:    [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
785 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
786 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
787 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
788 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
789 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
790 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
791 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
792 ; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
793 ; MESA-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
794 ; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
795 ; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
796 ; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
797 ; MESA-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(4) [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
798 ; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
799 ; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i1
800 ; MESA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
801 ; MESA-NEXT:    store volatile i1 [[TMP5]], ptr addrspace(1) undef, align 1
802 ; MESA-NEXT:    store volatile i1 [[TMP8]], ptr addrspace(1) undef, align 1
803 ; MESA-NEXT:    store volatile i1 [[TMP11]], ptr addrspace(1) undef, align 1
804 ; MESA-NEXT:    ret void
806   store volatile i1 %arg0, ptr addrspace(1) undef
807   store volatile i1 %arg1, ptr addrspace(1) undef
808   store volatile i1 %arg2, ptr addrspace(1) undef
809   store volatile i1 %arg3, ptr addrspace(1) undef
; i1 followed by <3 x i1>: the vector starts at byte 1 of the same dword, so
; it is recovered with lshr 8, trunc to i3 and a bitcast to <3 x i1>.
813 define amdgpu_kernel void @kern_realign_i1_v3i1(i1 %arg0, <3 x i1> %arg1) #0 {
814 ; HSA-LABEL: @kern_realign_i1_v3i1(
815 ; HSA-NEXT:    [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
816 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 0
817 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
818 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
819 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 0
820 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
821 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
822 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i3
823 ; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP5]] to <3 x i1>
824 ; HSA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
825 ; HSA-NEXT:    store volatile <3 x i1> [[ARG1_LOAD]], ptr addrspace(1) undef, align 1
828 ; MESA-LABEL: @kern_realign_i1_v3i1(
829 ; MESA-NEXT:    [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
830 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 36
831 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
832 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
833 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 36
834 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
835 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
836 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i3
837 ; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP5]] to <3 x i1>
838 ; MESA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
839 ; MESA-NEXT:    store volatile <3 x i1> [[ARG1_LOAD]], ptr addrspace(1) undef, align 1
840 ; MESA-NEXT:    ret void
842   store volatile i1 %arg0, ptr addrspace(1) undef
843   store volatile <3 x i1> %arg1, ptr addrspace(1) undef
; i1 then i16 sharing one dword: the i16 lands at byte offset 2 and is
; recovered with lshr 16 + trunc to i16 from an i32 load of the same dword.
847 define amdgpu_kernel void @kern_realign_i1_i16(i1 %arg0, i16 %arg1) #0 {
848 ; HSA-LABEL: @kern_realign_i1_i16(
849 ; HSA-NEXT:    [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
850 ; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
851 ; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
852 ; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
853 ; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
854 ; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
855 ; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
856 ; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
857 ; HSA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
858 ; HSA-NEXT:    store volatile i16 [[TMP5]], ptr addrspace(1) undef, align 2
861 ; MESA-LABEL: @kern_realign_i1_i16(
862 ; MESA-NEXT:    [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
863 ; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36
864 ; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
865 ; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
866 ; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36
867 ; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
868 ; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
869 ; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
870 ; MESA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
871 ; MESA-NEXT:    store volatile i16 [[TMP5]], ptr addrspace(1) undef, align 2
872 ; MESA-NEXT:    ret void
874   store volatile i1 %arg0, ptr addrspace(1) undef
875   store volatile i16 %arg1, ptr addrspace(1) undef
879 define amdgpu_kernel void @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3, i8 %arg4, i8 %arg5, i8 %arg6, i8 %arg7) #0 {
880 ; HSA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
881 ; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
882 ; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
883 ; HSA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
884 ; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
885 ; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
886 ; HSA-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
887 ; HSA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
888 ; HSA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
889 ; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
890 ; HSA-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
891 ; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
892 ; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
893 ; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
894 ; HSA-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(4) [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
895 ; HSA-NEXT: [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
896 ; HSA-NEXT: [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
897 ; HSA-NEXT: [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
898 ; HSA-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
899 ; HSA-NEXT: [[TMP13:%.*]] = lshr i32 [[TMP12]], 8
900 ; HSA-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8
901 ; HSA-NEXT: [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
902 ; HSA-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(4) [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
903 ; HSA-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP15]], 16
904 ; HSA-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
905 ; HSA-NEXT: [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
906 ; HSA-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(4) [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
907 ; HSA-NEXT: [[TMP19:%.*]] = lshr i32 [[TMP18]], 24
908 ; HSA-NEXT: [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
909 ; HSA-NEXT: store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
910 ; HSA-NEXT: store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
911 ; HSA-NEXT: store volatile i8 [[TMP8]], ptr addrspace(1) undef, align 1
912 ; HSA-NEXT: store volatile i8 [[TMP11]], ptr addrspace(1) undef, align 1
913 ; HSA-NEXT: store volatile i8 [[TMP14]], ptr addrspace(1) undef, align 1
914 ; HSA-NEXT: store volatile i8 [[TMP17]], ptr addrspace(1) undef, align 1
915 ; HSA-NEXT: store volatile i8 [[TMP20]], ptr addrspace(1) undef, align 1
918 ; MESA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
919 ; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
920 ; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
921 ; MESA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
922 ; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
923 ; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
924 ; MESA-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
925 ; MESA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
926 ; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
927 ; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
928 ; MESA-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
929 ; MESA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
930 ; MESA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
931 ; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
932 ; MESA-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(4) [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
933 ; MESA-NEXT: [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
934 ; MESA-NEXT: [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
935 ; MESA-NEXT: [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
936 ; MESA-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(4) [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load !0
937 ; MESA-NEXT: [[TMP13:%.*]] = lshr i32 [[TMP12]], 8
938 ; MESA-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8
939 ; MESA-NEXT: [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
940 ; MESA-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(4) [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load !0
941 ; MESA-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP15]], 16
942 ; MESA-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
943 ; MESA-NEXT: [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
944 ; MESA-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(4) [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load !0
945 ; MESA-NEXT: [[TMP19:%.*]] = lshr i32 [[TMP18]], 24
946 ; MESA-NEXT: [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
947 ; MESA-NEXT: store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
948 ; MESA-NEXT: store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
949 ; MESA-NEXT: store volatile i8 [[TMP8]], ptr addrspace(1) undef, align 1
950 ; MESA-NEXT: store volatile i8 [[TMP11]], ptr addrspace(1) undef, align 1
951 ; MESA-NEXT: store volatile i8 [[TMP14]], ptr addrspace(1) undef, align 1
952 ; MESA-NEXT: store volatile i8 [[TMP17]], ptr addrspace(1) undef, align 1
953 ; MESA-NEXT: store volatile i8 [[TMP20]], ptr addrspace(1) undef, align 1
954 ; MESA-NEXT: ret void
956 store volatile i8 %arg0, ptr addrspace(1) undef
957 store volatile i8 %arg1, ptr addrspace(1) undef
958 store volatile i8 %arg2, ptr addrspace(1) undef
959 store volatile i8 %arg3, ptr addrspace(1) undef
960 store volatile i8 %arg5, ptr addrspace(1) undef
961 store volatile i8 %arg6, ptr addrspace(1) undef
962 store volatile i8 %arg7, ptr addrspace(1) undef
966 define amdgpu_kernel void @kern_realign_f16_f16(half %arg0, half %arg1) #0 {
967 ; HSA-LABEL: @kern_realign_f16_f16(
968 ; HSA-NEXT: [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
969 ; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0
970 ; HSA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
971 ; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
972 ; HSA-NEXT: [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
973 ; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0
974 ; HSA-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
975 ; HSA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
976 ; HSA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
977 ; HSA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP5]] to half
978 ; HSA-NEXT: store volatile half [[ARG0_LOAD]], ptr addrspace(1) undef, align 2
979 ; HSA-NEXT: store volatile half [[ARG1_LOAD]], ptr addrspace(1) undef, align 2
982 ; MESA-LABEL: @kern_realign_f16_f16(
983 ; MESA-NEXT: [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
984 ; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36
985 ; MESA-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
986 ; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
987 ; MESA-NEXT: [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
988 ; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36
989 ; MESA-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
990 ; MESA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
991 ; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
992 ; MESA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP5]] to half
993 ; MESA-NEXT: store volatile half [[ARG0_LOAD]], ptr addrspace(1) undef, align 2
994 ; MESA-NEXT: store volatile half [[ARG1_LOAD]], ptr addrspace(1) undef, align 2
995 ; MESA-NEXT: ret void
997 store volatile half %arg0, ptr addrspace(1) undef
998 store volatile half %arg1, ptr addrspace(1) undef
1002 define amdgpu_kernel void @kern_global_ptr(ptr addrspace(1) %ptr) #0 {
1003 ; HSA-LABEL: @kern_global_ptr(
1004 ; HSA-NEXT: [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1005 ; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
1006 ; HSA-NEXT: [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0
1007 ; HSA-NEXT: store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
1008 ; HSA-NEXT: ret void
1010 ; MESA-LABEL: @kern_global_ptr(
1011 ; MESA-NEXT: [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1012 ; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
1013 ; MESA-NEXT: [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0
1014 ; MESA-NEXT: store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
1015 ; MESA-NEXT: ret void
1017 store volatile ptr addrspace(1) %ptr, ptr addrspace(1) undef
1021 define amdgpu_kernel void @kern_global_ptr_dereferencable(ptr addrspace(1) dereferenceable(42) %ptr) #0 {
1022 ; HSA-LABEL: @kern_global_ptr_dereferencable(
1023 ; HSA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1024 ; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 0
1025 ; HSA-NEXT: [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !dereferenceable !1
1026 ; HSA-NEXT: store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
1027 ; HSA-NEXT: ret void
1029 ; MESA-LABEL: @kern_global_ptr_dereferencable(
1030 ; MESA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1031 ; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 36
1032 ; MESA-NEXT: [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !dereferenceable !1
1033 ; MESA-NEXT: store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
1034 ; MESA-NEXT: ret void
1036 store volatile ptr addrspace(1) %ptr, ptr addrspace(1) undef
1040 define amdgpu_kernel void @kern_global_ptr_dereferencable_or_null(ptr addrspace(1) dereferenceable_or_null(128) %ptr) #0 {
1041 ; HSA-LABEL: @kern_global_ptr_dereferencable_or_null(
1042 ; HSA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1043 ; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 0
1044 ; HSA-NEXT: [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !dereferenceable_or_null !2
1045 ; HSA-NEXT: store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
1046 ; HSA-NEXT: ret void
1048 ; MESA-LABEL: @kern_global_ptr_dereferencable_or_null(
1049 ; MESA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1050 ; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 36
1051 ; MESA-NEXT: [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !dereferenceable_or_null !2
1052 ; MESA-NEXT: store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
1053 ; MESA-NEXT: ret void
1055 store volatile ptr addrspace(1) %ptr, ptr addrspace(1) undef
1059 define amdgpu_kernel void @kern_nonnull_global_ptr(ptr addrspace(1) nonnull %ptr) #0 {
1060 ; HSA-LABEL: @kern_nonnull_global_ptr(
1061 ; HSA-NEXT: [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1062 ; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
1063 ; HSA-NEXT: [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !nonnull !0
1064 ; HSA-NEXT: store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
1065 ; HSA-NEXT: ret void
1067 ; MESA-LABEL: @kern_nonnull_global_ptr(
1068 ; MESA-NEXT: [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1069 ; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
1070 ; MESA-NEXT: [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !nonnull !0
1071 ; MESA-NEXT: store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
1072 ; MESA-NEXT: ret void
1074 store volatile ptr addrspace(1) %ptr, ptr addrspace(1) undef
1078 define amdgpu_kernel void @kern_align32_global_ptr(ptr addrspace(1) align 1024 %ptr) #0 {
1079 ; HSA-LABEL: @kern_align32_global_ptr(
1080 ; HSA-NEXT: [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1081 ; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
1082 ; HSA-NEXT: [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !align !3
1083 ; HSA-NEXT: store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
1084 ; HSA-NEXT: ret void
1086 ; MESA-LABEL: @kern_align32_global_ptr(
1087 ; MESA-NEXT: [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1088 ; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
1089 ; MESA-NEXT: [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !align !3
1090 ; MESA-NEXT: store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
1091 ; MESA-NEXT: ret void
1093 store volatile ptr addrspace(1) %ptr, ptr addrspace(1) undef
1097 define amdgpu_kernel void @kern_noalias_global_ptr(ptr addrspace(1) noalias %ptr) #0 {
1098 ; GCN-LABEL: @kern_noalias_global_ptr(
1099 ; GCN-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1100 ; GCN-NEXT: store volatile ptr addrspace(1) [[PTR:%.*]], ptr addrspace(1) undef, align 8
1101 ; GCN-NEXT: ret void
1103 store volatile ptr addrspace(1) %ptr, ptr addrspace(1) undef
1107 define amdgpu_kernel void @kern_noalias_global_ptr_x2(ptr addrspace(1) noalias %ptr0, ptr addrspace(1) noalias %ptr1) #0 {
1108 ; GCN-LABEL: @kern_noalias_global_ptr_x2(
1109 ; GCN-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1110 ; GCN-NEXT: store volatile ptr addrspace(1) [[PTR0:%.*]], ptr addrspace(1) undef, align 8
1111 ; GCN-NEXT: store volatile ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(1) undef, align 8
1112 ; GCN-NEXT: ret void
1114 store volatile ptr addrspace(1) %ptr0, ptr addrspace(1) undef
1115 store volatile ptr addrspace(1) %ptr1, ptr addrspace(1) undef
1119 define amdgpu_kernel void @struct_i8_i8_arg({i8, i8} %in) #0 {
1120 ; HSA-LABEL: @struct_i8_i8_arg(
1122 ; HSA-NEXT: [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1123 ; HSA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]], i64 0
1124 ; HSA-NEXT: [[IN_LOAD:%.*]] = load { i8, i8 }, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0
1125 ; HSA-NEXT: [[ELT0:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 0
1126 ; HSA-NEXT: [[ELT1:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 1
1127 ; HSA-NEXT: store volatile i8 [[ELT0]], ptr addrspace(1) null, align 4
1128 ; HSA-NEXT: store volatile i8 [[ELT1]], ptr addrspace(1) null, align 4
1129 ; HSA-NEXT: ret void
1131 ; MESA-LABEL: @struct_i8_i8_arg(
1133 ; MESA-NEXT: [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1134 ; MESA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]], i64 36
1135 ; MESA-NEXT: [[IN_LOAD:%.*]] = load { i8, i8 }, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load !0
1136 ; MESA-NEXT: [[ELT0:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 0
1137 ; MESA-NEXT: [[ELT1:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 1
1138 ; MESA-NEXT: store volatile i8 [[ELT0]], ptr addrspace(1) null, align 4
1139 ; MESA-NEXT: store volatile i8 [[ELT1]], ptr addrspace(1) null, align 4
1140 ; MESA-NEXT: ret void
1143 %elt0 = extractvalue {i8, i8} %in, 0
1144 %elt1 = extractvalue {i8, i8} %in, 1
1145 store volatile i8 %elt0, ptr addrspace(1) null, align 4
1146 store volatile i8 %elt1, ptr addrspace(1) null, align 4
1150 define amdgpu_kernel void @struct_i8_i16_arg({i8, i16} %in) #0 {
1151 ; HSA-LABEL: @struct_i8_i16_arg(
1153 ; HSA-NEXT: [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1154 ; HSA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]], i64 0
1155 ; HSA-NEXT: [[IN_LOAD:%.*]] = load { i8, i16 }, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0
1156 ; HSA-NEXT: [[ELT0:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 0
1157 ; HSA-NEXT: [[ELT1:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 1
1158 ; HSA-NEXT: store volatile i8 [[ELT0]], ptr addrspace(1) null, align 4
1159 ; HSA-NEXT: store volatile i16 [[ELT1]], ptr addrspace(1) null, align 4
1160 ; HSA-NEXT: ret void
1162 ; MESA-LABEL: @struct_i8_i16_arg(
1164 ; MESA-NEXT: [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1165 ; MESA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]], i64 36
1166 ; MESA-NEXT: [[IN_LOAD:%.*]] = load { i8, i16 }, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load !0
1167 ; MESA-NEXT: [[ELT0:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 0
1168 ; MESA-NEXT: [[ELT1:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 1
1169 ; MESA-NEXT: store volatile i8 [[ELT0]], ptr addrspace(1) null, align 4
1170 ; MESA-NEXT: store volatile i16 [[ELT1]], ptr addrspace(1) null, align 4
1171 ; MESA-NEXT: ret void
1174 %elt0 = extractvalue {i8, i16} %in, 0
1175 %elt1 = extractvalue {i8, i16} %in, 1
1176 store volatile i8 %elt0, ptr addrspace(1) null, align 4
1177 store volatile i16 %elt1, ptr addrspace(1) null, align 4
1181 define amdgpu_kernel void @array_2xi8_arg([2 x i8] %in) #0 {
1182 ; HSA-LABEL: @array_2xi8_arg(
1184 ; HSA-NEXT: [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1185 ; HSA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]], i64 0
1186 ; HSA-NEXT: [[IN_LOAD:%.*]] = load [2 x i8], ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0
1187 ; HSA-NEXT: [[ELT0:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 0
1188 ; HSA-NEXT: [[ELT1:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 1
1189 ; HSA-NEXT: store volatile i8 [[ELT0]], ptr addrspace(1) null, align 4
1190 ; HSA-NEXT: store volatile i8 [[ELT1]], ptr addrspace(1) null, align 4
1191 ; HSA-NEXT: ret void
1193 ; MESA-LABEL: @array_2xi8_arg(
1195 ; MESA-NEXT: [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1196 ; MESA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]], i64 36
1197 ; MESA-NEXT: [[IN_LOAD:%.*]] = load [2 x i8], ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load !0
1198 ; MESA-NEXT: [[ELT0:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 0
1199 ; MESA-NEXT: [[ELT1:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 1
1200 ; MESA-NEXT: store volatile i8 [[ELT0]], ptr addrspace(1) null, align 4
1201 ; MESA-NEXT: store volatile i8 [[ELT1]], ptr addrspace(1) null, align 4
1202 ; MESA-NEXT: ret void
1205 %elt0 = extractvalue [2 x i8] %in, 0
1206 %elt1 = extractvalue [2 x i8] %in, 1
1207 store volatile i8 %elt0, ptr addrspace(1) null, align 4
1208 store volatile i8 %elt1, ptr addrspace(1) null, align 4
1212 define amdgpu_kernel void @array_2xi1_arg([2 x i1] %in) #0 {
1213 ; HSA-LABEL: @array_2xi1_arg(
1215 ; HSA-NEXT: [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1216 ; HSA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]], i64 0
1217 ; HSA-NEXT: [[IN_LOAD:%.*]] = load [2 x i1], ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0
1218 ; HSA-NEXT: [[ELT0:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 0
1219 ; HSA-NEXT: [[ELT1:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 1
1220 ; HSA-NEXT: store volatile i1 [[ELT0]], ptr addrspace(1) null, align 4
1221 ; HSA-NEXT: store volatile i1 [[ELT1]], ptr addrspace(1) null, align 4
1222 ; HSA-NEXT: ret void
1224 ; MESA-LABEL: @array_2xi1_arg(
1226 ; MESA-NEXT: [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1227 ; MESA-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]], i64 36
1228 ; MESA-NEXT: [[IN_LOAD:%.*]] = load [2 x i1], ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load !0
1229 ; MESA-NEXT: [[ELT0:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 0
1230 ; MESA-NEXT: [[ELT1:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 1
1231 ; MESA-NEXT: store volatile i1 [[ELT0]], ptr addrspace(1) null, align 4
1232 ; MESA-NEXT: store volatile i1 [[ELT1]], ptr addrspace(1) null, align 4
1233 ; MESA-NEXT: ret void
1236 %elt0 = extractvalue [2 x i1] %in, 0
1237 %elt1 = extractvalue [2 x i1] %in, 1
1238 store volatile i1 %elt0, ptr addrspace(1) null, align 4
1239 store volatile i1 %elt1, ptr addrspace(1) null, align 4
1243 define amdgpu_kernel void @only_empty_struct({} %empty) #0 {
1244 ; GCN-LABEL: @only_empty_struct(
1245 ; GCN-NEXT: [[ONLY_EMPTY_STRUCT_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(56) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1246 ; GCN-NEXT: ret void
1251 define amdgpu_kernel void @empty_struct_with_other({} %empty, i32 %arg1) #0 {
1252 ; HSA-LABEL: @empty_struct_with_other(
1253 ; HSA-NEXT: [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1254 ; HSA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 0
1255 ; HSA-NEXT: [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load !0
1256 ; HSA-NEXT: store i32 [[ARG1_LOAD]], ptr addrspace(1) undef, align 4
1257 ; HSA-NEXT: ret void
1259 ; MESA-LABEL: @empty_struct_with_other(
1260 ; MESA-NEXT: [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1261 ; MESA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 36
1262 ; MESA-NEXT: [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 4, !invariant.load !0
1263 ; MESA-NEXT: store i32 [[ARG1_LOAD]], ptr addrspace(1) undef, align 4
1264 ; MESA-NEXT: ret void
1266 store i32 %arg1, ptr addrspace(1) undef
1270 ; Should insert code after the allocas
1271 define amdgpu_kernel void @static_alloca_kern_i32(i32 %arg0) {
1272 ; HSA-LABEL: @static_alloca_kern_i32(
1273 ; HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
1274 ; HSA-NEXT: [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1275 ; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT]], i64 0
1276 ; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
1277 ; HSA-NEXT: store volatile i32 [[ARG0_LOAD]], ptr addrspace(5) [[ALLOCA]], align 4
1278 ; HSA-NEXT: ret void
1280 ; MESA-LABEL: @static_alloca_kern_i32(
1281 ; MESA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
1282 ; MESA-NEXT: [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1283 ; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT]], i64 36
1284 ; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
1285 ; MESA-NEXT: store volatile i32 [[ARG0_LOAD]], ptr addrspace(5) [[ALLOCA]], align 4
1286 ; MESA-NEXT: ret void
1288 %alloca = alloca i32, addrspace(5)
1289 store volatile i32 %arg0, ptr addrspace(5) %alloca
1293 ; Make sure we don't break the IR if an alloca depends on the kernarg segment load.
1295 define amdgpu_kernel void @dyn_alloca_kernarg_i32(i32 %n) {
1296 ; HSA-LABEL: @dyn_alloca_kernarg_i32(
1297 ; HSA-NEXT: [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5)
1298 ; HSA-NEXT: [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1299 ; HSA-NEXT: [[N_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT]], i64 0
1300 ; HSA-NEXT: [[N_LOAD:%.*]] = load i32, ptr addrspace(4) [[N_KERNARG_OFFSET]], align 16, !invariant.load !0
1301 ; HSA-NEXT: [[ALLOCA1:%.*]] = alloca i32, i32 [[N_LOAD]], align 4, addrspace(5)
1302 ; HSA-NEXT: store volatile i32 0, ptr addrspace(5) [[ALLOCA0]], align 4
1303 ; HSA-NEXT: store volatile i32 1, ptr addrspace(5) [[ALLOCA1]], align 4
1304 ; HSA-NEXT: ret void
1306 ; MESA-LABEL: @dyn_alloca_kernarg_i32(
1307 ; MESA-NEXT: [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5)
1308 ; MESA-NEXT: [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1309 ; MESA-NEXT: [[N_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT]], i64 36
1310 ; MESA-NEXT: [[N_LOAD:%.*]] = load i32, ptr addrspace(4) [[N_KERNARG_OFFSET]], align 4, !invariant.load !0
1311 ; MESA-NEXT: [[ALLOCA1:%.*]] = alloca i32, i32 [[N_LOAD]], align 4, addrspace(5)
1312 ; MESA-NEXT: store volatile i32 0, ptr addrspace(5) [[ALLOCA0]], align 4
1313 ; MESA-NEXT: store volatile i32 1, ptr addrspace(5) [[ALLOCA1]], align 4
1314 ; MESA-NEXT: ret void
1316 %alloca0 = alloca i32, addrspace(5)
1317 %alloca1 = alloca i32, i32 %n, addrspace(5)
1318 store volatile i32 0, ptr addrspace(5) %alloca0
1319 store volatile i32 1, ptr addrspace(5) %alloca1
1323 ; Byref pointers should only be treated as offsets from kernarg
1324 define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i8) %in.byref) {
1325 ; HSA-LABEL: @byref_constant_i8_arg(
1326 ; HSA-NEXT: [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1327 ; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 0
1328 ; HSA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0
1329 ; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 8
1330 ; HSA-NEXT: [[IN:%.*]] = load i8, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 1
1331 ; HSA-NEXT: [[EXT:%.*]] = zext i8 [[IN]] to i32
1332 ; HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4
1333 ; HSA-NEXT: ret void
1335 ; MESA-LABEL: @byref_constant_i8_arg(
1336 ; MESA-NEXT: [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1337 ; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 36
1338 ; MESA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load !0
1339 ; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 44
1340 ; MESA-NEXT: [[IN:%.*]] = load i8, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 1
1341 ; MESA-NEXT: [[EXT:%.*]] = zext i8 [[IN]] to i32
1342 ; MESA-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4
1343 ; MESA-NEXT: ret void
1345 %in = load i8, ptr addrspace(4) %in.byref
1346 %ext = zext i8 %in to i32
1347 store i32 %ext, ptr addrspace(1) %out, align 4
1351 define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i16) %in.byref) {
1352 ; HSA-LABEL: @byref_constant_i16_arg(
1353 ; HSA-NEXT: [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1354 ; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 0
1355 ; HSA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0
1356 ; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 8
1357 ; HSA-NEXT: [[IN:%.*]] = load i16, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 2
1358 ; HSA-NEXT: [[EXT:%.*]] = zext i16 [[IN]] to i32
1359 ; HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4
1360 ; HSA-NEXT: ret void
1362 ; MESA-LABEL: @byref_constant_i16_arg(
1363 ; MESA-NEXT: [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1364 ; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 36
1365 ; MESA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load !0
1366 ; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 44
1367 ; MESA-NEXT: [[IN:%.*]] = load i16, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 2
1368 ; MESA-NEXT: [[EXT:%.*]] = zext i16 [[IN]] to i32
1369 ; MESA-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4
1370 ; MESA-NEXT: ret void
1372 %in = load i16, ptr addrspace(4) %in.byref
1373 %ext = zext i16 %in to i32
1374 store i32 %ext, ptr addrspace(1) %out, align 4
; A byref(i32) between a pointer and a plain i32: the byref slot gets its own
; kernarg offset (HSA: out=0, in=8, after=12; MESA adds the 36-byte prologue)
; and is accessed through a GEP rather than a direct load.
1378 define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in.byref, i32 %after.offset) {
1379 ; HSA-LABEL: @byref_constant_i32_arg(
1380 ; HSA-NEXT: [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1381 ; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0
1382 ; HSA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0
1383 ; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 8
1384 ; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 12
1385 ; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load !0
1386 ; HSA-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
1387 ; HSA-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
1388 ; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
1389 ; HSA-NEXT: ret void
1391 ; MESA-LABEL: @byref_constant_i32_arg(
1392 ; MESA-NEXT: [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1393 ; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36
1394 ; MESA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load !0
1395 ; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 44
1396 ; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 48
1397 ; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 16, !invariant.load !0
1398 ; MESA-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
1399 ; MESA-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
1400 ; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
1401 ; MESA-NEXT: ret void
1403 %in = load i32, ptr addrspace(4) %in.byref
1404 store volatile i32 %in, ptr addrspace(1) %out, align 4
1405 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
; A byref(<4 x i32>) is padded up to its natural 16-byte alignment
; (HSA: in at offset 16, after.offset at 32; MESA: 52 and 68).
1409 define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(<4 x i32>) %in.byref, i32 %after.offset) {
1410 ; HSA-LABEL: @byref_constant_v4i32_arg(
1411 ; HSA-NEXT: [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(96) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1412 ; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 0
1413 ; HSA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0
1414 ; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 16
1415 ; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 32
1416 ; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 16, !invariant.load !0
1417 ; HSA-NEXT: [[IN:%.*]] = load <4 x i32>, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 16
1418 ; HSA-NEXT: store volatile <4 x i32> [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
1419 ; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
1420 ; HSA-NEXT: ret void
1422 ; MESA-LABEL: @byref_constant_v4i32_arg(
1423 ; MESA-NEXT: [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(92) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1424 ; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 36
1425 ; MESA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load !0
1426 ; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 52
1427 ; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 68
1428 ; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load !0
1429 ; MESA-NEXT: [[IN:%.*]] = load <4 x i32>, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 16
1430 ; MESA-NEXT: store volatile <4 x i32> [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
1431 ; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
1432 ; MESA-NEXT: ret void
1434 %in = load <4 x i32>, ptr addrspace(4) %in.byref
1435 store volatile <4 x i32> %in, ptr addrspace(1) %out, align 4
1436 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
; An explicit align(256) on the byref argument raises the whole kernarg
; segment to align 256 and pads the byref slot out to offset 256 (HSA)
; / 292 (MESA), with the trailing i32 placed immediately after it.
1440 define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
1441 ; HSA-LABEL: @byref_align_constant_i32_arg(
1442 ; HSA-NEXT: [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(320) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1443 ; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0
1444 ; HSA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0
1445 ; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 256
1446 ; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 260
1447 ; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load !0
1448 ; HSA-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
1449 ; HSA-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
1450 ; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
1451 ; HSA-NEXT: ret void
1453 ; MESA-LABEL: @byref_align_constant_i32_arg(
1454 ; MESA-NEXT: [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(320) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1455 ; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36
1456 ; MESA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load !0
1457 ; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 292
1458 ; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 296
1459 ; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 8, !invariant.load !0
1460 ; MESA-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
1461 ; MESA-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
1462 ; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
1463 ; MESA-NEXT: ret void
1465 %in = load i32, ptr addrspace(4) %in.byref
1466 store volatile i32 %in, ptr addrspace(1) %out, align 4
1467 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
; A byref(<16 x i32>) with no explicit align: its natural 64-byte alignment
; sets the segment alignment (align 64) and the byref slot lands at the next
; 64-byte boundary after the leading ptr + i8 args (HSA 64, MESA 100).
1471 define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) %in.byref, i32 %after.offset) {
1472 ; HSA-LABEL: @byref_natural_align_constant_v16i32_arg(
1473 ; HSA-NEXT: [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(192) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1474 ; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 0
1475 ; HSA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0
1476 ; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 64
1477 ; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 128
1478 ; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 16, !invariant.load !0
1479 ; HSA-NEXT: [[IN:%.*]] = load <16 x i32>, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 64
1480 ; HSA-NEXT: store volatile <16 x i32> [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
1481 ; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
1482 ; HSA-NEXT: ret void
1484 ; MESA-LABEL: @byref_natural_align_constant_v16i32_arg(
1485 ; MESA-NEXT: [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(188) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1486 ; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 36
1487 ; MESA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load !0
1488 ; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 100
1489 ; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 164
1490 ; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load !0
1491 ; MESA-NEXT: [[IN:%.*]] = load <16 x i32>, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 64
1492 ; MESA-NEXT: store volatile <16 x i32> [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
1493 ; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
1494 ; MESA-NEXT: ret void
1496 %in = load <16 x i32>, ptr addrspace(4) %in.byref
1497 store volatile <16 x i32> %in, ptr addrspace(1) %out, align 4
1498 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
1502 ; Also accept byref kernel arguments with other global address spaces.
; A byref pointee in addrspace(1): the constant (addrspace 4) kernarg pointer
; is addrspacecast to addrspace(1) before the load.
1503 define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(1) byref(i32) %in.byref) {
1504 ; HSA-LABEL: @byref_global_i32_arg(
1505 ; HSA-NEXT: [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1506 ; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 0
1507 ; HSA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0
1508 ; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 8
1509 ; HSA-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1)
1510 ; HSA-NEXT: [[IN:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4
1511 ; HSA-NEXT: store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
1512 ; HSA-NEXT: ret void
1514 ; MESA-LABEL: @byref_global_i32_arg(
1515 ; MESA-NEXT: [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1516 ; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 36
1517 ; MESA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load !0
1518 ; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 44
1519 ; MESA-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1)
1520 ; MESA-NEXT: [[IN:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4
1521 ; MESA-NEXT: store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
1522 ; MESA-NEXT: ret void
1524 %in = load i32, ptr addrspace(1) %in.byref
1525 store i32 %in, ptr addrspace(1) %out, align 4
; A byref pointee with a flat (default address space) pointer: lowered via an
; addrspacecast from the constant kernarg pointer to a plain ptr.
1529 define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, ptr byref(i32) %in.byref) {
1530 ; HSA-LABEL: @byref_flat_i32_arg(
1531 ; HSA-NEXT: [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1532 ; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 0
1533 ; HSA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0
1534 ; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 8
1535 ; HSA-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr
1536 ; HSA-NEXT: [[IN:%.*]] = load i32, ptr [[TMP1]], align 4
1537 ; HSA-NEXT: store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
1538 ; HSA-NEXT: ret void
1540 ; MESA-LABEL: @byref_flat_i32_arg(
1541 ; MESA-NEXT: [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1542 ; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 36
1543 ; MESA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load !0
1544 ; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 44
1545 ; MESA-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr
1546 ; MESA-NEXT: [[IN:%.*]] = load i32, ptr [[TMP1]], align 4
1547 ; MESA-NEXT: store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
1548 ; MESA-NEXT: ret void
1550 %in = load i32, ptr %in.byref
1551 store i32 %in, ptr addrspace(1) %out, align 4
; A byref pointee in addrspace(6): handled the same way, with an
; addrspacecast from the addrspace(4) kernarg pointer to addrspace(6).
1555 define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(6) byref(i32) %in.byref) {
1556 ; HSA-LABEL: @byref_constant_32bit_i32_arg(
1557 ; HSA-NEXT: [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1558 ; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 0
1559 ; HSA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0
1560 ; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 8
1561 ; HSA-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr addrspace(6)
1562 ; HSA-NEXT: [[IN:%.*]] = load i32, ptr addrspace(6) [[TMP1]], align 4
1563 ; HSA-NEXT: store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
1564 ; HSA-NEXT: ret void
1566 ; MESA-LABEL: @byref_constant_32bit_i32_arg(
1567 ; MESA-NEXT: [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1568 ; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 36
1569 ; MESA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load !0
1570 ; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 44
1571 ; MESA-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr addrspace(6)
1572 ; MESA-NEXT: [[IN:%.*]] = load i32, ptr addrspace(6) [[TMP1]], align 4
1573 ; MESA-NEXT: store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
1574 ; MESA-NEXT: ret void
1576 %in = load i32, ptr addrspace(6) %in.byref
1577 store i32 %in, ptr addrspace(1) %out, align 4
; A byref pointee in a target-unknown address space (999) still lowers to an
; addrspacecast from the kernarg pointer rather than being rejected.
1581 define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(999) byref(i32) %in.byref) {
1582 ; HSA-LABEL: @byref_unknown_as_i32_arg(
1583 ; HSA-NEXT: [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1584 ; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 0
1585 ; HSA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0
1586 ; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 8
1587 ; HSA-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr addrspace(999)
1588 ; HSA-NEXT: [[IN:%.*]] = load i32, ptr addrspace(999) [[TMP1]], align 4
1589 ; HSA-NEXT: store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
1590 ; HSA-NEXT: ret void
1592 ; MESA-LABEL: @byref_unknown_as_i32_arg(
1593 ; MESA-NEXT: [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1594 ; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 36
1595 ; MESA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load !0
1596 ; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 44
1597 ; MESA-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr addrspace(999)
1598 ; MESA-NEXT: [[IN:%.*]] = load i32, ptr addrspace(999) [[TMP1]], align 4
1599 ; MESA-NEXT: store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
1600 ; MESA-NEXT: ret void
1602 %in = load i32, ptr addrspace(999) %in.byref
1603 store i32 %in, ptr addrspace(1) %out, align 4
1607 ; Invalid, but should not crash.
; byref with an LDS (addrspace 3) pointee is not meaningful for kernargs;
; the pass still emits an addrspacecast and must not assert.
1608 define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(3) byref(i32) %in.byref) {
1609 ; HSA-LABEL: @byref_local_i32_arg(
1610 ; HSA-NEXT: [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1611 ; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 0
1612 ; HSA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0
1613 ; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 8
1614 ; HSA-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr addrspace(3)
1615 ; HSA-NEXT: [[IN:%.*]] = load i32, ptr addrspace(3) [[TMP1]], align 4
1616 ; HSA-NEXT: store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
1617 ; HSA-NEXT: ret void
1619 ; MESA-LABEL: @byref_local_i32_arg(
1620 ; MESA-NEXT: [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1621 ; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 36
1622 ; MESA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load !0
1623 ; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 44
1624 ; MESA-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr addrspace(3)
1625 ; MESA-NEXT: [[IN:%.*]] = load i32, ptr addrspace(3) [[TMP1]], align 4
1626 ; MESA-NEXT: store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
1627 ; MESA-NEXT: ret void
1629 %in = load i32, ptr addrspace(3) %in.byref
1630 store i32 %in, ptr addrspace(1) %out, align 4
; Two consecutive byref(i32) arguments: each gets its own kernarg offset
; (HSA: 8 and 12; MESA: 44 and 48) with the trailing i32 packed after them.
1634 define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in0.byref, ptr addrspace(4) byref(i32) %in1.byref, i32 %after.offset) {
1635 ; HSA-LABEL: @multi_byref_constant_i32_arg(
1636 ; HSA-NEXT: [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(80) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1637 ; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0
1638 ; HSA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0
1639 ; HSA-NEXT: [[IN0_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 8
1640 ; HSA-NEXT: [[IN1_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 12
1641 ; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 16
1642 ; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 16, !invariant.load !0
1643 ; HSA-NEXT: [[IN0:%.*]] = load i32, ptr addrspace(4) [[IN0_BYREF_BYVAL_KERNARG_OFFSET]], align 4
1644 ; HSA-NEXT: [[IN1:%.*]] = load i32, ptr addrspace(4) [[IN1_BYREF_BYVAL_KERNARG_OFFSET]], align 4
1645 ; HSA-NEXT: store volatile i32 [[IN0]], ptr addrspace(1) [[OUT_LOAD]], align 4
1646 ; HSA-NEXT: store volatile i32 [[IN1]], ptr addrspace(1) [[OUT_LOAD]], align 4
1647 ; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
1648 ; HSA-NEXT: ret void
1650 ; MESA-LABEL: @multi_byref_constant_i32_arg(
1651 ; MESA-NEXT: [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(76) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1652 ; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36
1653 ; MESA-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load !0
1654 ; MESA-NEXT: [[IN0_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 44
1655 ; MESA-NEXT: [[IN1_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 48
1656 ; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 52
1657 ; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load !0
1658 ; MESA-NEXT: [[IN0:%.*]] = load i32, ptr addrspace(4) [[IN0_BYREF_BYVAL_KERNARG_OFFSET]], align 4
1659 ; MESA-NEXT: [[IN1:%.*]] = load i32, ptr addrspace(4) [[IN1_BYREF_BYVAL_KERNARG_OFFSET]], align 4
1660 ; MESA-NEXT: store volatile i32 [[IN0]], ptr addrspace(1) [[OUT_LOAD]], align 4
1661 ; MESA-NEXT: store volatile i32 [[IN1]], ptr addrspace(1) [[OUT_LOAD]], align 4
1662 ; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
1663 ; MESA-NEXT: ret void
1665 %in0 = load i32, ptr addrspace(4) %in0.byref
1666 %in1 = load i32, ptr addrspace(4) %in1.byref
1667 store volatile i32 %in0, ptr addrspace(1) %out, align 4
1668 store volatile i32 %in1, ptr addrspace(1) %out, align 4
1669 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
; A byref(i32) as the only argument: it occupies the very first kernarg slot
; (offset 0 for HSA, 36 for MESA after the implicit prologue).
1673 define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref(i32) %in.byref) {
1674 ; HSA-LABEL: @byref_constant_i32_arg_offset0(
1675 ; HSA-NEXT: [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1676 ; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT]], i64 0
1677 ; HSA-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
1678 ; HSA-NEXT: store i32 [[IN]], ptr addrspace(1) undef, align 4
1679 ; HSA-NEXT: ret void
1681 ; MESA-LABEL: @byref_constant_i32_arg_offset0(
1682 ; MESA-NEXT: [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
1683 ; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT]], i64 36
1684 ; MESA-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
1685 ; MESA-NEXT: store i32 [[IN]], ptr addrspace(1) undef, align 4
1686 ; MESA-NEXT: ret void
1688 %in = load i32, ptr addrspace(4) %in.byref
1689 store i32 %in, ptr addrspace(1) undef, align 4
; Attribute groups referenced by the kernels above. The GCN lines that follow
; are the autogenerated --check-globals assertions (see UTC_ARGS in the file
; header) verifying the final attribute groups and metadata nodes.
1693 attributes #0 = { nounwind "target-cpu"="kaveri" }
1694 attributes #1 = { nounwind "target-cpu"="kaveri" "amdgpu-implicitarg-num-bytes"="40" }
1695 attributes #2 = { nounwind "target-cpu"="tahiti" }
1698 ; GCN: attributes #[[ATTR0:[0-9]+]] = { nounwind "target-cpu"="kaveri" }
1699 ; GCN: attributes #[[ATTR1:[0-9]+]] = { nounwind "amdgpu-implicitarg-num-bytes"="40" "target-cpu"="kaveri" }
1700 ; GCN: attributes #[[ATTR2:[0-9]+]] = { nounwind "target-cpu"="tahiti" }
1701 ; GCN: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
1703 ; GCN: [[META0:![0-9]+]] = !{}
1704 ; GCN: [[META1:![0-9]+]] = !{i64 42}
1705 ; GCN: [[META2:![0-9]+]] = !{i64 128}
1706 ; GCN: [[META3:![0-9]+]] = !{i64 1024}