1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
3 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
5 ; Make sure we don't turn the 32-bit argument load into a 16-bit
6 ; load. There aren't extending scalar loads, so that would require
7 ; using a buffer_load instruction.
9 ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16:
11 ; SI: buffer_store_short v
; Truncates the i32 kernel argument %arg to i16 and stores the result
; through the addrspace(1) pointer %out.
12 define amdgpu_kernel void @truncate_kernarg_i32_to_i16(ptr addrspace(1) %out, i32 %arg) nounwind {
13 %trunc = trunc i32 %arg to i16
14 store i16 %trunc, ptr addrspace(1) %out
18 ; It should be OK (and probably performance neutral) to reduce this,
19 ; but we don't know if the load is uniform yet.
21 ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16:
22 ; SI: buffer_load_dword v
23 ; SI: buffer_store_short v
; Loads an i32 from %in at the index returned by
; llvm.amdgcn.workitem.id.x, truncates it to i16, and stores the result
; to %out at the same index.
24 define amdgpu_kernel void @truncate_buffer_load_i32_to_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; %tid is the element index for both the i32 input and the i16 output.
; NOTE(review): the per-workitem index presumably keeps the load from
; being treated as uniform - confirm.
25 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
26 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
27 %gep.out = getelementptr i16, ptr addrspace(1) %out, i32 %tid
28 %load = load i32, ptr addrspace(1) %gep.in
29 %trunc = trunc i32 %load to i16
30 store i16 %trunc, ptr addrspace(1) %gep.out
34 ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8:
36 ; SI: buffer_store_byte v
; Truncates the i32 kernel argument %arg to i8 and stores the result
; through the addrspace(1) pointer %out.
37 define amdgpu_kernel void @truncate_kernarg_i32_to_i8(ptr addrspace(1) %out, i32 %arg) nounwind {
38 %trunc = trunc i32 %arg to i8
39 store i8 %trunc, ptr addrspace(1) %out
43 ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8:
44 ; SI: buffer_load_dword v
45 ; SI: buffer_store_byte v
; Loads an i32 from %in at the index returned by
; llvm.amdgcn.workitem.id.x, truncates it to i8, and stores the result
; to %out at the same index.
46 define amdgpu_kernel void @truncate_buffer_load_i32_to_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; %tid is the element index for both the i32 input and the i8 output.
47 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
48 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
49 %gep.out = getelementptr i8, ptr addrspace(1) %out, i32 %tid
50 %load = load i32, ptr addrspace(1) %gep.in
51 %trunc = trunc i32 %load to i8
52 store i8 %trunc, ptr addrspace(1) %gep.out
56 ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1:
58 ; SI: buffer_store_byte v
; Truncates the i32 kernel argument %arg to i1 (its lowest bit) and
; stores that i1 through the addrspace(1) pointer %out.
59 define amdgpu_kernel void @truncate_kernarg_i32_to_i1(ptr addrspace(1) %out, i32 %arg) nounwind {
60 %trunc = trunc i32 %arg to i1
61 store i1 %trunc, ptr addrspace(1) %out
65 ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1:
66 ; SI: buffer_load_dword v
67 ; SI: buffer_store_byte v
; Loads an i32 from %in at the index returned by
; llvm.amdgcn.workitem.id.x, truncates it to i1 (its lowest bit), and
; stores the i1 to %out at the same index.
68 define amdgpu_kernel void @truncate_buffer_load_i32_to_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; %tid is the element index for both the i32 input and the i1 output.
69 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
70 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
71 %gep.out = getelementptr i1, ptr addrspace(1) %out, i32 %tid
72 %load = load i32, ptr addrspace(1) %gep.in
73 %trunc = trunc i32 %load to i1
74 store i1 %trunc, ptr addrspace(1) %gep.out
78 ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32:
80 ; SI: buffer_store_dword v
; Truncates the i64 kernel argument %arg to i32 (its low 32 bits) and
; stores the result through the addrspace(1) pointer %out.
; NOTE(review): the unnamed [8 x i32] parameter between %out and %arg is
; unused in the body; presumably padding that shifts the kernarg offset
; of %arg - confirm.
81 define amdgpu_kernel void @truncate_kernarg_i64_to_i32(ptr addrspace(1) %out, [8 x i32], i64 %arg) nounwind {
82 %trunc = trunc i64 %arg to i32
83 store i32 %trunc, ptr addrspace(1) %out
87 ; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32:
88 ; SI: buffer_load_dword v
89 ; SI: buffer_store_dword v
; Loads an i64 from %in at the index returned by
; llvm.amdgcn.workitem.id.x, truncates it to i32 (its low 32 bits), and
; stores the result to %out at the same index.
90 define amdgpu_kernel void @truncate_buffer_load_i64_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; %tid is the element index for both the i64 input and the i32 output.
91 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
92 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
93 %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
94 %load = load i64, ptr addrspace(1) %gep.in
95 %trunc = trunc i64 %load to i32
96 store i32 %trunc, ptr addrspace(1) %gep.out
100 ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32:
102 ; SI: buffer_store_dword v
; Stores the high 32 bits of the i64 kernel argument %arg through %out.
; NOTE(review): the unnamed [8 x i32] parameter between %out and %arg is
; unused in the body; presumably padding that shifts the kernarg offset
; of %arg - confirm.
103 define amdgpu_kernel void @srl_kernarg_i64_to_i32(ptr addrspace(1) %out, [8 x i32], i64 %arg) nounwind {
; Logical shift right by 32 followed by trunc to i32 leaves bits 63:32.
104 %srl = lshr i64 %arg, 32
105 %trunc = trunc i64 %srl to i32
106 store i32 %trunc, ptr addrspace(1) %out
110 ; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32:
111 ; SI: buffer_load_dword v
112 ; SI: buffer_store_dword v
; Loads an i64 from %in at the index returned by
; llvm.amdgcn.workitem.id.x and stores its high 32 bits to %out at the
; same index.
113 define amdgpu_kernel void @srl_buffer_load_i64_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; %tid is the element index for both the i64 input and the i32 output.
114 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
115 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
116 %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
117 %load = load i64, ptr addrspace(1) %gep.in
; Logical shift right by 32 followed by trunc to i32 leaves bits 63:32.
118 %srl = lshr i64 %load, 32
119 %trunc = trunc i64 %srl to i32
120 store i32 %trunc, ptr addrspace(1) %gep.out
124 ; Might as well reduce to 8-bit loads.
125 ; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8:
127 ; SI: buffer_store_byte v
; Truncates the i16 kernel argument %arg to i8 and stores the result
; through the addrspace(1) pointer %out.
128 define amdgpu_kernel void @truncate_kernarg_i16_to_i8(ptr addrspace(1) %out, i16 %arg) nounwind {
129 %trunc = trunc i16 %arg to i8
130 store i8 %trunc, ptr addrspace(1) %out
134 ; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8:
135 ; SI: buffer_load_ubyte v
136 ; SI: buffer_store_byte v
; Loads an i16 from %in at the index returned by
; llvm.amdgcn.workitem.id.x, truncates it to i8, and stores the result
; to %out at the same index.
137 define amdgpu_kernel void @truncate_buffer_load_i16_to_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; %tid is the element index for both the i16 input and the i8 output.
138 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
139 %gep.in = getelementptr i16, ptr addrspace(1) %in, i32 %tid
140 %gep.out = getelementptr i8, ptr addrspace(1) %out, i32 %tid
141 %load = load i16, ptr addrspace(1) %gep.in
142 %trunc = trunc i16 %load to i8
143 store i8 %trunc, ptr addrspace(1) %gep.out
147 ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8:
149 ; SI: buffer_store_byte v
; Stores bits 39:32 of the i64 kernel argument %arg, as an i8, through
; %out.
; NOTE(review): the unnamed [8 x i32] parameter between %out and %arg is
; unused in the body; presumably padding that shifts the kernarg offset
; of %arg - confirm.
150 define amdgpu_kernel void @srl_kernarg_i64_to_i8(ptr addrspace(1) %out, [8 x i32], i64 %arg) nounwind {
; Shift the high dword down, then keep only its lowest byte.
151 %srl = lshr i64 %arg, 32
152 %trunc = trunc i64 %srl to i8
153 store i8 %trunc, ptr addrspace(1) %out
157 ; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8:
158 ; SI: buffer_load_dword v
159 ; SI: buffer_store_byte v
; Loads an i64 from %in at the index returned by
; llvm.amdgcn.workitem.id.x and stores bits 39:32 of it, as an i8, to
; %out at the same index.
160 define amdgpu_kernel void @srl_buffer_load_i64_to_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; %tid is the element index for both the i64 input and the i8 output.
161 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
162 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
163 %gep.out = getelementptr i8, ptr addrspace(1) %out, i32 %tid
164 %load = load i64, ptr addrspace(1) %gep.in
; Shift the high dword down, then keep only its lowest byte.
165 %srl = lshr i64 %load, 32
166 %trunc = trunc i64 %srl to i8
167 store i8 %trunc, ptr addrspace(1) %gep.out
171 ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8:
173 ; SI: buffer_store_byte v
; Truncates the i64 kernel argument %arg to i8 (its lowest byte) and
; stores the result through the addrspace(1) pointer %out.
; NOTE(review): the unnamed [8 x i32] parameter between %out and %arg is
; unused in the body; presumably padding that shifts the kernarg offset
; of %arg - confirm.
174 define amdgpu_kernel void @truncate_kernarg_i64_to_i8(ptr addrspace(1) %out, [8 x i32], i64 %arg) nounwind {
175 %trunc = trunc i64 %arg to i8
176 store i8 %trunc, ptr addrspace(1) %out
180 ; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8:
181 ; SI: buffer_load_dword v
182 ; SI: buffer_store_byte v
; Loads an i64 from %in at the index returned by
; llvm.amdgcn.workitem.id.x, truncates it to i8 (its lowest byte), and
; stores the result to %out at the same index.
183 define amdgpu_kernel void @truncate_buffer_load_i64_to_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; %tid is the element index for both the i64 input and the i8 output.
184 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
185 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
186 %gep.out = getelementptr i8, ptr addrspace(1) %out, i32 %tid
187 %load = load i64, ptr addrspace(1) %gep.in
188 %trunc = trunc i64 %load to i8
189 store i8 %trunc, ptr addrspace(1) %gep.out
193 ; FUNC-LABEL: {{^}}smrd_mask_i32_to_i16
194 ; SI: s_load_dword [[LOAD:s[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0x0
195 ; SI: s_waitcnt lgkmcnt(0)
196 ; SI: s_and_b32 s{{[0-9]+}}, [[LOAD]], 0xffff
; Loads an i32 through the addrspace(4) pointer %in, keeps only its low
; 16 bits, and stores the masked value as a full i32 through %out.
; The pointer is used directly, with no workitem-id index applied.
197 define amdgpu_kernel void @smrd_mask_i32_to_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) {
199 %val = load i32, ptr addrspace(4) %in
; 65535 == 0xffff, the mask the check lines before this function look for.
200 %mask = and i32 %val, 65535
201 store i32 %mask, ptr addrspace(1) %out
205 ; FUNC-LABEL: {{^}}extract_hi_i64_bitcast_v2i32:
206 ; SI: buffer_load_dword v
207 ; SI: buffer_store_dword v
; Loads a <2 x i32> from %in, reinterprets it as an i64, and stores the
; high 32 bits of that i64 through %out.
; The pointers are used directly, with no workitem-id index applied.
208 define amdgpu_kernel void @extract_hi_i64_bitcast_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
209 %ld = load <2 x i32>, ptr addrspace(1) %in
210 %bc = bitcast <2 x i32> %ld to i64
; Logical shift right by 32 followed by trunc to i32 leaves bits 63:32.
211 %hi = lshr i64 %bc, 32
212 %trunc = trunc i64 %hi to i32
213 store i32 %trunc, ptr addrspace(1) %out