1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX908 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1030 %s
7 ; Function Attrs: mustprogress nounwind willreturn
; Test kernel @half8: copies eight consecutive half elements (indices 0..7)
; from %0 to %1 with one scalar load and one scalar store per element, each
; marked align 2.
; The autogenerated GFX908 / GFX90A / GFX1030 assertions below expect the
; whole copy to be emitted as a single global_load_dwordx4 followed by a
; single global_store_dwordx4.
8 define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
11 ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
12 ; GFX908-NEXT: v_mov_b32_e32 v4, 0
13 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
14 ; GFX908-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
15 ; GFX908-NEXT: s_waitcnt vmcnt(0)
16 ; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
17 ; GFX908-NEXT: s_endpgm
19 ; GFX90A-LABEL: half8:
21 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
22 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0
23 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
24 ; GFX90A-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
25 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
26 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
27 ; GFX90A-NEXT: s_endpgm
29 ; GFX1030-LABEL: half8:
31 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
32 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0
33 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
34 ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
35 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
36 ; GFX1030-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
37 ; GFX1030-NEXT: s_endpgm
; Source element addresses: %0 indexed by half elements 0..7.
38 %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
39 %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
40 %gep2 = getelementptr half, ptr addrspace(1) %0, i64 2
41 %gep3 = getelementptr half, ptr addrspace(1) %0, i64 3
42 %gep4 = getelementptr half, ptr addrspace(1) %0, i64 4
43 %gep5 = getelementptr half, ptr addrspace(1) %0, i64 5
44 %gep6 = getelementptr half, ptr addrspace(1) %0, i64 6
45 %gep7 = getelementptr half, ptr addrspace(1) %0, i64 7
; All eight element loads come before any store.
46 %l0 = load half, ptr addrspace(1) %gep0, align 2
47 %l1 = load half, ptr addrspace(1) %gep1, align 2
48 %l2 = load half, ptr addrspace(1) %gep2, align 2
49 %l3 = load half, ptr addrspace(1) %gep3, align 2
50 %l4 = load half, ptr addrspace(1) %gep4, align 2
51 %l5 = load half, ptr addrspace(1) %gep5, align 2
52 %l6 = load half, ptr addrspace(1) %gep6, align 2
53 %l7 = load half, ptr addrspace(1) %gep7, align 2
; Destination element addresses: %1 indexed by half elements 0..7.
54 %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
55 %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
56 %sgep2 = getelementptr half, ptr addrspace(1) %1, i64 2
57 %sgep3 = getelementptr half, ptr addrspace(1) %1, i64 3
58 %sgep4 = getelementptr half, ptr addrspace(1) %1, i64 4
59 %sgep5 = getelementptr half, ptr addrspace(1) %1, i64 5
60 %sgep6 = getelementptr half, ptr addrspace(1) %1, i64 6
61 %sgep7 = getelementptr half, ptr addrspace(1) %1, i64 7
; Element i of the source is stored to element i of the destination.
62 store half %l0, ptr addrspace(1) %sgep0, align 2
63 store half %l1, ptr addrspace(1) %sgep1, align 2
64 store half %l2, ptr addrspace(1) %sgep2, align 2
65 store half %l3, ptr addrspace(1) %sgep3, align 2
66 store half %l4, ptr addrspace(1) %sgep4, align 2
67 store half %l5, ptr addrspace(1) %sgep5, align 2
68 store half %l6, ptr addrspace(1) %sgep6, align 2
69 store half %l7, ptr addrspace(1) %sgep7, align 2
73 ; Function Attrs: mustprogress nounwind willreturn
; Test kernel @half6: copies six consecutive half elements (indices 0..5)
; from %0 to %1 with one scalar load and one scalar store per element. Every
; access is marked only align 1.
; The autogenerated GFX908 / GFX90A / GFX1030 assertions below still expect a
; single global_load_dwordx3 followed by a single global_store_dwordx3.
74 define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
75 ; GFX908-LABEL: half6:
77 ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
78 ; GFX908-NEXT: v_mov_b32_e32 v3, 0
79 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
80 ; GFX908-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
81 ; GFX908-NEXT: s_waitcnt vmcnt(0)
82 ; GFX908-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
83 ; GFX908-NEXT: s_endpgm
85 ; GFX90A-LABEL: half6:
87 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
88 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0
89 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
90 ; GFX90A-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
91 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
92 ; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
93 ; GFX90A-NEXT: s_endpgm
95 ; GFX1030-LABEL: half6:
97 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
98 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0
99 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
100 ; GFX1030-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
101 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
102 ; GFX1030-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
103 ; GFX1030-NEXT: s_endpgm
; Source element addresses: %0 indexed by half elements 0..5.
104 %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
105 %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
106 %gep2 = getelementptr half, ptr addrspace(1) %0, i64 2
107 %gep3 = getelementptr half, ptr addrspace(1) %0, i64 3
108 %gep4 = getelementptr half, ptr addrspace(1) %0, i64 4
109 %gep5 = getelementptr half, ptr addrspace(1) %0, i64 5
; All six element loads come before any store; note the align 1 on each.
110 %l0 = load half, ptr addrspace(1) %gep0, align 1
111 %l1 = load half, ptr addrspace(1) %gep1, align 1
112 %l2 = load half, ptr addrspace(1) %gep2, align 1
113 %l3 = load half, ptr addrspace(1) %gep3, align 1
114 %l4 = load half, ptr addrspace(1) %gep4, align 1
115 %l5 = load half, ptr addrspace(1) %gep5, align 1
; Destination element addresses: %1 indexed by half elements 0..5.
116 %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
117 %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
118 %sgep2 = getelementptr half, ptr addrspace(1) %1, i64 2
119 %sgep3 = getelementptr half, ptr addrspace(1) %1, i64 3
120 %sgep4 = getelementptr half, ptr addrspace(1) %1, i64 4
121 %sgep5 = getelementptr half, ptr addrspace(1) %1, i64 5
; Element i of the source is stored to element i of the destination.
122 store half %l0, ptr addrspace(1) %sgep0, align 1
123 store half %l1, ptr addrspace(1) %sgep1, align 1
124 store half %l2, ptr addrspace(1) %sgep2, align 1
125 store half %l3, ptr addrspace(1) %sgep3, align 1
126 store half %l4, ptr addrspace(1) %sgep4, align 1
127 store half %l5, ptr addrspace(1) %sgep5, align 1
131 ; Function Attrs: mustprogress nounwind willreturn
; Test kernel @half4: copies four consecutive half elements (indices 0..3)
; from %0 to %1 with one scalar load and one scalar store per element, each
; marked align 4.
; Unlike the other kernels in this file, the autogenerated assertions expect
; the source to be read with a scalar s_load_dwordx2, the result to be moved
; into VGPRs (two v_mov_b32 on GFX908 / GFX1030, one v_pk_mov_b32 on GFX90A),
; and then written with a single global_store_dwordx2.
; NOTE(review): align 4 is also claimed for the accesses at half indices 1 and
; 3 (byte offsets 2 and 6) — presumably intentional to drive this test case;
; confirm before changing.
132 define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
133 ; GFX908-LABEL: half4:
135 ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
136 ; GFX908-NEXT: v_mov_b32_e32 v2, 0
137 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
138 ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
139 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
140 ; GFX908-NEXT: v_mov_b32_e32 v0, s0
141 ; GFX908-NEXT: v_mov_b32_e32 v1, s1
142 ; GFX908-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
143 ; GFX908-NEXT: s_endpgm
145 ; GFX90A-LABEL: half4:
147 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
148 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
149 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
150 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
151 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
152 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
153 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
154 ; GFX90A-NEXT: s_endpgm
156 ; GFX1030-LABEL: half4:
158 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
159 ; GFX1030-NEXT: v_mov_b32_e32 v2, 0
160 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
161 ; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
162 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
163 ; GFX1030-NEXT: v_mov_b32_e32 v0, s0
164 ; GFX1030-NEXT: v_mov_b32_e32 v1, s1
165 ; GFX1030-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
166 ; GFX1030-NEXT: s_endpgm
; Source element addresses: %0 indexed by half elements 0..3.
167 %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
168 %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
169 %gep2 = getelementptr half, ptr addrspace(1) %0, i64 2
170 %gep3 = getelementptr half, ptr addrspace(1) %0, i64 3
; All four element loads come before any store; note the align 4 on each.
171 %l0 = load half, ptr addrspace(1) %gep0, align 4
172 %l1 = load half, ptr addrspace(1) %gep1, align 4
173 %l2 = load half, ptr addrspace(1) %gep2, align 4
174 %l3 = load half, ptr addrspace(1) %gep3, align 4
; Destination element addresses: %1 indexed by half elements 0..3.
175 %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
176 %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
177 %sgep2 = getelementptr half, ptr addrspace(1) %1, i64 2
178 %sgep3 = getelementptr half, ptr addrspace(1) %1, i64 3
; Element i of the source is stored to element i of the destination.
179 store half %l0, ptr addrspace(1) %sgep0, align 4
180 store half %l1, ptr addrspace(1) %sgep1, align 4
181 store half %l2, ptr addrspace(1) %sgep2, align 4
182 store half %l3, ptr addrspace(1) %sgep3, align 4
187 ; Function Attrs: mustprogress nounwind willreturn
; Test kernel @half2: copies two consecutive half elements (indices 0 and 1)
; from %0 to %1 with one scalar load and one scalar store per element. No
; explicit align is written on any of the accesses.
; The autogenerated GFX908 / GFX90A / GFX1030 assertions below expect a single
; global_load_dword followed by a single global_store_dword.
188 define amdgpu_kernel void @half2(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
189 ; GFX908-LABEL: half2:
191 ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
192 ; GFX908-NEXT: v_mov_b32_e32 v0, 0
193 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
194 ; GFX908-NEXT: global_load_dword v1, v0, s[0:1]
195 ; GFX908-NEXT: s_waitcnt vmcnt(0)
196 ; GFX908-NEXT: global_store_dword v0, v1, s[2:3]
197 ; GFX908-NEXT: s_endpgm
199 ; GFX90A-LABEL: half2:
201 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
202 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0
203 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
204 ; GFX90A-NEXT: global_load_dword v1, v0, s[0:1]
205 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
206 ; GFX90A-NEXT: global_store_dword v0, v1, s[2:3]
207 ; GFX90A-NEXT: s_endpgm
209 ; GFX1030-LABEL: half2:
211 ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
212 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0
213 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
214 ; GFX1030-NEXT: global_load_dword v1, v0, s[0:1]
215 ; GFX1030-NEXT: s_waitcnt vmcnt(0)
216 ; GFX1030-NEXT: global_store_dword v0, v1, s[2:3]
217 ; GFX1030-NEXT: s_endpgm
; Source element addresses: %0 indexed by half elements 0 and 1.
218 %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
219 %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
; Both element loads come before any store; no align is specified here.
220 %l0 = load half, ptr addrspace(1) %gep0
221 %l1 = load half, ptr addrspace(1) %gep1
; Destination element addresses: %1 indexed by half elements 0 and 1.
222 %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
223 %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
; Element i of the source is stored to element i of the destination.
224 store half %l0, ptr addrspace(1) %sgep0
225 store half %l1, ptr addrspace(1) %sgep1