; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; FIXME: Merge into imm.ll

; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 {
  store <2 x i16> <i16 -32768, i16 -32768>, <2 x i16> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_0.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0.0, half 0.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_imm_neg_0.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -0.0, half -0.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_0.5_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x38003800{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0.5, half 0.5>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_0.5_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800b800{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -0.5, half -0.5>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_1.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 1.0, half 1.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_1.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -1.0, half -1.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_2.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x40004000{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 2.0, half 2.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_2.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000c000{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -2.0, half -2.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_4.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x44004400{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 4.0, half 4.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_4.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400c400{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -4.0, half -4.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x31183118{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0xH3118, half 0xH3118>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118b118{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0xHB118, half 0xHB118>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_literal_imm_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c006c00
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 4096.0, half 4096.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_0.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST0:v[0-9]+]], 0
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0.0, half 0.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_0.5_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0.5

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0.5, half 0.5>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONSTM05:v[0-9]+]], 0xb800
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -0.5

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -0.5, half -0.5>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_1.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1.0 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0x3c00
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1.0

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 1.0, half 1.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1.0 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0xbc00
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -1.0

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -1.0, half -1.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_2.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2.0 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2.0

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 2.0, half 2.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2.0 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xc000
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -2.0

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -2.0, half -2.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_4.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 4.0 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 4.0

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 4.0, half 4.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -4.0 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONSTM4:v[0-9]+]], 0xc400
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -4.0

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -4.0, half -4.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}commute_add_inline_imm_0.5_v2f16:
; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
; GFX9: buffer_store_dword [[REG]]

; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
; VI-DAG: buffer_load_dword

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}

; VI: buffer_store_dword
define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %x = load <2 x half>, <2 x half> addrspace(1)* %in
  %y = fadd <2 x half> %x, <half 0.5, half 0.5>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}commute_add_literal_v2f16:
; GFX9-DAG: buffer_load_dword [[VAL:v[0-9]+]]
; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
; VI-DAG: buffer_load_dword

; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: buffer_store_dword
define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %x = load <2 x half>, <2 x half> addrspace(1)* %in
  %y = fadd <2 x half> %x, <half 1024.0, half 1024.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_1_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1{{$}}
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1{{$}}

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0001, half 0xH0001>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_2_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2{{$}}
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2{{$}}

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0002, half 0xH0002>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_16_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 16 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16{{$}}
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 16{{$}}

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0010, half 0xH0010>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_1_v2f16:
; GFX9: s_add_i32 [[VAL:s[0-9]+]], s4, -1
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
; GFX9: buffer_store_dword [[REG]]

; VI: s_load_dword [[VAL:s[0-9]+]]
; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], -1{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; VI: buffer_store_dword [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %xbc = bitcast <2 x half> %x to i32
  %y = add i32 %xbc, -1
  %ybc = bitcast i32 %y to <2 x half>
  store <2 x half> %ybc, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_2_v2f16:
; GFX9: s_add_i32 [[VAL:s[0-9]+]], s4, 0xfffefffe
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
; GFX9: buffer_store_dword [[REG]]

; VI: s_load_dword [[VAL:s[0-9]+]]
; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfffefffe{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; VI: buffer_store_dword [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %xbc = bitcast <2 x half> %x to i32
  %y = add i32 %xbc, 4294901758 ; 0xfffefffe
  %ybc = bitcast i32 %y to <2 x half>
  store <2 x half> %ybc, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_16_v2f16:
; GFX9: s_add_i32 [[VAL:s[0-9]+]], s4, 0xfff0fff0
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
; GFX9: buffer_store_dword [[REG]]

; VI: s_load_dword [[VAL:s[0-9]+]]
; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfff0fff0{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; VI: buffer_store_dword [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %xbc = bitcast <2 x half> %x to i32
  %y = add i32 %xbc, 4293984240 ; 0xfff0fff0
  %ybc = bitcast i32 %y to <2 x half>
  store <2 x half> %ybc, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_63_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 63
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST63:v[0-9]+]], 63
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 63

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH003F, half 0xH003F>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_64_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 64
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST64:v[0-9]+]], 64
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 64

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0040, half 0xH0040>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind }