; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; FIXME: Merge into imm.ll

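; These tests cover materialization of packed 16-bit immediates (v2i16 and
; v2f16) and folding of f16 inline constants into packed (gfx900) and
; SDWA (fiji) f16 adds.
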
; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 {
  store <2 x i16> <i16 -32768, i16 -32768>, <2 x i16> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_0.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0.0, half 0.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_imm_neg_0.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -0.0, half -0.0>, <2 x half> addrspace(1)* %out
  ret void
}

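; Half-precision encodings used below: 0.5 = 0x3800, 1.0 = 0x3C00,
; 2.0 = 0x4000, 4.0 = 0x4400; setting the sign bit (0x8000) gives the
; negated value, and the 16-bit pattern is replicated into both halves
; of the stored dword.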
; GCN-LABEL: {{^}}store_inline_imm_0.5_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x38003800{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0.5, half 0.5>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_0.5_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800b800{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -0.5, half -0.5>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_1.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 1.0, half 1.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_1.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -1.0, half -1.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_2.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x40004000{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 2.0, half 2.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_2.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000c000{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -2.0, half -2.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_4.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x44004400{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 4.0, half 4.0>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_4.0_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400c400{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -4.0, half -4.0>, <2 x half> addrspace(1)* %out
  ret void
}

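; 0x3118 is the half-precision encoding of 1/(2*pi) (~0.1592); 0xB118 is its
; negation.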
; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x31183118{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0xH3118, half 0xH3118>, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118b118{{$}}
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0xHB118, half 0xHB118>, <2 x half> addrspace(1)* %out
  ret void
}

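; 4096.0 in half precision is 0x6C00; it is not an inline constant, so a full
; 32-bit literal is required.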
; GCN-LABEL: {{^}}store_literal_imm_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c006c00
; GCN: buffer_store_dword [[REG]]
define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 4096.0, half 4096.0>, <2 x half> addrspace(1)* %out
  ret void
}

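; The add tests below check that gfx900 folds the inline constant directly
; into v_pk_add_f16, while fiji splits the vector: the low half uses
; v_add_f16_e64 with the inline operand and the high half goes through a
; right shift plus an SDWA add.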
; GCN-LABEL: {{^}}add_inline_imm_0.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST0:v[0-9]+]], 0
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0.0, half 0.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_0.5_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0.5

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0.5, half 0.5>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONSTM05:v[0-9]+]], 0xb800
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -0.5

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -0.5, half -0.5>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_1.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1.0 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0x3c00
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1.0

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 1.0, half 1.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1.0 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0xbc00
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -1.0

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -1.0, half -1.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_2.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2.0 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2.0

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 2.0, half 2.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2.0 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xc000
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -2.0

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -2.0, half -2.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_4.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 4.0 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 4.0

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 4.0, half 4.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -4.0 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONSTM4:v[0-9]+]], 0xc400
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -4.0

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -4.0, half -4.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}commute_add_inline_imm_0.5_v2f16:
; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
; GFX9: buffer_store_dword [[REG]]

; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
; VI-DAG: buffer_load_dword

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}

; VI: buffer_store_dword
define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %x = load <2 x half>, <2 x half> addrspace(1)* %in
  %y = fadd <2 x half> %x, <half 0.5, half 0.5>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

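; 1024.0 in half precision is 0x6400, which is not an inline constant; it is
; materialized with s_movk_i32 and, for the SDWA add on fiji, copied to a
; VGPR first.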
; GCN-LABEL: {{^}}commute_add_literal_v2f16:
; GFX9-DAG: buffer_load_dword [[VAL:v[0-9]+]]
; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400{{$}}
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; VI-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400{{$}}
; VI-DAG: buffer_load_dword

; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
; gfx8 does not support an sreg or imm operand in sdwa, so the constant is moved to a VGPR first
; VI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: buffer_store_dword
define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %x = load <2 x half>, <2 x half> addrspace(1)* %in
  %y = fadd <2 x half> %x, <half 1024.0, half 1024.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

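; The following tests use the small integer inline constants 1, 2, 16, 63
; and 64; as f16 bit patterns (0xH0001 and so on) these are denormal values.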
; GCN-LABEL: {{^}}add_inline_imm_1_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1{{$}}
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1{{$}}

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0001, half 0xH0001>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_2_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2{{$}}
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2{{$}}

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0002, half 0xH0002>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_16_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 16 op_sel_hi:[1,0]{{$}}
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16{{$}}
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 16{{$}}

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0010, half 0xH0010>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

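; The negative integer cases below are written as i32 adds on the bitcast
; value, so they fold to a single scalar s_add_i32 rather than a packed f16
; add.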
; GCN-LABEL: {{^}}add_inline_imm_neg_1_v2f16:
; GFX9: s_add_i32 [[VAL:s[0-9]+]], s4, -1
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
; GFX9: buffer_store_dword [[REG]]

; VI: s_load_dword [[VAL:s[0-9]+]]
; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], -1{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; VI: buffer_store_dword [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %xbc = bitcast <2 x half> %x to i32
  %y = add i32 %xbc, -1
  %ybc = bitcast i32 %y to <2 x half>
  store <2 x half> %ybc, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_2_v2f16:
; GFX9: s_add_i32 [[VAL:s[0-9]+]], s4, 0xfffefffe
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
; GFX9: buffer_store_dword [[REG]]

; VI: s_load_dword [[VAL:s[0-9]+]]
; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfffefffe{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; VI: buffer_store_dword [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %xbc = bitcast <2 x half> %x to i32
  %y = add i32 %xbc, 4294901758 ; 0xfffefffe
  %ybc = bitcast i32 %y to <2 x half>
  store <2 x half> %ybc, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_neg_16_v2f16:
; GFX9: s_add_i32 [[VAL:s[0-9]+]], s4, 0xfff0fff0
; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
; GFX9: buffer_store_dword [[REG]]

; VI: s_load_dword [[VAL:s[0-9]+]]
; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfff0fff0{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; VI: buffer_store_dword [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %xbc = bitcast <2 x half> %x to i32
  %y = add i32 %xbc, 4293984240 ; 0xfff0fff0
  %ybc = bitcast i32 %y to <2 x half>
  store <2 x half> %ybc, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_63_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 63
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST63:v[0-9]+]], 63
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 63

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH003F, half 0xH003F>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}add_inline_imm_64_v2f16:
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 64
; GFX9: buffer_store_dword [[REG]]

; FIXME: Shouldn't need right shift and SDWA, also extra copy
; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
; VI-DAG: v_mov_b32_e32 [[CONST64:v[0-9]+]], 64
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]

; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 64

; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0040, half 0xH0040>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind }