1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
5 ; FIXME: Merge into imm.ll
; Stores the constant <2 x i16> splat of -32768 (bit pattern 0x8000 per lane) to %out.
; Expected codegen: one v_mov_b32 materializes the packed 32-bit value 0x80008000,
; which is then written with buffer_store_dword.
7 ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
8 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000 ; encoding
9 ; GCN: buffer_store_dword [[REG]]
10 define amdgpu_kernel void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 {
11 store <2 x i16> <i16 -32768, i16 -32768>, <2 x i16> addrspace(1)* %out
; Stores the constant <2 x half> splat <0.0, 0.0> to %out.
; Expected codegen: v_mov_b32 with the operand 0, followed by buffer_store_dword.
16 ; GCN-LABEL: {{^}}store_inline_imm_0.0_v2f16:
17 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0 ; encoding
18 ; GCN: buffer_store_dword [[REG]]
19 define amdgpu_kernel void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
20 store <2 x half> <half 0.0, half 0.0>, <2 x half> addrspace(1)* %out
; Stores the constant <2 x half> splat <-0.0, -0.0> to %out.
; Expected codegen: v_mov_b32 of the packed value 0x80008000 (only the sign bit
; of each lane set), followed by buffer_store_dword.
24 ; GCN-LABEL: {{^}}store_imm_neg_0.0_v2f16:
25 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000 ; encoding
26 ; GCN: buffer_store_dword [[REG]]
27 define amdgpu_kernel void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
28 store <2 x half> <half -0.0, half -0.0>, <2 x half> addrspace(1)* %out
; Stores the constant <2 x half> splat <0.5, 0.5> to %out.
; Expected codegen: v_mov_b32 of the packed value 0x38003800, then buffer_store_dword.
32 ; GCN-LABEL: {{^}}store_inline_imm_0.5_v2f16:
33 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x38003800 ; encoding
34 ; GCN: buffer_store_dword [[REG]]
35 define amdgpu_kernel void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
36 store <2 x half> <half 0.5, half 0.5>, <2 x half> addrspace(1)* %out
; Stores the constant <2 x half> splat <-0.5, -0.5> to %out.
; Expected codegen: v_mov_b32 of the packed value 0xb800b800, then buffer_store_dword.
40 ; GCN-LABEL: {{^}}store_inline_imm_m_0.5_v2f16:
41 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800b800 ; encoding
42 ; GCN: buffer_store_dword [[REG]]
43 define amdgpu_kernel void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
44 store <2 x half> <half -0.5, half -0.5>, <2 x half> addrspace(1)* %out
; Stores the constant <2 x half> splat <1.0, 1.0> to %out.
; Expected codegen: v_mov_b32 of the packed value 0x3c003c00, then buffer_store_dword.
48 ; GCN-LABEL: {{^}}store_inline_imm_1.0_v2f16:
49 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00 ; encoding
50 ; GCN: buffer_store_dword [[REG]]
51 define amdgpu_kernel void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
52 store <2 x half> <half 1.0, half 1.0>, <2 x half> addrspace(1)* %out
; Stores the constant <2 x half> splat <-1.0, -1.0> to %out.
; Expected codegen: v_mov_b32 of the packed value 0xbc00bc00, then buffer_store_dword.
56 ; GCN-LABEL: {{^}}store_inline_imm_m_1.0_v2f16:
57 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00 ; encoding
58 ; GCN: buffer_store_dword [[REG]]
59 define amdgpu_kernel void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
60 store <2 x half> <half -1.0, half -1.0>, <2 x half> addrspace(1)* %out
; Stores the constant <2 x half> splat <2.0, 2.0> to %out.
; Expected codegen: v_mov_b32 of the packed value 0x40004000, then buffer_store_dword.
64 ; GCN-LABEL: {{^}}store_inline_imm_2.0_v2f16:
65 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x40004000 ; encoding
66 ; GCN: buffer_store_dword [[REG]]
67 define amdgpu_kernel void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
68 store <2 x half> <half 2.0, half 2.0>, <2 x half> addrspace(1)* %out
; Stores the constant <2 x half> splat <-2.0, -2.0> to %out.
; Expected codegen: v_mov_b32 of the packed value 0xc000c000, then buffer_store_dword.
72 ; GCN-LABEL: {{^}}store_inline_imm_m_2.0_v2f16:
73 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000c000 ; encoding
74 ; GCN: buffer_store_dword [[REG]]
75 define amdgpu_kernel void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
76 store <2 x half> <half -2.0, half -2.0>, <2 x half> addrspace(1)* %out
; Stores the constant <2 x half> splat <4.0, 4.0> to %out.
; Expected codegen: v_mov_b32 of the packed value 0x44004400, then buffer_store_dword.
80 ; GCN-LABEL: {{^}}store_inline_imm_4.0_v2f16:
81 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x44004400 ; encoding
82 ; GCN: buffer_store_dword [[REG]]
83 define amdgpu_kernel void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
84 store <2 x half> <half 4.0, half 4.0>, <2 x half> addrspace(1)* %out
; Stores the constant <2 x half> splat <-4.0, -4.0> to %out.
; Expected codegen: v_mov_b32 of the packed value 0xc400c400, then buffer_store_dword.
88 ; GCN-LABEL: {{^}}store_inline_imm_m_4.0_v2f16:
89 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400c400 ; encoding
90 ; GCN: buffer_store_dword [[REG]]
91 define amdgpu_kernel void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
92 store <2 x half> <half -4.0, half -4.0>, <2 x half> addrspace(1)* %out
; Stores a <2 x half> splat whose lanes have the bit pattern 0xH3118 (the value the
; test name refers to as inv_2pi) to %out.
; Expected codegen: v_mov_b32 of the packed value 0x31183118, then buffer_store_dword.
96 ; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_v2f16:
97 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x31183118 ; encoding
98 ; GCN: buffer_store_dword [[REG]]
99 define amdgpu_kernel void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
100 store <2 x half> <half 0xH3118, half 0xH3118>, <2 x half> addrspace(1)* %out
; Stores a <2 x half> splat whose lanes have the bit pattern 0xHB118 (0xH3118 with
; the sign bit set) to %out.
; Expected codegen: v_mov_b32 of the packed value 0xb118b118, then buffer_store_dword.
104 ; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_v2f16:
105 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118b118 ; encoding
106 ; GCN: buffer_store_dword [[REG]]
107 define amdgpu_kernel void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
108 store <2 x half> <half 0xHB118, half 0xHB118>, <2 x half> addrspace(1)* %out
; Stores the constant <2 x half> splat <4096.0, 4096.0> to %out.
; Expected codegen: v_mov_b32 of the packed value 0x6c006c00, then buffer_store_dword.
; Unlike the neighbouring store tests, the v_mov check here does not match the
; trailing encoding comment.
112 ; GCN-LABEL: {{^}}store_literal_imm_v2f16:
113 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c006c00
114 ; GCN: buffer_store_dword [[REG]]
115 define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 {
116 store <2 x half> <half 4096.0, half 4096.0>, <2 x half> addrspace(1)* %out
; fadd of the <2 x half> kernel argument %x with the splat <0.0, 0.0>; result stored to %out.
; gfx900 expectation: the argument is loaded with s_load_dword and added by a single
; v_pk_add_f16 whose second source is the operand 0.
; fiji expectation: the add is split in two - an SDWA v_add_f16 on the value shifted
; right by 16 (result written to WORD_1, constant taken from a VGPR) and a
; v_add_f16_e64 on the unshifted value with the operand 0.
120 ; GCN-LABEL: {{^}}add_inline_imm_0.0_v2f16:
121 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
122 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0 ; encoding
123 ; GFX9: buffer_store_dword [[REG]]
125 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
126 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
127 ; VI-DAG: v_mov_b32_e32 [[CONST0:v[0-9]+]], 0
128 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
129 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
131 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
132 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0
134 ; VI: buffer_store_dword
135 define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
136 %y = fadd <2 x half> %x, <half 0.0, half 0.0>
137 store <2 x half> %y, <2 x half> addrspace(1)* %out
; fadd of the <2 x half> kernel argument %x with the splat <0.5, 0.5>; result stored to %out.
; gfx1010 and gfx900 expectation: a single v_pk_add_f16 with the operand 0.5 and
; op_sel_hi:[1,0]; the exact encoding bytes are checked for both targets.
; fiji expectation: the add is split in two - an SDWA v_add_f16 on the value shifted
; right by 16 using 0x3800 from a VGPR (result written to WORD_1), and a
; v_add_f16_e64 on the unshifted value with the operand 0.5.
141 ; GCN-LABEL: {{^}}add_inline_imm_0.5_v2f16:
142 ; GFX10: s_load_dword [[VAL:s[0-9]+]]
143 ; GFX10: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xe0,0x01,0x08]
144 ; GFX10: buffer_store_dword [[REG]]
146 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
147 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x04,0xe0,0x01,0x08]
148 ; GFX9: buffer_store_dword [[REG]]
150 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
151 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
152 ; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
153 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
154 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
156 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
157 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0.5
159 ; VI: buffer_store_dword
160 define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
161 %y = fadd <2 x half> %x, <half 0.5, half 0.5>
162 store <2 x half> %y, <2 x half> addrspace(1)* %out
; fadd of the <2 x half> kernel argument %x with the splat <-0.5, -0.5>; result stored to %out.
; gfx1010 and gfx900 expectation: a single v_pk_add_f16 with the operand -0.5 and
; op_sel_hi:[1,0]; the exact encoding bytes are checked for both targets.
; fiji expectation: the add is split in two - an SDWA v_add_f16 on the value shifted
; right by 16 using 0xb800 from a VGPR (result written to WORD_1), and a
; v_add_f16_e64 on the unshifted value with the operand -0.5.
166 ; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_v2f16:
167 ; GFX10: s_load_dword [[VAL:s[0-9]+]]
168 ; GFX10: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xe2,0x01,0x08]
169 ; GFX10: buffer_store_dword [[REG]]
171 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
172 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x04,0xe2,0x01,0x08]
173 ; GFX9: buffer_store_dword [[REG]]
175 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
176 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
177 ; VI-DAG: v_mov_b32_e32 [[CONSTM05:v[0-9]+]], 0xb800
178 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
179 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
181 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
182 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -0.5
184 ; VI: buffer_store_dword
185 define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
186 %y = fadd <2 x half> %x, <half -0.5, half -0.5>
187 store <2 x half> %y, <2 x half> addrspace(1)* %out
; fadd of the <2 x half> kernel argument %x with the splat <1.0, 1.0>; result stored to %out.
; gfx900 expectation: a single v_pk_add_f16 with the operand 1.0 and op_sel_hi:[1,0].
; fiji expectation: the add is split in two - an SDWA v_add_f16 on the value shifted
; right by 16 using 0x3c00 from a VGPR (result written to WORD_1), and a
; v_add_f16_e64 on the unshifted value with the operand 1.0.
191 ; GCN-LABEL: {{^}}add_inline_imm_1.0_v2f16:
192 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
193 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1.0 op_sel_hi:[1,0] ; encoding
194 ; GFX9: buffer_store_dword [[REG]]
196 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
197 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
198 ; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0x3c00
199 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
200 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
202 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
203 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1.0
205 ; VI: buffer_store_dword
206 define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
207 %y = fadd <2 x half> %x, <half 1.0, half 1.0>
208 store <2 x half> %y, <2 x half> addrspace(1)* %out
; fadd of the <2 x half> kernel argument %x with the splat <-1.0, -1.0>; result stored to %out.
; gfx900 expectation: a single v_pk_add_f16 with the operand -1.0 and op_sel_hi:[1,0].
; fiji expectation: the add is split in two - an SDWA v_add_f16 on the value shifted
; right by 16 using 0xbc00 from a VGPR (result written to WORD_1), and a
; v_add_f16_e64 on the unshifted value with the operand -1.0.
; The captured constant register is named CONSTM1, following the CONSTM<n> naming
; the other negative-constant tests in this file use.
212 ; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_v2f16:
213 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
214 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1.0 op_sel_hi:[1,0] ; encoding
215 ; GFX9: buffer_store_dword [[REG]]
218 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
219 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
220 ; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
221 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
222 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
224 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
225 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -1.0
227 ; VI: buffer_store_dword
228 define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
229 %y = fadd <2 x half> %x, <half -1.0, half -1.0>
230 store <2 x half> %y, <2 x half> addrspace(1)* %out
; fadd of the <2 x half> kernel argument %x with the splat <2.0, 2.0>; result stored to %out.
; gfx900 expectation: a single v_pk_add_f16 with the operand 2.0 and op_sel_hi:[1,0].
; fiji expectation: the add is split in two - an SDWA v_add_f16 on the value shifted
; right by 16 using 0x4000 from a VGPR (result written to WORD_1), and a
; v_add_f16_e64 on the unshifted value with the operand 2.0.
234 ; GCN-LABEL: {{^}}add_inline_imm_2.0_v2f16:
235 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
236 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2.0 op_sel_hi:[1,0] ; encoding
237 ; GFX9: buffer_store_dword [[REG]]
239 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
240 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
241 ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
242 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
243 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
245 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
246 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2.0
248 ; VI: buffer_store_dword
249 define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
250 %y = fadd <2 x half> %x, <half 2.0, half 2.0>
251 store <2 x half> %y, <2 x half> addrspace(1)* %out
; fadd of the <2 x half> kernel argument %x with the splat <-2.0, -2.0>; result stored to %out.
; gfx900 expectation: a single v_pk_add_f16 with the operand -2.0 and op_sel_hi:[1,0].
; fiji expectation: the add is split in two - an SDWA v_add_f16 on the value shifted
; right by 16 using 0xc000 from a VGPR (result written to WORD_1), and a
; v_add_f16_e64 on the unshifted value with the operand -2.0.
255 ; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_v2f16:
256 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
257 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2.0 op_sel_hi:[1,0] ; encoding
258 ; GFX9: buffer_store_dword [[REG]]
260 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
261 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
262 ; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xc000
263 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
264 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
266 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
267 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -2.0
269 ; VI: buffer_store_dword
270 define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
271 %y = fadd <2 x half> %x, <half -2.0, half -2.0>
272 store <2 x half> %y, <2 x half> addrspace(1)* %out
; fadd of the <2 x half> kernel argument %x with the splat <4.0, 4.0>; result stored to %out.
; gfx900 expectation: a single v_pk_add_f16 with the operand 4.0 and op_sel_hi:[1,0].
; fiji expectation: the add is split in two - an SDWA v_add_f16 on the value shifted
; right by 16 using 0x4400 from a VGPR (result written to WORD_1), and a
; v_add_f16_e64 on the unshifted value with the operand 4.0.
276 ; GCN-LABEL: {{^}}add_inline_imm_4.0_v2f16:
277 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
278 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 4.0 op_sel_hi:[1,0] ; encoding
279 ; GFX9: buffer_store_dword [[REG]]
281 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
282 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
283 ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
284 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
285 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
287 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
288 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 4.0
290 ; VI: buffer_store_dword
291 define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
292 %y = fadd <2 x half> %x, <half 4.0, half 4.0>
293 store <2 x half> %y, <2 x half> addrspace(1)* %out
; fadd of the <2 x half> kernel argument %x with the splat <-4.0, -4.0>; result stored to %out.
; gfx900 expectation: a single v_pk_add_f16 with the operand -4.0 and op_sel_hi:[1,0].
; fiji expectation: the add is split in two - an SDWA v_add_f16 on the value shifted
; right by 16 using 0xc400 from a VGPR (result written to WORD_1), and a
; v_add_f16_e64 on the unshifted value with the operand -4.0.
297 ; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_v2f16:
298 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
299 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -4.0 op_sel_hi:[1,0] ; encoding
300 ; GFX9: buffer_store_dword [[REG]]
302 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
303 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
304 ; VI-DAG: v_mov_b32_e32 [[CONSTM4:v[0-9]+]], 0xc400
305 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
306 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
308 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
309 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -4.0
311 ; VI: buffer_store_dword
312 define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
313 %y = fadd <2 x half> %x, <half -4.0, half -4.0>
314 store <2 x half> %y, <2 x half> addrspace(1)* %out
; Loads a <2 x half> from %in, adds the splat <0.5, 0.5>, and stores the result to %out.
; Here the non-constant operand comes from a buffer_load_dword (a VGPR) rather than
; from a kernel argument.
; gfx900 expectation: a single v_pk_add_f16 with the loaded value and the operand 0.5.
; fiji expectation: a v_add_f16_e32 with 0.5 as its first source operand, plus an SDWA
; v_add_f16 that reads WORD_1 of its first source and takes 0x3800 from a VGPR.
318 ; GCN-LABEL: {{^}}commute_add_inline_imm_0.5_v2f16:
319 ; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
320 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
321 ; GFX9: buffer_store_dword [[REG]]
323 ; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
324 ; VI-DAG: buffer_load_dword
326 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
327 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
329 ; VI: buffer_store_dword
330 define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
331 %x = load <2 x half>, <2 x half> addrspace(1)* %in
332 %y = fadd <2 x half> %x, <half 0.5, half 0.5>
333 store <2 x half> %y, <2 x half> addrspace(1)* %out
; Loads a <2 x half> from %in, adds the splat <1024.0, 1024.0> (0x6400 per lane),
; and stores the result to %out.
; gfx1010 expectation: the literal 0x6400 is used directly as the first source of
; v_pk_add_f16 with op_sel_hi:[0,1]; the full encoding, including the literal bytes, is checked.
; gfx900 expectation: the constant is first placed in an SGPR with s_movk_i32 and that
; SGPR is used as the second source of v_pk_add_f16 with op_sel_hi:[1,0].
; fiji expectation: s_movk_i32 into an SGPR, a v_add_f16_e32 using that SGPR, an SDWA
; add using a VGPR copy of it, and a v_or_b32 before the store.
337 ; GCN-LABEL: {{^}}commute_add_literal_v2f16:
338 ; GFX10: v_pk_add_f16 v0, 0x6400, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x0f,0xcc,0xff,0x00,0x02,0x10,0x00,0x64,0x00,0x00]
340 ; GFX9-DAG: buffer_load_dword [[VAL:v[0-9]+]]
341 ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400 ; encoding
342 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x00,0x09,0x00,0x08]
343 ; GFX9: buffer_store_dword [[REG]]
345 ; VI-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400 ; encoding
346 ; VI-DAG: buffer_load_dword
348 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
349 ; gfx8 does not support an SGPR or immediate operand in SDWA, so the constant is first moved into a VGPR.
350 ; VI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
351 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
352 ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
353 ; VI: buffer_store_dword
354 define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
355 %x = load <2 x half>, <2 x half> addrspace(1)* %in
356 %y = fadd <2 x half> %x, <half 1024.0, half 1024.0>
357 store <2 x half> %y, <2 x half> addrspace(1)* %out
; fadd of the <2 x half> kernel argument %x with a splat whose lanes have the bit
; pattern 0xH0001; result stored to %out.
; gfx900 expectation: a single v_pk_add_f16 whose constant is printed as the integer
; operand 1, with op_sel_hi:[1,0].
; fiji expectation: the add is split in two - an SDWA v_add_f16 on the value shifted
; right by 16 using the value 1 from a VGPR (result written to WORD_1), and a
; v_add_f16_e64 on the unshifted value with the operand 1.
361 ; GCN-LABEL: {{^}}add_inline_imm_1_v2f16:
362 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
363 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1 op_sel_hi:[1,0] ; encoding
364 ; GFX9: buffer_store_dword [[REG]]
366 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
367 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
368 ; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1 ; encoding
369 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
370 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
372 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
373 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1 ; encoding
375 ; VI: buffer_store_dword
376 define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
377 %y = fadd <2 x half> %x, <half 0xH0001, half 0xH0001>
378 store <2 x half> %y, <2 x half> addrspace(1)* %out
; fadd of the <2 x half> kernel argument %x with a splat whose lanes have the bit
; pattern 0xH0002; result stored to %out.
; gfx900 expectation: a single v_pk_add_f16 whose constant is printed as the integer
; operand 2, with op_sel_hi:[1,0].
; fiji expectation: the add is split in two - an SDWA v_add_f16 on the value shifted
; right by 16 using the value 2 from a VGPR (result written to WORD_1), and a
; v_add_f16_e64 on the unshifted value with the operand 2.
382 ; GCN-LABEL: {{^}}add_inline_imm_2_v2f16:
383 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
384 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2 op_sel_hi:[1,0] ; encoding
385 ; GFX9: buffer_store_dword [[REG]]
388 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
389 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
390 ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2 ; encoding
391 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
392 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
394 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
395 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2 ; encoding
397 ; VI: buffer_store_dword
398 define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
399 %y = fadd <2 x half> %x, <half 0xH0002, half 0xH0002>
400 store <2 x half> %y, <2 x half> addrspace(1)* %out
; fadd of the <2 x half> kernel argument %x with a splat whose lanes have the bit
; pattern 0xH0010; result stored to %out.
; gfx900 expectation: a single v_pk_add_f16 whose constant is printed as the integer
; operand 16, with op_sel_hi:[1,0].
; fiji expectation: the add is split in two - an SDWA v_add_f16 on the value shifted
; right by 16 using the value 16 from a VGPR (result written to WORD_1), and a
; v_add_f16_e64 on the unshifted value with the operand 16.
404 ; GCN-LABEL: {{^}}add_inline_imm_16_v2f16:
405 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
406 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 16 op_sel_hi:[1,0] ; encoding
407 ; GFX9: buffer_store_dword [[REG]]
410 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
411 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
412 ; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16 ; encoding
413 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
414 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
416 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
417 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 16 ; encoding
419 ; VI: buffer_store_dword
420 define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
421 %y = fadd <2 x half> %x, <half 0xH0010, half 0xH0010>
422 store <2 x half> %y, <2 x half> addrspace(1)* %out
; Integer (not floating-point) add: %x is bitcast to i32, -1 is added, and the sum is
; bitcast back to <2 x half> and stored to %out.
; Expected codegen on gfx900 and fiji: a scalar s_add_i32 with the operand -1, a copy
; of the result into a VGPR, then buffer_store_dword.
; NOTE(review): the gfx900 check hard-codes s4 as the add's source register, whereas
; the fiji check captures the loaded register.
426 ; GCN-LABEL: {{^}}add_inline_imm_neg_1_v2f16:
427 ; GFX9: s_add_i32 [[VAL:s[0-9]+]], s4, -1
428 ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
429 ; GFX9: buffer_store_dword [[REG]]
431 ; VI: s_load_dword [[VAL:s[0-9]+]]
432 ; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], -1 ; encoding
433 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
434 ; VI: buffer_store_dword [[REG]]
435 define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
436 %xbc = bitcast <2 x half> %x to i32
437 %y = add i32 %xbc, -1
438 %ybc = bitcast i32 %y to <2 x half>
439 store <2 x half> %ybc, <2 x half> addrspace(1)* %out
; Integer (not floating-point) add: %x is bitcast to i32, the constant 0xfffefffe
; (0xfffe in each 16-bit half) is added, and the sum is bitcast back to <2 x half>
; and stored to %out.
; Expected codegen on gfx900 and fiji: a scalar s_add_i32 with the 32-bit literal
; 0xfffefffe, a copy of the result into a VGPR, then buffer_store_dword.
; NOTE(review): the gfx900 check hard-codes s4 as the add's source register.
443 ; GCN-LABEL: {{^}}add_inline_imm_neg_2_v2f16:
444 ; GFX9: s_add_i32 [[VAL:s[0-9]+]], s4, 0xfffefffe
445 ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
446 ; GFX9: buffer_store_dword [[REG]]
448 ; VI: s_load_dword [[VAL:s[0-9]+]]
449 ; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfffefffe ; encoding
450 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
451 ; VI: buffer_store_dword [[REG]]
452 define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
453 %xbc = bitcast <2 x half> %x to i32
454 %y = add i32 %xbc, 4294901758 ; 0xfffefffe
455 %ybc = bitcast i32 %y to <2 x half>
456 store <2 x half> %ybc, <2 x half> addrspace(1)* %out
; Integer (not floating-point) add: %x is bitcast to i32, the constant 0xfff0fff0
; (0xfff0 in each 16-bit half) is added, and the sum is bitcast back to <2 x half>
; and stored to %out.
; Expected codegen on gfx900 and fiji: a scalar s_add_i32 with the 32-bit literal
; 0xfff0fff0, a copy of the result into a VGPR, then buffer_store_dword.
; NOTE(review): the gfx900 check hard-codes s4 as the add's source register.
460 ; GCN-LABEL: {{^}}add_inline_imm_neg_16_v2f16:
461 ; GFX9: s_add_i32 [[VAL:s[0-9]+]], s4, 0xfff0fff0
462 ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
463 ; GFX9: buffer_store_dword [[REG]]
466 ; VI: s_load_dword [[VAL:s[0-9]+]]
467 ; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfff0fff0 ; encoding
468 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
469 ; VI: buffer_store_dword [[REG]]
470 define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
471 %xbc = bitcast <2 x half> %x to i32
472 %y = add i32 %xbc, 4293984240 ; 0xfff0fff0
473 %ybc = bitcast i32 %y to <2 x half>
474 store <2 x half> %ybc, <2 x half> addrspace(1)* %out
; fadd of the <2 x half> kernel argument %x with a splat whose lanes have the bit
; pattern 0xH003F (63 as an integer); result stored to %out.
; gfx900 expectation: a single v_pk_add_f16 with the operand 63.
; fiji expectation: the add is split in two - an SDWA v_add_f16 on the value shifted
; right by 16 using the value 63 from a VGPR (result written to WORD_1), and a
; v_add_f16_e64 on the unshifted value with the operand 63.
478 ; GCN-LABEL: {{^}}add_inline_imm_63_v2f16:
479 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
480 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 63
481 ; GFX9: buffer_store_dword [[REG]]
483 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
484 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
485 ; VI-DAG: v_mov_b32_e32 [[CONST63:v[0-9]+]], 63
486 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
487 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
489 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
490 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 63
492 ; VI: buffer_store_dword
493 define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
494 %y = fadd <2 x half> %x, <half 0xH003F, half 0xH003F>
495 store <2 x half> %y, <2 x half> addrspace(1)* %out
; fadd of the <2 x half> kernel argument %x with a splat whose lanes have the bit
; pattern 0xH0040 (64 as an integer); result stored to %out.
; gfx900 expectation: a single v_pk_add_f16 with the operand 64.
; fiji expectation: the add is split in two - an SDWA v_add_f16 on the value shifted
; right by 16 using the value 64 from a VGPR (result written to WORD_1), and a
; v_add_f16_e64 on the unshifted value with the operand 64.
499 ; GCN-LABEL: {{^}}add_inline_imm_64_v2f16:
500 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
501 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 64
502 ; GFX9: buffer_store_dword [[REG]]
504 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
505 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
506 ; VI-DAG: v_mov_b32_e32 [[CONST64:v[0-9]+]], 64
507 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
508 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
510 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
511 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 64
513 ; VI: buffer_store_dword
514 define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
515 %y = fadd <2 x half> %x, <half 0xH0040, half 0xH0040>
516 store <2 x half> %y, <2 x half> addrspace(1)* %out
; Integer multiply of the <2 x i16> argument %x by the bit pattern of the half splat
; <0.5, 0.5> (0x3800 per lane). This is a regular function, not a kernel.
; gfx900 expectation: the packed constant 0x38003800 is placed in an SGPR with
; s_mov_b32 and used as the second source of v_pk_mul_lo_u16.
; gfx1010 expectation: the 16-bit literal 0x3800 is used directly with op_sel_hi:[0,1];
; the encoding check pins only the 0xff byte and the trailing literal bytes.
520 ; GCN-LABEL: {{^}}mul_inline_imm_0.5_v2i16:
521 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x38003800
522 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
524 ; GFX10: v_pk_mul_lo_u16 v0, 0x3800, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x38,0x00,0x00]
525 define <2 x i16> @mul_inline_imm_0.5_v2i16(<2 x i16> %x) {
526 %y = mul <2 x i16> %x, bitcast (<2 x half> <half 0.5, half 0.5> to <2 x i16>)
; Integer multiply of the <2 x i16> argument %x by the bit pattern of the half splat
; <-0.5, -0.5> (0xb800 per lane). This is a regular function, not a kernel.
; gfx900 expectation: the packed constant 0xb800b800 is placed in an SGPR with
; s_mov_b32 and used as the second source of v_pk_mul_lo_u16.
; gfx1010 expectation: the 16-bit literal 0xb800 is used directly with op_sel_hi:[0,1];
; the encoding check pins only the 0xff byte and the trailing literal bytes.
530 ; GCN-LABEL: {{^}}mul_inline_imm_neg_0.5_v2i16:
531 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xb800b800
532 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
534 ; GFX10: v_pk_mul_lo_u16 v0, 0xb800, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xb8,0x00,0x00]
535 define <2 x i16> @mul_inline_imm_neg_0.5_v2i16(<2 x i16> %x) {
536 %y = mul <2 x i16> %x, bitcast (<2 x half> <half -0.5, half -0.5> to <2 x i16>)
; Integer multiply of the <2 x i16> argument %x by the bit pattern of the half splat
; <1.0, 1.0> (0x3c00 per lane). This is a regular function, not a kernel.
; gfx900 expectation: the packed constant 0x3c003c00 is placed in an SGPR with
; s_mov_b32 and used as the second source of v_pk_mul_lo_u16.
; gfx1010 expectation: the 16-bit literal 0x3c00 is used directly with op_sel_hi:[0,1];
; the encoding check pins only the 0xff byte and the trailing literal bytes.
540 ; GCN-LABEL: {{^}}mul_inline_imm_1.0_v2i16:
541 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x3c003c00
542 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
544 ; GFX10: v_pk_mul_lo_u16 v0, 0x3c00, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x3c,0x00,0x00]
545 define <2 x i16> @mul_inline_imm_1.0_v2i16(<2 x i16> %x) {
546 %y = mul <2 x i16> %x, bitcast (<2 x half> <half 1.0, half 1.0> to <2 x i16>)
; Integer multiply of the <2 x i16> argument %x by the bit pattern of the half splat
; <-1.0, -1.0> (0xbc00 per lane). This is a regular function, not a kernel.
; gfx900 expectation: the packed constant 0xbc00bc00 is placed in an SGPR with
; s_mov_b32 and used as the second source of v_pk_mul_lo_u16.
; gfx1010 expectation: the 16-bit literal 0xbc00 is used directly with op_sel_hi:[0,1];
; the encoding check pins only the 0xff byte and the trailing literal bytes.
550 ; GCN-LABEL: {{^}}mul_inline_imm_neg_1.0_v2i16:
551 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00bc00
552 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
554 ; GFX10: v_pk_mul_lo_u16 v0, 0xbc00, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xbc,0x00,0x00]
555 define <2 x i16> @mul_inline_imm_neg_1.0_v2i16(<2 x i16> %x) {
556 %y = mul <2 x i16> %x, bitcast (<2 x half> <half -1.0, half -1.0> to <2 x i16>)
; Left shift where the shifted value is the bit pattern of the half splat <2.0, 2.0>
; (0x4000 per lane) and the shift amount is the <2 x i16> argument %x.
; This is a regular function, not a kernel.
; gfx900 expectation: the packed constant 0x40004000 is placed in an SGPR with
; s_mov_b32 and used as the last operand of v_pk_lshlrev_b16.
; gfx1010 expectation: the 16-bit literal 0x4000 is used directly with op_sel_hi:[1,0];
; the encoding check pins only the 0xff byte and the trailing literal bytes.
560 ; GCN-LABEL: {{^}}shl_inline_imm_2.0_v2i16:
561 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40004000
562 ; GFX9: v_pk_lshlrev_b16 v0, v0, [[K]]
564 ; GFX10: v_pk_lshlrev_b16 v0, v0, 0x4000 op_sel_hi:[1,0] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x40,0x00,0x00]
565 define <2 x i16> @shl_inline_imm_2.0_v2i16(<2 x i16> %x) {
566 %y = shl <2 x i16> bitcast (<2 x half> <half 2.0, half 2.0> to <2 x i16>), %x
; Left shift where the shifted value is the bit pattern of the half splat <-2.0, -2.0>
; (0xc000 per lane) and the shift amount is the <2 x i16> argument %x.
; This is a regular function, not a kernel.
; gfx900 expectation: the packed constant 0xc000c000 is placed in an SGPR with
; s_mov_b32 and used as the last operand of v_pk_lshlrev_b16.
; gfx1010 expectation: the 16-bit literal 0xc000 is used directly with op_sel_hi:[1,0];
; the encoding check pins only the 0xff byte and the trailing literal bytes.
570 ; GCN-LABEL: {{^}}shl_inline_imm_neg_2.0_v2i16:
571 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xc000c000
572 ; GFX9: v_pk_lshlrev_b16 v0, v0, [[K]]
574 ; GFX10: v_pk_lshlrev_b16 v0, v0, 0xc000 op_sel_hi:[1,0] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc0,0x00,0x00]
575 define <2 x i16> @shl_inline_imm_neg_2.0_v2i16(<2 x i16> %x) {
576 %y = shl <2 x i16> bitcast (<2 x half> <half -2.0, half -2.0> to <2 x i16>), %x
; Integer multiply of the <2 x i16> argument %x by the bit pattern of the half splat
; <4.0, 4.0> (0x4400 per lane). This is a regular function, not a kernel.
; gfx900 expectation: the packed constant 0x44004400 is placed in an SGPR with
; s_mov_b32 and used as the second source of v_pk_mul_lo_u16.
; gfx1010 expectation: the 16-bit literal 0x4400 is used directly with op_sel_hi:[0,1];
; the encoding check pins only the 0xff byte and the trailing literal bytes.
580 ; GCN-LABEL: {{^}}mul_inline_imm_4.0_v2i16:
581 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004400
582 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
584 ; GFX10: v_pk_mul_lo_u16 v0, 0x4400, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x44,0x00,0x00]
585 define <2 x i16> @mul_inline_imm_4.0_v2i16(<2 x i16> %x) {
586 %y = mul <2 x i16> %x, bitcast (<2 x half> <half 4.0, half 4.0> to <2 x i16>)
; Integer multiply of the <2 x i16> argument %x by the bit pattern of the half splat
; <-4.0, -4.0> (0xc400 per lane). This is a regular function, not a kernel.
; gfx900 expectation: the packed constant 0xc400c400 is placed in an SGPR with
; s_mov_b32 and used as the second source of v_pk_mul_lo_u16.
; gfx1010 expectation: the 16-bit literal 0xc400 is used directly with op_sel_hi:[0,1];
; the encoding check pins only the 0xff byte and the trailing literal bytes.
591 ; GCN-LABEL: {{^}}mul_inline_imm_neg_4.0_v2i16:
592 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xc400c400
593 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
595 ; GFX10: v_pk_mul_lo_u16 v0, 0xc400, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc4,0x00,0x00]
596 define <2 x i16> @mul_inline_imm_neg_4.0_v2i16(<2 x i16> %x) {
597 %y = mul <2 x i16> %x, bitcast (<2 x half> <half -4.0, half -4.0> to <2 x i16>)
; Integer multiply of the <2 x i16> argument %x by a splat with the bit pattern 0x3118
; per lane (the half value the test name refers to as inv2pi). This is a regular
; function, not a kernel.
; gfx900 expectation: the packed constant 0x31183118 is placed in an SGPR with
; s_mov_b32 and used as the second source of v_pk_mul_lo_u16.
; gfx1010 expectation: the 16-bit literal 0x3118 is used directly with op_sel_hi:[0,1];
; the encoding check pins only the 0xff byte and the trailing literal bytes.
601 ; GCN-LABEL: {{^}}mul_inline_imm_inv2pi_v2i16:
602 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x31183118
603 ; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
605 ; GFX10: v_pk_mul_lo_u16 v0, 0x3118, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x18,0x31,0x00,0x00]
606 define <2 x i16> @mul_inline_imm_inv2pi_v2i16(<2 x i16> %x) {
607 %y = mul <2 x i16> %x, bitcast (<2 x half> <half 0xH3118, half 0xH3118> to <2 x i16>)
610 attributes #0 = { nounwind }