1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -march=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs | FileCheck %s -check-prefix=GFX940
4 declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
5 declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
7 ; bf16 atomics use a v2i16 argument since there is no bf16 data type in LLVM.
8 declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
9 declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
10 declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1)
11 declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
; Kernel that calls the llvm.amdgcn.flat.atomic.fadd.f32 intrinsic on %ptr with
; %data. The gfx940 checks expect both kernel arguments to be loaded into
; SGPRs, copied to VGPRs, and a flat_atomic_add_f32 with no destination
; register.
13 define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
14 ; GFX940-LABEL: flat_atomic_fadd_f32_noret:
16 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
17 ; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
18 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
19 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
20 ; GFX940-NEXT: v_mov_b32_e32 v2, s4
21 ; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
22 ; GFX940-NEXT: s_endpgm
23 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
; Kernel that performs a seq_cst `atomicrmw fadd` of the constant 4.0 on a flat
; pointer (no intrinsic call). The gfx940 checks expect flat_atomic_add_f32
; with the sc1 modifier, preceded by buffer_wbl2 and followed by buffer_inv,
; each separated from the atomic by an s_waitcnt.
27 define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
28 ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
30 ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
31 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
32 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
33 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
34 ; GFX940-NEXT: buffer_wbl2 sc0 sc1
35 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
36 ; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
37 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
38 ; GFX940-NEXT: buffer_inv sc0 sc1
39 ; GFX940-NEXT: s_endpgm
40 %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
; Same seq_cst `atomicrmw fadd` of 4.0 on a flat pointer as
; @flat_atomic_fadd_f32_noret_pat, but the kernel carries attribute group #0
; ("denormal-fp-math-f32"="ieee,ieee"). The expected gfx940 instruction
; sequence is identical to the variant without that attribute.
44 define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
45 ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
47 ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
48 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
49 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
50 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
51 ; GFX940-NEXT: buffer_wbl2 sc0 sc1
52 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
53 ; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
54 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
55 ; GFX940-NEXT: buffer_inv sc0 sc1
56 ; GFX940-NEXT: s_endpgm
57 %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
; Non-kernel function with a float return type that calls the
; llvm.amdgcn.flat.atomic.fadd.f32 intrinsic. The gfx940 checks expect the
; returning form of flat_atomic_add_f32 (destination v0, sc0 modifier) and a
; wait on vmcnt/lgkmcnt before the s_setpc_b64 return.
61 define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) {
62 ; GFX940-LABEL: flat_atomic_fadd_f32_rtn:
64 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
65 ; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
66 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
67 ; GFX940-NEXT: s_setpc_b64 s[30:31]
68 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
; Non-kernel function with a float return type that performs a seq_cst
; `atomicrmw fadd` of the constant 4.0 on a flat pointer; the %data parameter
; is not referenced by the atomicrmw. The gfx940 checks expect the returning
; flat_atomic_add_f32 with both sc0 and sc1, bracketed by buffer_wbl2 and
; buffer_inv.
72 define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
73 ; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat:
75 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
76 ; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
77 ; GFX940-NEXT: buffer_wbl2 sc0 sc1
78 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
79 ; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
80 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
81 ; GFX940-NEXT: buffer_inv sc0 sc1
82 ; GFX940-NEXT: s_setpc_b64 s[30:31]
83 %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
; Kernel that calls the llvm.amdgcn.flat.atomic.fadd.v2f16 intrinsic on %ptr
; with a <2 x half> operand. The gfx940 checks expect flat_atomic_pk_add_f16
; with no destination register.
87 define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) {
88 ; GFX940-LABEL: flat_atomic_fadd_v2f16_noret:
90 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
91 ; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
92 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
93 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
94 ; GFX940-NEXT: v_mov_b32_e32 v2, s4
95 ; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
96 ; GFX940-NEXT: s_endpgm
97 %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
; Non-kernel function with a <2 x half> return type that calls the
; llvm.amdgcn.flat.atomic.fadd.v2f16 intrinsic. The gfx940 checks expect the
; returning form of flat_atomic_pk_add_f16 (destination v0, sc0 modifier).
101 define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
102 ; GFX940-LABEL: flat_atomic_fadd_v2f16_rtn:
104 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105 ; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
106 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
107 ; GFX940-NEXT: s_setpc_b64 s[30:31]
108 %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
; Kernel that calls the llvm.amdgcn.flat.atomic.fadd.v2bf16 intrinsic on %ptr;
; the bf16 pair is passed as <2 x i16>. The gfx940 checks expect
; flat_atomic_pk_add_bf16 with no destination register.
112 define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) {
113 ; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret:
115 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
116 ; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
117 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
118 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
119 ; GFX940-NEXT: v_mov_b32_e32 v2, s4
120 ; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
121 ; GFX940-NEXT: s_endpgm
122 %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
; Non-kernel function with a <2 x i16> return type that calls the
; llvm.amdgcn.flat.atomic.fadd.v2bf16 intrinsic. The gfx940 checks expect the
; returning form of flat_atomic_pk_add_bf16 (destination v0, sc0 modifier).
126 define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
127 ; GFX940-LABEL: flat_atomic_fadd_v2bf16_rtn:
129 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
130 ; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
131 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
132 ; GFX940-NEXT: s_setpc_b64 s[30:31]
133 %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
; Kernel that calls the llvm.amdgcn.global.atomic.fadd.v2bf16 intrinsic on an
; addrspace(1) pointer. The gfx940 checks expect global_atomic_pk_add_bf16
; using the SGPR pair s[2:3] as the address operand together with a VGPR that
; was set to 0, and no destination register.
137 define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) {
138 ; GFX940-LABEL: global_atomic_fadd_v2bf16_noret:
140 ; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
141 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
142 ; GFX940-NEXT: v_mov_b32_e32 v1, 0
143 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
144 ; GFX940-NEXT: v_mov_b32_e32 v0, s4
145 ; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[2:3]
146 ; GFX940-NEXT: s_endpgm
147 %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
; Non-kernel function with a <2 x i16> return type that calls the
; llvm.amdgcn.global.atomic.fadd.v2bf16 intrinsic on an addrspace(1) pointer.
; The gfx940 checks expect the returning global_atomic_pk_add_bf16 with a VGPR
; pair address, `off`, and sc0, followed by a wait on vmcnt only.
151 define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) {
152 ; GFX940-LABEL: global_atomic_fadd_v2bf16_rtn:
154 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
155 ; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0
156 ; GFX940-NEXT: s_waitcnt vmcnt(0)
157 ; GFX940-NEXT: s_setpc_b64 s[30:31]
158 %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
; Kernel that calls the llvm.amdgcn.ds.fadd.v2f16 intrinsic on an addrspace(3)
; pointer, passing 0 for the three trailing i32/i32/i1 operands. The gfx940
; checks expect a bare ds_pk_add_f16 with no destination register.
162 define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
163 ; GFX940-LABEL: local_atomic_fadd_v2f16_noret:
165 ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
166 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
167 ; GFX940-NEXT: v_mov_b32_e32 v0, s0
168 ; GFX940-NEXT: v_mov_b32_e32 v1, s1
169 ; GFX940-NEXT: ds_pk_add_f16 v0, v1
170 ; GFX940-NEXT: s_endpgm
171 %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
; Non-kernel function with a <2 x half> return type that calls the
; llvm.amdgcn.ds.fadd.v2f16 intrinsic on an addrspace(3) pointer, passing 0 for
; the three trailing operands. The gfx940 checks expect ds_pk_add_rtn_f16
; followed by a wait on lgkmcnt only.
175 define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) {
176 ; GFX940-LABEL: local_atomic_fadd_v2f16_rtn:
178 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
179 ; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
180 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
181 ; GFX940-NEXT: s_setpc_b64 s[30:31]
182 %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
; Kernel that calls the llvm.amdgcn.ds.fadd.v2bf16 intrinsic on an addrspace(3)
; pointer; this intrinsic takes only the pointer and a <2 x i16> operand.
; Unlike the ds v2f16 test in this file, the gfx940 checks expect
; ds_pk_add_bf16 to be bracketed by buffer_wbl2 and buffer_inv with waits on
; vmcnt/lgkmcnt.
186 define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
187 ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret:
189 ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
190 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
191 ; GFX940-NEXT: v_mov_b32_e32 v0, s1
192 ; GFX940-NEXT: v_mov_b32_e32 v1, s0
193 ; GFX940-NEXT: buffer_wbl2 sc0 sc1
194 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
195 ; GFX940-NEXT: ds_pk_add_bf16 v1, v0
196 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
197 ; GFX940-NEXT: buffer_inv sc0 sc1
198 ; GFX940-NEXT: s_endpgm
199 %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
; Non-kernel function with a <2 x i16> return type that calls the
; llvm.amdgcn.ds.fadd.v2bf16 intrinsic on an addrspace(3) pointer. The gfx940
; checks expect ds_pk_add_rtn_bf16 bracketed by buffer_wbl2 and buffer_inv,
; with waits on vmcnt/lgkmcnt around the atomic.
203 define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) {
204 ; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn:
206 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
207 ; GFX940-NEXT: buffer_wbl2 sc0 sc1
208 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
209 ; GFX940-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
210 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
211 ; GFX940-NEXT: buffer_inv sc0 sc1
212 ; GFX940-NEXT: s_setpc_b64 s[30:31]
213 %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
; Attribute group #0 is applied to @flat_atomic_fadd_f32_noret_pat_ieee so that
; the f32 atomicrmw fadd pattern is also tested with the f32 denormal mode set
; to "ieee,ieee".
217 attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" }