; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
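
; Divergent (or (or x, y), z) with a trailing not: the two ors should fuse into v_or3_b32, while the not stays a separate v_not_b32_e32.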
define amdgpu_kernel void @divergent_or3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_or3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_or3_b32 v0, v1, v0, v2
; GCN-NEXT:    v_not_b32_e32 v0, v0
; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %arg, i64 %i1
  %i3 = load <3 x i32>, ptr addrspace(1) %i2, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = or i32 %i5, %i4
  %i8 = or i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  store i32 %i9, ptr addrspace(1) %i2, align 16
  ret void
}
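
; The 64-bit divergent or3 is expected to split into two 32-bit v_or3_b32 halves, each inverted by its own v_not_b32_e32.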
define amdgpu_kernel void @divergent_or3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_or3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
; GCN-NEXT:    global_load_dwordx4 v[0:3], v6, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_or3_b32 v1, v3, v1, v5
; GCN-NEXT:    v_or3_b32 v0, v2, v0, v4
; GCN-NEXT:    v_not_b32_e32 v1, v1
; GCN-NEXT:    v_not_b32_e32 v0, v0
; GCN-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i64>, ptr addrspace(1) %arg, i64 %i1
  %i3 = load <3 x i64>, ptr addrspace(1) %i2, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = or i64 %i5, %i4
  %i8 = or i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  store i64 %i9, ptr addrspace(1) %i2, align 32
  ret void
}
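
; Divergent and3 has no fused VALU form, so two v_and_b32_e32 plus a v_not_b32_e32 are expected.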
define amdgpu_kernel void @divergent_and3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_and3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, v1, v0
; GCN-NEXT:    v_and_b32_e32 v0, v0, v2
; GCN-NEXT:    v_not_b32_e32 v0, v0
; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %arg, i64 %i1
  %i3 = load <3 x i32>, ptr addrspace(1) %i2, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = and i32 %i5, %i4
  %i8 = and i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  store i32 %i9, ptr addrspace(1) %i2, align 16
  ret void
}
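
; 64-bit divergent and3: per-half v_and_b32_e32 chains, each finished with v_not_b32_e32.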
define amdgpu_kernel void @divergent_and3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_and3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx4 v[0:3], v6, s[0:1]
; GCN-NEXT:    global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
; GCN-NEXT:    s_waitcnt vmcnt(1)
; GCN-NEXT:    v_and_b32_e32 v1, v3, v1
; GCN-NEXT:    v_and_b32_e32 v0, v2, v0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v1, v1, v5
; GCN-NEXT:    v_and_b32_e32 v0, v0, v4
; GCN-NEXT:    v_not_b32_e32 v1, v1
; GCN-NEXT:    v_not_b32_e32 v0, v0
; GCN-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i64>, ptr addrspace(1) %arg, i64 %i1
  %i3 = load <3 x i64>, ptr addrspace(1) %i2, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = and i64 %i5, %i4
  %i8 = and i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  store i64 %i9, ptr addrspace(1) %i2, align 32
  ret void
}
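
; Divergent xor3: the trailing not should fold into the last operation, giving v_xor_b32_e32 followed by v_xnor_b32_e32.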
define amdgpu_kernel void @divergent_xor3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_xor3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v0, v1, v0
; GCN-NEXT:    v_xnor_b32_e32 v0, v0, v2
; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %arg, i64 %i1
  %i3 = load <3 x i32>, ptr addrspace(1) %i2, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = xor i32 %i5, %i4
  %i8 = xor i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  store i32 %i9, ptr addrspace(1) %i2, align 16
  ret void
}
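
; 64-bit divergent xor3: each half folds its not into v_xnor_b32_e32.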
define amdgpu_kernel void @divergent_xor3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_xor3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx4 v[0:3], v6, s[0:1]
; GCN-NEXT:    global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
; GCN-NEXT:    s_waitcnt vmcnt(1)
; GCN-NEXT:    v_xor_b32_e32 v1, v3, v1
; GCN-NEXT:    v_xor_b32_e32 v0, v2, v0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_xnor_b32_e32 v1, v1, v5
; GCN-NEXT:    v_xnor_b32_e32 v0, v0, v4
; GCN-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i64>, ptr addrspace(1) %arg, i64 %i1
  %i3 = load <3 x i64>, ptr addrspace(1) %i2, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = xor i64 %i5, %i4
  %i8 = xor i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  store i64 %i9, ptr addrspace(1) %i2, align 32
  ret void
}
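
; Uniform operands keep the computation on the SALU: s_or_b32 followed by s_nor_b32, which absorbs the not.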
define amdgpu_kernel void @uniform_or3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_or3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_or_b32 s0, s1, s0
; GCN-NEXT:    s_nor_b32 s0, s0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s0
; GCN-NEXT:    global_store_dword v0, v1, s[4:5]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i32>, ptr addrspace(1) %arg, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = or i32 %i5, %i4
  %i8 = or i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  store i32 %i9, ptr addrspace(1) %arg, align 16
  ret void
}
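
; 64-bit uniform or3 uses the full-width s_or_b64/s_nor_b64 pair.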
define amdgpu_kernel void @uniform_or3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_or3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT:    s_nor_b64 s[0:1], s[0:1], s[6:7]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i64>, ptr addrspace(1) %arg, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = or i64 %i5, %i4
  %i8 = or i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  store i64 %i9, ptr addrspace(1) %arg, align 32
  ret void
}
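
; Uniform and3: s_and_b32 followed by s_nand_b32, which absorbs the not.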
define amdgpu_kernel void @uniform_and3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_and3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_and_b32 s0, s1, s0
; GCN-NEXT:    s_nand_b32 s0, s0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s0
; GCN-NEXT:    global_store_dword v0, v1, s[4:5]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i32>, ptr addrspace(1) %arg, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = and i32 %i5, %i4
  %i8 = and i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  store i32 %i9, ptr addrspace(1) %arg, align 16
  ret void
}
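
; 64-bit uniform and3: s_and_b64 followed by s_nand_b64.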
define amdgpu_kernel void @uniform_and3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_and3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT:    s_nand_b64 s[0:1], s[0:1], s[6:7]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i64>, ptr addrspace(1) %arg, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = and i64 %i5, %i4
  %i8 = and i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  store i64 %i9, ptr addrspace(1) %arg, align 32
  ret void
}
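
; Uniform xor3: s_xor_b32 followed by s_xnor_b32.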
define amdgpu_kernel void @uniform_xor3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_xor3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_xor_b32 s0, s1, s0
; GCN-NEXT:    s_xnor_b32 s0, s0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s0
; GCN-NEXT:    global_store_dword v0, v1, s[4:5]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i32>, ptr addrspace(1) %arg, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = xor i32 %i5, %i4
  %i8 = xor i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  store i32 %i9, ptr addrspace(1) %arg, align 16
  ret void
}
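
; 64-bit uniform xor3: s_xor_b64 followed by s_xnor_b64.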
define amdgpu_kernel void @uniform_xor3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_xor3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT:    s_xnor_b64 s[0:1], s[0:1], s[6:7]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i64>, ptr addrspace(1) %arg, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = xor i64 %i5, %i4
  %i8 = xor i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  store i64 %i9, ptr addrspace(1) %arg, align 32
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()