llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
   3
   4 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
   5 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
   6 ; GFX12:       ; %bb.0: ; %bb
   7 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
   8 ; GFX12-NEXT:    s_clause 0x1
   9 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
  10 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
  11 ; GFX12-NEXT:    s_endpgm
     ; fneg of the f16 A matrix feeds the f32-accumulate WMMA. The assertions
     ; above expect it as neg_lo/neg_hi on operand 0, with no separate negate
     ; instruction before the WMMA.
  12 bb:
  13   %fneg.A = fneg <8 x half> %A
  14   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
  15   store <8 x float> %res, ptr addrspace(1) %out
  16   ret void
  17 }
  18
  19 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
  20 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
  21 ; GFX12:       ; %bb.0: ; %bb
  22 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
  23 ; GFX12-NEXT:    s_clause 0x1
  24 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
  25 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
  26 ; GFX12-NEXT:    s_endpgm
     ; Same as the negA case but the fneg is on the f16 B matrix; the expected
     ; modifiers move to operand 1 (neg_lo/neg_hi:[0,1,0]).
  27 bb:
  28   %fneg.B = fneg <8 x half> %B
  29   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
  30   store <8 x float> %res, ptr addrspace(1) %out
  31   ret void
  32 }
  33
  34 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
  35 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
  36 ; GFX12:       ; %bb.0: ; %bb
  37 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
  38 ; GFX12-NEXT:    s_clause 0x1
  39 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
  40 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
  41 ; GFX12-NEXT:    s_endpgm
     ; fneg of the f32 C accumulator. Unlike the A/B cases, the assertions
     ; expect only neg_lo:[0,0,1] (no neg_hi) on the WMMA.
  42 bb:
  43   %fneg.C = fneg <8 x float> %C
  44   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
  45   store <8 x float> %res, ptr addrspace(1) %out
  46   ret void
  47 }
  48
  49 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
  50 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
  51 ; GFX12:       ; %bb.0: ; %bb
  52 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
  53 ; GFX12-NEXT:    s_clause 0x1
  54 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
  55 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
  56 ; GFX12-NEXT:    s_endpgm
     ; llvm.fabs of the f32 C accumulator. The assertions expect it to be
     ; expressed as neg_hi:[0,0,1] on the WMMA, with no separate abs/and.
  57 bb:
  58   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  59   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
  60   store <8 x float> %res, ptr addrspace(1) %out
  61   ret void
  62 }
  63
  64 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
  65 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
  66 ; GFX12:       ; %bb.0: ; %bb
  67 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
  68 ; GFX12-NEXT:    s_clause 0x1
  69 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
  70 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
  71 ; GFX12-NEXT:    s_endpgm
     ; bf16 variant (A/B passed as <8 x i16>): fneg of the f32 C accumulator is
     ; expected as neg_lo:[0,0,1] on v_wmma_f32_16x16x16_bf16.
  72 bb:
  73   %fneg.C = fneg <8 x float> %C
  74   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
  75   store <8 x float> %res, ptr addrspace(1) %out
  76   ret void
  77 }
  78
  79 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
  80 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
  81 ; GFX12:       ; %bb.0: ; %bb
  82 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
  83 ; GFX12-NEXT:    s_clause 0x1
  84 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
  85 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
  86 ; GFX12-NEXT:    s_endpgm
     ; bf16 variant: llvm.fabs of the f32 C accumulator is expected as
     ; neg_hi:[0,0,1] on v_wmma_f32_16x16x16_bf16.
  87 bb:
  88   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  89   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
  90   store <8 x float> %res, ptr addrspace(1) %out
  91   ret void
  92 }
  93
  94 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
  95 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
  96 ; GFX12:       ; %bb.0: ; %bb
  97 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
  98 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
  99 ; GFX12-NEXT:    s_endpgm
     ; f16-result WMMA (C and the result are <8 x half>, so a single b128 store
     ; is expected). fneg of A is expected as neg_lo/neg_hi:[1,0,0].
 100 bb:
 101   %fneg.A = fneg <8 x half> %A
 102   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
 103   store <8 x half> %res, ptr addrspace(1) %out
 104   ret void
 105 }
 106
 107 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 108 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
 109 ; GFX12:       ; %bb.0: ; %bb
 110 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
 111 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
 112 ; GFX12-NEXT:    s_endpgm
     ; f16-result WMMA with fneg on the B matrix; expected as
     ; neg_lo/neg_hi:[0,1,0].
 113 bb:
 114   %fneg.B = fneg <8 x half> %B
 115   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
 116   store <8 x half> %res, ptr addrspace(1) %out
 117   ret void
 118 }
 119
 120 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 121 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
 122 ; GFX12:       ; %bb.0: ; %bb
 123 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
 124 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
 125 ; GFX12-NEXT:    s_endpgm
     ; f16-result WMMA with fneg on the f16 C accumulator; expected as
     ; neg_lo:[0,0,1] only.
 126 bb:
 127   %fneg.C = fneg <8 x half> %C
 128   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
 129   store <8 x half> %res, ptr addrspace(1) %out
 130   ret void
 131 }
 132
 133 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 134 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
 135 ; GFX12:       ; %bb.0: ; %bb
 136 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
 137 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
 138 ; GFX12-NEXT:    s_endpgm
     ; f16-result WMMA with llvm.fabs on the f16 C accumulator; expected as
     ; neg_hi:[0,0,1] only.
 139 bb:
 140   %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
 141   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
 142   store <8 x half> %res, ptr addrspace(1) %out
 143   ret void
 144 }
 145
 146 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 147 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
 148 ; GFX12:       ; %bb.0: ; %bb
 149 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
 150 ; GFX12-NEXT:    s_clause 0x1
 151 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 152 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 153 ; GFX12-NEXT:    s_endpgm
     ; fp8 x fp8 variant (A/B passed as <2 x i32>): fneg of the f32 C
     ; accumulator is expected as neg_lo:[0,0,1].
 154 bb:
 155   %fneg.C = fneg <8 x float> %C
 156   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
 157   store <8 x float> %res, ptr addrspace(1) %out
 158   ret void
 159 }
 160
 161 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 162 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
 163 ; GFX12:       ; %bb.0: ; %bb
 164 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
 165 ; GFX12-NEXT:    s_clause 0x1
 166 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 167 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 168 ; GFX12-NEXT:    s_endpgm
     ; fp8 x fp8 variant: llvm.fabs of the f32 C accumulator is expected as
     ; neg_hi:[0,0,1].
 169 bb:
 170   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
 171   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
 172   store <8 x float> %res, ptr addrspace(1) %out
 173   ret void
 174 }
 175
 176 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 177 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
 178 ; GFX12:       ; %bb.0: ; %bb
 179 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
 180 ; GFX12-NEXT:    s_clause 0x1
 181 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 182 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 183 ; GFX12-NEXT:    s_endpgm
     ; bf8 x fp8 variant: fneg of the f32 C accumulator is expected as
     ; neg_lo:[0,0,1].
 184 bb:
 185   %fneg.C = fneg <8 x float> %C
 186   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
 187   store <8 x float> %res, ptr addrspace(1) %out
 188   ret void
 189 }
 190
 191 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 192 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
 193 ; GFX12:       ; %bb.0: ; %bb
 194 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
 195 ; GFX12-NEXT:    s_clause 0x1
 196 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 197 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 198 ; GFX12-NEXT:    s_endpgm
     ; bf8 x fp8 variant: llvm.fabs of the f32 C accumulator is expected as
     ; neg_hi:[0,0,1].
 199 bb:
 200   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
 201   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
 202   store <8 x float> %res, ptr addrspace(1) %out
 203   ret void
 204 }
 205
 206 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 207 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
 208 ; GFX12:       ; %bb.0: ; %bb
 209 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
 210 ; GFX12-NEXT:    s_clause 0x1
 211 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 212 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 213 ; GFX12-NEXT:    s_endpgm
     ; fp8 x bf8 variant: fneg of the f32 C accumulator is expected as
     ; neg_lo:[0,0,1].
 214 bb:
 215   %fneg.C = fneg <8 x float> %C
 216   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
 217   store <8 x float> %res, ptr addrspace(1) %out
 218   ret void
 219 }
 220
 221 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 222 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
 223 ; GFX12:       ; %bb.0: ; %bb
 224 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
 225 ; GFX12-NEXT:    s_clause 0x1
 226 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 227 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 228 ; GFX12-NEXT:    s_endpgm
     ; fp8 x bf8 variant: llvm.fabs of the f32 C accumulator is expected as
     ; neg_hi:[0,0,1].
 229 bb:
 230   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
 231   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
 232   store <8 x float> %res, ptr addrspace(1) %out
 233   ret void
 234 }
 235
 236 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 237 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
 238 ; GFX12:       ; %bb.0: ; %bb
 239 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
 240 ; GFX12-NEXT:    s_clause 0x1
 241 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 242 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 243 ; GFX12-NEXT:    s_endpgm
     ; bf8 x bf8 variant: fneg of the f32 C accumulator is expected as
     ; neg_lo:[0,0,1].
 244 bb:
 245   %fneg.C = fneg <8 x float> %C
 246   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
 247   store <8 x float> %res, ptr addrspace(1) %out
 248   ret void
 249 }
 250
 251 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 252 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
 253 ; GFX12:       ; %bb.0: ; %bb
 254 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
 255 ; GFX12-NEXT:    s_clause 0x1
 256 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
 257 ; GFX12-NEXT:    global_store_b128 v[12:13], v[4:7], off
 258 ; GFX12-NEXT:    s_endpgm
     ; bf8 x bf8 variant: llvm.fabs of the f32 C accumulator is expected as
     ; neg_hi:[0,0,1].
 259 bb:
 260   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
 261   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
 262   store <8 x float> %res, ptr addrspace(1) %out
 263   ret void
 264 }
 265
 266 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 267 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
 268 ; GFX12:       ; %bb.0: ; %bb
 269 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
 270 ; GFX12-NEXT:    s_clause 0x1
 271 ; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
 272 ; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
 273 ; GFX12-NEXT:    s_endpgm
     ; SWMMAC form: takes an extra i16 %Index operand (v20 in the expected
     ; output). fneg of A is expected as neg_lo/neg_hi:[1,0,0].
 274 bb:
 275   %fneg.A = fneg <8 x half> %A
 276   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
 277   store <8 x float> %res, ptr addrspace(1) %out
 278   ret void
 279 }
 280
 281 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 282 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
 283 ; GFX12:       ; %bb.0: ; %bb
 284 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
 285 ; GFX12-NEXT:    s_clause 0x1
 286 ; GFX12-NEXT:    global_store_b128 v[21:22], v[16:19], off offset:16
 287 ; GFX12-NEXT:    global_store_b128 v[21:22], v[12:15], off
 288 ; GFX12-NEXT:    s_endpgm
     ; SWMMAC with fneg on the <16 x half> B matrix; expected as
     ; neg_lo/neg_hi:[0,1,0].
 289 bb:
 290   %fneg.B = fneg <16 x half> %B
 291   %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
 292   store <8 x float> %res, ptr addrspace(1) %out
 293   ret void
 294 }
 295
 296 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 297 ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
 298 ; GFX12:       ; %bb.0: ; %bb
 299 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
 300 ; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
 301 ; GFX12-NEXT:    s_endpgm
     ; f16-result SWMMAC with fneg on A; expected as neg_lo/neg_hi:[1,0,0].
 302 bb:
 303   %fneg.A = fneg <8 x half> %A
 304   %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
 305   store <8 x half> %res, ptr addrspace(1) %out
 306   ret void
 307 }
 308
 309 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 310 ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
 311 ; GFX12:       ; %bb.0: ; %bb
 312 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
 313 ; GFX12-NEXT:    global_store_b128 v[17:18], v[12:15], off
 314 ; GFX12-NEXT:    s_endpgm
     ; f16-result SWMMAC with fneg on the <16 x half> B matrix; expected as
     ; neg_lo/neg_hi:[0,1,0].
 315 bb:
 316   %fneg.B = fneg <16 x half> %B
 317   %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
 318   store <8 x half> %res, ptr addrspace(1) %out
 319   ret void
 320 }
 321
 322 ; both neg and abs patterns (wmma matrix C f32 or f16)
 323
 324 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 325 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
 326 ; GFX12:       ; %bb.0: ; %bb
 327 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
 328 ; GFX12-NEXT:    s_clause 0x1
 329 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 330 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
 331 ; GFX12-NEXT:    s_endpgm
     ; fneg(fabs(C)) on the f32 accumulator: both modifiers are expected
     ; together on operand 2 (neg_lo:[0,0,1] and neg_hi:[0,0,1]).
 332 bb:
 333   %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
 334   %fneg.fabs.C = fneg <8 x float> %fabs.C
 335   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C)
 336   store <8 x float> %res, ptr addrspace(1) %out
 337   ret void
 338 }
 339
 340 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 341 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
 342 ; GFX12:       ; %bb.0: ; %bb
 343 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
 344 ; GFX12-NEXT:    global_store_b128 v[12:13], v[8:11], off
 345 ; GFX12-NEXT:    s_endpgm
     ; fneg(fabs(C)) on the f16 accumulator: both neg_lo:[0,0,1] and
     ; neg_hi:[0,0,1] are expected on the f16-result WMMA.
 346 bb:
 347   %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
 348   %fneg.fabs.C = fneg <8 x half> %fabs.C
 349   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0)
 350   store <8 x half> %res, ptr addrspace(1) %out
 351   ret void
 352 }
 353
 354 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 355 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
 356 ; GFX12:       ; %bb.0: ; %bb
 357 ; GFX12-NEXT:    v_and_b32_e32 v11, 0x7fffffff, v11
 358 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 359 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
 360 ; GFX12-NEXT:    s_clause 0x1
 361 ; GFX12-NEXT:    global_store_b128 v[16:17], v[12:15], off offset:16
 362 ; GFX12-NEXT:    global_store_b128 v[16:17], v[8:11], off
 363 ; GFX12-NEXT:    s_endpgm
     ; NOTE: despite the "A" suffix in the function name, the modifiers below
     ; are applied to the C accumulator, not to A.
     ; fabs is applied to element 3 of C only, then the whole vector is
     ; negated. The assertions expect the partial fabs to stay a separate
     ; v_and_b32 (sign-bit mask 0x7fffffff on v11) while only the whole-vector
     ; fneg becomes neg_lo:[0,0,1]; no neg_hi is expected.
 364 bb:
 365   %el3 = extractelement <8 x float> %C, i32 3
 366   %el3.fabs = call float @llvm.fabs.f32(float %el3)
 367   %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3
 368   %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C
 369   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C)
 370   store <8 x float> %res, ptr addrspace(1) %out
 371   ret void
 372 }
 373
 374 ; A or B matrix modifier and constant in C
 375
 376 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 377 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
 378 ; GFX12:       ; %bb.0: ; %bb
 379 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
 380 ; GFX12-NEXT:    s_clause 0x1
 381 ; GFX12-NEXT:    global_store_b128 v[8:9], v[14:17], off offset:16
 382 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
 383 ; GFX12-NEXT:    s_endpgm
     ; The %C parameter is unused; the accumulator passed to the intrinsic is
     ; a splat of float 1.0. The assertions expect the literal 1.0 as the C
     ; operand together with the A negate modifiers (neg_lo/neg_hi:[1,0,0]).
 384 bb:
 385   %fneg.A = fneg <8 x half> %A
 386   %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
 387   store <8 x float> %res, ptr addrspace(1) %out
 388   ret void
 389 }
 390
 391 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 392 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
 393 ; GFX12:       ; %bb.0: ; %bb
 394 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
 395 ; GFX12-NEXT:    global_store_b128 v[8:9], v[10:13], off
 396 ; GFX12-NEXT:    s_endpgm
     ; The %C parameter is unused; the accumulator passed to the intrinsic is
     ; a splat of half 1.0. The assertions expect the literal 1.0 as the C
     ; operand together with the B negate modifiers (neg_lo/neg_hi:[0,1,0]).
 397 bb:
 398   %fneg.B = fneg <8 x half> %B
 399   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
 400   store <8 x half> %res, ptr addrspace(1) %out
 401   ret void
 402 }
 403
 404 ; pack f16 elements with v_perm_b32 since they don't come from the same b32
 405
 406 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
 407 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
 408 ; GFX12:       ; %bb.0: ; %bb
 409 ; GFX12-NEXT:    s_clause 0x1
 410 ; GFX12-NEXT:    flat_load_b128 v[12:15], v[8:9] offset:16
 411 ; GFX12-NEXT:    flat_load_b128 v[16:19], v[8:9]
 412 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x101
 413 ; GFX12-NEXT:    v_perm_b32 v15, v15, v14, 0x5040100
 414 ; GFX12-NEXT:    v_perm_b32 v14, v13, v12, 0x5040100
 415 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 416 ; GFX12-NEXT:    v_perm_b32 v13, v19, v18, 0x5040100
 417 ; GFX12-NEXT:    v_perm_b32 v12, v17, v16, 0x5040100
 418 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 419 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
 420 ; GFX12-NEXT:    global_store_b128 v[10:11], v[12:15], off
 421 ; GFX12-NEXT:    s_endpgm
     ; C is built from the even-indexed halves of a <16 x half> loaded from
     ; %Caddr, so each pair of C elements comes from two different dwords. The
     ; assertions expect v_perm_b32 (selector 0x5040100) to pack them, and the
     ; fneg of the packed C to appear as neg_lo:[0,0,1] on the WMMA.
     ; The shuffle mask only selects lanes 0..14 of the first operand, so the
     ; second operand is never read; it is spelled poison rather than the
     ; deprecated undef.
 422 bb:
 423   %C = load <16 x half>, ptr %Caddr
 424   %C_shuffle = shufflevector <16 x half> %C, <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 425   %fneg.C_shuffle = fneg <8 x half> %C_shuffle
 426   %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle, i1 0)
 427   store <8 x half> %res, ptr addrspace(1) %out
 428   ret void
 429 }
 430
 431 declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
 432 declare <8 x float> @llvm.fabs.v8f32(<8 x float>)
 433 declare float @llvm.fabs.f32(float)
 434
 435 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half>, <8 x half>, <8 x float>)
 436 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16>, <8 x i16>, <8 x float>)
 437 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
 438 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>)
 439 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>)
 440 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>)
 441 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>)
 442 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half>, <16 x half>, <8 x float>, i16)
 443 declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)