llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
   3
   ; f32 <- f16 WMMA with fneg applied to the whole A operand.
   ; Expected ISA: a single WMMA carrying neg_lo:[1,0,0] neg_hi:[1,0,0], with no
   ; separate negate instruction in front of it.
   4 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
   5 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
   6 ; GFX12:       ; %bb.0: ; %bb
   7 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
   8 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
   9 ; GFX12-NEXT:    s_endpgm
  10 bb:
  11   %fneg.A = fneg <4 x half> %A
  12   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C)
  13   store <4 x float> %res, ptr addrspace(1) %out
  14   ret void
  15 }
  16
  ; f32 <- f16 WMMA with fneg applied to the whole B operand.
  ; Expected ISA: a single WMMA carrying neg_lo:[0,1,0] neg_hi:[0,1,0], with no
  ; separate negate instruction in front of it.
  17 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
  18 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
  19 ; GFX12:       ; %bb.0: ; %bb
  20 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
  21 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
  22 ; GFX12-NEXT:    s_endpgm
  23 bb:
  24   %fneg.B = fneg <4 x half> %B
  25   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C)
  26   store <4 x float> %res, ptr addrspace(1) %out
  27   ret void
  28 }
  29
  ; f32 <- f16 WMMA with fneg applied to the whole f32 C operand.
  ; Expected ISA: a single WMMA carrying only neg_lo:[0,0,1] (no neg_hi), with
  ; no separate negate instruction in front of it.
  30 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
  31 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
  32 ; GFX12:       ; %bb.0: ; %bb
  33 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
  34 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
  35 ; GFX12-NEXT:    s_endpgm
  36 bb:
  37   %fneg.C = fneg <4 x float> %C
  38   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C)
  39   store <4 x float> %res, ptr addrspace(1) %out
  40   ret void
  41 }
  42
  ; f32 <- f16 WMMA with llvm.fabs applied to the whole f32 C operand.
  ; Expected ISA: a single WMMA carrying only neg_hi:[0,0,1] (no neg_lo), with
  ; no separate instruction computing the absolute value.
  43 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
  44 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
  45 ; GFX12:       ; %bb.0: ; %bb
  46 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
  47 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
  48 ; GFX12-NEXT:    s_endpgm
  49 bb:
  50   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  51   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C)
  52   store <4 x float> %res, ptr addrspace(1) %out
  53   ret void
  54 }
  55
  ; bf16 WMMA variant (A and B are passed as <4 x i16>) with fneg applied to the
  ; whole f32 C operand.
  ; Expected ISA: a single WMMA carrying only neg_lo:[0,0,1].
  56 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
  57 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
  58 ; GFX12:       ; %bb.0: ; %bb
  59 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
  60 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
  61 ; GFX12-NEXT:    s_endpgm
  62 bb:
  63   %fneg.C = fneg <4 x float> %C
  64   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C)
  65   store <4 x float> %res, ptr addrspace(1) %out
  66   ret void
  67 }
  68
  ; bf16 WMMA variant (A and B are passed as <4 x i16>) with llvm.fabs applied
  ; to the whole f32 C operand.
  ; Expected ISA: a single WMMA carrying only neg_hi:[0,0,1].
  69 define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
  70 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
  71 ; GFX12:       ; %bb.0: ; %bb
  72 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
  73 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
  74 ; GFX12-NEXT:    s_endpgm
  75 bb:
  76   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  77   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C)
  78   store <4 x float> %res, ptr addrspace(1) %out
  79   ret void
  80 }
  81
  ; f16-result WMMA (C and the result are <4 x half>) with fneg applied to the
  ; whole A operand; the trailing immediate operand of the intrinsic is i1 0.
  ; Expected ISA: a single WMMA carrying neg_lo:[1,0,0] neg_hi:[1,0,0].
  82 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
  83 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
  84 ; GFX12:       ; %bb.0: ; %bb
  85 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
  86 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
  87 ; GFX12-NEXT:    s_endpgm
  88 bb:
  89   %fneg.A = fneg <4 x half> %A
  90   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0)
  91   store <4 x half> %res, ptr addrspace(1) %out
  92   ret void
  93 }
  94
  ; f16-result WMMA (C and the result are <4 x half>) with fneg applied to the
  ; whole B operand; the trailing immediate operand of the intrinsic is i1 0.
  ; Expected ISA: a single WMMA carrying neg_lo:[0,1,0] neg_hi:[0,1,0].
  95 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
  96 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
  97 ; GFX12:       ; %bb.0: ; %bb
  98 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
  99 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
 100 ; GFX12-NEXT:    s_endpgm
 101 bb:
 102   %fneg.B = fneg <4 x half> %B
 103   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0)
 104   store <4 x half> %res, ptr addrspace(1) %out
 105   ret void
 106 }
 107
 ; f16-result WMMA with fneg applied to the whole <4 x half> C operand.
 ; Expected ISA: a single WMMA carrying only neg_lo:[0,0,1] (no neg_hi), with
 ; no separate negate instruction in front of it.
 108 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
 109 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
 110 ; GFX12:       ; %bb.0: ; %bb
 111 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
 112 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
 113 ; GFX12-NEXT:    s_endpgm
 114 bb:
 115   %fneg.C = fneg <4 x half> %C
 116   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0)
 117   store <4 x half> %res, ptr addrspace(1) %out
 118   ret void
 119 }
 120
 ; f16-result WMMA with llvm.fabs applied to the whole <4 x half> C operand.
 ; Expected ISA: a single WMMA carrying only neg_hi:[0,0,1] (no neg_lo), with
 ; no separate instruction computing the absolute value.
 121 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
 122 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
 123 ; GFX12:       ; %bb.0: ; %bb
 124 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
 125 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
 126 ; GFX12-NEXT:    s_endpgm
 127 bb:
 128   %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
 129   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0)
 130   store <4 x half> %res, ptr addrspace(1) %out
 131   ret void
 132 }
 133
 ; fp8 x fp8 WMMA variant (A and B are each a single i32) with fneg applied to
 ; the whole f32 C operand.
 ; Expected ISA: a single WMMA carrying only neg_lo:[0,0,1].
 134 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 135 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
 136 ; GFX12:       ; %bb.0: ; %bb
 137 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 138 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
 139 ; GFX12-NEXT:    s_endpgm
 140 bb:
 141   %fneg.C = fneg <4 x float> %C
 142   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
 143   store <4 x float> %res, ptr addrspace(1) %out
 144   ret void
 145 }
 146
 ; fp8 x fp8 WMMA variant (A and B are each a single i32) with llvm.fabs
 ; applied to the whole f32 C operand.
 ; Expected ISA: a single WMMA carrying only neg_hi:[0,0,1].
 147 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 148 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
 149 ; GFX12:       ; %bb.0: ; %bb
 150 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 151 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
 152 ; GFX12-NEXT:    s_endpgm
 153 bb:
 154   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
 155   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
 156   store <4 x float> %res, ptr addrspace(1) %out
 157   ret void
 158 }
 159
 ; bf8 x fp8 WMMA variant (A and B are each a single i32) with fneg applied to
 ; the whole f32 C operand.
 ; Expected ISA: a single WMMA carrying only neg_lo:[0,0,1].
 160 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 161 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
 162 ; GFX12:       ; %bb.0: ; %bb
 163 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 164 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
 165 ; GFX12-NEXT:    s_endpgm
 166 bb:
 167   %fneg.C = fneg <4 x float> %C
 168   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
 169   store <4 x float> %res, ptr addrspace(1) %out
 170   ret void
 171 }
 172
 ; bf8 x fp8 WMMA variant (A and B are each a single i32) with llvm.fabs
 ; applied to the whole f32 C operand.
 ; Expected ISA: a single WMMA carrying only neg_hi:[0,0,1].
 173 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 174 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
 175 ; GFX12:       ; %bb.0: ; %bb
 176 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 177 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
 178 ; GFX12-NEXT:    s_endpgm
 179 bb:
 180   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
 181   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
 182   store <4 x float> %res, ptr addrspace(1) %out
 183   ret void
 184 }
 185
 ; fp8 x bf8 WMMA variant (A and B are each a single i32) with fneg applied to
 ; the whole f32 C operand.
 ; Expected ISA: a single WMMA carrying only neg_lo:[0,0,1].
 186 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 187 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
 188 ; GFX12:       ; %bb.0: ; %bb
 189 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 190 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
 191 ; GFX12-NEXT:    s_endpgm
 192 bb:
 193   %fneg.C = fneg <4 x float> %C
 194   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
 195   store <4 x float> %res, ptr addrspace(1) %out
 196   ret void
 197 }
 198
 ; fp8 x bf8 WMMA variant (A and B are each a single i32) with llvm.fabs
 ; applied to the whole f32 C operand.
 ; Expected ISA: a single WMMA carrying only neg_hi:[0,0,1].
 199 define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 200 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
 201 ; GFX12:       ; %bb.0: ; %bb
 202 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 203 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
 204 ; GFX12-NEXT:    s_endpgm
 205 bb:
 206   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
 207   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
 208   store <4 x float> %res, ptr addrspace(1) %out
 209   ret void
 210 }
 211
 ; bf8 x bf8 WMMA variant (A and B are each a single i32) with fneg applied to
 ; the whole f32 C operand.
 ; Expected ISA: a single WMMA carrying only neg_lo:[0,0,1].
 212 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 213 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
 214 ; GFX12:       ; %bb.0: ; %bb
 215 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
 216 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
 217 ; GFX12-NEXT:    s_endpgm
 218 bb:
 219   %fneg.C = fneg <4 x float> %C
 220   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
 221   store <4 x float> %res, ptr addrspace(1) %out
 222   ret void
 223 }
 224
 ; bf8 x bf8 WMMA variant (A and B are each a single i32) with llvm.fabs
 ; applied to the whole f32 C operand.
 ; Expected ISA: a single WMMA carrying only neg_hi:[0,0,1].
 225 define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 226 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
 227 ; GFX12:       ; %bb.0: ; %bb
 228 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
 229 ; GFX12-NEXT:    global_store_b128 v[6:7], v[2:5], off
 230 ; GFX12-NEXT:    s_endpgm
 231 bb:
 232   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
 233   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
 234   store <4 x float> %res, ptr addrspace(1) %out
 235   ret void
 236 }
 237
 ; f32-result SWMMAC intrinsic (A is <4 x half>, B is <8 x half>, plus an i16
 ; %Index operand passed through unchanged) with fneg applied to the whole A.
 ; Expected ISA: a single SWMMAC carrying neg_lo:[1,0,0] neg_hi:[1,0,0].
 238 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 239 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
 240 ; GFX12:       ; %bb.0: ; %bb
 241 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
 242 ; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
 243 ; GFX12-NEXT:    s_endpgm
 244 bb:
 245   %fneg.A = fneg <4 x half> %A
 246   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index)
 247   store <4 x float> %res, ptr addrspace(1) %out
 248   ret void
 249 }
 250
 ; f32-result SWMMAC intrinsic (A is <4 x half>, B is <8 x half>, plus an i16
 ; %Index operand passed through unchanged) with fneg applied to the whole B.
 ; Expected ISA: a single SWMMAC carrying neg_lo:[0,1,0] neg_hi:[0,1,0].
 251 define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 252 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
 253 ; GFX12:       ; %bb.0: ; %bb
 254 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
 255 ; GFX12-NEXT:    global_store_b128 v[11:12], v[6:9], off
 256 ; GFX12-NEXT:    s_endpgm
 257 bb:
 258   %fneg.B = fneg <8 x half> %B
 259   %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index)
 260   store <4 x float> %res, ptr addrspace(1) %out
 261   ret void
 262 }
 263
 ; f16-result SWMMAC intrinsic (C and the result are <4 x half>) with fneg
 ; applied to the whole A operand; %Index is passed through unchanged.
 ; Expected ISA: a single SWMMAC carrying neg_lo:[1,0,0] neg_hi:[1,0,0].
 264 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 265 ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
 266 ; GFX12:       ; %bb.0: ; %bb
 267 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
 268 ; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
 269 ; GFX12-NEXT:    s_endpgm
 270 bb:
 271   %fneg.A = fneg <4 x half> %A
 272   %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index)
 273   store <4 x half> %res, ptr addrspace(1) %out
 274   ret void
 275 }
 276
 ; f16-result SWMMAC intrinsic (C and the result are <4 x half>) with fneg
 ; applied to the whole <8 x half> B operand; %Index is passed through unchanged.
 ; Expected ISA: a single SWMMAC carrying neg_lo:[0,1,0] neg_hi:[0,1,0].
 277 define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 278 ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
 279 ; GFX12:       ; %bb.0: ; %bb
 280 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
 281 ; GFX12-NEXT:    global_store_b64 v[9:10], v[6:7], off
 282 ; GFX12-NEXT:    s_endpgm
 283 bb:
 284   %fneg.B = fneg <8 x half> %B
 285   %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)
 286   store <4 x half> %res, ptr addrspace(1) %out
 287   ret void
 288 }
 289
 290 ; Both neg and abs patterns (WMMA matrix C, f32 or f16)
 291
 ; f32 <- f16 WMMA whose C operand is fneg(fabs(C)) over the whole vector.
 ; Expected ISA: a single WMMA carrying both neg_lo:[0,0,1] and neg_hi:[0,0,1],
 ; with no separate instructions for the fabs or the fneg.
 292 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 293 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
 294 ; GFX12:       ; %bb.0: ; %bb
 295 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
 296 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
 297 ; GFX12-NEXT:    s_endpgm
 298 bb:
 299   %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
 300   %fneg.fabs.C = fneg <4 x float> %fabs.C
 301   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C)
 302   store <4 x float> %res, ptr addrspace(1) %out
 303   ret void
 304 }
 305
 ; f16-result WMMA whose <4 x half> C operand is fneg(fabs(C)) over the whole
 ; vector.
 ; Expected ISA: a single WMMA carrying both neg_lo:[0,0,1] and neg_hi:[0,0,1],
 ; with no separate instructions for the fabs or the fneg.
 306 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
 307 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
 308 ; GFX12:       ; %bb.0: ; %bb
 309 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
 310 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
 311 ; GFX12-NEXT:    s_endpgm
 312 bb:
 313   %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
 314   %fneg.fabs.C = fneg <4 x half> %fabs.C
 315   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0)
 316   store <4 x half> %res, ptr addrspace(1) %out
 317   ret void
 318 }
 319
 ; fabs is applied to element 3 of C only, and the resulting vector is then
 ; negated as a whole before being passed as the C operand.
 ; Expected ISA: the per-element fabs stays a separate v_and_b32 with mask
 ; 0x7fffffff on v7, and the WMMA carries only neg_lo:[0,0,1].
 ; NOTE(review): the function name ends in "fabsA", but the operand that is
 ; modified below is C, not A.
 320 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 321 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
 322 ; GFX12:       ; %bb.0: ; %bb
 323 ; GFX12-NEXT:    v_and_b32_e32 v7, 0x7fffffff, v7
 324 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 325 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
 326 ; GFX12-NEXT:    global_store_b128 v[8:9], v[4:7], off
 327 ; GFX12-NEXT:    s_endpgm
 328 bb:
 329   %el3 = extractelement <4 x float> %C, i32 3
 330   %el3.fabs = call float @llvm.fabs.f32(float %el3)
 331   %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3
 332   %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C
 333   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C)
 334   store <4 x float> %res, ptr addrspace(1) %out
 335   ret void
 336 }
 337
 338 ; A or B matrix modifier and constant in C
 339
 ; fneg on A combined with a constant C: the C operand of the call is the
 ; literal vector of four 1.0 floats, so the %C argument is declared but unused.
 ; Expected ISA: the WMMA takes 1.0 as its third source, writes a separate
 ; result range v[6:9], and carries neg_lo:[1,0,0] neg_hi:[1,0,0].
 340 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 341 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
 342 ; GFX12:       ; %bb.0: ; %bb
 343 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
 344 ; GFX12-NEXT:    global_store_b128 v[4:5], v[6:9], off
 345 ; GFX12-NEXT:    s_endpgm
 346 bb:
 347   %fneg.A = fneg <4 x half> %A
 348   %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
 349   store <4 x float> %res, ptr addrspace(1) %out
 350   ret void
 351 }
 352
 ; fneg on B combined with a constant C: the C operand of the call is the
 ; literal vector of four 1.0 halfs, so the %C argument is declared but unused.
 ; Expected ISA: the WMMA takes 1.0 as its third source, writes a separate
 ; result range v[6:7], and carries neg_lo:[0,1,0] neg_hi:[0,1,0].
 353 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
 354 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
 355 ; GFX12:       ; %bb.0: ; %bb
 356 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
 357 ; GFX12-NEXT:    global_store_b64 v[4:5], v[6:7], off
 358 ; GFX12-NEXT:    s_endpgm
 359 bb:
 360   %fneg.B = fneg <4 x half> %B
 361   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
 362   store <4 x half> %res, ptr addrspace(1) %out
 363   ret void
 364 }
 365
 366 ; Pack f16 elements with v_perm_b32 since they don't come from the same b32
 367
 ; C is loaded from memory as <8 x half>; only elements 0, 2, 4 and 6 are kept
 ; by the shuffle, and the resulting <4 x half> is negated before the call.
 ; Expected ISA: one flat_load_b128, two v_perm_b32 (selector 0x5040100) that
 ; repack the selected halves into v[4:5], then a WMMA carrying neg_lo:[0,0,1].
 ; The second shufflevector operand is never selected by the mask, so it is
 ; spelled poison instead of the deprecated undef.
 368 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
 369 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
 370 ; GFX12:       ; %bb.0: ; %bb
 371 ; GFX12-NEXT:    flat_load_b128 v[8:11], v[4:5]
 372 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
 373 ; GFX12-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
 374 ; GFX12-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
 375 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 376 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
 377 ; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
 378 ; GFX12-NEXT:    s_endpgm
 379 bb:
 380   %C = load <8 x half>, ptr %Caddr
 381   %C_shuffle = shufflevector <8 x half> %C, <8 x half> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 382   %fneg.C_shuffle = fneg <4 x half> %C_shuffle
 383   %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle, i1 0)
 384   store <4 x half> %res, ptr addrspace(1) %out
 385   ret void
 386 }
 387
 ; Declarations of the generic llvm.fabs intrinsics called by the fabs-based
 ; tests above (vector f16, vector f32 and scalar f32 forms).
 388 declare <4 x half> @llvm.fabs.v4f16(<4 x half>)
 389 declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
 390 declare float @llvm.fabs.f32(float)
 391
 ; Declarations of the AMDGPU WMMA and SWMMAC intrinsics called by the tests
 ; above.
 ; NOTE(review): the f16-result WMMA name carries a ".v8f16.v8f16" suffix while
 ; its signature here uses <4 x half>; presumably the IR reader remangles the
 ; name to match the types - confirm.
 392 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half>, <4 x half>, <4 x float>)
 393 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16>, <4 x i16>, <4 x float>)
 394 declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
 395 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32, i32, <4 x float>)
 396 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32, i32, <4 x float>)
 397 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32, i32, <4 x float>)
 398 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32, i32, <4 x float>)
 399 declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half>, <8 x half>, <4 x float>, i16)
 400 declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half>, <8 x half>, <4 x half>, i16)