llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll

   1 ; RUN: opt -mtriple amdgcn-- -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
   2
   3 ; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
   4 define amdgpu_kernel void @ds_swizzle(ptr addrspace(1) %out, i32 %src) #0 {
   5   %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
   6   store i32 %swizzle, ptr addrspace(1) %out, align 4
   7   ret void
   8 }
   9
  10 ; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
  11 define amdgpu_kernel void @v_permlane16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
  12   %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
  13   store i32 %v, ptr addrspace(1) %out
  14   ret void
  15 }
  16
  17 ; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
  18 define amdgpu_kernel void @v_permlanex16_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
  19   %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
  20   store i32 %v, ptr addrspace(1) %out
  21   ret void
  22 }
  23
  24 ; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
  25 define amdgpu_kernel void @v_permlane16_var_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
  26   %v = call i32 @llvm.amdgcn.permlane16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
  27   store i32 %v, ptr addrspace(1) %out
  28   ret void
  29 }
  30
  31 ; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
  32 define amdgpu_kernel void @v_permlanex16_var_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
  33   %v = call i32 @llvm.amdgcn.permlanex16.var(i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) #0
  34   store i32 %v, ptr addrspace(1) %out
  35   ret void
  36 }
  37
  38 ; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) #0
  39 define amdgpu_kernel void @update_dpp(ptr addrspace(1) %out, i32 %in1, i32 %in2) #0 {
  40   %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) #0
  41   store i32 %tmp0, ptr addrspace(1) %out
  42   ret void
  43 }
  44
  45 ; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 true) #0
  46 define amdgpu_kernel void @mov_dpp(ptr addrspace(1) %out, i32 %in) #0 {
  47   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 true) #0
  48   store i32 %tmp0, ptr addrspace(1) %out
  49   ret void
  50 }
  51
  52 ; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %in, i32 1) #0
  53 define amdgpu_kernel void @mov_dpp8(ptr addrspace(1) %out, i32 %in) #0 {
  54   %tmp0 = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %in, i32 1) #0
  55   store i32 %tmp0, ptr addrspace(1) %out
  56   ret void
  57 }
  58
  59 ; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.writelane.i32(i32 0, i32 1, i32 2)
  60 define amdgpu_kernel void @writelane(ptr addrspace(1) %out) #0 {
  61   %tmp0 = call i32 @llvm.amdgcn.writelane.i32(i32 0, i32 1, i32 2)
  62   store i32 %tmp0, ptr addrspace(1) %out
  63   ret void
  64 }
  65
  66 ; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
  67 define amdgpu_kernel void @wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
  68   %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
  69   store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
  70   ret void
  71 }
  72
  73 ; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
  74 define amdgpu_kernel void @wmma_f32_16x16x16_ibf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
  75   %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
  76   store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
  77   ret void
  78 }
  79
  80 ; CHECK: DIVERGENT: %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false)
  81 define amdgpu_kernel void @wmma_f16_16x16x16_f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out) {
  82 bb:
  83   %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false)
  84   store <16 x half> %tmp0, ptr addrspace(1) %out, align 32
  85   ret void
  86 }
  87
  88 ; CHECK: DIVERGENT: %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false)
  89 define amdgpu_kernel void @wmma_f16_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) {
  90 bb:
  91   %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false)
  92   store <16 x i16> %tmp0, ptr addrspace(1) %out, align 32
  93   ret void
  94 }
  95
  96 ; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false)
  97 define amdgpu_kernel void @wmma_i32_16x16x16_ui8(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
  98 bb:
  99   %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false)
 100   store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
 101   ret void
 102 }
 103
 104 ; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false)
 105 define amdgpu_kernel void @wmma_i32_16x16x16_ui4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
 106 bb:
 107   %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false)
 108   store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
 109   ret void
 110 }
 111
 112 ; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
 113 define amdgpu_kernel void @swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 114 bb:
 115   %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
 116     store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
 117     ret void
 118 }
 119
 120 ; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
 121 define amdgpu_kernel void @swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 122 bb:
 123   %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
 124     store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
 125     ret void
 126 }
 127
 128 ; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
 129 define amdgpu_kernel void @swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 130 bb:
 131   %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
 132     store <8 x half> %tmp0, ptr addrspace(1) %out, align 32
 133     ret void
 134 }
 135
 136 ; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
 137 define amdgpu_kernel void @swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
 138 bb:
 139   %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
 140   store <8 x i16> %tmp0, ptr addrspace(1) %out, align 32
 141   ret void
 142 }
 143
 144 ; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
 145 define amdgpu_kernel void @swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
 146 bb:
 147   %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
 148   store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
 149   ret void
 150 }
 151
 152 ; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
 153 define amdgpu_kernel void @swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
 154 bb:
 155   %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
 156   store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
 157   ret void
 158 }
 159
 160 ; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
 161 define amdgpu_kernel void @swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
 162 bb:
 163   %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false)
 164   store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32
 165   ret void
 166 }
 167
 168 ; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
 169 define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 170 bb:
 171   %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
 172   store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
 173   ret void
 174 }
 175
 176 ; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
 177 define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 178 bb:
 179   %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
 180   store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
 181   ret void
 182 }
 183
 184 ; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
 185 define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 186 bb:
 187   %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
 188   store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
 189   ret void
 190 }
 191
 192 ; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
 193 define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 194 bb:
 195   %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
 196   store <8 x float> %tmp0, ptr addrspace(1) %out, align 32
 197   ret void
 198 }
 199
 200 ; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) %addr)
 201 define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
 202 bb:
 203   %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) %addr)
 204   store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
 205   ret void
 206 }
 207
 208 ; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1) %addr)
 209 define amdgpu_kernel void @global_load_tr_b128_v8i16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
 210 bb:
 211   %tmp0 = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1) %addr)
 212   store <8 x i16> %tmp0, ptr addrspace(1) %out, align 16
 213   ret void
 214 }
 215
 216 ; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) %addr)
 217 define amdgpu_kernel void @global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
 218 bb:
 219   %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) %addr)
 220   store <8 x half> %tmp0, ptr addrspace(1) %out, align 16
 221   ret void
 222 }
 223
 224 ; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) %addr)
 225 define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
 226 bb:
 227   %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) %addr)
 228   store <8 x bfloat> %tmp0, ptr addrspace(1) %out, align 16
 229   ret void
 230 }
 231
 232 ; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1) %addr)
 233 define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
 234 bb:
 235   %tmp0 = call i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1) %addr)
 236   store i32 %tmp0, ptr addrspace(1) %out, align 4
 237   ret void
 238 }
 239
 240 ; CHECK: DIVERGENT: %tmp0 = call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1) %addr)
 241 define amdgpu_kernel void @global_load_tr_b128_v4i16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
 242 bb:
 243   %tmp0 = call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1) %addr)
 244   store <4 x i16> %tmp0, ptr addrspace(1) %out, align 8
 245   ret void
 246 }
 247
 248 ; CHECK: DIVERGENT: %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) %addr)
 249 define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
 250 bb:
 251   %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) %addr)
 252   store <4 x half> %tmp0, ptr addrspace(1) %out, align 8
 253   ret void
 254 }
 255
 256 ; CHECK: DIVERGENT: %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1) %addr)
 257 define amdgpu_kernel void @global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
 258 bb:
 259   %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1) %addr)
 260   store <4 x bfloat> %tmp0, ptr addrspace(1) %out, align 8
 261   ret void
 262 }
 263
 264 declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3))
 265
 266 ; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) %gep)
 267 define amdgpu_kernel void @ds_read_b64_tr4_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
 268 bb:
 269   %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
 270   %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) %gep)
 271   store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
 272   ret void
 273 }
 274
 275 declare <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3))
 276
 277 ; CHECK: DIVERGENT: %tmp0 = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) %gep)
 278 define amdgpu_kernel void @ds_read_b96_tr6_v3i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
 279 bb:
 280   %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
 281   %tmp0 = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) %gep)
 282   store <3 x i32> %tmp0, ptr addrspace(1) %out, align 16
 283   ret void
 284 }
 285
 286 declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3))
 287
 288 ; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) %gep)
 289 define amdgpu_kernel void @ds_read_b64_tr8_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
 290 bb:
 291   %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
 292   %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) %gep)
 293   store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
 294   ret void
 295 }
 296
 297 declare <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3))
 298
 299 ; CHECK: DIVERGENT: %tmp0 = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) %gep)
 300 define amdgpu_kernel void @ds_read_b64_tr_b16_v4i16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
 301 bb:
 302   %gep = getelementptr i64, ptr addrspace(3) %addr, i16 4
 303   %tmp0 = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) %gep)
 304   store <4 x i16> %tmp0, ptr addrspace(1) %out, align 16
 305   ret void
 306 }
 307
 308 declare <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3))
 309
 310 ; CHECK: DIVERGENT: %tmp0 = call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep)
 311 define amdgpu_kernel void @ds_read_b64_tr_b16_v4f16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
 312 bb:
 313   %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
 314   %tmp0 = call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep)
 315   store <4 x half> %tmp0, ptr addrspace(1) %out, align 16
 316   ret void
 317 }
 318
 319 declare <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3))
 320
 321 ; CHECK: DIVERGENT: %tmp0 = call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3) %gep)
 322 define amdgpu_kernel void @ds_read_b64_tr_b16_v4bf16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
 323 bb:
 324   %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
 325   %tmp0 = call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16(ptr addrspace(3) %gep)
 326   store <4 x bfloat> %tmp0, ptr addrspace(1) %out, align 16
 327   ret void
 328 }
 329
 330 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
 331 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
 332
 333 ; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
 334 define amdgpu_kernel void @mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, ptr addrspace(1) %out) {
 335   %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
 336   store <4 x float> %result, ptr addrspace(1) %out
 337   ret void
 338 }
 339
 340 ; CHECK: DIVERGENT: %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
 341 define amdgpu_kernel void @mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) {
 342   %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
 343   store <16 x float> %result, ptr addrspace(1) %out
 344   ret void
 345 }
 346
 347 ; CHECK: DIVERGENT:  %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
 348 define amdgpu_kernel void @mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, ptr addrspace(1) %out) {
 349   %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
 350   store <16 x float> %result, ptr addrspace(1) %out
 351   ret void
 352 }
 353
 354 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32>, <8 x i32>, <4 x float>, i32 immarg, i32 immarg,
 355                                                                               i32 immarg, i32, i32 immarg, i32)
 356
 357 ; CHECK: DIVERGENT:   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4)
 358 define amdgpu_kernel void @mfma_scale_f32_16x16x128_f8f6f4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 %arg4, ptr addrspace(1) %out) {
 359   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4)
 360   store <4 x float> %result, ptr addrspace(1) %out
 361   ret void
 362 }
 363
 364 declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4(<8 x i32>, <8 x i32>, <16 x float>, i32 immarg, i32 immarg,
 365                                                                 i32 immarg, i32, i32 immarg, i32)
 366
 367 ; CHECK: DIVERGENT: %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4)
 368 define amdgpu_kernel void @mfma_f32_scale_32x32x64_f8f6f4(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 %arg4, ptr addrspace(1) %out) {
 369   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0, i32 %arg3, i32 immarg 0, i32 %arg4)
 370   store <16 x float> %result, ptr addrspace(1) %out
 371   ret void
 372 }
 373
 374 declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32>, <4 x i32>, <4 x i32>, i32 immarg, i32 immarg, i32 immarg)
 375
 376 ; CHECK: DIVERGENT: %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
 377 define amdgpu_kernel void @mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, ptr addrspace(1) %out) {
 378   %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
 379   store <4 x i32> %result, ptr addrspace(1) %out
 380   ret void
 381 }
 382
 383 declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32>, <4 x i32>, <16 x i32>, i32 immarg, i32 immarg, i32 immarg)
 384
 385 ; CHECK: DIVERGENT: %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
 386 define amdgpu_kernel void @mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) {
 387   %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
 388   store <16 x i32> %result, ptr addrspace(1) %out
 389   ret void
 390 }
 391
 392 declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
 393
 394 ; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
 395 define amdgpu_kernel void @mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, ptr addrspace(1) %out) {
 396   %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
 397   store <4 x float> %result, ptr addrspace(1) %out
 398   ret void
 399 }
 400
 401 declare <4 x float> @llvm.amdgcn.smmfmac.f32.16x16x64.f16(<8 x half>, <16 x half>, <4 x float>, i32, i32 immarg, i32 immarg)
 402
 403 ; CHECK: DIVERGENT:   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
 404 define amdgpu_kernel void @smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, ptr addrspace(1) %out) {
 405   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
 406   store <4 x float> %result, ptr addrspace(1) %out
 407   ret void
 408 }
 409
 410 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half>, <16 x half>, <16 x float>, i32, i32 immarg, i32 immarg)
 411
 412 ; CHECK: DIVERGENT:   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
 413 define amdgpu_kernel void @smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, ptr addrspace(1) %out) {
 414   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
 415   store <16 x float> %result, ptr addrspace(1) %out
 416   ret void
 417 }
 418
 419 declare <4 x float> @llvm.amdgcn.smmfmac.f32.16x16x64.bf16(<8 x bfloat>, <16 x bfloat>, <4 x float>, i32, i32 immarg, i32 immarg)
 420
 421 ; CHECK: DIVERGENT:   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
 422 define amdgpu_kernel void @smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, ptr addrspace(1) %out) {
 423   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
 424   store <4 x float> %result, ptr addrspace(1) %out
 425   ret void
 426 }
 427
 428 declare <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32)
 429
 430 ; CHECK: DIVERGENT:   %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 1, i32 2)
 431 define amdgpu_kernel void @smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, ptr addrspace(1) %out) {
 432   %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 1, i32 2)
 433   store <4 x i32> %result, ptr addrspace(1) %out
 434   ret void
 435 }
 436
 437 declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32, i32)
 438
 439 ; CHECK: DIVERGENT:  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 1, i32 2)
 440 define amdgpu_kernel void @smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, ptr addrspace(1) %out) {
 441   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 1, i32 2)
 442   store <4 x float> %result, ptr addrspace(1) %out
 443   ret void
 444 }
 445
 446 declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32, i32)
 447
 448 ; CHECK: DIVERGENT:  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 1, i32 2)
 449 define amdgpu_kernel void @smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, ptr addrspace(1) %out) {
 450   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 1, i32 2)
 451   store <4 x float> %result, ptr addrspace(1) %out
 452   ret void
 453 }
 454
 455 declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32, i32)
 456
 457 ; CHECK: DIVERGENT:  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 1, i32 2)
 458 define amdgpu_kernel void @smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, ptr addrspace(1) %out) {
 459   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 1, i32 2)
 460   store <4 x float> %result, ptr addrspace(1) %out
 461   ret void
 462 }
 463
 464 declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32, i32)
 465
 466 ; CHECK: DIVERGENT:  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 1, i32 2)
 467 define amdgpu_kernel void @smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, ptr addrspace(1) %out) {
 468   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 1, i32 2)
 469   store <4 x float> %result, ptr addrspace(1) %out
 470   ret void
 471 }
 472
 473 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32>, <8 x i32>, <16 x float>, i32, i32, i32)
 474
 475 ; CHECK: DIVERGENT:  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 1, i32 2)
 476 define amdgpu_kernel void @smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, ptr addrspace(1) %out) {
 477   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 1, i32 2)
 478   store <16 x float> %result, ptr addrspace(1) %out
 479   ret void
 480 }
 481
 482 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32>, <8 x i32>, <16 x float>, i32, i32, i32)
 483
 484 ; CHECK: DIVERGENT:  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 1, i32 2)
 485 define amdgpu_kernel void @smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, ptr addrspace(1) %out) {
 486   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 1, i32 2)
 487   store <16 x float> %result, ptr addrspace(1) %out
 488   ret void
 489 }
 490
 491 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32>, <8 x i32>, <16 x float>, i32, i32, i32)
 492
 493 ; CHECK: DIVERGENT:  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 1, i32 2)
 494 define amdgpu_kernel void @smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, ptr addrspace(1) %out) {
 495   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 1, i32 2)
 496   store <16 x float> %result, ptr addrspace(1) %out
 497   ret void
 498 }
 499
 500 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32>, <8 x i32>, <16 x float>, i32, i32, i32)
 501
 502 ; CHECK: DIVERGENT:  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 1, i32 2)
 503 define amdgpu_kernel void @smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, ptr addrspace(1) %out) {
 504   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 1, i32 2)
 505   store <16 x float> %result, ptr addrspace(1) %out
 506   ret void
 507 }
 508
 509 ; CHECK: DIVERGENT:   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %src0, i32 %src1, i1 false, i1 false)
 510 define amdgpu_kernel void @v_permlane16_swap(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 {
 511   %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %src0, i32 %src1, i1 false, i1 false)
 512   store { i32, i32 } %v, ptr addrspace(1) %out
 513   ret void
 514 }
 515
 516 ; CHECK: DIVERGENT:   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %src0, i32 %src1, i1 false, i1 false)
 517 define amdgpu_kernel void @v_permlane32_swap(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 {
 518   %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %src0, i32 %src1, i1 false, i1 false)
 519   store { i32, i32 } %v, ptr addrspace(1) %out
 520   ret void
 521 }
 522
 523
 524
 525 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
 526 declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
 527 declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
 528 declare i32 @llvm.amdgcn.permlane16.var(i32, i32, i32, i1, i1) #1
 529 declare i32 @llvm.amdgcn.permlanex16.var(i32, i32, i32, i1, i1) #1
 530 declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1
 531 declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1
 532 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1
 533 declare i32 @llvm.amdgcn.writelane.i32(i32, i32, i32) #1
 534 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>) #1
 535 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>) #1
 536 declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1
 537 declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) #1
 538 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1
 539 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1
 540 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half>, <16 x half>, <8 x float>, i16)
 541 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
 542 declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
 543 declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
 544 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1)
 545 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16, i1)
 546 declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1)
 547 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16)
 548 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16)
 549 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16)
 550 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16)
 551
 552 declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1))
 553 declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1))
 554 declare <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1))
 555 declare <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1))
 556 declare i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1))
 557 declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1))
 558 declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1))
 559 declare <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1))
 560
 561 attributes #0 = { nounwind convergent }
 562 attributes #1 = { nounwind readnone convergent }