# This test generates all variants of wmma intrinsics and verifies that LLVM
# generates correct instructions for them.

# Check all variants of instructions supported by PTX60 on SM70
# RUN: %python %s --ptx=60 --gpu-arch=70 > %t-ptx60-sm_70.ll
# RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \
# RUN: --check-prefixes=INTRINSICS,M16N16
# RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \
# RUN: --check-prefixes=INTRINSICS,NOEXTGEOM,NOINT,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX
# RUN: llc < %t-ptx60-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \
# RUN: | FileCheck %t-ptx60-sm_70.ll
# RUN: %if ptxas-9.0 %{ \
# RUN: llc < %t-ptx60-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \
# RUN: | %ptxas-verify -arch=sm_70 \
# RUN: %}

# Check all variants of instructions supported by PTX61 on SM70
# RUN: %python %s --ptx=61 --gpu-arch=70 > %t-ptx61-sm_70.ll
# RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \
# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM
# RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \
# RUN: --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX
# RUN: llc < %t-ptx61-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \
# RUN: | FileCheck %t-ptx61-sm_70.ll
# RUN: %if ptxas-9.1 %{ \
# RUN: llc < %t-ptx61-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \
# RUN: | %ptxas-verify -arch=sm_70 \
# RUN: %}

# Check all variants of instructions supported by PTX63 on SM72
# RUN: %python %s --ptx=63 --gpu-arch=72 > %t-ptx63-sm_72.ll
# RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \
# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT
# RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \
# RUN: --check-prefixes=INTRINSICS,NOSUBINT,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX
# RUN: llc < %t-ptx63-sm_72.ll -march=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \
# RUN: | FileCheck %t-ptx63-sm_72.ll
# RUN: %if ptxas-10.0 %{ \
# RUN: llc < %t-ptx63-sm_72.ll -march=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \
# RUN: | %ptxas-verify -arch=sm_72 \
# RUN: %}

# Check all variants of instructions supported by PTX63 on SM75
# RUN: %python %s --ptx=63 --gpu-arch=75 > %t-ptx63-sm_75.ll
# RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \
# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT,SUBINT
# RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \
# RUN: --check-prefixes=INTRINSICS,NOMMA,NODOUBLE,NOALTFLOAT,NOLDMATRIX
# RUN: llc < %t-ptx63-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \
# RUN: | FileCheck %t-ptx63-sm_75.ll
# RUN: %if ptxas-10.0 %{ \
# RUN: llc < %t-ptx63-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \
# RUN: | %ptxas-verify -arch=sm_75 \
# RUN: %}

# Check all variants of instructions supported by PTX64 on SM70+
# RUN: %python %s --ptx=64 --gpu-arch=70 > %t-ptx64-sm_70.ll
# RUN: FileCheck %t-ptx64-sm_70.ll < %t-ptx64-sm_70.ll \
# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,MMA
# RUN: FileCheck %t-ptx64-sm_70.ll < %t-ptx64-sm_70.ll \
# RUN: --check-prefixes=INTRINSICS,NOINT,NOSUBINT,NODOUBLE,NOALTFLOAT,NOLDMATRIX
# RUN: llc < %t-ptx64-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \
# RUN: | FileCheck %t-ptx64-sm_70.ll
# RUN: %if ptxas-10.1 %{ \
# RUN: llc < %t-ptx64-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx64 \
# RUN: | %ptxas-verify -arch=sm_70 \
# RUN: %}

# Check all variants of instructions supported by PTX65 on SM75+
# RUN: %python %s --ptx=65 --gpu-arch=75 > %t-ptx65-sm_75.ll
# RUN: FileCheck %t-ptx65-sm_75.ll < %t-ptx65-sm_75.ll \
# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT,SUBINT,MMA,PTX65MMA,PTX65LDMATRIX
# RUN: FileCheck %t-ptx65-sm_75.ll < %t-ptx65-sm_75.ll \
# RUN: --check-prefixes=INTRINSICS
# RUN: llc < %t-ptx65-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx65 \
# RUN: | FileCheck %t-ptx65-sm_75.ll
# RUN: %if ptxas-10.2 %{ \
# RUN: llc < %t-ptx65-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx65 \
# RUN: | %ptxas-verify -arch=sm_75 \
# RUN: %}

# Check all variants of instructions supported by PTX71 on SM80+
# RUN: %python %s --ptx=71 --gpu-arch=80 > %t-ptx71-sm_80.ll
# RUN: FileCheck %t-ptx71-sm_80.ll < %t-ptx71-sm_80.ll \
# RUN: --check-prefixes=INTRINSICS,M16N16,EXTGEOM,INT,SUBINT,MMA,ALTFLOAT,DOUBLE,PTX65MMA,PTX65LDMATRIX,PTX71MMA
# RUN: FileCheck %t-ptx71-sm_80.ll < %t-ptx71-sm_80.ll \
# RUN: --check-prefixes=INTRINSICS
# RUN: llc < %t-ptx71-sm_80.ll -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 \
# RUN: | FileCheck %t-ptx71-sm_80.ll
# RUN: %if ptxas-11.1 %{ \
# RUN: llc < %t-ptx71-sm_80.ll -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 \
# RUN: | %ptxas-verify -arch=sm_80 \
# RUN: %}

from __future__ import print_function

import argparse
from itertools import product
from string import Template


class MMAType:
    def __init__(self, ptx_type):
        self.ptx_type = ptx_type
        self.ptx_reg_pattern = {
        }.get(ptx_type, "%r[0-9]+")

    def __repr__(self):
        return "%s/%s" % (self.ptx_type, self.llvm_type)


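# MMAFrag describes one matrix fragment: the instruction geometry (e.g. m16n16k16),
# the fragment role (a/b/c/d), its element type, and how many registers the
# fragment occupies (nregs), which is what the check patterns below are built from.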
class MMAFrag:
    def __init__(self, geom, frag, ptx_elt_type):
        self.geom = geom
        self.frag = frag
        self.mma_type = MMAType(ptx_elt_type)
        self.nregs = {
            # u8/s8 -> s32 @ m16n16k16/m8n32k16/m32n8k16
            "m16n16k16:c:s32": 8,
            "m16n16k16:d:s32": 8,
            # u4/s4 -> s32 @ m8n8k32 (u4/s4)
            # b1 -> s32 @ m8n8k128(b1)
            "m16n8k128:c:s32": 4,
            "m16n8k128:d:s32": 4,
            "m16n8k256:c:s32": 4,
            "m16n8k256:d:s32": 4,
            # bf16 -> s32 @ m16n16k16/m8n32k16/m32n8k16
            "m16n16k16:a:bf16": 4,
            "m16n16k16:b:bf16": 4,
            "m8n32k16:a:bf16": 2,
            "m8n32k16:b:bf16": 8,
            "m32n8k16:a:bf16": 8,
            "m32n8k16:b:bf16": 2,
            "m16n8k16:a:bf16": 4,
            "m16n8k16:b:bf16": 2,
            # tf32 -> s32 @ m16n16k8
            "m16n16k8:a:tf32": 4,
            "m16n16k8:b:tf32": 4,
        }.get(
            "%s:%s:%s" % (geom, frag, ptx_elt_type),
            {
                # All other FP shape/fragment/type combinations have the same size
            }.get("%s:%s" % (frag, ptx_elt_type), None),
        )

    def __repr__(self):
        return "%s:%s:%s%s" % (
            self.geom,
            self.frag,
            self.mma_type,
            "" if self.nregs == 1 else ("*%d" % self.nregs),
        )


class MMAOp:
    def __init__(self, a, b, c, d):
        self.a = a
        self.b = b
        self.c = c
        self.d = d

    def __repr__(self):
        return "{A:%s, B:%s, C:%s, D:%s}" % (self.a, self.b, self.c, self.d)


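# make_mma_ops expands the cross product of geometries and A/B/C/D element types
# into MMAOp instances; empty type lists for B/D default to the A/C types.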
def make_mma_ops(geoms, types_a, types_b, types_c, types_d):
    ops = []
    for geom, type_a, type_c in product(geoms, types_a, types_c):
        for type_b, type_d in product(
            types_b if types_b else [type_a], types_d if types_d else [type_c]
        ):
            ops.append(
                MMAOp(
                    MMAFrag(geom, "a", type_a),
                    MMAFrag(geom, "b", type_b),
                    MMAFrag(geom, "c", type_c),
                    MMAFrag(geom, "d", type_d),
                )
            )
    return ops


def make_ldst_ops(geoms, frags, types):
    return [
        MMAFrag(geom, frag, ptx_type)
        for (geom, frag, ptx_type) in product(geoms, frags, types)
    ]


def make_ldmatrix_ops(geoms, frags, types):
    return [
        MMAFrag(geom, frag, ptx_type)
        for (geom, frag, ptx_type) in product(geoms, frags, types)
    ]


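# The get_*_ops() helpers below enumerate every operation this test may cover;
# unsupported combinations are filtered out later by the is_*_supported() checks.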
def get_wmma_ops():
    return (
        make_mma_ops(["m16n16k8"], ["tf32"], [], ["f32"], [])
        + make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"], ["bf16"], [], ["f32"], [])
        + make_mma_ops(["m8n8k4"], ["f64"], [], ["f64"], [])
        + make_mma_ops(
            ["m16n16k16", "m32n8k16", "m8n32k16"],
            ["f16"],
            [],
            ["f16", "f32"],
            ["f16", "f32"],
        )
        + make_mma_ops(
            ["m16n16k16", "m32n8k16", "m8n32k16"], ["s8", "u8"], [], ["s32"], []
        )
        + make_mma_ops(["m8n8k32"], ["s4", "u4"], [], ["s32"], [])
        + make_mma_ops(["m8n8k128"], ["b1"], [], ["s32"], [])
    )


def get_mma_ops():
    return (
        make_mma_ops(["m8n8k4"], ["f64"], [], ["f64"], [])
        + make_mma_ops(["m16n8k4", "m16n8k8"], ["tf32"], [], ["f32"], [])
        + make_mma_ops(["m16n8k16", "m16n8k8"], ["bf16"], [], ["f32"], [])
        + make_mma_ops(
            ["m8n8k4", "m16n8k8", "m16n8k16"],
            ["f16"],
            [],
            ["f16", "f32"],
            ["f16", "f32"],
        )
        + make_mma_ops(
            ["m8n8k16", "m16n8k16", "m16n8k32"], ["s8", "u8"], ["s8", "u8"], ["s32"], []
        )
        + make_mma_ops(
            ["m8n8k32", "m16n8k32", "m16n8k64"], ["s4", "u4"], ["s4", "u4"], ["s32"], []
        )
        + make_mma_ops(["m8n8k128", "m16n8k128", "m16n8k256"], ["b1"], [], ["s32"], [])
    )


def get_ldst_ops(kind):
    ldst_ops = (
        make_ldst_ops(
            ["m16n16k16", "m32n8k16", "m8n32k16"],
            ["a", "b"],
            ["f16", "u8", "s8", "bf16"],
        )
        + make_ldst_ops(
            ["m16n16k16", "m32n8k16", "m8n32k16"], ["c", "d"], ["f16", "f32", "s32"]
        )
        + make_ldst_ops(["m8n8k32"], ["a", "b"], ["s4", "u4"])
        + make_ldst_ops(["m8n8k128"], ["a", "b"], ["b1"])
        + make_ldst_ops(["m8n8k32", "m8n8k128"], ["c", "d"], ["s32"])
        + make_ldst_ops(["m8n8k4"], ["a", "b", "c", "d"], ["f64"])
        + make_ldst_ops(["m16n16k8"], ["a", "b"], ["tf32"])
        + make_ldst_ops(["m16n16k8"], ["c", "d"], ["f32"])
    )
    return [x for x in ldst_ops if (x.frag == "d") == (kind == "store")]


def get_ldmatrix_ops():
    return make_ldmatrix_ops(["m8n8"], ["x1", "x2", "x4"], ["b16"])


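# The is_*_supported() predicates below encode the minimum PTX version and SM
# architecture required for each geometry and element type, based on the global
# ptx_version and gpu_arch parsed from the command line.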
def is_wmma_geom_supported(geom):
    # geometries for FP and ints.
    if geom in ["m8n32k16", "m32n8k16"]:
        return ptx_version >= 61
    # geometries for sub-ints.
    if geom in ["m8n8k32", "m8n8k128"]:
        return ptx_version >= 63 and gpu_arch >= 75
    if geom == "m16n16k16":
        return ptx_version >= 60
    if geom == "m16n8k8":
        return ptx_version >= 65
    if geom in ["m16n16k8", "m8n8k4"]:
        return ptx_version >= 70
    assert False  # Unexpected geometry.


def is_mma_geom_supported(geom):
    # geometries for FP and ints.
    if geom == "m8n8k4":
        return ptx_version >= 64
    if geom in ["m16n8k8", "m8n8k16", "m8n8k32"]:
        return ptx_version >= 65
    if geom in [
        "m16n8k16",
        "m16n8k4",
        "m16n8k32",
        "m16n8k64",
        "m8n8k128",
        "m16n8k128",
        "m16n8k256",
    ]:
        return ptx_version >= 70
    assert False  # Unexpected geometry.


def is_ldmatrix_geom_supported(geom):
    if geom == "m8n8":
        return ptx_version >= 65 and gpu_arch >= 75
    assert False  # Unexpected geometry.


def is_type_supported(ptx_type):
    if ptx_type in ["s8", "u8", "s32"]:
        return ptx_version >= 63 and gpu_arch >= 72
    if ptx_type in ["s4", "u4", "b1"]:
        return ptx_version >= 63 and gpu_arch >= 75
    if ptx_type == "b16":
        return ptx_version >= 65 and gpu_arch >= 75
    if ptx_type in ["bf16", "tf32", "f64"]:
        return ptx_version >= 70
    return ptx_version >= 60 and gpu_arch >= 70


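# Variant-level checks: beyond geometry/type support, PTX also restricts which
# layout, rounding, and satfinite combinations are valid for a given operation.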
def is_wmma_variant_supported(op, layout_a, layout_b, rnd, satf):
    if not (
        is_type_supported(op.a.mma_type.ptx_type) and is_wmma_geom_supported(op.a.geom)
    ):
        return False

    # rnd is only supported for FP64 WMMA
    if rnd and op.a.mma_type.ptx_type != "f64":
        return False

    if satf:
        # satfinite for floating points was removed in PTX 6.5
        if op.a.mma_type.ptx_type == "f16" and ptx_version >= 65:
            return False
        if not op.a.mma_type.ptx_type in ["f16", "s8", "u8", "s4", "u4"]:
            return False

    # sub-integer require row/col layout.
    if op.a.mma_type.ptx_type in ["s4", "u4", "b1"]:
        return layout_a == "row" and layout_b == "col"
    return True


def is_mma_variant_supported(op, layout_a, layout_b, satf):
    if not (
        is_type_supported(op.a.mma_type.ptx_type) and is_mma_geom_supported(op.a.geom)
    ):
        return False

    if satf and not op.a.mma_type.ptx_type in ["s8", "u8", "s4", "u4"]:
        return False

    # If the type of C is f32 then so must the type of D
    if (
        op.a.geom == "m8n8k4"
        and op.c.mma_type.ptx_type == "f32"
        and op.d.mma_type.ptx_type != "f32"
    ):
        return False

    # A and B type must be the same. C and D type must be the same
    if op.a.geom == "m16n8k8" and (
        op.a.mma_type.ptx_type != op.b.mma_type.ptx_type
        or op.c.mma_type.ptx_type != op.d.mma_type.ptx_type
    ):
        return False

    # C and D type must be the same
    if op.a.geom == "m16n8k16" and op.c.mma_type.ptx_type != op.d.mma_type.ptx_type:
        return False

    # Require row/col layout for all MMA except m8n8k4 on FP16
    if not (op.a.geom == "m8n8k4" and op.a.mma_type.ptx_type == "f16"):
        return layout_a == "row" and layout_b == "col"
    return True


def is_ldst_variant_supported(frag, layout):
    if not (
        is_type_supported(frag.mma_type.ptx_type) and is_wmma_geom_supported(frag.geom)
    ):
        return False
    if frag.mma_type.ptx_type in ["s4", "u4", "b1"]:
        # sub-integer require sm_75 and ptx63, row/col layout for a/b.
        return (
            (frag.frag == "a" and layout == "row")
            or (frag.frag == "b" and layout == "col")
            or frag.frag in ["c", "d"]
        )
    return True


def is_ldmatrix_variant_supported(frag):
    if not (
        is_type_supported(frag.mma_type.ptx_type)
        and is_ldmatrix_geom_supported(frag.geom)
    ):
        return False
    return frag.frag in ["x1", "x2", "x4"]


def make_wmma_slice_ty(frag):
    return [frag.mma_type.llvm_type] * frag.nregs


def make_wmma_ld_ret_ty(frag):
    results = make_wmma_slice_ty(frag)
    if len(results) == 1:
        return "%s" % results[0]
    return "{%s}" % ", ".join(results)


# returns address space
def get_aspace(space):
    space_map = {
        "": 0,
        ".global": 1,
        ".shared": 3,
        ".const": 4,
        ".local": 5,
        ".param": 101,
    }

    return space_map[space]


def get_pspace(space):
    return "p%di8" % get_aspace(space)


def check_pattern(frag):
    return "{{%s}}" % ", *".join([frag.mma_type.ptx_reg_pattern] * frag.nregs)


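# Each gen_*_tests() function below prints LLVM IR test functions generated from
# a Template, one per supported variant, and returns (intrinsic, instruction)
# pairs that feed the final INTRINSICS list emitted by gen_check_unsupported_ops().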
def gen_wmma_load_tests():
    load_template = """
declare ${ret_ty} @${intrinsic}(i8 ${as}* %src ${extra_args});

; CHECK-LABEL: .func {{.*}}test_${function}(
define ${ret_ty} @test_${function}(i8 ${as}* %src ${extra_args}) {
; CHECK: ${instruction}
; CHECK: {${check_result}}
; CHECK: [%rd{{[0-9]+}}]${stride_pattern}
  %v0 = call ${ret_ty} @${intrinsic}(i8 ${as}* %src ${extra_args});
  ret ${ret_ty} %v0;
}

; CHECK-LABEL: .func{{.*}}test_${function}_o(
define ${ret_ty} @test_${function}_o(i8 ${as}* %src ${extra_args}) {
; CHECK: ${instruction}
; CHECK: {${check_result}}
; CHECK: [%rd{{[0-9]+}}+128]${stride_pattern}
  %src1 = getelementptr i8, i8 ${as}* %src, i32 128;
  %v0 = call ${ret_ty} @${intrinsic}(i8 ${as}* %src1 ${extra_args});
  ret ${ret_ty} %v0;
}
"""
    intrinsic_template = (
        "llvm.nvvm.wmma.${geom}.load.${abc}.${layout}${stride}.${itype}.${pspace}"
    )
    instruction_template = (
        "wmma.load.${abc}.sync${aligned}.${layout}.${geom}${space}.${itype}"
    )

    generated_items = []

    for frag, layout, space, stride in product(
        get_ldst_ops("load"),
        ["row", "col"],
        ["", ".shared", ".global"],
        ["", ".stride"],
    ):
        if not is_ldst_variant_supported(frag, layout):
            continue

        params = {
            "abc": frag.frag,
            "aligned": ".aligned" if ptx_version >= 63 else "",
            "layout": layout,
            "space": space,
            "stride": stride,
            "itype": frag.mma_type.ptx_type,
            "pspace": get_pspace(space),
            "as": "addrspace(%d)" % get_aspace(space),
            "geom": frag.geom,
        }

        test_params = params
        test_params["intrinsic"] = Template(intrinsic_template).substitute(params)
        test_params["function"] = test_params["intrinsic"].replace(".", "_")
        test_params["instruction"] = Template(instruction_template).substitute(params)
        test_params["ret_ty"] = make_wmma_ld_ret_ty(frag)
        test_params["check_result"] = check_pattern(frag)

        if stride:
            test_params["extra_args"] = ", i32 %stride"
            test_params["stride_pattern"] = ", %r{{[0-9]+}}"
        else:
            test_params["extra_args"] = ""
            test_params["stride_pattern"] = ""

        print(Template(load_template).substitute(test_params))

        generated_items.append((test_params["intrinsic"], test_params["instruction"]))

    return generated_items


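# make_wmma_slice_args renders one fragment as a comma-separated argument list of
# "<llvm type> %<frag><index>" items, matching the ${args} placeholder used in the
# store and mma templates.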
def make_wmma_slice_args(frag):
    return ", ".join(
        [
            "%s %%%s%d" % (t, frag.frag, i)
            for i, t in enumerate(make_wmma_slice_ty(frag))
        ]
    )


def gen_wmma_store_tests():
    store_template = """
declare void @${intrinsic}(i8 ${as}* %src, ${args}${extra_args});

; CHECK-LABEL: .func {{.*}}test_${function}(
define void @test_${function}(i8 ${as}* %src, ${args}${extra_args}) {
; CHECK: ${instruction} {{.*}}[%rd{{[0-9+]}}
; CHECK: {${check_args}}
; CHECK: ${stride_pattern}
  call void @${intrinsic}(i8 ${as}* %src, ${args} ${extra_args});
  ret void
}

; CHECK-LABEL: .func{{.*}}test_${function}_o(
define void @test_${function}_o(i8 ${as}* %src, ${args}${extra_args}) {
; CHECK: ${instruction} {{.*}}[%rd{{[0-9+]}}+128]
; CHECK: ${check_args}
; CHECK: ${stride_pattern}
  %src1 = getelementptr i8, i8 ${as}* %src, i32 128;
  call void @${intrinsic}(i8 ${as}* %src1, ${args}${extra_args});
  ret void
}
"""
    intrinsic_template = (
        "llvm.nvvm.wmma.${geom}.store.${abc}.${layout}${stride}.${itype}.${pspace}"
    )
    instruction_template = (
        "wmma.store.${abc}.sync${aligned}.${layout}.${geom}${space}.${itype}"
    )

    generated_items = []

    for frag, layout, space, stride in product(
        get_ldst_ops("store"),
        ["row", "col"],
        ["", ".shared", ".global"],
        ["", ".stride"],
    ):
        if not is_ldst_variant_supported(frag, layout):
            continue

        params = {
            "abc": frag.frag,
            "aligned": ".aligned" if ptx_version >= 63 else "",
            "layout": layout,
            "space": space,
            "stride": stride,
            "itype": frag.mma_type.ptx_type,
            "pspace": get_pspace(space),
            "as": "addrspace(%d)" % get_aspace(space),
            "geom": frag.geom,
        }

        test_params = params
        test_params["intrinsic"] = Template(intrinsic_template).substitute(params)
        test_params["function"] = test_params["intrinsic"].replace(".", "_")
        test_params["instruction"] = Template(instruction_template).substitute(params)
        test_params["ret_ty"] = make_wmma_ld_ret_ty(frag)
        test_params["check_args"] = check_pattern(frag)
        if stride:
            test_params["extra_args"] = ", i32 %stride"
            test_params["stride_pattern"] = ", %r{{[0-9]+}};"
        else:
            test_params["extra_args"] = ""
            test_params["stride_pattern"] = ";"
        test_params["args"] = make_wmma_slice_args(frag)

        print(Template(store_template).substitute(test_params))
        generated_items.append((test_params["intrinsic"], test_params["instruction"]))

    return generated_items


def gen_ldmatrix_tests():
    ldmatrix_template = """
declare ${ret_ty} @${intrinsic}(i8 ${as}* %src);

; CHECK-LABEL: .func {{.*}}test_${function}(
define ${ret_ty} @test_${function}(i8 ${as}* %src) {
; CHECK: ${instruction}
; CHECK: {${check_result}}
; CHECK: [%rd{{[0-9]+}}]
  %v0 = call ${ret_ty} @${intrinsic}(i8 ${as}* %src);
  ret ${ret_ty} %v0;
}

; CHECK-LABEL: .func{{.*}}test_${function}_o(
define ${ret_ty} @test_${function}_o(i8 ${as}* %src) {
; CHECK: ${instruction}
; CHECK: {${check_result}}
; CHECK: [%rd{{[0-9]+}}+128]
  %src1 = getelementptr i8, i8 ${as}* %src, i32 128;
  %v0 = call ${ret_ty} @${intrinsic}(i8 ${as}* %src1);
  ret ${ret_ty} %v0;
}
"""
    intrinsic_template = (
        "llvm.nvvm.ldmatrix.sync.aligned.${geom}.${frag}${trans}.${itype}.${pspace}"
    )
    instruction_template = (
        "ldmatrix.sync.aligned.${geom}.${frag}${trans}${space}.${itype}"
    )

    generated_items = []

    for frag, space, trans in product(
        get_ldmatrix_ops(),
        ["", ".shared"],
        ["", ".trans"],
    ):
        if not is_ldmatrix_variant_supported(frag):
            continue

        params = {
            "frag": frag.frag,
            "space": space,
            "trans": trans,
            "itype": frag.mma_type.ptx_type,
            "pspace": get_pspace(space),
            "as": "addrspace(%d)" % get_aspace(space),
            "geom": frag.geom,
        }

        test_params = params
        test_params["intrinsic"] = Template(intrinsic_template).substitute(params)
        test_params["function"] = test_params["intrinsic"].replace(".", "_")
        test_params["instruction"] = Template(instruction_template).substitute(params)
        test_params["ret_ty"] = make_wmma_ld_ret_ty(frag)
        test_params["check_result"] = check_pattern(frag)

        print(Template(ldmatrix_template).substitute(test_params))

        generated_items.append((test_params["intrinsic"], test_params["instruction"]))

    return generated_items


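# The *_signature helpers compute the type suffix used in intrinsic and
# instruction names: f16 ops are keyed by the D and C accumulator types, other
# ops by their input types (or by all four types for the PTX instruction name).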
def mma_signature(op):
    if op.a.mma_type.ptx_type == "f16":
        # FP16 ops identified by accumulator & result type.
        return "%s.%s" % (op.d.mma_type.ptx_type, op.c.mma_type.ptx_type)
    elif op.a.mma_type.ptx_type != op.b.mma_type.ptx_type:
        # other ops are identified by input types.
        return "%s.%s" % (op.a.mma_type.ptx_type, op.b.mma_type.ptx_type)
    else:
        # if input types are the same, it only appears once.
        return op.a.mma_type.ptx_type


def mma_ptx_signature(op):
    # Encode all four types as D.A.B.C
    return ".".join(x.mma_type.ptx_type for x in (op.d, op.a, op.b, op.c))


def wmma_signature(op):
    if op.a.mma_type.ptx_type == "f16":
        # FP16 ops identified by accumulator & result type.
        return "%s.%s" % (op.d.mma_type.ptx_type, op.c.mma_type.ptx_type)
    else:
        # other ops are identified by input type.
        return op.a.mma_type.ptx_type


def wmma_ptx_signature(op):
    if op.a.mma_type.ptx_type == "f16":
        # FP16 instructions use D.C
        return "%s.%s" % (op.d.mma_type.ptx_type, op.c.mma_type.ptx_type)
    else:
        # other instructions encode all four types as D.A.B.C
        return ".".join(x.mma_type.ptx_type for x in (op.d, op.a, op.b, op.c))


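# common_mma_test_gen fills in mma_template for one operation, prints the
# resulting IR test function, and returns the generated (intrinsic, instruction)
# pair for the INTRINSICS list.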
def common_mma_test_gen(params, op, intrinsic_template, instruction_template):
    mma_template = """
declare ${ret_ty} @${intrinsic}(
        ${args});

; CHECK-LABEL: .func {{.*}}test_${function}(
define ${ret_ty} @test_${function}(
        ${args}) {
; CHECK: ${instruction}
; CHECK-NEXT: ${check_d}
; CHECK-NEXT: ${check_a}
; CHECK-NEXT: ${check_b}
; CHECK-NEXT: ${check_c}
  %r = call ${ret_ty} @${intrinsic}(
        ${args});
  ret ${ret_ty} %r;
}
"""
    test_params = params
    test_params["intrinsic"] = Template(intrinsic_template).substitute(params)
    test_params["function"] = test_params["intrinsic"].replace(".", "_")
    test_params["instruction"] = Template(instruction_template).substitute(params)
    test_params["ret_ty"] = make_wmma_ld_ret_ty(op.d)
    test_params["check_a"] = check_pattern(op.a)
    test_params["check_b"] = check_pattern(op.b)
    test_params["check_c"] = check_pattern(op.c)
    test_params["check_d"] = check_pattern(op.d)
    args = ",\n ".join(make_wmma_slice_args(frag) for frag in (op.a, op.b, op.c))
    test_params["args"] = args
    print(Template(mma_template).substitute(test_params))
    return (test_params["intrinsic"], test_params["instruction"])


def get_b1_ops(ptx_type):
    if ptx_type != "b1":
        return [""]
    if ptx_version >= 71:
        return [".xor.popc", ".and.popc"]
    return [".xor.popc"]


def gen_wmma_mma_tests():
    wmma_intrinsic_template = "llvm.nvvm.wmma.${geom}.mma${b1op}.${alayout}.${blayout}${rnd}.${intrinsic_signature}${satf}"
    wmma_instruction_template = "wmma.mma${b1op}.sync${aligned}.${alayout}.${blayout}.${geom}${rnd}.${ptx_signature}${satf}"

    generated_items = []

    for op, alayout, blayout, rnd, satf in product(
        get_wmma_ops(),
        ["row", "col"],
        ["row", "col"],
        [".rn", ".rz", ".rm", ".rp", ""],
        [".satfinite", ""],
    ):
        if not is_wmma_variant_supported(op, alayout, blayout, rnd, satf):
            continue

        for b1op in get_b1_ops(op.a.mma_type.ptx_type):
            params = {
                "aligned": ".aligned" if ptx_version >= 63 else "",
                "alayout": alayout,
                "blayout": blayout,
                "intrinsic_signature": wmma_signature(op),
                "ptx_signature": wmma_ptx_signature(op),
                "satf": satf,
                "rnd": rnd,
                "geom": op.a.geom,
                "b1op": b1op,
            }

            intrinsic_template = wmma_intrinsic_template
            instruction_template = wmma_instruction_template

            generated_items.append(
                common_mma_test_gen(
                    params, op, intrinsic_template, instruction_template
                )
            )

    return generated_items


def gen_mma_tests():
    mma_intrinsic_template = "llvm.nvvm.mma${b1op}.${geom}.${alayout}.${blayout}${satf}.${intrinsic_signature}"
    mma_instruction_template = "mma.sync${aligned}.${geom}.${alayout}.${blayout}${satf}.${ptx_signature}${b1op}"

    generated_items = []

    for op, alayout, blayout, satf in product(
        get_mma_ops(), ["row", "col"], ["row", "col"], [".satfinite", ""]
    ):
        if not is_mma_variant_supported(op, alayout, blayout, satf):
            continue

        for b1op in get_b1_ops(op.a.mma_type.ptx_type):
            params = {
                "aligned": ".aligned" if ptx_version >= 63 else "",
                "alayout": alayout,
                "blayout": blayout,
                "intrinsic_signature": mma_signature(op),
                "ptx_signature": mma_ptx_signature(op),
                "satf": satf,
                "geom": op.a.geom,
                "b1op": b1op,
            }

            intrinsic_template = mma_intrinsic_template
            instruction_template = mma_instruction_template

            generated_items.append(
                common_mma_test_gen(
                    params, op, intrinsic_template, instruction_template
                )
            )

    return generated_items


# Append complete list of intrinsics and instructions we've generated tests for.
# Generate set of checks to verify that we did generate a sensible set of
# tests for the given combination of PTX and SM variants.
def gen_check_unsupported_ops(items):
    print(
        "; Complete list of intrinsics supported by PTX%d on sm_%d"
        % (ptx_version, gpu_arch)
    )
    print("; INTRINSICS: {{^; INTRINSICS_LIST_BEGIN}}")
    print(
        """

; NOEXTGEOM-NOT: {{m8n32|m32n8}}
; NOINT-NOT: .{{s32|s8}}
; NOSUBINT-NOT: {{s4|u4|b1}}
; NOMMA-NOT: .m8n8k4.
; NOALTFLOAT-NOT: .{{bf16|tf32}}
; NODOUBLE-NOT: .f64
; NOLDMATRIX-NOT: ldmatrix.sync.aligned

; M16N16-DAG: m16n16k16.load.{{[ab].*}}.f16.p
; M16N16-DAG: m16n16k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
; M16N16-DAG: m16n16k16.mma.{{.*}}.f16.f32
; M16N16-DAG: m16n16k16.mma.{{.*}}.f32.f16
; M16N16-DAG: m16n16k16.mma.{{.*}}.f16.f16
; M16N16-DAG: m16n16k16.mma.{{.*}}.f32.f32

; PTX61 adds support for m32n8k16/m8n32k16 geometries.
; EXTGEOM-DAG: m32n8k16.load.{{[ab].*}}.f16.p
; EXTGEOM-DAG: m32n8k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f16.f32
; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f32.f16
; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f16.f16
; EXTGEOM-DAG: m32n8k16.mma.{{.*}}.f32.f32

; EXTGEOM-DAG: m8n32k16.load.{{[ab].*}}.f16.p
; EXTGEOM-DAG: m8n32k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f16.f32
; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f32.f16
; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f16.f16
; EXTGEOM-DAG: m8n32k16.mma.{{.*}}.f32.f32

; INT-DAG: m16n16k16.load.{{[ab].*}}.s8.p
; INT-DAG: m8n32k16.load.{{[ab].*}}.s8.p
; INT-DAG: m32n8k16.load.{{[ab].*}}.s8.p
; INT-DAG: m16n16k16.load.{{[ab].*}}.u8.p
; INT-DAG: m8n32k16.load.{{[ab].*}}.u8.p
; INT-DAG: m32n8k16.load.{{[ab].*}}.u8.p
; INT-DAG: m32n8k16.{{load|store}}.{{[cd].*\.s32}}.p
; INT-DAG: m16n16k16.mma.{{.*}}.u8
; INT-DAG: m16n16k16.mma.{{.*}}.s8
; INT-DAG: m8n32k16.mma.{{.*}}.u8
; INT-DAG: m8n32k16.mma.{{.*}}.s8
; INT-DAG: m32n8k16.mma.{{.*}}.u8
; INT-DAG: m32n8k16.mma.{{.*}}.s8

; SUBINT-DAG: m8n8k128.load.{{[ab].*}}.b1.p
; SUBINT-DAG: m8n8k32.load.{{[ab].*}}.s4.p
; SUBINT-DAG: m8n8k32.load.{{[ab].*}}.u4.p
; SUBINT-DAG: m8n8k128.{{load|store}}.{{[cd].*\.s32}}.p
; SUBINT-DAG: m8n8k32.{{load|store}}.{{[cd].*\.s32}}.p
; SUBINT-DAG: m8n8k32.mma.{{.*}}.u4
; SUBINT-DAG: m8n8k32.mma.{{.*}}.s4
; SUBINT-DAG: m8n8k128.mma.{{.*}}.b1

; ALTFLOAT-DAG: m16n16k16.load.{{[ab].*}}.bf16.p
; ALTFLOAT-DAG: m8n32k16.load.{{[ab].*}}.bf16.p
; ALTFLOAT-DAG: m32n8k16.load.{{[ab].*}}.bf16.p
; ALTFLOAT-DAG: m16n16k8.load.{{[ab].*}}.tf32.p
; ALTFLOAT-DAG: m16n16k16.mma.{{.*}}.bf16
; ALTFLOAT-DAG: m8n32k16.mma.{{.*}}.bf16
; ALTFLOAT-DAG: m32n8k16.mma.{{.*}}.bf16
; ALTFLOAT-DAG: m16n16k8.mma.{{.*}}.tf32

; DOUBLE-DAG: m8n8k4.load.{{[abc].*}}.f64.p
; DOUBLE-DAG: m8n8k4.store.d.{{.*}}.f64.p
; DOUBLE-DAG: m8n8k4.mma.{{.*}}.f64

; MMA-DAG: mma.m8n8k4.{{.*}}.f16.f32
; MMA-DAG: mma.m8n8k4.{{.*}}.f32.f16
; MMA-DAG: mma.m8n8k4.{{.*}}.f16.f16
; MMA-DAG: mma.m8n8k4.{{.*}}.f32.f32

; PTX65MMA-DAG: mma.m16n8k8.row.col.f16.f16
; PTX65MMA-DAG: mma.m16n8k8.row.col.f32.f32
; PTX65MMA-DAG: mma.m8n8k16.row.col{{.*}}.u8.u8
; PTX65MMA-DAG: mma.m8n8k16.row.col{{.*}}.s8.s8
; PTX65MMA-DAG: mma.m8n8k16.row.col{{.*}}.s8.u8
; PTX65MMA-DAG: mma.m8n8k16.row.col{{.*}}.u8.s8
; PTX65MMA-DAG: mma.m8n8k32.row.col{{.*}}.u4.u4
; PTX65MMA-DAG: mma.m8n8k32.row.col{{.*}}.s4.s4
; PTX65MMA-DAG: mma.m8n8k32.row.col{{.*}}.s4.u4
; PTX65MMA-DAG: mma.m8n8k32.row.col{{.*}}.u4.s4

; PTX65LDMATRIX-DAG: ldmatrix.sync.aligned.m8n8.x1.b16
; PTX65LDMATRIX-DAG: ldmatrix.sync.aligned.m8n8.x2.b16
; PTX65LDMATRIX-DAG: ldmatrix.sync.aligned.m8n8.x4.b16
; PTX65LDMATRIX-DAG: ldmatrix.sync.aligned.m8n8.x1.trans.b16
; PTX65LDMATRIX-DAG: ldmatrix.sync.aligned.m8n8.x2.trans.b16
; PTX65LDMATRIX-DAG: ldmatrix.sync.aligned.m8n8.x4.trans.b16
; PTX65LDMATRIX-DAG: ldmatrix.sync.aligned.m8n8.x1.shared.b16
; PTX65LDMATRIX-DAG: ldmatrix.sync.aligned.m8n8.x2.shared.b16
; PTX65LDMATRIX-DAG: ldmatrix.sync.aligned.m8n8.x4.shared.b16
; PTX65LDMATRIX-DAG: ldmatrix.sync.aligned.m8n8.x1.trans.shared.b16
; PTX65LDMATRIX-DAG: ldmatrix.sync.aligned.m8n8.x2.trans.shared.b16
; PTX65LDMATRIX-DAG: ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16

; PTX71MMA-DAG: mma.m8n8k4.row.col.f64
; PTX71MMA-DAG: mma.m16n8k4.row.col.tf32
; PTX71MMA-DAG: mma.m16n8k8.row.col.tf32
; PTX71MMA-DAG: mma.m16n8k16.row.col.bf16
; PTX71MMA-DAG: mma.m16n8k8.row.col.bf16
; PTX71MMA-DAG: mma.m16n8k16.row.col.f16.f16
; PTX71MMA-DAG: mma.m16n8k16.row.col.f32.f32
; PTX71MMA-DAG: mma.m16n8k16.row.col{{.*}}.u8.u8
; PTX71MMA-DAG: mma.m16n8k16.row.col{{.*}}.s8.s8
; PTX71MMA-DAG: mma.m16n8k16.row.col{{.*}}.s8.u8
; PTX71MMA-DAG: mma.m16n8k16.row.col{{.*}}.u8.s8
; PTX71MMA-DAG: mma.m16n8k32.row.col{{.*}}.u8.u8
; PTX71MMA-DAG: mma.m16n8k32.row.col{{.*}}.s8.s8
; PTX71MMA-DAG: mma.m16n8k32.row.col{{.*}}.s8.u8
; PTX71MMA-DAG: mma.m16n8k32.row.col{{.*}}.u8.s8
; PTX71MMA-DAG: mma.m16n8k32.row.col{{.*}}.u4.u4
; PTX71MMA-DAG: mma.m16n8k32.row.col{{.*}}.s4.s4
; PTX71MMA-DAG: mma.m16n8k32.row.col{{.*}}.s4.u4
; PTX71MMA-DAG: mma.m16n8k32.row.col{{.*}}.u4.s4
; PTX71MMA-DAG: mma.m16n8k64.row.col{{.*}}.u4.u4
; PTX71MMA-DAG: mma.m16n8k64.row.col{{.*}}.s4.s4
; PTX71MMA-DAG: mma.m16n8k64.row.col{{.*}}.s4.u4
; PTX71MMA-DAG: mma.m16n8k64.row.col{{.*}}.u4.s4
; PTX71MMA-DAG: mma.and.popc.m8n8k128.row.col.b1
; PTX71MMA-DAG: mma.xor.popc.m8n8k128.row.col.b1
; PTX71MMA-DAG: mma.and.popc.m16n8k128.row.col.b1
; PTX71MMA-DAG: mma.xor.popc.m16n8k128.row.col.b1
; PTX71MMA-DAG: mma.and.popc.m16n8k256.row.col.b1
; PTX71MMA-DAG: mma.xor.popc.m16n8k256.row.col.b1
"""
    )

    print("; INTRINSICS_LIST_BEGIN")
    for intrinsic, instruction in sorted(items):
        print("; ", intrinsic, " -> ", instruction, "")
    print("; INTRINSICS_LIST_END")
    print("; INTRINSICS: ; INTRINSICS_LIST_END")


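# Generate all test cases plus the combined list of intrinsics/instructions that
# the INTRINSICS FileCheck prefix verifies.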
def main():
    items = gen_wmma_load_tests()
    items += gen_wmma_store_tests()
    items += gen_ldmatrix_tests()
    items += gen_wmma_mma_tests()
    items += gen_mma_tests()
    gen_check_unsupported_ops(items)


parser = argparse.ArgumentParser()
parser.add_argument("--ptx", type=int, default=60)
parser.add_argument("--gpu-arch", type=int, default=70)
args = parser.parse_args()
ptx_version = args.ptx
gpu_arch = args.gpu_arch

main()