# This test generates all variants of wmma intrinsics and verifies that LLVM
# generates correct instructions for them.

# Check all variants of instructions supported by PTX60 on SM70
# RUN: python %s --ptx=60 --gpu-arch=70 > %t-ptx60-sm_70.ll
# RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \
# RUN:           --check-prefixes=INTRINSICS,PTX60,SM70
# RUN: FileCheck %t-ptx60-sm_70.ll < %t-ptx60-sm_70.ll \
# RUN:           --check-prefixes=INTRINSICS,PTX60U,SM70U
# RUN: llc < %t-ptx60-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 \
# RUN:           | FileCheck %t-ptx60-sm_70.ll

# Check all variants of instructions supported by PTX61 on SM70
# RUN: python %s --ptx=61 --gpu-arch=70 > %t-ptx61-sm_70.ll
# RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \
# RUN:           --check-prefixes=INTRINSICS,PTX60,PTX61,SM70
# RUN: FileCheck %t-ptx61-sm_70.ll < %t-ptx61-sm_70.ll \
# RUN:           --check-prefixes=INTRINSICS,PTX61U,SM70U
# RUN: llc < %t-ptx61-sm_70.ll -march=nvptx64 -mcpu=sm_70 -mattr=+ptx61 \
# RUN:           | FileCheck %t-ptx61-sm_70.ll

# Check all variants of instructions supported by PTX63 on SM72
# RUN: python %s --ptx=63 --gpu-arch=72 > %t-ptx63-sm_72.ll
# RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \
# RUN:           --check-prefixes=INTRINSICS,PTX60,PTX61,PTX63,SM70,SM72
# RUN: FileCheck %t-ptx63-sm_72.ll < %t-ptx63-sm_72.ll \
# RUN:           --check-prefixes=INTRINSICS,PTX63U,SM72U
# RUN: llc < %t-ptx63-sm_72.ll -march=nvptx64 -mcpu=sm_72 -mattr=+ptx63 \
# RUN:           | FileCheck %t-ptx63-sm_72.ll

# Check all variants of instructions supported by PTX63 on SM75
# RUN: python %s --ptx=63 --gpu-arch=75 > %t-ptx63-sm_75.ll
# RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \
# RUN:           --check-prefixes=INTRINSICS,PTX60,PTX61,PTX63,SM70,SM72,SM75
# RUN: FileCheck %t-ptx63-sm_75.ll < %t-ptx63-sm_75.ll \
# RUN:           --check-prefixes=INTRINSICS,PTX63U,SM75U
# RUN: llc < %t-ptx63-sm_75.ll -march=nvptx64 -mcpu=sm_75 -mattr=+ptx63 \
# RUN:           | FileCheck %t-ptx63-sm_75.ll

from __future__ import print_function

import argparse
from itertools import product
from string import Template

class MMAType:
  def __init__(self, ptx_type):
    self.ptx_type = ptx_type
    self.llvm_type = {
        "f16" : "<2 x half>",
        "f32" : "float",
        "s32" : "i32",
        "s8"  : "i32",
        "u8"  : "i32",
        "s4"  : "i32",
        "u4"  : "i32",
        "b1"  : "i32",
    }[ptx_type]

    self.ptx_reg_pattern = {
        "f16" : "%hh[0-9]+",
        "f32" : "%f[0-9]+",
    }.get(ptx_type, "%r[0-9]+")

  def __repr__(self):
    return "%s/%s" % (self.ptx_type, self.llvm_type)

class MMAFrag:
  def __init__(self, geom, frag, ptx_elt_type):
    self.geom = geom
    self.frag = frag
    self.mma_type = MMAType(ptx_elt_type)
    self.nregs = {
        # fp16 -> fp16/fp32 @ m16n16k16/m8n32k16/m32n8k16
        "a:f16" : 8,
        "b:f16" : 8,
        "c:f16" : 4,
        "d:f16" : 4,
        "c:f32" : 8,
        "d:f32" : 8,
    }.get("%s:%s" % (frag, ptx_elt_type), {
        # u8/s8 -> s32 @ m16n16k16/m8n32k16/m32n8k16
        "m16n16k16:a:u8" : 2,
        "m16n16k16:a:s8" : 2,
        "m16n16k16:b:u8" : 2,
        "m16n16k16:b:s8" : 2,
        "m16n16k16:c:s32" : 8,
        "m16n16k16:d:s32" : 8,

        "m8n32k16:a:u8" : 1,
        "m8n32k16:a:s8" : 1,
        "m8n32k16:b:u8" : 4,
        "m8n32k16:b:s8" : 4,
        "m8n32k16:c:s32" : 8,
        "m8n32k16:d:s32" : 8,

        "m32n8k16:a:u8" : 4,
        "m32n8k16:a:s8" : 4,
        "m32n8k16:b:u8" : 1,
        "m32n8k16:b:s8" : 1,
        "m32n8k16:c:s32" : 8,
        "m32n8k16:d:s32" : 8,

        # u4/s4/b1 -> s32 @ m8n8k32 (u4/s4), m8n8k128 (b1)
        "m8n8k128:a:b1" : 1,
        "m8n8k32:a:u4" : 1,
        "m8n8k32:a:s4" : 1,
        "m8n8k128:b:b1" : 1,
        "m8n8k32:b:u4" : 1,
        "m8n8k32:b:s4" : 1,
        "m8n8k128:c:s32" : 2,
        "m8n8k128:d:s32" : 2,
        "m8n8k32:c:s32" : 2,
        "m8n8k32:d:s32" : 2,
    }.get("%s:%s:%s" % (geom, frag, ptx_elt_type), None))
    assert(self.nregs)

  def __repr__(self):
    return "%s:%s:%s%s" % (self.geom, self.frag, self.mma_type,
                           "" if self.nregs == 1 else ("*%d" % self.nregs))

class MMAOp:
  def __init__(self, a, b, c, d):
    self.a = a
    self.b = b
    self.c = c
    self.d = d

  def __repr__(self):
    return ("{A:%s, B:%s, C:%s, D:%s}" % (self.a, self.b, self.c, self.d))

def make_mma_ops(geoms, types_a, types_b, types_c, types_d):
  ops = []
  for geom, type_a, type_c in product(geoms, types_a, types_c):
    for type_b, type_d in product(types_b if types_b else [type_a],
                                  types_d if types_d else [type_c]):
      ops.append(MMAOp(MMAFrag(geom, "a", type_a),
                       MMAFrag(geom, "b", type_b),
                       MMAFrag(geom, "c", type_c),
                       MMAFrag(geom, "d", type_d)))
  return ops

def make_ldst_ops(geoms, frags, types):
  return [MMAFrag(geom, frag, ptx_type) for (geom, frag, ptx_type)
          in product(geoms, frags, types)]

def get_mma_ops():
  return (make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
                       ["f16"], [], ["f16", "f32"], ["f16", "f32"]) +
          make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
                       ["s8", "u8"], [], ["s32"], []) +
          make_mma_ops(["m8n8k32"],
                       ["s4", "u4"], [], ["s32"], []) +
          make_mma_ops(["m8n8k128"],
                       ["b1"], [], ["s32"], []))

def get_ldst_ops(kind):
  ldst_ops = (make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
                            ["a", "b"], ["f16", "u8", "s8"]) +
              make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
                            ["c", "d"], ["f16", "f32", "s32"]) +
              make_ldst_ops(["m8n8k32"], ["a", "b"], ["s4", "u4"]) +
              make_ldst_ops(["m8n8k128"], ["a", "b"], ["b1"]) +
              make_ldst_ops(["m8n8k32", "m8n8k128"], ["c", "d"], ["s32"]))
  return [x for x in ldst_ops if (x.frag == "d") == (kind == "store")]

def is_geom_supported(geom):
  # geometries for FP and ints.
  if geom in ["m8n32k16", "m32n8k16"]:
    return ptx_version >= 61
  # geometries for sub-ints.
  if geom in ["m8n8k32", "m8n8k128"]:
    return ptx_version >= 63 and gpu_arch >= 75
  if geom == "m16n16k16":
    return ptx_version >= 60
  assert(False)  # Unexpected geometry.

def is_type_supported(ptx_type):
  if ptx_type in ["s8", "u8", "s32"]:
    return ptx_version >= 63 and gpu_arch >= 72
  if ptx_type in ["s4", "u4", "b1"]:
    return ptx_version >= 63 and gpu_arch >= 75
  return ptx_version >= 60 and gpu_arch >= 70

def is_mma_variant_supported(op, layout_a, layout_b, satf):
  if not (is_type_supported(op.a.mma_type.ptx_type)
          and is_geom_supported(op.a.geom)):
    return False
  # sub-integer ops require row/col layout; b1 does not support satf.
  if op.a.mma_type.ptx_type in ["s4", "u4", "b1"]:
    if op.a.mma_type.ptx_type == "b1" and satf:
      return False
    return layout_a == "row" and layout_b == "col"
  return True

def is_ldst_variant_supported(frag, layout):
  if not (is_type_supported(frag.mma_type.ptx_type)
          and is_geom_supported(frag.geom)):
    return False
  if frag.mma_type.ptx_type in ["s4", "u4", "b1"]:
    # sub-integer types require sm_75 and ptx63, and row/col layout for a/b.
    return ((frag.frag == "a" and layout == "row")
            or (frag.frag == "b" and layout == "col")
            or frag.frag in ["c", "d"])
  return True

def make_wmma_slice_ty(frag):
  return [frag.mma_type.llvm_type] * frag.nregs

def make_wmma_ld_ret_ty(frag):
  results = make_wmma_slice_ty(frag)
  if len(results) == 1:
    return "%s" % results[0]
  return "{%s}" % ", ".join(results)
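# E.g. an s32 c/d fragment of m8n8k128 has 2 registers, so (assuming s32 maps
# to the i32 LLVM type, as in the table above) this yields "{i32, i32}"; a
# single-register fragment is returned as a bare scalar type instead.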

# returns the numeric address space for a given space suffix.
def get_aspace(space):
  space_map = {
      ""        : 0,
      ".global" : 1,
      ".shared" : 3,
  }
  return space_map[space]

def get_pspace(space):
  return "p%di8" % get_aspace(space)

def check_pattern(frag):
  return "{{%s}}" % ", *".join([frag.mma_type.ptx_reg_pattern] * frag.nregs)
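# E.g. for the same 2-register s32 fragment this produces the FileCheck
# pattern "{{%r[0-9]+, *%r[0-9]+}}", matching the register tuple printed by llc.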

known_geoms = ["m16n16k16", "m8n32k16", "m32n8k16"]

def gen_wmma_load_tests():
  load_template = """
declare ${ret_ty} @${intrinsic}(i8 ${as}* %src ${extra_args});

; CHECK-LABEL: .func {{.*}}test_${function}(
define ${ret_ty} @test_${function}(i8 ${as}* %src ${extra_args}) {
; CHECK: ${instruction}
; CHECK: {${check_result}}
; CHECK: [%rd{{[0-9]+}}]${stride_pattern}
  %v0 = call ${ret_ty} @${intrinsic}(i8 ${as}* %src ${extra_args});
  ret ${ret_ty} %v0;
}

; CHECK-LABEL: .func{{.*}}test_${function}_o(
define ${ret_ty} @test_${function}_o(i8 ${as}* %src ${extra_args}) {
; CHECK: ${instruction}
; CHECK: {${check_result}}
; CHECK: [%rd{{[0-9]+}}+128]${stride_pattern}
  %src1 = getelementptr i8, i8 ${as}* %src, i32 128;
  %v0 = call ${ret_ty} @${intrinsic}(i8 ${as}* %src1 ${extra_args});
  ret ${ret_ty} %v0;
}
"""
  intrinsic_template = "llvm.nvvm.wmma.${geom}.load.${abc}.${layout}${stride}.${itype}.${pspace}"
  instruction_template = "wmma.load.${abc}.sync${aligned}.${layout}.${geom}${space}.${itype}"

  generated_items = []

  for frag, layout, space, stride in product(
      get_ldst_ops("load"),
      ["row", "col"],
      ["", ".shared", ".global"],
      ["", ".stride"]):
    if not is_ldst_variant_supported(frag, layout):
      continue
276 "aligned" : ".aligned" if ptx_version
>= 63 else "",
280 "itype" : frag
.mma_type
.ptx_type
,
281 "pspace" : get_pspace(space
),
282 "as" : "addrspace(%d)" % get_aspace(space
),
    test_params = params
    test_params["intrinsic"] = Template(intrinsic_template).substitute(params)
    test_params["function"] = test_params["intrinsic"].replace(".", "_")
    test_params["instruction"] = Template(instruction_template).substitute(params)
    test_params["ret_ty"] = make_wmma_ld_ret_ty(frag)
    test_params["check_result"] = check_pattern(frag)

    if stride:
      test_params["extra_args"] = ", i32 %stride"
      test_params["stride_pattern"] = ", %r{{[0-9]+}}"
    else:
      test_params["extra_args"] = ""
      test_params["stride_pattern"] = ""

    print(Template(load_template).substitute(test_params))

    generated_items.append((test_params["intrinsic"],
                            test_params["instruction"]))

  return generated_items

def make_wmma_slice_args(frag):
  return ", ".join(["%s %%%s%d" % (t, frag.frag, i) for i, t
                    in enumerate(make_wmma_slice_ty(frag))])

def gen_wmma_store_tests():
  store_template = """
declare void @${intrinsic}(i8 ${as}* %src, ${args}${extra_args});

; CHECK-LABEL: .func {{.*}}test_${function}(
define void @test_${function}(i8 ${as}* %src, ${args}${extra_args}) {
; CHECK: ${instruction} {{.*}}[%rd{{[0-9+]}}
; CHECK: {${check_args}}
; CHECK: ${stride_pattern}
  call void @${intrinsic}(i8 ${as}* %src, ${args} ${extra_args});
  ret void
}

; CHECK-LABEL: .func{{.*}}test_${function}_o(
define void @test_${function}_o(i8 ${as}* %src, ${args}${extra_args}) {
; CHECK: ${instruction} {{.*}}[%rd{{[0-9+]}}+128]
; CHECK: ${check_args}
; CHECK: ${stride_pattern}
  %src1 = getelementptr i8, i8 ${as}* %src, i32 128;
  call void @${intrinsic}(i8 ${as}* %src1, ${args}${extra_args});
  ret void
}
"""
  intrinsic_template = "llvm.nvvm.wmma.${geom}.store.${abc}.${layout}${stride}.${itype}.${pspace}"
  instruction_template = "wmma.store.${abc}.sync${aligned}.${layout}.${geom}${space}.${itype}"

  generated_items = []

  for frag, layout, space, stride in product(
      get_ldst_ops("store"),
      ["row", "col"],
      ["", ".shared", ".global"],
      ["", ".stride"]):
    if not is_ldst_variant_supported(frag, layout):
      continue
350 "aligned" : ".aligned" if ptx_version
>= 63 else "",
354 "itype" : frag
.mma_type
.ptx_type
,
355 "pspace" : get_pspace(space
),
356 "as" : "addrspace(%d)" % get_aspace(space
),
    test_params = params
    test_params["intrinsic"] = Template(intrinsic_template).substitute(params)
    test_params["function"] = test_params["intrinsic"].replace(".", "_")
    test_params["instruction"] = Template(instruction_template).substitute(params)
    test_params["ret_ty"] = make_wmma_ld_ret_ty(frag)
    test_params["check_args"] = check_pattern(frag)

    if stride:
      test_params["extra_args"] = ", i32 %stride"
      test_params["stride_pattern"] = ", %r{{[0-9]+}};"
    else:
      test_params["extra_args"] = ""
      test_params["stride_pattern"] = ";"
    test_params["args"] = make_wmma_slice_args(frag)

    print(Template(store_template).substitute(test_params))
    generated_items.append((test_params["intrinsic"],
                            test_params["instruction"]))

  return generated_items

def mma_signature(op):
  if op.a.mma_type.ptx_type in ["s8", "u8", "s4", "u4", "b1"]:
    # int and sub-int ops are identified by input type.
    return op.a.mma_type.ptx_type
  # the rest are FP ops, identified by accumulator & result type.
  return "%s.%s" % (op.d.mma_type.ptx_type, op.c.mma_type.ptx_type)

def mma_ptx_signature(op):
  if op.a.mma_type.ptx_type in ["s8", "u8", "s4", "u4", "b1"]:
    # int and sub-int instructions encode all four types as D.A.B.C.
    return ".".join(x.mma_type.ptx_type for x in (op.d, op.a, op.b, op.c))
  # the rest are FP instructions, which encode only D.C.
  return "%s.%s" % (op.d.mma_type.ptx_type, op.c.mma_type.ptx_type)
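# E.g. an s8 MMA op is encoded as "s32.s8.s8.s32" (D.A.B.C), while an
# f16 x f16 + f32 -> f32 op is encoded as just "f32.f32" (D.C).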

def gen_wmma_mma_tests():
  mma_template = """
declare ${ret_ty} @${intrinsic}(
        ${args});

; CHECK-LABEL: .func {{.*}}test_${function}(
define ${ret_ty} @test_${function}(
        ${args}) {
; CHECK: ${instruction}
; CHECK-NEXT: ${check_d}
; CHECK-NEXT: ${check_a}
; CHECK-NEXT: ${check_b}
; CHECK-NEXT: ${check_c}
  %r = call ${ret_ty} @${intrinsic}(
        ${args});
  ret ${ret_ty} %r;
}
"""
  intrinsic_template = "llvm.nvvm.wmma.${geom}.mma.${alayout}.${blayout}.${intrinsic_signature}${satf}"
  instruction_template = "wmma.mma${mma_variant}.sync${aligned}.${alayout}.${blayout}.${geom}.${ptx_signature}${satf}"

  generated_items = []

  for op, alayout, blayout, satf in product(
      get_mma_ops(),
      ["row", "col"],
      ["row", "col"],
      [".satfinite", ""]):

    if not is_mma_variant_supported(op, alayout, blayout, satf):
      continue

    params = {
        "aligned" : ".aligned" if ptx_version >= 63 else "",
        "alayout" : alayout,
        "blayout" : blayout,
        "intrinsic_signature" : mma_signature(op),
        "ptx_signature" : mma_ptx_signature(op),
        "satf" : satf,
        "geom" : op.a.geom,
        "mma_variant" : ".xor.popc" if op.a.mma_type.ptx_type == "b1" else "",
    }
    test_params = params
    test_params["intrinsic"] = Template(intrinsic_template).substitute(params)
    test_params["function"] = test_params["intrinsic"].replace(".", "_")
    test_params["instruction"] = Template(instruction_template).substitute(params)
    test_params["ret_ty"] = make_wmma_ld_ret_ty(op.d)
    test_params["check_a"] = check_pattern(op.a)
    test_params["check_b"] = check_pattern(op.b)
    test_params["check_c"] = check_pattern(op.c)
    test_params["check_d"] = check_pattern(op.d)
    args = ",\n        ".join(make_wmma_slice_args(frag)
                              for frag in (op.a, op.b, op.c))
    test_params["args"] = args
    print(Template(mma_template).substitute(test_params))
    generated_items.append((test_params["intrinsic"],
                            test_params["instruction"]))

  return generated_items

# Append the complete list of intrinsics and instructions we've generated tests
# for. Generate a set of checks to verify that we did generate a sensible set of
# tests for the given combination of PTX and SM variants.
#
# PTX<N>:  verifies that we did generate tests for the correct classes of
#          intrinsics.
# PTX<N>U: verifies that we did not generate intrinsics unsupported by that
#          PTX version.
# SM<N>:   verifies that we did generate the correct classes of instructions
#          for the SM.
# SM<N>U:  verifies that we did not generate instructions unsupported by the SM.
#
# Note that SM/PTX constraints overlap, but DAG checks do not allow overlapping
# matches. We implicitly rely on the fact that we generate multiple variants of
# most instructions and usually have enough input data to find more than one
# match of the same kind, if necessary. When that's not possible (e.g. there's
# only one m8n8k128.mma.row.col.b1), we may need to match the PTX instruction
# instead.
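#
# A generated pair in the list below looks roughly like this (for a plain
# PTX60-style load, where ${aligned} and ${space} are empty):
#   llvm.nvvm.wmma.m16n16k16.load.a.row.f16.p0i8 -> wmma.load.a.sync.row.m16n16k16.f16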

def gen_check_unsupported_ops(items):
  print("; Complete list of intrinsics supported by PTX%d on sm_%d"
        % (ptx_version, gpu_arch))
  print("; INTRINSICS: {{^; INTRINSICS_LIST_BEGIN}}")
  print("""
; PTX60-DAG: m16n16k16.load.{{[ab].*}}.f16.p
; PTX60-DAG: m16n16k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
; PTX60U-NOT: m32n8k16
; PTX60U-NOT: m8n32k16
; PTX60U-NOT: .{{s32|s[48]|u[48]|b1}}

; All features of PTX60, plus m32n8k16/m8n32k16 geometries.
; PTX61-DAG: m32n8k16.load.{{[ab].*}}.f16.p
; PTX61-DAG: m32n8k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
; PTX61-DAG: m8n32k16.load.{{[ab].*}}.f16.p
; PTX61-DAG: m8n32k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
; PTX61U-NOT: .{{s32|s[48]|u[48]|b1}}

; SM70U-NOT: .{{s32|s[48]|u[48]|b1}}

; PTX63 supports all features of PTX60+PTX61, plus support for integers.
; Alas we can't just use PTX<N> checks for that, as the available instructions
; depend on the SM: integers need sm72+ and sub-integer ops need sm75, so we
; transition to SM<N> checks.
; SM72-DAG: m16n16k16.load.{{[ab].*}}.s8.p
; SM72-DAG: m8n32k16.load.{{[ab].*}}.s8.p
; SM72-DAG: m32n8k16.load.{{[ab].*}}.s8.p
; SM72-DAG: m16n16k16.load.{{[ab].*}}.u8.p
; SM72-DAG: m8n32k16.load.{{[ab].*}}.u8.p
; SM72-DAG: m32n8k16.load.{{[ab].*}}.u8.p
; SM72-DAG: m32n8k16.{{load|store}}.{{[cd].*\.s32}}.p
; SM72U-NOT: .{{s4|u4|b1}}

; SM75-DAG: m8n8k128.load.{{[ab].*}}.b1.p
; SM75-DAG: m8n8k32.load.{{[ab].*}}.s4.p
; SM75-DAG: m8n8k32.load.{{[ab].*}}.u4.p
; SM75-DAG: m8n8k128.{{load|store}}.{{[cd].*\.s32}}.p
; SM75-DAG: m8n8k32.{{load|store}}.{{[cd].*\.s32}}.p
""")
512 print("; INTRINSICS_LIST_BEGIN")
513 for intrinsic
, instruction
in sorted(items
):
514 print("; ", intrinsic
, " -> ", instruction
,"")
515 print("; INTRINSICS_LIST_END")
516 print("; INTRINSICS: ; INTRINSICS_LIST_END")

def gen_tests():
  items = gen_wmma_load_tests()
  items += gen_wmma_store_tests()
  items += gen_wmma_mma_tests()
  gen_check_unsupported_ops(items)

parser = argparse.ArgumentParser()
parser.add_argument("--ptx", type=int, default=60)
parser.add_argument("--gpu-arch", type=int, default=70)
args = parser.parse_args()

ptx_version = args.ptx
gpu_arch = args.gpu_arch

gen_tests()
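
# Example standalone invocation (mirroring the RUN lines at the top of the file):
#   python <this script> --ptx=63 --gpu-arch=75 > wmma-ptx63-sm_75.ll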