test/CodeGen/AMDGPU/mad_int24.ll

   1 ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=FUNC
   2 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GCN --check-prefix=FUNC
   3 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
   4 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
   5
   6 ; FUNC-LABEL: {{^}}i32_mad24:
   7 ; Signed 24-bit multiply is not supported on pre-Cayman GPUs, so the redwood run expects MULLO_INT rather than a 24-bit multiply-add.
   8 ; EG: MULLO_INT
   9 ; Make sure we aren't masking the inputs; no AND may appear between the function label and the 24-bit multiply-add in the cayman and amdgcn runs.
  10 ; CM-NOT: AND
  11 ; CM: MULADD_INT24
  12 ; GCN-NOT: and
  13 ; GCN: v_mad_i32_i24
  14 define amdgpu_kernel void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { ; stores sext24(%a) * sext24(%b) + %c to %out, where sext24(x) is the low 24 bits of x sign-extended to i32
  15 entry:
  16   %0 = shl i32 %a, 8 ; move bit 23 of %a into the sign bit
  17   %a_24 = ashr i32 %0, 8 ; arithmetic shift back: %a_24 = sext24(%a)
  18   %1 = shl i32 %b, 8 ; same shift pair for %b
  19   %b_24 = ashr i32 %1, 8 ; %b_24 = sext24(%b)
  20   %2 = mul i32 %a_24, %b_24 ; multiply of two sign-extended 24-bit operands
  21   %3 = add i32 %2, %c ; mul feeding an add is the multiply-add shape the checks above look for
  22   store i32 %3, i32 addrspace(1)* %out ; write the result to the addrspace(1) output pointer
  23   ret void
  24 }
  25
  26 ; GCN-LABEL: {{^}}mad24_known_bits_destroyed:
  27 ; GCN: s_waitcnt
  28 ; GCN-NEXT: v_mad_i32_i24
  29 ; GCN-NEXT: v_mul_i32_i24
  30 ; GCN-NEXT: s_setpc_b64
  31 define i32 @mad24_known_bits_destroyed(i32 %a, i32 %b, i32 %c) { ; computes sext24(sext24(%a) * sext24(%b) + %c) * sext24(%a), where sext24(x) is the low 24 bits of x sign-extended to i32
  32 ; The checks above match consecutive output lines, so the whole body must be one 24-bit mad followed by one 24-bit mul; any extra instruction for the re-extensions below would fail the test.
  33   %shl.0 = shl i32 %a, 8 ; move bit 23 of %a into the sign bit
  34   %sra.0 = ashr i32 %shl.0, 8 ; %sra.0 = sext24(%a)
  35   %shl.1 = shl i32 %b, 8
  36   %sra.1 = ashr i32 %shl.1, 8 ; %sra.1 = sext24(%b)
  37
  38   %mul0 = mul nsw i32 %sra.0, %sra.1 ; first multiply, both operands sign-extended from 24 bits
  39   %add0 = add nsw i32 %mul0, %c ; together with %mul0 this is the expected v_mad_i32_i24
  40
  41   %shl.2 = shl i32 %add0, 8 ; re-extend the multiply-add result from 24 bits
  42   %sra.2 = ashr i32 %shl.2, 8 ; %sra.2 = sext24(%add0)
  43
  44   %shl.3 = shl i32 %sra.0, 8 ; re-extend %sra.0, which is already sext24(%a)
  45   %sra.3 = ashr i32 %shl.3, 8 ; %sra.3 = sext24(%sra.0)
  46
  47   %mul1 = mul nsw i32 %sra.2, %sra.3 ; second multiply; expected to be the v_mul_i32_i24
  48   ret i32 %mul1
  49 }
  50
  51 ; GCN-LABEL: {{^}}mad24_intrin_known_bits_destroyed:
  52 ; GCN: s_waitcnt
  53 ; GCN-NEXT: v_mad_i32_i24
  54 ; GCN-NEXT: v_mul_i32_i24
  55 ; GCN-NEXT: s_setpc_b64
  56 define i32 @mad24_intrin_known_bits_destroyed(i32 %a, i32 %b, i32 %c) { ; same shape as @mad24_known_bits_destroyed, but the first multiply is a call to @llvm.amdgcn.mul.i24 instead of a plain mul
  57   %shl.0 = shl i32 %a, 8 ; move bit 23 of %a into the sign bit
  58   %sra.0 = ashr i32 %shl.0, 8 ; %sra.0 = low 24 bits of %a sign-extended to i32
  59   %shl.1 = shl i32 %b, 8
  60   %sra.1 = ashr i32 %shl.1, 8 ; %sra.1 = low 24 bits of %b sign-extended to i32
  61 ; The checks above match consecutive output lines, so the intrinsic call plus the add below must still become a single v_mad_i32_i24.
  62   %mul0 = call i32 @llvm.amdgcn.mul.i24(i32 %sra.0, i32 %sra.1) ; first multiply via the intrinsic
  63   %add0 = add nsw i32 %mul0, %c ; add on top of the intrinsic result
  64
  65   %shl.2 = shl i32 %add0, 8 ; re-extend the multiply-add result from 24 bits
  66   %sra.2 = ashr i32 %shl.2, 8
  67
  68   %shl.3 = shl i32 %sra.0, 8 ; re-extend %sra.0, which is already sign-extended from 24 bits
  69   %sra.3 = ashr i32 %shl.3, 8
  70
  71   %mul1 = mul nsw i32 %sra.2, %sra.3 ; second multiply; expected to be the v_mul_i32_i24
  72   ret i32 %mul1
  73 }
  74
  75 ; Make sure no unnecessary BFEs are emitted in the loop; the checks below require four v_mad_i32_i24 with no v_bfe before, between, or after them.
  76 ; GCN-LABEL: {{^}}mad24_destroyed_knownbits_2:
  77 ; GCN-NOT: v_bfe
  78 ; GCN: v_mad_i32_i24
  79 ; GCN-NOT: v_bfe
  80 ; GCN: v_mad_i32_i24
  81 ; GCN-NOT: v_bfe
  82 ; GCN: v_mad_i32_i24
  83 ; GCN-NOT: v_bfe
  84 ; GCN: v_mad_i32_i24
  85 ; GCN-NOT: v_bfe
  86 define void @mad24_destroyed_knownbits_2(i32 %arg, i32 %arg1, i32 %arg2, i32 addrspace(1)* %arg3) { ; loop of four chained multiply-adds; %arg1 is the trip-count bound, %arg3 the store destination
  87 bb:
  88   br label %bb6 ; entry only jumps into the loop
  89
  90 bb5:                                              ; preds = %bb6
  91   ret void
  92
  93 bb6:                                              ; preds = %bb6, %bb
  94   %tmp = phi i32 [ %tmp27, %bb6 ], [ 0, %bb ] ; loop counter, starts at 0
  95   %tmp7 = phi i32 [ %arg2, %bb6 ], [ 1, %bb ] ; 1 on the first iteration, %arg2 on every later one
  96   %tmp8 = phi i32 [ %tmp26, %bb6 ], [ %arg, %bb ] ; %arg on entry, then the previous iteration's final result
  97   %tmp9 = shl i32 %tmp7, 8 ; shl 8 followed by ashr 8 sign-extends the low 24 bits to i32
  98   %tmp10 = ashr exact i32 %tmp9, 8 ; 'exact' holds because the shl left the low 8 bits zero
  99   %tmp11 = shl i32 %tmp8, 8
 100   %tmp12 = ashr exact i32 %tmp11, 8 ; %tmp12 = %tmp8 sign-extended from 24 bits
 101   %tmp13 = mul nsw i32 %tmp12, %tmp10
 102   %tmp14 = add nsw i32 %tmp13, %tmp7 ; multiply-add 1 of 4
 103   %tmp15 = shl i32 %tmp14, 8 ; re-extend the previous multiply-add result from 24 bits
 104   %tmp16 = ashr exact i32 %tmp15, 8
 105   %tmp17 = mul nsw i32 %tmp16, %tmp10
 106   %tmp18 = add nsw i32 %tmp17, %tmp14 ; multiply-add 2 of 4
 107   %tmp19 = shl i32 %tmp18, 8 ; re-extend again
 108   %tmp20 = ashr exact i32 %tmp19, 8
 109   %tmp21 = mul nsw i32 %tmp20, %tmp16
 110   %tmp22 = add nsw i32 %tmp21, %tmp18 ; multiply-add 3 of 4
 111   %tmp23 = shl i32 %tmp22, 8 ; re-extend again
 112   %tmp24 = ashr exact i32 %tmp23, 8
 113   %tmp25 = mul nsw i32 %tmp24, %tmp20
 114   %tmp26 = add nsw i32 %tmp25, %tmp22 ; multiply-add 4 of 4; also fed back through the %tmp8 phi
 115   store i32 %tmp26, i32 addrspace(1)* %arg3 ; each iteration stores its final result to %arg3
 116   %tmp27 = add nuw i32 %tmp, 1 ; advance the loop counter
 117   %tmp28 = icmp eq i32 %tmp27, %arg1 ; exit once the counter reaches %arg1
 118   br i1 %tmp28, label %bb5, label %bb6
 119 }
 120
 121 declare i32 @llvm.amdgcn.mul.i24(i32, i32) ; intrinsic called by @mad24_intrin_known_bits_destroyed for its first multiply