Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / clang / test / CodeGenOpenCL / builtins-amdgcn-dl-insts-gfx11.cl
blobdc7069decaaa61c7957638fa1dedd0b95930d945
1 // REQUIRES: amdgpu-registered-target
3 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -S -emit-llvm -o - %s | FileCheck %s
// Short aliases for the scalar and vector types used in the kernel's
// parameter list.
5 typedef unsigned int uint;
// ext_vector_type(2) declares each of the following as a 2-element vector of
// the named element type (half, short, unsigned short).
6 typedef half __attribute__((ext_vector_type(2))) half2;
7 typedef short __attribute__((ext_vector_type(2))) short2;
8 typedef unsigned short __attribute__((ext_vector_type(2))) ushort2;
10 // CHECK-LABEL: @builtins_amdgcn_dl_insts
11 // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 false)
12 // CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 true)
13 // CHECK: call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %v2hA, <2 x half> %v2hB, half %hC)
14 // CHECK: call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, i16 %sC)
15 // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 false)
16 // CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 true)
17 // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false)
18 // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 true)
19 // CHECK: call i32 @llvm.amdgcn.sudot4(i1 true, i32 %A, i1 false, i32 %B, i32 %C, i1 false)
20 // CHECK: call i32 @llvm.amdgcn.sudot4(i1 false, i32 %A, i1 true, i32 %B, i32 %C, i1 true)
21 // CHECK: call i32 @llvm.amdgcn.udot8(i32 %uiA, i32 %uiB, i32 %uiC, i1 false)
22 // CHECK: call i32 @llvm.amdgcn.udot8(i32 %uiA, i32 %uiB, i32 %uiC, i1 true)
23 // CHECK: call i32 @llvm.amdgcn.sudot8(i1 false, i32 %A, i1 true, i32 %B, i32 %C, i1 false)
24 // CHECK: call i32 @llvm.amdgcn.sudot8(i1 true, i32 %A, i1 false, i32 %B, i32 %C, i1 true)
// Turn on the cl_khr_fp16 OpenCL extension for the code that follows.
25 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
26 kernel void builtins_amdgcn_dl_insts_err(
27 global float *fOut, global int *siOut, global uint *uiOut,
28 global short *sOut, global int *iOut, global half *hOut,
29 half2 v2hA, half2 v2hB, float fC, half hC,
30 short2 v2ssA, short2 v2ssB, short sC, int siA, int siB, int siC,
31 ushort2 v2usA, ushort2 v2usB, uint uiA, uint uiB, uint uiC,
32 int A, int B, int C) {
33 fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false);
34 fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true);
36 hOut[0] = __builtin_amdgcn_fdot2_f16_f16(v2hA, v2hB, hC);
38 sOut[0] = __builtin_amdgcn_fdot2_bf16_bf16(v2ssA, v2ssB, sC);
40 fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false);
41 fOut[4] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, true);
43 uiOut[2] = __builtin_amdgcn_udot4(uiA, uiB, uiC, false);
44 uiOut[3] = __builtin_amdgcn_udot4(uiA, uiB, uiC, true);
46 iOut[0] = __builtin_amdgcn_sudot4(true, A, false, B, C, false);
47 iOut[1] = __builtin_amdgcn_sudot4(false, A, true, B, C, true);
49 uiOut[4] = __builtin_amdgcn_udot8(uiA, uiB, uiC, false);
50 uiOut[5] = __builtin_amdgcn_udot8(uiA, uiB, uiC, true);
52 iOut[3] = __builtin_amdgcn_sudot8(false, A, true, B, C, false);
53 iOut[4] = __builtin_amdgcn_sudot8(true, A, false, B, C, true);