// REQUIRES: amdgpu-registered-target

// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck %s
5 typedef unsigned int uint
;
6 typedef half __attribute__
((ext_vector_type(2))) half2
;
7 typedef short __attribute__
((ext_vector_type(2))) short2
;
8 typedef unsigned short __attribute__
((ext_vector_type(2))) ushort2
;
// CHECK-LABEL: @builtins_amdgcn_dl_insts
// CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 false)
// CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 true)
// CHECK: call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %v2hA, <2 x half> %v2hB, half %hC)
// CHECK: [[s1:%[0-9]+]] = bitcast <2 x i16> %v2ssA to <2 x bfloat>
// CHECK-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %v2ssB to <2 x bfloat>
// CHECK-NEXT: [[s3:%[0-9]+]] = bitcast i16 %sC to bfloat
// CHECK-NEXT: [[d:%[0-9]+]] = tail call bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], bfloat [[s3]])
// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], float %fC, i1 false)
// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], float %fC, i1 true)
// CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false)
// CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 true)
// CHECK: call i32 @llvm.amdgcn.sudot4(i1 true, i32 %A, i1 false, i32 %B, i32 %C, i1 false)
// CHECK: call i32 @llvm.amdgcn.sudot4(i1 false, i32 %A, i1 true, i32 %B, i32 %C, i1 true)
// CHECK: call i32 @llvm.amdgcn.udot8(i32 %uiA, i32 %uiB, i32 %uiC, i1 false)
// CHECK: call i32 @llvm.amdgcn.udot8(i32 %uiA, i32 %uiB, i32 %uiC, i1 true)
// CHECK: call i32 @llvm.amdgcn.sudot8(i1 false, i32 %A, i1 true, i32 %B, i32 %C, i1 false)
// CHECK: call i32 @llvm.amdgcn.sudot8(i1 true, i32 %A, i1 false, i32 %B, i32 %C, i1 true)
// Enable the half-precision extension ahead of the kernel, which takes half arguments.
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
29 kernel void builtins_amdgcn_dl_insts_err
(
30 global float
*fOut
, global int
*siOut
, global uint
*uiOut
,
31 global short
*sOut
, global int
*iOut
, global half
*hOut
,
32 half2 v2hA
, half2 v2hB
, float fC
, half hC
,
33 short2 v2ssA
, short2 v2ssB
, short sC
, int siA
, int siB
, int siC
,
34 ushort2 v2usA
, ushort2 v2usB
, uint uiA
, uint uiB
, uint uiC
,
35 int A
, int B
, int C
) {
36 fOut
[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false);
37 fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true);
39 hOut[0] = __builtin_amdgcn_fdot2_f16_f16
(v2hA, v2hB
, hC
);
41 sOut
[0] = __builtin_amdgcn_fdot2_bf16_bf16(v2ssA, v2ssB, sC);
43 fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false);
44 fOut[4] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, true);
46 uiOut[2] = __builtin_amdgcn_udot4(uiA, uiB, uiC, false);
47 uiOut[3] = __builtin_amdgcn_udot4(uiA, uiB, uiC, true);
49 iOut[0] = __builtin_amdgcn_sudot4
(true, A
, false
, B
, C
, false
);
50 iOut
[1] = __builtin_amdgcn_sudot4(false, A, true, B, C, true);
52 uiOut[4] = __builtin_amdgcn_udot8(uiA, uiB, uiC, false);
53 uiOut[5] = __builtin_amdgcn_udot8(uiA, uiB, uiC, true);
55 iOut[3] = __builtin_amdgcn_sudot8(false, A, true, B, C, false);
56 iOut[4] = __builtin_amdgcn_sudot8(true, A, false, B, C, true);