// REQUIRES: amdgpu-registered-target

// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx906 -S -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1011 -S -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -S -emit-llvm -o - %s | FileCheck %s

// Scalar and two-element vector aliases used as the operand types of the
// kernel parameters in this test.
typedef unsigned int uint;
typedef half half2 __attribute__((ext_vector_type(2)));
typedef short short2 __attribute__((ext_vector_type(2)));
typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
// CHECK-LABEL: @builtins_amdgcn_dl_insts
// CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 false)
// CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 true)

// CHECK: call i32 @llvm.amdgcn.sdot2(<2 x i16> %v2ssA, <2 x i16> %v2ssB, i32 %siC, i1 false)
// CHECK: call i32 @llvm.amdgcn.sdot2(<2 x i16> %v2ssA, <2 x i16> %v2ssB, i32 %siC, i1 true)

// CHECK: call i32 @llvm.amdgcn.udot2(<2 x i16> %v2usA, <2 x i16> %v2usB, i32 %uiC, i1 false)
// CHECK: call i32 @llvm.amdgcn.udot2(<2 x i16> %v2usA, <2 x i16> %v2usB, i32 %uiC, i1 true)

// CHECK: call i32 @llvm.amdgcn.sdot4(i32 %siA, i32 %siB, i32 %siC, i1 false)
// CHECK: call i32 @llvm.amdgcn.sdot4(i32 %siA, i32 %siB, i32 %siC, i1 true)

// CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false)
// CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 true)

// CHECK: call i32 @llvm.amdgcn.sdot8(i32 %siA, i32 %siB, i32 %siC, i1 false)
// CHECK: call i32 @llvm.amdgcn.sdot8(i32 %siA, i32 %siB, i32 %siC, i1 true)

// CHECK: call i32 @llvm.amdgcn.udot8(i32 %uiA, i32 %uiB, i32 %uiC, i1 false)
// CHECK: call i32 @llvm.amdgcn.udot8(i32 %uiA, i32 %uiB, i32 %uiC, i1 true)
33 kernel void builtins_amdgcn_dl_insts
(
34 global float
*fOut
, global int
*siOut
, global uint
*uiOut
,
35 half2 v2hA
, half2 v2hB
, float fC
,
36 short2 v2ssA
, short2 v2ssB
, int siA
, int siB
, int siC
,
37 ushort2 v2usA
, ushort2 v2usB
, uint uiA
, uint uiB
, uint uiC
) {
38 fOut
[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false);
39 fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true);
41 siOut[0] = __builtin_amdgcn_sdot2
(v2ssA, v2ssB
, siC
, false
);
42 siOut
[1] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, true);
44 uiOut[0] = __builtin_amdgcn_udot2(v2usA, v2usB, uiC, false);
45 uiOut[1] = __builtin_amdgcn_udot2
(v2usA, v2usB
, uiC
, true
);
47 siOut
[2] = __builtin_amdgcn_sdot4(siA, siB, siC, false);
48 siOut[3] = __builtin_amdgcn_sdot4(siA, siB, siC, true);
50 uiOut[2] = __builtin_amdgcn_udot4
(uiA, uiB
, uiC
, false
);
51 uiOut
[3] = __builtin_amdgcn_udot4(uiA, uiB, uiC, true);
53 siOut[4] = __builtin_amdgcn_sdot8(siA, siB, siC, false);
54 siOut[5] = __builtin_amdgcn_sdot8(siA, siB, siC, true);
56 uiOut[4] = __builtin_amdgcn_udot8(uiA, uiB, uiC, false);
57 uiOut[5] = __builtin_amdgcn_udot8(uiA, uiB, uiC, true);