1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; Test: maxnum of a loaded <2 x half> against the constant vector
; <0xH0000, 0xH3C00>. The check under the GCN prefix requires a v_pk_max_f16
; whose last operand is printed as 1.0 followed by
; op_sel:[0,1] op_sel_hi:[1,0], with nothing after that on the line.
; NOTE(review): 0xH3C00 is presumably the half bit pattern for 1.0, which is
; what the check pattern suggests -- confirm.
; NOTE(review): no terminator or closing brace for this function is visible
; in this view -- confirm against the complete file.
5 ; GCN-LABEL: {{^}}test_pk_max_f16_literal_0_1:
6 ; GCN: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, 1.0 op_sel:[0,1] op_sel_hi:[1,0]{{$}}
7 define amdgpu_kernel void @test_pk_max_f16_literal_0_1(ptr addrspace(1) nocapture %arg) {
; Address the <2 x half> element of %arg indexed by the zero-extended
; workitem id and load it.
9 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
10 %tmp1 = zext i32 %tmp to i64
11 %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
12 %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
; maxnum against the constant vector; the result is stored back to the same
; address it was loaded from.
13 %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xH0000, half 0xH3C00>)
14 store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
; Test: maxnum of a loaded <2 x half> against the constant vector
; <0xH3C00, 0xH0000>. The check under the GCN prefix requires a v_pk_max_f16
; whose last operand is printed as 1.0 with nothing after it on the line,
; i.e. no op_sel or op_sel_hi modifiers are printed.
; NOTE(review): 0xH3C00 is presumably the half bit pattern for 1.0, which is
; what the check pattern suggests -- confirm.
; NOTE(review): no terminator or closing brace for this function is visible
; in this view -- confirm against the complete file.
18 ; GCN-LABEL: {{^}}test_pk_max_f16_literal_1_0:
19 ; GCN: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, 1.0{{$}}
20 define amdgpu_kernel void @test_pk_max_f16_literal_1_0(ptr addrspace(1) nocapture %arg) {
; Address the <2 x half> element of %arg indexed by the zero-extended
; workitem id and load it.
22 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
23 %tmp1 = zext i32 %tmp to i64
24 %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
25 %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
; maxnum against the constant vector; the result is stored back to the same
; address it was loaded from.
26 %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xH3C00, half 0xH0000>)
27 store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
; Test: maxnum of a loaded <2 x half> against the constant vector
; <0xH3C00, 0xH3C00> (same value in both elements). The check under the GCN
; prefix requires a v_pk_max_f16 whose last operand is printed as 1.0
; followed by op_sel_hi:[1,0], with nothing after that on the line.
; NOTE(review): 0xH3C00 is presumably the half bit pattern for 1.0, which is
; what the check pattern suggests -- confirm.
; NOTE(review): no terminator or closing brace for this function is visible
; in this view -- confirm against the complete file.
31 ; GCN-LABEL: {{^}}test_pk_max_f16_literal_1_1:
32 ; GCN: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, 1.0 op_sel_hi:[1,0]{{$}}
33 define amdgpu_kernel void @test_pk_max_f16_literal_1_1(ptr addrspace(1) nocapture %arg) {
; Address the <2 x half> element of %arg indexed by the zero-extended
; workitem id and load it.
35 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
36 %tmp1 = zext i32 %tmp to i64
37 %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
38 %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
; maxnum against the constant vector; the result is stored back to the same
; address it was loaded from.
39 %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xH3C00, half 0xH3C00>)
40 store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
; Test: maxnum of a loaded <2 x half> against the constant vector
; <0xH0000, 0xHBC00>. The check under the GCN prefix requires a v_pk_max_f16
; whose last operand is printed as -1.0 followed by
; op_sel:[0,1] op_sel_hi:[1,0], with nothing after that on the line.
; NOTE(review): 0xHBC00 is presumably the half bit pattern for -1.0, which is
; what the check pattern suggests -- confirm.
; NOTE(review): no terminator or closing brace for this function is visible
; in this view -- confirm against the complete file.
44 ; GCN-LABEL: {{^}}test_pk_max_f16_literal_0_m1:
45 ; GCN: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, -1.0 op_sel:[0,1] op_sel_hi:[1,0]{{$}}
46 define amdgpu_kernel void @test_pk_max_f16_literal_0_m1(ptr addrspace(1) nocapture %arg) {
; Address the <2 x half> element of %arg indexed by the zero-extended
; workitem id and load it.
48 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
49 %tmp1 = zext i32 %tmp to i64
50 %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
51 %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
; maxnum against the constant vector; the result is stored back to the same
; address it was loaded from.
52 %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xH0000, half 0xHBC00>)
53 store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
; Test: maxnum of a loaded <2 x half> against the constant vector
; <0xHBC00, 0xH0000>. The check under the GCN prefix requires a v_pk_max_f16
; whose last operand is printed as -1.0 with nothing after it on the line,
; i.e. no op_sel or op_sel_hi modifiers are printed.
; NOTE(review): 0xHBC00 is presumably the half bit pattern for -1.0, which is
; what the check pattern suggests -- confirm.
; NOTE(review): no terminator or closing brace for this function is visible
; in this view -- confirm against the complete file.
57 ; GCN-LABEL: {{^}}test_pk_max_f16_literal_m1_0:
58 ; GCN: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, -1.0{{$}}
59 define amdgpu_kernel void @test_pk_max_f16_literal_m1_0(ptr addrspace(1) nocapture %arg) {
; Address the <2 x half> element of %arg indexed by the zero-extended
; workitem id and load it.
61 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
62 %tmp1 = zext i32 %tmp to i64
63 %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
64 %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
; maxnum against the constant vector; the result is stored back to the same
; address it was loaded from.
65 %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xHBC00, half 0xH0000>)
66 store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
; Test: maxnum of a loaded <2 x half> against the constant vector
; <0xHBC00, 0xHBC00> (same value in both elements). The check under the GCN
; prefix requires a v_pk_max_f16 whose last operand is printed as -1.0
; followed by op_sel_hi:[1,0], with nothing after that on the line.
; NOTE(review): 0xHBC00 is presumably the half bit pattern for -1.0, which is
; what the check pattern suggests -- confirm.
; NOTE(review): no terminator or closing brace for this function is visible
; in this view -- confirm against the complete file.
70 ; GCN-LABEL: {{^}}test_pk_max_f16_literal_m1_m1:
71 ; GCN: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, -1.0 op_sel_hi:[1,0]{{$}}
72 define amdgpu_kernel void @test_pk_max_f16_literal_m1_m1(ptr addrspace(1) nocapture %arg) {
; Address the <2 x half> element of %arg indexed by the zero-extended
; workitem id and load it.
74 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
75 %tmp1 = zext i32 %tmp to i64
76 %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
77 %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
; maxnum against the constant vector; the result is stored back to the same
; address it was loaded from.
78 %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xHBC00, half 0xHBC00>)
79 store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
; Test: maxnum of a loaded <2 x half> against the all-zero constant vector
; <0xH0000, 0xH0000>. The check under the GCN prefix requires a v_pk_max_f16
; whose last operand is printed as 0 with nothing after it on the line,
; i.e. no op_sel or op_sel_hi modifiers are printed.
; NOTE(review): no terminator or closing brace for this function is visible
; in this view -- confirm against the complete file.
83 ; GCN-LABEL: {{^}}test_pk_max_f16_literal_0_0:
84 ; GCN: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
85 define amdgpu_kernel void @test_pk_max_f16_literal_0_0(ptr addrspace(1) nocapture %arg) {
; Address the <2 x half> element of %arg indexed by the zero-extended
; workitem id and load it.
87 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
88 %tmp1 = zext i32 %tmp to i64
89 %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
90 %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
; maxnum against the constant vector; the result is stored back to the same
; address it was loaded from.
91 %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xH0000, half 0xH0000>)
92 store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
; Test: maxnum of a loaded <2 x half> against the constant vector
; <0xH0000, 0xH41C8>. The expected code differs per check prefix:
;  * GFX9 prefix: an s_mov_b32 of the literal 0x41c80000 into a scalar
;    register, which is then the last operand of v_pk_max_f16 with no
;    modifiers after it. The literal carries 0x41c8 in its upper 16 bits,
;    matching element 1 of the constant vector.
;  * GFX10 prefix: the literal 0x41c8 appears directly as the first source
;    operand of v_pk_max_f16, with op_sel:[1,0] op_sel_hi:[0,1].
; NOTE(review): no terminator or closing brace for this function is visible
; in this view -- confirm against the complete file.
96 ; GCN-LABEL: {{^}}test_pk_max_f16_literal_0_41c8:
97 ; GFX9: s_mov_b32 [[C:s[0-9]+]], 0x41c80000
98 ; GFX9: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, [[C]]{{$}}
99 ; GFX10: v_pk_max_f16 v{{[0-9]+}}, 0x41c8, v{{[0-9]+}} op_sel:[1,0] op_sel_hi:[0,1]{{$}}
100 define amdgpu_kernel void @test_pk_max_f16_literal_0_41c8(ptr addrspace(1) nocapture %arg) {
; Address the <2 x half> element of %arg indexed by the zero-extended
; workitem id and load it.
102 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
103 %tmp1 = zext i32 %tmp to i64
104 %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
105 %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
; maxnum against the constant vector; the result is stored back to the same
; address it was loaded from.
106 %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xH0000, half 0xH41C8>)
107 store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
; Test: maxnum of a loaded <2 x half> against the constant vector
; <0xH41C8, 0xH0>. The expected code differs per check prefix:
;  * GFX9 prefix: an s_movk_i32 of 0x41c8 into a scalar register, which is
;    then the last operand of v_pk_max_f16 with no modifiers after it.
;  * GFX10 prefix: the literal 0x41c8 appears directly as the first source
;    operand of v_pk_max_f16, with no modifiers after the operands.
; NOTE(review): the second element is spelled 0xH0 here while the other
; zero elements in this file are spelled 0xH0000; presumably the same value
; -- confirm and consider normalizing the spelling.
; NOTE(review): no terminator or closing brace for this function is visible
; in this view -- confirm against the complete file.
111 ; GCN-LABEL: {{^}}test_pk_max_f16_literal_41c8_0:
112 ; GFX9: s_movk_i32 [[C:s[0-9]+]], 0x41c8
113 ; GFX9: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, [[C]]{{$}}
114 ; GFX10: v_pk_max_f16 v{{[0-9]+}}, 0x41c8, v{{[0-9]+}}{{$}}
115 define amdgpu_kernel void @test_pk_max_f16_literal_41c8_0(ptr addrspace(1) nocapture %arg) {
; Address the <2 x half> element of %arg indexed by the zero-extended
; workitem id and load it.
117 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
118 %tmp1 = zext i32 %tmp to i64
119 %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
120 %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
; maxnum against the constant vector; the result is stored back to the same
; address it was loaded from.
121 %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xH41C8, half 0xH0>)
122 store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
; Test: maxnum of a loaded <2 x half> against the constant vector
; <0xH42CA, 0xH41C8> (two different non-zero elements). The expected code
; differs per check prefix:
;  * GFX9 prefix: an s_mov_b32 of the literal 0x41c842ca into a scalar
;    register, which is then the last operand of v_pk_max_f16 with no
;    modifiers after it.
;  * GFX10 prefix: the literal 0x41c842ca appears directly as the first
;    source operand of v_pk_max_f16, with no modifiers after the operands.
; The 32-bit literal carries element 0 (0x42ca) in its lower 16 bits and
; element 1 (0x41c8) in its upper 16 bits.
; NOTE(review): no terminator or closing brace for this function is visible
; in this view -- confirm against the complete file.
126 ; GCN-LABEL: {{^}}test_pk_max_f16_literal_42ca_41c8:
127 ; GFX9: s_mov_b32 [[C:s[0-9]+]], 0x41c842ca
128 ; GFX9: v_pk_max_f16 v{{[0-9]+}}, v{{[0-9]+}}, [[C]]{{$}}
129 ; GFX10: v_pk_max_f16 v{{[0-9]+}}, 0x41c842ca, v{{[0-9]+}}{{$}}
130 define amdgpu_kernel void @test_pk_max_f16_literal_42ca_41c8(ptr addrspace(1) nocapture %arg) {
; Address the <2 x half> element of %arg indexed by the zero-extended
; workitem id and load it.
132 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
133 %tmp1 = zext i32 %tmp to i64
134 %tmp2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i64 %tmp1
135 %tmp3 = load <2 x half>, ptr addrspace(1) %tmp2, align 4
; maxnum against the constant vector; the result is stored back to the same
; address it was loaded from.
136 %tmp4 = tail call <2 x half> @llvm.maxnum.v2f16(<2 x half> %tmp3, <2 x half> <half 0xH42CA, half 0xH41C8>)
137 store <2 x half> %tmp4, ptr addrspace(1) %tmp2, align 4
; Declarations of the two intrinsics called by the test functions in this
; file: the <2 x half> maxnum operation under test, and the workitem id used
; to index the buffer argument.
141 declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)
142 declare i32 @llvm.amdgcn.workitem.id.x()