1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
3 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
; madak_f16: r = a * b + 10.0, computed in half precision.
;
; The fmul/fadd pair with a literal addend is expected to be selected as a
; single "madak" instruction (multiply-add with an inline 32-bit literal K):
;   * tahiti run line: the half operands are converted to f32, v_madak_f32 is
;     used with K = 0x41200000 (10.0 as f32), and the result is converted back.
;   * fiji run line: v_madak_f16 is used directly with K = 0x4900 (10.0 as f16).
define amdgpu_kernel void @madak_f16(
; SI-LABEL: madak_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s6
; SI-NEXT:    s_mov_b32 s13, s7
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[12:15], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_madak_f32 v0, v1, v0, 0x41200000
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: madak_f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_madak_f16 v0, v0, v1, 0x4900
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) #0 {
; The block must be named "entry": the checks above match the "; %entry"
; comment that llc prints next to %bb.0.
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b

  ; a * b + 10.0 -- the literal addend is what makes this a madak candidate.
  %t.val = fmul half %a.val, %b.val
  %r.val = fadd half %t.val, 10.0

  store half %r.val, half addrspace(1)* %r
  ret void
}
; madak_f16_use_2: r0 = a * b + 10.0 and r1 = a * c + 10.0, in half precision.
;
; The same literal addend (10.0) feeds two multiply-adds. The expected code
; uses one madak with the literal encoded inline, and one mac whose
; accumulator register (v3) is pre-loaded with the literal by v_mov_b32:
;   * tahiti run line: operands are converted to f32; v_madak_f32 and
;     v_mac_f32 are used with 0x41200000 (10.0 as f32), then converted back.
;   * fiji run line: v_madak_f16 and v_mac_f16 are used with 0x4900
;     (10.0 as f16).
; The loads are volatile, so each one is issued with glc and waited on before
; the next, as the check lines show.
define amdgpu_kernel void @madak_f16_use_2(
; SI-LABEL: madak_f16_use_2:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s8
; SI-NEXT:    s_mov_b32 s17, s9
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s8, s10
; SI-NEXT:    s_mov_b32 s9, s11
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, off, s[12:15], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v3, 0x41200000
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_madak_f32 v1, v0, v1, 0x41200000
; SI-NEXT:    v_mac_f32_e32 v3, v0, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v3
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_short v1, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: madak_f16_use_2:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s18, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s16, s8
; VI-NEXT:    s_mov_b32 s17, s9
; VI-NEXT:    s_mov_b32 s19, s3
; VI-NEXT:    s_mov_b32 s8, s10
; VI-NEXT:    s_mov_b32 s9, s11
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v2, off, s[12:15], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, 0x4900
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s8, s6
; VI-NEXT:    s_mov_b32 s9, s7
; VI-NEXT:    v_madak_f16 v1, v0, v1, 0x4900
; VI-NEXT:    v_mac_f16_e32 v3, v0, v2
; VI-NEXT:    buffer_store_short v1, off, s[0:3], 0
; VI-NEXT:    buffer_store_short v3, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r0,
    half addrspace(1)* %r1,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
; The block must be named "entry": the checks above match the "; %entry"
; comment that llc prints next to %bb.0.
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c

  ; Two products sharing the multiplicand %a.val ...
  %t0.val = fmul half %a.val, %b.val
  %t1.val = fmul half %a.val, %c.val
  ; ... each added to the same literal, giving the constant two uses.
  %r0.val = fadd half %t0.val, 10.0
  %r1.val = fadd half %t1.val, 10.0

  store half %r0.val, half addrspace(1)* %r0
  store half %r1.val, half addrspace(1)* %r1
  ret void
}
; Attribute group #0, referenced by both kernel definitions in this file.
; "denormal-fp-math" takes "<output mode>,<input mode>"; here both are
; preserve-sign (denormals are flushed to zero, keeping the sign).
; NOTE(review): presumably this is what allows the fmul+fadd pairs to be
; selected as mad/madak/mac -- confirm against the AMDGPU backend's rules.
160 attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" }