; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
; There is no dependence between the store and the two loads. So we can combine
; the loads and schedule it freely.

; GCN-LABEL: {{^}}ds_combine_nodep
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8
; GCN: s_waitcnt lgkmcnt({{[0-9]+}})
define amdgpu_kernel void @ds_combine_nodep(float addrspace(1)* %out, float addrspace(3)* %inptr) {
  ; First load: <3 x float> at byte offset 24 (dwords 6-8); only element 2
  ; (dword 8) is consumed.
  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 24
  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
  %v0 = extractelement <3 x float> %load0, i32 2

  ; Store <1.0, 2.0> at dwords 26-27: disjoint from both loads, so no
  ; memory dependence constrains scheduling.
  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4

  ; Second load: dword 7, adjacent to dword 8 used from the first load,
  ; so the two can merge into one ds_read2_b32 offset0:7 offset1:8.
  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 7
  %v1 = load float, float addrspace(3)* %vaddr1, align 4

  %sum = fadd float %v0, %v1
  store float %sum, float addrspace(1)* %out, align 4
  ret void
}
; The store depends on the first load, so we could not move the first load down to combine with
; the second load directly. However, we can move the store after the combined load.

; GCN-LABEL: {{^}}ds_combine_WAR
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:27
; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
define amdgpu_kernel void @ds_combine_WAR(float addrspace(1)* %out, float addrspace(3)* %inptr) {
  ; First load: <3 x float> at byte offset 100 (dwords 25-27); only element 2
  ; (dword 27) is consumed.
  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 100
  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
  %v0 = extractelement <3 x float> %load0, i32 2

  ; Store <1.0, 2.0> at dwords 26-27: overlaps dword 27 of the first load,
  ; creating a write-after-read dependence on it.
  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4

  ; Second load: dword 7, independent of the store; merges with dword 27 of
  ; the first load once the store is sunk below the combined read.
  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 7
  %v1 = load float, float addrspace(3)* %vaddr1, align 4

  %sum = fadd float %v0, %v1
  store float %sum, float addrspace(1)* %out, align 4
  ret void
}
; The second load depends on the store. We can combine the two loads, and the combined load is
; at the original place of the second load.

; GCN-LABEL: {{^}}ds_combine_RAW
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:26
define amdgpu_kernel void @ds_combine_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {
  ; First load: <3 x float> at byte offset 24 (dwords 6-8); only element 2
  ; (dword 8) is consumed. Disjoint from the store below.
  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 24
  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
  %v0 = extractelement <3 x float> %load0, i32 2

  ; Store <1.0, 2.0> at dwords 26-27.
  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4

  ; Second load: dword 26, which the store just wrote (read-after-write), so
  ; the combined ds_read2_b32 offset0:8 offset1:26 must stay after the store.
  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %v1 = load float, float addrspace(3)* %vaddr1, align 4

  %sum = fadd float %v0, %v1
  store float %sum, float addrspace(1)* %out, align 4
  ret void
}
; The store depends on the first load, also the second load depends on the store.
; So we can not combine the two loads.

; GCN-LABEL: {{^}}ds_combine_WAR_RAW
; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:108
; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104
define amdgpu_kernel void @ds_combine_WAR_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {
  ; First load: <3 x float> at byte offset 100 (dwords 25-27); only element 2
  ; (dword 27, byte 108) is consumed.
  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 100
  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
  %v0 = extractelement <3 x float> %load0, i32 2

  ; Store <1.0, 2.0> at dwords 26-27: overlaps dword 27 of the first load
  ; (WAR), so it cannot move above that load.
  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4

  ; Second load: dword 26 (byte 104), written by the store (RAW), so it cannot
  ; move above the store. The two loads stay separate ds_read_b32s.
  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %v1 = load float, float addrspace(3)* %vaddr1, align 4

  %sum = fadd float %v0, %v1
  store float %sum, float addrspace(1)* %out, align 4
  ret void
}