1 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa %s | FileCheck -check-prefixes=GCN,CI %s
2 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=GCN,VI %s
3 ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GCN,GFX9 %s
; Constant-index extract from a <2 x i32>: loads the vector from %vaddr, takes
; element 1 and stores it to %out. All three runs expect a cost of 0.
5 ; GCN: 'extractelement_v2i32'
6 ; GCN: estimated cost of 0 for {{.*}} extractelement <2 x i32>
7 define amdgpu_kernel void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
8 %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
9 %elt = extractelement <2 x i32> %vec, i32 1
10 store i32 %elt, i32 addrspace(1)* %out
; Floating-point counterpart of the two-element 32-bit case: element 1 of a
; loaded <2 x float> is stored to %out. All three runs expect a cost of 0.
14 ; GCN: 'extractelement_v2f32'
15 ; GCN: estimated cost of 0 for {{.*}} extractelement <2 x float>
16 define amdgpu_kernel void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
17 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
18 %elt = extractelement <2 x float> %vec, i32 1
19 store float %elt, float addrspace(1)* %out
; Three-element (non-power-of-two) i32 vector: element 1 of the loaded value is
; extracted with a constant index. All three runs expect a cost of 0.
23 ; GCN: 'extractelement_v3i32'
24 ; GCN: estimated cost of 0 for {{.*}} extractelement <3 x i32>
25 define amdgpu_kernel void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) {
26 %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
27 %elt = extractelement <3 x i32> %vec, i32 1
28 store i32 %elt, i32 addrspace(1)* %out
; Four-element i32 vector: constant index 1 is extracted from the loaded value
; and stored to %out. All three runs expect a cost of 0.
32 ; GCN: 'extractelement_v4i32'
33 ; GCN: estimated cost of 0 for {{.*}} extractelement <4 x i32>
34 define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) {
35 %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
36 %elt = extractelement <4 x i32> %vec, i32 1
37 store i32 %elt, i32 addrspace(1)* %out
; Five-element (non-power-of-two) i32 vector: constant index 1 is extracted
; from the loaded value. All three runs expect a cost of 0.
41 ; GCN: 'extractelement_v5i32'
42 ; GCN: estimated cost of 0 for {{.*}} extractelement <5 x i32>
43 define amdgpu_kernel void @extractelement_v5i32(i32 addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr) {
44 %vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
45 %elt = extractelement <5 x i32> %vec, i32 1
46 store i32 %elt, i32 addrspace(1)* %out
; Eight-element i32 vector with a constant index of 1. All three runs expect a
; cost of 0; the variable-index variant of this shape is tested separately.
50 ; GCN: 'extractelement_v8i32'
51 ; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i32>
52 define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
53 %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
54 %elt = extractelement <8 x i32> %vec, i32 1
55 store i32 %elt, i32 addrspace(1)* %out
59 ; Dynamic (non-constant) index: the expected cost is non-zero.
; The index is the kernel argument %idx rather than a constant, so the lane is
; only known at run time. All three runs expect the extract to cost 2.
60 ; GCN: 'extractelement_v8i32_dynindex'
61 ; GCN: estimated cost of 2 for {{.*}} extractelement <8 x i32>
62 define amdgpu_kernel void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) {
63 %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
64 %elt = extractelement <8 x i32> %vec, i32 %idx
65 store i32 %elt, i32 addrspace(1)* %out
; 64-bit elements: element 1 of a loaded <2 x i64> is extracted (the index is
; itself typed i64) and stored to %out. All three runs expect a cost of 0.
69 ; GCN: 'extractelement_v2i64'
70 ; GCN: estimated cost of 0 for {{.*}} extractelement <2 x i64>
71 define amdgpu_kernel void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
72 %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
73 %elt = extractelement <2 x i64> %vec, i64 1
74 store i64 %elt, i64 addrspace(1)* %out
; Three-element i64 vector with a constant i64 index of 1. All three runs
; expect a cost of 0.
78 ; GCN: 'extractelement_v3i64'
79 ; GCN: estimated cost of 0 for {{.*}} extractelement <3 x i64>
80 define amdgpu_kernel void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) {
81 %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
82 %elt = extractelement <3 x i64> %vec, i64 1
83 store i64 %elt, i64 addrspace(1)* %out
; Four-element i64 vector with a constant i64 index of 1. All three runs
; expect a cost of 0.
87 ; GCN: 'extractelement_v4i64'
88 ; GCN: estimated cost of 0 for {{.*}} extractelement <4 x i64>
89 define amdgpu_kernel void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) {
90 %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
91 %elt = extractelement <4 x i64> %vec, i64 1
92 store i64 %elt, i64 addrspace(1)* %out
; Eight-element i64 vector, the widest 64-bit case in this file, with a
; constant i64 index of 1. All three runs expect a cost of 0.
96 ; GCN: 'extractelement_v8i64'
97 ; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i64>
98 define amdgpu_kernel void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) {
99 %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
100 %elt = extractelement <8 x i64> %vec, i64 1
101 store i64 %elt, i64 addrspace(1)* %out
; Byte-sized elements: element 1 of a loaded <4 x i8> is extracted (index typed
; i8) and stored to %out. Unlike the 32- and 64-bit cases above, all three runs
; expect a cost of 1 here.
105 ; GCN: 'extractelement_v4i8'
106 ; GCN: estimated cost of 1 for {{.*}} extractelement <4 x i8>
107 define amdgpu_kernel void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) {
108 %vec = load <4 x i8>, <4 x i8> addrspace(1)* %vaddr
109 %elt = extractelement <4 x i8> %vec, i8 1
110 store i8 %elt, i8 addrspace(1)* %out
; Element 0 of a loaded <2 x i16>. This is the only test in the file whose
; expected cost depends on the run: 1 for the run without -mcpu, and 0 for the
; -mcpu=fiji and -mcpu=gfx900 runs.
114 ; GCN: 'extractelement_0_v2i16':
115 ; CI: estimated cost of 1 for {{.*}} extractelement <2 x i16> %vec, i16 0
116 ; VI: estimated cost of 0 for {{.*}} extractelement <2 x i16>
117 ; GFX9: estimated cost of 0 for {{.*}} extractelement <2 x i16>
118 define amdgpu_kernel void @extractelement_0_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
119 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
120 %elt = extractelement <2 x i16> %vec, i16 0
121 store i16 %elt, i16 addrspace(1)* %out
; Element 1 of a loaded <2 x i16>. In contrast to element 0, the expected cost
; is 1 in all three runs.
125 ; GCN: 'extractelement_1_v2i16':
126 ; GCN: estimated cost of 1 for {{.*}} extractelement <2 x i16>
127 define amdgpu_kernel void @extractelement_1_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
128 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
129 %elt = extractelement <2 x i16> %vec, i16 1
130 store i16 %elt, i16 addrspace(1)* %out
; Variable-index extract from a loaded <2 x i16>: the index is the i32 kernel
; argument %idx. All three runs expect a cost of 1.
134 ; GCN: 'extractelement_var_v2i16'
135 ; GCN: estimated cost of 1 for {{.*}} extractelement <2 x i16>
136 define amdgpu_kernel void @extractelement_var_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, i32 %idx) {
137 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
138 %elt = extractelement <2 x i16> %vec, i32 %idx
139 store i16 %elt, i16 addrspace(1)* %out