1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A %s
3 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940 %s
4 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX1030 %s
5 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX1100 %s
7 define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) {
8 ; GFX90A-LABEL: test_insert_extract:
9 ; GFX90A: ; %bb.0: ; %entry
10 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
11 ; GFX90A-NEXT: s_mov_b32 s2, 0
12 ; GFX90A-NEXT: s_and_b64 vcc, exec, -1
13 ; GFX90A-NEXT: s_mov_b32 s3, 0
14 ; GFX90A-NEXT: s_mov_b32 s4, 0
15 ; GFX90A-NEXT: s_mov_b32 s5, 0
16 ; GFX90A-NEXT: s_mov_b32 s6, 0
17 ; GFX90A-NEXT: .LBB0_1: ; %for.body
18 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
19 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
20 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 1
21 ; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
22 ; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
23 ; GFX90A-NEXT: s_cselect_b32 s7, s4, s3
24 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 2
25 ; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
26 ; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
27 ; GFX90A-NEXT: s_cselect_b32 s7, s5, s7
28 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 3
29 ; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
30 ; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
31 ; GFX90A-NEXT: s_cselect_b32 s7, s6, s7
32 ; GFX90A-NEXT: s_or_b32 s7, s7, s0
33 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 1
34 ; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
35 ; GFX90A-NEXT: s_and_b64 s[10:11], s[8:9], exec
36 ; GFX90A-NEXT: s_cselect_b32 s4, s7, s4
37 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 3
38 ; GFX90A-NEXT: s_cselect_b64 s[10:11], -1, 0
39 ; GFX90A-NEXT: s_and_b64 s[12:13], s[10:11], exec
40 ; GFX90A-NEXT: s_cselect_b32 s6, s7, s6
41 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 2
42 ; GFX90A-NEXT: s_cselect_b64 s[12:13], -1, 0
43 ; GFX90A-NEXT: s_and_b64 s[14:15], s[12:13], exec
44 ; GFX90A-NEXT: s_cselect_b32 s5, s7, s5
45 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 0
46 ; GFX90A-NEXT: s_cselect_b32 s3, s7, s3
47 ; GFX90A-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9]
48 ; GFX90A-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
49 ; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
50 ; GFX90A-NEXT: s_cselect_b32 s2, 0, s2
51 ; GFX90A-NEXT: s_mov_b64 vcc, vcc
52 ; GFX90A-NEXT: s_cbranch_vccnz .LBB0_1
53 ; GFX90A-NEXT: ; %bb.2: ; %DummyReturnBlock
54 ; GFX90A-NEXT: s_endpgm
56 ; GFX940-LABEL: test_insert_extract:
57 ; GFX940: ; %bb.0: ; %entry
58 ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
59 ; GFX940-NEXT: s_mov_b32 s2, 0
60 ; GFX940-NEXT: s_and_b64 vcc, exec, -1
61 ; GFX940-NEXT: s_mov_b32 s3, 0
62 ; GFX940-NEXT: s_mov_b32 s4, 0
63 ; GFX940-NEXT: s_mov_b32 s5, 0
64 ; GFX940-NEXT: s_mov_b32 s6, 0
65 ; GFX940-NEXT: .LBB0_1: ; %for.body
66 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
67 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
68 ; GFX940-NEXT: s_cmp_eq_u32 s1, 1
69 ; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0
70 ; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec
71 ; GFX940-NEXT: s_cselect_b32 s7, s4, s3
72 ; GFX940-NEXT: s_cmp_eq_u32 s1, 2
73 ; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0
74 ; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec
75 ; GFX940-NEXT: s_cselect_b32 s7, s5, s7
76 ; GFX940-NEXT: s_cmp_eq_u32 s1, 3
77 ; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0
78 ; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec
79 ; GFX940-NEXT: s_cselect_b32 s7, s6, s7
80 ; GFX940-NEXT: s_or_b32 s7, s7, s0
81 ; GFX940-NEXT: s_cmp_eq_u32 s1, 1
82 ; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0
83 ; GFX940-NEXT: s_and_b64 s[10:11], s[8:9], exec
84 ; GFX940-NEXT: s_cselect_b32 s4, s7, s4
85 ; GFX940-NEXT: s_cmp_eq_u32 s1, 3
86 ; GFX940-NEXT: s_cselect_b64 s[10:11], -1, 0
87 ; GFX940-NEXT: s_and_b64 s[12:13], s[10:11], exec
88 ; GFX940-NEXT: s_cselect_b32 s6, s7, s6
89 ; GFX940-NEXT: s_cmp_eq_u32 s1, 2
90 ; GFX940-NEXT: s_cselect_b64 s[12:13], -1, 0
91 ; GFX940-NEXT: s_and_b64 s[14:15], s[12:13], exec
92 ; GFX940-NEXT: s_cselect_b32 s5, s7, s5
93 ; GFX940-NEXT: s_cmp_eq_u32 s1, 0
94 ; GFX940-NEXT: s_cselect_b32 s3, s7, s3
95 ; GFX940-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9]
96 ; GFX940-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
97 ; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec
98 ; GFX940-NEXT: s_cselect_b32 s2, 0, s2
99 ; GFX940-NEXT: s_mov_b64 vcc, vcc
100 ; GFX940-NEXT: s_cbranch_vccnz .LBB0_1
101 ; GFX940-NEXT: ; %bb.2: ; %DummyReturnBlock
102 ; GFX940-NEXT: s_endpgm
104 ; GFX1030-LABEL: test_insert_extract:
105 ; GFX1030: ; %bb.0: ; %entry
106 ; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
107 ; GFX1030-NEXT: s_mov_b32 s2, 0
108 ; GFX1030-NEXT: s_mov_b32 s3, 0
109 ; GFX1030-NEXT: s_mov_b32 s4, 0
110 ; GFX1030-NEXT: s_mov_b32 s5, 0
111 ; GFX1030-NEXT: s_mov_b32 s6, 0
112 ; GFX1030-NEXT: s_mov_b32 vcc_lo, exec_lo
113 ; GFX1030-NEXT: .p2align 6
114 ; GFX1030-NEXT: .LBB0_1: ; %for.body
115 ; GFX1030-NEXT: ; =>This Inner Loop Header: Depth=1
116 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
117 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 1
118 ; GFX1030-NEXT: s_cselect_b32 s7, -1, 0
119 ; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo
120 ; GFX1030-NEXT: s_cselect_b32 s7, s4, s3
121 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 2
122 ; GFX1030-NEXT: s_cselect_b32 s8, -1, 0
123 ; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo
124 ; GFX1030-NEXT: s_cselect_b32 s7, s5, s7
125 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 3
126 ; GFX1030-NEXT: s_cselect_b32 s8, -1, 0
127 ; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo
128 ; GFX1030-NEXT: s_cselect_b32 s7, s6, s7
129 ; GFX1030-NEXT: s_or_b32 s7, s7, s0
130 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 1
131 ; GFX1030-NEXT: s_cselect_b32 s8, -1, 0
132 ; GFX1030-NEXT: s_and_b32 s9, s8, exec_lo
133 ; GFX1030-NEXT: s_cselect_b32 s4, s7, s4
134 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 3
135 ; GFX1030-NEXT: s_cselect_b32 s9, -1, 0
136 ; GFX1030-NEXT: s_and_b32 s10, s9, exec_lo
137 ; GFX1030-NEXT: s_cselect_b32 s6, s7, s6
138 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 2
139 ; GFX1030-NEXT: s_cselect_b32 s10, -1, 0
140 ; GFX1030-NEXT: s_and_b32 s11, s10, exec_lo
141 ; GFX1030-NEXT: s_cselect_b32 s5, s7, s5
142 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 0
143 ; GFX1030-NEXT: s_cselect_b32 s3, s7, s3
144 ; GFX1030-NEXT: s_or_b32 s7, s10, s8
145 ; GFX1030-NEXT: s_or_b32 s7, s9, s7
146 ; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo
147 ; GFX1030-NEXT: s_cselect_b32 s2, 0, s2
148 ; GFX1030-NEXT: s_cbranch_vccnz .LBB0_1
149 ; GFX1030-NEXT: ; %bb.2: ; %DummyReturnBlock
150 ; GFX1030-NEXT: s_endpgm
152 ; GFX1100-LABEL: test_insert_extract:
153 ; GFX1100: ; %bb.0: ; %entry
154 ; GFX1100-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
155 ; GFX1100-NEXT: s_mov_b32 s2, 0
156 ; GFX1100-NEXT: s_mov_b32 s3, 0
157 ; GFX1100-NEXT: s_mov_b32 s4, 0
158 ; GFX1100-NEXT: s_mov_b32 s5, 0
159 ; GFX1100-NEXT: s_mov_b32 s6, 0
160 ; GFX1100-NEXT: s_mov_b32 vcc_lo, exec_lo
161 ; GFX1100-NEXT: .p2align 6
162 ; GFX1100-NEXT: .LBB0_1: ; %for.body
163 ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
164 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0)
165 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 1
166 ; GFX1100-NEXT: s_cselect_b32 s7, -1, 0
167 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
168 ; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo
169 ; GFX1100-NEXT: s_cselect_b32 s7, s4, s3
170 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 2
171 ; GFX1100-NEXT: s_cselect_b32 s8, -1, 0
172 ; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo
173 ; GFX1100-NEXT: s_cselect_b32 s7, s5, s7
174 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 3
175 ; GFX1100-NEXT: s_cselect_b32 s8, -1, 0
176 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
177 ; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo
178 ; GFX1100-NEXT: s_cselect_b32 s7, s6, s7
179 ; GFX1100-NEXT: s_or_b32 s7, s7, s0
180 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 1
181 ; GFX1100-NEXT: s_cselect_b32 s8, -1, 0
182 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
183 ; GFX1100-NEXT: s_and_b32 s9, s8, exec_lo
184 ; GFX1100-NEXT: s_cselect_b32 s4, s7, s4
185 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 3
186 ; GFX1100-NEXT: s_cselect_b32 s9, -1, 0
187 ; GFX1100-NEXT: s_and_b32 s10, s9, exec_lo
188 ; GFX1100-NEXT: s_cselect_b32 s6, s7, s6
189 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 2
190 ; GFX1100-NEXT: s_cselect_b32 s10, -1, 0
191 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
192 ; GFX1100-NEXT: s_and_b32 s11, s10, exec_lo
193 ; GFX1100-NEXT: s_cselect_b32 s5, s7, s5
194 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 0
195 ; GFX1100-NEXT: s_cselect_b32 s3, s7, s3
196 ; GFX1100-NEXT: s_or_b32 s7, s10, s8
197 ; GFX1100-NEXT: s_or_b32 s7, s9, s7
198 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
199 ; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo
200 ; GFX1100-NEXT: s_cselect_b32 s2, 0, s2
201 ; GFX1100-NEXT: s_cbranch_vccnz .LBB0_1
202 ; GFX1100-NEXT: ; %bb.2: ; %DummyReturnBlock
203 ; GFX1100-NEXT: s_endpgm
205 %init = insertelement <4 x i32> zeroinitializer, i32 0, i64 0
208 for.body: ; preds = %for.body, %entry
209 %x1 = phi <4 x i32> [ %init, %entry ], [ %i4, %for.body ]
210 %x2 = phi <4 x i32> [ zeroinitializer, %entry ], [ %i2, %for.body ]
211 %idxprom = zext i32 %q to i64
212 %e1 = extractelement <4 x i32> %x2, i64 %idxprom
213 %add = or i32 %e1, %p
214 %i2 = insertelement <4 x i32> %x2, i32 %add, i64 %idxprom
215 %e3 = extractelement <4 x i32> %x1, i64 %idxprom
216 %i4 = insertelement <4 x i32> %x1, i32 %e3, i64 0