1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
5 ; CHECK: .byte 0 // 0x0
6 ; CHECK: .byte 16 // 0x10
7 ; CHECK: .byte 32 // 0x20
8 ; CHECK: .byte 48 // 0x30
9 ; CHECK: .byte 2 // 0x2
10 ; CHECK: .byte 18 // 0x12
11 ; CHECK: .byte 34 // 0x22
12 ; CHECK: .byte 50 // 0x32
13 ; CHECK: .byte 4 // 0x4
14 ; CHECK: .byte 20 // 0x14
15 ; CHECK: .byte 36 // 0x24
16 ; CHECK: .byte 52 // 0x34
17 ; CHECK: .byte 6 // 0x6
18 ; CHECK: .byte 22 // 0x16
19 ; CHECK: .byte 38 // 0x26
20 ; CHECK: .byte 54 // 0x36
; Four <4 x i8> inputs: %a/%b are concatenated into %x and %c/%d into %y, then
; the two 8-element vectors are interleaved (mask 0,4,8,12,1,5,9,13,...) into a
; <16 x i8>. The expected output below is a single four-register tbl whose
; byte mask is loaded from the constant pool entry .LCPI0_0.
21 define <16 x i8> @shuffle4_v4i8_16(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
22 ; CHECK-LABEL: shuffle4_v4i8_16:
24 ; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
25 ; CHECK-NEXT: adrp x8, .LCPI0_0
26 ; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
27 ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI0_0]
28 ; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
29 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
30 ; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
32 %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
33 %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
34 %z = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
39 ; CHECK: .byte 0 // 0x0
40 ; CHECK: .byte 16 // 0x10
41 ; CHECK: .byte 32 // 0x20
42 ; CHECK: .byte 48 // 0x30
43 ; CHECK: .byte 2 // 0x2
44 ; CHECK: .byte 18 // 0x12
45 ; CHECK: .byte 34 // 0x22
46 ; CHECK: .byte 50 // 0x32
; Same concatenations as the 16-element variant, but the final shuffle keeps
; only the first eight interleaved elements and yields an <8 x i8>. The
; expected output is still one four-register tbl, now with a 64-bit mask
; (ldr d4) and an 8-byte destination (v0.8b).
47 define <8 x i8> @shuffle4_v4i8_8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
48 ; CHECK-LABEL: shuffle4_v4i8_8:
50 ; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
51 ; CHECK-NEXT: adrp x8, .LCPI1_0
52 ; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
53 ; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI1_0]
54 ; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
55 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
56 ; CHECK-NEXT: tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
58 %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
59 %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
60 %z = shufflevector <8 x i8> %x, <8 x i8> %y, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13>
65 ; CHECK: .byte 0 // 0x0
66 ; CHECK: .byte 3 // 0x3
67 ; CHECK: .byte 2 // 0x2
68 ; CHECK: .byte 1 // 0x1
69 ; CHECK: .byte 12 // 0xc
70 ; CHECK: .byte 15 // 0xf
71 ; CHECK: .byte 14 // 0xe
72 ; CHECK: .byte 12 // 0xc
74 ; CHECK: .byte 4 // 0x4
75 ; CHECK: .byte 7 // 0x7
76 ; CHECK: .byte 6 // 0x6
77 ; CHECK: .byte 7 // 0x7
78 ; CHECK: .byte 8 // 0x8
79 ; CHECK: .byte 10 // 0xa
80 ; CHECK: .byte 9 // 0x9
81 ; CHECK: .byte 11 // 0xb
82 ; CHECK: .section .rodata.cst16,"aM",@progbits,16
85 ; CHECK: .byte 0 // 0x0
86 ; CHECK: .byte 4 // 0x4
87 ; CHECK: .byte 16 // 0x10
88 ; CHECK: .byte 20 // 0x14
89 ; CHECK: .byte 1 // 0x1
90 ; CHECK: .byte 5 // 0x5
91 ; CHECK: .byte 17 // 0x11
92 ; CHECK: .byte 21 // 0x15
93 ; CHECK: .byte 2 // 0x2
94 ; CHECK: .byte 6 // 0x6
95 ; CHECK: .byte 18 // 0x12
96 ; CHECK: .byte 22 // 0x16
97 ; CHECK: .byte 3 // 0x3
98 ; CHECK: .byte 7 // 0x7
99 ; CHECK: .byte 19 // 0x13
100 ; CHECK: .byte 23 // 0x17
; <8 x i8> inputs where the two inner shuffles use non-identity masks
; (0,3,2,1,12,15,14,12 and 4,7,6,7,8,10,9,11) before the interleave. The
; expected output merges each input pair into one q register, applies a
; single-register tbl per inner shuffle, and finishes with a two-register tbl
; for the interleave; three constant pool masks (.LCPI2_0..2) are loaded.
101 define <16 x i8> @shuffle4_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
102 ; CHECK-LABEL: shuffle4_v8i8:
104 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
105 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
106 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
107 ; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
108 ; CHECK-NEXT: adrp x8, .LCPI2_0
109 ; CHECK-NEXT: mov v0.d[1], v1.d[0]
110 ; CHECK-NEXT: mov v2.d[1], v3.d[0]
111 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI2_0]
112 ; CHECK-NEXT: adrp x8, .LCPI2_1
113 ; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI2_1]
114 ; CHECK-NEXT: adrp x8, .LCPI2_2
115 ; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b
116 ; CHECK-NEXT: tbl v1.8b, { v2.16b }, v3.8b
117 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_2]
118 ; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
120 %x = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 1, i32 12, i32 15, i32 14, i32 12>
121 %y = shufflevector <8 x i8> %c, <8 x i8> %d, <8 x i32> <i32 4, i32 7, i32 6, i32 7, i32 8, i32 10, i32 9, i32 11>
122 %z = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
127 ; CHECK: .byte 0 // 0x0
128 ; CHECK: .byte 3 // 0x3
129 ; CHECK: .byte 2 // 0x2
130 ; CHECK: .byte 1 // 0x1
131 ; CHECK: .byte 12 // 0xc
132 ; CHECK: .byte 15 // 0xf
133 ; CHECK: .byte 14 // 0xe
134 ; CHECK: .byte 12 // 0xc
135 ; CHECK: .byte 255 // 0xff
136 ; CHECK: .byte 255 // 0xff
137 ; CHECK: .byte 255 // 0xff
138 ; CHECK: .byte 255 // 0xff
139 ; CHECK: .byte 255 // 0xff
140 ; CHECK: .byte 255 // 0xff
141 ; CHECK: .byte 255 // 0xff
142 ; CHECK: .byte 255 // 0xff
144 ; CHECK: .byte 4 // 0x4
145 ; CHECK: .byte 7 // 0x7
146 ; CHECK: .byte 6 // 0x6
147 ; CHECK: .byte 7 // 0x7
148 ; CHECK: .byte 8 // 0x8
149 ; CHECK: .byte 10 // 0xa
150 ; CHECK: .byte 9 // 0x9
151 ; CHECK: .byte 11 // 0xb
152 ; CHECK: .byte 255 // 0xff
153 ; CHECK: .byte 255 // 0xff
154 ; CHECK: .byte 255 // 0xff
155 ; CHECK: .byte 255 // 0xff
156 ; CHECK: .byte 255 // 0xff
157 ; CHECK: .byte 255 // 0xff
158 ; CHECK: .byte 255 // 0xff
159 ; CHECK: .byte 255 // 0xff
161 ; CHECK: .byte 16 // 0x10
162 ; CHECK: .byte 20 // 0x14
163 ; CHECK: .byte 0 // 0x0
164 ; CHECK: .byte 4 // 0x4
165 ; CHECK: .byte 17 // 0x11
166 ; CHECK: .byte 21 // 0x15
167 ; CHECK: .byte 1 // 0x1
168 ; CHECK: .byte 5 // 0x5
169 ; CHECK: .byte 18 // 0x12
170 ; CHECK: .byte 22 // 0x16
171 ; CHECK: .byte 2 // 0x2
172 ; CHECK: .byte 6 // 0x6
173 ; CHECK: .byte 19 // 0x13
174 ; CHECK: .byte 23 // 0x17
175 ; CHECK: .byte 3 // 0x3
176 ; CHECK: .byte 7 // 0x7
; <16 x i8> inputs narrowed to <8 x i8> by the inner shuffles. Every inner
; mask index is below 16, so only the first operand of each inner shuffle
; (%a and %c) is actually read; the expected output accordingly applies a
; single-register tbl to v0 and to v2, then a two-register tbl for the
; interleave.
177 define <16 x i8> @shuffle4_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
178 ; CHECK-LABEL: shuffle4_v16i8:
180 ; CHECK-NEXT: adrp x8, .LCPI3_0
181 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
182 ; CHECK-NEXT: adrp x8, .LCPI3_1
183 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_1]
184 ; CHECK-NEXT: adrp x8, .LCPI3_2
185 ; CHECK-NEXT: tbl v1.16b, { v0.16b }, v1.16b
186 ; CHECK-NEXT: tbl v0.16b, { v2.16b }, v3.16b
187 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_2]
188 ; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
190 %x = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 1, i32 12, i32 15, i32 14, i32 12>
191 %y = shufflevector <16 x i8> %c, <16 x i8> %d, <8 x i32> <i32 4, i32 7, i32 6, i32 7, i32 8, i32 10, i32 9, i32 11>
192 %z = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
197 ; CHECK: .byte 0 // 0x0
198 ; CHECK: .byte 1 // 0x1
199 ; CHECK: .byte 8 // 0x8
200 ; CHECK: .byte 9 // 0x9
201 ; CHECK: .byte 16 // 0x10
202 ; CHECK: .byte 17 // 0x11
203 ; CHECK: .byte 24 // 0x18
204 ; CHECK: .byte 25 // 0x19
205 ; CHECK: .byte 2 // 0x2
206 ; CHECK: .byte 3 // 0x3
207 ; CHECK: .byte 10 // 0xa
208 ; CHECK: .byte 11 // 0xb
209 ; CHECK: .byte 18 // 0x12
210 ; CHECK: .byte 19 // 0x13
211 ; CHECK: .byte 26 // 0x1a
212 ; CHECK: .byte 27 // 0x1b
; i16 variant: four <4 x i16> inputs are concatenated pairwise into two
; <8 x i16> vectors, and the first eight interleaved elements are returned.
; The expected output merges each pair into one q register (v4, v5) and uses a
; single two-register tbl with the mask from .LCPI4_0.
213 define <8 x i16> @shuffle4_v8i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
214 ; CHECK-LABEL: shuffle4_v8i16:
216 ; CHECK-NEXT: fmov d5, d2
217 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
218 ; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
219 ; CHECK-NEXT: adrp x8, .LCPI4_0
220 ; CHECK-NEXT: fmov d4, d0
221 ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI4_0]
222 ; CHECK-NEXT: mov v4.d[1], v1.d[0]
223 ; CHECK-NEXT: mov v5.d[1], v3.d[0]
224 ; CHECK-NEXT: tbl v0.16b, { v4.16b, v5.16b }, v0.16b
226 %x = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
227 %y = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
228 %z = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13>
; i32 variant whose final mask (15,10,5,0) picks one element from each of the
; four inputs: %d[3], %c[2], %b[1], %a[0]. The expected output contains no tbl
; and no constant pool load; it is built from zip1/rev64/ext/zip2 and a
; 64-bit lane move.
232 define <4 x i32> @shuffle4_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
233 ; CHECK-LABEL: shuffle4_v4i32:
235 ; CHECK-NEXT: zip1 v1.4s, v1.4s, v1.4s
236 ; CHECK-NEXT: rev64 v3.4s, v3.4s
237 ; CHECK-NEXT: ext v1.16b, v1.16b, v0.16b, #4
238 ; CHECK-NEXT: zip2 v0.4s, v3.4s, v2.4s
239 ; CHECK-NEXT: mov v0.d[1], v1.d[1]
241 %x = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
242 %y = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
243 %z = shufflevector <8 x i32> %x, <8 x i32> %y, <4 x i32> <i32 15, i32 10, i32 5, i32 0>
248 ; CHECK: .byte 0 // 0x0
249 ; CHECK: .byte 7 // 0x7
250 ; CHECK: .byte 255 // 0xff
251 ; CHECK: .byte 1 // 0x1
252 ; CHECK: .byte 255 // 0xff
253 ; CHECK: .byte 255 // 0xff
254 ; CHECK: .byte 255 // 0xff
255 ; CHECK: .byte 255 // 0xff
256 ; CHECK: .section .rodata.cst16,"aM",@progbits,16
259 ; CHECK: .byte 0 // 0x0
260 ; CHECK: .byte 16 // 0x10
261 ; CHECK: .byte 19 // 0x13
262 ; CHECK: .byte 3 // 0x3
263 ; CHECK: .byte 1 // 0x1
264 ; CHECK: .byte 17 // 0x11
265 ; CHECK: .byte 0 // 0x0
266 ; CHECK: .byte 1 // 0x1
267 ; CHECK: .byte 0 // 0x0
268 ; CHECK: .byte 16 // 0x10
269 ; CHECK: .byte 19 // 0x13
270 ; CHECK: .byte 3 // 0x3
271 ; CHECK: .byte 1 // 0x1
272 ; CHECK: .byte 17 // 0x11
273 ; CHECK: .byte 0 // 0x0
274 ; CHECK: .byte 1 // 0x1
; Inner shuffles narrow each <8 x i8> pair to <4 x i8> with mask 0,7,5,1 (all
; indices below 8, so only %a and %c are read); the outer shuffle then widens
; the two 4-element vectors to <16 x i8>, repeating the same 8-index pattern
; twice. The expected output shares one 8-byte mask (.LCPI6_0) between both
; inner tbl instructions and ends with a two-register tbl.
275 define <16 x i8> @shuffle4_v8i8_v16i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
276 ; CHECK-LABEL: shuffle4_v8i8_v16i8:
278 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
279 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
280 ; CHECK-NEXT: adrp x8, .LCPI6_0
281 ; CHECK-NEXT: mov v2.d[1], v2.d[0]
282 ; CHECK-NEXT: mov v0.d[1], v0.d[0]
283 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI6_0]
284 ; CHECK-NEXT: adrp x8, .LCPI6_1
285 ; CHECK-NEXT: tbl v3.8b, { v2.16b }, v1.8b
286 ; CHECK-NEXT: tbl v2.8b, { v0.16b }, v1.8b
287 ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI6_1]
288 ; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b
290 %x = shufflevector <8 x i8> %a, <8 x i8> %b, <4 x i32> <i32 0, i32 7, i32 5, i32 1>
291 %y = shufflevector <8 x i8> %c, <8 x i8> %d, <4 x i32> <i32 0, i32 7, i32 5, i32 1>
292 %z = shufflevector <4 x i8> %x, <4 x i8> %y, <16 x i32> <i32 0, i32 4, i32 7, i32 3, i32 1, i32 5, i32 0, i32 1, i32 0, i32 4, i32 7, i32 3, i32 1, i32 5, i32 0, i32 1>
297 ; CHECK: .byte 0 // 0x0
298 ; CHECK: .byte 7 // 0x7
299 ; CHECK: .byte 255 // 0xff
300 ; CHECK: .byte 1 // 0x1
301 ; CHECK: .byte 255 // 0xff
302 ; CHECK: .byte 255 // 0xff
303 ; CHECK: .byte 255 // 0xff
304 ; CHECK: .byte 255 // 0xff
306 ; CHECK: .byte 0 // 0x0
307 ; CHECK: .byte 8 // 0x8
308 ; CHECK: .byte 11 // 0xb
309 ; CHECK: .byte 3 // 0x3
310 ; CHECK: .byte 1 // 0x1
311 ; CHECK: .byte 9 // 0x9
312 ; CHECK: .byte 0 // 0x0
313 ; CHECK: .byte 1 // 0x1
; Same narrowing inner shuffles as the <16 x i8> variant (mask 0,7,5,1), but
; the outer shuffle yields only <8 x i8>. The expected output performs the two
; inner tbl lookups, joins their results into one q register with a lane
; move, and finishes with a single-register tbl using the mask at .LCPI7_1.
314 define <8 x i8> @shuffle4_v8i8_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
315 ; CHECK-LABEL: shuffle4_v8i8_v8i8:
317 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
318 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
319 ; CHECK-NEXT: adrp x8, .LCPI7_0
320 ; CHECK-NEXT: mov v2.d[1], v2.d[0]
321 ; CHECK-NEXT: mov v0.d[1], v0.d[0]
322 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI7_0]
323 ; CHECK-NEXT: adrp x8, .LCPI7_1
324 ; CHECK-NEXT: tbl v2.8b, { v2.16b }, v1.8b
325 ; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b
326 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI7_1]
327 ; CHECK-NEXT: mov v0.d[1], v2.d[0]
328 ; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b
330 %x = shufflevector <8 x i8> %a, <8 x i8> %b, <4 x i32> <i32 0, i32 7, i32 5, i32 1>
331 %y = shufflevector <8 x i8> %c, <8 x i8> %d, <4 x i32> <i32 0, i32 7, i32 5, i32 1>
332 %z = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32> <i32 0, i32 4, i32 7, i32 3, i32 1, i32 5, i32 0, i32 1>
337 ; CHECK: .byte 0 // 0x0
338 ; CHECK: .byte 1 // 0x1
339 ; CHECK: .byte 8 // 0x8
340 ; CHECK: .byte 9 // 0x9
341 ; CHECK: .byte 16 // 0x10
342 ; CHECK: .byte 17 // 0x11
343 ; CHECK: .byte 24 // 0x18
344 ; CHECK: .byte 25 // 0x19
345 ; CHECK: .byte 2 // 0x2
346 ; CHECK: .byte 3 // 0x3
347 ; CHECK: .byte 10 // 0xa
348 ; CHECK: .byte 11 // 0xb
349 ; CHECK: .byte 18 // 0x12
350 ; CHECK: .byte 19 // 0x13
351 ; CHECK: .byte 26 // 0x1a
352 ; CHECK: .byte 27 // 0x1b
; The two concatenated <8 x i8> vectors are zero-extended to <8 x i16> before
; being interleaved. The expected output packs each input pair with uzp1,
; widens with ushll #0, and then uses one two-register tbl for the interleave.
353 define <8 x i16> @shuffle4_v4i8_zext(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
354 ; CHECK-LABEL: shuffle4_v4i8_zext:
356 ; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b
357 ; CHECK-NEXT: uzp1 v1.8b, v2.8b, v3.8b
358 ; CHECK-NEXT: adrp x8, .LCPI8_0
359 ; CHECK-NEXT: ushll v2.8h, v0.8b, #0
360 ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI8_0]
361 ; CHECK-NEXT: ushll v3.8h, v1.8b, #0
362 ; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b
364 %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
365 %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
366 %xe = zext <8 x i8> %x to <8 x i16>
367 %ye = zext <8 x i8> %y to <8 x i16>
368 %z = shufflevector <8 x i16> %xe, <8 x i16> %ye, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13>
373 ; CHECK: .byte 0 // 0x0
374 ; CHECK: .byte 16 // 0x10
375 ; CHECK: .byte 32 // 0x20
376 ; CHECK: .byte 48 // 0x30
377 ; CHECK: .byte 2 // 0x2
378 ; CHECK: .byte 18 // 0x12
379 ; CHECK: .byte 34 // 0x22
380 ; CHECK: .byte 50 // 0x32
381 ; CHECK: .byte 4 // 0x4
382 ; CHECK: .byte 20 // 0x14
383 ; CHECK: .byte 36 // 0x24
384 ; CHECK: .byte 52 // 0x34
385 ; CHECK: .byte 6 // 0x6
386 ; CHECK: .byte 22 // 0x16
387 ; CHECK: .byte 38 // 0x26
388 ; CHECK: .byte 54 // 0x36
; Each <4 x i16> input is truncated to <4 x i8> before the concatenate and
; interleave steps. The expected output contains no separate narrowing
; instruction: the four input registers feed one four-register tbl directly,
; with the byte mask loaded from .LCPI9_0.
389 define <16 x i8> @shuffle4_v4i16_trunc(<4 x i16> %ae, <4 x i16> %be, <4 x i16> %ce, <4 x i16> %de) {
390 ; CHECK-LABEL: shuffle4_v4i16_trunc:
392 ; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
393 ; CHECK-NEXT: adrp x8, .LCPI9_0
394 ; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
395 ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI9_0]
396 ; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
397 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
398 ; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
400 %a = trunc <4 x i16> %ae to <4 x i8>
401 %b = trunc <4 x i16> %be to <4 x i8>
402 %c = trunc <4 x i16> %ce to <4 x i8>
403 %d = trunc <4 x i16> %de to <4 x i8>
404 %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
405 %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
406 %z = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
411 ; CHECK: .byte 0 // 0x0
412 ; CHECK: .byte 16 // 0x10
413 ; CHECK: .byte 32 // 0x20
414 ; CHECK: .byte 48 // 0x30
415 ; CHECK: .byte 2 // 0x2
416 ; CHECK: .byte 18 // 0x12
417 ; CHECK: .byte 34 // 0x22
418 ; CHECK: .byte 50 // 0x32
419 ; CHECK: .byte 4 // 0x4
420 ; CHECK: .byte 20 // 0x14
421 ; CHECK: .byte 36 // 0x24
422 ; CHECK: .byte 52 // 0x34
423 ; CHECK: .byte 6 // 0x6
424 ; CHECK: .byte 22 // 0x16
425 ; CHECK: .byte 38 // 0x26
426 ; CHECK: .byte 54 // 0x36
; Each <4 x i32> input is truncated to <4 x i8> before the concatenate and
; interleave steps. The expected output narrows every input to 16-bit lanes
; with xtn (into v4..v7) and then performs one four-register tbl with the
; mask from .LCPI10_0.
428 define <16 x i8> @shuffle4_v4i32_trunc(<4 x i32> %ae, <4 x i32> %be, <4 x i32> %ce, <4 x i32> %de) {
429 ; CHECK-LABEL: shuffle4_v4i32_trunc:
431 ; CHECK-NEXT: xtn v4.4h, v0.4s
432 ; CHECK-NEXT: adrp x8, .LCPI10_0
433 ; CHECK-NEXT: xtn v5.4h, v1.4s
434 ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI10_0]
435 ; CHECK-NEXT: xtn v6.4h, v2.4s
436 ; CHECK-NEXT: xtn v7.4h, v3.4s
437 ; CHECK-NEXT: tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b
439 %a = trunc <4 x i32> %ae to <4 x i8>
440 %b = trunc <4 x i32> %be to <4 x i8>
441 %c = trunc <4 x i32> %ce to <4 x i8>
442 %d = trunc <4 x i32> %de to <4 x i8>
443 %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
444 %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
445 %z = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
450 ; CHECK: .byte 0 // 0x0
451 ; CHECK: .byte 16 // 0x10
452 ; CHECK: .byte 32 // 0x20
453 ; CHECK: .byte 2 // 0x2
454 ; CHECK: .byte 18 // 0x12
455 ; CHECK: .byte 34 // 0x22
456 ; CHECK: .byte 4 // 0x4
457 ; CHECK: .byte 20 // 0x14
458 ; CHECK: .byte 36 // 0x24
459 ; CHECK: .byte 6 // 0x6
460 ; CHECK: .byte 22 // 0x16
461 ; CHECK: .byte 38 // 0x26
462 ; CHECK: .byte 255 // 0xff
463 ; CHECK: .byte 255 // 0xff
464 ; CHECK: .byte 255 // 0xff
465 ; CHECK: .byte 255 // 0xff
; Three-input variant: %a/%b are concatenated into %x, while %y is built from
; %c and an undef second operand. The final shuffle interleaves the three
; inputs element by element into a <12 x i8>. The expected output is a single
; three-register tbl with the mask from .LCPI11_0.
466 define <12 x i8> @shuffle3_v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) {
467 ; CHECK-LABEL: shuffle3_v4i8:
469 ; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2
470 ; CHECK-NEXT: adrp x8, .LCPI11_0
471 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_0]
472 ; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2 def $q0_q1_q2
473 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2 def $q0_q1_q2
474 ; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v3.16b
476 %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
477 %y = shufflevector <4 x i8> %c, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
478 %z = shufflevector <8 x i8> %x, <8 x i8> %y, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
483 ; CHECK: .byte 0 // 0x0
484 ; CHECK: .byte 1 // 0x1
485 ; CHECK: .byte 8 // 0x8
486 ; CHECK: .byte 9 // 0x9
487 ; CHECK: .byte 16 // 0x10
488 ; CHECK: .byte 17 // 0x11
489 ; CHECK: .byte 2 // 0x2
490 ; CHECK: .byte 3 // 0x3
491 ; CHECK: .byte 10 // 0xa
492 ; CHECK: .byte 11 // 0xb
493 ; CHECK: .byte 18 // 0x12
494 ; CHECK: .byte 19 // 0x13
495 ; CHECK: .byte 4 // 0x4
496 ; CHECK: .byte 5 // 0x5
497 ; CHECK: .byte 12 // 0xc
498 ; CHECK: .byte 13 // 0xd
; Three-input i16 variant: %y is %c concatenated with undef, and the final
; shuffle takes the first eight elements of the three-way interleave. The
; expected output merges %a/%b into one q register (v2), keeps %c in v3, and
; uses one two-register tbl with the mask from .LCPI12_0.
499 define <8 x i16> @shuffle3_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) {
500 ; CHECK-LABEL: shuffle3_v4i16:
502 ; CHECK-NEXT: fmov d3, d2
503 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
504 ; CHECK-NEXT: adrp x8, .LCPI12_0
505 ; CHECK-NEXT: fmov d2, d0
506 ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI12_0]
507 ; CHECK-NEXT: mov v2.d[1], v1.d[0]
508 ; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b
510 %x = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
511 %y = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
512 %z = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6>
; Three-input i32 variant whose final mask (0,4,8,1) selects %a[0], %b[0],
; %c[0], %a[1]. The expected output uses no tbl and no constant pool load; it
; is a trn1 followed by lane moves and a final register copy into v0.
516 define <4 x i32> @shuffle3_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
517 ; CHECK-LABEL: shuffle3_v4i32:
519 ; CHECK-NEXT: trn1 v1.4s, v0.4s, v1.4s
520 ; CHECK-NEXT: mov v1.d[1], v0.d[0]
521 ; CHECK-NEXT: mov v1.s[2], v2.s[0]
522 ; CHECK-NEXT: mov v0.16b, v1.16b
524 %x = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
525 %y = shufflevector <4 x i32> %c, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
526 %z = shufflevector <8 x i32> %x, <8 x i32> %y, <4 x i32> <i32 15, i32 10, i32 5, i32 0>
531 ; CHECK: .byte 4 // 0x4
532 ; CHECK: .byte 8 // 0x8
533 ; CHECK: .byte 255 // 0xff
534 ; CHECK: .byte 255 // 0xff
535 ; CHECK: .byte 14 // 0xe
536 ; CHECK: .byte 3 // 0x3
537 ; CHECK: .byte 255 // 0xff
538 ; CHECK: .byte 255 // 0xff
539 ; CHECK: .section .rodata.cst16,"aM",@progbits,16
542 ; CHECK: .byte 255 // 0xff
543 ; CHECK: .byte 255 // 0xff
544 ; CHECK: .byte 15 // 0xf
545 ; CHECK: .byte 27 // 0x1b
546 ; CHECK: .byte 255 // 0xff
547 ; CHECK: .byte 255 // 0xff
548 ; CHECK: .byte 24 // 0x18
549 ; CHECK: .byte 12 // 0xc
550 ; CHECK: .byte 255 // 0xff
551 ; CHECK: .byte 255 // 0xff
552 ; CHECK: .byte 255 // 0xff
553 ; CHECK: .byte 255 // 0xff
554 ; CHECK: .byte 255 // 0xff
555 ; CHECK: .byte 255 // 0xff
556 ; CHECK: .byte 255 // 0xff
557 ; CHECK: .byte 255 // 0xff
; Builds an <8 x i8> one lane at a time with extractelement/insertelement,
; drawing from four sources of mixed width (%a and %c are <8 x i8>, %b and %d
; are <16 x i8>). Lane order is a[4], c[0], b[15], d[11], c[6], a[3], d[8],
; b[12]. The expected output does one single-register tbl over the merged
; %a:%c pair, one two-register tbl over %b/%d, and combines the two partial
; results with trn1/trn2.
558 define <8 x i8> @insert4_v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8> %d) {
559 ; CHECK-LABEL: insert4_v8i8:
561 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
562 ; CHECK-NEXT: mov v4.16b, v3.16b
563 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
564 ; CHECK-NEXT: adrp x8, .LCPI14_0
565 ; CHECK-NEXT: adrp x9, .LCPI14_1
566 ; CHECK-NEXT: mov v0.d[1], v2.d[0]
567 ; CHECK-NEXT: mov v3.16b, v1.16b
568 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI14_0]
569 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_1]
570 ; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b
571 ; CHECK-NEXT: tbl v1.16b, { v3.16b, v4.16b }, v2.16b
572 ; CHECK-NEXT: trn1 v0.4h, v1.4h, v0.4h
573 ; CHECK-NEXT: trn2 v0.4h, v0.4h, v1.4h
575 %e1 = extractelement <8 x i8> %a, i32 4
576 %e2 = extractelement <8 x i8> %c, i32 0
577 %e3 = extractelement <16 x i8> %b, i32 15
578 %e4 = extractelement <16 x i8> %d, i32 11
579 %e5 = extractelement <8 x i8> %c, i32 6
580 %e6 = extractelement <8 x i8> %a, i32 3
581 %e7 = extractelement <16 x i8> %d, i32 8
582 %e8 = extractelement <16 x i8> %b, i32 12
583 %i1 = insertelement <8 x i8> undef, i8 %e1, i32 0
584 %i2 = insertelement <8 x i8> %i1, i8 %e2, i32 1
585 %i3 = insertelement <8 x i8> %i2, i8 %e3, i32 2
586 %i4 = insertelement <8 x i8> %i3, i8 %e4, i32 3
587 %i5 = insertelement <8 x i8> %i4, i8 %e5, i32 4
588 %i6 = insertelement <8 x i8> %i5, i8 %e6, i32 5
589 %i7 = insertelement <8 x i8> %i6, i8 %e7, i32 6
590 %i8 = insertelement <8 x i8> %i7, i8 %e8, i32 7
595 ; CHECK: .byte 255 // 0xff
596 ; CHECK: .byte 255 // 0xff
597 ; CHECK: .byte 15 // 0xf
598 ; CHECK: .byte 27 // 0x1b
599 ; CHECK: .byte 255 // 0xff
600 ; CHECK: .byte 255 // 0xff
601 ; CHECK: .byte 24 // 0x18
602 ; CHECK: .byte 12 // 0xc
603 ; CHECK: .byte 255 // 0xff
604 ; CHECK: .byte 255 // 0xff
605 ; CHECK: .byte 15 // 0xf
606 ; CHECK: .byte 27 // 0x1b
607 ; CHECK: .byte 255 // 0xff
608 ; CHECK: .byte 255 // 0xff
609 ; CHECK: .byte 24 // 0x18
610 ; CHECK: .byte 12 // 0xc
612 ; CHECK: .byte 20 // 0x14
613 ; CHECK: .byte 24 // 0x18
614 ; CHECK: .byte 2 // 0x2
615 ; CHECK: .byte 3 // 0x3
616 ; CHECK: .byte 30 // 0x1e
617 ; CHECK: .byte 19 // 0x13
618 ; CHECK: .byte 6 // 0x6
619 ; CHECK: .byte 7 // 0x7
620 ; CHECK: .byte 20 // 0x14
621 ; CHECK: .byte 24 // 0x18
622 ; CHECK: .byte 10 // 0xa
623 ; CHECK: .byte 11 // 0xb
624 ; CHECK: .byte 30 // 0x1e
625 ; CHECK: .byte 19 // 0x13
626 ; CHECK: .byte 14 // 0xe
627 ; CHECK: .byte 15 // 0xf
; <16 x i8> version of the extract/insert build: the same eight-lane pattern
; (a[4], c[0], b[15], d[11], c[6], a[3], d[8], b[12]) is inserted twice, into
; lanes 0-7 and again into lanes 8-15. The expected output uses two
; two-register tbl instructions: the first gathers from %b/%d into v31, the
; second combines v31 with the merged %a:%c register.
628 define <16 x i8> @insert4_v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8> %d) {
629 ; CHECK-LABEL: insert4_v16i8:
631 ; CHECK-NEXT: mov v4.16b, v3.16b
632 ; CHECK-NEXT: adrp x8, .LCPI15_0
633 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q31_q0
634 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
635 ; CHECK-NEXT: mov v3.16b, v1.16b
636 ; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI15_0]
637 ; CHECK-NEXT: mov v0.d[1], v2.d[0]
638 ; CHECK-NEXT: adrp x8, .LCPI15_1
639 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_1]
640 ; CHECK-NEXT: tbl v31.16b, { v3.16b, v4.16b }, v5.16b
641 ; CHECK-NEXT: tbl v0.16b, { v31.16b, v0.16b }, v1.16b
643 %e1 = extractelement <8 x i8> %a, i32 4
644 %e2 = extractelement <8 x i8> %c, i32 0
645 %e3 = extractelement <16 x i8> %b, i32 15
646 %e4 = extractelement <16 x i8> %d, i32 11
647 %e5 = extractelement <8 x i8> %c, i32 6
648 %e6 = extractelement <8 x i8> %a, i32 3
649 %e7 = extractelement <16 x i8> %d, i32 8
650 %e8 = extractelement <16 x i8> %b, i32 12
651 %e9 = extractelement <8 x i8> %a, i32 4
652 %e10 = extractelement <8 x i8> %c, i32 0
653 %e11 = extractelement <16 x i8> %b, i32 15
654 %e12 = extractelement <16 x i8> %d, i32 11
655 %e13 = extractelement <8 x i8> %c, i32 6
656 %e14 = extractelement <8 x i8> %a, i32 3
657 %e15 = extractelement <16 x i8> %d, i32 8
658 %e16 = extractelement <16 x i8> %b, i32 12
659 %i1 = insertelement <16 x i8> undef, i8 %e1, i32 0
660 %i2 = insertelement <16 x i8> %i1, i8 %e2, i32 1
661 %i3 = insertelement <16 x i8> %i2, i8 %e3, i32 2
662 %i4 = insertelement <16 x i8> %i3, i8 %e4, i32 3
663 %i5 = insertelement <16 x i8> %i4, i8 %e5, i32 4
664 %i6 = insertelement <16 x i8> %i5, i8 %e6, i32 5
665 %i7 = insertelement <16 x i8> %i6, i8 %e7, i32 6
666 %i8 = insertelement <16 x i8> %i7, i8 %e8, i32 7
667 %i9 = insertelement <16 x i8> %i8, i8 %e9, i32 8
668 %i10 = insertelement <16 x i8> %i9, i8 %e10, i32 9
669 %i11 = insertelement <16 x i8> %i10, i8 %e11, i32 10
670 %i12 = insertelement <16 x i8> %i11, i8 %e12, i32 11
671 %i13 = insertelement <16 x i8> %i12, i8 %e13, i32 12
672 %i14 = insertelement <16 x i8> %i13, i8 %e14, i32 13
673 %i15 = insertelement <16 x i8> %i14, i8 %e15, i32 14
674 %i16 = insertelement <16 x i8> %i15, i8 %e16, i32 15
; Larger pipeline: each of the eight <2 x double> inputs is floored and
; converted with fptosi to <2 x i16>; the results are concatenated pairwise
; into two <8 x i16> vectors, and the final shuffle emits the even-indexed
; elements of their 16-element concatenation followed by the odd-indexed ones,
; returning a <16 x i16>. The expected output is frintm + fcvtzs + xtn per
; input, two four-register tbl instructions sharing the single mask at
; .LCPI16_0, and a uzp1/uzp2 pair producing the two result registers.
696 define <16 x i16> @test(<2 x double> %l213, <2 x double> %l231, <2 x double> %l249, <2 x double> %l267, <2 x double> %l285, <2 x double> %l303, <2 x double> %l321, <2 x double> %l339) {
699 ; CHECK-NEXT: frintm v0.2d, v0.2d
700 ; CHECK-NEXT: frintm v4.2d, v4.2d
701 ; CHECK-NEXT: adrp x8, .LCPI16_0
702 ; CHECK-NEXT: frintm v1.2d, v1.2d
703 ; CHECK-NEXT: frintm v5.2d, v5.2d
704 ; CHECK-NEXT: frintm v2.2d, v2.2d
705 ; CHECK-NEXT: frintm v6.2d, v6.2d
706 ; CHECK-NEXT: frintm v3.2d, v3.2d
707 ; CHECK-NEXT: frintm v7.2d, v7.2d
708 ; CHECK-NEXT: fcvtzs v0.2d, v0.2d
709 ; CHECK-NEXT: fcvtzs v4.2d, v4.2d
710 ; CHECK-NEXT: fcvtzs v1.2d, v1.2d
711 ; CHECK-NEXT: fcvtzs v5.2d, v5.2d
712 ; CHECK-NEXT: fcvtzs v2.2d, v2.2d
713 ; CHECK-NEXT: fcvtzs v6.2d, v6.2d
714 ; CHECK-NEXT: fcvtzs v3.2d, v3.2d
715 ; CHECK-NEXT: fcvtzs v7.2d, v7.2d
716 ; CHECK-NEXT: xtn v16.2s, v0.2d
717 ; CHECK-NEXT: xtn v20.2s, v4.2d
718 ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI16_0]
719 ; CHECK-NEXT: xtn v17.2s, v1.2d
720 ; CHECK-NEXT: xtn v21.2s, v5.2d
721 ; CHECK-NEXT: xtn v18.2s, v2.2d
722 ; CHECK-NEXT: xtn v22.2s, v6.2d
723 ; CHECK-NEXT: xtn v19.2s, v3.2d
724 ; CHECK-NEXT: xtn v23.2s, v7.2d
725 ; CHECK-NEXT: tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
726 ; CHECK-NEXT: tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b
727 ; CHECK-NEXT: uzp1 v0.8h, v1.8h, v2.8h
728 ; CHECK-NEXT: uzp2 v1.8h, v1.8h, v2.8h
730 %l214 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l213)
731 %l215 = fptosi <2 x double> %l214 to <2 x i16>
732 %l232 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l231)
733 %l233 = fptosi <2 x double> %l232 to <2 x i16>
734 %l250 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l249)
735 %l251 = fptosi <2 x double> %l250 to <2 x i16>
736 %l268 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l267)
737 %l269 = fptosi <2 x double> %l268 to <2 x i16>
738 %l286 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l285)
739 %l287 = fptosi <2 x double> %l286 to <2 x i16>
740 %l304 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l303)
741 %l305 = fptosi <2 x double> %l304 to <2 x i16>
742 %l322 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l321)
743 %l323 = fptosi <2 x double> %l322 to <2 x i16>
744 %l340 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l339)
745 %l341 = fptosi <2 x double> %l340 to <2 x i16>
746 %l342 = shufflevector <2 x i16> %l215, <2 x i16> %l233, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
747 %l343 = shufflevector <2 x i16> %l251, <2 x i16> %l269, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
748 %l344 = shufflevector <2 x i16> %l287, <2 x i16> %l305, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
749 %l345 = shufflevector <2 x i16> %l323, <2 x i16> %l341, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
750 %l346 = shufflevector <4 x i16> %l342, <4 x i16> %l343, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
751 %l347 = shufflevector <4 x i16> %l344, <4 x i16> %l345, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
752 %interleaved.vec = shufflevector <8 x i16> %l346, <8 x i16> %l347, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
753 ret <16 x i16> %interleaved.vec
; Declaration of the <2 x double> floor intrinsic called eight times by @test.
756 declare <2 x double> @llvm.floor.v2f64(<2 x double> %l213)