1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
5 ; CHECK: .byte 0 // 0x0
6 ; CHECK: .byte 16 // 0x10
7 ; CHECK: .byte 32 // 0x20
8 ; CHECK: .byte 48 // 0x30
9 ; CHECK: .byte 2 // 0x2
10 ; CHECK: .byte 18 // 0x12
11 ; CHECK: .byte 34 // 0x22
12 ; CHECK: .byte 50 // 0x32
13 ; CHECK: .byte 4 // 0x4
14 ; CHECK: .byte 20 // 0x14
15 ; CHECK: .byte 36 // 0x24
16 ; CHECK: .byte 52 // 0x34
17 ; CHECK: .byte 6 // 0x6
18 ; CHECK: .byte 22 // 0x16
19 ; CHECK: .byte 38 // 0x26
20 ; CHECK: .byte 54 // 0x36
; Four <4 x i8> inputs are concatenated pairwise (%x = a ++ b, %y = c ++ d)
; and then interleaved element-wise (a0,b0,c0,d0,a1,b1,...) into a <16 x i8>.
; The expected assembly is a single four-register tbl whose index vector is
; loaded from the constant pool.
; NOTE(review): the mask bytes asserted above this function step by 2 within
; one source and by 16 between sources, presumably because every <4 x i8>
; argument is promoted to 16-bit lanes in its own vector register - confirm.
21 define <16 x i8> @shuffle4_v4i8_16(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
22 ; CHECK-LABEL: shuffle4_v4i8_16:
24 ; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
25 ; CHECK-NEXT: adrp x8, .LCPI0_0
26 ; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
27 ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI0_0]
28 ; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
29 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
30 ; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
; Identity masks: %x and %y are plain concatenations of their two operands.
32 %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
33 %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; In this mask indices 0-3 select a, 4-7 select b, 8-11 select c, 12-15 select d.
34 %z = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
39 ; CHECK: .byte 0 // 0x0
40 ; CHECK: .byte 16 // 0x10
41 ; CHECK: .byte 32 // 0x20
42 ; CHECK: .byte 48 // 0x30
43 ; CHECK: .byte 2 // 0x2
44 ; CHECK: .byte 18 // 0x12
45 ; CHECK: .byte 34 // 0x22
46 ; CHECK: .byte 50 // 0x32
; Same four-input interleave as the 16-lane variant, but only the first eight
; result lanes (a0,b0,c0,d0,a1,b1,c1,d1) are produced, giving an <8 x i8>.
; The expected assembly is one four-register tbl with an 8-byte (d-register)
; index vector and an 8b destination.
47 define <8 x i8> @shuffle4_v4i8_8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
48 ; CHECK-LABEL: shuffle4_v4i8_8:
50 ; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
51 ; CHECK-NEXT: adrp x8, .LCPI1_0
52 ; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
53 ; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI1_0]
54 ; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
55 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
56 ; CHECK-NEXT: tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
; Identity masks: %x = a ++ b and %y = c ++ d.
58 %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
59 %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Only elements 0 and 1 of each of the four sources are selected.
60 %z = shufflevector <8 x i8> %x, <8 x i8> %y, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13>
65 ; CHECK: .byte 0 // 0x0
66 ; CHECK: .byte 3 // 0x3
67 ; CHECK: .byte 2 // 0x2
68 ; CHECK: .byte 1 // 0x1
69 ; CHECK: .byte 12 // 0xc
70 ; CHECK: .byte 15 // 0xf
71 ; CHECK: .byte 14 // 0xe
72 ; CHECK: .byte 12 // 0xc
74 ; CHECK: .byte 4 // 0x4
75 ; CHECK: .byte 7 // 0x7
76 ; CHECK: .byte 6 // 0x6
77 ; CHECK: .byte 7 // 0x7
78 ; CHECK: .byte 8 // 0x8
79 ; CHECK: .byte 10 // 0xa
80 ; CHECK: .byte 9 // 0x9
81 ; CHECK: .byte 11 // 0xb
82 ; CHECK: .section .rodata.cst16,"aM",@progbits,16
85 ; CHECK: .byte 0 // 0x0
86 ; CHECK: .byte 4 // 0x4
87 ; CHECK: .byte 16 // 0x10
88 ; CHECK: .byte 20 // 0x14
89 ; CHECK: .byte 1 // 0x1
90 ; CHECK: .byte 5 // 0x5
91 ; CHECK: .byte 17 // 0x11
92 ; CHECK: .byte 21 // 0x15
93 ; CHECK: .byte 2 // 0x2
94 ; CHECK: .byte 6 // 0x6
95 ; CHECK: .byte 18 // 0x12
96 ; CHECK: .byte 22 // 0x16
97 ; CHECK: .byte 3 // 0x3
98 ; CHECK: .byte 7 // 0x7
99 ; CHECK: .byte 19 // 0x13
100 ; CHECK: .byte 23 // 0x17
; Four <8 x i8> inputs with non-identity inner shuffles: %x permutes lanes
; taken from a and b, %y permutes lanes taken from c and d, and %z then
; interleaves %x and %y. The expected assembly uses three constant-pool masks:
; one single-register 8b tbl per inner shuffle (after the pairs are merged
; with mov v.d[1]) followed by a two-register 16b tbl for the outer shuffle.
101 define <16 x i8> @shuffle4_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
102 ; CHECK-LABEL: shuffle4_v8i8:
104 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
105 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
106 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
107 ; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
108 ; CHECK-NEXT: adrp x8, .LCPI2_0
109 ; CHECK-NEXT: mov v0.d[1], v1.d[0]
110 ; CHECK-NEXT: mov v2.d[1], v3.d[0]
111 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI2_0]
112 ; CHECK-NEXT: adrp x8, .LCPI2_1
113 ; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI2_1]
114 ; CHECK-NEXT: adrp x8, .LCPI2_2
115 ; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b
116 ; CHECK-NEXT: tbl v1.8b, { v2.16b }, v3.8b
117 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_2]
118 ; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; %x = a0,a3,a2,a1,b4,b7,b6,b4 (indices 8-15 address the second operand).
120 %x = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 1, i32 12, i32 15, i32 14, i32 12>
; %y = c4,c7,c6,c7,d0,d2,d1,d3.
121 %y = shufflevector <8 x i8> %c, <8 x i8> %d, <8 x i32> <i32 4, i32 7, i32 6, i32 7, i32 8, i32 10, i32 9, i32 11>
; Outer interleave: x0,x4,y0,y4,x1,x5,y1,y5,...
122 %z = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
127 ; CHECK: .byte 0 // 0x0
128 ; CHECK: .byte 3 // 0x3
129 ; CHECK: .byte 2 // 0x2
130 ; CHECK: .byte 1 // 0x1
131 ; CHECK: .byte 12 // 0xc
132 ; CHECK: .byte 15 // 0xf
133 ; CHECK: .byte 14 // 0xe
134 ; CHECK: .byte 12 // 0xc
135 ; CHECK: .byte 255 // 0xff
136 ; CHECK: .byte 255 // 0xff
137 ; CHECK: .byte 255 // 0xff
138 ; CHECK: .byte 255 // 0xff
139 ; CHECK: .byte 255 // 0xff
140 ; CHECK: .byte 255 // 0xff
141 ; CHECK: .byte 255 // 0xff
142 ; CHECK: .byte 255 // 0xff
144 ; CHECK: .byte 4 // 0x4
145 ; CHECK: .byte 7 // 0x7
146 ; CHECK: .byte 6 // 0x6
147 ; CHECK: .byte 7 // 0x7
148 ; CHECK: .byte 8 // 0x8
149 ; CHECK: .byte 10 // 0xa
150 ; CHECK: .byte 9 // 0x9
151 ; CHECK: .byte 11 // 0xb
152 ; CHECK: .byte 255 // 0xff
153 ; CHECK: .byte 255 // 0xff
154 ; CHECK: .byte 255 // 0xff
155 ; CHECK: .byte 255 // 0xff
156 ; CHECK: .byte 255 // 0xff
157 ; CHECK: .byte 255 // 0xff
158 ; CHECK: .byte 255 // 0xff
159 ; CHECK: .byte 255 // 0xff
161 ; CHECK: .byte 16 // 0x10
162 ; CHECK: .byte 20 // 0x14
163 ; CHECK: .byte 0 // 0x0
164 ; CHECK: .byte 4 // 0x4
165 ; CHECK: .byte 17 // 0x11
166 ; CHECK: .byte 21 // 0x15
167 ; CHECK: .byte 1 // 0x1
168 ; CHECK: .byte 5 // 0x5
169 ; CHECK: .byte 18 // 0x12
170 ; CHECK: .byte 22 // 0x16
171 ; CHECK: .byte 2 // 0x2
172 ; CHECK: .byte 6 // 0x6
173 ; CHECK: .byte 19 // 0x13
174 ; CHECK: .byte 23 // 0x17
175 ; CHECK: .byte 3 // 0x3
176 ; CHECK: .byte 7 // 0x7
; Same masks as the <8 x i8> variant but with <16 x i8> inputs. Because every
; inner mask index is below 16, %x reads only %a and %y reads only %c; %b and
; %d are never selected. Accordingly the expected assembly applies one
; single-register 16b tbl to v0 and one to v2, then combines the two results
; with a two-register tbl.
177 define <16 x i8> @shuffle4_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
178 ; CHECK-LABEL: shuffle4_v16i8:
180 ; CHECK-NEXT: adrp x8, .LCPI3_0
181 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
182 ; CHECK-NEXT: adrp x8, .LCPI3_1
183 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_1]
184 ; CHECK-NEXT: adrp x8, .LCPI3_2
185 ; CHECK-NEXT: tbl v1.16b, { v0.16b }, v1.16b
186 ; CHECK-NEXT: tbl v0.16b, { v2.16b }, v3.16b
187 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_2]
188 ; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; %x = a0,a3,a2,a1,a12,a15,a14,a12 (all indices fall in the first operand).
190 %x = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 1, i32 12, i32 15, i32 14, i32 12>
; %y = c4,c7,c6,c7,c8,c10,c9,c11.
191 %y = shufflevector <16 x i8> %c, <16 x i8> %d, <8 x i32> <i32 4, i32 7, i32 6, i32 7, i32 8, i32 10, i32 9, i32 11>
; Outer interleave: x0,x4,y0,y4,x1,x5,y1,y5,...
192 %z = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
197 ; CHECK: .byte 0 // 0x0
198 ; CHECK: .byte 1 // 0x1
199 ; CHECK: .byte 8 // 0x8
200 ; CHECK: .byte 9 // 0x9
201 ; CHECK: .byte 16 // 0x10
202 ; CHECK: .byte 17 // 0x11
203 ; CHECK: .byte 24 // 0x18
204 ; CHECK: .byte 25 // 0x19
205 ; CHECK: .byte 2 // 0x2
206 ; CHECK: .byte 3 // 0x3
207 ; CHECK: .byte 10 // 0xa
208 ; CHECK: .byte 11 // 0xb
209 ; CHECK: .byte 18 // 0x12
210 ; CHECK: .byte 19 // 0x13
211 ; CHECK: .byte 26 // 0x1a
212 ; CHECK: .byte 27 // 0x1b
; 16-bit element variant: four <4 x i16> inputs are concatenated pairwise and
; the first two elements of each source are interleaved (a0,b0,c0,d0,a1,b1,
; c1,d1) into an <8 x i16>. The expected assembly merges each pair into one
; q register with mov v.d[1] and then issues a two-register byte tbl.
213 define <8 x i16> @shuffle4_v8i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
214 ; CHECK-LABEL: shuffle4_v8i16:
216 ; CHECK-NEXT: fmov d5, d2
217 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
218 ; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
219 ; CHECK-NEXT: adrp x8, .LCPI4_0
220 ; CHECK-NEXT: fmov d4, d0
221 ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI4_0]
222 ; CHECK-NEXT: mov v4.d[1], v1.d[0]
223 ; CHECK-NEXT: mov v5.d[1], v3.d[0]
224 ; CHECK-NEXT: tbl v0.16b, { v4.16b, v5.16b }, v0.16b
; Identity masks: %x = a ++ b and %y = c ++ d.
226 %x = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
227 %y = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
228 %z = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13>
; 32-bit element variant: the result picks one element from each of the four
; <4 x i32> inputs, in the order d3, c2, b1, a0. No constant-pool mask is
; expected here; the asserted sequence is built from zip1/rev64/ext/zip2 and
; a final 64-bit lane move instead of a tbl.
232 define <4 x i32> @shuffle4_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
233 ; CHECK-LABEL: shuffle4_v4i32:
235 ; CHECK-NEXT: zip1 v1.4s, v1.4s, v1.4s
236 ; CHECK-NEXT: rev64 v3.4s, v3.4s
237 ; CHECK-NEXT: ext v1.16b, v1.16b, v0.16b, #4
238 ; CHECK-NEXT: zip2 v0.4s, v3.4s, v2.4s
239 ; CHECK-NEXT: mov v0.d[1], v1.d[1]
; Identity masks: %x = a ++ b and %y = c ++ d.
241 %x = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
242 %y = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; 15 -> d3, 10 -> c2, 5 -> b1, 0 -> a0.
243 %z = shufflevector <8 x i32> %x, <8 x i32> %y, <4 x i32> <i32 15, i32 10, i32 5, i32 0>
248 ; CHECK: .byte 0 // 0x0
249 ; CHECK: .byte 7 // 0x7
250 ; CHECK: .byte 255 // 0xff
251 ; CHECK: .byte 1 // 0x1
252 ; CHECK: .byte 255 // 0xff
253 ; CHECK: .byte 255 // 0xff
254 ; CHECK: .byte 255 // 0xff
255 ; CHECK: .byte 255 // 0xff
256 ; CHECK: .section .rodata.cst16,"aM",@progbits,16
259 ; CHECK: .byte 0 // 0x0
260 ; CHECK: .byte 16 // 0x10
261 ; CHECK: .byte 19 // 0x13
262 ; CHECK: .byte 3 // 0x3
263 ; CHECK: .byte 1 // 0x1
264 ; CHECK: .byte 17 // 0x11
265 ; CHECK: .byte 0 // 0x0
266 ; CHECK: .byte 1 // 0x1
267 ; CHECK: .byte 0 // 0x0
268 ; CHECK: .byte 16 // 0x10
269 ; CHECK: .byte 19 // 0x13
270 ; CHECK: .byte 3 // 0x3
271 ; CHECK: .byte 1 // 0x1
272 ; CHECK: .byte 17 // 0x11
273 ; CHECK: .byte 0 // 0x0
274 ; CHECK: .byte 1 // 0x1
; Narrowing inner shuffles feeding a widening outer shuffle: %x and %y each
; pick four lanes (0,7,5,1) and, since all indices are below 8, read only %a
; and %c respectively; %b and %d are never selected. The outer shuffle expands
; the two <4 x i8> values to <16 x i8> with an 8-lane pattern repeated twice.
; The expected assembly loads one 8-byte mask that is shared by both inner
; 8b tbl instructions, then uses a two-register 16b tbl for the outer shuffle.
275 define <16 x i8> @shuffle4_v8i8_v16i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
276 ; CHECK-LABEL: shuffle4_v8i8_v16i8:
278 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
279 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
280 ; CHECK-NEXT: adrp x8, .LCPI6_0
281 ; CHECK-NEXT: mov v2.d[1], v2.d[0]
282 ; CHECK-NEXT: mov v0.d[1], v0.d[0]
283 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI6_0]
284 ; CHECK-NEXT: adrp x8, .LCPI6_1
285 ; CHECK-NEXT: tbl v3.8b, { v2.16b }, v1.8b
286 ; CHECK-NEXT: tbl v2.8b, { v0.16b }, v1.8b
287 ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI6_1]
288 ; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b
; %x = a0,a7,a5,a1 and %y = c0,c7,c5,c1.
290 %x = shufflevector <8 x i8> %a, <8 x i8> %b, <4 x i32> <i32 0, i32 7, i32 5, i32 1>
291 %y = shufflevector <8 x i8> %c, <8 x i8> %d, <4 x i32> <i32 0, i32 7, i32 5, i32 1>
; Indices 0-3 select %x, 4-7 select %y: x0,y0,y3,x3,x1,y1,x0,x1 (twice).
292 %z = shufflevector <4 x i8> %x, <4 x i8> %y, <16 x i32> <i32 0, i32 4, i32 7, i32 3, i32 1, i32 5, i32 0, i32 1, i32 0, i32 4, i32 7, i32 3, i32 1, i32 5, i32 0, i32 1>
297 ; CHECK: .byte 0 // 0x0
298 ; CHECK: .byte 7 // 0x7
299 ; CHECK: .byte 255 // 0xff
300 ; CHECK: .byte 1 // 0x1
301 ; CHECK: .byte 255 // 0xff
302 ; CHECK: .byte 255 // 0xff
303 ; CHECK: .byte 255 // 0xff
304 ; CHECK: .byte 255 // 0xff
306 ; CHECK: .byte 0 // 0x0
307 ; CHECK: .byte 8 // 0x8
308 ; CHECK: .byte 11 // 0xb
309 ; CHECK: .byte 3 // 0x3
310 ; CHECK: .byte 1 // 0x1
311 ; CHECK: .byte 9 // 0x9
312 ; CHECK: .byte 0 // 0x0
313 ; CHECK: .byte 1 // 0x1
; Same inner shuffles as the <16 x i8>-result variant (only %a and %c are
; actually read), but the outer shuffle yields just eight lanes. The expected
; assembly runs two 8b tbl instructions with a shared mask, joins the results
; with mov v0.d[1], and finishes with a single-register 8b tbl.
314 define <8 x i8> @shuffle4_v8i8_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
315 ; CHECK-LABEL: shuffle4_v8i8_v8i8:
317 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
318 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
319 ; CHECK-NEXT: adrp x8, .LCPI7_0
320 ; CHECK-NEXT: mov v2.d[1], v2.d[0]
321 ; CHECK-NEXT: mov v0.d[1], v0.d[0]
322 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI7_0]
323 ; CHECK-NEXT: adrp x8, .LCPI7_1
324 ; CHECK-NEXT: tbl v2.8b, { v2.16b }, v1.8b
325 ; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b
326 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI7_1]
327 ; CHECK-NEXT: mov v0.d[1], v2.d[0]
328 ; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b
; %x = a0,a7,a5,a1 and %y = c0,c7,c5,c1.
330 %x = shufflevector <8 x i8> %a, <8 x i8> %b, <4 x i32> <i32 0, i32 7, i32 5, i32 1>
331 %y = shufflevector <8 x i8> %c, <8 x i8> %d, <4 x i32> <i32 0, i32 7, i32 5, i32 1>
; Indices 0-3 select %x, 4-7 select %y: x0,y0,y3,x3,x1,y1,x0,x1.
332 %z = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32> <i32 0, i32 4, i32 7, i32 3, i32 1, i32 5, i32 0, i32 1>
337 ; CHECK: .byte 0 // 0x0
338 ; CHECK: .byte 1 // 0x1
339 ; CHECK: .byte 8 // 0x8
340 ; CHECK: .byte 9 // 0x9
341 ; CHECK: .byte 16 // 0x10
342 ; CHECK: .byte 17 // 0x11
343 ; CHECK: .byte 24 // 0x18
344 ; CHECK: .byte 25 // 0x19
345 ; CHECK: .byte 2 // 0x2
346 ; CHECK: .byte 3 // 0x3
347 ; CHECK: .byte 10 // 0xa
348 ; CHECK: .byte 11 // 0xb
349 ; CHECK: .byte 18 // 0x12
350 ; CHECK: .byte 19 // 0x13
351 ; CHECK: .byte 26 // 0x1a
352 ; CHECK: .byte 27 // 0x1b
; Interleave with a zero-extension between the concatenation and the outer
; shuffle: the two <8 x i8> concatenations are widened to <8 x i16> before
; being interleaved. The expected assembly clears the upper byte of every
; 16-bit lane with bic (#255, lsl #8) on both merged registers and then
; performs a two-register tbl.
353 define <8 x i16> @shuffle4_v4i8_zext(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
354 ; CHECK-LABEL: shuffle4_v4i8_zext:
356 ; CHECK-NEXT: fmov d5, d2
357 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
358 ; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
359 ; CHECK-NEXT: adrp x8, .LCPI8_0
360 ; CHECK-NEXT: fmov d4, d0
361 ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI8_0]
362 ; CHECK-NEXT: mov v4.d[1], v1.d[0]
363 ; CHECK-NEXT: mov v5.d[1], v3.d[0]
364 ; CHECK-NEXT: bic v4.8h, #255, lsl #8
365 ; CHECK-NEXT: bic v5.8h, #255, lsl #8
366 ; CHECK-NEXT: tbl v0.16b, { v4.16b, v5.16b }, v0.16b
; Identity masks: %x = a ++ b and %y = c ++ d.
368 %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
369 %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Widen both halves to 16-bit lanes before the interleave.
370 %xe = zext <8 x i8> %x to <8 x i16>
371 %ye = zext <8 x i8> %y to <8 x i16>
; a0,b0,c0,d0,a1,b1,c1,d1 of the zero-extended values.
372 %z = shufflevector <8 x i16> %xe, <8 x i16> %ye, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13>
377 ; CHECK: .byte 0 // 0x0
378 ; CHECK: .byte 16 // 0x10
379 ; CHECK: .byte 32 // 0x20
380 ; CHECK: .byte 48 // 0x30
381 ; CHECK: .byte 2 // 0x2
382 ; CHECK: .byte 18 // 0x12
383 ; CHECK: .byte 34 // 0x22
384 ; CHECK: .byte 50 // 0x32
385 ; CHECK: .byte 4 // 0x4
386 ; CHECK: .byte 20 // 0x14
387 ; CHECK: .byte 36 // 0x24
388 ; CHECK: .byte 52 // 0x34
389 ; CHECK: .byte 6 // 0x6
390 ; CHECK: .byte 22 // 0x16
391 ; CHECK: .byte 38 // 0x26
392 ; CHECK: .byte 54 // 0x36
; Interleave of four inputs that are first truncated from <4 x i16> to
; <4 x i8>. The expected assembly contains no separate narrowing instruction:
; the four argument registers feed a single four-register tbl directly.
393 define <16 x i8> @shuffle4_v4i16_trunc(<4 x i16> %ae, <4 x i16> %be, <4 x i16> %ce, <4 x i16> %de) {
394 ; CHECK-LABEL: shuffle4_v4i16_trunc:
396 ; CHECK-NEXT: // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
397 ; CHECK-NEXT: adrp x8, .LCPI9_0
398 ; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
399 ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI9_0]
400 ; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
401 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
402 ; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
; Narrow each 16-bit input to 8-bit elements.
404 %a = trunc <4 x i16> %ae to <4 x i8>
405 %b = trunc <4 x i16> %be to <4 x i8>
406 %c = trunc <4 x i16> %ce to <4 x i8>
407 %d = trunc <4 x i16> %de to <4 x i8>
; Identity masks: %x = a ++ b and %y = c ++ d.
408 %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
409 %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Element-wise interleave a0,b0,c0,d0,a1,...
410 %z = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
415 ; CHECK: .byte 0 // 0x0
416 ; CHECK: .byte 16 // 0x10
417 ; CHECK: .byte 32 // 0x20
418 ; CHECK: .byte 48 // 0x30
419 ; CHECK: .byte 2 // 0x2
420 ; CHECK: .byte 18 // 0x12
421 ; CHECK: .byte 34 // 0x22
422 ; CHECK: .byte 50 // 0x32
423 ; CHECK: .byte 4 // 0x4
424 ; CHECK: .byte 20 // 0x14
425 ; CHECK: .byte 36 // 0x24
426 ; CHECK: .byte 52 // 0x34
427 ; CHECK: .byte 6 // 0x6
428 ; CHECK: .byte 22 // 0x16
429 ; CHECK: .byte 38 // 0x26
430 ; CHECK: .byte 54 // 0x36
; Interleave of four inputs truncated from <4 x i32> to <4 x i8>. Unlike the
; <4 x i16> source variant, the expected assembly narrows every input with
; xtn (4s -> 4h) into consecutive registers v4-v7 before the four-register tbl.
432 define <16 x i8> @shuffle4_v4i32_trunc(<4 x i32> %ae, <4 x i32> %be, <4 x i32> %ce, <4 x i32> %de) {
433 ; CHECK-LABEL: shuffle4_v4i32_trunc:
435 ; CHECK-NEXT: xtn v4.4h, v0.4s
436 ; CHECK-NEXT: adrp x8, .LCPI10_0
437 ; CHECK-NEXT: xtn v5.4h, v1.4s
438 ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI10_0]
439 ; CHECK-NEXT: xtn v6.4h, v2.4s
440 ; CHECK-NEXT: xtn v7.4h, v3.4s
441 ; CHECK-NEXT: tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b
; Narrow each 32-bit input to 8-bit elements.
443 %a = trunc <4 x i32> %ae to <4 x i8>
444 %b = trunc <4 x i32> %be to <4 x i8>
445 %c = trunc <4 x i32> %ce to <4 x i8>
446 %d = trunc <4 x i32> %de to <4 x i8>
; Identity masks: %x = a ++ b and %y = c ++ d.
447 %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
448 %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Element-wise interleave a0,b0,c0,d0,a1,...
449 %z = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
454 ; CHECK: .byte 0 // 0x0
455 ; CHECK: .byte 16 // 0x10
456 ; CHECK: .byte 32 // 0x20
457 ; CHECK: .byte 2 // 0x2
458 ; CHECK: .byte 18 // 0x12
459 ; CHECK: .byte 34 // 0x22
460 ; CHECK: .byte 4 // 0x4
461 ; CHECK: .byte 20 // 0x14
462 ; CHECK: .byte 36 // 0x24
463 ; CHECK: .byte 6 // 0x6
464 ; CHECK: .byte 22 // 0x16
465 ; CHECK: .byte 38 // 0x26
466 ; CHECK: .byte 255 // 0xff
467 ; CHECK: .byte 255 // 0xff
468 ; CHECK: .byte 255 // 0xff
469 ; CHECK: .byte 255 // 0xff
; Three-input interleave: %x = a ++ b, %y = c ++ undef, and the outer mask
; selects a0,b0,c0,a1,b1,c1,... into a <12 x i8>; the undef half of %y is
; never addressed (the highest index used is 11). The expected assembly is a
; single three-register tbl.
470 define <12 x i8> @shuffle3_v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) {
471 ; CHECK-LABEL: shuffle3_v4i8:
473 ; CHECK-NEXT: // kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2
474 ; CHECK-NEXT: adrp x8, .LCPI11_0
475 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_0]
476 ; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1_q2 def $q0_q1_q2
477 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1_q2 def $q0_q1_q2
478 ; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v3.16b
480 %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; The third input is padded with undef to match the width of %x.
481 %y = shufflevector <4 x i8> %c, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Indices 0-3 select a, 4-7 select b, 8-11 select c.
482 %z = shufflevector <8 x i8> %x, <8 x i8> %y, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
487 ; CHECK: .byte 0 // 0x0
488 ; CHECK: .byte 1 // 0x1
489 ; CHECK: .byte 8 // 0x8
490 ; CHECK: .byte 9 // 0x9
491 ; CHECK: .byte 16 // 0x10
492 ; CHECK: .byte 17 // 0x11
493 ; CHECK: .byte 2 // 0x2
494 ; CHECK: .byte 3 // 0x3
495 ; CHECK: .byte 10 // 0xa
496 ; CHECK: .byte 11 // 0xb
497 ; CHECK: .byte 18 // 0x12
498 ; CHECK: .byte 19 // 0x13
499 ; CHECK: .byte 4 // 0x4
500 ; CHECK: .byte 5 // 0x5
501 ; CHECK: .byte 12 // 0xc
502 ; CHECK: .byte 13 // 0xd
; Three-input interleave with 16-bit elements: the result is
; a0,b0,c0,a1,b1,c1,a2,b2. The expected assembly merges a and b into one q
; register (mov v2.d[1]), keeps c on its own, and uses a two-register tbl.
503 define <8 x i16> @shuffle3_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) {
504 ; CHECK-LABEL: shuffle3_v4i16:
506 ; CHECK-NEXT: fmov d3, d2
507 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
508 ; CHECK-NEXT: adrp x8, .LCPI12_0
509 ; CHECK-NEXT: fmov d2, d0
510 ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI12_0]
511 ; CHECK-NEXT: mov v2.d[1], v1.d[0]
512 ; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b
514 %x = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; The third input is padded with undef to match the width of %x.
515 %y = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Indices 0-3 select a, 4-7 select b, 8-11 select c.
516 %z = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6>
; Three-input variant with 32-bit elements: the result is a0,b0,c0,a1. No
; table lookup is expected; the asserted sequence is trn1 plus two lane moves
; and a final register copy into v0.
520 define <4 x i32> @shuffle3_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
521 ; CHECK-LABEL: shuffle3_v4i32:
523 ; CHECK-NEXT: trn1 v1.4s, v0.4s, v1.4s
524 ; CHECK-NEXT: mov v1.d[1], v0.d[0]
525 ; CHECK-NEXT: mov v1.s[2], v2.s[0]
526 ; CHECK-NEXT: mov v0.16b, v1.16b
528 %x = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; The third input is padded with undef to match the width of %x.
529 %y = shufflevector <4 x i32> %c, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; 0 -> a0, 4 -> b0, 8 -> c0, 1 -> a1.
530 %z = shufflevector <8 x i32> %x, <8 x i32> %y, <4 x i32> <i32 0, i32 4, i32 8, i32 1>
535 ; CHECK: .byte 4 // 0x4
536 ; CHECK: .byte 8 // 0x8
537 ; CHECK: .byte 255 // 0xff
538 ; CHECK: .byte 255 // 0xff
539 ; CHECK: .byte 14 // 0xe
540 ; CHECK: .byte 3 // 0x3
541 ; CHECK: .byte 255 // 0xff
542 ; CHECK: .byte 255 // 0xff
543 ; CHECK: .section .rodata.cst16,"aM",@progbits,16
546 ; CHECK: .byte 255 // 0xff
547 ; CHECK: .byte 255 // 0xff
548 ; CHECK: .byte 15 // 0xf
549 ; CHECK: .byte 27 // 0x1b
550 ; CHECK: .byte 255 // 0xff
551 ; CHECK: .byte 255 // 0xff
552 ; CHECK: .byte 24 // 0x18
553 ; CHECK: .byte 12 // 0xc
554 ; CHECK: .byte 255 // 0xff
555 ; CHECK: .byte 255 // 0xff
556 ; CHECK: .byte 255 // 0xff
557 ; CHECK: .byte 255 // 0xff
558 ; CHECK: .byte 255 // 0xff
559 ; CHECK: .byte 255 // 0xff
560 ; CHECK: .byte 255 // 0xff
561 ; CHECK: .byte 255 // 0xff
; Builds an <8 x i8> one lane at a time with extractelement/insertelement
; from four sources of mixed width (a, c: <8 x i8>; b, d: <16 x i8>). The
; resulting lanes are a4, c0, b15, d11, c6, a3, d8, b12. The expected assembly
; uses one 8b tbl over the merged a/c register, one two-register tbl over
; b and d, and then combines the two partial results with trn1/trn2.
562 define <8 x i8> @insert4_v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8> %d) {
563 ; CHECK-LABEL: insert4_v8i8:
565 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
566 ; CHECK-NEXT: mov v4.16b, v3.16b
567 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
568 ; CHECK-NEXT: adrp x8, .LCPI14_0
569 ; CHECK-NEXT: adrp x9, .LCPI14_1
570 ; CHECK-NEXT: mov v0.d[1], v2.d[0]
571 ; CHECK-NEXT: mov v3.16b, v1.16b
572 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI14_0]
573 ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_1]
574 ; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b
575 ; CHECK-NEXT: tbl v1.16b, { v3.16b, v4.16b }, v2.16b
576 ; CHECK-NEXT: trn1 v0.4h, v1.4h, v0.4h
577 ; CHECK-NEXT: trn2 v0.4h, v0.4h, v1.4h
; Scalar extracts, listed in destination-lane order.
579 %e1 = extractelement <8 x i8> %a, i32 4
580 %e2 = extractelement <8 x i8> %c, i32 0
581 %e3 = extractelement <16 x i8> %b, i32 15
582 %e4 = extractelement <16 x i8> %d, i32 11
583 %e5 = extractelement <8 x i8> %c, i32 6
584 %e6 = extractelement <8 x i8> %a, i32 3
585 %e7 = extractelement <16 x i8> %d, i32 8
586 %e8 = extractelement <16 x i8> %b, i32 12
; Insert chain: %eN goes into lane N-1, starting from an undef vector.
587 %i1 = insertelement <8 x i8> undef, i8 %e1, i32 0
588 %i2 = insertelement <8 x i8> %i1, i8 %e2, i32 1
589 %i3 = insertelement <8 x i8> %i2, i8 %e3, i32 2
590 %i4 = insertelement <8 x i8> %i3, i8 %e4, i32 3
591 %i5 = insertelement <8 x i8> %i4, i8 %e5, i32 4
592 %i6 = insertelement <8 x i8> %i5, i8 %e6, i32 5
593 %i7 = insertelement <8 x i8> %i6, i8 %e7, i32 6
594 %i8 = insertelement <8 x i8> %i7, i8 %e8, i32 7
599 ; CHECK: .byte 255 // 0xff
600 ; CHECK: .byte 255 // 0xff
601 ; CHECK: .byte 15 // 0xf
602 ; CHECK: .byte 27 // 0x1b
603 ; CHECK: .byte 255 // 0xff
604 ; CHECK: .byte 255 // 0xff
605 ; CHECK: .byte 24 // 0x18
606 ; CHECK: .byte 12 // 0xc
607 ; CHECK: .byte 255 // 0xff
608 ; CHECK: .byte 255 // 0xff
609 ; CHECK: .byte 15 // 0xf
610 ; CHECK: .byte 27 // 0x1b
611 ; CHECK: .byte 255 // 0xff
612 ; CHECK: .byte 255 // 0xff
613 ; CHECK: .byte 24 // 0x18
614 ; CHECK: .byte 12 // 0xc
616 ; CHECK: .byte 20 // 0x14
617 ; CHECK: .byte 24 // 0x18
618 ; CHECK: .byte 2 // 0x2
619 ; CHECK: .byte 3 // 0x3
620 ; CHECK: .byte 30 // 0x1e
621 ; CHECK: .byte 19 // 0x13
622 ; CHECK: .byte 6 // 0x6
623 ; CHECK: .byte 7 // 0x7
624 ; CHECK: .byte 20 // 0x14
625 ; CHECK: .byte 24 // 0x18
626 ; CHECK: .byte 10 // 0xa
627 ; CHECK: .byte 11 // 0xb
628 ; CHECK: .byte 30 // 0x1e
629 ; CHECK: .byte 19 // 0x13
630 ; CHECK: .byte 14 // 0xe
631 ; CHECK: .byte 15 // 0xf
; 16-lane version of the extract/insert test: the same eight-lane selection
; (a4, c0, b15, d11, c6, a3, d8, b12) is written twice, to lanes 0-7 and
; again to lanes 8-15. The expected assembly first runs a two-register tbl
; over b and d into v31, then a second two-register tbl over { v31, v0 },
; where v0 holds a merged with c.
632 define <16 x i8> @insert4_v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8> %d) {
633 ; CHECK-LABEL: insert4_v16i8:
635 ; CHECK-NEXT: mov v4.16b, v3.16b
636 ; CHECK-NEXT: adrp x8, .LCPI15_0
637 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q31_q0
638 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
639 ; CHECK-NEXT: mov v3.16b, v1.16b
640 ; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI15_0]
641 ; CHECK-NEXT: mov v0.d[1], v2.d[0]
642 ; CHECK-NEXT: adrp x8, .LCPI15_1
643 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_1]
644 ; CHECK-NEXT: tbl v31.16b, { v3.16b, v4.16b }, v5.16b
645 ; CHECK-NEXT: tbl v0.16b, { v31.16b, v0.16b }, v1.16b
; Scalar extracts for lanes 0-7.
647 %e1 = extractelement <8 x i8> %a, i32 4
648 %e2 = extractelement <8 x i8> %c, i32 0
649 %e3 = extractelement <16 x i8> %b, i32 15
650 %e4 = extractelement <16 x i8> %d, i32 11
651 %e5 = extractelement <8 x i8> %c, i32 6
652 %e6 = extractelement <8 x i8> %a, i32 3
653 %e7 = extractelement <16 x i8> %d, i32 8
654 %e8 = extractelement <16 x i8> %b, i32 12
; Scalar extracts for lanes 8-15: same sources and indices as %e1-%e8.
655 %e9 = extractelement <8 x i8> %a, i32 4
656 %e10 = extractelement <8 x i8> %c, i32 0
657 %e11 = extractelement <16 x i8> %b, i32 15
658 %e12 = extractelement <16 x i8> %d, i32 11
659 %e13 = extractelement <8 x i8> %c, i32 6
660 %e14 = extractelement <8 x i8> %a, i32 3
661 %e15 = extractelement <16 x i8> %d, i32 8
662 %e16 = extractelement <16 x i8> %b, i32 12
; Insert chain: %eN goes into lane N-1, starting from an undef vector.
663 %i1 = insertelement <16 x i8> undef, i8 %e1, i32 0
664 %i2 = insertelement <16 x i8> %i1, i8 %e2, i32 1
665 %i3 = insertelement <16 x i8> %i2, i8 %e3, i32 2
666 %i4 = insertelement <16 x i8> %i3, i8 %e4, i32 3
667 %i5 = insertelement <16 x i8> %i4, i8 %e5, i32 4
668 %i6 = insertelement <16 x i8> %i5, i8 %e6, i32 5
669 %i7 = insertelement <16 x i8> %i6, i8 %e7, i32 6
670 %i8 = insertelement <16 x i8> %i7, i8 %e8, i32 7
671 %i9 = insertelement <16 x i8> %i8, i8 %e9, i32 8
672 %i10 = insertelement <16 x i8> %i9, i8 %e10, i32 9
673 %i11 = insertelement <16 x i8> %i10, i8 %e11, i32 10
674 %i12 = insertelement <16 x i8> %i11, i8 %e12, i32 11
675 %i13 = insertelement <16 x i8> %i12, i8 %e13, i32 12
676 %i14 = insertelement <16 x i8> %i13, i8 %e14, i32 13
677 %i15 = insertelement <16 x i8> %i14, i8 %e15, i32 14
678 %i16 = insertelement <16 x i8> %i15, i8 %e16, i32 15
; Larger reproducer: eight <2 x double> inputs are each floored and converted
; to <2 x i16>, concatenated in stages into two <8 x i16> vectors, and finally
; shuffled into a <16 x i16> that lists the even-indexed elements first and
; the odd-indexed elements second. The expected assembly is frintm + fcvtzs +
; xtn per input, two four-register tbl instructions sharing one constant-pool
; mask, and a uzp1/uzp2 pair that produces the two result registers.
700 define <16 x i16> @test(<2 x double> %l213, <2 x double> %l231, <2 x double> %l249, <2 x double> %l267, <2 x double> %l285, <2 x double> %l303, <2 x double> %l321, <2 x double> %l339) {
703 ; CHECK-NEXT: frintm v0.2d, v0.2d
704 ; CHECK-NEXT: frintm v4.2d, v4.2d
705 ; CHECK-NEXT: adrp x8, .LCPI16_0
706 ; CHECK-NEXT: frintm v1.2d, v1.2d
707 ; CHECK-NEXT: frintm v5.2d, v5.2d
708 ; CHECK-NEXT: frintm v2.2d, v2.2d
709 ; CHECK-NEXT: frintm v6.2d, v6.2d
710 ; CHECK-NEXT: frintm v3.2d, v3.2d
711 ; CHECK-NEXT: frintm v7.2d, v7.2d
712 ; CHECK-NEXT: fcvtzs v0.2d, v0.2d
713 ; CHECK-NEXT: fcvtzs v4.2d, v4.2d
714 ; CHECK-NEXT: fcvtzs v1.2d, v1.2d
715 ; CHECK-NEXT: fcvtzs v5.2d, v5.2d
716 ; CHECK-NEXT: fcvtzs v2.2d, v2.2d
717 ; CHECK-NEXT: fcvtzs v6.2d, v6.2d
718 ; CHECK-NEXT: fcvtzs v3.2d, v3.2d
719 ; CHECK-NEXT: fcvtzs v7.2d, v7.2d
720 ; CHECK-NEXT: xtn v16.2s, v0.2d
721 ; CHECK-NEXT: xtn v20.2s, v4.2d
722 ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI16_0]
723 ; CHECK-NEXT: xtn v17.2s, v1.2d
724 ; CHECK-NEXT: xtn v21.2s, v5.2d
725 ; CHECK-NEXT: xtn v18.2s, v2.2d
726 ; CHECK-NEXT: xtn v22.2s, v6.2d
727 ; CHECK-NEXT: xtn v19.2s, v3.2d
728 ; CHECK-NEXT: xtn v23.2s, v7.2d
729 ; CHECK-NEXT: tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
730 ; CHECK-NEXT: tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b
731 ; CHECK-NEXT: uzp1 v0.8h, v1.8h, v2.8h
732 ; CHECK-NEXT: uzp2 v1.8h, v1.8h, v2.8h
; Per input: floor (with fast-math flags), then signed conversion to 16-bit lanes.
734 %l214 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l213)
735 %l215 = fptosi <2 x double> %l214 to <2 x i16>
736 %l232 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l231)
737 %l233 = fptosi <2 x double> %l232 to <2 x i16>
738 %l250 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l249)
739 %l251 = fptosi <2 x double> %l250 to <2 x i16>
740 %l268 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l267)
741 %l269 = fptosi <2 x double> %l268 to <2 x i16>
742 %l286 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l285)
743 %l287 = fptosi <2 x double> %l286 to <2 x i16>
744 %l304 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l303)
745 %l305 = fptosi <2 x double> %l304 to <2 x i16>
746 %l322 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l321)
747 %l323 = fptosi <2 x double> %l322 to <2 x i16>
748 %l340 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l339)
749 %l341 = fptosi <2 x double> %l340 to <2 x i16>
; First level: identity masks concatenate adjacent <2 x i16> pairs into <4 x i16>.
750 %l342 = shufflevector <2 x i16> %l215, <2 x i16> %l233, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
751 %l343 = shufflevector <2 x i16> %l251, <2 x i16> %l269, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
752 %l344 = shufflevector <2 x i16> %l287, <2 x i16> %l305, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
753 %l345 = shufflevector <2 x i16> %l323, <2 x i16> %l341, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Second level: identity masks concatenate the <4 x i16> pairs into <8 x i16>.
754 %l346 = shufflevector <4 x i16> %l342, <4 x i16> %l343, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
755 %l347 = shufflevector <4 x i16> %l344, <4 x i16> %l345, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Final mask: even indices 0,2,...,14 first, then odd indices 1,3,...,15.
756 %interleaved.vec = shufflevector <8 x i16> %l346, <8 x i16> %l347, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
757 ret <16 x i16> %interleaved.vec
; Declaration of the <2 x double> floor intrinsic called by @test.
760 declare <2 x double> @llvm.floor.v2f64(<2 x double> %l213)