1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s
; Baseline without a shift: loads two <4 x i8> vectors from %p and %p+4,
; zero-extends both to <4 x i16> and adds them.
; The assertions expect one paired load (ldp s0, s1) feeding a single uaddl.
4 define <4 x i16> @normal_load_v4i8(ptr %p) {
5 ; CHECK-LABEL: normal_load_v4i8:
7 ; CHECK-NEXT: ldp s0, s1, [x0]
8 ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
9 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
11 %l1 = load <4 x i8>, ptr %p
12 %q = getelementptr i8, ptr %p, i32 4
13 %l2 = load <4 x i8>, ptr %q
14 %e1 = zext <4 x i8> %l1 to <4 x i16>
15 %e2 = zext <4 x i8> %l2 to <4 x i16>
16 %a = add <4 x i16> %e1, %e2
; Baseline without a shift: loads two <4 x i16> vectors from %p and %p+8,
; zero-extends both to <4 x i32> and adds them.
; The assertions expect ldp d0, d1 followed by a single uaddl.
21 define <4 x i32> @normal_load_v4i16_v4i32(ptr %p) {
22 ; CHECK-LABEL: normal_load_v4i16_v4i32:
24 ; CHECK-NEXT: ldp d0, d1, [x0]
25 ; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h
27 %l1 = load <4 x i16>, ptr %p
28 %q = getelementptr i8, ptr %p, i32 8
29 %l2 = load <4 x i16>, ptr %q
30 %e1 = zext <4 x i16> %l1 to <4 x i32>
31 %e2 = zext <4 x i16> %l2 to <4 x i32>
32 %a = add <4 x i32> %e1, %e2
; Loads two <4 x i8> vectors from %p and %p+4, zero-extends both to
; <4 x i16>, shifts the second left by 3 and adds it to the first.
; The assertions expect ldp, then ushll #0 + shl #3 on the second operand
; and a uaddw with the first.
36 define <4 x i16> @load_v4i8(ptr %p) {
37 ; CHECK-LABEL: load_v4i8:
39 ; CHECK-NEXT: ldp s1, s0, [x0]
40 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
41 ; CHECK-NEXT: shl v0.4h, v0.4h, #3
42 ; CHECK-NEXT: uaddw v0.8h, v0.8h, v1.8b
43 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
45 %l1 = load <4 x i8>, ptr %p
46 %q = getelementptr i8, ptr %p, i32 4
47 %l2 = load <4 x i8>, ptr %q
48 %e1 = zext <4 x i8> %l1 to <4 x i16>
49 %e2 = zext <4 x i8> %l2 to <4 x i16>
50 %e3 = shl <4 x i16> %e2, <i16 3, i16 3, i16 3, i16 3>
51 %a = add <4 x i16> %e1, %e3
; Loads two <4 x i16> vectors from %p and %p+8, zero-extends both to
; <4 x i32>, shifts the second left by 3 and adds it to the first.
; The assertions expect both loads merged into one ldr q0, with ushll2 #3 on
; the high half and uaddw using the low half.
55 define <4 x i32> @load_v4i16_v4i32(ptr %p) {
56 ; CHECK-LABEL: load_v4i16_v4i32:
58 ; CHECK-NEXT: ldr q0, [x0]
59 ; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3
60 ; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
62 %l1 = load <4 x i16>, ptr %p
63 %q = getelementptr i8, ptr %p, i32 8
64 %l2 = load <4 x i16>, ptr %q
65 %e1 = zext <4 x i16> %l1 to <4 x i32>
66 %e2 = zext <4 x i16> %l2 to <4 x i32>
67 %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
68 %a = add <4 x i32> %e1, %e3
; Loads two <4 x i32> vectors from %p and %p+16, zero-extends both to
; <4 x i64>, shifts the second left by 3 and adds it to the first.
; The result spans two q registers, so the assertions expect ldp q2, q0 and a
; ushll/ushll2 #3 pair followed by a uaddw/uaddw2 pair.
72 define <4 x i64> @load_v4i32_v4i64(ptr %p) {
73 ; CHECK-LABEL: load_v4i32_v4i64:
75 ; CHECK-NEXT: ldp q2, q0, [x0]
76 ; CHECK-NEXT: ushll2 v1.2d, v0.4s, #3
77 ; CHECK-NEXT: ushll v0.2d, v0.2s, #3
78 ; CHECK-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
79 ; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s
81 %l1 = load <4 x i32>, ptr %p
82 %q = getelementptr i8, ptr %p, i32 16
83 %l2 = load <4 x i32>, ptr %q
84 %e1 = zext <4 x i32> %l1 to <4 x i64>
85 %e2 = zext <4 x i32> %l2 to <4 x i64>
86 %e3 = shl <4 x i64> %e2, <i64 3, i64 3, i64 3, i64 3>
87 %a = add <4 x i64> %e1, %e3
; Loads two <4 x i8> vectors from %p and %p+4 and zero-extends them straight
; to <4 x i32> (a two-step widening), shifts the second left by 3 and adds.
; The assertions expect one ldr d0, a ushll #0 to 16-bit lanes, then
; ushll2 #3 on the high half and uaddw with the low half.
91 define <4 x i32> @load_v4i8_v4i32(ptr %p) {
92 ; CHECK-LABEL: load_v4i8_v4i32:
94 ; CHECK-NEXT: ldr d0, [x0]
95 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
96 ; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3
97 ; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
99 %l1 = load <4 x i8>, ptr %p
100 %q = getelementptr i8, ptr %p, i32 4
101 %l2 = load <4 x i8>, ptr %q
102 %e1 = zext <4 x i8> %l1 to <4 x i32>
103 %e2 = zext <4 x i8> %l2 to <4 x i32>
104 %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
105 %a = add <4 x i32> %e1, %e3
; Same extend/shift/add pattern with a non-power-of-two element type: two
; <4 x i12> vectors are loaded from %p and %p+6, zero-extended to <4 x i32>,
; the second is shifted left by 3 and the two are added.
; The assertions expect scalar loads with per-element bitfield extraction
; (lsr/ubfx/and) and lane inserts, finishing with ushll #3 and uaddw.
108 define <4 x i32> @load_v4i12_v4i32(ptr %p) {
109 ; CHECK-LABEL: load_v4i12_v4i32:
111 ; CHECK-NEXT: ldr x8, [x0]
112 ; CHECK-NEXT: ldr w9, [x0, #8]
113 ; CHECK-NEXT: lsr x10, x8, #60
114 ; CHECK-NEXT: ubfx x11, x8, #48, #12
115 ; CHECK-NEXT: ubfx w12, w9, #8, #12
116 ; CHECK-NEXT: orr w10, w10, w9, lsl #4
117 ; CHECK-NEXT: fmov s0, w11
118 ; CHECK-NEXT: and w11, w8, #0xfff
119 ; CHECK-NEXT: fmov s1, w11
120 ; CHECK-NEXT: lsr x9, x9, #20
121 ; CHECK-NEXT: and w10, w10, #0xfff
122 ; CHECK-NEXT: mov v0.h[1], w10
123 ; CHECK-NEXT: ubfx w10, w8, #12, #12
124 ; CHECK-NEXT: mov v1.h[1], w10
125 ; CHECK-NEXT: ubfx x10, x8, #24, #12
126 ; CHECK-NEXT: ubfx x8, x8, #36, #12
127 ; CHECK-NEXT: mov v0.h[2], w12
128 ; CHECK-NEXT: mov v1.h[2], w10
129 ; CHECK-NEXT: mov v0.h[3], w9
130 ; CHECK-NEXT: mov v1.h[3], w8
131 ; CHECK-NEXT: ushll v0.4s, v0.4h, #3
132 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
134 %l1 = load <4 x i12>, ptr %p
135 %q = getelementptr i8, ptr %p, i32 6
136 %l2 = load <4 x i12>, ptr %q
137 %e1 = zext <4 x i12> %l1 to <4 x i32>
138 %e2 = zext <4 x i12> %l2 to <4 x i32>
139 %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
140 %a = add <4 x i32> %e1, %e3
; Loads two <8 x i8> vectors from %p and %p+8, zero-extends both to
; <8 x i16>, shifts the second left by 3 and adds it to the first.
; The assertions expect one ldr q0 covering both loads, ushll2 #3 on the high
; half and uaddw with the low half.
144 define <8 x i16> @load_v8i8(ptr %p) {
145 ; CHECK-LABEL: load_v8i8:
147 ; CHECK-NEXT: ldr q0, [x0]
148 ; CHECK-NEXT: ushll2 v1.8h, v0.16b, #3
149 ; CHECK-NEXT: uaddw v0.8h, v1.8h, v0.8b
151 %l1 = load <8 x i8>, ptr %p
152 %q = getelementptr i8, ptr %p, i32 8
153 %l2 = load <8 x i8>, ptr %q
154 %e1 = zext <8 x i8> %l1 to <8 x i16>
155 %e2 = zext <8 x i8> %l2 to <8 x i16>
156 %e3 = shl <8 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
157 %a = add <8 x i16> %e1, %e3
; The extended operands are themselves narrow adds of loads: <8 x i8> pairs
; are loaded from %p1/%p1+8 and %p2/%p2+8, added element-wise in i8, then
; zero-extended to <8 x i16>; the second sum is shifted left by 3 and added.
; The assertions expect two ldr q loads, one 16-byte add, then ushll2 #3 and
; uaddw on the high/low halves of that sum.
161 define <8 x i16> @loadadd_v8i8(ptr %p1, ptr %p2) {
162 ; CHECK-LABEL: loadadd_v8i8:
164 ; CHECK-NEXT: ldr q0, [x0]
165 ; CHECK-NEXT: ldr q1, [x1]
166 ; CHECK-NEXT: add v0.16b, v0.16b, v1.16b
167 ; CHECK-NEXT: ushll2 v1.8h, v0.16b, #3
168 ; CHECK-NEXT: uaddw v0.8h, v1.8h, v0.8b
170 %l11 = load <8 x i8>, ptr %p1
171 %q1 = getelementptr i8, ptr %p1, i32 8
172 %l12 = load <8 x i8>, ptr %q1
173 %l21 = load <8 x i8>, ptr %p2
174 %q2 = getelementptr i8, ptr %p2, i32 8
175 %l22 = load <8 x i8>, ptr %q2
176 %l1 = add <8 x i8> %l11, %l21
177 %l2 = add <8 x i8> %l12, %l22
178 %e1 = zext <8 x i8> %l1 to <8 x i16>
179 %e2 = zext <8 x i8> %l2 to <8 x i16>
180 %e3 = shl <8 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
181 %a = add <8 x i16> %e1, %e3
; Two-level widening: <8 x i8> pairs from %p1/%p1+8 and %p2/%p2+8 are
; zero-extended to i16 and added, the sums are zero-extended to <8 x i32>,
; and the second sum is shifted left by 3 before the final add.
; The assertions expect two ldr q loads, uaddl/uaddl2 for the inner adds, and
; ushll/ushll2 #3 plus uaddw/uaddw2 for the outer shift-and-add.
185 define <8 x i32> @loadaddext_v8i8(ptr %p1, ptr %p2) {
186 ; CHECK-LABEL: loadaddext_v8i8:
188 ; CHECK-NEXT: ldr q0, [x0]
189 ; CHECK-NEXT: ldr q1, [x1]
190 ; CHECK-NEXT: uaddl2 v2.8h, v0.16b, v1.16b
191 ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
192 ; CHECK-NEXT: ushll2 v1.4s, v2.8h, #3
193 ; CHECK-NEXT: ushll v2.4s, v2.4h, #3
194 ; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v0.8h
195 ; CHECK-NEXT: uaddw v0.4s, v2.4s, v0.4h
197 %l11 = load <8 x i8>, ptr %p1
198 %q1 = getelementptr i8, ptr %p1, i32 8
199 %l12 = load <8 x i8>, ptr %q1
200 %l21 = load <8 x i8>, ptr %p2
201 %q2 = getelementptr i8, ptr %p2, i32 8
202 %l22 = load <8 x i8>, ptr %q2
203 %le11 = zext <8 x i8> %l11 to <8 x i16>
204 %le12 = zext <8 x i8> %l12 to <8 x i16>
205 %le21 = zext <8 x i8> %l21 to <8 x i16>
206 %le22 = zext <8 x i8> %l22 to <8 x i16>
207 %l1 = add <8 x i16> %le11, %le21
208 %l2 = add <8 x i16> %le12, %le22
209 %e1 = zext <8 x i16> %l1 to <8 x i32>
210 %e2 = zext <8 x i16> %l2 to <8 x i32>
211 %e3 = shl <8 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
212 %a = add <8 x i32> %e1, %e3
; Two-level widening on <4 x i8>: pairs from %p1/%p1+4 and %p2/%p2+4 are
; zero-extended to i16 and added, the sums are zero-extended to <4 x i32>,
; and the second sum is shifted left by 3 before the final add.
; The assertions expect two ldr d loads, a single uaddl, then ushll2 #3 and
; uaddw on the high/low halves of that result.
216 define <4 x i32> @loadaddext_v4i8(ptr %p1, ptr %p2) {
217 ; CHECK-LABEL: loadaddext_v4i8:
219 ; CHECK-NEXT: ldr d0, [x0]
220 ; CHECK-NEXT: ldr d1, [x1]
221 ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
222 ; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3
223 ; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
225 %l11 = load <4 x i8>, ptr %p1
226 %q1 = getelementptr i8, ptr %p1, i32 4
227 %l12 = load <4 x i8>, ptr %q1
228 %l21 = load <4 x i8>, ptr %p2
229 %q2 = getelementptr i8, ptr %p2, i32 4
230 %l22 = load <4 x i8>, ptr %q2
231 %le11 = zext <4 x i8> %l11 to <4 x i16>
232 %le12 = zext <4 x i8> %l12 to <4 x i16>
233 %le21 = zext <4 x i8> %l21 to <4 x i16>
234 %le22 = zext <4 x i8> %l22 to <4 x i16>
235 %l1 = add <4 x i16> %le11, %le21
236 %l2 = add <4 x i16> %le12, %le22
237 %e1 = zext <4 x i16> %l1 to <4 x i32>
238 %e2 = zext <4 x i16> %l2 to <4 x i32>
239 %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
240 %a = add <4 x i32> %e1, %e3
; Loads two full <16 x i8> vectors from %p and %p+16, zero-extends both to
; <16 x i16>, shifts the second left by 3 and adds it to the first.
; The assertions expect ldp q2, q0 with a ushll/ushll2 #3 pair and a
; uaddw/uaddw2 pair (the result occupies two q registers).
244 define <16 x i16> @load_v16i8(ptr %p) {
245 ; CHECK-LABEL: load_v16i8:
247 ; CHECK-NEXT: ldp q2, q0, [x0]
248 ; CHECK-NEXT: ushll2 v1.8h, v0.16b, #3
249 ; CHECK-NEXT: ushll v0.8h, v0.8b, #3
250 ; CHECK-NEXT: uaddw2 v1.8h, v1.8h, v2.16b
251 ; CHECK-NEXT: uaddw v0.8h, v0.8h, v2.8b
253 %l1 = load <16 x i8>, ptr %p
254 %q = getelementptr i8, ptr %p, i32 16
255 %l2 = load <16 x i8>, ptr %q
256 %e1 = zext <16 x i8> %l1 to <16 x i16>
257 %e2 = zext <16 x i8> %l2 to <16 x i16>
258 %e3 = shl <16 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
259 %a = add <16 x i16> %e1, %e3
; Small-vector case: loads two <2 x i8> vectors from %p and %p+2,
; zero-extends both to <2 x i16>, shifts the second left by 3 and adds.
; The assertions expect no widening add here: four ldrb byte loads are
; inserted into 32-bit lanes, followed by a plain shl #3 and add on .2s.
263 define <2 x i16> @std_v2i8_v2i16(ptr %p) {
264 ; CHECK-LABEL: std_v2i8_v2i16:
266 ; CHECK-NEXT: ldrb w8, [x0, #2]
267 ; CHECK-NEXT: ldrb w9, [x0, #3]
268 ; CHECK-NEXT: fmov s0, w8
269 ; CHECK-NEXT: ldrb w8, [x0]
270 ; CHECK-NEXT: fmov s1, w8
271 ; CHECK-NEXT: mov v0.s[1], w9
272 ; CHECK-NEXT: ldrb w9, [x0, #1]
273 ; CHECK-NEXT: mov v1.s[1], w9
274 ; CHECK-NEXT: shl v0.2s, v0.2s, #3
275 ; CHECK-NEXT: add v0.2s, v1.2s, v0.2s
277 %l1 = load <2 x i8>, ptr %p
278 %q = getelementptr i8, ptr %p, i32 2
279 %l2 = load <2 x i8>, ptr %q
280 %e1 = zext <2 x i8> %l1 to <2 x i16>
281 %e2 = zext <2 x i8> %l2 to <2 x i16>
282 %se2 = shl <2 x i16> %e2, <i16 3, i16 3>
283 %a = add <2 x i16> %e1, %se2
; Each <8 x i8> operand is a shufflevector concatenation of <4 x i8> loads
; from two pointers: %l1 = [%p, %q] and %l2 = [%p+4, %q+4]. Both are
; zero-extended to <8 x i16>; %l2 is shifted left by 3 and added to %l1.
; The assertions expect ldp from x0 plus ld1 lane loads from x1 to assemble
; the two vectors, then ushll #3 and uaddw.
287 define <8 x i16> @load_bv_v4i8(ptr %p, ptr %q) {
288 ; CHECK-LABEL: load_bv_v4i8:
290 ; CHECK-NEXT: ldp s0, s1, [x0]
291 ; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
292 ; CHECK-NEXT: ld1 { v1.s }[1], [x1]
293 ; CHECK-NEXT: ushll v1.8h, v1.8b, #3
294 ; CHECK-NEXT: uaddw v0.8h, v1.8h, v0.8b
296 %j1 = load <4 x i8>, ptr %p
297 %p1 = getelementptr i8, ptr %p, i32 4
298 %j2 = load <4 x i8>, ptr %p1
299 %k1 = load <4 x i8>, ptr %q
300 %q1 = getelementptr i8, ptr %q, i32 4
301 %k2 = load <4 x i8>, ptr %q1
302 %l1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
303 %l2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
304 %e1 = zext <8 x i8> %l1 to <8 x i16>
305 %e2 = zext <8 x i8> %l2 to <8 x i16>
306 %e3 = shl <8 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
307 %a = add <8 x i16> %e1, %e3
; Same concatenated-load inputs as the i16 variant (%l1 = [%p, %q],
; %l2 = [%p+4, %q+4], each built from <4 x i8> loads), but zero-extended
; directly to <8 x i32> before the shift-by-3 and add.
; The assertions expect one ldr d per pointer, ushll #0 to 16-bit lanes, then
; ushll2 #3 / uaddw per pointer producing the two result registers.
311 define <8 x i32> @load_bv_v4i8_i32(ptr %p, ptr %q) {
312 ; CHECK-LABEL: load_bv_v4i8_i32:
314 ; CHECK-NEXT: ldr d0, [x0]
315 ; CHECK-NEXT: ldr d1, [x1]
316 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
317 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
318 ; CHECK-NEXT: ushll2 v2.4s, v0.8h, #3
319 ; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3
320 ; CHECK-NEXT: uaddw v0.4s, v2.4s, v0.4h
321 ; CHECK-NEXT: uaddw v1.4s, v3.4s, v1.4h
323 %j1 = load <4 x i8>, ptr %p
324 %p1 = getelementptr i8, ptr %p, i32 4
325 %j2 = load <4 x i8>, ptr %p1
326 %k1 = load <4 x i8>, ptr %q
327 %q1 = getelementptr i8, ptr %q, i32 4
328 %k2 = load <4 x i8>, ptr %q1
329 %l1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
330 %l2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
331 %e1 = zext <8 x i8> %l1 to <8 x i32>
332 %e2 = zext <8 x i8> %l2 to <8 x i32>
333 %e3 = shl <8 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
334 %a = add <8 x i32> %e1, %e3
; Concatenated-load variant on 16-bit elements: %l1 = [%p, %q] and
; %l2 = [%p+8, %q+8], each built from <4 x i16> loads, are zero-extended to
; <8 x i32>; %l2 is shifted left by 3 and added to %l1.
; The assertions expect one ldr q per pointer, then ushll2 #3 / uaddw per
; pointer producing the two result registers.
338 define <8 x i32> @load_bv_v4i16_i32(ptr %p, ptr %q) {
339 ; CHECK-LABEL: load_bv_v4i16_i32:
341 ; CHECK-NEXT: ldr q0, [x0]
342 ; CHECK-NEXT: ldr q1, [x1]
343 ; CHECK-NEXT: ushll2 v2.4s, v0.8h, #3
344 ; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3
345 ; CHECK-NEXT: uaddw v0.4s, v2.4s, v0.4h
346 ; CHECK-NEXT: uaddw v1.4s, v3.4s, v1.4h
348 %j1 = load <4 x i16>, ptr %p
349 %p1 = getelementptr i8, ptr %p, i32 8
350 %j2 = load <4 x i16>, ptr %p1
351 %k1 = load <4 x i16>, ptr %q
352 %q1 = getelementptr i8, ptr %q, i32 8
353 %k2 = load <4 x i16>, ptr %q1
354 %l1 = shufflevector <4 x i16> %j1, <4 x i16> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
355 %l2 = shufflevector <4 x i16> %j2, <4 x i16> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
356 %e1 = zext <8 x i16> %l1 to <8 x i32>
357 %e2 = zext <8 x i16> %l2 to <8 x i32>
358 %e3 = shl <8 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
359 %a = add <8 x i32> %e1, %e3
; Three-pointer concatenation giving a non-power-of-two <12 x i8> vector:
; %l1 = [%p, %q, %r] and %l2 = [%p+4, %q+4, %r+4]. The %r parts are first
; widened to <8 x i8> with an undef second operand; only their low four
; lanes are selected by the final 12-element shuffles. Both vectors are
; zero-extended to <12 x i32>, %l2 is shifted left by 3 and added to %l1.
; The assertions expect the <12 x i32> result to be written through x8
; (stp/str) rather than returned in vector registers.
; NOTE(review): the shuffles use `undef` placeholders; newer IR generally
; prefers `poison` - confirm codegen is unchanged before switching.
363 define <12 x i32> @load_bv_3xv4i8_i32(ptr %p, ptr %q, ptr %r) {
364 ; CHECK-LABEL: load_bv_3xv4i8_i32:
366 ; CHECK-NEXT: ldp s0, s1, [x0]
367 ; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
368 ; CHECK-NEXT: ld1 { v1.s }[1], [x1]
369 ; CHECK-NEXT: ldp s3, s2, [x2]
370 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
371 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
372 ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
373 ; CHECK-NEXT: ushll v3.8h, v3.8b, #0
374 ; CHECK-NEXT: ushll2 v4.4s, v1.8h, #3
375 ; CHECK-NEXT: ushll v1.4s, v1.4h, #3
376 ; CHECK-NEXT: ushll v2.4s, v2.4h, #3
377 ; CHECK-NEXT: uaddw v2.4s, v2.4s, v3.4h
378 ; CHECK-NEXT: uaddw2 v3.4s, v4.4s, v0.8h
379 ; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
380 ; CHECK-NEXT: stp q3, q2, [x8, #16]
381 ; CHECK-NEXT: str q0, [x8]
383 %j1 = load <4 x i8>, ptr %p
384 %p1 = getelementptr i8, ptr %p, i32 4
385 %j2 = load <4 x i8>, ptr %p1
386 %k1 = load <4 x i8>, ptr %q
387 %q1 = getelementptr i8, ptr %q, i32 4
388 %k2 = load <4 x i8>, ptr %q1
389 %m1 = load <4 x i8>, ptr %r
390 %r1 = getelementptr i8, ptr %r, i32 4
391 %m2 = load <4 x i8>, ptr %r1
392 %jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
393 %jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
394 %mn1 = shufflevector <4 x i8> %m1, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
395 %mn2 = shufflevector <4 x i8> %m2, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
396 %l1 = shufflevector <8 x i8> %jk1, <8 x i8> %mn1, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
397 %l2 = shufflevector <8 x i8> %jk2, <8 x i8> %mn2, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
398 %e1 = zext <12 x i8> %l1 to <12 x i32>
399 %e2 = zext <12 x i8> %l2 to <12 x i32>
400 %e3 = shl <12 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
401 %a = add <12 x i32> %e1, %e3
; Four-pointer concatenation: %l1 = [%p, %q, %r, %s] and
; %l2 = [%p+4, %q+4, %r+4, %s+4], each a <16 x i8> built from <4 x i8>
; loads, zero-extended to <16 x i16> and added.
; NOTE(review): %e3 (the shift-left-by-3 of %e2) is computed but never used;
; the final add takes %e2 directly, which is why the assertions show plain
; uaddl with no shift. The function is also named *_i32 while it returns
; <16 x i16>. Unlike the sibling tests this does not cover the shifted
; pattern - confirm whether `add %e1, %e3` was intended, and regenerate the
; assertions with update_llc_test_checks.py if the IR is changed.
405 define <16 x i16> @load_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s) {
406 ; CHECK-LABEL: load_bv_4xv4i8_i32:
408 ; CHECK-NEXT: ldp s0, s1, [x0]
409 ; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
410 ; CHECK-NEXT: ld1 { v1.s }[1], [x1]
411 ; CHECK-NEXT: ldp s2, s3, [x2]
412 ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
413 ; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4
414 ; CHECK-NEXT: ld1 { v3.s }[1], [x3]
415 ; CHECK-NEXT: uaddl v1.8h, v2.8b, v3.8b
417 %j1 = load <4 x i8>, ptr %p
418 %p1 = getelementptr i8, ptr %p, i32 4
419 %j2 = load <4 x i8>, ptr %p1
420 %k1 = load <4 x i8>, ptr %q
421 %q1 = getelementptr i8, ptr %q, i32 4
422 %k2 = load <4 x i8>, ptr %q1
423 %m1 = load <4 x i8>, ptr %r
424 %r1 = getelementptr i8, ptr %r, i32 4
425 %m2 = load <4 x i8>, ptr %r1
426 %n1 = load <4 x i8>, ptr %s
427 %s1 = getelementptr i8, ptr %s, i32 4
428 %n2 = load <4 x i8>, ptr %s1
429 %jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
430 %jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
431 %mn1 = shufflevector <4 x i8> %m1, <4 x i8> %n1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
432 %mn2 = shufflevector <4 x i8> %m2, <4 x i8> %n2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
433 %l1 = shufflevector <8 x i8> %jk1, <8 x i8> %mn1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
434 %l2 = shufflevector <8 x i8> %jk2, <8 x i8> %mn2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
435 %e1 = zext <16 x i8> %l1 to <16 x i16>
436 %e2 = zext <16 x i8> %l2 to <16 x i16>
437 %e3 = shl <16 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
438 %a = add <16 x i16> %e1, %e2
; Two difference vectors combined with a 16-bit shift:
;   %ajk = zext([%p, %q]) - zext([%p+4, %q+4])   (in <8 x i16>)
;   %anm = zext([%r, %s]) - zext([%r+4, %s+4])   (in <8 x i16>)
; %ajk is sign-extended and %anm zero-extended to <8 x i32>; %anm is then
; shifted left by 16 and added to %ajk.
; The assertions expect ldp/ld1 lane loads, two usubl, shll/shll2 #16 for the
; shifted operand and saddw/saddw2 for the sign-extending add.
442 define <8 x i32> @double_bv_2xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s) {
443 ; CHECK-LABEL: double_bv_2xv4i8_i32:
445 ; CHECK-NEXT: ldp s0, s1, [x0]
446 ; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
447 ; CHECK-NEXT: ld1 { v1.s }[1], [x1]
448 ; CHECK-NEXT: ldp s2, s3, [x2]
449 ; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b
450 ; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4
451 ; CHECK-NEXT: ld1 { v3.s }[1], [x3]
452 ; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b
453 ; CHECK-NEXT: shll v3.4s, v2.4h, #16
454 ; CHECK-NEXT: shll2 v1.4s, v2.8h, #16
455 ; CHECK-NEXT: saddw2 v1.4s, v1.4s, v0.8h
456 ; CHECK-NEXT: saddw v0.4s, v3.4s, v0.4h
458 %j1 = load <4 x i8>, ptr %p
459 %p1 = getelementptr i8, ptr %p, i32 4
460 %j2 = load <4 x i8>, ptr %p1
461 %k1 = load <4 x i8>, ptr %q
462 %q1 = getelementptr i8, ptr %q, i32 4
463 %k2 = load <4 x i8>, ptr %q1
464 %m1 = load <4 x i8>, ptr %r
465 %r1 = getelementptr i8, ptr %r, i32 4
466 %m2 = load <4 x i8>, ptr %r1
467 %n1 = load <4 x i8>, ptr %s
468 %s1 = getelementptr i8, ptr %s, i32 4
469 %n2 = load <4 x i8>, ptr %s1
470 %jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
471 %jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
472 %mn1 = shufflevector <4 x i8> %m1, <4 x i8> %n1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
473 %mn2 = shufflevector <4 x i8> %m2, <4 x i8> %n2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
474 %ejk1 = zext <8 x i8> %jk1 to <8 x i16>
475 %ejk2 = zext <8 x i8> %jk2 to <8 x i16>
476 %ajk = sub <8 x i16> %ejk1, %ejk2
477 %enm1 = zext <8 x i8> %mn1 to <8 x i16>
478 %enm2 = zext <8 x i8> %mn2 to <8 x i16>
479 %anm = sub <8 x i16> %enm1, %enm2
480 %x = sext <8 x i16> %ajk to <8 x i32>
481 %y = zext <8 x i16> %anm to <8 x i32>
482 %ys = shl <8 x i32> %y, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
483 %a = add <8 x i32> %x, %ys
; 16-lane version of the difference/shift/add pattern using eight pointers.
; Four <16 x i8> vectors are built by concatenating <4 x i8> loads:
;   %l1 = [%p, %q, %r, %s]          %l2 = [%p+4, %q+4, %r+4, %s+4]
;   %l3 = [%t, %u, %v, %w]          %l4 = [%t+4, %u+4, %v+4, %w+4]
; %ajk = zext(%l1) - zext(%l2) and %anm = zext(%l3) - zext(%l4) in i16;
; %ajk is sign-extended and %anm zero-extended to <16 x i32>; %anm is then
; shifted left by 16 and added to %ajk.
; The assertions expect ldp/ld1 lane loads, four usubl, shll/shll2 #16 and
; saddw/saddw2 producing four result registers.
487 define <16 x i32> @double_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t, ptr %u, ptr %v, ptr %w) {
488 ; CHECK-LABEL: double_bv_4xv4i8_i32:
490 ; CHECK-NEXT: ldp s0, s1, [x0]
491 ; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
492 ; CHECK-NEXT: ld1 { v1.s }[1], [x1]
493 ; CHECK-NEXT: ldp s2, s3, [x2]
494 ; CHECK-NEXT: usubl v1.8h, v0.8b, v1.8b
495 ; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4
496 ; CHECK-NEXT: ld1 { v3.s }[1], [x3]
497 ; CHECK-NEXT: ldp s4, s5, [x4]
498 ; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b
499 ; CHECK-NEXT: ld1 { v4.s }[1], [x5], #4
500 ; CHECK-NEXT: ld1 { v5.s }[1], [x5]
501 ; CHECK-NEXT: ldp s6, s7, [x6]
502 ; CHECK-NEXT: usubl v4.8h, v4.8b, v5.8b
503 ; CHECK-NEXT: ld1 { v6.s }[1], [x7], #4
504 ; CHECK-NEXT: ld1 { v7.s }[1], [x7]
505 ; CHECK-NEXT: shll v0.4s, v4.4h, #16
506 ; CHECK-NEXT: shll2 v4.4s, v4.8h, #16
507 ; CHECK-NEXT: usubl v5.8h, v6.8b, v7.8b
508 ; CHECK-NEXT: saddw v0.4s, v0.4s, v1.4h
509 ; CHECK-NEXT: saddw2 v1.4s, v4.4s, v1.8h
510 ; CHECK-NEXT: shll v6.4s, v5.4h, #16
511 ; CHECK-NEXT: shll2 v3.4s, v5.8h, #16
512 ; CHECK-NEXT: saddw2 v3.4s, v3.4s, v2.8h
513 ; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h
515 %j1 = load <4 x i8>, ptr %p
516 %p1 = getelementptr i8, ptr %p, i32 4
517 %j2 = load <4 x i8>, ptr %p1
518 %k1 = load <4 x i8>, ptr %q
519 %q1 = getelementptr i8, ptr %q, i32 4
520 %k2 = load <4 x i8>, ptr %q1
521 %m1 = load <4 x i8>, ptr %r
522 %r1 = getelementptr i8, ptr %r, i32 4
523 %m2 = load <4 x i8>, ptr %r1
524 %n1 = load <4 x i8>, ptr %s
525 %s1 = getelementptr i8, ptr %s, i32 4
526 %n2 = load <4 x i8>, ptr %s1
527 %j3 = load <4 x i8>, ptr %t
528 %t3 = getelementptr i8, ptr %t, i32 4
529 %j4 = load <4 x i8>, ptr %t3
530 %k3 = load <4 x i8>, ptr %u
531 %u3 = getelementptr i8, ptr %u, i32 4
532 %k4 = load <4 x i8>, ptr %u3
533 %m3 = load <4 x i8>, ptr %v
534 %v3 = getelementptr i8, ptr %v, i32 4
535 %m4 = load <4 x i8>, ptr %v3
536 %n3 = load <4 x i8>, ptr %w
537 %w3 = getelementptr i8, ptr %w, i32 4
538 %n4 = load <4 x i8>, ptr %w3
539 %jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
540 %jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
541 %mn1 = shufflevector <4 x i8> %m1, <4 x i8> %n1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
542 %mn2 = shufflevector <4 x i8> %m2, <4 x i8> %n2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
543 %jk3 = shufflevector <4 x i8> %j3, <4 x i8> %k3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
544 %jk4 = shufflevector <4 x i8> %j4, <4 x i8> %k4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
545 %mn3 = shufflevector <4 x i8> %m3, <4 x i8> %n3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
546 %mn4 = shufflevector <4 x i8> %m4, <4 x i8> %n4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
547 %l1 = shufflevector <8 x i8> %jk1, <8 x i8> %mn1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
548 %l2 = shufflevector <8 x i8> %jk2, <8 x i8> %mn2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
549 %l3 = shufflevector <8 x i8> %jk3, <8 x i8> %mn3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
550 %l4 = shufflevector <8 x i8> %jk4, <8 x i8> %mn4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
551 %ejk1 = zext <16 x i8> %l1 to <16 x i16>
552 %ejk2 = zext <16 x i8> %l2 to <16 x i16>
553 %ajk = sub <16 x i16> %ejk1, %ejk2
554 %enm1 = zext <16 x i8> %l3 to <16 x i16>
555 %enm2 = zext <16 x i8> %l4 to <16 x i16>
556 %anm = sub <16 x i16> %enm1, %enm2
557 %x = sext <16 x i16> %ajk to <16 x i32>
558 %y = zext <16 x i16> %anm to <16 x i32>
559 %ys = shl <16 x i32> %y, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
560 %a = add <16 x i32> %x, %ys
; Variant of the eight-pointer test that differs in two ways:
;  * each <16 x i8> vector is assembled step by step - a two-source shuffle
;    widened with poison lanes, then the third and fourth <4 x i8> parts are
;    inserted by further shuffles - instead of a balanced concat tree;
;  * the subtractions pair differently: %ajk = zext(%l1) - zext(%l3) and
;    %anm = zext(%l2) - zext(%l4), where %l1/%l2 come from %p,%q,%r,%s at
;    offsets 0/4 and %l3/%l4 from %t,%u,%v,%w at offsets 0/4.
; %ajk is sign-extended, %anm zero-extended and shifted left by 16, and the
; two are added as <16 x i32>.
; The assertions expect one ldr d per pointer (covering both 4-byte halves),
; four usubl, then shll2 #16 on the high halves and saddw with the low halves.
564 define <16 x i32> @double2_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t, ptr %u, ptr %v, ptr %w) {
565 ; CHECK-LABEL: double2_bv_4xv4i8_i32:
567 ; CHECK-NEXT: ldr d0, [x2]
568 ; CHECK-NEXT: ldr d1, [x0]
569 ; CHECK-NEXT: ldr d2, [x1]
570 ; CHECK-NEXT: ldr d3, [x3]
571 ; CHECK-NEXT: ldr d4, [x4]
572 ; CHECK-NEXT: ldr d5, [x5]
573 ; CHECK-NEXT: ldr d6, [x6]
574 ; CHECK-NEXT: ldr d7, [x7]
575 ; CHECK-NEXT: usubl v1.8h, v1.8b, v4.8b
576 ; CHECK-NEXT: usubl v2.8h, v2.8b, v5.8b
577 ; CHECK-NEXT: usubl v3.8h, v3.8b, v7.8b
578 ; CHECK-NEXT: usubl v4.8h, v0.8b, v6.8b
579 ; CHECK-NEXT: shll2 v0.4s, v1.8h, #16
580 ; CHECK-NEXT: shll2 v5.4s, v2.8h, #16
581 ; CHECK-NEXT: shll2 v6.4s, v4.8h, #16
582 ; CHECK-NEXT: shll2 v7.4s, v3.8h, #16
583 ; CHECK-NEXT: saddw v0.4s, v0.4s, v1.4h
584 ; CHECK-NEXT: saddw v1.4s, v5.4s, v2.4h
585 ; CHECK-NEXT: saddw v2.4s, v6.4s, v4.4h
586 ; CHECK-NEXT: saddw v3.4s, v7.4s, v3.4h
588 %j1 = load <4 x i8>, ptr %p
589 %p1 = getelementptr i8, ptr %p, i32 4
590 %j2 = load <4 x i8>, ptr %p1
591 %k1 = load <4 x i8>, ptr %q
592 %q1 = getelementptr i8, ptr %q, i32 4
593 %k2 = load <4 x i8>, ptr %q1
594 %m1 = load <4 x i8>, ptr %r
595 %r1 = getelementptr i8, ptr %r, i32 4
596 %m2 = load <4 x i8>, ptr %r1
597 %n1 = load <4 x i8>, ptr %s
598 %s1 = getelementptr i8, ptr %s, i32 4
599 %n2 = load <4 x i8>, ptr %s1
600 %j3 = load <4 x i8>, ptr %t
601 %t3 = getelementptr i8, ptr %t, i32 4
602 %j4 = load <4 x i8>, ptr %t3
603 %k3 = load <4 x i8>, ptr %u
604 %u3 = getelementptr i8, ptr %u, i32 4
605 %k4 = load <4 x i8>, ptr %u3
606 %m3 = load <4 x i8>, ptr %v
607 %v3 = getelementptr i8, ptr %v, i32 4
608 %m4 = load <4 x i8>, ptr %v3
609 %n3 = load <4 x i8>, ptr %w
610 %w3 = getelementptr i8, ptr %w, i32 4
611 %n4 = load <4 x i8>, ptr %w3
612 %jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
613 %m1l = shufflevector <4 x i8> %m1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
614 %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
615 %n1l = shufflevector <4 x i8> %n1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
616 %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
617 %jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
618 %m2l = shufflevector <4 x i8> %m2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
619 %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
620 %n2l = shufflevector <4 x i8> %n2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
621 %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
622 %jk3 = shufflevector <4 x i8> %j3, <4 x i8> %k3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
623 %m3l = shufflevector <4 x i8> %m3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
624 %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
625 %n3l = shufflevector <4 x i8> %n3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
626 %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
627 %jk4 = shufflevector <4 x i8> %j4, <4 x i8> %k4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
628 %m4l = shufflevector <4 x i8> %m4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
629 %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
630 %n4l = shufflevector <4 x i8> %n4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
631 %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
632 %ejk1 = zext <16 x i8> %l1 to <16 x i16>
633 %ejk2 = zext <16 x i8> %l3 to <16 x i16>
634 %ajk = sub <16 x i16> %ejk1, %ejk2
635 %enm1 = zext <16 x i8> %l2 to <16 x i16>
636 %enm2 = zext <16 x i8> %l4 to <16 x i16>
637 %anm = sub <16 x i16> %enm1, %enm2
638 %x = sext <16 x i16> %ajk to <16 x i32>
639 %y = zext <16 x i16> %anm to <16 x i32>
640 %ys = shl <16 x i32> %y, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
641 %a = add <16 x i32> %x, %ys
645 define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
646 ; CHECK-LABEL: extrause_load:
648 ; CHECK-NEXT: ldr s1, [x0]
649 ; CHECK-NEXT: add x8, x3, #8
650 ; CHECK-NEXT: add x11, x1, #12
651 ; CHECK-NEXT: str s1, [x4]
652 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
653 ; CHECK-NEXT: ldr s0, [x2]
654 ; CHECK-NEXT: ushll v2.8h, v0.8b, #0
655 ; CHECK-NEXT: umov w9, v2.h[0]
656 ; CHECK-NEXT: umov w10, v2.h[1]
657 ; CHECK-NEXT: mov v0.b[8], w9
658 ; CHECK-NEXT: umov w9, v2.h[2]
659 ; CHECK-NEXT: mov v0.b[9], w10
660 ; CHECK-NEXT: umov w10, v2.h[3]
661 ; CHECK-NEXT: ldr s2, [x1]
662 ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
663 ; CHECK-NEXT: mov v0.b[10], w9
664 ; CHECK-NEXT: add x9, x1, #4
665 ; CHECK-NEXT: mov v1.d[1], v2.d[0]
666 ; CHECK-NEXT: mov v0.b[11], w10
667 ; CHECK-NEXT: add x10, x3, #12
668 ; CHECK-NEXT: bic v1.8h, #255, lsl #8
669 ; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4
670 ; CHECK-NEXT: ldr s5, [x0, #4]
671 ; CHECK-NEXT: ldp s2, s3, [x2, #4]
672 ; CHECK-NEXT: ldr s7, [x2, #12]
673 ; CHECK-NEXT: ldp s6, s4, [x0, #8]
674 ; CHECK-NEXT: ld1 { v5.s }[1], [x9]
675 ; CHECK-NEXT: ld1 { v7.s }[1], [x10]
676 ; CHECK-NEXT: ld1 { v3.s }[1], [x8]
677 ; CHECK-NEXT: ld1 { v2.s }[1], [x3]
678 ; CHECK-NEXT: add x8, x1, #8
679 ; CHECK-NEXT: ld1 { v4.s }[1], [x11]
680 ; CHECK-NEXT: ld1 { v6.s }[1], [x8]
681 ; CHECK-NEXT: ushll v3.8h, v3.8b, #0
682 ; CHECK-NEXT: uaddl v2.8h, v2.8b, v7.8b
683 ; CHECK-NEXT: uaddl v4.8h, v5.8b, v4.8b
684 ; CHECK-NEXT: uaddw v1.8h, v1.8h, v6.8b
685 ; CHECK-NEXT: uaddw2 v5.8h, v3.8h, v0.16b
686 ; CHECK-NEXT: ushll v6.4s, v2.4h, #3
687 ; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3
688 ; CHECK-NEXT: ushll v0.4s, v4.4h, #3
689 ; CHECK-NEXT: ushll2 v3.4s, v4.8h, #3
690 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
691 ; CHECK-NEXT: uaddw2 v1.4s, v3.4s, v1.8h
692 ; CHECK-NEXT: uaddw2 v3.4s, v2.4s, v5.8h
693 ; CHECK-NEXT: uaddw v2.4s, v6.4s, v5.4h
695 %lp1 = load <4 x i8>, ptr %p
696 store <4 x i8> %lp1, ptr %z
697 %p2 = getelementptr i8, ptr %p, i32 4
698 %lp2 = load <4 x i8>, ptr %p2
699 %p3 = getelementptr i8, ptr %p, i32 8
700 %lp3 = load <4 x i8>, ptr %p3
701 %p4 = getelementptr i8, ptr %p, i32 12
702 %lp4 = load <4 x i8>, ptr %p4
703 %lq1 = load <4 x i8>, ptr %q
704 %q2 = getelementptr i8, ptr %q, i32 4
705 %lq2 = load <4 x i8>, ptr %q2
706 %q3 = getelementptr i8, ptr %q, i32 8
707 %lq3 = load <4 x i8>, ptr %q3
708 %q4 = getelementptr i8, ptr %q, i32 12
709 %lq4 = load <4 x i8>, ptr %q4
710 %lr1 = load <4 x i8>, ptr %r
711 %r2 = getelementptr i8, ptr %r, i32 4
712 %lr2 = load <4 x i8>, ptr %r2
713 %r3 = getelementptr i8, ptr %r, i32 8
714 %lr3 = load <4 x i8>, ptr %r3
715 %r4 = getelementptr i8, ptr %r, i32 12
716 %lr4 = load <4 x i8>, ptr %r4
717 %ls1 = load <4 x i8>, ptr %s
718 %s2 = getelementptr i8, ptr %s, i32 4
719 %ls2 = load <4 x i8>, ptr %s2
720 %s3 = getelementptr i8, ptr %s, i32 8
721 %ls3 = load <4 x i8>, ptr %s3
722 %s4 = getelementptr i8, ptr %s, i32 12
723 %ls4 = load <4 x i8>, ptr %s4
725 %jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
726 %m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
727 %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
728 %n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
729 %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
730 %jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
731 %m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
732 %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
733 %n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
734 %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
735 %jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
736 %m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
737 %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
738 %n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
739 %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
740 %jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
741 %m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
742 %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
743 %n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
744 %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
746 %le11 = zext <16 x i8> %l1 to <16 x i16>
747 %le12 = zext <16 x i8> %l3 to <16 x i16>
748 %le21 = zext <16 x i8> %l2 to <16 x i16>
749 %le22 = zext <16 x i8> %l4 to <16 x i16>
750 %la1 = add <16 x i16> %le11, %le12
751 %la2 = add <16 x i16> %le21, %le22
752 %e1 = zext <16 x i16> %la1 to <16 x i32>
753 %e2 = zext <16 x i16> %la2 to <16 x i32>
754 %se2 = shl <16 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
755 %a = add <16 x i32> %e1, %se2
; extrause_shuffle: widening add tree whose <16 x i8> shuffle input has a
; second user.
;
; Four <4 x i8> pieces (byte offsets 0, 4, 8 and 12) are loaded from each of
; %p, %q, %r and %s. Piece N of p|q|r|s is concatenated into the <16 x i8>
; vector %lN. The vectors are zero-extended to i16, summed as %l1+%l3 and
; %l2+%l4, zero-extended to i32 and combined as %e1 + (%e2 << 3).
;
; Extra use under test: the shuffle result %l4 is also stored to %z. In the
; expected assembly below it is assembled in v4 with lane moves and written
; with `str q4, [x4]`.
759 define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
760 ; CHECK-LABEL: extrause_shuffle:
762 ; CHECK-NEXT: ldp s0, s1, [x0, #8]
763 ; CHECK-NEXT: add x8, x1, #8
764 ; CHECK-NEXT: ldr s6, [x1, #12]
765 ; CHECK-NEXT: ldp s17, s18, [x2, #8]
766 ; CHECK-NEXT: ldp s3, s5, [x2]
767 ; CHECK-NEXT: add x9, x3, #8
768 ; CHECK-NEXT: mov v4.16b, v1.16b
769 ; CHECK-NEXT: ldp s7, s16, [x0]
770 ; CHECK-NEXT: ldr s2, [x3, #12]
771 ; CHECK-NEXT: mov v1.s[1], v6.s[0]
772 ; CHECK-NEXT: ld1 { v3.s }[1], [x3], #4
773 ; CHECK-NEXT: mov v4.s[1], v6.s[0]
774 ; CHECK-NEXT: ld1 { v7.s }[1], [x1], #4
775 ; CHECK-NEXT: ld1 { v16.s }[1], [x1]
776 ; CHECK-NEXT: ld1 { v5.s }[1], [x3]
777 ; CHECK-NEXT: ld1 { v17.s }[1], [x9]
778 ; CHECK-NEXT: ld1 { v0.s }[1], [x8]
779 ; CHECK-NEXT: mov v4.s[2], v18.s[0]
780 ; CHECK-NEXT: mov v18.s[1], v2.s[0]
781 ; CHECK-NEXT: uaddl v1.8h, v16.8b, v1.8b
782 ; CHECK-NEXT: uaddl v6.8h, v7.8b, v0.8b
783 ; CHECK-NEXT: uaddl v7.8h, v3.8b, v17.8b
784 ; CHECK-NEXT: ushll v0.4s, v1.4h, #3
785 ; CHECK-NEXT: ushll2 v1.4s, v1.8h, #3
786 ; CHECK-NEXT: uaddl v5.8h, v5.8b, v18.8b
787 ; CHECK-NEXT: mov v4.s[3], v2.s[0]
788 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v6.4h
789 ; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v6.8h
790 ; CHECK-NEXT: ushll v16.4s, v5.4h, #3
791 ; CHECK-NEXT: ushll2 v3.4s, v5.8h, #3
792 ; CHECK-NEXT: str q4, [x4]
793 ; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v7.8h
794 ; CHECK-NEXT: uaddw v2.4s, v16.4s, v7.4h
; Load the four consecutive 4-byte pieces behind each of %p, %q, %r and %s.
796 %lp1 = load <4 x i8>, ptr %p
797 %p2 = getelementptr i8, ptr %p, i32 4
798 %lp2 = load <4 x i8>, ptr %p2
799 %p3 = getelementptr i8, ptr %p, i32 8
800 %lp3 = load <4 x i8>, ptr %p3
801 %p4 = getelementptr i8, ptr %p, i32 12
802 %lp4 = load <4 x i8>, ptr %p4
803 %lq1 = load <4 x i8>, ptr %q
804 %q2 = getelementptr i8, ptr %q, i32 4
805 %lq2 = load <4 x i8>, ptr %q2
806 %q3 = getelementptr i8, ptr %q, i32 8
807 %lq3 = load <4 x i8>, ptr %q3
808 %q4 = getelementptr i8, ptr %q, i32 12
809 %lq4 = load <4 x i8>, ptr %q4
810 %lr1 = load <4 x i8>, ptr %r
811 %r2 = getelementptr i8, ptr %r, i32 4
812 %lr2 = load <4 x i8>, ptr %r2
813 %r3 = getelementptr i8, ptr %r, i32 8
814 %lr3 = load <4 x i8>, ptr %r3
815 %r4 = getelementptr i8, ptr %r, i32 12
816 %lr4 = load <4 x i8>, ptr %r4
817 %ls1 = load <4 x i8>, ptr %s
818 %s2 = getelementptr i8, ptr %s, i32 4
819 %ls2 = load <4 x i8>, ptr %s2
820 %s3 = getelementptr i8, ptr %s, i32 8
821 %ls3 = load <4 x i8>, ptr %s3
822 %s4 = getelementptr i8, ptr %s, i32 12
823 %ls4 = load <4 x i8>, ptr %s4
; Build %lN = concat(%lpN, %lqN, %lrN, %lsN) in three steps per vector:
; %jkN holds p|q in lanes 0-7, %jkmN appends r in lanes 8-11 and the final
; shuffle appends s in lanes 12-15. %mNl / %nNl only widen r / s to 16 lanes so
; that the two-operand shuffles type-check.
825 %jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
826 %m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
827 %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
828 %n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
829 %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
830 %jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
831 %m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
832 %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
833 %n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
834 %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
835 %jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
836 %m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
837 %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
838 %n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
839 %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
840 %jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
841 %m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
842 %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
843 %n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
844 %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; The extra use: %l4 is stored to %z and is also consumed by the zext below.
845 store <16 x i8> %l4, ptr %z
; Widening add tree: i8 -> i16 sums, then i16 -> i32 with the second sum
; shifted left by 3 before the final add.
847 %le11 = zext <16 x i8> %l1 to <16 x i16>
848 %le12 = zext <16 x i8> %l3 to <16 x i16>
849 %le21 = zext <16 x i8> %l2 to <16 x i16>
850 %le22 = zext <16 x i8> %l4 to <16 x i16>
851 %la1 = add <16 x i16> %le11, %le12
852 %la2 = add <16 x i16> %le21, %le22
853 %e1 = zext <16 x i16> %la1 to <16 x i32>
854 %e2 = zext <16 x i16> %la2 to <16 x i32>
855 %se2 = shl <16 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
856 %a = add <16 x i32> %e1, %se2
; extrause_ext: widening add tree in which one of the first-level zero
; extensions has a second user.
;
; Four <4 x i8> pieces (byte offsets 0, 4, 8 and 12) are loaded from each of
; %p, %q, %r and %s. Piece N of p|q|r|s is concatenated into the <16 x i8>
; vector %lN. The vectors are zero-extended to i16, summed as %l1+%l3 and
; %l2+%l4, zero-extended to i32 and combined as %e1 + (%e2 << 3).
;
; Extra use under test: %le22 (the i16 zero extension of %l4) is also stored to
; %z. In the expected assembly below the two stored halves are produced by
; `ushll ... #0` and written with `stp q4, q16, [x4]`.
860 define <16 x i32> @extrause_ext(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
861 ; CHECK-LABEL: extrause_ext:
863 ; CHECK-NEXT: ldp s1, s2, [x2]
864 ; CHECK-NEXT: add x10, x3, #12
865 ; CHECK-NEXT: ldp s3, s5, [x0]
866 ; CHECK-NEXT: add x11, x1, #12
867 ; CHECK-NEXT: ldp s6, s0, [x2, #8]
868 ; CHECK-NEXT: add x8, x3, #8
869 ; CHECK-NEXT: ldp s7, s4, [x0, #8]
870 ; CHECK-NEXT: add x9, x1, #8
871 ; CHECK-NEXT: ld1 { v1.s }[1], [x3], #4
872 ; CHECK-NEXT: ld1 { v3.s }[1], [x1], #4
873 ; CHECK-NEXT: ld1 { v5.s }[1], [x1]
874 ; CHECK-NEXT: ld1 { v4.s }[1], [x11]
875 ; CHECK-NEXT: ld1 { v2.s }[1], [x3]
876 ; CHECK-NEXT: ld1 { v0.s }[1], [x10]
877 ; CHECK-NEXT: ld1 { v6.s }[1], [x8]
878 ; CHECK-NEXT: ld1 { v7.s }[1], [x9]
879 ; CHECK-NEXT: uaddl v5.8h, v5.8b, v4.8b
880 ; CHECK-NEXT: uaddl v2.8h, v2.8b, v0.8b
881 ; CHECK-NEXT: ushll v16.8h, v0.8b, #0
882 ; CHECK-NEXT: uaddl v3.8h, v3.8b, v7.8b
883 ; CHECK-NEXT: uaddl v6.8h, v1.8b, v6.8b
884 ; CHECK-NEXT: ushll v4.8h, v4.8b, #0
885 ; CHECK-NEXT: ushll v1.4s, v5.4h, #3
886 ; CHECK-NEXT: ushll v7.4s, v2.4h, #3
887 ; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3
888 ; CHECK-NEXT: ushll2 v5.4s, v5.8h, #3
889 ; CHECK-NEXT: stp q4, q16, [x4]
890 ; CHECK-NEXT: uaddw v0.4s, v1.4s, v3.4h
891 ; CHECK-NEXT: uaddw2 v1.4s, v5.4s, v3.8h
892 ; CHECK-NEXT: uaddw2 v3.4s, v2.4s, v6.8h
893 ; CHECK-NEXT: uaddw v2.4s, v7.4s, v6.4h
; Load the four consecutive 4-byte pieces behind each of %p, %q, %r and %s.
895 %lp1 = load <4 x i8>, ptr %p
896 %p2 = getelementptr i8, ptr %p, i32 4
897 %lp2 = load <4 x i8>, ptr %p2
898 %p3 = getelementptr i8, ptr %p, i32 8
899 %lp3 = load <4 x i8>, ptr %p3
900 %p4 = getelementptr i8, ptr %p, i32 12
901 %lp4 = load <4 x i8>, ptr %p4
902 %lq1 = load <4 x i8>, ptr %q
903 %q2 = getelementptr i8, ptr %q, i32 4
904 %lq2 = load <4 x i8>, ptr %q2
905 %q3 = getelementptr i8, ptr %q, i32 8
906 %lq3 = load <4 x i8>, ptr %q3
907 %q4 = getelementptr i8, ptr %q, i32 12
908 %lq4 = load <4 x i8>, ptr %q4
909 %lr1 = load <4 x i8>, ptr %r
910 %r2 = getelementptr i8, ptr %r, i32 4
911 %lr2 = load <4 x i8>, ptr %r2
912 %r3 = getelementptr i8, ptr %r, i32 8
913 %lr3 = load <4 x i8>, ptr %r3
914 %r4 = getelementptr i8, ptr %r, i32 12
915 %lr4 = load <4 x i8>, ptr %r4
916 %ls1 = load <4 x i8>, ptr %s
917 %s2 = getelementptr i8, ptr %s, i32 4
918 %ls2 = load <4 x i8>, ptr %s2
919 %s3 = getelementptr i8, ptr %s, i32 8
920 %ls3 = load <4 x i8>, ptr %s3
921 %s4 = getelementptr i8, ptr %s, i32 12
922 %ls4 = load <4 x i8>, ptr %s4
; Build %lN = concat(%lpN, %lqN, %lrN, %lsN) in three steps per vector:
; %jkN holds p|q in lanes 0-7, %jkmN appends r in lanes 8-11 and the final
; shuffle appends s in lanes 12-15. %mNl / %nNl only widen r / s to 16 lanes so
; that the two-operand shuffles type-check.
924 %jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
925 %m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
926 %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
927 %n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
928 %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
929 %jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
930 %m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
931 %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
932 %n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
933 %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
934 %jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
935 %m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
936 %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
937 %n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
938 %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
939 %jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
940 %m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
941 %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
942 %n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
943 %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; Widening add tree: i8 -> i16 sums, then i16 -> i32 with the second sum
; shifted left by 3 before the final add.
945 %le11 = zext <16 x i8> %l1 to <16 x i16>
946 %le12 = zext <16 x i8> %l3 to <16 x i16>
947 %le21 = zext <16 x i8> %l2 to <16 x i16>
948 %le22 = zext <16 x i8> %l4 to <16 x i16>
; The extra use: %le22 is stored to %z and is also an operand of %la2 below.
949 store <16 x i16> %le22, ptr %z
950 %la1 = add <16 x i16> %le11, %le12
951 %la2 = add <16 x i16> %le21, %le22
952 %e1 = zext <16 x i16> %la1 to <16 x i32>
953 %e2 = zext <16 x i16> %la2 to <16 x i32>
954 %se2 = shl <16 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
955 %a = add <16 x i32> %e1, %se2
; extrause_add: widening add tree in which one of the intermediate i16 sums
; has a second user.
;
; Four <4 x i8> pieces (byte offsets 0, 4, 8 and 12) are loaded from each of
; %p, %q, %r and %s. Piece N of p|q|r|s is concatenated into the <16 x i8>
; vector %lN. The vectors are zero-extended to i16, summed as %l1+%l3 and
; %l2+%l4, zero-extended to i32 and combined as %e1 + (%e2 << 3).
;
; Extra use under test: the i16 sum %la2 is also stored to %z. In the expected
; assembly below the two `uaddl` results in v5 and v7 are written with
; `stp q5, q7, [x4]` and are also the inputs of the `ushll ... #3` shifts.
959 define <16 x i32> @extrause_add(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
960 ; CHECK-LABEL: extrause_add:
962 ; CHECK-NEXT: ldp s0, s1, [x0]
963 ; CHECK-NEXT: add x10, x3, #12
964 ; CHECK-NEXT: ldp s2, s3, [x2]
965 ; CHECK-NEXT: add x11, x1, #12
966 ; CHECK-NEXT: ldp s4, s5, [x0, #8]
967 ; CHECK-NEXT: add x8, x3, #8
968 ; CHECK-NEXT: ldp s6, s7, [x2, #8]
969 ; CHECK-NEXT: add x9, x1, #8
970 ; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4
971 ; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
972 ; CHECK-NEXT: ld1 { v1.s }[1], [x1]
973 ; CHECK-NEXT: ld1 { v5.s }[1], [x11]
974 ; CHECK-NEXT: ld1 { v3.s }[1], [x3]
975 ; CHECK-NEXT: ld1 { v7.s }[1], [x10]
976 ; CHECK-NEXT: ld1 { v6.s }[1], [x8]
977 ; CHECK-NEXT: ld1 { v4.s }[1], [x9]
978 ; CHECK-NEXT: uaddl v5.8h, v1.8b, v5.8b
979 ; CHECK-NEXT: uaddl v7.8h, v3.8b, v7.8b
980 ; CHECK-NEXT: uaddl v1.8h, v0.8b, v4.8b
981 ; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b
982 ; CHECK-NEXT: ushll v0.4s, v5.4h, #3
983 ; CHECK-NEXT: ushll v4.4s, v7.4h, #3
984 ; CHECK-NEXT: ushll2 v3.4s, v7.8h, #3
985 ; CHECK-NEXT: ushll2 v6.4s, v5.8h, #3
986 ; CHECK-NEXT: stp q5, q7, [x4]
987 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
988 ; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v2.8h
989 ; CHECK-NEXT: uaddw v2.4s, v4.4s, v2.4h
990 ; CHECK-NEXT: uaddw2 v1.4s, v6.4s, v1.8h
; Load the four consecutive 4-byte pieces behind each of %p, %q, %r and %s.
992 %lp1 = load <4 x i8>, ptr %p
993 %p2 = getelementptr i8, ptr %p, i32 4
994 %lp2 = load <4 x i8>, ptr %p2
995 %p3 = getelementptr i8, ptr %p, i32 8
996 %lp3 = load <4 x i8>, ptr %p3
997 %p4 = getelementptr i8, ptr %p, i32 12
998 %lp4 = load <4 x i8>, ptr %p4
999 %lq1 = load <4 x i8>, ptr %q
1000 %q2 = getelementptr i8, ptr %q, i32 4
1001 %lq2 = load <4 x i8>, ptr %q2
1002 %q3 = getelementptr i8, ptr %q, i32 8
1003 %lq3 = load <4 x i8>, ptr %q3
1004 %q4 = getelementptr i8, ptr %q, i32 12
1005 %lq4 = load <4 x i8>, ptr %q4
1006 %lr1 = load <4 x i8>, ptr %r
1007 %r2 = getelementptr i8, ptr %r, i32 4
1008 %lr2 = load <4 x i8>, ptr %r2
1009 %r3 = getelementptr i8, ptr %r, i32 8
1010 %lr3 = load <4 x i8>, ptr %r3
1011 %r4 = getelementptr i8, ptr %r, i32 12
1012 %lr4 = load <4 x i8>, ptr %r4
1013 %ls1 = load <4 x i8>, ptr %s
1014 %s2 = getelementptr i8, ptr %s, i32 4
1015 %ls2 = load <4 x i8>, ptr %s2
1016 %s3 = getelementptr i8, ptr %s, i32 8
1017 %ls3 = load <4 x i8>, ptr %s3
1018 %s4 = getelementptr i8, ptr %s, i32 12
1019 %ls4 = load <4 x i8>, ptr %s4
; Build %lN = concat(%lpN, %lqN, %lrN, %lsN) in three steps per vector:
; %jkN holds p|q in lanes 0-7, %jkmN appends r in lanes 8-11 and the final
; shuffle appends s in lanes 12-15. %mNl / %nNl only widen r / s to 16 lanes so
; that the two-operand shuffles type-check.
1021 %jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1022 %m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1023 %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
1024 %n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1025 %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
1026 %jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1027 %m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1028 %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
1029 %n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1030 %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
1031 %jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1032 %m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1033 %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
1034 %n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1035 %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
1036 %jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1037 %m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1038 %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
1039 %n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1040 %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; Widening add tree: i8 -> i16 sums, then i16 -> i32 with the second sum
; shifted left by 3 before the final add.
1042 %le11 = zext <16 x i8> %l1 to <16 x i16>
1043 %le12 = zext <16 x i8> %l3 to <16 x i16>
1044 %le21 = zext <16 x i8> %l2 to <16 x i16>
1045 %le22 = zext <16 x i8> %l4 to <16 x i16>
1046 %la1 = add <16 x i16> %le11, %le12
1047 %la2 = add <16 x i16> %le21, %le22
; The extra use: %la2 is stored to %z and is also zero-extended into %e2 below.
1048 store <16 x i16> %la2, ptr %z
1049 %e1 = zext <16 x i16> %la1 to <16 x i32>
1050 %e2 = zext <16 x i16> %la2 to <16 x i32>
1051 %se2 = shl <16 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1052 %a = add <16 x i32> %e1, %se2
1056 define <16 x i32> @extrause_ext2(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
1057 ; CHECK-LABEL: extrause_ext2:
1059 ; CHECK-NEXT: ldp s0, s1, [x2]
1060 ; CHECK-NEXT: add x10, x3, #12
1061 ; CHECK-NEXT: ldp s2, s3, [x0]
1062 ; CHECK-NEXT: add x11, x1, #12
1063 ; CHECK-NEXT: ldp s4, s5, [x2, #8]
1064 ; CHECK-NEXT: add x8, x3, #8
1065 ; CHECK-NEXT: ldp s6, s7, [x0, #8]
1066 ; CHECK-NEXT: add x9, x1, #8
1067 ; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4
1068 ; CHECK-NEXT: ld1 { v2.s }[1], [x1], #4
1069 ; CHECK-NEXT: ld1 { v3.s }[1], [x1]
1070 ; CHECK-NEXT: ld1 { v7.s }[1], [x11]
1071 ; CHECK-NEXT: ld1 { v1.s }[1], [x3]
1072 ; CHECK-NEXT: ld1 { v5.s }[1], [x10]
1073 ; CHECK-NEXT: ld1 { v6.s }[1], [x9]
1074 ; CHECK-NEXT: ld1 { v4.s }[1], [x8]
1075 ; CHECK-NEXT: uaddl v7.8h, v3.8b, v7.8b
1076 ; CHECK-NEXT: uaddl v1.8h, v1.8b, v5.8b
1077 ; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b
1078 ; CHECK-NEXT: uaddl v4.8h, v0.8b, v4.8b
1079 ; CHECK-NEXT: ushll v0.4s, v7.4h, #3
1080 ; CHECK-NEXT: ushll2 v3.4s, v7.8h, #3
1081 ; CHECK-NEXT: ushll v6.4s, v1.4h, #3
1082 ; CHECK-NEXT: ushll2 v16.4s, v1.8h, #3
1083 ; CHECK-NEXT: ushll2 v5.4s, v1.8h, #0
1084 ; CHECK-NEXT: ushll v17.4s, v1.4h, #0
1085 ; CHECK-NEXT: ushll2 v18.4s, v7.8h, #0
1086 ; CHECK-NEXT: uaddw2 v1.4s, v3.4s, v2.8h
1087 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
1088 ; CHECK-NEXT: uaddw v2.4s, v6.4s, v4.4h
1089 ; CHECK-NEXT: uaddw2 v3.4s, v16.4s, v4.8h
1090 ; CHECK-NEXT: ushll v4.4s, v7.4h, #0
1091 ; CHECK-NEXT: stp q17, q5, [x4, #32]
1092 ; CHECK-NEXT: stp q4, q18, [x4]
1094 %lp1 = load <4 x i8>, ptr %p
1095 %p2 = getelementptr i8, ptr %p, i32 4
1096 %lp2 = load <4 x i8>, ptr %p2
1097 %p3 = getelementptr i8, ptr %p, i32 8
1098 %lp3 = load <4 x i8>, ptr %p3
1099 %p4 = getelementptr i8, ptr %p, i32 12
1100 %lp4 = load <4 x i8>, ptr %p4
1101 %lq1 = load <4 x i8>, ptr %q
1102 %q2 = getelementptr i8, ptr %q, i32 4
1103 %lq2 = load <4 x i8>, ptr %q2
1104 %q3 = getelementptr i8, ptr %q, i32 8
1105 %lq3 = load <4 x i8>, ptr %q3
1106 %q4 = getelementptr i8, ptr %q, i32 12
1107 %lq4 = load <4 x i8>, ptr %q4
1108 %lr1 = load <4 x i8>, ptr %r
1109 %r2 = getelementptr i8, ptr %r, i32 4
1110 %lr2 = load <4 x i8>, ptr %r2
1111 %r3 = getelementptr i8, ptr %r, i32 8
1112 %lr3 = load <4 x i8>, ptr %r3
1113 %r4 = getelementptr i8, ptr %r, i32 12
1114 %lr4 = load <4 x i8>, ptr %r4
1115 %ls1 = load <4 x i8>, ptr %s
1116 %s2 = getelementptr i8, ptr %s, i32 4
1117 %ls2 = load <4 x i8>, ptr %s2
1118 %s3 = getelementptr i8, ptr %s, i32 8
1119 %ls3 = load <4 x i8>, ptr %s3
1120 %s4 = getelementptr i8, ptr %s, i32 12
1121 %ls4 = load <4 x i8>, ptr %s4
1123 %jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1124 %m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1125 %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
1126 %n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1127 %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
1128 %jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1129 %m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1130 %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
1131 %n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1132 %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
1133 %jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1134 %m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1135 %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
1136 %n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1137 %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
1138 %jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1139 %m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1140 %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
1141 %n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1142 %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
1144 %le11 = zext <16 x i8> %l1 to <16 x i16>
1145 %le12 = zext <16 x i8> %l3 to <16 x i16>
1146 %le21 = zext <16 x i8> %l2 to <16 x i16>
1147 %le22 = zext <16 x i8> %l4 to <16 x i16>
1148 %la1 = add <16 x i16> %le11, %le12
1149 %la2 = add <16 x i16> %le21, %le22
1150 %e1 = zext <16 x i16> %la1 to <16 x i32>
1151 %e2 = zext <16 x i16> %la2 to <16 x i32>
1152 store <16 x i32> %e2, ptr %z
1153 %se2 = shl <16 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1154 %a = add <16 x i32> %e1, %se2
; extrause_shl: widening add of shuffled byte loads where the shl-by-3 result
; has two uses: it is stored to %z and it is also an operand of the final add.
1158 define <16 x i32> @extrause_shl(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
1159 ; CHECK-LABEL: extrause_shl:
1161 ; CHECK-NEXT: ldp s1, s2, [x0]
1162 ; CHECK-NEXT: add x10, x3, #12
1163 ; CHECK-NEXT: ldp s0, s3, [x2]
1164 ; CHECK-NEXT: add x11, x1, #12
1165 ; CHECK-NEXT: ldp s4, s5, [x0, #8]
1166 ; CHECK-NEXT: add x8, x3, #8
1167 ; CHECK-NEXT: ldp s6, s7, [x2, #8]
1168 ; CHECK-NEXT: add x9, x1, #8
1169 ; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4
1170 ; CHECK-NEXT: ld1 { v1.s }[1], [x1], #4
1171 ; CHECK-NEXT: ld1 { v2.s }[1], [x1]
1172 ; CHECK-NEXT: ld1 { v5.s }[1], [x11]
1173 ; CHECK-NEXT: ld1 { v3.s }[1], [x3]
1174 ; CHECK-NEXT: ld1 { v7.s }[1], [x10]
1175 ; CHECK-NEXT: ld1 { v4.s }[1], [x9]
1176 ; CHECK-NEXT: ld1 { v6.s }[1], [x8]
1177 ; CHECK-NEXT: uaddl v2.8h, v2.8b, v5.8b
1178 ; CHECK-NEXT: uaddl v3.8h, v3.8b, v7.8b
1179 ; CHECK-NEXT: uaddl v4.8h, v1.8b, v4.8b
1180 ; CHECK-NEXT: ushll v5.4s, v2.4h, #3
1181 ; CHECK-NEXT: ushll2 v7.4s, v2.8h, #3
1182 ; CHECK-NEXT: uaddl v2.8h, v0.8b, v6.8b
1183 ; CHECK-NEXT: ushll v6.4s, v3.4h, #3
1184 ; CHECK-NEXT: ushll2 v16.4s, v3.8h, #3
1185 ; CHECK-NEXT: uaddw2 v1.4s, v7.4s, v4.8h
1186 ; CHECK-NEXT: uaddw v0.4s, v5.4s, v4.4h
1187 ; CHECK-NEXT: stp q5, q7, [x4]
1188 ; CHECK-NEXT: uaddw2 v3.4s, v16.4s, v2.8h
1189 ; CHECK-NEXT: uaddw v2.4s, v6.4s, v2.4h
1190 ; CHECK-NEXT: stp q6, q16, [x4, #32]
; Load four <4 x i8> chunks from each of %p, %q, %r and %s, at byte offsets
; 0, 4, 8 and 12 from the respective pointer.
1192 %lp1 = load <4 x i8>, ptr %p
1193 %p2 = getelementptr i8, ptr %p, i32 4
1194 %lp2 = load <4 x i8>, ptr %p2
1195 %p3 = getelementptr i8, ptr %p, i32 8
1196 %lp3 = load <4 x i8>, ptr %p3
1197 %p4 = getelementptr i8, ptr %p, i32 12
1198 %lp4 = load <4 x i8>, ptr %p4
1199 %lq1 = load <4 x i8>, ptr %q
1200 %q2 = getelementptr i8, ptr %q, i32 4
1201 %lq2 = load <4 x i8>, ptr %q2
1202 %q3 = getelementptr i8, ptr %q, i32 8
1203 %lq3 = load <4 x i8>, ptr %q3
1204 %q4 = getelementptr i8, ptr %q, i32 12
1205 %lq4 = load <4 x i8>, ptr %q4
1206 %lr1 = load <4 x i8>, ptr %r
1207 %r2 = getelementptr i8, ptr %r, i32 4
1208 %lr2 = load <4 x i8>, ptr %r2
1209 %r3 = getelementptr i8, ptr %r, i32 8
1210 %lr3 = load <4 x i8>, ptr %r3
1211 %r4 = getelementptr i8, ptr %r, i32 12
1212 %lr4 = load <4 x i8>, ptr %r4
1213 %ls1 = load <4 x i8>, ptr %s
1214 %s2 = getelementptr i8, ptr %s, i32 4
1215 %ls2 = load <4 x i8>, ptr %s2
1216 %s3 = getelementptr i8, ptr %s, i32 8
1217 %ls3 = load <4 x i8>, ptr %s3
1218 %s4 = getelementptr i8, ptr %s, i32 12
1219 %ls4 = load <4 x i8>, ptr %s4
; For N = 1..4, build %lN as the 16-byte concatenation %lpN, %lqN, %lrN, %lsN.
; %lrN and %lsN are first widened to <16 x i8> (poison-padded) so that they can
; be merged with the partial result by a two-operand shufflevector.
1221 %jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1222 %m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1223 %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
1224 %n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1225 %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
1226 %jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1227 %m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1228 %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
1229 %n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1230 %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
1231 %jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1232 %m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1233 %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
1234 %n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1235 %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
1236 %jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1237 %m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1238 %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
1239 %n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1240 %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; Widen in two steps: zero-extend i8 -> i16, add the pairs (%l1 + %l3 and
; %l2 + %l4), then zero-extend the i16 sums to i32.
1242 %le11 = zext <16 x i8> %l1 to <16 x i16>
1243 %le12 = zext <16 x i8> %l3 to <16 x i16>
1244 %le21 = zext <16 x i8> %l2 to <16 x i16>
1245 %le22 = zext <16 x i8> %l4 to <16 x i16>
1246 %la1 = add <16 x i16> %le11, %le12
1247 %la2 = add <16 x i16> %le21, %le22
1248 %e1 = zext <16 x i16> %la1 to <16 x i32>
1249 %e2 = zext <16 x i16> %la2 to <16 x i32>
; %se2 is both stored to %z and added to %e1, which gives the shl its extra use.
1250 %se2 = shl <16 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1251 store <16 x i32> %se2, ptr %z
1252 %a = add <16 x i32> %e1, %se2
; commuted_loads: two adjacent <8 x i8> loads from each of %p1 and %p2 are
; added element-wise with the %p2 value as the first add operand, then
; zero-extended to i32 and combined as %e1 + (%e2 << 3).
; The expected assembly below loads each pointer with one 128-bit ldr and
; performs a single 16-byte add before widening.
1257 define <8 x i32> @commuted_loads(ptr %p1, ptr %p2) {
1258 ; CHECK-LABEL: commuted_loads:
1260 ; CHECK-NEXT: ldr q0, [x0]
1261 ; CHECK-NEXT: ldr q1, [x1]
1262 ; CHECK-NEXT: add v0.16b, v1.16b, v0.16b
1263 ; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0
1264 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
1265 ; CHECK-NEXT: ushll2 v2.4s, v1.8h, #3
1266 ; CHECK-NEXT: ushll v3.4s, v1.4h, #3
1267 ; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v0.8h
1268 ; CHECK-NEXT: uaddw v0.4s, v3.4s, v0.4h
; Loads at byte offsets 0 and 8 from each pointer.
1270 %l11 = load <8 x i8>, ptr %p1
1271 %q1 = getelementptr i8, ptr %p1, i32 8
1272 %l12 = load <8 x i8>, ptr %q1
1273 %l21 = load <8 x i8>, ptr %p2
1274 %q2 = getelementptr i8, ptr %p2, i32 8
1275 %l22 = load <8 x i8>, ptr %q2
; Operand order is (%p2 value, %p1 value) for both adds.
1276 %l1 = add <8 x i8> %l21, %l11
1277 %l2 = add <8 x i8> %l22, %l12
; %e1 extends the offset-0 sum; %e2 (the shifted operand) extends the offset-8 sum.
1278 %e1 = zext <8 x i8> %l1 to <8 x i32>
1279 %e2 = zext <8 x i8> %l2 to <8 x i32>
1280 %se2 = shl <8 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1281 %a = add <8 x i32> %e1, %se2
; commuted_loads2: two adjacent <8 x i8> loads from each of %p1 and %p2 are
; added element-wise (%p1 value first), but the roles of the two sums are
; swapped: the offset-8 sum is the unshifted operand and the offset-0 sum is
; the one shifted left by 3.
; The expected assembly below uses ldp of d registers and two separate 8-byte
; adds.
1285 define <8 x i32> @commuted_loads2(ptr %p1, ptr %p2) {
1286 ; CHECK-LABEL: commuted_loads2:
1288 ; CHECK-NEXT: ldp d0, d3, [x1]
1289 ; CHECK-NEXT: ldp d1, d2, [x0]
1290 ; CHECK-NEXT: add v0.8b, v1.8b, v0.8b
1291 ; CHECK-NEXT: add v1.8b, v2.8b, v3.8b
1292 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
1293 ; CHECK-NEXT: ushll v2.8h, v1.8b, #0
1294 ; CHECK-NEXT: ushll v3.4s, v0.4h, #3
1295 ; CHECK-NEXT: ushll2 v0.4s, v0.8h, #3
1296 ; CHECK-NEXT: uaddw2 v1.4s, v0.4s, v2.8h
1297 ; CHECK-NEXT: uaddw v0.4s, v3.4s, v2.4h
; Loads at byte offsets 0 and 8 from each pointer.
1299 %l11 = load <8 x i8>, ptr %p1
1300 %q1 = getelementptr i8, ptr %p1, i32 8
1301 %l12 = load <8 x i8>, ptr %q1
1302 %l21 = load <8 x i8>, ptr %p2
1303 %q2 = getelementptr i8, ptr %p2, i32 8
1304 %l22 = load <8 x i8>, ptr %q2
1305 %l1 = add <8 x i8> %l11, %l21
1306 %l2 = add <8 x i8> %l12, %l22
; Note the swap: %e1 extends %l2 (offset-8 sum) and %e2 extends %l1 (offset-0 sum).
1307 %e1 = zext <8 x i8> %l2 to <8 x i32>
1308 %e2 = zext <8 x i8> %l1 to <8 x i32>
1309 %se2 = shl <8 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1310 %a = add <8 x i32> %e1, %se2
; commuted_sub: same load/add/zext/shl structure as an add-based combine, but
; the final operation is a sub whose first operand is the shifted value:
; %a = (%e2 << 3) - %e1.
; The expected assembly below ends in usubw2/usubw on the shifted halves.
1314 define <8 x i32> @commuted_sub(ptr %p1, ptr %p2) {
1315 ; CHECK-LABEL: commuted_sub:
1317 ; CHECK-NEXT: ldp d2, d1, [x1]
1318 ; CHECK-NEXT: ldr d0, [x0, #8]
1319 ; CHECK-NEXT: add v0.8b, v0.8b, v1.8b
1320 ; CHECK-NEXT: ldr d1, [x0]
1321 ; CHECK-NEXT: add v1.8b, v1.8b, v2.8b
1322 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
1323 ; CHECK-NEXT: ushll v2.8h, v1.8b, #0
1324 ; CHECK-NEXT: ushll v3.4s, v0.4h, #3
1325 ; CHECK-NEXT: ushll2 v0.4s, v0.8h, #3
1326 ; CHECK-NEXT: usubw2 v1.4s, v0.4s, v2.8h
1327 ; CHECK-NEXT: usubw v0.4s, v3.4s, v2.4h
; Loads at byte offsets 0 and 8 from each pointer.
1329 %l11 = load <8 x i8>, ptr %p1
1330 %q1 = getelementptr i8, ptr %p1, i32 8
1331 %l12 = load <8 x i8>, ptr %q1
1332 %l21 = load <8 x i8>, ptr %p2
1333 %q2 = getelementptr i8, ptr %p2, i32 8
1334 %l22 = load <8 x i8>, ptr %q2
1335 %l1 = add <8 x i8> %l11, %l21
1336 %l2 = add <8 x i8> %l12, %l22
1337 %e1 = zext <8 x i8> %l1 to <8 x i32>
1338 %e2 = zext <8 x i8> %l2 to <8 x i32>
1339 %se2 = shl <8 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; The shifted value is the minuend; the unshifted extend is subtracted from it.
1340 %a = sub <8 x i32> %se2, %e1
; bitcast: the two adjacent 4-byte chunks are loaded as float and bitcast to
; <4 x i8> rather than being loaded as vectors directly, then combined as
; zext(%l1) + (zext(%l2) << 3).
; The expected assembly below covers both chunks with a single 64-bit ldr.
1344 define <4 x i32> @bitcast(ptr %p) {
1345 ; CHECK-LABEL: bitcast:
1347 ; CHECK-NEXT: ldr d0, [x0]
1348 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
1349 ; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3
1350 ; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
; First chunk at %p, second chunk 4 bytes later.
1352 %l1b = load float, ptr %p
1353 %l1 = bitcast float %l1b to <4 x i8>
1354 %q = getelementptr i8, ptr %p, i32 4
1355 %l2b = load float, ptr %q
1356 %l2 = bitcast float %l2b to <4 x i8>
1357 %e1 = zext <4 x i8> %l1 to <4 x i32>
1358 %e2 = zext <4 x i8> %l2 to <4 x i32>
1359 %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
1360 %a = add <4 x i32> %e1, %e3
; atomic: float-load-plus-bitcast pattern in which the first load is an atomic
; acquire load; the second load (4 bytes later) is a plain load.
; The expected assembly below keeps the two loads separate: an ldar for the
; atomic one and an independent ldr for the second chunk.
1364 define <4 x i32> @atomic(ptr %p) {
1365 ; CHECK-LABEL: atomic:
1367 ; CHECK-NEXT: ldar w8, [x0]
1368 ; CHECK-NEXT: movi v0.2d, #0x0000ff000000ff
1369 ; CHECK-NEXT: ldr s1, [x0, #4]
1370 ; CHECK-NEXT: fmov s2, w8
1371 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
1372 ; CHECK-NEXT: zip1 v2.8b, v2.8b, v0.8b
1373 ; CHECK-NEXT: ushll v1.4s, v1.4h, #3
1374 ; CHECK-NEXT: ushll v2.4s, v2.4h, #0
1375 ; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
1376 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; Only this first load carries the atomic/acquire ordering.
1378 %l1b = load atomic float, ptr %p acquire, align 4
1379 %l1 = bitcast float %l1b to <4 x i8>
1380 %q = getelementptr i8, ptr %p, i32 4
1381 %l2b = load float, ptr %q
1382 %l2 = bitcast float %l2b to <4 x i8>
; Combine as zext(%l1) + (zext(%l2) << 3).
1383 %e1 = zext <4 x i8> %l1 to <4 x i32>
1384 %e2 = zext <4 x i8> %l2 to <4 x i32>
1385 %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
1386 %a = add <4 x i32> %e1, %e3
; volatile: float-load-plus-bitcast pattern in which the first load is
; volatile; the second load (4 bytes later) is a plain load.
; The expected assembly below keeps two separate 32-bit ldr instructions.
1390 define <4 x i32> @volatile(ptr %p) {
1391 ; CHECK-LABEL: volatile:
1393 ; CHECK-NEXT: sub sp, sp, #16
1394 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1395 ; CHECK-NEXT: ldr s0, [x0]
1396 ; CHECK-NEXT: ldr s1, [x0, #4]
1397 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
1398 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
1399 ; CHECK-NEXT: ushll v1.4s, v1.4h, #3
1400 ; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
1401 ; CHECK-NEXT: add sp, sp, #16
; Only this first load is marked volatile.
1403 %l1b = load volatile float, ptr %p
1404 %l1 = bitcast float %l1b to <4 x i8>
1405 %q = getelementptr i8, ptr %p, i32 4
1406 %l2b = load float, ptr %q
1407 %l2 = bitcast float %l2b to <4 x i8>
; Combine as zext(%l1) + (zext(%l2) << 3).
1408 %e1 = zext <4 x i8> %l1 to <4 x i32>
1409 %e2 = zext <4 x i8> %l2 to <4 x i32>
1410 %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3>
1411 %a = add <4 x i32> %e1, %e3