; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue

define <2 x half> @masked_load_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 {
; CHECK-LABEL: masked_load_v2f16:
; CHECK: ldr s[[N0:[0-9]+]], [x0]
; CHECK-NEXT: ldr s[[N1:[0-9]+]], [x1]
; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].h, vl4
; CHECK-NEXT: fcmeq v[[N2:[0-9]+]].4h, v[[N0]].4h, v[[N1]].4h
; CHECK-NEXT: umov [[W0:w[0-9]+]], v[[N2]].h[0]
; CHECK-NEXT: umov [[W1:w[0-9]+]], v[[N2]].h[1]
; CHECK-NEXT: fmov s[[V0:[0-9]+]], [[W0]]
; CHECK-NEXT: mov v[[V0]].s[1], [[W1]]
; CHECK-NEXT: shl v[[V0]].2s, v[[V0]].2s, #16
; CHECK-NEXT: sshr v[[V0]].2s, v[[V0]].2s, #16
; CHECK-NEXT: movi [[D0:d[0-9]+]], #0000000000000000
; CHECK-NEXT: fmov [[W1]], s[[V0]]
; CHECK-NEXT: mov [[W0]], v[[V0]].s[1]
; CHECK-NEXT: mov [[V1:v[0-9]+]].h[0], [[W1]]
; CHECK-NEXT: mov [[V1]].h[1], [[W0]]
; CHECK-NEXT: shl v[[V0]].4h, [[V1]].4h, #15
; CHECK-NEXT: sshr v[[V0]].4h, v[[V0]].4h, #15
; CHECK-NEXT: cmpne [[PG1:p[0-9]+]].h, [[PG0]]/z, z[[N2]].h, #0
; CHECK-NEXT: ld1h { z0.h }, [[PG1]]/z, [x0]
; CHECK-NEXT: ret
  %a = load <2 x half>, <2 x half>* %ap
  %b = load <2 x half>, <2 x half>* %bp
  %mask = fcmp oeq <2 x half> %a, %b
  %load = call <2 x half> @llvm.masked.load.v2f16(<2 x half>* %ap, i32 8, <2 x i1> %mask, <2 x half> zeroinitializer)
  ret <2 x half> %load
}

define <2 x float> @masked_load_v2f32(<2 x float>* %ap, <2 x float>* %bp) #0 {
; CHECK-LABEL: masked_load_v2f32:
; CHECK: ldr d[[N0:[0-9]+]], [x0]
; CHECK-NEXT: ldr d[[N1:[0-9]+]], [x1]
; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2
; CHECK-NEXT: fcmeq v[[N2:[0-9]+]].2s, v[[N0]].2s, v[[N1]].2s
; CHECK-NEXT: cmpne [[PG1:p[0-9]+]].s, [[PG0]]/z, z[[N2]].s, #0
; CHECK-NEXT: ld1w { z0.s }, [[PG1]]/z, [x0]
; CHECK-NEXT: ret
  %a = load <2 x float>, <2 x float>* %ap
  %b = load <2 x float>, <2 x float>* %bp
  %mask = fcmp oeq <2 x float> %a, %b
  %load = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %ap, i32 8, <2 x i1> %mask, <2 x float> zeroinitializer)
  ret <2 x float> %load
}

define <4 x float> @masked_load_v4f32(<4 x float>* %ap, <4 x float>* %bp) #0 {
; CHECK-LABEL: masked_load_v4f32:
; CHECK: ldr q[[N0:[0-9]+]], [x0]
; CHECK-NEXT: ldr q[[N1:[0-9]+]], [x1]
; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl4
; CHECK-NEXT: fcmeq v[[N2:[0-9]+]].4s, v[[N0]].4s, v[[N1]].4s
; CHECK-NEXT: cmpne [[PG1:p[0-9]+]].s, [[PG0]]/z, z[[N2]].s, #0
; CHECK-NEXT: ld1w { z0.s }, [[PG1]]/z, [x0]
; CHECK-NEXT: ret
  %a = load <4 x float>, <4 x float>* %ap
  %b = load <4 x float>, <4 x float>* %bp
  %mask = fcmp oeq <4 x float> %a, %b
  %load = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %ap, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer)
  ret <4 x float> %load
}

define <8 x float> @masked_load_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 {
; CHECK-LABEL: masked_load_v8f32:
; CHECK: ptrue [[PG0:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
; CHECK-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
; CHECK-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
; CHECK-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
; CHECK-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x0]
; CHECK-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
; CHECK-NEXT: ret
  %a = load <8 x float>, <8 x float>* %ap
  %b = load <8 x float>, <8 x float>* %bp
  %mask = fcmp oeq <8 x float> %a, %b
  %load = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %ap, i32 8, <8 x i1> %mask, <8 x float> zeroinitializer)
  ret <8 x float> %load
}

define <16 x float> @masked_load_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 {
; CHECK-LABEL: masked_load_v16f32:
; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <16 x float>, <16 x float>* %ap
  %b = load <16 x float>, <16 x float>* %bp
  %mask = fcmp oeq <16 x float> %a, %b
  %load = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %ap, i32 8, <16 x i1> %mask, <16 x float> zeroinitializer)
  ret <16 x float> %load
}

define <32 x float> @masked_load_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 {
; CHECK-LABEL: masked_load_v32f32:
; VBITS_GE_1024: ptrue [[PG0:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
; VBITS_GE_1024-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
; VBITS_GE_1024-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
; VBITS_GE_1024-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
; VBITS_GE_1024-NEXT: ret
  %a = load <32 x float>, <32 x float>* %ap
  %b = load <32 x float>, <32 x float>* %bp
  %mask = fcmp oeq <32 x float> %a, %b
  %load = call <32 x float> @llvm.masked.load.v32f32(<32 x float>* %ap, i32 8, <32 x i1> %mask, <32 x float> zeroinitializer)
  ret <32 x float> %load
}

define <64 x float> @masked_load_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0 {
; CHECK-LABEL: masked_load_v64f32:
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
; VBITS_GE_2048-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
; VBITS_GE_2048-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
; VBITS_GE_2048-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
; VBITS_GE_2048-NEXT: ret
  %a = load <64 x float>, <64 x float>* %ap
  %b = load <64 x float>, <64 x float>* %bp
  %mask = fcmp oeq <64 x float> %a, %b
  %load = call <64 x float> @llvm.masked.load.v64f32(<64 x float>* %ap, i32 8, <64 x i1> %mask, <64 x float> zeroinitializer)
  ret <64 x float> %load
}

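; Masked loads of integer vector types.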
define <64 x i8> @masked_load_v64i8(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
; CHECK-LABEL: masked_load_v64i8:
; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].b, vl64
; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, [[PG0]]/z, [x0]
; VBITS_GE_512-NEXT: ld1b { [[Z1:z[0-9]+]].b }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, [[Z1]].b
; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: st1b { [[Z0]].b }, [[PG0]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <64 x i8>, <64 x i8>* %ap
  %b = load <64 x i8>, <64 x i8>* %bp
  %mask = icmp eq <64 x i8> %a, %b
  %load = call <64 x i8> @llvm.masked.load.v64i8(<64 x i8>* %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
  ret <64 x i8> %load
}

define <32 x i16> @masked_load_v32i16(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
; CHECK-LABEL: masked_load_v32i16:
; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, [[PG0]]/z, [x0]
; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, [[Z1]].h
; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG0]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <32 x i16>, <32 x i16>* %ap
  %b = load <32 x i16>, <32 x i16>* %bp
  %mask = icmp eq <32 x i16> %a, %b
  %load = call <32 x i16> @llvm.masked.load.v32i16(<32 x i16>* %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
  ret <32 x i16> %load
}

define <16 x i32> @masked_load_v16i32(<16 x i32>* %ap, <16 x i32>* %bp) #0 {
; CHECK-LABEL: masked_load_v16i32:
; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <16 x i32>, <16 x i32>* %ap
  %b = load <16 x i32>, <16 x i32>* %bp
  %mask = icmp eq <16 x i32> %a, %b
  %load = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %ap, i32 8, <16 x i1> %mask, <16 x i32> undef)
  ret <16 x i32> %load
}

define <8 x i64> @masked_load_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 {
; CHECK-LABEL: masked_load_v8i64:
; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, [[PG0]]/z, [x0]
; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].d, [[PG0]]/z, [[Z0]].d, [[Z1]].d
; VBITS_GE_512-NEXT: ld1d { [[Z0]].d }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG0]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <8 x i64>, <8 x i64>* %ap
  %b = load <8 x i64>, <8 x i64>* %bp
  %mask = icmp eq <8 x i64> %a, %b
  %load = call <8 x i64> @llvm.masked.load.v8i64(<8 x i64>* %ap, i32 8, <8 x i1> %mask, <8 x i64> undef)
  ret <8 x i64> %load
}

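; Masked loads with a non-trivial passthru operand: the loaded lanes are
; merged with the passthru vector via a predicated sel.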
define <8 x i64> @masked_load_passthru_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 {
; CHECK-LABEL: masked_load_passthru_v8i64:
; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, [[PG0]]/z, [x0]
; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].d, [[PG0]]/z, [[Z0]].d, [[Z1]].d
; VBITS_GE_512-NEXT: ld1d { [[Z0]].d }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: sel [[Z2:z[0-9]+]].d, [[PG1]], [[Z0]].d, [[Z1]].d
; VBITS_GE_512-NEXT: st1d { [[Z2]].d }, [[PG0]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <8 x i64>, <8 x i64>* %ap
  %b = load <8 x i64>, <8 x i64>* %bp
  %mask = icmp eq <8 x i64> %a, %b
  %load = call <8 x i64> @llvm.masked.load.v8i64(<8 x i64>* %ap, i32 8, <8 x i1> %mask, <8 x i64> %b)
  ret <8 x i64> %load
}

define <8 x double> @masked_load_passthru_v8f64(<8 x double>* %ap, <8 x double>* %bp) #0 {
; CHECK-LABEL: masked_load_passthru_v8f64:
; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, [[PG0]]/z, [x0]
; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: fcmeq [[PG1:p[0-9]+]].d, [[PG0]]/z, [[Z0]].d, [[Z1]].d
; VBITS_GE_512-NEXT: ld1d { [[Z0]].d }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: sel [[Z2:z[0-9]+]].d, [[PG1]], [[Z0]].d, [[Z1]].d
; VBITS_GE_512-NEXT: st1d { [[Z2]].d }, [[PG0]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <8 x double>, <8 x double>* %ap
  %b = load <8 x double>, <8 x double>* %bp
  %mask = fcmp oeq <8 x double> %a, %b
  %load = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %ap, i32 8, <8 x i1> %mask, <8 x double> %b)
  ret <8 x double> %load
}

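; Masked load combined with sign extension.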
define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
; CHECK-LABEL: masked_load_sext_v32i8i16:
; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].b, vl32
; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, [[PG0]]/z, [x0]
; VBITS_GE_512-NEXT: ld1b { [[Z1:z[0-9]+]].b }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, [[Z1]].b
; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: sunpklo [[Z0]].h, [[Z0]].b
; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG2]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <32 x i8>, <32 x i8>* %ap
  %b = load <32 x i8>, <32 x i8>* %bp
  %mask = icmp eq <32 x i8> %a, %b
  %load = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = sext <32 x i8> %load to <32 x i16>
  ret <32 x i16> %ext
}

define <16 x i32> @masked_load_sext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 {
; CHECK-LABEL: masked_load_sext_v16i8i32:
; VBITS_GE_512: ldr q0, [x0]
; VBITS_GE_512-NEXT: ldr q1, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].b, vl16
; VBITS_GE_512-NEXT: cmeq v[[V:[0-9]+]].16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
; VBITS_GE_512-NEXT: cmpne [[PG1:p[0-9]+]].b, [[PG0]]/z, z[[V]].b, #0
; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: sunpklo [[Z0]].h, [[Z0]].b
; VBITS_GE_512-NEXT: sunpklo [[Z0]].s, [[Z0]].h
; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG2]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <16 x i8>, <16 x i8>* %ap
  %b = load <16 x i8>, <16 x i8>* %bp
  %mask = icmp eq <16 x i8> %a, %b
  %load = call <16 x i8> @llvm.masked.load.v16i8(<16 x i8>* %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
  %ext = sext <16 x i8> %load to <16 x i32>
  ret <16 x i32> %ext
}

define <8 x i64> @masked_load_sext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
; CHECK-LABEL: masked_load_sext_v8i8i64:
; VBITS_GE_512: ldr d0, [x0]
; VBITS_GE_512-NEXT: ldr d1, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].b, vl8
; VBITS_GE_512-NEXT: cmeq v[[V:[0-9]+]].8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
; VBITS_GE_512-NEXT: cmpne p[[PG:[0-9]+]].b, [[PG0]]/z, z[[V]].b, #0
; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p[[PG]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: sunpklo [[Z0]].h, [[Z0]].b
; VBITS_GE_512-NEXT: sunpklo [[Z0]].s, [[Z0]].h
; VBITS_GE_512-NEXT: sunpklo [[Z0]].d, [[Z0]].s
; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <8 x i8>, <8 x i8>* %ap
  %b = load <8 x i8>, <8 x i8>* %bp
  %mask = icmp eq <8 x i8> %a, %b
  %load = call <8 x i8> @llvm.masked.load.v8i8(<8 x i8>* %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
  %ext = sext <8 x i8> %load to <8 x i64>
  ret <8 x i64> %ext
}

define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 {
; CHECK-LABEL: masked_load_sext_v16i16i32:
; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].h, vl16
; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, [[PG0]]/z, [x0]
; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, [[Z1]].h
; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: sunpklo [[Z0]].s, [[Z0]].h
; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG2]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <16 x i16>, <16 x i16>* %ap
  %b = load <16 x i16>, <16 x i16>* %bp
  %mask = icmp eq <16 x i16> %a, %b
  %load = call <16 x i16> @llvm.masked.load.v16i16(<16 x i16>* %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
  %ext = sext <16 x i16> %load to <16 x i32>
  ret <16 x i32> %ext
}

define <8 x i64> @masked_load_sext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
; CHECK-LABEL: masked_load_sext_v8i16i64:
; VBITS_GE_512: ldr q0, [x0]
; VBITS_GE_512-NEXT: ldr q1, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].h, vl8
; VBITS_GE_512-NEXT: cmeq v[[V:[0-9]+]].8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
; VBITS_GE_512-NEXT: cmpne p[[PG:[0-9]+]].h, [[PG0]]/z, z[[V]].h, #0
; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p[[PG]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: sunpklo [[Z0]].s, [[Z0]].h
; VBITS_GE_512-NEXT: sunpklo [[Z0]].d, [[Z0]].s
; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <8 x i16>, <8 x i16>* %ap
  %b = load <8 x i16>, <8 x i16>* %bp
  %mask = icmp eq <8 x i16> %a, %b
  %load = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
  %ext = sext <8 x i16> %load to <8 x i64>
  ret <8 x i64> %ext
}

define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
; CHECK-LABEL: masked_load_sext_v8i32i64:
; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].s, vl8
; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: sunpklo [[Z0]].d, [[Z0]].s
; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <8 x i32>, <8 x i32>* %ap
  %b = load <8 x i32>, <8 x i32>* %bp
  %mask = icmp eq <8 x i32> %a, %b
  %load = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = sext <8 x i32> %load to <8 x i64>
  ret <8 x i64> %ext
}

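; Masked load combined with zero extension.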
define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
; CHECK-LABEL: masked_load_zext_v32i8i16:
; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].b, vl32
; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, [[PG0]]/z, [x0]
; VBITS_GE_512-NEXT: ld1b { [[Z1:z[0-9]+]].b }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, [[Z1]].b
; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: uunpklo [[Z0]].h, [[Z0]].b
; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG2]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <32 x i8>, <32 x i8>* %ap
  %b = load <32 x i8>, <32 x i8>* %bp
  %mask = icmp eq <32 x i8> %a, %b
  %load = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = zext <32 x i8> %load to <32 x i16>
  ret <32 x i16> %ext
}

define <16 x i32> @masked_load_zext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 {
; CHECK-LABEL: masked_load_zext_v16i8i32:
; VBITS_GE_512: ldr q0, [x0]
; VBITS_GE_512-NEXT: ldr q1, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].b, vl16
; VBITS_GE_512-NEXT: cmeq v[[V:[0-9]+]].16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
; VBITS_GE_512-NEXT: cmpne [[PG1:p[0-9]+]].b, [[PG0]]/z, z[[V]].b, #0
; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: uunpklo [[Z0]].h, [[Z0]].b
; VBITS_GE_512-NEXT: uunpklo [[Z0]].s, [[Z0]].h
; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG2]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <16 x i8>, <16 x i8>* %ap
  %b = load <16 x i8>, <16 x i8>* %bp
  %mask = icmp eq <16 x i8> %a, %b
  %load = call <16 x i8> @llvm.masked.load.v16i8(<16 x i8>* %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
  %ext = zext <16 x i8> %load to <16 x i32>
  ret <16 x i32> %ext
}

define <8 x i64> @masked_load_zext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
; CHECK-LABEL: masked_load_zext_v8i8i64:
; VBITS_GE_512: ldr d0, [x0]
; VBITS_GE_512-NEXT: ldr d1, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].b, vl8
; VBITS_GE_512-NEXT: cmeq v[[V:[0-9]+]].8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
; VBITS_GE_512-NEXT: cmpne p[[PG:[0-9]+]].b, [[PG0]]/z, z[[V]].b, #0
; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p[[PG]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: uunpklo [[Z0]].h, [[Z0]].b
; VBITS_GE_512-NEXT: uunpklo [[Z0]].s, [[Z0]].h
; VBITS_GE_512-NEXT: uunpklo [[Z0]].d, [[Z0]].s
; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <8 x i8>, <8 x i8>* %ap
  %b = load <8 x i8>, <8 x i8>* %bp
  %mask = icmp eq <8 x i8> %a, %b
  %load = call <8 x i8> @llvm.masked.load.v8i8(<8 x i8>* %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
  %ext = zext <8 x i8> %load to <8 x i64>
  ret <8 x i64> %ext
}

define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 {
; CHECK-LABEL: masked_load_zext_v16i16i32:
; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].h, vl16
; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, [[PG0]]/z, [x0]
; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, [[Z1]].h
; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: uunpklo [[Z0]].s, [[Z0]].h
; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG2]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <16 x i16>, <16 x i16>* %ap
  %b = load <16 x i16>, <16 x i16>* %bp
  %mask = icmp eq <16 x i16> %a, %b
  %load = call <16 x i16> @llvm.masked.load.v16i16(<16 x i16>* %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
  %ext = zext <16 x i16> %load to <16 x i32>
  ret <16 x i32> %ext
}

define <8 x i64> @masked_load_zext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
; CHECK-LABEL: masked_load_zext_v8i16i64:
; VBITS_GE_512: ldr q0, [x0]
; VBITS_GE_512-NEXT: ldr q1, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].h, vl8
; VBITS_GE_512-NEXT: cmeq v[[V:[0-9]+]].8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
; VBITS_GE_512-NEXT: cmpne p[[PG:[0-9]+]].h, [[PG0]]/z, z[[V]].h, #0
; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p[[PG]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: uunpklo [[Z0]].s, [[Z0]].h
; VBITS_GE_512-NEXT: uunpklo [[Z0]].d, [[Z0]].s
; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <8 x i16>, <8 x i16>* %ap
  %b = load <8 x i16>, <8 x i16>* %bp
  %mask = icmp eq <8 x i16> %a, %b
  %load = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
  %ext = zext <8 x i16> %load to <8 x i64>
  ret <8 x i64> %ext
}

define <8 x i64> @masked_load_zext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
; CHECK-LABEL: masked_load_zext_v8i32i64:
; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].s, vl8
; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: uunpklo [[Z0]].d, [[Z0]].s
; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
; VBITS_GE_512-NEXT: ret
  %a = load <8 x i32>, <8 x i32>* %ap
  %b = load <8 x i32>, <8 x i32>* %bp
  %mask = icmp eq <8 x i32> %a, %b
  %load = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = zext <8 x i32> %load to <8 x i64>
  ret <8 x i64> %ext
}

declare <2 x half> @llvm.masked.load.v2f16(<2 x half>*, i32, <2 x i1>, <2 x half>)
declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
declare <32 x float> @llvm.masked.load.v32f32(<32 x float>*, i32, <32 x i1>, <32 x float>)
declare <64 x float> @llvm.masked.load.v64f32(<64 x float>*, i32, <64 x i1>, <64 x float>)

declare <64 x i8> @llvm.masked.load.v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>)
declare <32 x i8> @llvm.masked.load.v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>)
declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
declare <16 x i16> @llvm.masked.load.v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>)
declare <8 x i8> @llvm.masked.load.v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
declare <8 x i16> @llvm.masked.load.v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
declare <8 x i32> @llvm.masked.load.v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
declare <32 x i16> @llvm.masked.load.v32i16(<32 x i16>*, i32, <32 x i1>, <32 x i16>)
declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
declare <8 x i64> @llvm.masked.load.v8i64(<8 x i64>*, i32, <8 x i1>, <8 x i64>)
declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)

attributes #0 = { "target-features"="+sve" }