1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
3 ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
5 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
6 ; unscaled unpacked 32-bit offsets (zero-extended byte offsets; <vscale x 2 x i32> offsets used with .d element gathers)
7 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Gathers i8 elements from %base plus zero-extended 32-bit byte offsets under
; %mask, then zero-extends each loaded element to i64.
; The CHECK line expects an ld1b that uses the uxtw (unsigned 32-bit offset)
; addressing mode on .d lanes.
9 define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
10 ; CHECK-LABEL: masked_gather_nxv2i8:
12 ; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
14 %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
15 %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
16 %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
17 %vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
18 ret <vscale x 2 x i64> %vals.zext
; Gathers i16 elements from %base plus zero-extended 32-bit byte offsets under
; %mask, then zero-extends each loaded element to i64.
; The CHECK line expects an ld1h with the uxtw addressing mode on .d lanes.
21 define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
22 ; CHECK-LABEL: masked_gather_nxv2i16:
24 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
26 %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
; The GEP is over i8, so the offsets are byte offsets (not scaled by element size).
27 %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
28 %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
29 %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
30 %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
31 ret <vscale x 2 x i64> %vals.zext
; Gathers i32 elements from %base plus zero-extended 32-bit byte offsets under
; %mask, then zero-extends each loaded element to i64.
; The CHECK line expects an ld1w with the uxtw addressing mode on .d lanes.
34 define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
35 ; CHECK-LABEL: masked_gather_nxv2i32:
37 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
39 %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
; The GEP is over i8, so the offsets are byte offsets (not scaled by element size).
40 %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
41 %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
42 %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
43 %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
44 ret <vscale x 2 x i64> %vals.zext
; Gathers i64 elements from %base plus zero-extended 32-bit byte offsets under
; %mask and returns them directly (no extension of the result is needed).
; The CHECK line expects an ld1d with the uxtw addressing mode on .d lanes.
47 define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
48 ; CHECK-LABEL: masked_gather_nxv2i64:
50 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
52 %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
; The GEP is over i8, so the offsets are byte offsets (not scaled by element size).
53 %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
54 %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
55 %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
56 ret <vscale x 2 x i64> %vals
; Gathers half elements from %base plus zero-extended 32-bit byte offsets under
; %mask and returns them as <vscale x 2 x half>.
; The CHECK line expects an ld1h with the uxtw addressing mode on .d lanes.
59 define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
60 ; CHECK-LABEL: masked_gather_nxv2f16:
62 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
64 %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
; The GEP is over i8, so the offsets are byte offsets (not scaled by element size).
65 %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
66 %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
67 %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
68 ret <vscale x 2 x half> %vals
; Gathers bfloat elements from %base plus zero-extended 32-bit byte offsets
; under %mask and returns them as <vscale x 2 x bfloat>.
; Uses attribute group #0, which enables +bf16 in addition to +sve.
; The CHECK line expects an ld1h with the uxtw addressing mode on .d lanes.
71 define <vscale x 2 x bfloat> @masked_gather_nxv2bf16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) #0 {
72 ; CHECK-LABEL: masked_gather_nxv2bf16:
74 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
76 %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
; The GEP is over i8, so the offsets are byte offsets (not scaled by element size).
77 %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
78 %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x bfloat*>
79 %vals = call <vscale x 2 x bfloat> @llvm.masked.gather.nxv2bf16(<vscale x 2 x bfloat*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x bfloat> undef)
80 ret <vscale x 2 x bfloat> %vals
; Gathers float elements from %base plus zero-extended 32-bit byte offsets under
; %mask and returns them as <vscale x 2 x float>.
; The CHECK line expects an ld1w with the uxtw addressing mode on .d lanes.
83 define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
84 ; CHECK-LABEL: masked_gather_nxv2f32:
86 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
88 %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
; The GEP is over i8, so the offsets are byte offsets (not scaled by element size).
89 %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
90 %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
91 %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
92 ret <vscale x 2 x float> %vals
; Gathers double elements from %base plus zero-extended 32-bit byte offsets
; under %mask and returns them as <vscale x 2 x double>.
; The CHECK line expects an ld1d with the uxtw addressing mode on .d lanes.
95 define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
96 ; CHECK-LABEL: masked_gather_nxv2f64:
98 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
100 %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
; The GEP is over i8, so the offsets are byte offsets (not scaled by element size).
101 %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
102 %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
103 %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
104 ret <vscale x 2 x double> %vals
; Signed-result variant: gathers i8 elements from %base plus zero-extended
; 32-bit byte offsets under %mask, then sign-extends each element to i64.
; The CHECK line expects the sign-extending load ld1sb with the uxtw
; addressing mode on .d lanes.
107 define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
108 ; CHECK-LABEL: masked_sgather_nxv2i8:
110 ; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, z0.d, uxtw]
112 %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
113 %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
114 %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
115 %vals.sext = sext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
116 ret <vscale x 2 x i64> %vals.sext
; Signed-result variant: gathers i16 elements from %base plus zero-extended
; 32-bit byte offsets under %mask, then sign-extends each element to i64.
; The CHECK line expects ld1sh with the uxtw addressing mode on .d lanes.
119 define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
120 ; CHECK-LABEL: masked_sgather_nxv2i16:
122 ; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
124 %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
; The GEP is over i8, so the offsets are byte offsets (not scaled by element size).
125 %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
126 %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
127 %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
128 %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
129 ret <vscale x 2 x i64> %vals.sext
; Signed-result variant: gathers i32 elements from %base plus zero-extended
; 32-bit byte offsets under %mask, then sign-extends each element to i64.
; Note the offsets are still zero-extended (uxtw); only the loaded data is
; sign-extended, which the CHECK line expresses as ld1sw.
132 define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
133 ; CHECK-LABEL: masked_sgather_nxv2i32:
135 ; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw]
137 %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
; The GEP is over i8, so the offsets are byte offsets (not scaled by element size).
138 %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
139 %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
140 %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
141 %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
142 ret <vscale x 2 x i64> %vals.sext
145 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
146 ; unscaled packed 32-bit offsets (zero-extended byte offsets; <vscale x 4 x i32> offsets used with .s element gathers)
147 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Gathers i8 elements from %base plus zero-extended 32-bit byte offsets under
; %mask, then zero-extends each loaded element to i32.
; The CHECK line expects an ld1b with the uxtw addressing mode on .s lanes.
149 define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
150 ; CHECK-LABEL: masked_gather_nxv4i8:
152 ; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
154 %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
155 %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
156 %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
157 %vals.zext = zext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
158 ret <vscale x 4 x i32> %vals.zext
; Gathers i16 elements from %base plus zero-extended 32-bit byte offsets under
; %mask, then zero-extends each loaded element to i32.
; The CHECK line expects an ld1h with the uxtw addressing mode on .s lanes.
161 define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
162 ; CHECK-LABEL: masked_gather_nxv4i16:
164 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
166 %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
; The GEP is over i8, so the offsets are byte offsets (not scaled by element size).
167 %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
168 %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
169 %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
170 %vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
171 ret <vscale x 4 x i32> %vals.zext
; Gathers i32 elements from %base plus zero-extended 32-bit byte offsets under
; %mask and returns them directly (no extension of the result is needed).
; The CHECK line expects an ld1w with the uxtw addressing mode on .s lanes.
174 define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
175 ; CHECK-LABEL: masked_gather_nxv4i32:
177 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
179 %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
; The GEP is over i8, so the offsets are byte offsets (not scaled by element size).
180 %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
181 %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*>
182 %vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
183 ret <vscale x 4 x i32> %vals
; Gathers half elements from %base plus zero-extended 32-bit byte offsets under
; %mask and returns them as <vscale x 4 x half>.
; The CHECK line expects an ld1h with the uxtw addressing mode on .s lanes.
186 define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
187 ; CHECK-LABEL: masked_gather_nxv4f16:
189 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
191 %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
; The GEP is over i8, so the offsets are byte offsets (not scaled by element size).
192 %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
193 %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*>
194 %vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
195 ret <vscale x 4 x half> %vals
; Gathers bfloat elements from %base plus zero-extended 32-bit byte offsets
; under %mask and returns them as <vscale x 4 x bfloat>.
; Uses attribute group #0, which enables +bf16 in addition to +sve.
; The CHECK line expects an ld1h with the uxtw addressing mode on .s lanes.
198 define <vscale x 4 x bfloat> @masked_gather_nxv4bf16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) #0 {
199 ; CHECK-LABEL: masked_gather_nxv4bf16:
201 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
203 %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
; The GEP is over i8, so the offsets are byte offsets (not scaled by element size).
204 %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
205 %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x bfloat*>
206 %vals = call <vscale x 4 x bfloat> @llvm.masked.gather.nxv4bf16(<vscale x 4 x bfloat*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x bfloat> undef)
207 ret <vscale x 4 x bfloat> %vals
; Gathers float elements from %base plus zero-extended 32-bit byte offsets under
; %mask and returns them as <vscale x 4 x float>.
; The CHECK line expects an ld1w with the uxtw addressing mode on .s lanes.
210 define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
211 ; CHECK-LABEL: masked_gather_nxv4f32:
213 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
215 %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
; The GEP is over i8, so the offsets are byte offsets (not scaled by element size).
216 %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
217 %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*>
218 %vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
219 ret <vscale x 4 x float> %vals
; Signed-result variant: gathers i8 elements from %base plus zero-extended
; 32-bit byte offsets under %mask, then sign-extends each element to i32.
; The CHECK line expects ld1sb with the uxtw addressing mode on .s lanes.
222 define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
223 ; CHECK-LABEL: masked_sgather_nxv4i8:
225 ; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
227 %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
228 %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
229 %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
230 %vals.sext = sext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
231 ret <vscale x 4 x i32> %vals.sext
; Signed-result variant: gathers i16 elements from %base plus zero-extended
; 32-bit byte offsets under %mask, then sign-extends each element to i32.
; The CHECK line expects ld1sh with the uxtw addressing mode on .s lanes.
234 define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
235 ; CHECK-LABEL: masked_sgather_nxv4i16:
237 ; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
239 %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
; The GEP is over i8, so the offsets are byte offsets (not scaled by element size).
240 %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
241 %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
242 %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
243 %vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
244 ret <vscale x 4 x i32> %vals.sext
; Declarations of the llvm.masked.gather intrinsics called by the tests in this
; file. Operands in order: vector of pointers, i32 alignment, i1 mask vector,
; passthru vector. First the <vscale x 2 x ...> variants:
247 declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
248 declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
249 declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
250 declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
251 declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
252 declare <vscale x 2 x bfloat> @llvm.masked.gather.nxv2bf16(<vscale x 2 x bfloat*>, i32, <vscale x 2 x i1>, <vscale x 2 x bfloat>)
253 declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
254 declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
; ... then the <vscale x 4 x ...> variants:
256 declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
257 declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
258 declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
259 declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
260 declare <vscale x 4 x bfloat> @llvm.masked.gather.nxv4bf16(<vscale x 4 x bfloat*>, i32, <vscale x 4 x i1>, <vscale x 4 x bfloat>)
261 declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
; Attribute group referenced by the bfloat tests: enables +bf16 alongside +sve.
262 attributes #0 = { "target-features"="+sve,+bf16" }