llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=aarch64-linux-unknown | FileCheck %s
   3
   4
   5 ; Ensure we use a "vscale x 4" wide scatter for the maximum supported offset.
   6 define void @scatter_i8_index_offset_maximum(ptr %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i8> %data) #0 {
   7 ; CHECK-LABEL: scatter_i8_index_offset_maximum:
   8 ; CHECK:       // %bb.0:
   9 ; CHECK-NEXT:    mov w8, #33554431 // =0x1ffffff
  10 ; CHECK-NEXT:    index z1.s, #0, w8
  11 ; CHECK-NEXT:    add x8, x0, x1
  12 ; CHECK-NEXT:    st1b { z0.s }, p0, [x8, z1.s, sxtw]
  13 ; CHECK-NEXT:    ret
       ; Masked scatter of %data to the i8 addresses
       ;   %base + (splat(%offset) + 33554431 * stepvector).
       ; The expected code above keeps a single st1b with a 32-bit index vector
       ; (z1.s, sxtw) and folds %offset into the scalar base register.
       ; %t0/%t1 build the splat of %offset; %t4 is the strided step vector.
  14   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
  15   %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
  16   %step = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
  17   %t4 = mul <vscale x 4 x i64> splat(i64 33554431), %step
  18   %t5 = add <vscale x 4 x i64> %t1, %t4
  19   %t6 = getelementptr i8, ptr %base, <vscale x 4 x i64> %t5
  20   call void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8> %data, <vscale x 4 x ptr> %t6, i32 2, <vscale x 4 x i1> %pg)
  21   ret void
  22 }
  23
  24 ; Ensure we use a "vscale x 4" wide scatter for the minimum supported offset.
  25 define void @scatter_i16_index_offset_minimum(ptr %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i16> %data) #0 {
  26 ; CHECK-LABEL: scatter_i16_index_offset_minimum:
  27 ; CHECK:       // %bb.0:
  28 ; CHECK-NEXT:    mov w8, #-33554432 // =0xfe000000
  29 ; CHECK-NEXT:    index z1.s, #0, w8
  30 ; CHECK-NEXT:    add x8, x0, x1, lsl #1
  31 ; CHECK-NEXT:    st1h { z0.s }, p0, [x8, z1.s, sxtw #1]
  32 ; CHECK-NEXT:    ret
       ; Masked scatter of %data through an i16 GEP indexed by
       ;   splat(%offset) + (-33554432) * stepvector.
       ; The expected code above keeps a single st1h with a 32-bit index vector
       ; (sxtw #1) and folds %offset, scaled by the element size (lsl #1), into
       ; the scalar base register.
  33   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
  34   %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
  35   %step = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
  36   %t4 = mul <vscale x 4 x i64> splat(i64 -33554432), %step
  37   %t5 = add <vscale x 4 x i64> %t1, %t4
  38   %t6 = getelementptr i16, ptr %base, <vscale x 4 x i64> %t5
  39   call void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16> %data, <vscale x 4 x ptr> %t6, i32 2, <vscale x 4 x i1> %pg)
  40   ret void
  41 }
  42
  43 ; Ensure we use a "vscale x 4" gather for an offset in the limits of 32 bits.
  44 define <vscale x 4 x i8> @gather_i8_index_offset_8(ptr %base, i64 %offset, <vscale x 4 x i1> %pg) #0 {
  45 ; CHECK-LABEL: gather_i8_index_offset_8:
  46 ; CHECK:       // %bb.0:
  47 ; CHECK-NEXT:    index z0.s, #0, #1
  48 ; CHECK-NEXT:    add x8, x0, x1
  49 ; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x8, z0.s, sxtw]
  50 ; CHECK-NEXT:    ret
       ; Masked gather of i8 elements from
       ;   %base + (splat(%offset) + 1 * stepvector), with an undef passthru.
       ; The expected code above is a single ld1b with a 32-bit index vector
       ; (index #0, #1) and %offset folded into the scalar base register.
  51   %splat.insert0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
  52   %splat0 = shufflevector <vscale x 4 x i64> %splat.insert0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
  53   %step = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
  54   %t1 = mul <vscale x 4 x i64> splat(i64 1), %step
  55   %t2 = add <vscale x 4 x i64> %splat0, %t1
  56   %t3 = getelementptr i8, ptr %base, <vscale x 4 x i64> %t2
  57   %load = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x ptr> %t3, i32 4, <vscale x 4 x i1> %pg, <vscale x 4 x i8> undef)
  58    ret <vscale x 4 x i8> %load
  59 }
  60
  61 ;; Negative tests
  62
  63 ; Ensure we don't use a "vscale x 4" scatter. Cannot prove that variable stride
  64 ; will not wrap when shrunk to be i32 based.
  65 define void @scatter_f16_index_offset_var(ptr %base, i64 %offset, i64 %scale, <vscale x 4 x i1> %pg, <vscale x 4 x half> %data) #0 {
  66 ; CHECK-LABEL: scatter_f16_index_offset_var:
  67 ; CHECK:       // %bb.0:
  68 ; CHECK-NEXT:    index z1.d, #0, #1
  69 ; CHECK-NEXT:    mov z2.d, x1
  70 ; CHECK-NEXT:    ptrue p1.d
  71 ; CHECK-NEXT:    uunpklo z3.d, z0.s
  72 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
  73 ; CHECK-NEXT:    punpklo p2.h, p0.b
  74 ; CHECK-NEXT:    punpkhi p0.h, p0.b
  75 ; CHECK-NEXT:    movprfx z4, z2
  76 ; CHECK-NEXT:    mla z4.d, p1/m, z1.d, z2.d
  77 ; CHECK-NEXT:    incd z1.d
  78 ; CHECK-NEXT:    mad z1.d, p1/m, z2.d, z2.d
  79 ; CHECK-NEXT:    st1h { z3.d }, p2, [x0, z4.d, lsl #1]
  80 ; CHECK-NEXT:    st1h { z0.d }, p0, [x0, z1.d, lsl #1]
  81 ; CHECK-NEXT:    ret
       ; Negative test: the stride is a runtime value, and the expected code above
       ; splits the scatter into two st1h stores with 64-bit index vectors
       ; (unpacked halves of the data and predicate).
       ;
       ; NOTE(review): %t2 (built from %scale) is never used; %t3 re-splats %t0,
       ; so the stride actually multiplied in is %offset, not %scale. The expected
       ; code agrees (only x1 is broadcast). Presumably %t3 was meant to splat
       ; %t2 -- confirm, and regenerate the assertions if the IR is changed.
  82   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
  83   %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
  84   %t2 = insertelement <vscale x 4 x i64> undef, i64 %scale, i32 0
  85   %t3 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
  86   %step = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
  87   %t4 = mul <vscale x 4 x i64> %t3, %step
  88   %t5 = add <vscale x 4 x i64> %t1, %t4
  89   %t6 = getelementptr half, ptr %base, <vscale x 4 x i64> %t5
  90   call void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x ptr> %t6, i32 2, <vscale x 4 x i1> %pg)
  91   ret void
  92 }
  93
  94 ; Ensure we don't use a "vscale x 4" wide scatter when the offset is too big.
  95 define void @scatter_i8_index_offset_maximum_plus_one(ptr %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i8> %data) #0 {
  96 ; CHECK-LABEL: scatter_i8_index_offset_maximum_plus_one:
  97 ; CHECK:       // %bb.0:
  98 ; CHECK-NEXT:    mov w8, #33554432 // =0x2000000
  99 ; CHECK-NEXT:    uunpklo z2.d, z0.s
 100 ; CHECK-NEXT:    rdvl x9, #1
 101 ; CHECK-NEXT:    index z1.d, #0, x8
 102 ; CHECK-NEXT:    punpklo p1.h, p0.b
 103 ; CHECK-NEXT:    lsr x9, x9, #4
 104 ; CHECK-NEXT:    add x8, x0, x1
 105 ; CHECK-NEXT:    mov w10, #67108864 // =0x4000000
 106 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
 107 ; CHECK-NEXT:    punpkhi p0.h, p0.b
 108 ; CHECK-NEXT:    st1b { z2.d }, p1, [x8, z1.d]
 109 ; CHECK-NEXT:    madd x8, x9, x10, x8
 110 ; CHECK-NEXT:    st1b { z0.d }, p0, [x8, z1.d]
 111 ; CHECK-NEXT:    ret
       ; Negative test: same shape as scatter_i8_index_offset_maximum but with a
       ; stride of 33554432 (one more than that test uses). The expected code
       ; above splits the scatter into two st1b stores with 64-bit index vectors,
       ; advancing the scalar base between them with madd.
 112   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
 113   %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
 114   %step = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 115   %t4 = mul <vscale x 4 x i64> splat(i64 33554432), %step
 116   %t5 = add <vscale x 4 x i64> %t1, %t4
 117   %t6 = getelementptr i8, ptr %base, <vscale x 4 x i64> %t5
 118   call void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8> %data, <vscale x 4 x ptr> %t6, i32 2, <vscale x 4 x i1> %pg)
 119   ret void
 120 }
 121
 122 ; Ensure we don't use a "vscale x 4" wide scatter when the offset is too small.
 123 define void @scatter_i8_index_offset_minimum_minus_one(ptr %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i8> %data) #0 {
 124 ; CHECK-LABEL: scatter_i8_index_offset_minimum_minus_one:
 125 ; CHECK:       // %bb.0:
 126 ; CHECK-NEXT:    mov x8, #-33554433 // =0xfffffffffdffffff
 127 ; CHECK-NEXT:    uunpklo z2.d, z0.s
 128 ; CHECK-NEXT:    rdvl x9, #1
 129 ; CHECK-NEXT:    index z1.d, #0, x8
 130 ; CHECK-NEXT:    punpklo p1.h, p0.b
 131 ; CHECK-NEXT:    lsr x9, x9, #4
 132 ; CHECK-NEXT:    mov x10, #-2 // =0xfffffffffffffffe
 133 ; CHECK-NEXT:    add x8, x0, x1
 134 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
 135 ; CHECK-NEXT:    movk x10, #64511, lsl #16
 136 ; CHECK-NEXT:    punpkhi p0.h, p0.b
 137 ; CHECK-NEXT:    st1b { z2.d }, p1, [x8, z1.d]
 138 ; CHECK-NEXT:    madd x8, x9, x10, x8
 139 ; CHECK-NEXT:    st1b { z0.d }, p0, [x8, z1.d]
 140 ; CHECK-NEXT:    ret
       ; Negative test: i8 scatter with a stride of -33554433 (one less than the
       ; stride used by scatter_i16_index_offset_minimum). The expected code above
       ; splits the scatter into two st1b stores with 64-bit index vectors,
       ; advancing the scalar base between them with madd.
 141   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
 142   %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
 143   %step = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 144   %t4 = mul <vscale x 4 x i64> splat(i64 -33554433), %step
 145   %t5 = add <vscale x 4 x i64> %t1, %t4
 146   %t6 = getelementptr i8, ptr %base, <vscale x 4 x i64> %t5
 147   call void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8> %data, <vscale x 4 x ptr> %t6, i32 2, <vscale x 4 x i1> %pg)
 148   ret void
 149 }
 150
 151 ; Ensure we don't use a "vscale x 4" wide scatter when the stride is too big.
 152 define void @scatter_i8_index_stride_too_big(ptr %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i8> %data) #0 {
 153 ; CHECK-LABEL: scatter_i8_index_stride_too_big:
 154 ; CHECK:       // %bb.0:
 155 ; CHECK-NEXT:    mov x8, #4611686018427387904 // =0x4000000000000000
 156 ; CHECK-NEXT:    uunpklo z2.d, z0.s
 157 ; CHECK-NEXT:    rdvl x9, #1
 158 ; CHECK-NEXT:    index z1.d, #0, x8
 159 ; CHECK-NEXT:    punpklo p1.h, p0.b
 160 ; CHECK-NEXT:    lsr x9, x9, #4
 161 ; CHECK-NEXT:    add x8, x0, x1
 162 ; CHECK-NEXT:    mov x10, #-9223372036854775808 // =0x8000000000000000
 163 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
 164 ; CHECK-NEXT:    punpkhi p0.h, p0.b
 165 ; CHECK-NEXT:    st1b { z2.d }, p1, [x8, z1.d]
 166 ; CHECK-NEXT:    madd x8, x9, x10, x8
 167 ; CHECK-NEXT:    st1b { z0.d }, p0, [x8, z1.d]
 168 ; CHECK-NEXT:    ret
       ; Negative test: i8 scatter whose stride is 4611686018427387904
       ; (0x4000000000000000). The expected code above splits the scatter into two
       ; st1b stores with 64-bit index vectors, advancing the scalar base between
       ; them with madd.
 169   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
 170   %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
 171   %step = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 172   %t4 = mul <vscale x 4 x i64> splat(i64 4611686018427387904), %step
 173   %t5 = add <vscale x 4 x i64> %t1, %t4
 174   %t6 = getelementptr i8, ptr %base, <vscale x 4 x i64> %t5
 175   call void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8> %data, <vscale x 4 x ptr> %t6, i32 2, <vscale x 4 x i1> %pg)
 176   ret void
 177 }
 178
 179 ; Ensure the resulting load is "vscale x 4" wide, despite the offset giving the
 180 ; impression the gather must be split due to its <vscale x 4 x i64> offset.
 181 ; gather_i8(base, index(offset, 8 * sizeof(i8)))
 182 define <vscale x 4 x i8> @gather_8i8_index_offset_8(ptr %base, i64 %offset, <vscale x 4 x i1> %pg) #0 {
 183 ; CHECK-LABEL: gather_8i8_index_offset_8:
 184 ; CHECK:       // %bb.0:
 185 ; CHECK-NEXT:    index z0.s, #0, #8
 186 ; CHECK-NEXT:    add x8, x0, x1, lsl #3
 187 ; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x8, z0.s, sxtw]
 188 ; CHECK-NEXT:    ret
       ; Masked gather of i8 elements addressed through a GEP over [8 x i8], so
       ; each index step advances 8 bytes. The expected code above uses a 32-bit
       ; index vector with step 8 and folds %offset (lsl #3) into the scalar base.
       ; %t4 is a bitcast to the identical pointer-vector type.
 189   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
 190   %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
 191   %step = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 192   %t2 = add <vscale x 4 x i64> %t1, %step
 193   %t3 = getelementptr [8 x i8], ptr %base, <vscale x 4 x i64> %t2
 194   %t4 = bitcast <vscale x 4 x ptr> %t3 to <vscale x 4 x ptr>
 195   %load = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x ptr> %t4, i32 4, <vscale x 4 x i1> %pg, <vscale x 4 x i8> undef)
 196   ret <vscale x 4 x i8> %load
 197 }
 198
 199 ; Ensure the resulting load is "vscale x 4" wide, despite the offset giving the
 200 ; impression the gather must be split due to its <vscale x 4 x i64> offset.
 201 ; gather_f32(base, index(offset, 8 * sizeof(float)))
 202 define <vscale x 4 x float> @gather_f32_index_offset_8(ptr %base, i64 %offset, <vscale x 4 x i1> %pg) #0 {
 203 ; CHECK-LABEL: gather_f32_index_offset_8:
 204 ; CHECK:       // %bb.0:
 205 ; CHECK-NEXT:    mov w8, #32 // =0x20
 206 ; CHECK-NEXT:    index z0.s, #0, w8
 207 ; CHECK-NEXT:    add x8, x0, x1, lsl #5
 208 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, z0.s, sxtw]
 209 ; CHECK-NEXT:    ret
       ; Masked gather of float elements addressed through a GEP over [8 x float],
       ; so each index step advances 32 bytes. The expected code above uses a
       ; 32-bit index vector with step 32 and folds %offset (lsl #5) into the
       ; scalar base. %t4 is a bitcast to the identical pointer-vector type.
 210   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
 211   %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
 212   %step = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 213   %t2 = add <vscale x 4 x i64> %t1, %step
 214   %t3 = getelementptr [8 x float], ptr %base, <vscale x 4 x i64> %t2
 215   %t4 = bitcast <vscale x 4 x ptr> %t3 to <vscale x 4 x ptr>
 216   %load = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x ptr> %t4, i32 4, <vscale x 4 x i1> %pg, <vscale x 4 x float> undef)
 217   ret <vscale x 4 x float> %load
 218 }
 219
 220 ; Ensure the resulting store is "vscale x 4" wide, despite the offset giving the
 221 ; impression the scatter must be split due to its <vscale x 4 x i64> offset.
 222 ; scatter_i8(base, index(offset, 8 * sizeof(i8)))
 223 define void @scatter_i8_index_offset_8(ptr %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i8> %data) #0 {
 224 ; CHECK-LABEL: scatter_i8_index_offset_8:
 225 ; CHECK:       // %bb.0:
 226 ; CHECK-NEXT:    index z1.s, #0, #8
 227 ; CHECK-NEXT:    add x8, x0, x1, lsl #3
 228 ; CHECK-NEXT:    st1b { z0.s }, p0, [x8, z1.s, sxtw]
 229 ; CHECK-NEXT:    ret
       ; Masked scatter of i8 elements addressed through a GEP over [8 x i8], so
       ; each index step advances 8 bytes. The expected code above uses a 32-bit
       ; index vector with step 8 and folds %offset (lsl #3) into the scalar base.
       ; %t4 is a bitcast to the identical pointer-vector type.
 230   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
 231   %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
 232   %step = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 233   %t2 = add <vscale x 4 x i64> %t1, %step
 234   %t3 = getelementptr [8 x i8], ptr %base, <vscale x 4 x i64> %t2
 235   %t4 = bitcast <vscale x 4 x ptr> %t3 to <vscale x 4 x ptr>
 236   call void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8> %data, <vscale x 4 x ptr> %t4, i32 2, <vscale x 4 x i1> %pg)
 237   ret void
 238 }
 239
 240 ; Ensure the resulting store is "vscale x 4" wide, despite the offset giving the
 241 ; impression the scatter must be split due to its <vscale x 4 x i64> offset.
 242 ; scatter_f16(base, index(offset, 8 * sizeof(half)))
 243 define void @scatter_f16_index_offset_8(ptr %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x half> %data) #0 {
 244 ; CHECK-LABEL: scatter_f16_index_offset_8:
 245 ; CHECK:       // %bb.0:
 246 ; CHECK-NEXT:    mov w8, #16 // =0x10
 247 ; CHECK-NEXT:    index z1.s, #0, w8
 248 ; CHECK-NEXT:    add x8, x0, x1, lsl #4
 249 ; CHECK-NEXT:    st1h { z0.s }, p0, [x8, z1.s, sxtw]
 250 ; CHECK-NEXT:    ret
       ; Masked scatter of half elements addressed through a GEP over [8 x half],
       ; so each index step advances 16 bytes. The expected code above uses a
       ; 32-bit index vector with step 16 and folds %offset (lsl #4) into the
       ; scalar base. %t4 is a bitcast to the identical pointer-vector type.
 251   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
 252   %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
 253   %step = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 254   %t2 = add <vscale x 4 x i64> %t1, %step
 255   %t3 = getelementptr [8 x half], ptr %base, <vscale x 4 x i64> %t2
 256   %t4 = bitcast <vscale x 4 x ptr> %t3 to <vscale x 4 x ptr>
 257   call void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x ptr> %t4, i32 2, <vscale x 4 x i1> %pg)
 258   ret void
 259 }
 260
 261 ; stepvector is hidden further behind GEP and two adds.
 262 define void @scatter_f16_index_add_add(ptr %base, i64 %offset, i64 %offset2, <vscale x 4 x i1> %pg, <vscale x 4 x half> %data) #0 {
 263 ; CHECK-LABEL: scatter_f16_index_add_add:
 264 ; CHECK:       // %bb.0:
 265 ; CHECK-NEXT:    mov w8, #16 // =0x10
 266 ; CHECK-NEXT:    add x9, x0, x2, lsl #4
 267 ; CHECK-NEXT:    index z1.s, #0, w8
 268 ; CHECK-NEXT:    add x8, x9, x1, lsl #4
 269 ; CHECK-NEXT:    st1h { z0.s }, p0, [x8, z1.s, sxtw]
 270 ; CHECK-NEXT:    ret
       ; Index vector is (splat(%offset) + stepvector) + splat(%offset2), used in
       ; a GEP over [8 x half]. The expected code above folds both scalar offsets
       ; (each lsl #4) into the scalar base and keeps a single st1h with a 32-bit
       ; index vector of step 16. %gep.bc is a bitcast to the identical type.
 271   %splat.offset.ins = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
 272   %splat.offset = shufflevector <vscale x 4 x i64> %splat.offset.ins, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
 273   %splat.offset2.ins = insertelement <vscale x 4 x i64> undef, i64 %offset2, i32 0
 274   %splat.offset2 = shufflevector <vscale x 4 x i64> %splat.offset2.ins, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
 275   %step = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 276   %add1 = add <vscale x 4 x i64> %splat.offset, %step
 277   %add2 = add <vscale x 4 x i64> %add1, %splat.offset2
 278   %gep = getelementptr [8 x half], ptr %base, <vscale x 4 x i64> %add2
 279   %gep.bc = bitcast <vscale x 4 x ptr> %gep to <vscale x 4 x ptr>
 280   call void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x ptr> %gep.bc, i32 2, <vscale x 4 x i1> %pg)
 281   ret void
 282 }
 283
 284 ; stepvector is hidden further behind GEP, two adds and a multiply.
 285 define void @scatter_f16_index_add_add_mul(ptr %base, i64 %offset, i64 %offset2, <vscale x 4 x i1> %pg, <vscale x 4 x half> %data) #0 {
 286 ; CHECK-LABEL: scatter_f16_index_add_add_mul:
 287 ; CHECK:       // %bb.0:
 288 ; CHECK-NEXT:    mov w8, #128 // =0x80
 289 ; CHECK-NEXT:    add x9, x0, x2, lsl #7
 290 ; CHECK-NEXT:    index z1.s, #0, w8
 291 ; CHECK-NEXT:    add x8, x9, x1, lsl #7
 292 ; CHECK-NEXT:    st1h { z0.s }, p0, [x8, z1.s, sxtw]
 293 ; CHECK-NEXT:    ret
       ; Index vector is ((splat(%offset) + stepvector) + splat(%offset2)) * 8,
       ; used in a GEP over [8 x half]. The expected code above folds both scalar
       ; offsets (each lsl #7) into the scalar base and keeps a single st1h with a
       ; 32-bit index vector of step 128. %gep.bc is a bitcast to the identical
       ; type.
 294   %splat.offset.ins = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
 295   %splat.offset = shufflevector <vscale x 4 x i64> %splat.offset.ins, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
 296   %splat.offset2.ins = insertelement <vscale x 4 x i64> undef, i64 %offset2, i32 0
 297   %splat.offset2 = shufflevector <vscale x 4 x i64> %splat.offset2.ins, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
 298   %step = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 299   %add1 = add <vscale x 4 x i64> %splat.offset, %step
 300   %add2 = add <vscale x 4 x i64> %add1, %splat.offset2
 301   %mul = mul <vscale x 4 x i64> %add2, splat(i64 8)
 302   %gep = getelementptr [8 x half], ptr %base, <vscale x 4 x i64> %mul
 303   %gep.bc = bitcast <vscale x 4 x ptr> %gep to <vscale x 4 x ptr>
 304   call void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x ptr> %gep.bc, i32 2, <vscale x 4 x i1> %pg)
 305   ret void
 306 }
 307
 308 define <vscale x 2 x i64> @masked_gather_nxv2i64_const_with_vec_offsets(<vscale x 2 x i64> %vector_offsets, <vscale x 2 x i1> %pg) #0 {
 309 ; CHECK-LABEL: masked_gather_nxv2i64_const_with_vec_offsets:
 310 ; CHECK:       // %bb.0:
 311 ; CHECK-NEXT:    mov w8, #8 // =0x8
 312 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, z0.d, lsl #3]
 313 ; CHECK-NEXT:    ret
       ; Masked gather of i64 elements from the constant base address 8
       ; (inttoptr) indexed by %vector_offsets. The expected code above
       ; materialises the constant in a scalar register and uses it as the base
       ; of a single ld1d with scaled (lsl #3) vector offsets.
 314   %ptrs = getelementptr i64, ptr inttoptr (i64 8 to ptr), <vscale x 2 x i64> %vector_offsets
 315   %data = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x ptr> %ptrs, i32 8, <vscale x 2 x i1> %pg, <vscale x 2 x i64> undef)
 316   ret <vscale x 2 x i64> %data
 317 }
 318
 319 define <vscale x 2 x i64> @masked_gather_nxv2i64_null_with_vec_plus_scalar_offsets(<vscale x 2 x i64> %vector_offsets, i64 %scalar_offset, <vscale x 2 x i1> %pg) #0 {
 320 ; CHECK-LABEL: masked_gather_nxv2i64_null_with_vec_plus_scalar_offsets:
 321 ; CHECK:       // %bb.0:
 322 ; CHECK-NEXT:    lsl x8, x0, #3
 323 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, z0.d, lsl #3]
 324 ; CHECK-NEXT:    ret
       ; Masked gather of i64 elements from a null base indexed by
       ; %vector_offsets + splat(%scalar_offset). The expected code above turns
       ; the splatted scalar into the scalar base (x0 shifted left by 3) and
       ; keeps %vector_offsets as the scaled (lsl #3) vector offset of one ld1d.
 325   %scalar_offset.ins = insertelement <vscale x 2 x i64> undef, i64 %scalar_offset, i64 0
 326   %scalar_offset.splat = shufflevector <vscale x 2 x i64> %scalar_offset.ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
 327   %offsets = add <vscale x 2 x i64> %vector_offsets, %scalar_offset.splat
 328   %ptrs = getelementptr i64, ptr null, <vscale x 2 x i64> %offsets
 329   %data = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x ptr> %ptrs, i32 8, <vscale x 2 x i1> %pg, <vscale x 2 x i64> undef)
 330   ret <vscale x 2 x i64> %data
 331 }
 332
 333 define <vscale x 2 x i64> @masked_gather_nxv2i64_null_with__vec_plus_imm_offsets(<vscale x 2 x i64> %vector_offsets, <vscale x 2 x i1> %pg) #0 {
 334 ; CHECK-LABEL: masked_gather_nxv2i64_null_with__vec_plus_imm_offsets:
 335 ; CHECK:       // %bb.0:
 336 ; CHECK-NEXT:    mov w8, #8 // =0x8
 337 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, z0.d, lsl #3]
 338 ; CHECK-NEXT:    ret
       ; Masked gather of i64 elements from a null base indexed by
       ; %vector_offsets + splat(1). The expected code above turns the splatted
       ; constant into a scalar base of 8 and keeps %vector_offsets as the scaled
       ; (lsl #3) vector offset of one ld1d.
 339   %offsets = add <vscale x 2 x i64> %vector_offsets, splat(i64 1)
 340   %ptrs = getelementptr i64, ptr null, <vscale x 2 x i64> %offsets
 341   %data = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x ptr> %ptrs, i32 8, <vscale x 2 x i1> %pg, <vscale x 2 x i64> undef)
 342   ret <vscale x 2 x i64> %data
 343 }
 344
 345 define <vscale x 4 x i32> @masked_gather_nxv4i32_s8_offsets(ptr %base, <vscale x 4 x i8> %offsets, <vscale x 4 x i1> %mask) #0 {
 346 ; CHECK-LABEL: masked_gather_nxv4i32_s8_offsets:
 347 ; CHECK:       // %bb.0:
 348 ; CHECK-NEXT:    ptrue p1.s
 349 ; CHECK-NEXT:    sxtb z0.s, p1/m, z0.s
 350 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
 351 ; CHECK-NEXT:    ret
       ; Masked gather of i32 elements from %base indexed by i8 offsets that are
       ; sign-extended to i32. The expected code above sign-extends the bytes
       ; in-register (sxtb) and uses the sxtw #2 addressing form.
 352   %offsets.sext = sext <vscale x 4 x i8> %offsets to <vscale x 4 x i32>
 353   %ptrs = getelementptr i32, ptr %base, <vscale x 4 x i32> %offsets.sext
 354   %data = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x ptr> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
 355   ret <vscale x 4 x i32> %data
 356 }
 357
 358 define <vscale x 4 x i32> @masked_gather_nxv4i32_u8_offsets(ptr %base, <vscale x 4 x i8> %offsets, <vscale x 4 x i1> %mask) #0 {
 359 ; CHECK-LABEL: masked_gather_nxv4i32_u8_offsets:
 360 ; CHECK:       // %bb.0:
 361 ; CHECK-NEXT:    and z0.s, z0.s, #0xff
 362 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
 363 ; CHECK-NEXT:    ret
       ; Masked gather of i32 elements from %base indexed by i8 offsets that are
       ; zero-extended to i32. The expected code above masks each lane with 0xff
       ; and uses the uxtw #2 addressing form.
 364   %offsets.zext = zext <vscale x 4 x i8> %offsets to <vscale x 4 x i32>
 365   %ptrs = getelementptr i32, ptr %base, <vscale x 4 x i32> %offsets.zext
 366   %data = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x ptr> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
 367   ret <vscale x 4 x i32> %data
 368 }
 369
 370 define <vscale x 4 x i32> @masked_gather_nxv4i32_u32s8_offsets(ptr %base, <vscale x 4 x i8> %offsets, <vscale x 4 x i1> %mask) #0 {
 371 ; CHECK-LABEL: masked_gather_nxv4i32_u32s8_offsets:
 372 ; CHECK:       // %bb.0:
 373 ; CHECK-NEXT:    ptrue p1.s
 374 ; CHECK-NEXT:    sxtb z0.s, p1/m, z0.s
 375 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
 376 ; CHECK-NEXT:    ret
       ; Masked gather of i32 elements from %base indexed by i8 offsets that are
       ; first sign-extended to i32 and then zero-extended to i64. The expected
       ; code above sign-extends the bytes in-register (sxtb) and expresses the
       ; outer zero-extension through the uxtw #2 addressing form.
 377   %offsets.sext = sext <vscale x 4 x i8> %offsets to <vscale x 4 x i32>
 378   %offsets.sext.zext = zext <vscale x 4 x i32> %offsets.sext to <vscale x 4 x i64>
 379   %ptrs = getelementptr i32, ptr %base, <vscale x 4 x i64> %offsets.sext.zext
 380   %data = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x ptr> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
 381   ret <vscale x 4 x i32> %data
 382 }
 383
 384 define void @masked_scatter_nxv2i64_const_with_vec_offsets(<vscale x 2 x i64> %vector_offsets, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %data) #0 {
 385 ; CHECK-LABEL: masked_scatter_nxv2i64_const_with_vec_offsets:
 386 ; CHECK:       // %bb.0:
 387 ; CHECK-NEXT:    mov w8, #8 // =0x8
 388 ; CHECK-NEXT:    st1d { z1.d }, p0, [x8, z0.d, lsl #3]
 389 ; CHECK-NEXT:    ret
       ; Masked scatter of i64 elements to the constant base address 8 (inttoptr)
       ; indexed by %vector_offsets. The expected code above materialises the
       ; constant in a scalar register and uses it as the base of a single st1d
       ; with scaled (lsl #3) vector offsets.
 390   %ptrs = getelementptr i64, ptr inttoptr (i64 8 to ptr), <vscale x 2 x i64> %vector_offsets
 391   call void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x ptr> %ptrs, i32 8, <vscale x 2 x i1> %pg)
 392   ret void
 393 }
 394
 395 define void @masked_scatter_nxv2i64_null_with_vec_plus_scalar_offsets(<vscale x 2 x i64> %vector_offsets, i64 %scalar_offset, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %data) #0 {
 396 ; CHECK-LABEL: masked_scatter_nxv2i64_null_with_vec_plus_scalar_offsets:
 397 ; CHECK:       // %bb.0:
 398 ; CHECK-NEXT:    lsl x8, x0, #3
 399 ; CHECK-NEXT:    st1d { z1.d }, p0, [x8, z0.d, lsl #3]
 400 ; CHECK-NEXT:    ret
       ; Masked scatter of i64 elements to a null base indexed by
       ; %vector_offsets + splat(%scalar_offset). The expected code above turns
       ; the splatted scalar into the scalar base (x0 shifted left by 3) and
       ; keeps %vector_offsets as the scaled (lsl #3) vector offset of one st1d.
 401   %scalar_offset.ins = insertelement <vscale x 2 x i64> undef, i64 %scalar_offset, i64 0
 402   %scalar_offset.splat = shufflevector <vscale x 2 x i64> %scalar_offset.ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
 403   %offsets = add <vscale x 2 x i64> %vector_offsets, %scalar_offset.splat
 404   %ptrs = getelementptr i64, ptr null, <vscale x 2 x i64> %offsets
 405   call void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x ptr> %ptrs, i32 8, <vscale x 2 x i1> %pg)
 406   ret void
 407 }
 408
 409 define void @masked_scatter_nxv2i64_null_with__vec_plus_imm_offsets(<vscale x 2 x i64> %vector_offsets, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %data) #0 {
 410 ; CHECK-LABEL: masked_scatter_nxv2i64_null_with__vec_plus_imm_offsets:
 411 ; CHECK:       // %bb.0:
 412 ; CHECK-NEXT:    mov w8, #8 // =0x8
 413 ; CHECK-NEXT:    st1d { z1.d }, p0, [x8, z0.d, lsl #3]
 414 ; CHECK-NEXT:    ret
       ; Masked scatter of i64 elements to a null base indexed by
       ; %vector_offsets + splat(1). The expected code above turns the splatted
       ; constant into a scalar base of 8 and keeps %vector_offsets as the scaled
       ; (lsl #3) vector offset of one st1d.
 415   %offsets = add <vscale x 2 x i64> %vector_offsets, splat(i64 1)
 416   %ptrs = getelementptr i64, ptr null, <vscale x 2 x i64> %offsets
 417   call void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x ptr> %ptrs, i32 8, <vscale x 2 x i1> %pg)
 418   ret void
 419 }
 420
 421 define void @masked_scatter_nxv4i32_s8_offsets(ptr %base, <vscale x 4 x i8> %offsets, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %data) #0 {
 422 ; CHECK-LABEL: masked_scatter_nxv4i32_s8_offsets:
 423 ; CHECK:       // %bb.0:
 424 ; CHECK-NEXT:    ptrue p1.s
 425 ; CHECK-NEXT:    sxtb z0.s, p1/m, z0.s
 426 ; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
 427 ; CHECK-NEXT:    ret
       ; Masked scatter of i32 elements to %base indexed by i8 offsets that are
       ; sign-extended to i32. The expected code above sign-extends the bytes
       ; in-register (sxtb) and uses the sxtw #2 addressing form.
 428   %offsets.sext = sext <vscale x 4 x i8> %offsets to <vscale x 4 x i32>
 429   %ptrs = getelementptr i32, ptr %base, <vscale x 4 x i32> %offsets.sext
 430   call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x ptr> %ptrs, i32 4, <vscale x 4 x i1> %mask)
 431   ret void
 432 }
 433
 434 define void @masked_scatter_nxv4i32_u8_offsets(ptr %base, <vscale x 4 x i8> %offsets, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %data) #0 {
 435 ; CHECK-LABEL: masked_scatter_nxv4i32_u8_offsets:
 436 ; CHECK:       // %bb.0:
 437 ; CHECK-NEXT:    and z0.s, z0.s, #0xff
 438 ; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
 439 ; CHECK-NEXT:    ret
       ; Masked scatter of i32 elements to %base indexed by i8 offsets that are
       ; zero-extended to i32. The expected code above masks each lane with 0xff
       ; and uses the uxtw #2 addressing form.
 440   %offsets.zext = zext <vscale x 4 x i8> %offsets to <vscale x 4 x i32>
 441   %ptrs = getelementptr i32, ptr %base, <vscale x 4 x i32> %offsets.zext
 442   call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x ptr> %ptrs, i32 4, <vscale x 4 x i1> %mask)
 443   ret void
 444 }
 445
 446 define void @masked_scatter_nxv4i32_u32s8_offsets(ptr %base, <vscale x 4 x i8> %offsets, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %data) #0 {
 447 ; CHECK-LABEL: masked_scatter_nxv4i32_u32s8_offsets:
 448 ; CHECK:       // %bb.0:
 449 ; CHECK-NEXT:    ptrue p1.s
 450 ; CHECK-NEXT:    sxtb z0.s, p1/m, z0.s
 451 ; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
 452 ; CHECK-NEXT:    ret
       ; Masked scatter of i32 elements to %base indexed by i8 offsets that are
       ; first sign-extended to i32 and then zero-extended to i64. The expected
       ; code above sign-extends the bytes in-register (sxtb) and expresses the
       ; outer zero-extension through the uxtw #2 addressing form.
 453   %offsets.sext = sext <vscale x 4 x i8> %offsets to <vscale x 4 x i32>
 454   %offsets.sext.zext = zext <vscale x 4 x i32> %offsets.sext to <vscale x 4 x i64>
 455   %ptrs = getelementptr i32, ptr %base, <vscale x 4 x i64> %offsets.sext.zext
 456   call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x ptr> %ptrs, i32 4, <vscale x 4 x i1> %mask)
 457   ret void
 458 }
 459
 460 attributes #0 = { "target-features"="+sve" vscale_range(1, 16) }
     ; Attribute group #0 (previous line) enables the "+sve" target feature and
     ; declares vscale_range(1, 16); it is attached to every function defined
     ; in this file.
 461
     ; Masked gather intrinsics called by the tests in this file.
 462 declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x ptr>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
 463 declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
 464 declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
 465 declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
 466
     ; Masked scatter intrinsics called by the tests in this file.
 467 declare void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x ptr>, i32, <vscale x 2 x i1>)
 468 declare void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x ptr>, i32, <vscale x 4 x i1>)
 469 declare void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x ptr>, i32, <vscale x 4 x i1>)
 470 declare void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x ptr>, i32, <vscale x 4 x i1>)
 471 declare void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half>, <vscale x 4 x ptr>, i32, <vscale x 4 x i1>)
 472
     ; Step-vector intrinsic used to build the strided index vectors.
 473 declare <vscale x 4 x i64> @llvm.stepvector.nxv4i64()