1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -force-streaming < %s | FileCheck %s
5 ; Move Multi-Vector From Tile (Read) x2
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_horiz_vg2_b(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.b, z1.b }, za0h.b[w12, 0:1]
; CHECK-NEXT:    mov { z0.b, z1.b }, za0h.b[w12, 14:15]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice)
  %slice.14 = add i32 %slice, 14
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice.14)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_horiz_vg2_h(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0h.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1h.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}

define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_horiz_vg2_f16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0h.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1h.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_horiz_vg2_bf16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0h.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1h.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_horiz_vg2_s(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s, z1.s }, za0h.s[w12, 0:1]
; CHECK-NEXT:    mov { z0.s, z1.s }, za3h.s[w12, 2:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 0, i32 %slice)
  %slice.2 = add i32 %slice, 2
  %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 %slice.2)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
}

define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_horiz_vg2_f32(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s, z1.s }, za0h.s[w12, 0:1]
; CHECK-NEXT:    mov { z0.s, z1.s }, za3h.s[w12, 2:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 0, i32 %slice)
  %slice.2 = add i32 %slice, 2
  %res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 3, i32 %slice.2)
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_horiz_vg2_d(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za0h.d[w12, 0:1]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 0, i32 %slice)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_horiz_vg2_f64(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za0h.d[w12, 0:1]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32 0, i32 %slice)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vert_vg2_b(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.b, z1.b }, za0v.b[w12, 0:1]
; CHECK-NEXT:    mov { z0.b, z1.b }, za0v.b[w12, 14:15]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 %slice)
  %slice.14 = add i32 %slice, 14
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 %slice.14)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vert_vg2_h(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0v.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1v.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}

define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vert_vg2_f16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0v.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1v.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vert_vg2_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0v.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1v.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vert_vg2_s(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s, z1.s }, za0v.s[w12, 0:1]
; CHECK-NEXT:    mov { z0.s, z1.s }, za3v.s[w12, 2:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 0, i32 %slice)
  %slice.2 = add i32 %slice, 2
  %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 %slice.2)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
}

define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vert_vg2_f32(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s, z1.s }, za0v.s[w12, 0:1]
; CHECK-NEXT:    mov { z0.s, z1.s }, za3v.s[w12, 2:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 0, i32 %slice)
  %slice.2 = add i32 %slice, 2
  %res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 3, i32 %slice.2)
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vert_vg2_d(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za0v.d[w12, 0:1]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 0, i32 %slice)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vert_vg2_f64(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za0v.d[w12, 0:1]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32 0, i32 %slice)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}
209 ; Move Multi-Vector From Tile (Read) x4
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_horiz_vg4_b(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.b - z3.b }, za0h.b[w12, 0:3]
; CHECK-NEXT:    mov { z0.b - z3.b }, za0h.b[w12, 12:15]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 %slice)
  %slice.12 = add i32 %slice, 12
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 %slice.12)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_horiz_vg4_h(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0h.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1h.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}

define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_horiz_vg4_f16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0h.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1h.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_horiz_vg4_bf16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0h.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1h.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_horiz_vg4_s(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s - z3.s }, za0h.s[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 0, i32 %slice)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @za_read_horiz_vg4_f32(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s - z3.s }, za0h.s[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32 0, i32 %slice)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_horiz_vg4_d(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za0h.d[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 0, i32 %slice)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @za_read_horiz_vg4_f64(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za0h.d[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32 0, i32 %slice)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vert_vg4_b(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.b - z3.b }, za0v.b[w12, 0:3]
; CHECK-NEXT:    mov { z0.b - z3.b }, za0v.b[w12, 12:15]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 %slice)
  %slice.12 = add i32 %slice, 12
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 %slice.12)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vert_vg4_h(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0v.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1v.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}

define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vert_vg4_f16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0v.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1v.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vert_vg4_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0v.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1v.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vert_vg4_s(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s - z3.s }, za0v.s[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 0, i32 %slice)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vert_vg4_f32(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s - z3.s }, za0v.s[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32 0, i32 %slice)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vert_vg4_d(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za0v.d[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 0, i32 %slice)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vert_vg4_f64(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za0v.d[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32 0, i32 %slice)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}
400 ; Move Multi-Vector From ZA (Read) x2
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vg1x2_b(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x2.nxv16i8(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x2.nxv16i8(i32 %slice.7)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vg1x2_h(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x2.nxv8i16(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x2.nxv8i16(i32 %slice.7)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}

define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vg1x2_f16(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x2.nxv8f16(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x2.nxv8f16(i32 %slice.7)
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vg1x2_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x2.nxv8bf16(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x2.nxv8bf16(i32 %slice.7)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vg1x2_s(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32 %slice.7)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
}

define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vg1x2_f32(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x2.nxv4f32(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x2.nxv4f32(i32 %slice.7)
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vg1x2_d(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice.7)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res2
}

define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vg1x2_f64(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 %slice.7)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res2
}
506 ; Move Multi-Vector From ZA (Read) x4
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vg1x4_b(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x4.nxv16i8(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x4.nxv16i8(i32 %slice.7)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vg1x4_h(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x4.nxv8i16(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x4.nxv8i16(i32 %slice.7)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}

define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vg1x4_f16(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x4.nxv8f16(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x4.nxv8f16(i32 %slice.7)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vg1x4_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x4.nxv8bf16(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x4.nxv8bf16(i32 %slice.7)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vg1x4_s(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x4.nxv4i32(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x4.nxv4i32(i32 %slice.7)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
}

define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vg1x4_f32(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x4.nxv4f32(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x4.nxv4f32(i32 %slice.7)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res2
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vg1x4_d(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 %slice.7)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res2
}

define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vg1x4_f64(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 %slice.7)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res2
}
612 declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32, i32)
613 declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32, i32)
614 declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32, i32)
615 declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32, i32)
616 declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32, i32)
617 declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32, i32)
618 declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32, i32)
619 declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32, i32)
621 declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32, i32)
622 declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32, i32)
623 declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32, i32)
624 declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32, i32)
625 declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32, i32)
626 declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32, i32)
627 declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32, i32)
628 declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32, i32)
630 declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32, i32)
631 declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32, i32)
632 declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32, i32)
633 declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32, i32)
634 declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32, i32)
635 declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32, i32)
636 declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32, i32)
637 declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32, i32)
639 declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32, i32)
640 declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32, i32)
641 declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32, i32)
642 declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32, i32)
643 declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32, i32)
644 declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32, i32)
645 declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32, i32)
646 declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32, i32)
648 declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x2.nxv16i8(i32)
649 declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x2.nxv8i16(i32)
650 declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32)
651 declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32)
652 declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x2.nxv8f16(i32)
653 declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x2.nxv8bf16(i32)
654 declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x2.nxv4f32(i32)
655 declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32)
657 declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x4.nxv16i8(i32)
658 declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x4.nxv8i16(i32)
659 declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x4.nxv4i32(i32)
660 declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32)
661 declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x4.nxv8f16(i32)
662 declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x4.nxv8bf16(i32)
663 declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x4.nxv4f32(i32)
664 declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32)