1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
5 ; Move Multi-Vector From Tile (Read) x2
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_horiz_vg2_b(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.b, z1.b }, za0h.b[w12, 0:1]
; CHECK-NEXT:    mov { z0.b, z1.b }, za0h.b[w12, 14:15]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice)
  ; Offset 14 is the maximum for .b (15 rows, vg2 reads a pair 14:15).
  %slice.14 = add i32 %slice, 14
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice.14)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}
define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_horiz_vg2_h(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0h.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1h.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 0, i32 %slice)
  ; Offset 6 with tile za1 exercises the maximum slice pair 6:7 for .h.
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}
define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_horiz_vg2_f16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0h.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1h.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
}
define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_horiz_vg2_bf16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0h.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1h.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}
define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_horiz_vg2_s(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s, z1.s }, za0h.s[w12, 0:1]
; CHECK-NEXT:    mov { z0.s, z1.s }, za3h.s[w12, 2:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 0, i32 %slice)
  ; Offset 2 with tile za3 exercises the maximum slice pair 2:3 for .s.
  %slice.2 = add i32 %slice, 2
  %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 %slice.2)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
}
define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_horiz_vg2_f32(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s, z1.s }, za0h.s[w12, 0:1]
; CHECK-NEXT:    mov { z0.s, z1.s }, za3h.s[w12, 2:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 0, i32 %slice)
  %slice.2 = add i32 %slice, 2
  %res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 3, i32 %slice.2)
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
}
define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_horiz_vg2_d(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za0h.d[w12, 0:1]
; CHECK-NEXT:    ret
  ; .d tiles have only one vg2 slice pair, so no added-offset variant here.
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 0, i32 %slice)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}
define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_horiz_vg2_f64(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za0h.d[w12, 0:1]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32 0, i32 %slice)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vert_vg2_b(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.b, z1.b }, za0v.b[w12, 0:1]
; CHECK-NEXT:    mov { z0.b, z1.b }, za0v.b[w12, 14:15]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 %slice)
  %slice.14 = add i32 %slice, 14
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 %slice.14)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}
define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vert_vg2_h(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0v.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1v.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}
define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vert_vg2_f16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0v.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1v.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
}
define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vert_vg2_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0v.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1v.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}
define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vert_vg2_s(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s, z1.s }, za0v.s[w12, 0:1]
; CHECK-NEXT:    mov { z0.s, z1.s }, za3v.s[w12, 2:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 0, i32 %slice)
  %slice.2 = add i32 %slice, 2
  %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 %slice.2)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
}
define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vert_vg2_f32(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s, z1.s }, za0v.s[w12, 0:1]
; CHECK-NEXT:    mov { z0.s, z1.s }, za3v.s[w12, 2:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 0, i32 %slice)
  %slice.2 = add i32 %slice, 2
  %res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 3, i32 %slice.2)
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
}
define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vert_vg2_d(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za0v.d[w12, 0:1]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 0, i32 %slice)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}
define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vert_vg2_f64(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za0v.d[w12, 0:1]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32 0, i32 %slice)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}
209 ; Move Multi-Vector From Tile (Read) x4
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_horiz_vg4_b(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.b - z3.b }, za0h.b[w12, 0:3]
; CHECK-NEXT:    mov { z0.b - z3.b }, za0h.b[w12, 12:15]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 %slice)
  ; Offset 12 is the maximum for .b vg4 (quad 12:15).
  %slice.12 = add i32 %slice, 12
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 %slice.12)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}
define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_horiz_vg4_h(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0h.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1h.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 0, i32 %slice)
  ; Offset 4 with tile za1 exercises the maximum slice quad 4:7 for .h.
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}
define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_horiz_vg4_f16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0h.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1h.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
}
define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_horiz_vg4_bf16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0h.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1h.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}
define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_horiz_vg4_s(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s - z3.s }, za0h.s[w12, 0:3]
; CHECK-NEXT:    ret
  ; .s tiles have only one vg4 slice quad, so no added-offset variant here.
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 0, i32 %slice)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}
define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @za_read_horiz_vg4_f32(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s - z3.s }, za0h.s[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32 0, i32 %slice)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}
define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_horiz_vg4_d(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za0h.d[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 0, i32 %slice)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}
define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @za_read_horiz_vg4_f64(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za0h.d[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32 0, i32 %slice)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vert_vg4_b(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.b - z3.b }, za0v.b[w12, 0:3]
; CHECK-NEXT:    mov { z0.b - z3.b }, za0v.b[w12, 12:15]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 %slice)
  %slice.12 = add i32 %slice, 12
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 %slice.12)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}
define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vert_vg4_h(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0v.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1v.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}
define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vert_vg4_f16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0v.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1v.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
}
define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vert_vg4_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0v.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1v.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}
define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vert_vg4_s(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s - z3.s }, za0v.s[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 0, i32 %slice)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}
define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vert_vg4_f32(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s - z3.s }, za0v.s[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32 0, i32 %slice)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}
define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vert_vg4_d(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za0v.d[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 0, i32 %slice)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}
define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vert_vg4_f64(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za0v.d[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32 0, i32 %slice)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}
400 ; Move Multi-Vector From ZA (Read) x2
define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vg1x2_d(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice)
  ; Offset 7 is the maximum immediate for the vgx2 za.d form.
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice.7)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res2
}
define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vg1x2_f64(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 %slice.7)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res2
}
428 ; Move Multi-Vector From ZA (Read) x4
define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vg1x4_d(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 %slice.7)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res2
}
define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vg1x4_f64(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 %slice.7)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res2
}
456 declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32, i32)
457 declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32, i32)
458 declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32, i32)
459 declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32, i32)
460 declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32, i32)
461 declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32, i32)
462 declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32, i32)
463 declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32, i32)
465 declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32, i32)
466 declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32, i32)
467 declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32, i32)
468 declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32, i32)
469 declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32, i32)
470 declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32, i32)
471 declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32, i32)
472 declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32, i32)
474 declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32, i32)
475 declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32, i32)
476 declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32, i32)
477 declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32, i32)
478 declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32, i32)
479 declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32, i32)
480 declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32, i32)
481 declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32, i32)
483 declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32, i32)
484 declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32, i32)
485 declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32, i32)
486 declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32, i32)
487 declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32, i32)
488 declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32, i32)
489 declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32, i32)
490 declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32, i32)
492 declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32)
493 declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32)
495 declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32)
496 declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32)