llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zvl128b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV32-V128
   3 ; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zvl128b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV64-V128
   4 ; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zvl512b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV32-V512
   5 ; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zvl512b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV64-V512
   6
   7 ; Test optimizing interleaves to widening arithmetic.
   8
   ; Interleaves two <2 x i8> vectors into <4 x i8> (mask <0, 2, 1, 3>, i.e.
   ; x0, y0, x1, y1). All four RUN configurations share the common CHECK prefix
   ; here. The assertions expect a vwaddu.vv followed by a vwmaccu.vx with the
   ; scalar -1.
   9 define <4 x i8> @interleave_v2i8(<2 x i8> %x, <2 x i8> %y) {
  10 ; CHECK-LABEL: interleave_v2i8:
  11 ; CHECK:       # %bb.0:
  12 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
  13 ; CHECK-NEXT:    vwaddu.vv v10, v8, v9
  14 ; CHECK-NEXT:    li a0, -1
  15 ; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
  16 ; CHECK-NEXT:    vmv1r.v v8, v10
  17 ; CHECK-NEXT:    ret
  18   %a = shufflevector <2 x i8> %x, <2 x i8> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  19   ret <4 x i8> %a
  20 }
  21
  ; Same interleave mask <0, 2, 1, 3> as the i8 case, on <2 x i16> inputs.
  ; The assertions expect the vwaddu.vv / vwmaccu.vx pair at e16, mf4.
  22 define <4 x i16> @interleave_v2i16(<2 x i16> %x, <2 x i16> %y) {
  23 ; CHECK-LABEL: interleave_v2i16:
  24 ; CHECK:       # %bb.0:
  25 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
  26 ; CHECK-NEXT:    vwaddu.vv v10, v8, v9
  27 ; CHECK-NEXT:    li a0, -1
  28 ; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
  29 ; CHECK-NEXT:    vmv1r.v v8, v10
  30 ; CHECK-NEXT:    ret
  31   %a = shufflevector <2 x i16> %x, <2 x i16> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  32   ret <4 x i16> %a
  33 }
  34
  35 ; Vector order switched for coverage.
  ; The mask <2, 0, 3, 1> takes the second operand's elements first
  ; (y0, x0, y1, x1). Accordingly the assertions list v9 before v8 in
  ; vwaddu.vv and use v8 as the vwmaccu.vx vector operand.
  36 define <4 x i32> @interleave_v2i32(<2 x i32> %x, <2 x i32> %y) {
  37 ; CHECK-LABEL: interleave_v2i32:
  38 ; CHECK:       # %bb.0:
  39 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
  40 ; CHECK-NEXT:    vwaddu.vv v10, v9, v8
  41 ; CHECK-NEXT:    li a0, -1
  42 ; CHECK-NEXT:    vwmaccu.vx v10, a0, v8
  43 ; CHECK-NEXT:    vmv1r.v v8, v10
  44 ; CHECK-NEXT:    ret
  45   %a = shufflevector <2 x i32> %x, <2 x i32> %y, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
  46   ret <4 x i32> %a
  47 }
  48
  49 ; One vXi64 test case to verify that we don't optimize it.
  50 ; FIXME: Is there better codegen we can do here?
  ; The assertions are split across the V128, RV32-V512 and RV64-V512 prefixes.
  ; Each expects a vid.v / vsrl.vi index vector feeding an unmasked gather and
  ; then a masked gather (mask constant 10), not the widening add/multiply-
  ; accumulate pattern used for narrower element types.
  51 define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) {
  52 ; V128-LABEL: interleave_v2i64:
  53 ; V128:       # %bb.0:
  54 ; V128-NEXT:    vmv1r.v v12, v9
  55 ; V128-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
  56 ; V128-NEXT:    vid.v v9
  57 ; V128-NEXT:    vsrl.vi v14, v9, 1
  58 ; V128-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
  59 ; V128-NEXT:    vrgatherei16.vv v10, v8, v14
  60 ; V128-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
  61 ; V128-NEXT:    vmv.v.i v0, 10
  62 ; V128-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
  63 ; V128-NEXT:    vrgatherei16.vv v10, v12, v14, v0.t
  64 ; V128-NEXT:    vmv.v.v v8, v10
  65 ; V128-NEXT:    ret
  66 ;
  67 ; RV32-V512-LABEL: interleave_v2i64:
  68 ; RV32-V512:       # %bb.0:
  69 ; RV32-V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
  70 ; RV32-V512-NEXT:    vid.v v10
  71 ; RV32-V512-NEXT:    vsrl.vi v11, v10, 1
  72 ; RV32-V512-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
  73 ; RV32-V512-NEXT:    vmv.v.i v0, 10
  74 ; RV32-V512-NEXT:    vrgatherei16.vv v10, v8, v11
  75 ; RV32-V512-NEXT:    vrgatherei16.vv v10, v9, v11, v0.t
  76 ; RV32-V512-NEXT:    vmv.v.v v8, v10
  77 ; RV32-V512-NEXT:    ret
  78 ;
  79 ; RV64-V512-LABEL: interleave_v2i64:
  80 ; RV64-V512:       # %bb.0:
  81 ; RV64-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, mu
  82 ; RV64-V512-NEXT:    vid.v v10
  83 ; RV64-V512-NEXT:    vsrl.vi v11, v10, 1
  84 ; RV64-V512-NEXT:    vmv.v.i v0, 10
  85 ; RV64-V512-NEXT:    vrgather.vv v10, v8, v11
  86 ; RV64-V512-NEXT:    vrgather.vv v10, v9, v11, v0.t
  87 ; RV64-V512-NEXT:    vmv.v.v v8, v10
  88 ; RV64-V512-NEXT:    ret
  89   %a = shufflevector <2 x i64> %x, <2 x i64> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  90   ret <4 x i64> %a
  91 }
  92
  93 ; Vector order switched for coverage.
  ; The mask starts with the second operand (4, 0, 5, 1, ...). The V128 and
  ; V512 assertions differ only in the vsetivli LMUL (mf4 versus mf8).
  94 define <8 x i8> @interleave_v4i8(<4 x i8> %x, <4 x i8> %y) {
  95 ; V128-LABEL: interleave_v4i8:
  96 ; V128:       # %bb.0:
  97 ; V128-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
  98 ; V128-NEXT:    vwaddu.vv v10, v9, v8
  99 ; V128-NEXT:    li a0, -1
 100 ; V128-NEXT:    vwmaccu.vx v10, a0, v8
 101 ; V128-NEXT:    vmv1r.v v8, v10
 102 ; V128-NEXT:    ret
 103 ;
 104 ; V512-LABEL: interleave_v4i8:
 105 ; V512:       # %bb.0:
 106 ; V512-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
 107 ; V512-NEXT:    vwaddu.vv v10, v9, v8
 108 ; V512-NEXT:    li a0, -1
 109 ; V512-NEXT:    vwmaccu.vx v10, a0, v8
 110 ; V512-NEXT:    vmv1r.v v8, v10
 111 ; V512-NEXT:    ret
 112   %a = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
 113   ret <8 x i8> %a
 114 }
 115
 116 ; Undef elements for coverage
 ; Mask positions 2 and 5 are undef; the remaining positions follow the
 ; x0, y0, x1, y1, ... interleave order. The assertions still expect the
 ; vwaddu.vv / vwmaccu.vx lowering.
 117 define <8 x i16> @interleave_v4i16(<4 x i16> %x, <4 x i16> %y) {
 118 ; V128-LABEL: interleave_v4i16:
 119 ; V128:       # %bb.0:
 120 ; V128-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 121 ; V128-NEXT:    vwaddu.vv v10, v8, v9
 122 ; V128-NEXT:    li a0, -1
 123 ; V128-NEXT:    vwmaccu.vx v10, a0, v9
 124 ; V128-NEXT:    vmv1r.v v8, v10
 125 ; V128-NEXT:    ret
 126 ;
 127 ; V512-LABEL: interleave_v4i16:
 128 ; V512:       # %bb.0:
 129 ; V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
 130 ; V512-NEXT:    vwaddu.vv v10, v8, v9
 131 ; V512-NEXT:    li a0, -1
 132 ; V512-NEXT:    vwmaccu.vx v10, a0, v9
 133 ; V512-NEXT:    vmv1r.v v8, v10
 134 ; V512-NEXT:    ret
 135   %a = shufflevector <4 x i16> %x, <4 x i16> %y, <8 x i32> <i32 0, i32 4, i32 undef, i32 5, i32 2, i32 undef, i32 3, i32 7>
 136   ret <8 x i16> %a
 137 }
 138
 ; Full interleave of two <4 x i32> vectors into <8 x i32>. For V128 the
 ; assertions use m1 and copy the result with vmv2r.v; for V512 they use mf2
 ; and vmv1r.v.
 139 define <8 x i32> @interleave_v4i32(<4 x i32> %x, <4 x i32> %y) {
 140 ; V128-LABEL: interleave_v4i32:
 141 ; V128:       # %bb.0:
 142 ; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 143 ; V128-NEXT:    vwaddu.vv v10, v8, v9
 144 ; V128-NEXT:    li a0, -1
 145 ; V128-NEXT:    vwmaccu.vx v10, a0, v9
 146 ; V128-NEXT:    vmv2r.v v8, v10
 147 ; V128-NEXT:    ret
 148 ;
 149 ; V512-LABEL: interleave_v4i32:
 150 ; V512:       # %bb.0:
 151 ; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
 152 ; V512-NEXT:    vwaddu.vv v10, v8, v9
 153 ; V512-NEXT:    li a0, -1
 154 ; V512-NEXT:    vwmaccu.vx v10, a0, v9
 155 ; V512-NEXT:    vmv1r.v v8, v10
 156 ; V512-NEXT:    ret
 157   %a = shufflevector <4 x i32> %x, <4 x i32> %y, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 158   ret <8 x i32> %a
 159 }
 160
 161 ; %y should be slid down by 2
 ; The mask <0, 6, 1, 7> pairs elements 0 and 1 of %x with elements 2 and 3 of
 ; %y. The assertions expect a vslidedown.vi by 2 ahead of the vwaddu.vv /
 ; vwmaccu.vx pair.
 162 define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) {
 163 ; V128-LABEL: interleave_v4i32_offset_2:
 164 ; V128:       # %bb.0:
 165 ; V128-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 166 ; V128-NEXT:    vslidedown.vi v10, v9, 2
 167 ; V128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 168 ; V128-NEXT:    vwaddu.vv v9, v8, v10
 169 ; V128-NEXT:    li a0, -1
 170 ; V128-NEXT:    vwmaccu.vx v9, a0, v10
 171 ; V128-NEXT:    vmv1r.v v8, v9
 172 ; V128-NEXT:    ret
 173 ;
 174 ; V512-LABEL: interleave_v4i32_offset_2:
 175 ; V512:       # %bb.0:
 176 ; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 177 ; V512-NEXT:    vslidedown.vi v10, v9, 2
 178 ; V512-NEXT:    vwaddu.vv v9, v8, v10
 179 ; V512-NEXT:    li a0, -1
 180 ; V512-NEXT:    vwmaccu.vx v9, a0, v10
 181 ; V512-NEXT:    vmv1r.v v8, v9
 182 ; V512-NEXT:    ret
 183   %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 6, i32 1, i32 7>
 184   ret <4 x i32> %a
 185 }
 186
 187 ; %y should be slid down by 1
 ; The mask <0, 5, 1, 6> pairs elements 0 and 1 of %x with elements 1 and 2 of
 ; %y. The assertions recorded below contain no vslidedown; they expect a
 ; vrgather.vv followed by a masked vrgather.vv whose indices are offset with
 ; vadd.vi by 1.
 188 define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
 189 ; V128-LABEL: interleave_v4i32_offset_1:
 190 ; V128:       # %bb.0:
 191 ; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 192 ; V128-NEXT:    vid.v v10
 193 ; V128-NEXT:    vsrl.vi v11, v10, 1
 194 ; V128-NEXT:    vrgather.vv v10, v8, v11
 195 ; V128-NEXT:    vmv.v.i v0, 10
 196 ; V128-NEXT:    vadd.vi v8, v11, 1
 197 ; V128-NEXT:    vrgather.vv v10, v9, v8, v0.t
 198 ; V128-NEXT:    vmv.v.v v8, v10
 199 ; V128-NEXT:    ret
 200 ;
 201 ; V512-LABEL: interleave_v4i32_offset_1:
 202 ; V512:       # %bb.0:
 203 ; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, mu
 204 ; V512-NEXT:    vid.v v10
 205 ; V512-NEXT:    vsrl.vi v11, v10, 1
 206 ; V512-NEXT:    vrgather.vv v10, v8, v11
 207 ; V512-NEXT:    vmv.v.i v0, 10
 208 ; V512-NEXT:    vadd.vi v8, v11, 1
 209 ; V512-NEXT:    vrgather.vv v10, v9, v8, v0.t
 210 ; V512-NEXT:    vmv1r.v v8, v10
 211 ; V512-NEXT:    ret
 212   %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 5, i32 1, i32 6>
 213   ret <4 x i32> %a
 214 }
 215
 ; Full interleave of two <8 x i8> vectors into <16 x i8>. The V128 and V512
 ; assertions differ only in the vsetivli LMUL (mf2 versus mf8).
 216 define <16 x i8> @interleave_v8i8(<8 x i8> %x, <8 x i8> %y) {
 217 ; V128-LABEL: interleave_v8i8:
 218 ; V128:       # %bb.0:
 219 ; V128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 220 ; V128-NEXT:    vwaddu.vv v10, v8, v9
 221 ; V128-NEXT:    li a0, -1
 222 ; V128-NEXT:    vwmaccu.vx v10, a0, v9
 223 ; V128-NEXT:    vmv1r.v v8, v10
 224 ; V128-NEXT:    ret
 225 ;
 226 ; V512-LABEL: interleave_v8i8:
 227 ; V512:       # %bb.0:
 228 ; V512-NEXT:    vsetivli zero, 8, e8, mf8, ta, ma
 229 ; V512-NEXT:    vwaddu.vv v10, v8, v9
 230 ; V512-NEXT:    li a0, -1
 231 ; V512-NEXT:    vwmaccu.vx v10, a0, v9
 232 ; V512-NEXT:    vmv1r.v v8, v10
 233 ; V512-NEXT:    ret
 234   %a = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 235   ret <16 x i8> %a
 236 }
 237
 238 ; Vector order switched for coverage.
 ; The mask starts with the second operand (8, 0, 9, 1, ...). For V128 the
 ; assertions use m1 with a vmv2r.v result copy; for V512 they use mf4 with
 ; vmv1r.v.
 239 define <16 x i16> @interleave_v8i16(<8 x i16> %x, <8 x i16> %y) {
 240 ; V128-LABEL: interleave_v8i16:
 241 ; V128:       # %bb.0:
 242 ; V128-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 243 ; V128-NEXT:    vwaddu.vv v10, v9, v8
 244 ; V128-NEXT:    li a0, -1
 245 ; V128-NEXT:    vwmaccu.vx v10, a0, v8
 246 ; V128-NEXT:    vmv2r.v v8, v10
 247 ; V128-NEXT:    ret
 248 ;
 249 ; V512-LABEL: interleave_v8i16:
 250 ; V512:       # %bb.0:
 251 ; V512-NEXT:    vsetivli zero, 8, e16, mf4, ta, ma
 252 ; V512-NEXT:    vwaddu.vv v10, v9, v8
 253 ; V512-NEXT:    li a0, -1
 254 ; V512-NEXT:    vwmaccu.vx v10, a0, v8
 255 ; V512-NEXT:    vmv1r.v v8, v10
 256 ; V512-NEXT:    ret
 257   %a = shufflevector <8 x i16> %x, <8 x i16> %y, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
 258   ret <16 x i16> %a
 259 }
 260
 ; Full interleave of two <8 x i32> vectors into <16 x i32>. For V128 the
 ; assertions use m2 operands (v8, v10) and a vmv4r.v result copy; for V512
 ; they use mf2 operands (v8, v9) and vmv1r.v.
 261 define <16 x i32> @interleave_v8i32(<8 x i32> %x, <8 x i32> %y) {
 262 ; V128-LABEL: interleave_v8i32:
 263 ; V128:       # %bb.0:
 264 ; V128-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 265 ; V128-NEXT:    vwaddu.vv v12, v8, v10
 266 ; V128-NEXT:    li a0, -1
 267 ; V128-NEXT:    vwmaccu.vx v12, a0, v10
 268 ; V128-NEXT:    vmv4r.v v8, v12
 269 ; V128-NEXT:    ret
 270 ;
 271 ; V512-LABEL: interleave_v8i32:
 272 ; V512:       # %bb.0:
 273 ; V512-NEXT:    vsetivli zero, 8, e32, mf2, ta, ma
 274 ; V512-NEXT:    vwaddu.vv v10, v8, v9
 275 ; V512-NEXT:    li a0, -1
 276 ; V512-NEXT:    vwmaccu.vx v10, a0, v9
 277 ; V512-NEXT:    vmv1r.v v8, v10
 278 ; V512-NEXT:    ret
 279   %a = shufflevector <8 x i32> %x, <8 x i32> %y, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 280   ret <16 x i32> %a
 281 }
 282
 ; Full interleave of two <16 x i8> vectors into <32 x i8>. For V128 the
 ; assertions use m1 with a vmv2r.v result copy; for V512 they use mf4 with
 ; vmv1r.v.
 283 define <32 x i8> @interleave_v16i8(<16 x i8> %x, <16 x i8> %y) {
 284 ; V128-LABEL: interleave_v16i8:
 285 ; V128:       # %bb.0:
 286 ; V128-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 287 ; V128-NEXT:    vwaddu.vv v10, v8, v9
 288 ; V128-NEXT:    li a0, -1
 289 ; V128-NEXT:    vwmaccu.vx v10, a0, v9
 290 ; V128-NEXT:    vmv2r.v v8, v10
 291 ; V128-NEXT:    ret
 292 ;
 293 ; V512-LABEL: interleave_v16i8:
 294 ; V512:       # %bb.0:
 295 ; V512-NEXT:    vsetivli zero, 16, e8, mf4, ta, ma
 296 ; V512-NEXT:    vwaddu.vv v10, v8, v9
 297 ; V512-NEXT:    li a0, -1
 298 ; V512-NEXT:    vwmaccu.vx v10, a0, v9
 299 ; V512-NEXT:    vmv1r.v v8, v10
 300 ; V512-NEXT:    ret
 301   %a = shufflevector <16 x i8> %x, <16 x i8> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
 302   ret <32 x i8> %a
 303 }
 304
 ; Full interleave of two <16 x i16> vectors into <32 x i16>. For V128 the
 ; assertions use m2 operands (v8, v10) and a vmv4r.v result copy; for V512
 ; they use mf2 operands (v8, v9) and vmv1r.v.
 305 define <32 x i16> @interleave_v16i16(<16 x i16> %x, <16 x i16> %y) {
 306 ; V128-LABEL: interleave_v16i16:
 307 ; V128:       # %bb.0:
 308 ; V128-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 309 ; V128-NEXT:    vwaddu.vv v12, v8, v10
 310 ; V128-NEXT:    li a0, -1
 311 ; V128-NEXT:    vwmaccu.vx v12, a0, v10
 312 ; V128-NEXT:    vmv4r.v v8, v12
 313 ; V128-NEXT:    ret
 314 ;
 315 ; V512-LABEL: interleave_v16i16:
 316 ; V512:       # %bb.0:
 317 ; V512-NEXT:    vsetivli zero, 16, e16, mf2, ta, ma
 318 ; V512-NEXT:    vwaddu.vv v10, v8, v9
 319 ; V512-NEXT:    li a0, -1
 320 ; V512-NEXT:    vwmaccu.vx v10, a0, v9
 321 ; V512-NEXT:    vmv1r.v v8, v10
 322 ; V512-NEXT:    ret
 323   %a = shufflevector <16 x i16> %x, <16 x i16> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
 324   ret <32 x i16> %a
 325 }
 326
 ; Full interleave of two <16 x i32> vectors into <32 x i32>. For V128 the
 ; assertions use m4 operands (v8, v12) and a vmv8r.v result copy; for V512
 ; they use m1 operands (v8, v9) and vmv2r.v.
 327 define <32 x i32> @interleave_v16i32(<16 x i32> %x, <16 x i32> %y) {
 328 ; V128-LABEL: interleave_v16i32:
 329 ; V128:       # %bb.0:
 330 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 331 ; V128-NEXT:    vwaddu.vv v16, v8, v12
 332 ; V128-NEXT:    li a0, -1
 333 ; V128-NEXT:    vwmaccu.vx v16, a0, v12
 334 ; V128-NEXT:    vmv8r.v v8, v16
 335 ; V128-NEXT:    ret
 336 ;
 337 ; V512-LABEL: interleave_v16i32:
 338 ; V512:       # %bb.0:
 339 ; V512-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
 340 ; V512-NEXT:    vwaddu.vv v10, v8, v9
 341 ; V512-NEXT:    li a0, -1
 342 ; V512-NEXT:    vwmaccu.vx v10, a0, v9
 343 ; V512-NEXT:    vmv2r.v v8, v10
 344 ; V512-NEXT:    ret
 345   %a = shufflevector <16 x i32> %x, <16 x i32> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
 346   ret <32 x i32> %a
 347 }
 348
 ; Full interleave of two <32 x i8> vectors into <64 x i8>. In these
 ; assertions the vector length 32 is first loaded into a0 with li and then
 ; passed to vsetvli (register form) instead of a vsetivli immediate.
 349 define <64 x i8> @interleave_v32i8(<32 x i8> %x, <32 x i8> %y) {
 350 ; V128-LABEL: interleave_v32i8:
 351 ; V128:       # %bb.0:
 352 ; V128-NEXT:    li a0, 32
 353 ; V128-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 354 ; V128-NEXT:    vwaddu.vv v12, v8, v10
 355 ; V128-NEXT:    li a0, -1
 356 ; V128-NEXT:    vwmaccu.vx v12, a0, v10
 357 ; V128-NEXT:    vmv4r.v v8, v12
 358 ; V128-NEXT:    ret
 359 ;
 360 ; V512-LABEL: interleave_v32i8:
 361 ; V512:       # %bb.0:
 362 ; V512-NEXT:    li a0, 32
 363 ; V512-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 364 ; V512-NEXT:    vwaddu.vv v10, v8, v9
 365 ; V512-NEXT:    li a0, -1
 366 ; V512-NEXT:    vwmaccu.vx v10, a0, v9
 367 ; V512-NEXT:    vmv1r.v v8, v10
 368 ; V512-NEXT:    ret
 369   %a = shufflevector <32 x i8> %x, <32 x i8> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
 370   ret <64 x i8> %a
 371 }
 372
 ; Full interleave of two <32 x i16> vectors into <64 x i16>. For V128 the
 ; assertions use m4 operands (v8, v12) and a vmv8r.v result copy; for V512
 ; they use m1 operands (v8, v9) and vmv2r.v. The vector length 32 is set via
 ; li + vsetvli.
 373 define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) {
 374 ; V128-LABEL: interleave_v32i16:
 375 ; V128:       # %bb.0:
 376 ; V128-NEXT:    li a0, 32
 377 ; V128-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 378 ; V128-NEXT:    vwaddu.vv v16, v8, v12
 379 ; V128-NEXT:    li a0, -1
 380 ; V128-NEXT:    vwmaccu.vx v16, a0, v12
 381 ; V128-NEXT:    vmv8r.v v8, v16
 382 ; V128-NEXT:    ret
 383 ;
 384 ; V512-LABEL: interleave_v32i16:
 385 ; V512:       # %bb.0:
 386 ; V512-NEXT:    li a0, 32
 387 ; V512-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 388 ; V512-NEXT:    vwaddu.vv v10, v8, v9
 389 ; V512-NEXT:    li a0, -1
 390 ; V512-NEXT:    vwmaccu.vx v10, a0, v9
 391 ; V512-NEXT:    vmv2r.v v8, v10
 392 ; V512-NEXT:    ret
 393   %a = shufflevector <32 x i16> %x, <32 x i16> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
 394   ret <64 x i16> %a
 395 }
 396
 ; Full interleave of two <32 x i32> vectors into <64 x i32>. For V512 the
 ; assertions show the plain vwaddu.vv / vwmaccu.vx sequence at m2. For V128
 ; they show a considerably longer sequence: a stack frame with a vector
 ; spill and reload, two index vectors loaded from constant-pool labels
 ; (.LCPI17_0 and .LCPI17_1), an unmasked plus a masked vrgatherei16.vv, and
 ; a vwaddu.vv / vwmaccu.vx pair at e32, m4 with a vector length of 16.
 397 define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
 398 ; V128-LABEL: interleave_v32i32:
 399 ; V128:       # %bb.0:
 400 ; V128-NEXT:    addi sp, sp, -16
 401 ; V128-NEXT:    .cfi_def_cfa_offset 16
 402 ; V128-NEXT:    csrr a0, vlenb
 403 ; V128-NEXT:    slli a0, a0, 2
 404 ; V128-NEXT:    sub sp, sp, a0
 405 ; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 406 ; V128-NEXT:    lui a0, %hi(.LCPI17_0)
 407 ; V128-NEXT:    addi a0, a0, %lo(.LCPI17_0)
 408 ; V128-NEXT:    li a1, 32
 409 ; V128-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
 410 ; V128-NEXT:    vle16.v v4, (a0)
 411 ; V128-NEXT:    lui a0, %hi(.LCPI17_1)
 412 ; V128-NEXT:    addi a0, a0, %lo(.LCPI17_1)
 413 ; V128-NEXT:    vle16.v v24, (a0)
 414 ; V128-NEXT:    addi a0, sp, 16
 415 ; V128-NEXT:    vs4r.v v24, (a0) # Unknown-size Folded Spill
 416 ; V128-NEXT:    lui a0, 699051
 417 ; V128-NEXT:    addi a0, a0, -1366
 418 ; V128-NEXT:    vmv.s.x v0, a0
 419 ; V128-NEXT:    vrgatherei16.vv v24, v8, v4
 420 ; V128-NEXT:    addi a0, sp, 16
 421 ; V128-NEXT:    vl4r.v v12, (a0) # Unknown-size Folded Reload
 422 ; V128-NEXT:    vrgatherei16.vv v24, v16, v12, v0.t
 423 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 424 ; V128-NEXT:    vwaddu.vv v0, v8, v16
 425 ; V128-NEXT:    li a0, -1
 426 ; V128-NEXT:    vwmaccu.vx v0, a0, v16
 427 ; V128-NEXT:    vmv8r.v v8, v0
 428 ; V128-NEXT:    vmv8r.v v16, v24
 429 ; V128-NEXT:    csrr a0, vlenb
 430 ; V128-NEXT:    slli a0, a0, 2
 431 ; V128-NEXT:    add sp, sp, a0
 432 ; V128-NEXT:    addi sp, sp, 16
 433 ; V128-NEXT:    ret
 434 ;
 435 ; V512-LABEL: interleave_v32i32:
 436 ; V512:       # %bb.0:
 437 ; V512-NEXT:    li a0, 32
 438 ; V512-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 439 ; V512-NEXT:    vwaddu.vv v12, v8, v10
 440 ; V512-NEXT:    li a0, -1
 441 ; V512-NEXT:    vwmaccu.vx v12, a0, v10
 442 ; V512-NEXT:    vmv4r.v v8, v12
 443 ; V512-NEXT:    ret
 444   %a = shufflevector <32 x i32> %x, <32 x i32> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
 445   ret <64 x i32> %a
 446 }
 447
 ; Single-source interleave: the second shuffle operand is poison and the
 ; mask <0, 2, 1, 3> alternates elements from the low and high halves of %x.
 ; The assertions expect a vslidedown.vi by 2 followed by the vwaddu.vv /
 ; vwmaccu.vx pair.
 448 define <4 x i8> @unary_interleave_v4i8(<4 x i8> %x) {
 449 ; V128-LABEL: unary_interleave_v4i8:
 450 ; V128:       # %bb.0:
 451 ; V128-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 452 ; V128-NEXT:    vslidedown.vi v10, v8, 2
 453 ; V128-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 454 ; V128-NEXT:    vwaddu.vv v9, v8, v10
 455 ; V128-NEXT:    li a0, -1
 456 ; V128-NEXT:    vwmaccu.vx v9, a0, v10
 457 ; V128-NEXT:    vmv1r.v v8, v9
 458 ; V128-NEXT:    ret
 459 ;
 460 ; V512-LABEL: unary_interleave_v4i8:
 461 ; V512:       # %bb.0:
 462 ; V512-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 463 ; V512-NEXT:    vslidedown.vi v10, v8, 2
 464 ; V512-NEXT:    vwaddu.vv v9, v8, v10
 465 ; V512-NEXT:    li a0, -1
 466 ; V512-NEXT:    vwmaccu.vx v9, a0, v10
 467 ; V512-NEXT:    vmv1r.v v8, v9
 468 ; V512-NEXT:    ret
 469   %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 470   ret <4 x i8> %a
 471 }
 472
 473 ; This shouldn't be interleaved
 ; The mask is <0, 3, 1, 4>; index 4 selects from the poison second operand.
 ; The assertions expect a vrgather.vv whose index vector is built from a
 ; scalar constant (lui/addi, then vmv.s.x), with no vwaddu.vv / vwmaccu.vx.
 474 define <4 x i8> @unary_interleave_v4i8_invalid(<4 x i8> %x) {
 475 ; V128-LABEL: unary_interleave_v4i8_invalid:
 476 ; V128:       # %bb.0:
 477 ; V128-NEXT:    lui a0, 16
 478 ; V128-NEXT:    addi a0, a0, 768
 479 ; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 480 ; V128-NEXT:    vmv.s.x v10, a0
 481 ; V128-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 482 ; V128-NEXT:    vrgather.vv v9, v8, v10
 483 ; V128-NEXT:    vmv1r.v v8, v9
 484 ; V128-NEXT:    ret
 485 ;
 486 ; V512-LABEL: unary_interleave_v4i8_invalid:
 487 ; V512:       # %bb.0:
 488 ; V512-NEXT:    lui a0, 16
 489 ; V512-NEXT:    addi a0, a0, 768
 490 ; V512-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 491 ; V512-NEXT:    vmv.s.x v10, a0
 492 ; V512-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
 493 ; V512-NEXT:    vrgather.vv v9, v8, v10
 494 ; V512-NEXT:    vmv1r.v v8, v9
 495 ; V512-NEXT:    ret
 496   %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 4>
 497   ret <4 x i8> %a
 498 }
 499
 ; Single-source interleave of <4 x i16> with mask <0, 2, 1, 3> (second
 ; operand poison). The assertions expect a vslidedown.vi by 2 followed by the
 ; vwaddu.vv / vwmaccu.vx pair at e16.
 500 define <4 x i16> @unary_interleave_v4i16(<4 x i16> %x) {
 501 ; V128-LABEL: unary_interleave_v4i16:
 502 ; V128:       # %bb.0:
 503 ; V128-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 504 ; V128-NEXT:    vslidedown.vi v10, v8, 2
 505 ; V128-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 506 ; V128-NEXT:    vwaddu.vv v9, v8, v10
 507 ; V128-NEXT:    li a0, -1
 508 ; V128-NEXT:    vwmaccu.vx v9, a0, v10
 509 ; V128-NEXT:    vmv1r.v v8, v9
 510 ; V128-NEXT:    ret
 511 ;
 512 ; V512-LABEL: unary_interleave_v4i16:
 513 ; V512:       # %bb.0:
 514 ; V512-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 515 ; V512-NEXT:    vslidedown.vi v10, v8, 2
 516 ; V512-NEXT:    vwaddu.vv v9, v8, v10
 517 ; V512-NEXT:    li a0, -1
 518 ; V512-NEXT:    vwmaccu.vx v9, a0, v10
 519 ; V512-NEXT:    vmv1r.v v8, v9
 520 ; V512-NEXT:    ret
 521   %a = shufflevector <4 x i16> %x, <4 x i16> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 522   ret <4 x i16> %a
 523 }
 524
 ; Single-source interleave of <4 x i32> with mask <0, 2, 1, 3> (second
 ; operand poison). The assertions expect a vslidedown.vi by 2 followed by the
 ; vwaddu.vv / vwmaccu.vx pair at e32.
 525 define <4 x i32> @unary_interleave_v4i32(<4 x i32> %x) {
 526 ; V128-LABEL: unary_interleave_v4i32:
 527 ; V128:       # %bb.0:
 528 ; V128-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 529 ; V128-NEXT:    vslidedown.vi v10, v8, 2
 530 ; V128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 531 ; V128-NEXT:    vwaddu.vv v9, v8, v10
 532 ; V128-NEXT:    li a0, -1
 533 ; V128-NEXT:    vwmaccu.vx v9, a0, v10
 534 ; V128-NEXT:    vmv1r.v v8, v9
 535 ; V128-NEXT:    ret
 536 ;
 537 ; V512-LABEL: unary_interleave_v4i32:
 538 ; V512:       # %bb.0:
 539 ; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 540 ; V512-NEXT:    vslidedown.vi v10, v8, 2
 541 ; V512-NEXT:    vwaddu.vv v9, v8, v10
 542 ; V512-NEXT:    li a0, -1
 543 ; V512-NEXT:    vwmaccu.vx v9, a0, v10
 544 ; V512-NEXT:    vmv1r.v v8, v9
 545 ; V512-NEXT:    ret
 546   %a = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 547   ret <4 x i32> %a
 548 }
 549
 550 ; FIXME: Is there better codegen we can do here?
 ; Single-source i64 variant of the <0, 2, 1, 3> interleave mask. The
 ; assertions are split across the V128, RV32-V512 and RV64-V512 prefixes.
 ; Each builds an index vector from a scalar constant (lui/addi, vmv.s.x,
 ; vsext) and then gathers with vrgatherei16.vv or vrgather.vv.
 551 define <4 x i64> @unary_interleave_v4i64(<4 x i64> %x) {
 552 ; V128-LABEL: unary_interleave_v4i64:
 553 ; V128:       # %bb.0:
 554 ; V128-NEXT:    lui a0, 12304
 555 ; V128-NEXT:    addi a0, a0, 512
 556 ; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 557 ; V128-NEXT:    vmv.s.x v10, a0
 558 ; V128-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 559 ; V128-NEXT:    vsext.vf2 v12, v10
 560 ; V128-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 561 ; V128-NEXT:    vrgatherei16.vv v10, v8, v12
 562 ; V128-NEXT:    vmv.v.v v8, v10
 563 ; V128-NEXT:    ret
 564 ;
 565 ; RV32-V512-LABEL: unary_interleave_v4i64:
 566 ; RV32-V512:       # %bb.0:
 567 ; RV32-V512-NEXT:    lui a0, 12304
 568 ; RV32-V512-NEXT:    addi a0, a0, 512
 569 ; RV32-V512-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 570 ; RV32-V512-NEXT:    vmv.s.x v9, a0
 571 ; RV32-V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
 572 ; RV32-V512-NEXT:    vsext.vf2 v10, v9
 573 ; RV32-V512-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 574 ; RV32-V512-NEXT:    vrgatherei16.vv v9, v8, v10
 575 ; RV32-V512-NEXT:    vmv.v.v v8, v9
 576 ; RV32-V512-NEXT:    ret
 577 ;
 578 ; RV64-V512-LABEL: unary_interleave_v4i64:
 579 ; RV64-V512:       # %bb.0:
 580 ; RV64-V512-NEXT:    lui a0, 12304
 581 ; RV64-V512-NEXT:    addi a0, a0, 512
 582 ; RV64-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 583 ; RV64-V512-NEXT:    vmv.s.x v9, a0
 584 ; RV64-V512-NEXT:    vsext.vf8 v10, v9
 585 ; RV64-V512-NEXT:    vrgather.vv v9, v8, v10
 586 ; RV64-V512-NEXT:    vmv.v.v v8, v9
 587 ; RV64-V512-NEXT:    ret
 588   %a = shufflevector <4 x i64> %x, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 589   ret <4 x i64> %a
 590 }
 591
 ; Single-source interleave of <8 x i8> (second operand poison) whose mask
 ; has an undef at position 4. The assertions expect a vslidedown.vi by 4
 ; followed by the vwaddu.vv / vwmaccu.vx pair.
 592 define <8 x i8> @unary_interleave_v8i8(<8 x i8> %x) {
 593 ; V128-LABEL: unary_interleave_v8i8:
 594 ; V128:       # %bb.0:
 595 ; V128-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 596 ; V128-NEXT:    vslidedown.vi v10, v8, 4
 597 ; V128-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 598 ; V128-NEXT:    vwaddu.vv v9, v8, v10
 599 ; V128-NEXT:    li a0, -1
 600 ; V128-NEXT:    vwmaccu.vx v9, a0, v10
 601 ; V128-NEXT:    vmv1r.v v8, v9
 602 ; V128-NEXT:    ret
 603 ;
 604 ; V512-LABEL: unary_interleave_v8i8:
 605 ; V512:       # %bb.0:
 606 ; V512-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
 607 ; V512-NEXT:    vslidedown.vi v10, v8, 4
 608 ; V512-NEXT:    vwaddu.vv v9, v8, v10
 609 ; V512-NEXT:    li a0, -1
 610 ; V512-NEXT:    vwmaccu.vx v9, a0, v10
 611 ; V512-NEXT:    vmv1r.v v8, v9
 612 ; V512-NEXT:    ret
 613   %a = shufflevector <8 x i8> %x, <8 x i8> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 undef, i32 6, i32 3, i32 7>
 614   ret <8 x i8> %a
 615 }
 616
 ; Single-source interleave of <8 x i16> (second operand poison) that takes
 ; the high half first (4, _, 5, 1, ...) and has an undef at position 1. In
 ; the assertions the slid-down register v10 is the first vwaddu.vv source
 ; and v8 is the vwmaccu.vx vector operand.
 617 define <8 x i16> @unary_interleave_v8i16(<8 x i16> %x) {
 618 ; V128-LABEL: unary_interleave_v8i16:
 619 ; V128:       # %bb.0:
 620 ; V128-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
 621 ; V128-NEXT:    vslidedown.vi v10, v8, 4
 622 ; V128-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 623 ; V128-NEXT:    vwaddu.vv v9, v10, v8
 624 ; V128-NEXT:    li a0, -1
 625 ; V128-NEXT:    vwmaccu.vx v9, a0, v8
 626 ; V128-NEXT:    vmv1r.v v8, v9
 627 ; V128-NEXT:    ret
 628 ;
 629 ; V512-LABEL: unary_interleave_v8i16:
 630 ; V512:       # %bb.0:
 631 ; V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
 632 ; V512-NEXT:    vslidedown.vi v10, v8, 4
 633 ; V512-NEXT:    vwaddu.vv v9, v10, v8
 634 ; V512-NEXT:    li a0, -1
 635 ; V512-NEXT:    vwmaccu.vx v9, a0, v8
 636 ; V512-NEXT:    vmv1r.v v8, v9
 637 ; V512-NEXT:    ret
 638   %a = shufflevector <8 x i16> %x, <8 x i16> poison, <8 x i32> <i32 4, i32 undef, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
 639   ret <8 x i16> %a
 640 }
 641
 ; Single-source interleave of <8 x i32> (second operand poison) with the
 ; full mask <0, 4, 1, 5, 2, 6, 3, 7>. For V128 the assertions slide down at
 ; m2 and copy the result with vmv2r.v; for V512 everything runs at mf2 and
 ; the copy is vmv1r.v.
 642 define <8 x i32> @unary_interleave_v8i32(<8 x i32> %x) {
 643 ; V128-LABEL: unary_interleave_v8i32:
 644 ; V128:       # %bb.0:
 645 ; V128-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 646 ; V128-NEXT:    vslidedown.vi v12, v8, 4
 647 ; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 648 ; V128-NEXT:    vwaddu.vv v10, v8, v12
 649 ; V128-NEXT:    li a0, -1
 650 ; V128-NEXT:    vwmaccu.vx v10, a0, v12
 651 ; V128-NEXT:    vmv2r.v v8, v10
 652 ; V128-NEXT:    ret
 653 ;
 654 ; V512-LABEL: unary_interleave_v8i32:
 655 ; V512:       # %bb.0:
 656 ; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
 657 ; V512-NEXT:    vslidedown.vi v10, v8, 4
 658 ; V512-NEXT:    vwaddu.vv v9, v8, v10
 659 ; V512-NEXT:    li a0, -1
 660 ; V512-NEXT:    vwmaccu.vx v9, a0, v10
 661 ; V512-NEXT:    vmv1r.v v8, v9
 662 ; V512-NEXT:    ret
 663   %a = shufflevector <8 x i32> %x, <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 664   ret <8 x i32> %a
 665 }
 666
 667 ; This interleaves the first 2 elements of a vector in opposite order. With
 668 ; undefs for the remaining elements. We used to miscompile this.
 ; The assertions (shared by all RUN lines) expect the swap to be performed
 ; on 16-bit elements: vsrl.vi by 8, vsll.vi by 8, then vor.vv.
 669 define <4 x i8> @unary_interleave_10uu_v4i8(<4 x i8> %x) {
 670 ; CHECK-LABEL: unary_interleave_10uu_v4i8:
 671 ; CHECK:       # %bb.0:
 672 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 673 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 674 ; CHECK-NEXT:    vsll.vi v8, v8, 8
 675 ; CHECK-NEXT:    vor.vv v8, v8, v9
 676 ; CHECK-NEXT:    ret
 677   %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
 678   ret <4 x i8> %a
 679 }
 680
 681 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 682 ; RV32-V128: {{.*}}
 683 ; RV64-V128: {{.*}}