llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
   2 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
   3
   4 ; Verify that DAG combine rules for LD1 + sext/zext don't apply when the
   5 ; result of LD1 has multiple uses
   6
   7 define <vscale x 2 x i64> @no_dag_combine_zext_sext(<vscale x 2 x i1> %pg,
   8 ; CHECK-LABEL: no_dag_combine_zext_sext:
   9 ; CHECK:       // %bb.0:
  10 ; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z0.d, #16]
  11 ; CHECK-NEXT:    st1b { z0.d }, p1, [x0]
  12 ; CHECK-NEXT:    and z0.d, z0.d, #0xff
  13 ; CHECK-NEXT:    ret
  14                                                     <vscale x 2 x i64> %base,
  15                                                     ptr %res_out,
  16                                                     <vscale x 2 x i1> %pred) {
       ; The gathered i8 vector below has three users (a zext, a sext and a masked
       ; store). The expected assembly above therefore keeps a plain ld1b gather
       ; and performs the zero-extension as a separate `and` with #0xff.
       ;
       ; Gather of <vscale x 2 x i8> under predicate %pg, addressed by the vector
       ; %base plus the immediate offset 16.
  17   %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
  18                                                                                            <vscale x 2 x i64> %base,
  19                                                                                            i64 16)
       ; %res1 is the value returned by this function.
  20   %res1 = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
       ; %res2 is never read again in this function; it only adds a second
       ; extending user of %load.
  21   %res2 = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
       ; Stores the un-extended i8 lanes to %res_out under predicate %pred, which
       ; gives %load a user that is not an extension. The i32 8 operand is the
       ; intrinsic's alignment argument.
  22   call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %load,
  23                                       ptr %res_out,
  24                                       i32 8,
  25                                       <vscale x 2 x i1> %pred)
  26
  27   ret <vscale x 2 x i64> %res1
  28 }
  29
  30 define <vscale x 2 x i64> @no_dag_combine_sext(<vscale x 2 x i1> %pg,
  31 ; CHECK-LABEL: no_dag_combine_sext:
  32 ; CHECK:       // %bb.0:
  33 ; CHECK-NEXT:    ld1b { z1.d }, p0/z, [z0.d, #16]
  34 ; CHECK-NEXT:    ptrue p0.d
  35 ; CHECK-NEXT:    movprfx z0, z1
  36 ; CHECK-NEXT:    sxtb z0.d, p0/m, z1.d
  37 ; CHECK-NEXT:    st1b { z1.d }, p1, [x0]
  38 ; CHECK-NEXT:    ret
  39                                                <vscale x 2 x i64> %base,
  40                                                ptr %res_out,
  41                                                <vscale x 2 x i1> %pred) {
       ; The gathered i8 vector below is used both by a sext and by a masked store.
       ; The expected assembly above keeps a plain ld1b gather and sign-extends
       ; with a separate sxtb instead of using a sign-extending load.
       ;
       ; Gather of <vscale x 2 x i8> under predicate %pg, addressed by the vector
       ; %base plus the immediate offset 16.
  42   %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
  43                                                                                            <vscale x 2 x i64> %base,
  44                                                                                            i64 16)
       ; Sign-extended copy of the gathered lanes; this is the return value.
  45   %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
       ; Second use of %load: stores the un-extended i8 lanes to %res_out under
       ; predicate %pred. The i32 8 operand is the intrinsic's alignment argument.
  46   call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %load,
  47                                       ptr %res_out,
  48                                       i32 8,
  49                                       <vscale x 2 x i1> %pred)
  50
  51   ret <vscale x 2 x i64> %res
  52 }
  53
  54 define <vscale x 2 x i64> @no_dag_combine_zext(<vscale x 2 x i1> %pg,
  55 ; CHECK-LABEL: no_dag_combine_zext:
  56 ; CHECK:       // %bb.0:
  57 ; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z0.d, #16]
  58 ; CHECK-NEXT:    st1b { z0.d }, p1, [x0]
  59 ; CHECK-NEXT:    and z0.d, z0.d, #0xff
  60 ; CHECK-NEXT:    ret
  61                                                <vscale x 2 x i64> %base,
  62                                                ptr %res_out,
  63                                                <vscale x 2 x i1> %pred) {
       ; The gathered i8 vector below is used both by a zext and by a masked store.
       ; The expected assembly above keeps a plain ld1b gather and performs the
       ; zero-extension as a separate `and` with #0xff.
       ;
       ; Gather of <vscale x 2 x i8> under predicate %pg, addressed by the vector
       ; %base plus the immediate offset 16.
  64   %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
  65                                                                                            <vscale x 2 x i64> %base,
  66                                                                                            i64 16)
       ; Zero-extended copy of the gathered lanes; this is the return value.
  67   %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
       ; Second use of %load: stores the un-extended i8 lanes to %res_out under
       ; predicate %pred. The i32 8 operand is the intrinsic's alignment argument.
  68   call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %load,
  69                                       ptr %res_out,
  70                                       i32 8,
  71                                       <vscale x 2 x i1> %pred)
  72
  73   ret <vscale x 2 x i64> %res
  74 }
  75
  76 define <vscale x 16 x i8> @narrow_i64_gather_index_i8_zext(ptr %out, ptr %in, <vscale x 16 x i8> %d, i64 %ptr){
  77 ; CHECK-LABEL: narrow_i64_gather_index_i8_zext:
  78 ; CHECK:       // %bb.0:
  79 ; CHECK-NEXT:    ptrue p0.s
  80 ; CHECK-NEXT:    add x8, x1, x2
  81 ; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x8, #3, mul vl]
  82 ; CHECK-NEXT:    ld1b { z1.s }, p0/z, [x8, #2, mul vl]
  83 ; CHECK-NEXT:    ld1b { z2.s }, p0/z, [x1, x2]
  84 ; CHECK-NEXT:    ld1b { z3.s }, p0/z, [x8, #1, mul vl]
  85 ; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x1, z0.s, uxtw]
  86 ; CHECK-NEXT:    ld1b { z1.s }, p0/z, [x1, z1.s, uxtw]
  87 ; CHECK-NEXT:    ld1b { z2.s }, p0/z, [x1, z2.s, uxtw]
  88 ; CHECK-NEXT:    ld1b { z3.s }, p0/z, [x1, z3.s, uxtw]
  89 ; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
  90 ; CHECK-NEXT:    uzp1 z1.h, z2.h, z3.h
  91 ; CHECK-NEXT:    uzp1 z0.b, z1.b, z0.b
  92 ; CHECK-NEXT:    ret
       ; Loads a vector of i8 indices from %in + %ptr, zero-extends them to i64 and
       ; uses them as i8 element offsets from %in for a masked gather of i8.
       ; The expected assembly above performs the gathers on 32-bit (.s) lanes
       ; with uxtw addressing rather than on 64-bit lanes, then packs the four
       ; partial results with uzp1. %out and %d are not referenced in the body.
  93   %1 = getelementptr inbounds i8, ptr %in, i64 %ptr
       ; Cast between identical pointer types; %2 addresses the same location as %1.
  94   %2 = bitcast ptr %1 to ptr
  95   %wide.load = load <vscale x 16 x i8>, ptr %2, align 1
       ; Unsigned widening of the loaded indices to i64.
  96   %3 = zext <vscale x 16 x i8> %wide.load to <vscale x 16 x i64>
       ; One pointer per lane: %in plus the lane's index, in units of i8.
  97   %4 = getelementptr inbounds i8, ptr %in, <vscale x 16 x i64> %3
       ; Gather with alignment 1, an all-true mask and an undef passthru operand.
  98   %wide.masked.gather = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> %4, i32 1, <vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> undef)
  99   ret <vscale x 16 x i8> %wide.masked.gather
 100 }
 101
 102 define <vscale x 16 x i8> @narrow_i64_gather_index_i8_sext(ptr %out, ptr %in, <vscale x 16 x i8> %d, i64 %ptr){
 103 ; CHECK-LABEL: narrow_i64_gather_index_i8_sext:
 104 ; CHECK:       // %bb.0:
 105 ; CHECK-NEXT:    ptrue p0.s
 106 ; CHECK-NEXT:    add x8, x1, x2
 107 ; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x8, #3, mul vl]
 108 ; CHECK-NEXT:    ld1sb { z1.s }, p0/z, [x8, #2, mul vl]
 109 ; CHECK-NEXT:    ld1sb { z2.s }, p0/z, [x1, x2]
 110 ; CHECK-NEXT:    ld1sb { z3.s }, p0/z, [x8, #1, mul vl]
 111 ; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x1, z0.s, sxtw]
 112 ; CHECK-NEXT:    ld1b { z1.s }, p0/z, [x1, z1.s, sxtw]
 113 ; CHECK-NEXT:    ld1b { z2.s }, p0/z, [x1, z2.s, sxtw]
 114 ; CHECK-NEXT:    ld1b { z3.s }, p0/z, [x1, z3.s, sxtw]
 115 ; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
 116 ; CHECK-NEXT:    uzp1 z1.h, z2.h, z3.h
 117 ; CHECK-NEXT:    uzp1 z0.b, z1.b, z0.b
 118 ; CHECK-NEXT:    ret
       ; Loads a vector of i8 indices from %in + %ptr, sign-extends them to i64 and
       ; uses them as i8 element offsets from %in for a masked gather of i8.
       ; The expected assembly above loads the indices with ld1sb and performs the
       ; gathers on 32-bit (.s) lanes with sxtw addressing rather than on 64-bit
       ; lanes. %out and %d are not referenced in the body.
 119   %1 = getelementptr inbounds i8, ptr %in, i64 %ptr
       ; Cast between identical pointer types; %2 addresses the same location as %1.
 120   %2 = bitcast ptr %1 to ptr
 121   %wide.load = load <vscale x 16 x i8>, ptr %2, align 1
       ; Signed widening of the loaded indices to i64.
 122   %3 = sext <vscale x 16 x i8> %wide.load to <vscale x 16 x i64>
       ; One pointer per lane: %in plus the lane's index, in units of i8.
 123   %4 = getelementptr inbounds i8, ptr %in, <vscale x 16 x i64> %3
       ; Gather with alignment 1, an all-true mask and an undef passthru operand.
 124   %wide.masked.gather = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> %4, i32 1, <vscale x 16 x i1> splat (i1 true), <vscale x 16 x i8> undef)
 125   ret <vscale x 16 x i8> %wide.masked.gather
 126 }
 127
 128 define <vscale x 8 x i16> @narrow_i64_gather_index_i16_zext(ptr %out, ptr %in, <vscale x 8 x i16> %d, i64 %ptr){
 129 ; CHECK-LABEL: narrow_i64_gather_index_i16_zext:
 130 ; CHECK:       // %bb.0:
 131 ; CHECK-NEXT:    ptrue p0.s
 132 ; CHECK-NEXT:    add x8, x1, x2, lsl #1
 133 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x1, x2, lsl #1]
 134 ; CHECK-NEXT:    ld1h { z1.s }, p0/z, [x8, #1, mul vl]
 135 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x1, z0.s, uxtw #1]
 136 ; CHECK-NEXT:    ld1h { z1.s }, p0/z, [x1, z1.s, uxtw #1]
 137 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
 138 ; CHECK-NEXT:    ret
       ; Loads a vector of i16 indices from %in + %ptr (in i16 units), zero-extends
       ; them to i64 and uses them as i16 element offsets from %in for a masked
       ; gather of i16. The expected assembly above performs the gathers on 32-bit
       ; (.s) lanes with `uxtw #1` addressing rather than on 64-bit lanes.
       ; %out and %d are not referenced in the body.
 139   %1 = getelementptr inbounds i16, ptr %in, i64 %ptr
       ; Cast between identical pointer types; %2 addresses the same location as %1.
 140   %2 = bitcast ptr %1 to ptr
 141   %wide.load = load <vscale x 8 x i16>, ptr %2, align 1
       ; Unsigned widening of the loaded indices to i64.
 142   %3 = zext <vscale x 8 x i16> %wide.load to <vscale x 8 x i64>
       ; One pointer per lane: %in plus the lane's index, in units of i16.
 143   %4 = getelementptr inbounds i16, ptr %in, <vscale x 8 x i64> %3
       ; Gather with alignment 1, an all-true mask and an undef passthru operand.
 144   %wide.masked.gather = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> %4, i32 1, <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> undef)
 145   ret <vscale x 8 x i16> %wide.masked.gather
 146 }
 147
 148 define <vscale x 8 x i16> @narrow_i64_gather_index_i16_sext(ptr %out, ptr %in, <vscale x 8 x i16> %d, i64 %ptr){
 149 ; CHECK-LABEL: narrow_i64_gather_index_i16_sext:
 150 ; CHECK:       // %bb.0:
 151 ; CHECK-NEXT:    ptrue p0.s
 152 ; CHECK-NEXT:    add x8, x1, x2, lsl #1
 153 ; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x1, x2, lsl #1]
 154 ; CHECK-NEXT:    ld1sh { z1.s }, p0/z, [x8, #1, mul vl]
 155 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x1, z0.s, sxtw #1]
 156 ; CHECK-NEXT:    ld1h { z1.s }, p0/z, [x1, z1.s, sxtw #1]
 157 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
 158 ; CHECK-NEXT:    ret
       ; Loads a vector of i16 indices from %in + %ptr (in i16 units), sign-extends
       ; them to i64 and uses them as i16 element offsets from %in for a masked
       ; gather of i16. The expected assembly above loads the indices with ld1sh
       ; and performs the gathers on 32-bit (.s) lanes with `sxtw #1` addressing
       ; rather than on 64-bit lanes. %out and %d are not referenced in the body.
 159   %1 = getelementptr inbounds i16, ptr %in, i64 %ptr
       ; Cast between identical pointer types; %2 addresses the same location as %1.
 160   %2 = bitcast ptr %1 to ptr
 161   %wide.load = load <vscale x 8 x i16>, ptr %2, align 1
       ; Signed widening of the loaded indices to i64.
 162   %3 = sext <vscale x 8 x i16> %wide.load to <vscale x 8 x i64>
       ; One pointer per lane: %in plus the lane's index, in units of i16.
 163   %4 = getelementptr inbounds i16, ptr %in, <vscale x 8 x i64> %3
       ; Gather with alignment 1, an all-true mask and an undef passthru operand.
 164   %wide.masked.gather = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> %4, i32 1, <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> undef)
 165   ret <vscale x 8 x i16> %wide.masked.gather
 166 }
 167
 168 define <vscale x 4 x i32> @no_narrow_i64_gather_index_i32(ptr %out, ptr %in, <vscale x 4 x i32> %d, i64 %ptr){
 169 ; CHECK-LABEL: no_narrow_i64_gather_index_i32:
 170 ; CHECK:       // %bb.0:
 171 ; CHECK-NEXT:    ptrue p0.s
 172 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1, x2, lsl #2]
 173 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1, z0.s, uxtw #2]
 174 ; CHECK-NEXT:    ret
       ; Loads a vector of i32 indices from %in + %ptr (in i32 units), zero-extends
       ; them to i64 and uses them as i32 element offsets from %in for a masked
       ; gather of i32. The expected assembly above is one contiguous ld1w followed
       ; by a single gather on .s lanes with `uxtw #2` addressing.
       ; %out and %d are not referenced in the body.
 175   %1 = getelementptr inbounds i32, ptr %in, i64 %ptr
       ; Cast between identical pointer types; %2 addresses the same location as %1.
 176   %2 = bitcast ptr %1 to ptr
 177   %wide.load = load <vscale x 4 x i32>, ptr %2, align 1
       ; Unsigned widening of the loaded indices to i64.
 178   %3 = zext <vscale x 4 x i32> %wide.load to <vscale x 4 x i64>
       ; One pointer per lane: %in plus the lane's index, in units of i32.
 179   %4 = getelementptr inbounds i32, ptr %in, <vscale x 4 x i64> %3
       ; Gather with alignment 1, an all-true mask and an undef passthru operand.
 180   %wide.masked.gather = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> %4, i32 1, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> undef)
 181   ret <vscale x 4 x i32> %wide.masked.gather
 182 }
 183
 184 define <vscale x 2 x i64> @no_narrow_i64_gather_index_i64(ptr %out, ptr %in, <vscale x 2 x i64> %d, i64 %ptr){
 185 ; CHECK-LABEL: no_narrow_i64_gather_index_i64:
 186 ; CHECK:       // %bb.0:
 187 ; CHECK-NEXT:    ptrue p0.d
 188 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1, x2, lsl #3]
 189 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1, z0.d, lsl #3]
 190 ; CHECK-NEXT:    ret
       ; Loads a vector of i64 indices from %in + %ptr (in i64 units) and uses them
       ; directly, with no extension, as i64 element offsets from %in for a masked
       ; gather of i64. The expected assembly above gathers on 64-bit (.d) lanes
       ; with `lsl #3` addressing. %out and %d are not referenced in the body.
 191   %1 = getelementptr inbounds i64, ptr %in, i64 %ptr
       ; Cast between identical pointer types; %2 addresses the same location as %1.
 192   %2 = bitcast ptr %1 to ptr
 193   %wide.load = load <vscale x 2 x i64>, ptr %2, align 1
       ; One pointer per lane: %in plus the lane's loaded index, in units of i64.
 194   %3 = getelementptr inbounds i64, ptr %in, <vscale x 2 x i64> %wide.load
       ; Gather with alignment 1, an all-true mask and an undef passthru operand.
 195   %wide.masked.gather = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> %3, i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> undef)
 196   ret <vscale x 2 x i64> %wide.masked.gather
 197 }
 198
 199 declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
 200 declare void @llvm.masked.store.nxv2i8(<vscale x 2 x i8>, ptr, i32, <vscale x 2 x i1>)
     ; The two intrinsics above are the ones called by the no_dag_combine_*
     ; functions. The masked gathers below, one per element type (i8, i16, i32,
     ; i64), are called by the narrow_i64_gather_index_* and
     ; no_narrow_i64_gather_index_* functions.
 201 declare <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr>, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
 202 declare <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr>, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
 203 declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
 204 declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)