llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s
   3
   4 define arm_aapcs_vfpcc void @thres_i32(i32* %data, i16 zeroext %N, i32 %T) {
     ; Threshold loop over i32 data, written directly on <4 x i32> vectors.
     ; Each iteration loads four lanes from %data, compares every lane against a
     ; splat of %T (signed less-than) and a splat of 0 - %T (signed greater-than),
     ; ORs the two masks, and uses the result as the predicate of a masked store
     ; of zero back to the same address. %N == 0 branches straight to the exit.
     ; The CHECK lines expect both compares to land in a single VPT block: the
     ; emitted conditions are the inverted ones (ge / le) and the store carries
     ; the "e" (else) suffix.
     ; NOTE(review): this symbol is spelled "thres_" while the i16/i8/f32/f16
     ; siblings use "thresh_"; left as-is because CHECK-LABEL matches the name.
   5 ; CHECK-LABEL: thres_i32:
   6 ; CHECK:       @ %bb.0: @ %entry
   7 ; CHECK-NEXT:    .save {r7, lr}
   8 ; CHECK-NEXT:    push {r7, lr}
   9 ; CHECK-NEXT:    cmp r1, #0
  10 ; CHECK-NEXT:    it eq
  11 ; CHECK-NEXT:    popeq {r7, pc}
  12 ; CHECK-NEXT:  .LBB0_1: @ %vector.ph
  13 ; CHECK-NEXT:    mvn r3, #3
  14 ; CHECK-NEXT:    add.w r1, r3, r1, lsl #2
  15 ; CHECK-NEXT:    movs r3, #1
  16 ; CHECK-NEXT:    vmov.i32 q0, #0x0
  17 ; CHECK-NEXT:    add.w lr, r3, r1, lsr #2
  18 ; CHECK-NEXT:    rsbs r1, r2, #0
  19 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
  20 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
  21 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
  22 ; CHECK-NEXT:    vpte.s32 ge, q1, r2
  23 ; CHECK-NEXT:    vcmpt.s32 le, q1, r1
  24 ; CHECK-NEXT:    vstrwe.32 q0, [r0], #16
  25 ; CHECK-NEXT:    le lr, .LBB0_2
  26 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
  27 ; CHECK-NEXT:    pop {r7, pc}
  28 entry:
  29   %conv = zext i16 %N to i32
       ; Loop bound in elements: 4 * N, consumed four lanes per iteration.
  30   %mul = shl nuw nsw i32 %conv, 2
  31   %cmp15 = icmp eq i16 %N, 0
  32   br i1 %cmp15, label %for.cond.cleanup, label %vector.ph
  33
  34 vector.ph:                                        ; preds = %entry
       ; Build the two loop-invariant splats: %T and its negation.
  35   %sub = sub nsw i32 0, %T
  36   %broadcast.splatinsert17 = insertelement <4 x i32> undef, i32 %T, i32 0
  37   %broadcast.splat18 = shufflevector <4 x i32> %broadcast.splatinsert17, <4 x i32> undef, <4 x i32> zeroinitializer
  38   %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %sub, i32 0
  39   %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
  40   br label %vector.body
  41
  42 vector.body:                                      ; preds = %vector.body, %vector.ph
  43   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  44   %0 = getelementptr inbounds i32, i32* %data, i32 %index
  45   %1 = bitcast i32* %0 to <4 x i32>*
  46   %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
       ; Store mask = (x < T) | (x > -T), computed per lane.
  47   %2 = icmp slt <4 x i32> %wide.load, %broadcast.splat18
  48   %3 = icmp sgt <4 x i32> %wide.load, %broadcast.splat20
  49   %4 = or <4 x i1> %2, %3
  50   %5 = bitcast i32* %0 to <4 x i32>*
  51   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> zeroinitializer, <4 x i32>* %5, i32 4, <4 x i1> %4)
  52   %index.next = add i32 %index, 4
  53   %6 = icmp eq i32 %index.next, %mul
  54   br i1 %6, label %for.cond.cleanup, label %vector.body
  55
  56 for.cond.cleanup:                                 ; preds = %vector.body, %entry
  57   ret void
  58 }
  59
  60 define arm_aapcs_vfpcc void @thresh_i16(i16* %data, i16 zeroext %N, i16 signext %T) {
     ; Threshold loop over i16 data on <8 x i16> vectors.
     ; Each iteration loads eight lanes, compares them against a splat of %T
     ; (signed less-than) and a splat of 0 - %T (signed greater-than), ORs the
     ; masks, and stores zero to the selected lanes through a masked store.
     ; %N == 0 branches straight to the exit.
     ; The CHECK lines expect one VPT block with the inverted conditions
     ; (ge / le) and the halfword store carrying the "e" (else) suffix.
  61 ; CHECK-LABEL: thresh_i16:
  62 ; CHECK:       @ %bb.0: @ %entry
  63 ; CHECK-NEXT:    .save {r7, lr}
  64 ; CHECK-NEXT:    push {r7, lr}
  65 ; CHECK-NEXT:    cmp r1, #0
  66 ; CHECK-NEXT:    it eq
  67 ; CHECK-NEXT:    popeq {r7, pc}
  68 ; CHECK-NEXT:  .LBB1_1: @ %vector.ph
  69 ; CHECK-NEXT:    mvn r3, #7
  70 ; CHECK-NEXT:    add.w r1, r3, r1, lsl #3
  71 ; CHECK-NEXT:    movs r3, #1
  72 ; CHECK-NEXT:    vmov.i32 q0, #0x0
  73 ; CHECK-NEXT:    add.w lr, r3, r1, lsr #3
  74 ; CHECK-NEXT:    rsbs r1, r2, #0
  75 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
  76 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
  77 ; CHECK-NEXT:    vldrh.u16 q1, [r0]
  78 ; CHECK-NEXT:    vpte.s16 ge, q1, r2
  79 ; CHECK-NEXT:    vcmpt.s16 le, q1, r1
  80 ; CHECK-NEXT:    vstrhe.16 q0, [r0], #16
  81 ; CHECK-NEXT:    le lr, .LBB1_2
  82 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
  83 ; CHECK-NEXT:    pop {r7, pc}
  84 entry:
  85   %conv2 = zext i16 %N to i32
       ; Loop bound in elements: 8 * N, consumed eight lanes per iteration.
  86   %mul = shl nuw nsw i32 %conv2, 3
  87   %cmp22 = icmp eq i16 %N, 0
  88   br i1 %cmp22, label %for.cond.cleanup, label %vector.ph
  89
  90 vector.ph:                                        ; preds = %entry
       ; Build the two loop-invariant splats: %T and its negation.
  91   %sub = sub i16 0, %T
  92   %broadcast.splatinsert24 = insertelement <8 x i16> undef, i16 %T, i32 0
  93   %broadcast.splat25 = shufflevector <8 x i16> %broadcast.splatinsert24, <8 x i16> undef, <8 x i32> zeroinitializer
  94   %broadcast.splatinsert26 = insertelement <8 x i16> undef, i16 %sub, i32 0
  95   %broadcast.splat27 = shufflevector <8 x i16> %broadcast.splatinsert26, <8 x i16> undef, <8 x i32> zeroinitializer
  96   br label %vector.body
  97
  98 vector.body:                                      ; preds = %vector.body, %vector.ph
  99   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 100   %0 = getelementptr inbounds i16, i16* %data, i32 %index
 101   %1 = bitcast i16* %0 to <8 x i16>*
 102   %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
       ; Store mask = (x < T) | (x > -T), computed per lane.
 103   %2 = icmp slt <8 x i16> %wide.load, %broadcast.splat25
 104   %3 = icmp sgt <8 x i16> %wide.load, %broadcast.splat27
 105   %4 = or <8 x i1> %2, %3
 106   %5 = bitcast i16* %0 to <8 x i16>*
 107   call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> zeroinitializer, <8 x i16>* %5, i32 2, <8 x i1> %4)
 108   %index.next = add i32 %index, 8
 109   %6 = icmp eq i32 %index.next, %mul
 110   br i1 %6, label %for.cond.cleanup, label %vector.body
 111
 112 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 113   ret void
 114 }
 115
 116 define arm_aapcs_vfpcc void @thresh_i8(i8* %data, i16 zeroext %N, i8 signext %T) {
     ; Threshold loop over i8 data on <16 x i8> vectors.
     ; Each iteration loads sixteen lanes, compares them against a splat of %T
     ; (signed less-than) and a splat of 0 - %T (signed greater-than), ORs the
     ; masks, and stores zero to the selected lanes through a masked store.
     ; %N == 0 branches straight to the exit.
     ; The CHECK lines expect one VPT block with the inverted conditions
     ; (ge / le) and the byte store carrying the "e" (else) suffix.
 117 ; CHECK-LABEL: thresh_i8:
 118 ; CHECK:       @ %bb.0: @ %entry
 119 ; CHECK-NEXT:    .save {r7, lr}
 120 ; CHECK-NEXT:    push {r7, lr}
 121 ; CHECK-NEXT:    cmp r1, #0
 122 ; CHECK-NEXT:    it eq
 123 ; CHECK-NEXT:    popeq {r7, pc}
 124 ; CHECK-NEXT:  .LBB2_1: @ %vector.ph
 125 ; CHECK-NEXT:    mvn r3, #15
 126 ; CHECK-NEXT:    add.w r1, r3, r1, lsl #4
 127 ; CHECK-NEXT:    movs r3, #1
 128 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 129 ; CHECK-NEXT:    add.w lr, r3, r1, lsr #4
 130 ; CHECK-NEXT:    rsbs r1, r2, #0
 131 ; CHECK-NEXT:  .LBB2_2: @ %vector.body
 132 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 133 ; CHECK-NEXT:    vldrb.u8 q1, [r0]
 134 ; CHECK-NEXT:    vpte.s8 ge, q1, r2
 135 ; CHECK-NEXT:    vcmpt.s8 le, q1, r1
 136 ; CHECK-NEXT:    vstrbe.8 q0, [r0], #16
 137 ; CHECK-NEXT:    le lr, .LBB2_2
 138 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 139 ; CHECK-NEXT:    pop {r7, pc}
 140 entry:
 141   %conv2 = zext i16 %N to i32
       ; Loop bound in elements: 16 * N, consumed sixteen lanes per iteration.
 142   %mul = shl nuw nsw i32 %conv2, 4
 143   %cmp20 = icmp eq i16 %N, 0
 144   br i1 %cmp20, label %for.cond.cleanup, label %vector.ph
 145
 146 vector.ph:                                        ; preds = %entry
       ; Build the two loop-invariant splats: %T and its negation.
 147   %sub = sub i8 0, %T
 148   %broadcast.splatinsert22 = insertelement <16 x i8> undef, i8 %T, i32 0
 149   %broadcast.splat23 = shufflevector <16 x i8> %broadcast.splatinsert22, <16 x i8> undef, <16 x i32> zeroinitializer
 150   %broadcast.splatinsert24 = insertelement <16 x i8> undef, i8 %sub, i32 0
 151   %broadcast.splat25 = shufflevector <16 x i8> %broadcast.splatinsert24, <16 x i8> undef, <16 x i32> zeroinitializer
 152   br label %vector.body
 153
 154 vector.body:                                      ; preds = %vector.body, %vector.ph
 155   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 156   %0 = getelementptr inbounds i8, i8* %data, i32 %index
 157   %1 = bitcast i8* %0 to <16 x i8>*
 158   %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
       ; Store mask = (x < T) | (x > -T), computed per lane.
 159   %2 = icmp slt <16 x i8> %wide.load, %broadcast.splat23
 160   %3 = icmp sgt <16 x i8> %wide.load, %broadcast.splat25
 161   %4 = or <16 x i1> %2, %3
 162   %5 = bitcast i8* %0 to <16 x i8>*
 163   call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> zeroinitializer, <16 x i8>* %5, i32 1, <16 x i1> %4)
 164   %index.next = add i32 %index, 16
 165   %6 = icmp eq i32 %index.next, %mul
 166   br i1 %6, label %for.cond.cleanup, label %vector.body
 167
 168 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 169   ret void
 170 }
 171
 172 define arm_aapcs_vfpcc void @thresh_f32(float* %data, i16 zeroext %N, float %T) {
     ; Threshold loop over float data on <4 x float> vectors.
     ; Each iteration loads four lanes, compares them against a splat of %T
     ; (fast olt) and a splat of fneg %T (fast ogt), ORs the masks, and stores
     ; zero to the selected lanes through a masked store. %N == 0 branches
     ; straight to the exit.
     ; The CHECK lines expect the threshold to be moved into a GPR, its negation
     ; formed with an eor against #-2147483648 (the sign bit), and both compares
     ; placed in one VPT block with the inverted conditions (ge / le) and the
     ; store carrying the "e" (else) suffix.
 173 ; CHECK-LABEL: thresh_f32:
 174 ; CHECK:       @ %bb.0: @ %entry
 175 ; CHECK-NEXT:    .save {r7, lr}
 176 ; CHECK-NEXT:    push {r7, lr}
 177 ; CHECK-NEXT:    cmp r1, #0
 178 ; CHECK-NEXT:    it eq
 179 ; CHECK-NEXT:    popeq {r7, pc}
 180 ; CHECK-NEXT:  .LBB3_1: @ %vector.ph
 181 ; CHECK-NEXT:    mvn r2, #3
 182 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 183 ; CHECK-NEXT:    movs r2, #1
 184 ; CHECK-NEXT:    add.w lr, r2, r1, lsr #2
 185 ; CHECK-NEXT:    vmov r1, s0
 186 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 187 ; CHECK-NEXT:    eor r2, r1, #-2147483648
 188 ; CHECK-NEXT:  .LBB3_2: @ %vector.body
 189 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 190 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 191 ; CHECK-NEXT:    vpte.f32 ge, q1, r1
 192 ; CHECK-NEXT:    vcmpt.f32 le, q1, r2
 193 ; CHECK-NEXT:    vstrwe.32 q0, [r0], #16
 194 ; CHECK-NEXT:    le lr, .LBB3_2
 195 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 196 ; CHECK-NEXT:    pop {r7, pc}
 197 entry:
 198   %conv = zext i16 %N to i32
       ; Loop bound in elements: 4 * N, consumed four lanes per iteration.
 199   %mul = shl nuw nsw i32 %conv, 2
 200   %cmp15 = icmp eq i16 %N, 0
 201   br i1 %cmp15, label %for.cond.cleanup, label %vector.ph
 202
 203 vector.ph:                                        ; preds = %entry
       ; Build the two loop-invariant splats: %T and its negation.
 204   %fneg = fneg fast float %T
 205   %broadcast.splatinsert17 = insertelement <4 x float> undef, float %T, i32 0
 206   %broadcast.splat18 = shufflevector <4 x float> %broadcast.splatinsert17, <4 x float> undef, <4 x i32> zeroinitializer
 207   %broadcast.splatinsert19 = insertelement <4 x float> undef, float %fneg, i32 0
 208   %broadcast.splat20 = shufflevector <4 x float> %broadcast.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer
 209   br label %vector.body
 210
 211 vector.body:                                      ; preds = %vector.body, %vector.ph
 212   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 213   %0 = getelementptr inbounds float, float* %data, i32 %index
 214   %1 = bitcast float* %0 to <4 x float>*
 215   %wide.load = load <4 x float>, <4 x float>* %1, align 4
       ; Store mask = (x < T) | (x > -T), computed per lane.
 216   %2 = fcmp fast olt <4 x float> %wide.load, %broadcast.splat18
 217   %3 = fcmp fast ogt <4 x float> %wide.load, %broadcast.splat20
 218   %4 = or <4 x i1> %2, %3
 219   %5 = bitcast float* %0 to <4 x float>*
 220   call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> zeroinitializer, <4 x float>* %5, i32 4, <4 x i1> %4)
 221   %index.next = add i32 %index, 4
 222   %6 = icmp eq i32 %index.next, %mul
 223   br i1 %6, label %for.cond.cleanup, label %vector.body
 224
 225 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 226   ret void
 227 }
 228
 229 define arm_aapcs_vfpcc void @thresh_f16(half* %data, i16 zeroext %N, float %T.coerce) {
     ; Threshold loop over half data on <8 x half> vectors.
     ; The threshold arrives as the float argument %T.coerce; the entry block
     ; recovers the half value by bitcasting to i32, truncating to i16 and
     ; bitcasting to half. Each iteration then loads eight lanes, compares them
     ; against a splat of that half (fast olt) and a splat of its fneg (fast
     ; ogt), ORs the masks, and stores zero to the selected lanes through a
     ; masked store. %N == 0 branches straight to the exit.
     ; The CHECK lines expect the negation to be done with vneg.f16, both
     ; thresholds to be held in GPRs, and one VPT block with the inverted
     ; conditions (ge / le) and the store carrying the "e" (else) suffix.
 230 ; CHECK-LABEL: thresh_f16:
 231 ; CHECK:       @ %bb.0: @ %entry
 232 ; CHECK-NEXT:    .save {r7, lr}
 233 ; CHECK-NEXT:    push {r7, lr}
 234 ; CHECK-NEXT:    cmp r1, #0
 235 ; CHECK-NEXT:    it eq
 236 ; CHECK-NEXT:    popeq {r7, pc}
 237 ; CHECK-NEXT:  .LBB4_1: @ %vector.ph
 238 ; CHECK-NEXT:    mvn r3, #7
 239 ; CHECK-NEXT:    add.w r1, r3, r1, lsl #3
 240 ; CHECK-NEXT:    vmov r2, s0
 241 ; CHECK-NEXT:    vneg.f16 s0, s0
 242 ; CHECK-NEXT:    movs r3, #1
 243 ; CHECK-NEXT:    add.w lr, r3, r1, lsr #3
 244 ; CHECK-NEXT:    vmov.f16 r1, s0
 245 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 246 ; CHECK-NEXT:  .LBB4_2: @ %vector.body
 247 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 248 ; CHECK-NEXT:    vldrh.u16 q1, [r0]
 249 ; CHECK-NEXT:    vpte.f16 ge, q1, r2
 250 ; CHECK-NEXT:    vcmpt.f16 le, q1, r1
 251 ; CHECK-NEXT:    vstrhe.16 q0, [r0], #16
 252 ; CHECK-NEXT:    le lr, .LBB4_2
 253 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 254 ; CHECK-NEXT:    pop {r7, pc}
 255 entry:
       ; Recover the half threshold from the low 16 bits of %T.coerce.
 256   %0 = bitcast float %T.coerce to i32
 257   %tmp.0.extract.trunc = trunc i32 %0 to i16
 258   %1 = bitcast i16 %tmp.0.extract.trunc to half
 259   %conv = zext i16 %N to i32
       ; Loop bound in elements: 8 * N, consumed eight lanes per iteration.
 260   %mul = shl nuw nsw i32 %conv, 3
 261   %cmp17 = icmp eq i16 %N, 0
 262   br i1 %cmp17, label %for.cond.cleanup, label %vector.ph
 263
 264 vector.ph:                                        ; preds = %entry
       ; Build the two loop-invariant splats: the threshold and its negation.
 265   %fneg = fneg fast half %1
 266   %broadcast.splatinsert19 = insertelement <8 x half> undef, half %1, i32 0
 267   %broadcast.splat20 = shufflevector <8 x half> %broadcast.splatinsert19, <8 x half> undef, <8 x i32> zeroinitializer
 268   %broadcast.splatinsert21 = insertelement <8 x half> undef, half %fneg, i32 0
 269   %broadcast.splat22 = shufflevector <8 x half> %broadcast.splatinsert21, <8 x half> undef, <8 x i32> zeroinitializer
 270   br label %vector.body
 271
 272 vector.body:                                      ; preds = %vector.body, %vector.ph
 273   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 274   %2 = getelementptr inbounds half, half* %data, i32 %index
 275   %3 = bitcast half* %2 to <8 x half>*
 276   %wide.load = load <8 x half>, <8 x half>* %3, align 2
       ; Store mask = (x < T) | (x > -T), computed per lane.
 277   %4 = fcmp fast olt <8 x half> %wide.load, %broadcast.splat20
 278   %5 = fcmp fast ogt <8 x half> %wide.load, %broadcast.splat22
 279   %6 = or <8 x i1> %4, %5
 280   %7 = bitcast half* %2 to <8 x half>*
 281   call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> zeroinitializer, <8 x half>* %7, i32 2, <8 x i1> %6)
 282   %index.next = add i32 %index, 8
 283   %8 = icmp eq i32 %index.next, %mul
 284   br i1 %8, label %for.cond.cleanup, label %vector.body
 285
 286 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 287   ret void
 288 }
 289
 290
 291
 292 define arm_aapcs_vfpcc void @thres_rev_i32(i32* %data, i16 zeroext %N, i32 %T) {
     ; Operand-reversed form of the i32 threshold loop on <4 x i32> vectors.
     ; The two compares put the splat on the left-hand side (splat(T) sgt x and
     ; splat(0 - T) slt x), which selects the same lanes as x slt T and
     ; x sgt -T. The ORed mask predicates a masked store of zero back to the
     ; loaded address. %N == 0 branches straight to the exit.
     ; The CHECK lines expect the same VPT sequence as @thres_i32 (vpte ge /
     ; vcmpt le / else-predicated store), differing only in the block labels.
     ; NOTE(review): this symbol is spelled "thres_" while the i16/i8/f32/f16
     ; siblings use "thresh_"; left as-is because CHECK-LABEL matches the name.
 293 ; CHECK-LABEL: thres_rev_i32:
 294 ; CHECK:       @ %bb.0: @ %entry
 295 ; CHECK-NEXT:    .save {r7, lr}
 296 ; CHECK-NEXT:    push {r7, lr}
 297 ; CHECK-NEXT:    cmp r1, #0
 298 ; CHECK-NEXT:    it eq
 299 ; CHECK-NEXT:    popeq {r7, pc}
 300 ; CHECK-NEXT:  .LBB5_1: @ %vector.ph
 301 ; CHECK-NEXT:    mvn r3, #3
 302 ; CHECK-NEXT:    add.w r1, r3, r1, lsl #2
 303 ; CHECK-NEXT:    movs r3, #1
 304 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 305 ; CHECK-NEXT:    add.w lr, r3, r1, lsr #2
 306 ; CHECK-NEXT:    rsbs r1, r2, #0
 307 ; CHECK-NEXT:  .LBB5_2: @ %vector.body
 308 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 309 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 310 ; CHECK-NEXT:    vpte.s32 ge, q1, r2
 311 ; CHECK-NEXT:    vcmpt.s32 le, q1, r1
 312 ; CHECK-NEXT:    vstrwe.32 q0, [r0], #16
 313 ; CHECK-NEXT:    le lr, .LBB5_2
 314 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 315 ; CHECK-NEXT:    pop {r7, pc}
 316 entry:
 317   %conv = zext i16 %N to i32
       ; Loop bound in elements: 4 * N, consumed four lanes per iteration.
 318   %mul = shl nuw nsw i32 %conv, 2
 319   %cmp15 = icmp eq i16 %N, 0
 320   br i1 %cmp15, label %for.cond.cleanup, label %vector.ph
 321
 322 vector.ph:                                        ; preds = %entry
       ; Build the two loop-invariant splats: %T and its negation.
 323   %sub = sub nsw i32 0, %T
 324   %broadcast.splatinsert17 = insertelement <4 x i32> undef, i32 %T, i32 0
 325   %broadcast.splat18 = shufflevector <4 x i32> %broadcast.splatinsert17, <4 x i32> undef, <4 x i32> zeroinitializer
 326   %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %sub, i32 0
 327   %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
 328   br label %vector.body
 329
 330 vector.body:                                      ; preds = %vector.body, %vector.ph
 331   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 332   %0 = getelementptr inbounds i32, i32* %data, i32 %index
 333   %1 = bitcast i32* %0 to <4 x i32>*
 334   %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
       ; Store mask = (T > x) | (-T < x): splat operands are on the left here.
 335   %2 = icmp sgt <4 x i32> %broadcast.splat18, %wide.load
 336   %3 = icmp slt <4 x i32> %broadcast.splat20, %wide.load
 337   %4 = or <4 x i1> %2, %3
 338   %5 = bitcast i32* %0 to <4 x i32>*
 339   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> zeroinitializer, <4 x i32>* %5, i32 4, <4 x i1> %4)
 340   %index.next = add i32 %index, 4
 341   %6 = icmp eq i32 %index.next, %mul
 342   br i1 %6, label %for.cond.cleanup, label %vector.body
 343
 344 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 345   ret void
 346 }
 347
 348 define arm_aapcs_vfpcc void @thresh_rev_i16(i16* %data, i16 zeroext %N, i16 signext %T) {
     ; Operand-reversed form of the i16 threshold loop on <8 x i16> vectors.
     ; The two compares put the splat on the left-hand side (splat(T) sgt x and
     ; splat(0 - T) slt x), which selects the same lanes as x slt T and
     ; x sgt -T. The ORed mask predicates a masked store of zero back to the
     ; loaded address. %N == 0 branches straight to the exit.
     ; The CHECK lines expect the same VPT sequence as @thresh_i16 (vpte ge /
     ; vcmpt le / else-predicated store), differing only in the block labels.
 349 ; CHECK-LABEL: thresh_rev_i16:
 350 ; CHECK:       @ %bb.0: @ %entry
 351 ; CHECK-NEXT:    .save {r7, lr}
 352 ; CHECK-NEXT:    push {r7, lr}
 353 ; CHECK-NEXT:    cmp r1, #0
 354 ; CHECK-NEXT:    it eq
 355 ; CHECK-NEXT:    popeq {r7, pc}
 356 ; CHECK-NEXT:  .LBB6_1: @ %vector.ph
 357 ; CHECK-NEXT:    mvn r3, #7
 358 ; CHECK-NEXT:    add.w r1, r3, r1, lsl #3
 359 ; CHECK-NEXT:    movs r3, #1
 360 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 361 ; CHECK-NEXT:    add.w lr, r3, r1, lsr #3
 362 ; CHECK-NEXT:    rsbs r1, r2, #0
 363 ; CHECK-NEXT:  .LBB6_2: @ %vector.body
 364 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 365 ; CHECK-NEXT:    vldrh.u16 q1, [r0]
 366 ; CHECK-NEXT:    vpte.s16 ge, q1, r2
 367 ; CHECK-NEXT:    vcmpt.s16 le, q1, r1
 368 ; CHECK-NEXT:    vstrhe.16 q0, [r0], #16
 369 ; CHECK-NEXT:    le lr, .LBB6_2
 370 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 371 ; CHECK-NEXT:    pop {r7, pc}
 372 entry:
 373   %conv2 = zext i16 %N to i32
       ; Loop bound in elements: 8 * N, consumed eight lanes per iteration.
 374   %mul = shl nuw nsw i32 %conv2, 3
 375   %cmp22 = icmp eq i16 %N, 0
 376   br i1 %cmp22, label %for.cond.cleanup, label %vector.ph
 377
 378 vector.ph:                                        ; preds = %entry
       ; Build the two loop-invariant splats: %T and its negation.
 379   %sub = sub i16 0, %T
 380   %broadcast.splatinsert24 = insertelement <8 x i16> undef, i16 %T, i32 0
 381   %broadcast.splat25 = shufflevector <8 x i16> %broadcast.splatinsert24, <8 x i16> undef, <8 x i32> zeroinitializer
 382   %broadcast.splatinsert26 = insertelement <8 x i16> undef, i16 %sub, i32 0
 383   %broadcast.splat27 = shufflevector <8 x i16> %broadcast.splatinsert26, <8 x i16> undef, <8 x i32> zeroinitializer
 384   br label %vector.body
 385
 386 vector.body:                                      ; preds = %vector.body, %vector.ph
 387   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 388   %0 = getelementptr inbounds i16, i16* %data, i32 %index
 389   %1 = bitcast i16* %0 to <8 x i16>*
 390   %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
       ; Store mask = (T > x) | (-T < x): splat operands are on the left here.
 391   %2 = icmp sgt <8 x i16> %broadcast.splat25, %wide.load
 392   %3 = icmp slt <8 x i16> %broadcast.splat27, %wide.load
 393   %4 = or <8 x i1> %2, %3
 394   %5 = bitcast i16* %0 to <8 x i16>*
 395   call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> zeroinitializer, <8 x i16>* %5, i32 2, <8 x i1> %4)
 396   %index.next = add i32 %index, 8
 397   %6 = icmp eq i32 %index.next, %mul
 398   br i1 %6, label %for.cond.cleanup, label %vector.body
 399
 400 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 401   ret void
 402 }
 403
 404 define arm_aapcs_vfpcc void @thresh_rev_i8(i8* %data, i16 zeroext %N, i8 signext %T) {
     ; Operand-reversed form of the i8 threshold loop on <16 x i8> vectors.
     ; The two compares put the splat on the left-hand side (splat(T) sgt x and
     ; splat(0 - T) slt x), which selects the same lanes as x slt T and
     ; x sgt -T. The ORed mask predicates a masked store of zero back to the
     ; loaded address. %N == 0 branches straight to the exit.
     ; The CHECK lines expect the same VPT sequence as @thresh_i8 (vpte ge /
     ; vcmpt le / else-predicated store), differing only in the block labels.
 405 ; CHECK-LABEL: thresh_rev_i8:
 406 ; CHECK:       @ %bb.0: @ %entry
 407 ; CHECK-NEXT:    .save {r7, lr}
 408 ; CHECK-NEXT:    push {r7, lr}
 409 ; CHECK-NEXT:    cmp r1, #0
 410 ; CHECK-NEXT:    it eq
 411 ; CHECK-NEXT:    popeq {r7, pc}
 412 ; CHECK-NEXT:  .LBB7_1: @ %vector.ph
 413 ; CHECK-NEXT:    mvn r3, #15
 414 ; CHECK-NEXT:    add.w r1, r3, r1, lsl #4
 415 ; CHECK-NEXT:    movs r3, #1
 416 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 417 ; CHECK-NEXT:    add.w lr, r3, r1, lsr #4
 418 ; CHECK-NEXT:    rsbs r1, r2, #0
 419 ; CHECK-NEXT:  .LBB7_2: @ %vector.body
 420 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 421 ; CHECK-NEXT:    vldrb.u8 q1, [r0]
 422 ; CHECK-NEXT:    vpte.s8 ge, q1, r2
 423 ; CHECK-NEXT:    vcmpt.s8 le, q1, r1
 424 ; CHECK-NEXT:    vstrbe.8 q0, [r0], #16
 425 ; CHECK-NEXT:    le lr, .LBB7_2
 426 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 427 ; CHECK-NEXT:    pop {r7, pc}
 428 entry:
 429   %conv2 = zext i16 %N to i32
       ; Loop bound in elements: 16 * N, consumed sixteen lanes per iteration.
 430   %mul = shl nuw nsw i32 %conv2, 4
 431   %cmp20 = icmp eq i16 %N, 0
 432   br i1 %cmp20, label %for.cond.cleanup, label %vector.ph
 433
 434 vector.ph:                                        ; preds = %entry
       ; Build the two loop-invariant splats: %T and its negation.
 435   %sub = sub i8 0, %T
 436   %broadcast.splatinsert22 = insertelement <16 x i8> undef, i8 %T, i32 0
 437   %broadcast.splat23 = shufflevector <16 x i8> %broadcast.splatinsert22, <16 x i8> undef, <16 x i32> zeroinitializer
 438   %broadcast.splatinsert24 = insertelement <16 x i8> undef, i8 %sub, i32 0
 439   %broadcast.splat25 = shufflevector <16 x i8> %broadcast.splatinsert24, <16 x i8> undef, <16 x i32> zeroinitializer
 440   br label %vector.body
 441
 442 vector.body:                                      ; preds = %vector.body, %vector.ph
 443   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 444   %0 = getelementptr inbounds i8, i8* %data, i32 %index
 445   %1 = bitcast i8* %0 to <16 x i8>*
 446   %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
       ; Store mask = (T > x) | (-T < x): splat operands are on the left here.
 447   %2 = icmp sgt <16 x i8> %broadcast.splat23, %wide.load
 448   %3 = icmp slt <16 x i8> %broadcast.splat25, %wide.load
 449   %4 = or <16 x i1> %2, %3
 450   %5 = bitcast i8* %0 to <16 x i8>*
 451   call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> zeroinitializer, <16 x i8>* %5, i32 1, <16 x i1> %4)
 452   %index.next = add i32 %index, 16
 453   %6 = icmp eq i32 %index.next, %mul
 454   br i1 %6, label %for.cond.cleanup, label %vector.body
 455
 456 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 457   ret void
 458 }
 459
 460 define arm_aapcs_vfpcc void @thresh_rev_f32(float* %data, i16 zeroext %N, float %T) {
     ; Operand-reversed form of the float threshold loop on <4 x float> vectors.
     ; The two compares put the splat on the left-hand side (splat(T) fast ogt x
     ; and splat(fneg T) fast olt x), which selects the same lanes as x olt T
     ; and x ogt -T. The ORed mask predicates a masked store of zero back to the
     ; loaded address. %N == 0 branches straight to the exit.
     ; The CHECK lines expect the same sequence as @thresh_f32 (sign-bit eor for
     ; the negation, then vpte ge / vcmpt le / else-predicated store), differing
     ; only in the block labels.
 461 ; CHECK-LABEL: thresh_rev_f32:
 462 ; CHECK:       @ %bb.0: @ %entry
 463 ; CHECK-NEXT:    .save {r7, lr}
 464 ; CHECK-NEXT:    push {r7, lr}
 465 ; CHECK-NEXT:    cmp r1, #0
 466 ; CHECK-NEXT:    it eq
 467 ; CHECK-NEXT:    popeq {r7, pc}
 468 ; CHECK-NEXT:  .LBB8_1: @ %vector.ph
 469 ; CHECK-NEXT:    mvn r2, #3
 470 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 471 ; CHECK-NEXT:    movs r2, #1
 472 ; CHECK-NEXT:    add.w lr, r2, r1, lsr #2
 473 ; CHECK-NEXT:    vmov r1, s0
 474 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 475 ; CHECK-NEXT:    eor r2, r1, #-2147483648
 476 ; CHECK-NEXT:  .LBB8_2: @ %vector.body
 477 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 478 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 479 ; CHECK-NEXT:    vpte.f32 ge, q1, r1
 480 ; CHECK-NEXT:    vcmpt.f32 le, q1, r2
 481 ; CHECK-NEXT:    vstrwe.32 q0, [r0], #16
 482 ; CHECK-NEXT:    le lr, .LBB8_2
 483 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 484 ; CHECK-NEXT:    pop {r7, pc}
 485 entry:
 486   %conv = zext i16 %N to i32
       ; Loop bound in elements: 4 * N, consumed four lanes per iteration.
 487   %mul = shl nuw nsw i32 %conv, 2
 488   %cmp15 = icmp eq i16 %N, 0
 489   br i1 %cmp15, label %for.cond.cleanup, label %vector.ph
 490
 491 vector.ph:                                        ; preds = %entry
       ; Build the two loop-invariant splats: %T and its negation.
 492   %fneg = fneg fast float %T
 493   %broadcast.splatinsert17 = insertelement <4 x float> undef, float %T, i32 0
 494   %broadcast.splat18 = shufflevector <4 x float> %broadcast.splatinsert17, <4 x float> undef, <4 x i32> zeroinitializer
 495   %broadcast.splatinsert19 = insertelement <4 x float> undef, float %fneg, i32 0
 496   %broadcast.splat20 = shufflevector <4 x float> %broadcast.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer
 497   br label %vector.body
 498
 499 vector.body:                                      ; preds = %vector.body, %vector.ph
 500   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 501   %0 = getelementptr inbounds float, float* %data, i32 %index
 502   %1 = bitcast float* %0 to <4 x float>*
 503   %wide.load = load <4 x float>, <4 x float>* %1, align 4
       ; Store mask = (T > x) | (-T < x): splat operands are on the left here.
 504   %2 = fcmp fast ogt <4 x float> %broadcast.splat18, %wide.load
 505   %3 = fcmp fast olt <4 x float> %broadcast.splat20, %wide.load
 506   %4 = or <4 x i1> %2, %3
 507   %5 = bitcast float* %0 to <4 x float>*
 508   call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> zeroinitializer, <4 x float>* %5, i32 4, <4 x i1> %4)
 509   %index.next = add i32 %index, 4
 510   %6 = icmp eq i32 %index.next, %mul
 511   br i1 %6, label %for.cond.cleanup, label %vector.body
 512
 513 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 514   ret void
 515 }
 516
 517 define arm_aapcs_vfpcc void @thresh_rev_f16(half* %data, i16 zeroext %N, float %T.coerce) {
     ; Operand-reversed form of the half threshold loop on <8 x half> vectors.
     ; The threshold arrives as the float argument %T.coerce; the entry block
     ; recovers the half value by bitcasting to i32, truncating to i16 and
     ; bitcasting to half. The two compares put the splat on the left-hand side
     ; (splat(T) fast ogt x and splat(fneg T) fast olt x), which selects the
     ; same lanes as x olt T and x ogt -T. The ORed mask predicates a masked
     ; store of zero back to the loaded address. %N == 0 branches straight to
     ; the exit.
     ; The CHECK lines expect the same sequence as @thresh_f16 (vneg.f16 for the
     ; negation, then vpte ge / vcmpt le / else-predicated store), differing
     ; only in the block labels.
 518 ; CHECK-LABEL: thresh_rev_f16:
 519 ; CHECK:       @ %bb.0: @ %entry
 520 ; CHECK-NEXT:    .save {r7, lr}
 521 ; CHECK-NEXT:    push {r7, lr}
 522 ; CHECK-NEXT:    cmp r1, #0
 523 ; CHECK-NEXT:    it eq
 524 ; CHECK-NEXT:    popeq {r7, pc}
 525 ; CHECK-NEXT:  .LBB9_1: @ %vector.ph
 526 ; CHECK-NEXT:    mvn r3, #7
 527 ; CHECK-NEXT:    add.w r1, r3, r1, lsl #3
 528 ; CHECK-NEXT:    vmov r2, s0
 529 ; CHECK-NEXT:    vneg.f16 s0, s0
 530 ; CHECK-NEXT:    movs r3, #1
 531 ; CHECK-NEXT:    add.w lr, r3, r1, lsr #3
 532 ; CHECK-NEXT:    vmov.f16 r1, s0
 533 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 534 ; CHECK-NEXT:  .LBB9_2: @ %vector.body
 535 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 536 ; CHECK-NEXT:    vldrh.u16 q1, [r0]
 537 ; CHECK-NEXT:    vpte.f16 ge, q1, r2
 538 ; CHECK-NEXT:    vcmpt.f16 le, q1, r1
 539 ; CHECK-NEXT:    vstrhe.16 q0, [r0], #16
 540 ; CHECK-NEXT:    le lr, .LBB9_2
 541 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 542 ; CHECK-NEXT:    pop {r7, pc}
 543 entry:
       ; Recover the half threshold from the low 16 bits of %T.coerce.
 544   %0 = bitcast float %T.coerce to i32
 545   %tmp.0.extract.trunc = trunc i32 %0 to i16
 546   %1 = bitcast i16 %tmp.0.extract.trunc to half
 547   %conv = zext i16 %N to i32
       ; Loop bound in elements: 8 * N, consumed eight lanes per iteration.
 548   %mul = shl nuw nsw i32 %conv, 3
 549   %cmp17 = icmp eq i16 %N, 0
 550   br i1 %cmp17, label %for.cond.cleanup, label %vector.ph
 551
 552 vector.ph:                                        ; preds = %entry
       ; Build the two loop-invariant splats: the threshold and its negation.
 553   %fneg = fneg fast half %1
 554   %broadcast.splatinsert19 = insertelement <8 x half> undef, half %1, i32 0
 555   %broadcast.splat20 = shufflevector <8 x half> %broadcast.splatinsert19, <8 x half> undef, <8 x i32> zeroinitializer
 556   %broadcast.splatinsert21 = insertelement <8 x half> undef, half %fneg, i32 0
 557   %broadcast.splat22 = shufflevector <8 x half> %broadcast.splatinsert21, <8 x half> undef, <8 x i32> zeroinitializer
 558   br label %vector.body
 559
 560 vector.body:                                      ; preds = %vector.body, %vector.ph
 561   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 562   %2 = getelementptr inbounds half, half* %data, i32 %index
 563   %3 = bitcast half* %2 to <8 x half>*
 564   %wide.load = load <8 x half>, <8 x half>* %3, align 2
       ; Store mask = (T > x) | (-T < x): splat operands are on the left here.
 565   %4 = fcmp fast ogt <8 x half> %broadcast.splat20, %wide.load
 566   %5 = fcmp fast olt <8 x half> %broadcast.splat22, %wide.load
 567   %6 = or <8 x i1> %4, %5
 568   %7 = bitcast half* %2 to <8 x half>*
 569   call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> zeroinitializer, <8 x half>* %7, i32 2, <8 x i1> %6)
 570   %index.next = add i32 %index, 8
 571   %8 = icmp eq i32 %index.next, %mul
 572   br i1 %8, label %for.cond.cleanup, label %vector.body
 573
 574 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 575   ret void
 576 }
 577
 578
 579
 580
 581 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
 582 declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
 583 declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
 584 declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
 585 declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32 immarg, <8 x i1>)
     ; The five declarations above are the llvm.masked.store overloads called by
     ; the test functions in this file, one per vector type under test
     ; (<4 x i32>, <8 x i16>, <16 x i8>, <4 x float>, <8 x half>). Operands are,
     ; in order: value to store, destination pointer, alignment (immarg), and
     ; the <N x i1> lane mask.