llvm/test/CodeGen/Thumb2/mve-vmulh.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
   3
   4 define arm_aapcs_vfpcc <2 x i32> @vmulhs_v2i32(<2 x i32> %s0, <2 x i32> %s1) {
     ; Signed multiply-high on a 2-lane vector: both inputs are sign-extended to
     ; i64, multiplied, and the upper 32 bits of each product are kept (ashr by
     ; 32, then trunc). The assertions expect vmullb.s32 followed by lane moves
     ; and asrs; no vmulh instruction appears in the expected output.
   5 ; CHECK-LABEL: vmulhs_v2i32:
   6 ; CHECK:       @ %bb.0: @ %entry
   7 ; CHECK-NEXT:    vmullb.s32 q2, q0, q1
   8 ; CHECK-NEXT:    vmov r0, s11
   9 ; CHECK-NEXT:    vmov r1, s9
  10 ; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
  11 ; CHECK-NEXT:    asrs r0, r0, #31
  12 ; CHECK-NEXT:    asrs r1, r1, #31
  13 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
  14 ; CHECK-NEXT:    bx lr
  15 entry:
  16   %s0s = sext <2 x i32> %s0 to <2 x i64>
  17   %s1s = sext <2 x i32> %s1 to <2 x i64>
  18   %m = mul <2 x i64> %s0s, %s1s
  19   %s = ashr <2 x i64> %m, <i64 32, i64 32>
  20   %s2 = trunc <2 x i64> %s to <2 x i32>
  21   ret <2 x i32> %s2
  22 }
  23
  24 define arm_aapcs_vfpcc <2 x i32> @vmulhu_v2i32(<2 x i32> %s0, <2 x i32> %s1) {
     ; Unsigned multiply-high on a 2-lane vector: both inputs are zero-extended
     ; to i64, multiplied, and the upper 32 bits of each product are kept (lshr
     ; by 32, then trunc). The assertions expect vmullb.u32, with s0/s2 copied
     ; from s9/s11 and s1/s3 filled from a zero literal-pool entry (.LCPI1_0).
  25 ; CHECK-LABEL: vmulhu_v2i32:
  26 ; CHECK:       @ %bb.0: @ %entry
  27 ; CHECK-NEXT:    vmullb.u32 q2, q0, q1
  28 ; CHECK-NEXT:    vldr s1, .LCPI1_0
  29 ; CHECK-NEXT:    vmov.f32 s0, s9
  30 ; CHECK-NEXT:    vmov.f32 s2, s11
  31 ; CHECK-NEXT:    vmov.f32 s3, s1
  32 ; CHECK-NEXT:    bx lr
  33 ; CHECK-NEXT:    .p2align 2
  34 ; CHECK-NEXT:  @ %bb.1:
  35 ; CHECK-NEXT:  .LCPI1_0:
  36 ; CHECK-NEXT:    .long 0x00000000 @ float 0
  37 entry:
  38   %s0s = zext <2 x i32> %s0 to <2 x i64>
  39   %s1s = zext <2 x i32> %s1 to <2 x i64>
  40   %m = mul <2 x i64> %s0s, %s1s
  41   %s = lshr <2 x i64> %m, <i64 32, i64 32>
  42   %s2 = trunc <2 x i64> %s to <2 x i32>
  43   ret <2 x i32> %s2
  44 }
  45
  46 define arm_aapcs_vfpcc <4 x i32> @vmulhs_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
     ; Signed multiply-high on a full 4 x i32 vector, written as
     ; sext -> mul -> ashr 32 -> trunc. The assertions expect the whole
     ; sequence to be selected as one vmulh.s32.
  47 ; CHECK-LABEL: vmulhs_v4i32:
  48 ; CHECK:       @ %bb.0: @ %entry
  49 ; CHECK-NEXT:    vmulh.s32 q0, q0, q1
  50 ; CHECK-NEXT:    bx lr
  51 entry:
  52   %s0s = sext <4 x i32> %s0 to <4 x i64>
  53   %s1s = sext <4 x i32> %s1 to <4 x i64>
  54   %m = mul <4 x i64> %s0s, %s1s
  55   %s = ashr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
  56   %s2 = trunc <4 x i64> %s to <4 x i32>
  57   ret <4 x i32> %s2
  58 }
  59
  60 define arm_aapcs_vfpcc <4 x i32> @vmulhu_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
     ; Unsigned multiply-high on a full 4 x i32 vector, written as
     ; zext -> mul -> lshr 32 -> trunc. The assertions expect the whole
     ; sequence to be selected as one vmulh.u32.
  61 ; CHECK-LABEL: vmulhu_v4i32:
  62 ; CHECK:       @ %bb.0: @ %entry
  63 ; CHECK-NEXT:    vmulh.u32 q0, q0, q1
  64 ; CHECK-NEXT:    bx lr
  65 entry:
  66   %s0s = zext <4 x i32> %s0 to <4 x i64>
  67   %s1s = zext <4 x i32> %s1 to <4 x i64>
  68   %m = mul <4 x i64> %s0s, %s1s
  69   %s = lshr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
  70   %s2 = trunc <4 x i64> %s to <4 x i32>
  71   ret <4 x i32> %s2
  72 }
  73
  74 define arm_aapcs_vfpcc <4 x i16> @vmulhs_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
     ; Signed multiply-high on a 4 x i16 vector (sext to i32, mul, ashr 16,
     ; trunc). The assertions expect a widening vmullb.s16 followed by
     ; vshr.s32 #16 rather than a vmulh instruction.
  75 ; CHECK-LABEL: vmulhs_v4i16:
  76 ; CHECK:       @ %bb.0: @ %entry
  77 ; CHECK-NEXT:    vmullb.s16 q0, q0, q1
  78 ; CHECK-NEXT:    vshr.s32 q0, q0, #16
  79 ; CHECK-NEXT:    bx lr
  80 entry:
  81   %s0s = sext <4 x i16> %s0 to <4 x i32>
  82   %s1s = sext <4 x i16> %s1 to <4 x i32>
  83   %m = mul <4 x i32> %s0s, %s1s
  84   %s = ashr <4 x i32> %m, <i32 16, i32 16, i32 16, i32 16>
  85   %s2 = trunc <4 x i32> %s to <4 x i16>
  86   ret <4 x i16> %s2
  87 }
  88
  89 define arm_aapcs_vfpcc <4 x i16> @vmulhu_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
     ; Unsigned multiply-high on a 4 x i16 vector (zext to i32, mul, lshr 16,
     ; trunc). The assertions expect a widening vmullb.u16 followed by
     ; vshr.u32 #16 rather than a vmulh instruction.
  90 ; CHECK-LABEL: vmulhu_v4i16:
  91 ; CHECK:       @ %bb.0: @ %entry
  92 ; CHECK-NEXT:    vmullb.u16 q0, q0, q1
  93 ; CHECK-NEXT:    vshr.u32 q0, q0, #16
  94 ; CHECK-NEXT:    bx lr
  95 entry:
  96   %s0s = zext <4 x i16> %s0 to <4 x i32>
  97   %s1s = zext <4 x i16> %s1 to <4 x i32>
  98   %m = mul <4 x i32> %s0s, %s1s
  99   %s = lshr <4 x i32> %m, <i32 16, i32 16, i32 16, i32 16>
 100   %s2 = trunc <4 x i32> %s to <4 x i16>
 101   ret <4 x i16> %s2
 102 }
 103
 104 define arm_aapcs_vfpcc <8 x i16> @vmulhs_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
     ; Signed multiply-high on a full 8 x i16 vector, written as
     ; sext -> mul -> ashr 16 -> trunc. The assertions expect the whole
     ; sequence to be selected as one vmulh.s16.
 105 ; CHECK-LABEL: vmulhs_v8i16:
 106 ; CHECK:       @ %bb.0: @ %entry
 107 ; CHECK-NEXT:    vmulh.s16 q0, q0, q1
 108 ; CHECK-NEXT:    bx lr
 109 entry:
 110   %s0s = sext <8 x i16> %s0 to <8 x i32>
 111   %s1s = sext <8 x i16> %s1 to <8 x i32>
 112   %m = mul <8 x i32> %s0s, %s1s
 113   %s = ashr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 114   %s2 = trunc <8 x i32> %s to <8 x i16>
 115   ret <8 x i16> %s2
 116 }
 117
 118 define arm_aapcs_vfpcc <8 x i16> @vmulhu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
     ; Unsigned multiply-high on a full 8 x i16 vector, written as
     ; zext -> mul -> lshr 16 -> trunc. The assertions expect the whole
     ; sequence to be selected as one vmulh.u16.
 119 ; CHECK-LABEL: vmulhu_v8i16:
 120 ; CHECK:       @ %bb.0: @ %entry
 121 ; CHECK-NEXT:    vmulh.u16 q0, q0, q1
 122 ; CHECK-NEXT:    bx lr
 123 entry:
 124   %s0s = zext <8 x i16> %s0 to <8 x i32>
 125   %s1s = zext <8 x i16> %s1 to <8 x i32>
 126   %m = mul <8 x i32> %s0s, %s1s
 127   %s = lshr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 128   %s2 = trunc <8 x i32> %s to <8 x i16>
 129   ret <8 x i16> %s2
 130 }
 131
 132 define arm_aapcs_vfpcc <8 x i8> @vmulhs_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
     ; Signed multiply-high on an 8 x i8 vector (sext to i16, mul, ashr 8,
     ; trunc). The assertions expect a widening vmullb.s8 followed by
     ; vshr.s16 #8 rather than a vmulh instruction.
 133 ; CHECK-LABEL: vmulhs_v8i8:
 134 ; CHECK:       @ %bb.0: @ %entry
 135 ; CHECK-NEXT:    vmullb.s8 q0, q0, q1
 136 ; CHECK-NEXT:    vshr.s16 q0, q0, #8
 137 ; CHECK-NEXT:    bx lr
 138 entry:
 139   %s0s = sext <8 x i8> %s0 to <8 x i16>
 140   %s1s = sext <8 x i8> %s1 to <8 x i16>
 141   %m = mul <8 x i16> %s0s, %s1s
 142   %s = ashr <8 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 143   %s2 = trunc <8 x i16> %s to <8 x i8>
 144   ret <8 x i8> %s2
 145 }
 146
 147 define arm_aapcs_vfpcc <8 x i8> @vmulhu_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
     ; Unsigned multiply-high on an 8 x i8 vector (zext to i16, mul, lshr 8,
     ; trunc). The assertions expect a widening vmullb.u8 followed by
     ; vshr.u16 #8 rather than a vmulh instruction.
 148 ; CHECK-LABEL: vmulhu_v8i8:
 149 ; CHECK:       @ %bb.0: @ %entry
 150 ; CHECK-NEXT:    vmullb.u8 q0, q0, q1
 151 ; CHECK-NEXT:    vshr.u16 q0, q0, #8
 152 ; CHECK-NEXT:    bx lr
 153 entry:
 154   %s0s = zext <8 x i8> %s0 to <8 x i16>
 155   %s1s = zext <8 x i8> %s1 to <8 x i16>
 156   %m = mul <8 x i16> %s0s, %s1s
 157   %s = lshr <8 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 158   %s2 = trunc <8 x i16> %s to <8 x i8>
 159   ret <8 x i8> %s2
 160 }
 161
 162 define arm_aapcs_vfpcc <16 x i8> @vmulhs_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
     ; Signed multiply-high on a full 16 x i8 vector, written as
     ; sext -> mul -> ashr 8 -> trunc. The assertions expect the whole
     ; sequence to be selected as one vmulh.s8.
 163 ; CHECK-LABEL: vmulhs_v16i8:
 164 ; CHECK:       @ %bb.0: @ %entry
 165 ; CHECK-NEXT:    vmulh.s8 q0, q0, q1
 166 ; CHECK-NEXT:    bx lr
 167 entry:
 168   %s0s = sext <16 x i8> %s0 to <16 x i16>
 169   %s1s = sext <16 x i8> %s1 to <16 x i16>
 170   %m = mul <16 x i16> %s0s, %s1s
 171   %s = ashr <16 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 172   %s2 = trunc <16 x i16> %s to <16 x i8>
 173   ret <16 x i8> %s2
 174 }
 175
 176 define arm_aapcs_vfpcc <16 x i8> @vmulhu_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
     ; Unsigned multiply-high on a full 16 x i8 vector, written as
     ; zext -> mul -> lshr 8 -> trunc. The assertions expect the whole
     ; sequence to be selected as one vmulh.u8.
 177 ; CHECK-LABEL: vmulhu_v16i8:
 178 ; CHECK:       @ %bb.0: @ %entry
 179 ; CHECK-NEXT:    vmulh.u8 q0, q0, q1
 180 ; CHECK-NEXT:    bx lr
 181 entry:
 182   %s0s = zext <16 x i8> %s0 to <16 x i16>
 183   %s1s = zext <16 x i8> %s1 to <16 x i16>
 184   %m = mul <16 x i16> %s0s, %s1s
 185   %s = lshr <16 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 186   %s2 = trunc <16 x i16> %s to <16 x i8>
 187   ret <16 x i8> %s2
 188 }
 189
 190 define void @vmulh_s8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
     ; Loop form of the signed i8 multiply-high: each iteration loads 16 bytes
     ; from %x and %y, computes the high byte of each signed product, and stores
     ; 16 bytes to %z. %n is not referenced; the index runs from 0 to 1024 in
     ; steps of 16, i.e. 64 iterations, matching the expected "mov.w lr, #64".
     ; The assertions expect a post-incrementing load/vmulh.s8/store loop
     ; closed by "le".
 191 ; CHECK-LABEL: vmulh_s8:
 192 ; CHECK:       @ %bb.0: @ %entry
 193 ; CHECK-NEXT:    .save {r7, lr}
 194 ; CHECK-NEXT:    push {r7, lr}
 195 ; CHECK-NEXT:    mov.w lr, #64
 196 ; CHECK-NEXT:  .LBB12_1: @ %vector.body
 197 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 198 ; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
 199 ; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
 200 ; CHECK-NEXT:    vmulh.s8 q0, q1, q0
 201 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 202 ; CHECK-NEXT:    le lr, .LBB12_1
 203 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 204 ; CHECK-NEXT:    pop {r7, pc}
 205 entry:
 206   br label %vector.body
 207
 208 vector.body:                                      ; preds = %vector.body, %entry
 209   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 210   %0 = getelementptr inbounds i8, i8* %x, i32 %index
 211   %1 = bitcast i8* %0 to <16 x i8>*
 212   %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
 213   %2 = sext <16 x i8> %wide.load to <16 x i16>
 214   %3 = getelementptr inbounds i8, i8* %y, i32 %index
 215   %4 = bitcast i8* %3 to <16 x i8>*
 216   %wide.load17 = load <16 x i8>, <16 x i8>* %4, align 1
 217   %5 = sext <16 x i8> %wide.load17 to <16 x i16>
 218   %6 = mul nsw <16 x i16> %5, %2
     ; A logical shift is used even though the inputs are signed: the trunc
     ; below keeps only bits [15:8] of the product, which lshr and ashr agree on.
 219   %7 = lshr <16 x i16> %6, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 220   %8 = trunc <16 x i16> %7 to <16 x i8>
 221   %9 = getelementptr inbounds i8, i8* %z, i32 %index
 222   %10 = bitcast i8* %9 to <16 x i8>*
 223   store <16 x i8> %8, <16 x i8>* %10, align 1
 224   %index.next = add i32 %index, 16
 225   %11 = icmp eq i32 %index.next, 1024
 226   br i1 %11, label %for.cond.cleanup, label %vector.body
 227
 228 for.cond.cleanup:                                 ; preds = %vector.body
 229   ret void
 230 }
 231
 232 define void @vmulh_s16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
     ; Loop form of the signed i16 multiply-high: each iteration loads 8
     ; halfwords from %x and %y, computes the high half of each signed product,
     ; and stores 8 halfwords to %z. %n is not referenced; the index runs from 0
     ; to 1024 in steps of 8, i.e. 128 iterations, matching the expected
     ; "mov.w lr, #128". The assertions expect vmulh.s16 inside an "le" loop.
 233 ; CHECK-LABEL: vmulh_s16:
 234 ; CHECK:       @ %bb.0: @ %entry
 235 ; CHECK-NEXT:    .save {r7, lr}
 236 ; CHECK-NEXT:    push {r7, lr}
 237 ; CHECK-NEXT:    mov.w lr, #128
 238 ; CHECK-NEXT:  .LBB13_1: @ %vector.body
 239 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 240 ; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
 241 ; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
 242 ; CHECK-NEXT:    vmulh.s16 q0, q1, q0
 243 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 244 ; CHECK-NEXT:    le lr, .LBB13_1
 245 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 246 ; CHECK-NEXT:    pop {r7, pc}
 247 entry:
 248   br label %vector.body
 249
 250 vector.body:                                      ; preds = %vector.body, %entry
 251   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 252   %0 = getelementptr inbounds i16, i16* %x, i32 %index
 253   %1 = bitcast i16* %0 to <8 x i16>*
 254   %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
 255   %2 = sext <8 x i16> %wide.load to <8 x i32>
 256   %3 = getelementptr inbounds i16, i16* %y, i32 %index
 257   %4 = bitcast i16* %3 to <8 x i16>*
 258   %wide.load17 = load <8 x i16>, <8 x i16>* %4, align 2
 259   %5 = sext <8 x i16> %wide.load17 to <8 x i32>
 260   %6 = mul nsw <8 x i32> %5, %2
     ; A logical shift is used even though the inputs are signed: the trunc
     ; below keeps only bits [31:16] of the product, which lshr and ashr agree on.
 261   %7 = lshr <8 x i32> %6, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 262   %8 = trunc <8 x i32> %7 to <8 x i16>
 263   %9 = getelementptr inbounds i16, i16* %z, i32 %index
 264   %10 = bitcast i16* %9 to <8 x i16>*
 265   store <8 x i16> %8, <8 x i16>* %10, align 2
 266   %index.next = add i32 %index, 8
 267   %11 = icmp eq i32 %index.next, 1024
 268   br i1 %11, label %for.cond.cleanup, label %vector.body
 269
 270 for.cond.cleanup:                                 ; preds = %vector.body
 271   ret void
 272 }
 273
 274 define void @vmulh_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
     ; Loop form of the signed i32 multiply-high: each iteration loads 4 words
     ; from %x and %y, computes the high word of each signed product, and stores
     ; 4 words to %z. %n is not referenced; the index runs from 0 to 1024 in
     ; steps of 4, i.e. 256 iterations, matching the expected "mov.w lr, #256".
     ; The assertions expect vmulh.s32 inside an "le" loop.
 275 ; CHECK-LABEL: vmulh_s32:
 276 ; CHECK:       @ %bb.0: @ %entry
 277 ; CHECK-NEXT:    .save {r7, lr}
 278 ; CHECK-NEXT:    push {r7, lr}
 279 ; CHECK-NEXT:    mov.w lr, #256
 280 ; CHECK-NEXT:  .LBB14_1: @ %vector.body
 281 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 282 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 283 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 284 ; CHECK-NEXT:    vmulh.s32 q0, q1, q0
 285 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 286 ; CHECK-NEXT:    le lr, .LBB14_1
 287 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 288 ; CHECK-NEXT:    pop {r7, pc}
 289 entry:
 290   br label %vector.body
 291
 292 vector.body:                                      ; preds = %vector.body, %entry
 293   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 294   %0 = getelementptr inbounds i32, i32* %x, i32 %index
 295   %1 = bitcast i32* %0 to <4 x i32>*
 296   %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
 297   %2 = sext <4 x i32> %wide.load to <4 x i64>
 298   %3 = getelementptr inbounds i32, i32* %y, i32 %index
 299   %4 = bitcast i32* %3 to <4 x i32>*
 300   %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4
 301   %5 = sext <4 x i32> %wide.load17 to <4 x i64>
 302   %6 = mul nsw <4 x i64> %5, %2
     ; A logical shift is used even though the inputs are signed: the trunc
     ; below keeps only bits [63:32] of the product, which lshr and ashr agree on.
 303   %7 = lshr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
 304   %8 = trunc <4 x i64> %7 to <4 x i32>
 305   %9 = getelementptr inbounds i32, i32* %z, i32 %index
 306   %10 = bitcast i32* %9 to <4 x i32>*
 307   store <4 x i32> %8, <4 x i32>* %10, align 4
 308   %index.next = add i32 %index, 4
 309   %11 = icmp eq i32 %index.next, 1024
 310   br i1 %11, label %for.cond.cleanup, label %vector.body
 311
 312 for.cond.cleanup:                                 ; preds = %vector.body
 313   ret void
 314 }
 315
 316 define void @vmulh_u8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
     ; Loop form of the unsigned i8 multiply-high: each iteration loads 16 bytes
     ; from %x and %y, computes the high byte of each unsigned product (zext,
     ; mul, lshr 8, trunc), and stores 16 bytes to %z. %n is not referenced; the
     ; index runs from 0 to 1024 in steps of 16, i.e. 64 iterations, matching
     ; the expected "mov.w lr, #64". The assertions expect vmulh.u8 inside an
     ; "le" loop.
 317 ; CHECK-LABEL: vmulh_u8:
 318 ; CHECK:       @ %bb.0: @ %entry
 319 ; CHECK-NEXT:    .save {r7, lr}
 320 ; CHECK-NEXT:    push {r7, lr}
 321 ; CHECK-NEXT:    mov.w lr, #64
 322 ; CHECK-NEXT:  .LBB15_1: @ %vector.body
 323 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 324 ; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
 325 ; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
 326 ; CHECK-NEXT:    vmulh.u8 q0, q1, q0
 327 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 328 ; CHECK-NEXT:    le lr, .LBB15_1
 329 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 330 ; CHECK-NEXT:    pop {r7, pc}
 331 entry:
 332   br label %vector.body
 333
 334 vector.body:                                      ; preds = %vector.body, %entry
 335   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 336   %0 = getelementptr inbounds i8, i8* %x, i32 %index
 337   %1 = bitcast i8* %0 to <16 x i8>*
 338   %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
 339   %2 = zext <16 x i8> %wide.load to <16 x i16>
 340   %3 = getelementptr inbounds i8, i8* %y, i32 %index
 341   %4 = bitcast i8* %3 to <16 x i8>*
 342   %wide.load17 = load <16 x i8>, <16 x i8>* %4, align 1
 343   %5 = zext <16 x i8> %wide.load17 to <16 x i16>
 344   %6 = mul nuw <16 x i16> %5, %2
 345   %7 = lshr <16 x i16> %6, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 346   %8 = trunc <16 x i16> %7 to <16 x i8>
 347   %9 = getelementptr inbounds i8, i8* %z, i32 %index
 348   %10 = bitcast i8* %9 to <16 x i8>*
 349   store <16 x i8> %8, <16 x i8>* %10, align 1
 350   %index.next = add i32 %index, 16
 351   %11 = icmp eq i32 %index.next, 1024
 352   br i1 %11, label %for.cond.cleanup, label %vector.body
 353
 354 for.cond.cleanup:                                 ; preds = %vector.body
 355   ret void
 356 }
 357
 358 define void @vmulh_u16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
     ; Loop form of the unsigned i16 multiply-high: each iteration loads 8
     ; halfwords from %x and %y, computes the high half of each unsigned product
     ; (zext, mul, lshr 16, trunc), and stores 8 halfwords to %z. %n is not
     ; referenced; the index runs from 0 to 1024 in steps of 8, i.e. 128
     ; iterations, matching the expected "mov.w lr, #128". The assertions
     ; expect vmulh.u16 inside an "le" loop.
 359 ; CHECK-LABEL: vmulh_u16:
 360 ; CHECK:       @ %bb.0: @ %entry
 361 ; CHECK-NEXT:    .save {r7, lr}
 362 ; CHECK-NEXT:    push {r7, lr}
 363 ; CHECK-NEXT:    mov.w lr, #128
 364 ; CHECK-NEXT:  .LBB16_1: @ %vector.body
 365 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 366 ; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
 367 ; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
 368 ; CHECK-NEXT:    vmulh.u16 q0, q1, q0
 369 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 370 ; CHECK-NEXT:    le lr, .LBB16_1
 371 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 372 ; CHECK-NEXT:    pop {r7, pc}
 373 entry:
 374   br label %vector.body
 375
 376 vector.body:                                      ; preds = %vector.body, %entry
 377   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 378   %0 = getelementptr inbounds i16, i16* %x, i32 %index
 379   %1 = bitcast i16* %0 to <8 x i16>*
 380   %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
 381   %2 = zext <8 x i16> %wide.load to <8 x i32>
 382   %3 = getelementptr inbounds i16, i16* %y, i32 %index
 383   %4 = bitcast i16* %3 to <8 x i16>*
 384   %wide.load17 = load <8 x i16>, <8 x i16>* %4, align 2
 385   %5 = zext <8 x i16> %wide.load17 to <8 x i32>
 386   %6 = mul nuw <8 x i32> %5, %2
 387   %7 = lshr <8 x i32> %6, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 388   %8 = trunc <8 x i32> %7 to <8 x i16>
 389   %9 = getelementptr inbounds i16, i16* %z, i32 %index
 390   %10 = bitcast i16* %9 to <8 x i16>*
 391   store <8 x i16> %8, <8 x i16>* %10, align 2
 392   %index.next = add i32 %index, 8
 393   %11 = icmp eq i32 %index.next, 1024
 394   br i1 %11, label %for.cond.cleanup, label %vector.body
 395
 396 for.cond.cleanup:                                 ; preds = %vector.body
 397   ret void
 398 }
 399
 400 define void @vmulh_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
     ; Loop form of the unsigned i32 multiply-high: each iteration loads 4 words
     ; from %x and %y, computes the high word of each unsigned product (zext,
     ; mul, lshr 32, trunc), and stores 4 words to %z. %n is not referenced; the
     ; index runs from 0 to 1024 in steps of 4, i.e. 256 iterations, matching
     ; the expected "mov.w lr, #256". The assertions expect vmulh.u32 inside an
     ; "le" loop.
 401 ; CHECK-LABEL: vmulh_u32:
 402 ; CHECK:       @ %bb.0: @ %entry
 403 ; CHECK-NEXT:    .save {r7, lr}
 404 ; CHECK-NEXT:    push {r7, lr}
 405 ; CHECK-NEXT:    mov.w lr, #256
 406 ; CHECK-NEXT:  .LBB17_1: @ %vector.body
 407 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 408 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 409 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 410 ; CHECK-NEXT:    vmulh.u32 q0, q1, q0
 411 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 412 ; CHECK-NEXT:    le lr, .LBB17_1
 413 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 414 ; CHECK-NEXT:    pop {r7, pc}
 415 entry:
 416   br label %vector.body
 417
 418 vector.body:                                      ; preds = %vector.body, %entry
 419   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 420   %0 = getelementptr inbounds i32, i32* %x, i32 %index
 421   %1 = bitcast i32* %0 to <4 x i32>*
 422   %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
 423   %2 = zext <4 x i32> %wide.load to <4 x i64>
 424   %3 = getelementptr inbounds i32, i32* %y, i32 %index
 425   %4 = bitcast i32* %3 to <4 x i32>*
 426   %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4
 427   %5 = zext <4 x i32> %wide.load17 to <4 x i64>
 428   %6 = mul nuw <4 x i64> %5, %2
 429   %7 = lshr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
 430   %8 = trunc <4 x i64> %7 to <4 x i32>
 431   %9 = getelementptr inbounds i32, i32* %z, i32 %index
 432   %10 = bitcast i32* %9 to <4 x i32>*
 433   store <4 x i32> %8, <4 x i32>* %10, align 4
 434   %index.next = add i32 %index, 4
 435   %11 = icmp eq i32 %index.next, 1024
 436   br i1 %11, label %for.cond.cleanup, label %vector.body
 437
 438 for.cond.cleanup:                                 ; preds = %vector.body
 439   ret void
 440 }
 441
 442
 443 define void @vmulh_s32_pred(i32* noalias nocapture %d, i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
     ; Predicated loop form of the signed i32 multiply-high. The loop is skipped
     ; when %n is not greater than 0. Otherwise each iteration builds a 4-lane
     ; mask from @llvm.get.active.lane.mask(%index, %n), uses it for both masked
     ; loads (poison passthru) and the masked store to %d, and exits when the
     ; index reaches %n rounded up to a multiple of 4. The assertions expect a
     ; dlstp.32/letp loop around vmulh.s32.
 444 ; CHECK-LABEL: vmulh_s32_pred:
 445 ; CHECK:       @ %bb.0: @ %entry
 446 ; CHECK-NEXT:    .save {r7, lr}
 447 ; CHECK-NEXT:    push {r7, lr}
 448 ; CHECK-NEXT:    cmp r3, #1
 449 ; CHECK-NEXT:    it lt
 450 ; CHECK-NEXT:    poplt {r7, pc}
 451 ; CHECK-NEXT:  .LBB18_1: @ %vector.ph
 452 ; CHECK-NEXT:    dlstp.32 lr, r3
 453 ; CHECK-NEXT:  .LBB18_2: @ %vector.body
 454 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 455 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 456 ; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
 457 ; CHECK-NEXT:    vmulh.s32 q0, q1, q0
 458 ; CHECK-NEXT:    vstrw.32 q0, [r0], #16
 459 ; CHECK-NEXT:    letp lr, .LBB18_2
 460 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 461 ; CHECK-NEXT:    pop {r7, pc}
 462 entry:
 463   %cmp10 = icmp sgt i32 %n, 0
 464   br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
 465
 466 vector.ph:                                        ; preds = %entry
 467   %n.rnd.up = add i32 %n, 3
 468   %n.vec = and i32 %n.rnd.up, -4
 469   br label %vector.body
 470
 471 vector.body:                                      ; preds = %vector.body, %vector.ph
 472   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 473   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
 474   %0 = getelementptr inbounds i32, i32* %x, i32 %index
 475   %1 = bitcast i32* %0 to <4 x i32>*
 476   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
 477   %2 = sext <4 x i32> %wide.masked.load to <4 x i64>
 478   %3 = getelementptr inbounds i32, i32* %y, i32 %index
 479   %4 = bitcast i32* %3 to <4 x i32>*
 480   %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
 481   %5 = sext <4 x i32> %wide.masked.load12 to <4 x i64>
 482   %6 = mul nsw <4 x i64> %5, %2
     ; A logical shift is used even though the inputs are signed: the trunc
     ; below keeps only bits [63:32] of the product, which lshr and ashr agree on.
 483   %7 = lshr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
 484   %8 = trunc <4 x i64> %7 to <4 x i32>
 485   %9 = getelementptr inbounds i32, i32* %d, i32 %index
 486   %10 = bitcast i32* %9 to <4 x i32>*
 487   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %active.lane.mask)
 488   %index.next = add i32 %index, 4
 489   %11 = icmp eq i32 %index.next, %n.vec
 490   br i1 %11, label %for.cond.cleanup, label %vector.body
 491
 492 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 493   ret void
 494 }
 495
 496 define void @vmulh_u32_pred(i32* noalias nocapture %d, i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
     ; Predicated loop form of the unsigned i32 multiply-high. The loop is
     ; skipped when %n is not greater than 0. Otherwise each iteration builds a
     ; 4-lane mask from @llvm.get.active.lane.mask(%index, %n), uses it for both
     ; masked loads (poison passthru) and the masked store to %d, and exits when
     ; the index reaches %n rounded up to a multiple of 4. The assertions expect
     ; a dlstp.32/letp loop around vmulh.u32.
 497 ; CHECK-LABEL: vmulh_u32_pred:
 498 ; CHECK:       @ %bb.0: @ %entry
 499 ; CHECK-NEXT:    .save {r7, lr}
 500 ; CHECK-NEXT:    push {r7, lr}
 501 ; CHECK-NEXT:    cmp r3, #1
 502 ; CHECK-NEXT:    it lt
 503 ; CHECK-NEXT:    poplt {r7, pc}
 504 ; CHECK-NEXT:  .LBB19_1: @ %vector.ph
 505 ; CHECK-NEXT:    dlstp.32 lr, r3
 506 ; CHECK-NEXT:  .LBB19_2: @ %vector.body
 507 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 508 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 509 ; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
 510 ; CHECK-NEXT:    vmulh.u32 q0, q1, q0
 511 ; CHECK-NEXT:    vstrw.32 q0, [r0], #16
 512 ; CHECK-NEXT:    letp lr, .LBB19_2
 513 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 514 ; CHECK-NEXT:    pop {r7, pc}
 515 entry:
 516   %cmp10 = icmp sgt i32 %n, 0
 517   br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
 518
 519 vector.ph:                                        ; preds = %entry
 520   %n.rnd.up = add i32 %n, 3
 521   %n.vec = and i32 %n.rnd.up, -4
 522   br label %vector.body
 523
 524 vector.body:                                      ; preds = %vector.body, %vector.ph
 525   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 526   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
 527   %0 = getelementptr inbounds i32, i32* %x, i32 %index
 528   %1 = bitcast i32* %0 to <4 x i32>*
 529   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
 530   %2 = zext <4 x i32> %wide.masked.load to <4 x i64>
 531   %3 = getelementptr inbounds i32, i32* %y, i32 %index
 532   %4 = bitcast i32* %3 to <4 x i32>*
 533   %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
 534   %5 = zext <4 x i32> %wide.masked.load12 to <4 x i64>
 535   %6 = mul nuw <4 x i64> %5, %2
 536   %7 = lshr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
 537   %8 = trunc <4 x i64> %7 to <4 x i32>
 538   %9 = getelementptr inbounds i32, i32* %d, i32 %index
 539   %10 = bitcast i32* %9 to <4 x i32>*
 540   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %active.lane.mask)
 541   %index.next = add i32 %index, 4
 542   %11 = icmp eq i32 %index.next, %n.vec
 543   br i1 %11, label %for.cond.cleanup, label %vector.body
 544
 545 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 546   ret void
 547 }
 548
 549 define void @vmulh_s16_pred(i16* noalias nocapture %d, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
     ; Predicated loop form of the signed i16 multiply-high. The loop is skipped
     ; when %n is not greater than 0. Otherwise each iteration builds an 8-lane
     ; mask from @llvm.get.active.lane.mask(%index, %n), uses it for both masked
     ; loads (poison passthru) and the masked store to %d, and exits when the
     ; index reaches %n rounded up to a multiple of 8. The assertions expect a
     ; dlstp.16/letp loop around vmulh.s16.
 550 ; CHECK-LABEL: vmulh_s16_pred:
 551 ; CHECK:       @ %bb.0: @ %entry
 552 ; CHECK-NEXT:    .save {r7, lr}
 553 ; CHECK-NEXT:    push {r7, lr}
 554 ; CHECK-NEXT:    cmp r3, #1
 555 ; CHECK-NEXT:    it lt
 556 ; CHECK-NEXT:    poplt {r7, pc}
 557 ; CHECK-NEXT:  .LBB20_1: @ %vector.ph
 558 ; CHECK-NEXT:    dlstp.16 lr, r3
 559 ; CHECK-NEXT:  .LBB20_2: @ %vector.body
 560 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 561 ; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
 562 ; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
 563 ; CHECK-NEXT:    vmulh.s16 q0, q1, q0
 564 ; CHECK-NEXT:    vstrh.16 q0, [r0], #16
 565 ; CHECK-NEXT:    letp lr, .LBB20_2
 566 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 567 ; CHECK-NEXT:    pop {r7, pc}
 568 entry:
 569   %cmp10 = icmp sgt i32 %n, 0
 570   br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
 571
 572 vector.ph:                                        ; preds = %entry
 573   %n.rnd.up = add i32 %n, 7
 574   %n.vec = and i32 %n.rnd.up, -8
 575   br label %vector.body
 576
 577 vector.body:                                      ; preds = %vector.body, %vector.ph
 578   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 579   %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
 580   %0 = getelementptr inbounds i16, i16* %x, i32 %index
 581   %1 = bitcast i16* %0 to <8 x i16>*
 582   %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
 583   %2 = sext <8 x i16> %wide.masked.load to <8 x i32>
 584   %3 = getelementptr inbounds i16, i16* %y, i32 %index
 585   %4 = bitcast i16* %3 to <8 x i16>*
 586   %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
 587   %5 = sext <8 x i16> %wide.masked.load12 to <8 x i32>
 588   %6 = mul nsw <8 x i32> %5, %2
     ; A logical shift is used even though the inputs are signed: the trunc
     ; below keeps only bits [31:16] of the product, which lshr and ashr agree on.
 589   %7 = lshr <8 x i32> %6, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 590   %8 = trunc <8 x i32> %7 to <8 x i16>
 591   %9 = getelementptr inbounds i16, i16* %d, i32 %index
 592   %10 = bitcast i16* %9 to <8 x i16>*
 593   call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %8, <8 x i16>* %10, i32 2, <8 x i1> %active.lane.mask)
 594   %index.next = add i32 %index, 8
 595   %11 = icmp eq i32 %index.next, %n.vec
 596   br i1 %11, label %for.cond.cleanup, label %vector.body
 597
 598 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 599   ret void
 600 }
 601
 602 define void @vmulh_u16_pred(i16* noalias nocapture %d, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
     ; Predicated loop form of the unsigned i16 multiply-high. The loop is
     ; skipped when %n is not greater than 0. Otherwise each iteration builds an
     ; 8-lane mask from @llvm.get.active.lane.mask(%index, %n), uses it for both
     ; masked loads (poison passthru) and the masked store to %d, and exits when
     ; the index reaches %n rounded up to a multiple of 8. The assertions expect
     ; a dlstp.16/letp loop around vmulh.u16.
 603 ; CHECK-LABEL: vmulh_u16_pred:
 604 ; CHECK:       @ %bb.0: @ %entry
 605 ; CHECK-NEXT:    .save {r7, lr}
 606 ; CHECK-NEXT:    push {r7, lr}
 607 ; CHECK-NEXT:    cmp r3, #1
 608 ; CHECK-NEXT:    it lt
 609 ; CHECK-NEXT:    poplt {r7, pc}
 610 ; CHECK-NEXT:  .LBB21_1: @ %vector.ph
 611 ; CHECK-NEXT:    dlstp.16 lr, r3
 612 ; CHECK-NEXT:  .LBB21_2: @ %vector.body
 613 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 614 ; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
 615 ; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
 616 ; CHECK-NEXT:    vmulh.u16 q0, q1, q0
 617 ; CHECK-NEXT:    vstrh.16 q0, [r0], #16
 618 ; CHECK-NEXT:    letp lr, .LBB21_2
 619 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 620 ; CHECK-NEXT:    pop {r7, pc}
 621 entry:
 622   %cmp10 = icmp sgt i32 %n, 0
 623   br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
 624
 625 vector.ph:                                        ; preds = %entry
 626   %n.rnd.up = add i32 %n, 7
 627   %n.vec = and i32 %n.rnd.up, -8
 628   br label %vector.body
 629
 630 vector.body:                                      ; preds = %vector.body, %vector.ph
 631   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 632   %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
 633   %0 = getelementptr inbounds i16, i16* %x, i32 %index
 634   %1 = bitcast i16* %0 to <8 x i16>*
 635   %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
 636   %2 = zext <8 x i16> %wide.masked.load to <8 x i32>
 637   %3 = getelementptr inbounds i16, i16* %y, i32 %index
 638   %4 = bitcast i16* %3 to <8 x i16>*
 639   %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
 640   %5 = zext <8 x i16> %wide.masked.load12 to <8 x i32>
 641   %6 = mul nuw <8 x i32> %5, %2
 642   %7 = lshr <8 x i32> %6, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 643   %8 = trunc <8 x i32> %7 to <8 x i16>
 644   %9 = getelementptr inbounds i16, i16* %d, i32 %index
 645   %10 = bitcast i16* %9 to <8 x i16>*
 646   call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %8, <8 x i16>* %10, i32 2, <8 x i1> %active.lane.mask)
 647   %index.next = add i32 %index, 8
 648   %11 = icmp eq i32 %index.next, %n.vec
 649   br i1 %11, label %for.cond.cleanup, label %vector.body
 650
 651 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 652   ret void
 653 }
 654
 655 define void @vmulh_s8_pred(i8* noalias nocapture %d, i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
     ; Predicated loop form of the signed i8 multiply-high. The loop is skipped
     ; when %n is not greater than 0. Otherwise each iteration builds a 16-lane
     ; mask from @llvm.get.active.lane.mask(%index, %n), uses it for both masked
     ; loads (poison passthru) and the masked store to %d, and exits when the
     ; index reaches %n rounded up to a multiple of 16. The assertions expect a
     ; dlstp.8/letp loop around vmulh.s8.
 656 ; CHECK-LABEL: vmulh_s8_pred:
 657 ; CHECK:       @ %bb.0: @ %entry
 658 ; CHECK-NEXT:    .save {r7, lr}
 659 ; CHECK-NEXT:    push {r7, lr}
 660 ; CHECK-NEXT:    cmp r3, #1
 661 ; CHECK-NEXT:    it lt
 662 ; CHECK-NEXT:    poplt {r7, pc}
 663 ; CHECK-NEXT:  .LBB22_1: @ %vector.ph
 664 ; CHECK-NEXT:    dlstp.8 lr, r3
 665 ; CHECK-NEXT:  .LBB22_2: @ %vector.body
 666 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 667 ; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
 668 ; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
 669 ; CHECK-NEXT:    vmulh.s8 q0, q1, q0
 670 ; CHECK-NEXT:    vstrb.8 q0, [r0], #16
 671 ; CHECK-NEXT:    letp lr, .LBB22_2
 672 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 673 ; CHECK-NEXT:    pop {r7, pc}
 674 entry:
 675   %cmp10 = icmp sgt i32 %n, 0
 676   br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
 677
 678 vector.ph:                                        ; preds = %entry
 679   %n.rnd.up = add i32 %n, 15
 680   %n.vec = and i32 %n.rnd.up, -16
 681   br label %vector.body
 682
 683 vector.body:                                      ; preds = %vector.body, %vector.ph
 684   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 685   %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
 686   %0 = getelementptr inbounds i8, i8* %x, i32 %index
 687   %1 = bitcast i8* %0 to <16 x i8>*
 688   %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
 689   %2 = sext <16 x i8> %wide.masked.load to <16 x i16>
 690   %3 = getelementptr inbounds i8, i8* %y, i32 %index
 691   %4 = bitcast i8* %3 to <16 x i8>*
 692   %wide.masked.load12 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
 693   %5 = sext <16 x i8> %wide.masked.load12 to <16 x i16>
 694   %6 = mul nsw <16 x i16> %5, %2
     ; A logical shift is used even though the inputs are signed: the trunc
     ; below keeps only bits [15:8] of the product, which lshr and ashr agree on.
 695   %7 = lshr <16 x i16> %6, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 696   %8 = trunc <16 x i16> %7 to <16 x i8>
 697   %9 = getelementptr inbounds i8, i8* %d, i32 %index
 698   %10 = bitcast i8* %9 to <16 x i8>*
 699   call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %8, <16 x i8>* %10, i32 1, <16 x i1> %active.lane.mask)
 700   %index.next = add i32 %index, 16
 701   %11 = icmp eq i32 %index.next, %n.vec
 702   br i1 %11, label %for.cond.cleanup, label %vector.body
 703
 704 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 705   ret void
 706 }
 707
 708 define void @vmulh_u8_pred(i8* noalias nocapture %d, i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
     ; Predicated loop form of the unsigned i8 multiply-high. The loop is
     ; skipped when %n is not greater than 0. Otherwise each iteration builds a
     ; 16-lane mask from @llvm.get.active.lane.mask(%index, %n), uses it for
     ; both masked loads (poison passthru) and the masked store to %d, and exits
     ; when the index reaches %n rounded up to a multiple of 16. The assertions
     ; expect a dlstp.8/letp loop around vmulh.u8.
 709 ; CHECK-LABEL: vmulh_u8_pred:
 710 ; CHECK:       @ %bb.0: @ %entry
 711 ; CHECK-NEXT:    .save {r7, lr}
 712 ; CHECK-NEXT:    push {r7, lr}
 713 ; CHECK-NEXT:    cmp r3, #1
 714 ; CHECK-NEXT:    it lt
 715 ; CHECK-NEXT:    poplt {r7, pc}
 716 ; CHECK-NEXT:  .LBB23_1: @ %vector.ph
 717 ; CHECK-NEXT:    dlstp.8 lr, r3
 718 ; CHECK-NEXT:  .LBB23_2: @ %vector.body
 719 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 720 ; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
 721 ; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
 722 ; CHECK-NEXT:    vmulh.u8 q0, q1, q0
 723 ; CHECK-NEXT:    vstrb.8 q0, [r0], #16
 724 ; CHECK-NEXT:    letp lr, .LBB23_2
 725 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 726 ; CHECK-NEXT:    pop {r7, pc}
 727 entry:
 728   %cmp10 = icmp sgt i32 %n, 0
 729   br i1 %cmp10, label %vector.ph, label %for.cond.cleanup
 730
 731 vector.ph:                                        ; preds = %entry
 732   %n.rnd.up = add i32 %n, 15
 733   %n.vec = and i32 %n.rnd.up, -16
 734   br label %vector.body
 735
 736 vector.body:                                      ; preds = %vector.body, %vector.ph
 737   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 738   %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
 739   %0 = getelementptr inbounds i8, i8* %x, i32 %index
 740   %1 = bitcast i8* %0 to <16 x i8>*
 741   %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
 742   %2 = zext <16 x i8> %wide.masked.load to <16 x i16>
 743   %3 = getelementptr inbounds i8, i8* %y, i32 %index
 744   %4 = bitcast i8* %3 to <16 x i8>*
 745   %wide.masked.load12 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
 746   %5 = zext <16 x i8> %wide.masked.load12 to <16 x i16>
 747   %6 = mul nuw <16 x i16> %5, %2
 748   %7 = lshr <16 x i16> %6, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
 749   %8 = trunc <16 x i16> %7 to <16 x i8>
 750   %9 = getelementptr inbounds i8, i8* %d, i32 %index
 751   %10 = bitcast i8* %9 to <16 x i8>*
 752   call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %8, <16 x i8>* %10, i32 1, <16 x i1> %active.lane.mask)
 753   %index.next = add i32 %index, 16
 754   %11 = icmp eq i32 %index.next, %n.vec
 755   br i1 %11, label %for.cond.cleanup, label %vector.body
 756
 757 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 758   ret void
 759 }
 760
 761 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
 762 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
 763 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
 764 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
 765 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
 766 declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
 767 declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
 768 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
 769 declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)