llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
   3
   ; to_4: f32 -> f16 conversion loop, 4 elements per iteration.
   ; Each iteration loads <4 x float> from %x, multiplies it by a splat constant
   ; (0x4000CCCCC0000000, roughly 2.1), truncates to <4 x half> and stores to %y.
   ; The expected assembly is a counted loop driven by lr (mov.w lr, #256 ...
   ; le lr), i.e. 1024 elements / 4 per iteration, with post-incremented
   ; load/store addresses.
   4 define void @to_4(float* nocapture readonly %x, half* noalias nocapture %y) {
   5 ; CHECK-LABEL: to_4:
   6 ; CHECK:       @ %bb.0: @ %entry
   7 ; CHECK-NEXT:    .save {r7, lr}
   8 ; CHECK-NEXT:    push {r7, lr}
   9 ; CHECK-NEXT:    mov.w lr, #256
  10 ; CHECK-NEXT:    movw r2, #26214
  11 ; CHECK-NEXT:    movt r2, #16390
  12 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
  13 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
  14 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
  15 ; CHECK-NEXT:    vmul.f32 q0, q0, r2
  16 ; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
  17 ; CHECK-NEXT:    vstrh.32 q0, [r1], #8
  18 ; CHECK-NEXT:    le lr, .LBB0_1
  19 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
  20 ; CHECK-NEXT:    pop {r7, pc}
  21 entry:
  22   br label %vector.body
  23
  24 vector.body:                                      ; preds = %vector.body, %entry
       ; %index is the element offset into both %x and %y.
  25   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  26   %0 = getelementptr inbounds float, float* %x, i32 %index
  27   %1 = bitcast float* %0 to <4 x float>*
  28   %wide.load = load <4 x float>, <4 x float>* %1, align 4
  29   %2 = fmul <4 x float> %wide.load, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
       ; Narrowing conversion under test: f32 -> f16.
  30   %3 = fptrunc <4 x float> %2 to <4 x half>
  31   %4 = getelementptr inbounds half, half* %y, i32 %index
  32   %5 = bitcast half* %4 to <4 x half>*
  33   store <4 x half> %3, <4 x half>* %5, align 2
       ; Advance by the vector width and leave once 1024 elements are done.
  34   %index.next = add i32 %index, 4
  35   %6 = icmp eq i32 %index.next, 1024
  36   br i1 %6, label %for.cond.cleanup, label %vector.body
  37
  38 for.cond.cleanup:                                 ; preds = %vector.body
  39   ret void
  40 }
  41
  ; to_8: f32 -> f16 conversion loop, 8 elements per iteration.
  ; Same computation as a 4-wide f32 -> f16 loop (load, multiply by the splat
  ; constant 0x4000CCCCC0000000, fptrunc, store) but on <8 x float> / <8 x half>.
  ; The expected assembly handles the 8 lanes as two 4-lane pieces: the upper
  ; piece at [r0, #16] / [r1, #8], then the lower piece with post-increment.
  ; The lr loop count is 128 (1024 / 8).
  42 define void @to_8(float* nocapture readonly %x, half* noalias nocapture %y) {
  43 ; CHECK-LABEL: to_8:
  44 ; CHECK:       @ %bb.0: @ %entry
  45 ; CHECK-NEXT:    .save {r7, lr}
  46 ; CHECK-NEXT:    push {r7, lr}
  47 ; CHECK-NEXT:    mov.w lr, #128
  48 ; CHECK-NEXT:    movw r2, #26214
  49 ; CHECK-NEXT:    movt r2, #16390
  50 ; CHECK-NEXT:  .LBB1_1: @ %vector.body
  51 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
  52 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
  53 ; CHECK-NEXT:    vmul.f32 q0, q0, r2
  54 ; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
  55 ; CHECK-NEXT:    vstrh.32 q0, [r1, #8]
  56 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #32
  57 ; CHECK-NEXT:    vmul.f32 q0, q0, r2
  58 ; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
  59 ; CHECK-NEXT:    vstrh.32 q0, [r1], #16
  60 ; CHECK-NEXT:    le lr, .LBB1_1
  61 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
  62 ; CHECK-NEXT:    pop {r7, pc}
  63 entry:
  64   br label %vector.body
  65
  66 vector.body:                                      ; preds = %vector.body, %entry
       ; %index is the element offset into both %x and %y.
  67   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  68   %0 = getelementptr inbounds float, float* %x, i32 %index
  69   %1 = bitcast float* %0 to <8 x float>*
  70   %wide.load = load <8 x float>, <8 x float>* %1, align 4
  71   %2 = fmul <8 x float> %wide.load, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
       ; Narrowing conversion under test: f32 -> f16 on a vector wider than one q register.
  72   %3 = fptrunc <8 x float> %2 to <8 x half>
  73   %4 = getelementptr inbounds half, half* %y, i32 %index
  74   %5 = bitcast half* %4 to <8 x half>*
  75   store <8 x half> %3, <8 x half>* %5, align 2
       ; Advance by the vector width and leave once 1024 elements are done.
  76   %index.next = add i32 %index, 8
  77   %6 = icmp eq i32 %index.next, 1024
  78   br i1 %6, label %for.cond.cleanup, label %vector.body
  79
  80 for.cond.cleanup:                                 ; preds = %vector.body
  81   ret void
  82 }
  83
  ; to_16: f32 -> f16 conversion loop, 16 elements per iteration.
  ; Loads <16 x float> from %x, multiplies by the splat constant
  ; 0x4000CCCCC0000000, truncates to <16 x half> and stores to %y.
  ; The expected assembly processes four 4-lane pieces per iteration (source
  ; offsets #48, #32, #16 and finally a post-incremented load by #64) with an
  ; lr loop count of 64 (1024 / 16).
  84 define void @to_16(float* nocapture readonly %x, half* noalias nocapture %y) {
  85 ; CHECK-LABEL: to_16:
  86 ; CHECK:       @ %bb.0: @ %entry
  87 ; CHECK-NEXT:    .save {r7, lr}
  88 ; CHECK-NEXT:    push {r7, lr}
  89 ; CHECK-NEXT:    mov.w lr, #64
  90 ; CHECK-NEXT:    movw r2, #26214
  91 ; CHECK-NEXT:    movt r2, #16390
  92 ; CHECK-NEXT:  .LBB2_1: @ %vector.body
  93 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
  94 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
  95 ; CHECK-NEXT:    vmul.f32 q0, q0, r2
  96 ; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
  97 ; CHECK-NEXT:    vstrh.32 q0, [r1, #24]
  98 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
  99 ; CHECK-NEXT:    vmul.f32 q0, q0, r2
 100 ; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
 101 ; CHECK-NEXT:    vstrh.32 q0, [r1, #16]
 102 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 103 ; CHECK-NEXT:    vmul.f32 q0, q0, r2
 104 ; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
 105 ; CHECK-NEXT:    vstrh.32 q0, [r1, #8]
 106 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #64
 107 ; CHECK-NEXT:    vmul.f32 q0, q0, r2
 108 ; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
 109 ; CHECK-NEXT:    vstrh.32 q0, [r1], #32
 110 ; CHECK-NEXT:    le lr, .LBB2_1
 111 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 112 ; CHECK-NEXT:    pop {r7, pc}
 113 entry:
 114   br label %vector.body
 115
 116 vector.body:                                      ; preds = %vector.body, %entry
       ; %index is the element offset into both %x and %y.
 117   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 118   %0 = getelementptr inbounds float, float* %x, i32 %index
 119   %1 = bitcast float* %0 to <16 x float>*
 120   %wide.load = load <16 x float>, <16 x float>* %1, align 4
 121   %2 = fmul <16 x float> %wide.load, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
       ; Narrowing conversion under test: f32 -> f16 on a 16-lane vector.
 122   %3 = fptrunc <16 x float> %2 to <16 x half>
 123   %4 = getelementptr inbounds half, half* %y, i32 %index
 124   %5 = bitcast half* %4 to <16 x half>*
 125   store <16 x half> %3, <16 x half>* %5, align 2
       ; Advance by the vector width and leave once 1024 elements are done.
 126   %index.next = add i32 %index, 16
 127   %6 = icmp eq i32 %index.next, 1024
 128   br i1 %6, label %for.cond.cleanup, label %vector.body
 129
 130 for.cond.cleanup:                                 ; preds = %vector.body
 131   ret void
 132 }
 133
 ; from_4: f16 -> f32 conversion loop, 4 elements per iteration.
 ; Each iteration loads <4 x half> from %x, extends to <4 x float>, multiplies
 ; by the splat constant 0x4000CCCCC0000000 (roughly 2.1) and stores to %y.
 ; The expected assembly uses a widening vldrh.u32 load followed by
 ; vcvtb.f32.f16, with an lr loop count of 256 (1024 / 4).
 ; NOTE(review): the expected store here is vstrb.8 while from_8 and from_16
 ; expect vstrw.32 for the same align-4 float store; the lines are
 ; autogenerated, so this is presumably what llc emitted. Confirm if in doubt.
 134 define void @from_4(half* nocapture readonly %x, float* noalias nocapture %y) {
 135 ; CHECK-LABEL: from_4:
 136 ; CHECK:       @ %bb.0: @ %entry
 137 ; CHECK-NEXT:    .save {r7, lr}
 138 ; CHECK-NEXT:    push {r7, lr}
 139 ; CHECK-NEXT:    mov.w lr, #256
 140 ; CHECK-NEXT:    movw r2, #26214
 141 ; CHECK-NEXT:    movt r2, #16390
 142 ; CHECK-NEXT:  .LBB3_1: @ %vector.body
 143 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 144 ; CHECK-NEXT:    vldrh.u32 q0, [r0], #8
 145 ; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
 146 ; CHECK-NEXT:    vmul.f32 q0, q0, r2
 147 ; CHECK-NEXT:    vstrb.8 q0, [r1], #16
 148 ; CHECK-NEXT:    le lr, .LBB3_1
 149 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 150 ; CHECK-NEXT:    pop {r7, pc}
 151 entry:
 152   br label %vector.body
 153
 154 vector.body:                                      ; preds = %vector.body, %entry
       ; %index is the element offset into both %x and %y.
 155   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 156   %0 = getelementptr inbounds half, half* %x, i32 %index
 157   %1 = bitcast half* %0 to <4 x half>*
 158   %wide.load = load <4 x half>, <4 x half>* %1, align 2
       ; Widening conversion under test: f16 -> f32.
 159   %2 = fpext <4 x half> %wide.load to <4 x float>
 160   %3 = fmul <4 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
 161   %4 = getelementptr inbounds float, float* %y, i32 %index
 162   %5 = bitcast float* %4 to <4 x float>*
 163   store <4 x float> %3, <4 x float>* %5, align 4
       ; Advance by the vector width and leave once 1024 elements are done.
 164   %index.next = add i32 %index, 4
 165   %6 = icmp eq i32 %index.next, 1024
 166   br i1 %6, label %for.cond.cleanup, label %vector.body
 167
 168 for.cond.cleanup:                                 ; preds = %vector.body
 169   ret void
 170 }
 171
 ; from_8: f16 -> f32 conversion loop, 8 elements per iteration.
 ; Loads <8 x half> from %x, extends to <8 x float>, multiplies by the splat
 ; constant 0x4000CCCCC0000000 and stores to %y.
 ; The expected assembly issues two widening vldrh.u32 loads (the second one
 ; addressed at [r0, #-8] after the first load's post-increment), converts and
 ; scales each 4-lane piece, and stores with vstrw.32. The lr loop count is
 ; 128 (1024 / 8).
 172 define void @from_8(half* nocapture readonly %x, float* noalias nocapture %y) {
 173 ; CHECK-LABEL: from_8:
 174 ; CHECK:       @ %bb.0: @ %entry
 175 ; CHECK-NEXT:    .save {r7, lr}
 176 ; CHECK-NEXT:    push {r7, lr}
 177 ; CHECK-NEXT:    mov.w lr, #128
 178 ; CHECK-NEXT:    movw r2, #26214
 179 ; CHECK-NEXT:    movt r2, #16390
 180 ; CHECK-NEXT:  .LBB4_1: @ %vector.body
 181 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 182 ; CHECK-NEXT:    vldrh.u32 q0, [r0], #16
 183 ; CHECK-NEXT:    vldrh.u32 q1, [r0, #-8]
 184 ; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
 185 ; CHECK-NEXT:    vmul.f32 q0, q0, r2
 186 ; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
 187 ; CHECK-NEXT:    vmul.f32 q1, q1, r2
 188 ; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
 189 ; CHECK-NEXT:    vstrw.32 q0, [r1], #32
 190 ; CHECK-NEXT:    le lr, .LBB4_1
 191 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 192 ; CHECK-NEXT:    pop {r7, pc}
 193 entry:
 194   br label %vector.body
 195
 196 vector.body:                                      ; preds = %vector.body, %entry
       ; %index is the element offset into both %x and %y.
 197   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 198   %0 = getelementptr inbounds half, half* %x, i32 %index
 199   %1 = bitcast half* %0 to <8 x half>*
 200   %wide.load = load <8 x half>, <8 x half>* %1, align 2
       ; Widening conversion under test: f16 -> f32 producing more than one q register.
 201   %2 = fpext <8 x half> %wide.load to <8 x float>
 202   %3 = fmul <8 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
 203   %4 = getelementptr inbounds float, float* %y, i32 %index
 204   %5 = bitcast float* %4 to <8 x float>*
 205   store <8 x float> %3, <8 x float>* %5, align 4
       ; Advance by the vector width and leave once 1024 elements are done.
 206   %index.next = add i32 %index, 8
 207   %6 = icmp eq i32 %index.next, 1024
 208   br i1 %6, label %for.cond.cleanup, label %vector.body
 209
 210 for.cond.cleanup:                                 ; preds = %vector.body
 211   ret void
 212 }
 213
 ; from_16: f16 -> f32 conversion loop, 16 elements per iteration.
 ; Loads <16 x half> from %x, extends to <16 x float>, multiplies by the splat
 ; constant 0x4000CCCCC0000000 and stores to %y.
 ; The expected assembly uses four widening vldrh.u32 loads into q0-q3, four
 ; vcvtb.f32.f16 conversions, four vmul.f32 and four vstrw.32 stores per
 ; iteration, with an lr loop count of 64 (1024 / 16).
 214 define void @from_16(half* nocapture readonly %x, float* noalias nocapture %y) {
 215 ; CHECK-LABEL: from_16:
 216 ; CHECK:       @ %bb.0: @ %entry
 217 ; CHECK-NEXT:    .save {r7, lr}
 218 ; CHECK-NEXT:    push {r7, lr}
 219 ; CHECK-NEXT:    mov.w lr, #64
 220 ; CHECK-NEXT:    movw r2, #26214
 221 ; CHECK-NEXT:    movt r2, #16390
 222 ; CHECK-NEXT:  .LBB5_1: @ %vector.body
 223 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 224 ; CHECK-NEXT:    vldrh.u32 q0, [r0], #32
 225 ; CHECK-NEXT:    vldrh.u32 q1, [r0, #-24]
 226 ; CHECK-NEXT:    vldrh.u32 q2, [r0, #-16]
 227 ; CHECK-NEXT:    vldrh.u32 q3, [r0, #-8]
 228 ; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
 229 ; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
 230 ; CHECK-NEXT:    vcvtb.f32.f16 q2, q2
 231 ; CHECK-NEXT:    vcvtb.f32.f16 q3, q3
 232 ; CHECK-NEXT:    vmul.f32 q2, q2, r2
 233 ; CHECK-NEXT:    vmul.f32 q3, q3, r2
 234 ; CHECK-NEXT:    vmul.f32 q1, q1, r2
 235 ; CHECK-NEXT:    vmul.f32 q0, q0, r2
 236 ; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
 237 ; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
 238 ; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
 239 ; CHECK-NEXT:    vstrw.32 q0, [r1], #64
 240 ; CHECK-NEXT:    le lr, .LBB5_1
 241 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 242 ; CHECK-NEXT:    pop {r7, pc}
 243 entry:
 244   br label %vector.body
 245
 246 vector.body:                                      ; preds = %vector.body, %entry
       ; %index is the element offset into both %x and %y.
 247   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 248   %0 = getelementptr inbounds half, half* %x, i32 %index
 249   %1 = bitcast half* %0 to <16 x half>*
 250   %wide.load = load <16 x half>, <16 x half>* %1, align 2
       ; Widening conversion under test: f16 -> f32 on a 16-lane vector.
 251   %2 = fpext <16 x half> %wide.load to <16 x float>
 252   %3 = fmul <16 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
 253   %4 = getelementptr inbounds float, float* %y, i32 %index
 254   %5 = bitcast float* %4 to <16 x float>*
 255   store <16 x float> %3, <16 x float>* %5, align 4
       ; Advance by the vector width and leave once 1024 elements are done.
 256   %index.next = add i32 %index, 16
 257   %6 = icmp eq i32 %index.next, 1024
 258   br i1 %6, label %for.cond.cleanup, label %vector.body
 259
 260 for.cond.cleanup:                                 ; preds = %vector.body
 261   ret void
 262 }
 263
 ; both_4: f16 -> f32 -> f16 round trip, 4 elements per iteration.
 ; Each iteration loads <4 x half> from %x, extends to <4 x float>, multiplies
 ; by the splat constant 0x4000CCCCC0000000 (roughly 2.1), truncates back to
 ; <4 x half> and stores to %y.
 ; The expected assembly is a widening vldrh.u32 load, vcvtb.f32.f16,
 ; vmul.f32, vcvtb.f16.f32 and a narrowing vstrh.32 store, with an lr loop
 ; count of 256 (1024 / 4).
 264 define void @both_4(half* nocapture readonly %x, half* noalias nocapture %y) {
 265 ; CHECK-LABEL: both_4:
 266 ; CHECK:       @ %bb.0: @ %entry
 267 ; CHECK-NEXT:    .save {r7, lr}
 268 ; CHECK-NEXT:    push {r7, lr}
 269 ; CHECK-NEXT:    mov.w lr, #256
 270 ; CHECK-NEXT:    movw r2, #26214
 271 ; CHECK-NEXT:    movt r2, #16390
 272 ; CHECK-NEXT:  .LBB6_1: @ %vector.body
 273 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 274 ; CHECK-NEXT:    vldrh.u32 q0, [r0], #8
 275 ; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
 276 ; CHECK-NEXT:    vmul.f32 q0, q0, r2
 277 ; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
 278 ; CHECK-NEXT:    vstrh.32 q0, [r1], #8
 279 ; CHECK-NEXT:    le lr, .LBB6_1
 280 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 281 ; CHECK-NEXT:    pop {r7, pc}
 282 entry:
 283   br label %vector.body
 284
 285 vector.body:                                      ; preds = %vector.body, %entry
       ; %index is the element offset into both %x and %y.
 286   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 287   %0 = getelementptr inbounds half, half* %x, i32 %index
 288   %1 = bitcast half* %0 to <4 x half>*
 289   %wide.load = load <4 x half>, <4 x half>* %1, align 2
       ; Widen, scale in f32, then narrow again.
 290   %2 = fpext <4 x half> %wide.load to <4 x float>
 291   %3 = fmul <4 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
 292   %4 = fptrunc <4 x float> %3 to <4 x half>
 293   %5 = getelementptr inbounds half, half* %y, i32 %index
 294   %6 = bitcast half* %5 to <4 x half>*
 295   store <4 x half> %4, <4 x half>* %6, align 2
       ; Advance by the vector width and leave once 1024 elements are done.
 296   %index.next = add i32 %index, 4
 297   %7 = icmp eq i32 %index.next, 1024
 298   br i1 %7, label %for.cond.cleanup, label %vector.body
 299
 300 for.cond.cleanup:                                 ; preds = %vector.body
 301   ret void
 302 }
 303
 ; both_8: f16 -> f32 -> f16 round trip, 8 elements per iteration.
 ; Loads <8 x half> from %x, extends to <8 x float>, multiplies by the splat
 ; constant 0x4000CCCCC0000000, truncates back to <8 x half> and stores to %y.
 ; The expected assembly loads all 8 halves with a single vldrh.u16, widens
 ; them with a vcvtb.f32.f16 / vcvtt.f32.f16 pair, scales both results, and
 ; narrows back into one register with vcvtb.f16.f32 / vcvtt.f16.f32 before a
 ; single store. The lr loop count is 128 (1024 / 8).
 304 define void @both_8(half* nocapture readonly %x, half* noalias nocapture %y) {
 305 ; CHECK-LABEL: both_8:
 306 ; CHECK:       @ %bb.0: @ %entry
 307 ; CHECK-NEXT:    .save {r7, lr}
 308 ; CHECK-NEXT:    push {r7, lr}
 309 ; CHECK-NEXT:    mov.w lr, #128
 310 ; CHECK-NEXT:    movw r2, #26214
 311 ; CHECK-NEXT:    movt r2, #16390
 312 ; CHECK-NEXT:  .LBB7_1: @ %vector.body
 313 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 314 ; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
 315 ; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
 316 ; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
 317 ; CHECK-NEXT:    vmul.f32 q1, q1, r2
 318 ; CHECK-NEXT:    vmul.f32 q0, q0, r2
 319 ; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
 320 ; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
 321 ; CHECK-NEXT:    vstrb.8 q1, [r1], #16
 322 ; CHECK-NEXT:    le lr, .LBB7_1
 323 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 324 ; CHECK-NEXT:    pop {r7, pc}
 325 entry:
 326   br label %vector.body
 327
 328 vector.body:                                      ; preds = %vector.body, %entry
       ; %index is the element offset into both %x and %y.
 329   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 330   %0 = getelementptr inbounds half, half* %x, i32 %index
 331   %1 = bitcast half* %0 to <8 x half>*
 332   %wide.load = load <8 x half>, <8 x half>* %1, align 2
       ; Widen, scale in f32, then narrow again.
 333   %2 = fpext <8 x half> %wide.load to <8 x float>
 334   %3 = fmul <8 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
 335   %4 = fptrunc <8 x float> %3 to <8 x half>
 336   %5 = getelementptr inbounds half, half* %y, i32 %index
 337   %6 = bitcast half* %5 to <8 x half>*
 338   store <8 x half> %4, <8 x half>* %6, align 2
       ; Advance by the vector width and leave once 1024 elements are done.
 339   %index.next = add i32 %index, 8
 340   %7 = icmp eq i32 %index.next, 1024
 341   br i1 %7, label %for.cond.cleanup, label %vector.body
 342
 343 for.cond.cleanup:                                 ; preds = %vector.body
 344   ret void
 345 }
 346
 ; both_16: f16 -> f32 -> f16 round trip, 16 elements per iteration.
 ; Loads <16 x half> from %x, extends to <16 x float>, multiplies by the splat
 ; constant 0x4000CCCCC0000000, truncates back to <16 x half> and stores to %y.
 ; The expected assembly handles the 16 lanes as two 8-half pieces: the piece
 ; at offset #16 first, then the piece at the base address with a
 ; post-increment of #32. Each piece uses the vcvtb/vcvtt widen-and-narrow
 ; pairs. The lr loop count is 64 (1024 / 16).
 347 define void @both_16(half* nocapture readonly %x, half* noalias nocapture %y) {
 348 ; CHECK-LABEL: both_16:
 349 ; CHECK:       @ %bb.0: @ %entry
 350 ; CHECK-NEXT:    .save {r7, lr}
 351 ; CHECK-NEXT:    push {r7, lr}
 352 ; CHECK-NEXT:    mov.w lr, #64
 353 ; CHECK-NEXT:    movw r2, #26214
 354 ; CHECK-NEXT:    movt r2, #16390
 355 ; CHECK-NEXT:  .LBB8_1: @ %vector.body
 356 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 357 ; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
 358 ; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
 359 ; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
 360 ; CHECK-NEXT:    vmul.f32 q1, q1, r2
 361 ; CHECK-NEXT:    vmul.f32 q0, q0, r2
 362 ; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
 363 ; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
 364 ; CHECK-NEXT:    vldrh.u16 q0, [r0], #32
 365 ; CHECK-NEXT:    vstrh.16 q1, [r1, #16]
 366 ; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
 367 ; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
 368 ; CHECK-NEXT:    vmul.f32 q1, q1, r2
 369 ; CHECK-NEXT:    vmul.f32 q0, q0, r2
 370 ; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
 371 ; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
 372 ; CHECK-NEXT:    vstrh.16 q1, [r1], #32
 373 ; CHECK-NEXT:    le lr, .LBB8_1
 374 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 375 ; CHECK-NEXT:    pop {r7, pc}
 376 entry:
 377   br label %vector.body
 378
 379 vector.body:                                      ; preds = %vector.body, %entry
       ; %index is the element offset into both %x and %y.
 380   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 381   %0 = getelementptr inbounds half, half* %x, i32 %index
 382   %1 = bitcast half* %0 to <16 x half>*
 383   %wide.load = load <16 x half>, <16 x half>* %1, align 2
       ; Widen, scale in f32, then narrow again.
 384   %2 = fpext <16 x half> %wide.load to <16 x float>
 385   %3 = fmul <16 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
 386   %4 = fptrunc <16 x float> %3 to <16 x half>
 387   %5 = getelementptr inbounds half, half* %y, i32 %index
 388   %6 = bitcast half* %5 to <16 x half>*
 389   store <16 x half> %4, <16 x half>* %6, align 2
       ; Advance by the vector width and leave once 1024 elements are done.
 390   %index.next = add i32 %index, 16
 391   %7 = icmp eq i32 %index.next, 1024
 392   br i1 %7, label %for.cond.cleanup, label %vector.body
 393
 394 for.cond.cleanup:                                 ; preds = %vector.body
 395   ret void
 396 }
 397
 ; both_8_I: interleaved form of the 8-element f16 -> f32 -> f16 round trip.
 ; The <8 x half> load is split by shufflevector into its even lanes
 ; (0,2,4,6) and odd lanes (1,3,5,7). Each half is extended to <4 x float> and
 ; multiplied by the splat constant 0x4000CCCCC0000000. The two products are
 ; re-interleaved (0,4,1,5,2,6,3,7) into <8 x float> before the fptrunc and
 ; store.
 ; The expected loop body is the same instruction sequence that both_8
 ; expects: one vldrh.u16 load, vcvtb/vcvtt widening, two vmul.f32,
 ; vcvtb/vcvtt narrowing, one store. The lr loop count is 128.
 398 define void @both_8_I(half* nocapture readonly %x, half* noalias nocapture %y) {
 399 ; CHECK-LABEL: both_8_I:
 400 ; CHECK:       @ %bb.0: @ %entry
 401 ; CHECK-NEXT:    .save {r7, lr}
 402 ; CHECK-NEXT:    push {r7, lr}
 403 ; CHECK-NEXT:    mov.w lr, #128
 404 ; CHECK-NEXT:    movw r2, #26214
 405 ; CHECK-NEXT:    movt r2, #16390
 406 ; CHECK-NEXT:  .LBB9_1: @ %vector.body
 407 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 408 ; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
 409 ; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
 410 ; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
 411 ; CHECK-NEXT:    vmul.f32 q1, q1, r2
 412 ; CHECK-NEXT:    vmul.f32 q0, q0, r2
 413 ; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
 414 ; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
 415 ; CHECK-NEXT:    vstrb.8 q1, [r1], #16
 416 ; CHECK-NEXT:    le lr, .LBB9_1
 417 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 418 ; CHECK-NEXT:    pop {r7, pc}
 419 entry:
 420   br label %vector.body
 421
 422 vector.body:                                      ; preds = %vector.body, %entry
       ; %index is the element offset into both %x and %y.
 423   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 424   %0 = getelementptr inbounds half, half* %x, i32 %index
 425   %1 = bitcast half* %0 to <8 x half>*
 426   %wide.load = load <8 x half>, <8 x half>* %1, align 2
       ; De-interleave: %2 takes the even lanes, %3 the odd lanes.
 427   %2 = shufflevector <8 x half> %wide.load, <8 x half> %wide.load, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 428   %3 = shufflevector <8 x half> %wide.load, <8 x half> %wide.load, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 429   %4 = fpext <4 x half> %2 to <4 x float>
 430   %5 = fpext <4 x half> %3 to <4 x float>
 431   %6 = fmul <4 x float> %4, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
 432   %7 = fmul <4 x float> %5, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
       ; Re-interleave the two products back into the original lane order.
 433   %8 = shufflevector <4 x float> %6, <4 x float> %7, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 434   %9 = fptrunc <8 x float> %8 to <8 x half>
 435   %10 = getelementptr inbounds half, half* %y, i32 %index
 436   %11 = bitcast half* %10 to <8 x half>*
 437   store <8 x half> %9, <8 x half>* %11, align 2
       ; Advance by the vector width and leave once 1024 elements are done.
 438   %index.next = add i32 %index, 8
 439   %12 = icmp eq i32 %index.next, 1024
 440   br i1 %12, label %for.cond.cleanup, label %vector.body
 441
 442 for.cond.cleanup:                                 ; preds = %vector.body
 443   ret void
 444 }
 445
 ; both_16_I: interleaved form of the f16 -> f32 -> f16 round trip on a
 ; 16-lane vector.
 ; The <16 x half> load is split by shufflevector into even lanes (0,2,...,14)
 ; and odd lanes (1,3,...,15). Each half is extended to <8 x float> and
 ; multiplied by the splat constant 0x4000CCCCC0000000. The products are
 ; re-interleaved (0,8,1,9,...) into <16 x float> before the fptrunc and store.
 ; The expected assembly processes two 8-half pieces per iteration, at [r0]
 ; and at [r0, #16]! (with writeback), using the vcvtb/vcvtt pairs for each.
 ; NOTE(review): %index advances by 8 although 16 elements are loaded and
 ; stored per iteration, so consecutive iterations overlap by 8 elements. The
 ; expected output is consistent with that (lr count 128 and a 16-byte pointer
 ; step per iteration). Confirm whether a step of 16 was intended; changing it
 ; requires regenerating the assertions.
 446 define void @both_16_I(half* nocapture readonly %x, half* noalias nocapture %y) {
 447 ; CHECK-LABEL: both_16_I:
 448 ; CHECK:       @ %bb.0: @ %entry
 449 ; CHECK-NEXT:    .save {r7, lr}
 450 ; CHECK-NEXT:    push {r7, lr}
 451 ; CHECK-NEXT:    mov.w lr, #128
 452 ; CHECK-NEXT:    movw r2, #26214
 453 ; CHECK-NEXT:    movt r2, #16390
 454 ; CHECK-NEXT:  .LBB10_1: @ %vector.body
 455 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 456 ; CHECK-NEXT:    vldrh.u16 q0, [r0]
 457 ; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
 458 ; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
 459 ; CHECK-NEXT:    vmul.f32 q1, q1, r2
 460 ; CHECK-NEXT:    vmul.f32 q0, q0, r2
 461 ; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
 462 ; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
 463 ; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]!
 464 ; CHECK-NEXT:    vstrh.16 q1, [r1]
 465 ; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
 466 ; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
 467 ; CHECK-NEXT:    vmul.f32 q1, q1, r2
 468 ; CHECK-NEXT:    vmul.f32 q0, q0, r2
 469 ; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
 470 ; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
 471 ; CHECK-NEXT:    vstrb.8 q1, [r1, #16]!
 472 ; CHECK-NEXT:    le lr, .LBB10_1
 473 ; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
 474 ; CHECK-NEXT:    pop {r7, pc}
 475 entry:
 476   br label %vector.body
 477
 478 vector.body:                                      ; preds = %vector.body, %entry
       ; %index is the element offset into both %x and %y.
 479   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 480   %0 = getelementptr inbounds half, half* %x, i32 %index
 481   %1 = bitcast half* %0 to <16 x half>*
 482   %wide.load = load <16 x half>, <16 x half>* %1, align 2
       ; De-interleave: %2 takes the even lanes, %3 the odd lanes.
 483   %2 = shufflevector <16 x half> %wide.load, <16 x half> %wide.load, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 484   %3 = shufflevector <16 x half> %wide.load, <16 x half> %wide.load, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 485   %4 = fpext <8 x half> %2 to <8 x float>
 486   %5 = fpext <8 x half> %3 to <8 x float>
 487   %6 = fmul <8 x float> %4, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
 488   %7 = fmul <8 x float> %5, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
       ; Re-interleave the two products back into the original lane order.
 489   %8 = shufflevector <8 x float> %6, <8 x float> %7, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 490   %9 = fptrunc <16 x float> %8 to <16 x half>
 491   %10 = getelementptr inbounds half, half* %y, i32 %index
 492   %11 = bitcast half* %10 to <16 x half>*
 493   store <16 x half> %9, <16 x half>* %11, align 2
       ; Step is 8 elements, not the 16-element vector width (see the review note above the define).
 494   %index.next = add i32 %index, 8
 495   %12 = icmp eq i32 %index.next, 1024
 496   br i1 %12, label %for.cond.cleanup, label %vector.body
 497
 498 for.cond.cleanup:                                 ; preds = %vector.body
 499   ret void
 500 }