1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI
3 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX89,VI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX89,GFX9
6 ; XXX - Why the packing?
define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: scalar_to_vector_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: scalar_to_vector_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff0000
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
  %tmp1 = load i32, ptr addrspace(1) %in, align 4
  %bc = bitcast i32 %tmp1 to <2 x i16>
  %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  store <4 x i16> %tmp2, ptr addrspace(1) %out, align 8
  ret void
}
define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: scalar_to_vector_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: scalar_to_vector_v2f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff0000
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
  %tmp1 = load float, ptr addrspace(1) %in, align 4
  %bc = bitcast float %tmp1 to <2 x i16>
  %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  store <4 x i16> %tmp2, ptr addrspace(1) %out, align 8
  ret void
}
define amdgpu_kernel void @scalar_to_vector_v4i16() {
; SI-LABEL: scalar_to_vector_v4i16:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_or_b32_e32 v2, v1, v0
; SI-NEXT:    v_and_b32_e32 v1, 0xff00, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
; SI-NEXT:    v_or_b32_e32 v1, v0, v3
; SI-NEXT:    v_or_b32_e32 v0, v2, v3
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v4i16:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_readfirstlane_b32 s0, v0
; VI-NEXT:    s_lshl_b32 s1, s0, 8
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    s_lshl_b32 s1, s0, 16
; VI-NEXT:    s_and_b32 s0, s0, 0xffff
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: scalar_to_vector_v4i16:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    s_lshl_b32 s1, s0, 8
; GFX9-NEXT:    s_or_b32 s0, s0, s1
; GFX9-NEXT:    s_and_b32 s1, s0, 0xffff
; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
; GFX9-NEXT:    s_or_b32 s0, s1, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
bb:
  %tmp = load <2 x i8>, ptr addrspace(1) undef, align 1
  %tmp1 = shufflevector <2 x i8> %tmp, <2 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 0, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  store <8 x i8> %tmp2, ptr addrspace(1) undef, align 8
  ret void
}
define amdgpu_kernel void @scalar_to_vector_v4f16() {
; SI-LABEL: scalar_to_vector_v4f16:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_or_b32_e32 v2, v1, v0
; SI-NEXT:    v_and_b32_e32 v1, 0xff00, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
; SI-NEXT:    v_or_b32_e32 v1, v0, v3
; SI-NEXT:    v_or_b32_e32 v0, v2, v3
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v4f16:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_readfirstlane_b32 s0, v0
; VI-NEXT:    s_lshl_b32 s1, s0, 8
; VI-NEXT:    s_or_b32 s0, s1, s0
; VI-NEXT:    s_and_b32 s1, s0, 0xff00
; VI-NEXT:    s_bfe_u32 s4, s0, 0x80008
; VI-NEXT:    s_or_b32 s1, s4, s1
; VI-NEXT:    s_and_b32 s0, s0, 0xffff
; VI-NEXT:    s_lshl_b32 s4, s1, 16
; VI-NEXT:    s_and_b32 s1, s1, 0xffff
; VI-NEXT:    s_or_b32 s1, s1, s4
; VI-NEXT:    s_or_b32 s0, s0, s4
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: scalar_to_vector_v4f16:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    s_lshl_b32 s1, s0, 8
; GFX9-NEXT:    s_or_b32 s0, s1, s0
; GFX9-NEXT:    s_and_b32 s1, s0, 0xff00
; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x80008
; GFX9-NEXT:    s_or_b32 s1, s4, s1
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    s_and_b32 s4, s1, 0xffff
; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
; GFX9-NEXT:    s_or_b32 s4, s4, s1
; GFX9-NEXT:    s_or_b32 s0, s0, s1
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
bb:
  %load = load half, ptr addrspace(1) undef, align 1
  %tmp = bitcast half %load to <2 x i8>
  %tmp1 = shufflevector <2 x i8> %tmp, <2 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 0, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  store <8 x i8> %tmp2, ptr addrspace(1) undef, align 8
  ret void
}
270 ; Getting a SCALAR_TO_VECTOR seems to be tricky. These cases managed
271 ; to produce one, but for some reason never made it to selection.
274 ; define amdgpu_kernel void @scalar_to_vector_test2(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
275 ; %tmp1 = load i32, ptr addrspace(1) %in, align 4
276 ; %bc = bitcast i32 %tmp1 to <4 x i8>
278 ; %tmp2 = shufflevector <4 x i8> %bc, <4 x i8> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
;   store <8 x i8> %tmp2, ptr addrspace(1) %out, align 4
;   ret void
; }
283 ; define amdgpu_kernel void @scalar_to_vector_test3(ptr addrspace(1) %out) nounwind {
284 ; %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0
285 ; %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1
286 ; %bc = bitcast <2 x i64> %newvec1 to <4 x i32>
287 ; %add = add <4 x i32> %bc, <i32 1, i32 2, i32 3, i32 4>
;   store <4 x i32> %add, ptr addrspace(1) %out, align 16
;   ret void
; }
292 ; define amdgpu_kernel void @scalar_to_vector_test4(ptr addrspace(1) %out) nounwind {
293 ; %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0
294 ; %bc = bitcast <4 x i32> %newvec0 to <8 x i16>
295 ; %add = add <8 x i16> %bc, <i16 1, i16 2, i16 3, i16 4, i16 1, i16 2, i16 3, i16 4>
;   store <8 x i16> %add, ptr addrspace(1) %out, align 16
;   ret void
; }
300 ; define amdgpu_kernel void @scalar_to_vector_test5(ptr addrspace(1) %out) nounwind {
301 ; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0
302 ; %bc = bitcast <2 x i32> %newvec0 to <4 x i16>
303 ; %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
;   store <4 x i16> %add, ptr addrspace(1) %out, align 16
;   ret void
; }
define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zeroext %val) nounwind {
; SI-LABEL: scalar_to_vector_test6:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; GFX89-LABEL: scalar_to_vector_test6:
; GFX89:       ; %bb.0:
; GFX89-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX89-NEXT:    s_mov_b32 s3, 0xf000
; GFX89-NEXT:    s_mov_b32 s2, -1
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    v_mov_b32_e32 v0, s6
; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT:    s_endpgm
  %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0
  %bc = bitcast <4 x i8> %newvec0 to <2 x half>
  store <2 x half> %bc, ptr addrspace(1) %out
  ret void
}
336 ; bitcast (scalar_to_vector x) -> any_extend x
337 define i64 @bitcast_combine_scalar_to_vector_v4i16(i16 %arg) {
338 ; SI-LABEL: bitcast_combine_scalar_to_vector_v4i16:
340 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
341 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
342 ; SI-NEXT: v_and_b32_e32 v2, 0xff00, v0
343 ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
344 ; SI-NEXT: v_or_b32_e32 v2, v0, v2
345 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
346 ; SI-NEXT: v_or_b32_e32 v0, v1, v3
347 ; SI-NEXT: v_or_b32_e32 v1, v2, v3
348 ; SI-NEXT: s_setpc_b64 s[30:31]
350 ; GFX89-LABEL: bitcast_combine_scalar_to_vector_v4i16:
352 ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
353 ; GFX89-NEXT: v_and_b32_e32 v1, 0xffffff00, v0
354 ; GFX89-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
355 ; GFX89-NEXT: v_lshlrev_b32_e32 v2, 16, v1
356 ; GFX89-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
357 ; GFX89-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
358 ; GFX89-NEXT: s_setpc_b64 s[30:31]
359 %arg.cast = bitcast i16 %arg to <2 x i8>
360 %tmp1 = shufflevector <2 x i8> %arg.cast, <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
361 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
362 %cast = bitcast <8 x i8> %tmp2 to i64