; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI

define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_i1_to_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: test_s_sext_i32_to_i64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mul_i32 s0, s0, s1
; SI-NEXT: s_add_i32 s0, s0, s2
; SI-NEXT: s_ashr_i32 s1, s0, 31
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_s_sext_i32_to_i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mul_i32 s0, s0, s1
; VI-NEXT: s_add_i32 s0, s0, s2
; VI-NEXT: s_ashr_i32 s1, s0, 31
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
  %mul = mul i32 %a, %b
  %add = add i32 %mul, %c
  %sext = sext i32 %add to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: v_mov_b32_e32 v1, v0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_i1_to_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
; SI-LABEL: s_sext_i32_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_ashr_i32 s1, s0, 31
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_i32_to_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s1, s0, 31
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %sext = sext i32 %a to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_i32_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_sext_i32_to_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %sext = sext i32 %val to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
; SI-LABEL: s_sext_i16_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_i16_to_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %sext = sext i16 %a to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

; The purpose of this test is to make sure the i16 = sign_extend i1 node
; makes it all the way through the legalizer/optimizer, so that we can
; verify it is selected correctly. In s_sext_i1_to_i16, the sign_extend
; node is optimized to a select very early.
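; For reference, a sketch of that early fold (an editorial illustration, not
; compiler output):
;   %sext = sext i1 %cmp to i16
; is equivalent to, and is combined into,
;   %sext = select i1 %cmp, i16 -1, i16 0
; which selects directly to the v_cndmask_b32_e64 v0, 0, -1, vcc seen in
; s_sext_i1_to_i16 above. The extra 'and' below keeps the sign_extend node
; alive until instruction selection.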
define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; SI-LABEL: s_sext_i1_to_i16_with_and:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v1
; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16_with_and:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v1
; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %cmp0 = icmp eq i32 %a, %b
  %cmp1 = icmp eq i32 %c, %d
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: v_sext_i1_to_i16_with_and:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1
; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_sext_i1_to_i16_with_and:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1
; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %cmp0 = icmp eq i32 %a, %tid
  %cmp1 = icmp eq i32 %b, %c
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

; FIXME: We end up with a v_bfe instruction, because the i16 srl
; gets selected to a v_lshrrev_b16 instruction, so the input to
; the bfe is a vector register. To fix this we need to be able to
; optimize this:
; t29: i16 = truncate t10
; t55: i16 = srl t29, Constant:i32<8>
; t63: i32 = any_extend t55
; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8
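; A possible folded form (an editorial sketch, not verified compiler output)
; would shift and sign-extend at i32, keeping the value in a scalar register:
;   t55': i32 = srl t10, Constant:i32<8>
;   t64': i32 = sign_extend_inreg t55', ValueType:ch:i8
; which could then select to s_bfe_i32, as the SI output of the function
; below does, instead of v_bfe_i32.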
define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; SI-LABEL: s_sext_v4i8_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_ashr_i32 s1, s0, 24
; SI-NEXT: s_bfe_i32 s2, s0, 0x80010
; SI-NEXT: s_bfe_i32 s3, s0, 0x80008
; SI-NEXT: s_sext_i32_i8 s0, s0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_v4i8_to_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s0
; VI-NEXT: s_ashr_i32 s1, s0, 24
; VI-NEXT: s_bfe_i32 s2, s0, 0x80010
; VI-NEXT: s_sext_i32_i8 s0, s0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

; FIXME: need to optimize same sequence as above test to avoid
; this shift.
define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_v4i8_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ashrrev_i32_e32 v1, 24, v0
; SI-NEXT: v_bfe_i32 v2, v0, 16, 8
; SI-NEXT: v_bfe_i32 v3, v0, 8, 8
; SI-NEXT: v_bfe_i32 v0, v0, 0, 8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_sext_v4i8_to_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT: v_ashrrev_i32_e32 v2, 24, v0
; VI-NEXT: v_bfe_i32 v3, v0, 16, 8
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
; VI-NEXT: v_bfe_i32 v1, v1, 0, 8
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %a = load i32, i32 addrspace(1)* %in
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

; FIXME: s_bfe_i64, same on SI and VI
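; (Presumably the suggested form: s_bfe_i64 with operand 0x100000, i.e.
; offset 0 and width 16, sign-extends the low 16 bits of a scalar pair in a
; single instruction, as s_sext_i16_to_i64 above already does.)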
define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
; SI-LABEL: s_sext_v4i16_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_ashr_i64 s[4:5], s[6:7], 48
; SI-NEXT: s_ashr_i32 s5, s6, 16
; SI-NEXT: s_sext_i32_i16 s6, s6
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: s_sext_i32_i16 s7, s7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_v4i16_to_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_ashr_i32 s5, s6, 16
; VI-NEXT: s_sext_i32_i16 s6, s6
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: s_ashr_i32 s4, s7, 16
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: s_sext_i32_i16 s7, s7
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_v4i16_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ashr_i64 v[2:3], v[0:1], 48
; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
; SI-NEXT: v_bfe_i32 v1, v1, 0, 16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_sext_v4i16_to_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v2, 16, v1
; VI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; VI-NEXT: v_bfe_i32 v1, v1, 0, 16
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %a = load i64, i64 addrspace(1)* %in
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #1 = { nounwind readnone }