llvm/test/CodeGen/AMDGPU/sign_extend.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=SI
   3 ; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=VI
   4
   5 define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; sign-extends an i1 compare of two kernel arguments to i32 and stores it to %out
   6 ; SI-LABEL: s_sext_i1_to_i32:
   7 ; SI:       ; %bb.0:
   8 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
   9 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
  10 ; SI-NEXT:    s_mov_b32 s7, 0xf000
  11 ; SI-NEXT:    s_mov_b32 s6, -1
  12 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
  13 ; SI-NEXT:    v_mov_b32_e32 v0, s1
  14 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
  15 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
  16 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
  17 ; SI-NEXT:    s_endpgm
  18 ;
  19 ; VI-LABEL: s_sext_i1_to_i32:
  20 ; VI:       ; %bb.0:
  21 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
  22 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
  23 ; VI-NEXT:    s_mov_b32 s7, 0xf000
  24 ; VI-NEXT:    s_mov_b32 s6, -1
  25 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
  26 ; VI-NEXT:    v_mov_b32_e32 v0, s1
  27 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
  28 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
  29 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
  30 ; VI-NEXT:    s_endpgm
  31   %cmp = icmp eq i32 %a, %b ; i1 produced from the two i32 kernel arguments
  32   %sext = sext i1 %cmp to i32 ; the checks above expect a v_cndmask_b32 picking 0 or -1 on both targets
  33   store i32 %sext, i32 addrspace(1)* %out, align 4
  34   ret void
  35 }
  36
  37 define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { ; sign-extends (%a * %b + %c) from i32 to i64 and stores it to %out
  38 ; SI-LABEL: test_s_sext_i32_to_i64:
  39 ; SI:       ; %bb.0: ; %entry
  40 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
  41 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
  42 ; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
  43 ; SI-NEXT:    s_mov_b32 s7, 0xf000
  44 ; SI-NEXT:    s_mov_b32 s6, -1
  45 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
  46 ; SI-NEXT:    s_mul_i32 s1, s2, s3
  47 ; SI-NEXT:    s_add_i32 s1, s1, s0
  48 ; SI-NEXT:    s_ashr_i32 s0, s1, 31
  49 ; SI-NEXT:    v_mov_b32_e32 v0, s1
  50 ; SI-NEXT:    v_mov_b32_e32 v1, s0
  51 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
  52 ; SI-NEXT:    s_endpgm
  53 ;
  54 ; VI-LABEL: test_s_sext_i32_to_i64:
  55 ; VI:       ; %bb.0: ; %entry
  56 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
  57 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
  58 ; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
  59 ; VI-NEXT:    s_mov_b32 s7, 0xf000
  60 ; VI-NEXT:    s_mov_b32 s6, -1
  61 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
  62 ; VI-NEXT:    s_mul_i32 s1, s2, s3
  63 ; VI-NEXT:    s_add_i32 s1, s1, s0
  64 ; VI-NEXT:    s_ashr_i32 s0, s1, 31
  65 ; VI-NEXT:    v_mov_b32_e32 v0, s1
  66 ; VI-NEXT:    v_mov_b32_e32 v1, s0
  67 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
  68 ; VI-NEXT:    s_endpgm
  69 entry:
  70   %mul = mul i32 %a, %b ; the extended value is computed from kernel arguments, not passed in directly
  71   %add = add i32 %mul, %c
  72   %sext = sext i32 %add to i64 ; the checks above expect the high dword to come from s_ashr_i32 ..., 31
  73   store i64 %sext, i64 addrspace(1)* %out, align 8
  74   ret void
  75 }
  76
  77 define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; sign-extends an i1 compare of two kernel arguments to i64 and stores it to %out
  78 ; SI-LABEL: s_sext_i1_to_i64:
  79 ; SI:       ; %bb.0:
  80 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
  81 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
  82 ; SI-NEXT:    s_mov_b32 s7, 0xf000
  83 ; SI-NEXT:    s_mov_b32 s6, -1
  84 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
  85 ; SI-NEXT:    v_mov_b32_e32 v0, s1
  86 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
  87 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
  88 ; SI-NEXT:    v_mov_b32_e32 v1, v0
  89 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
  90 ; SI-NEXT:    s_endpgm
  91 ;
  92 ; VI-LABEL: s_sext_i1_to_i64:
  93 ; VI:       ; %bb.0:
  94 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
  95 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
  96 ; VI-NEXT:    s_mov_b32 s7, 0xf000
  97 ; VI-NEXT:    s_mov_b32 s6, -1
  98 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
  99 ; VI-NEXT:    v_mov_b32_e32 v0, s1
 100 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
 101 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 102 ; VI-NEXT:    v_mov_b32_e32 v1, v0
 103 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 104 ; VI-NEXT:    s_endpgm
 105   %cmp = icmp eq i32 %a, %b ; i1 produced from the two i32 kernel arguments
 106   %sext = sext i1 %cmp to i64 ; the checks above expect one v_cndmask_b32 whose result is copied into v1 for the high dword
 107   store i64 %sext, i64 addrspace(1)* %out, align 8
 108   ret void
 109 }
 110
 111 define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind { ; sign-extends the i32 kernel argument %a to i64 and stores it to %out
 112 ; SI-LABEL: s_sext_i32_to_i64:
 113 ; SI:       ; %bb.0:
 114 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 115 ; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
 116 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 117 ; SI-NEXT:    s_mov_b32 s6, -1
 118 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 119 ; SI-NEXT:    s_ashr_i32 s1, s0, 31
 120 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 121 ; SI-NEXT:    v_mov_b32_e32 v1, s1
 122 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 123 ; SI-NEXT:    s_endpgm
 124 ;
 125 ; VI-LABEL: s_sext_i32_to_i64:
 126 ; VI:       ; %bb.0:
 127 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 128 ; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
 129 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 130 ; VI-NEXT:    s_mov_b32 s6, -1
 131 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 132 ; VI-NEXT:    s_ashr_i32 s1, s0, 31
 133 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 134 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 135 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 136 ; VI-NEXT:    s_endpgm
 137   %sext = sext i32 %a to i64 ; the checks above expect the high dword to come from s_ashr_i32 ..., 31
 138   store i64 %sext, i64 addrspace(1)* %out, align 8
 139   ret void
 140 }
 141
 142 define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { ; sign-extends an i32 loaded from %in to i64 and stores it to %out
 143 ; SI-LABEL: v_sext_i32_to_i64:
 144 ; SI:       ; %bb.0:
 145 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 146 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 147 ; SI-NEXT:    s_mov_b32 s2, -1
 148 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 149 ; SI-NEXT:    s_mov_b32 s0, s4
 150 ; SI-NEXT:    s_mov_b32 s1, s5
 151 ; SI-NEXT:    s_mov_b32 s4, s6
 152 ; SI-NEXT:    s_mov_b32 s5, s7
 153 ; SI-NEXT:    s_mov_b32 s6, s2
 154 ; SI-NEXT:    s_mov_b32 s7, s3
 155 ; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 156 ; SI-NEXT:    s_waitcnt vmcnt(0)
 157 ; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 158 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 159 ; SI-NEXT:    s_endpgm
 160 ;
 161 ; VI-LABEL: v_sext_i32_to_i64:
 162 ; VI:       ; %bb.0:
 163 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 164 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 165 ; VI-NEXT:    s_mov_b32 s2, -1
 166 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 167 ; VI-NEXT:    s_mov_b32 s0, s4
 168 ; VI-NEXT:    s_mov_b32 s1, s5
 169 ; VI-NEXT:    s_mov_b32 s4, s6
 170 ; VI-NEXT:    s_mov_b32 s5, s7
 171 ; VI-NEXT:    s_mov_b32 s6, s2
 172 ; VI-NEXT:    s_mov_b32 s7, s3
 173 ; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 174 ; VI-NEXT:    s_waitcnt vmcnt(0)
 175 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 176 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 177 ; VI-NEXT:    s_endpgm
 178   %val = load i32, i32 addrspace(1)* %in, align 4 ; with the RUN-line option -amdgpu-scalarize-global-loads=false the checks expect a buffer_load_dword here
 179   %sext = sext i32 %val to i64 ; the checks above expect the high dword to come from v_ashrrev_i32 ..., 31, ...
 180   store i64 %sext, i64 addrspace(1)* %out, align 8
 181   ret void
 182 }
 183
 184 define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { ; sign-extends the i16 kernel argument %a to i64 and stores it to %out
 185 ; SI-LABEL: s_sext_i16_to_i64:
 186 ; SI:       ; %bb.0:
 187 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 188 ; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
 189 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 190 ; SI-NEXT:    s_mov_b32 s6, -1
 191 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 192 ; SI-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x100000
 193 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 194 ; SI-NEXT:    v_mov_b32_e32 v1, s1
 195 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 196 ; SI-NEXT:    s_endpgm
 197 ;
 198 ; VI-LABEL: s_sext_i16_to_i64:
 199 ; VI:       ; %bb.0:
 200 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 201 ; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
 202 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 203 ; VI-NEXT:    s_mov_b32 s6, -1
 204 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 205 ; VI-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x100000
 206 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 207 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 208 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 209 ; VI-NEXT:    s_endpgm
 210   %sext = sext i16 %a to i64 ; the checks above expect a single s_bfe_i64 producing both result dwords
 211   store i64 %sext, i64 addrspace(1)* %out, align 8
 212   ret void
 213 }
 214
 215 define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; sign-extends an i1 compare of two kernel arguments to i16 and stores it to %out
 216 ; SI-LABEL: s_sext_i1_to_i16:
 217 ; SI:       ; %bb.0:
 218 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 219 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
 220 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 221 ; SI-NEXT:    s_mov_b32 s6, -1
 222 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 223 ; SI-NEXT:    v_mov_b32_e32 v0, s1
 224 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
 225 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 226 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 227 ; SI-NEXT:    s_endpgm
 228 ;
 229 ; VI-LABEL: s_sext_i1_to_i16:
 230 ; VI:       ; %bb.0:
 231 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 232 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
 233 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 234 ; VI-NEXT:    s_mov_b32 s6, -1
 235 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 236 ; VI-NEXT:    v_mov_b32_e32 v0, s1
 237 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
 238 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 239 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 240 ; VI-NEXT:    s_endpgm
 241   %cmp = icmp eq i32 %a, %b ; i1 produced from the two i32 kernel arguments
 242   %sext = sext i1 %cmp to i16 ; the checks above expect a v_cndmask_b32 picking 0 or -1
 243   store i16 %sext, i16 addrspace(1)* %out ; no explicit alignment; the checks expect a buffer_store_short
 244   ret void
 245 }
 246
 247 ; The purpose of this test is to make sure the i16 = sign_extend i1 node
 248 ; makes it all the way through the legalizer/optimizer to make sure
 249 ; we select this correctly.  In the s_sext_i1_to_i16, the sign_extend node
 250 ; is optimized to a select very early.
 251 define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind { ; sign-extends the AND of two i1 compares to i16 and stores it to %out
 252 ; SI-LABEL: s_sext_i1_to_i16_with_and:
 253 ; SI:       ; %bb.0:
 254 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 255 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
 256 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 257 ; SI-NEXT:    s_mov_b32 s6, -1
 258 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 259 ; SI-NEXT:    v_mov_b32_e32 v0, s1
 260 ; SI-NEXT:    v_mov_b32_e32 v1, s3
 261 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
 262 ; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v1
 263 ; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 264 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 265 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 266 ; SI-NEXT:    s_endpgm
 267 ;
 268 ; VI-LABEL: s_sext_i1_to_i16_with_and:
 269 ; VI:       ; %bb.0:
 270 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 271 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
 272 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 273 ; VI-NEXT:    s_mov_b32 s6, -1
 274 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 275 ; VI-NEXT:    v_mov_b32_e32 v0, s1
 276 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 277 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
 278 ; VI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v1
 279 ; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 280 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 281 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 282 ; VI-NEXT:    s_endpgm
 283   %cmp0 = icmp eq i32 %a, %b ; first i1, from kernel arguments %a and %b
 284   %cmp1 = icmp eq i32 %c, %d ; second i1, from kernel arguments %c and %d
 285   %cmp = and i1 %cmp0, %cmp1 ; the checks above expect the two compare masks to be combined with s_and_b64
 286   %sext = sext i1 %cmp to i16 ; the checks above expect a v_cndmask_b32 picking 0 or -1 from the combined mask
 287   store i16 %sext, i16 addrspace(1)* %out
 288   ret void
 289 }
 290
 291 define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { ; like @s_sext_i1_to_i16_with_and, but one compare operand is the workitem id
 292 ; SI-LABEL: v_sext_i1_to_i16_with_and:
 293 ; SI:       ; %bb.0:
 294 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 295 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
 296 ; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
 297 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 298 ; SI-NEXT:    s_mov_b32 s6, -1
 299 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 300 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v0
 301 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 302 ; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v0
 303 ; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 304 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 305 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 306 ; SI-NEXT:    s_endpgm
 307 ;
 308 ; VI-LABEL: v_sext_i1_to_i16_with_and:
 309 ; VI:       ; %bb.0:
 310 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 311 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 312 ; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
 313 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 314 ; VI-NEXT:    s_mov_b32 s6, -1
 315 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 316 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v0
 317 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 318 ; VI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v0
 319 ; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 320 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 321 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 322 ; VI-NEXT:    s_endpgm
 323   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1 ; the checks read v0 before writing it, so presumably v0 holds this id on kernel entry (confirm against the AMDGPU ABI)
 324   %cmp0 = icmp eq i32 %a, %tid ; compare a kernel argument with the workitem id
 325   %cmp1 = icmp eq i32 %b, %c ; compare two kernel arguments
 326   %cmp = and i1 %cmp0, %cmp1 ; the checks above expect the two compare masks to be combined with s_and_b64
 327   %sext = sext i1 %cmp to i16 ; the checks above expect a v_cndmask_b32 picking 0 or -1 from the combined mask
 328   store i16 %sext, i16 addrspace(1)* %out
 329   ret void
 330 }
 331
 332 ; FIXME: We end up with a v_bfe instruction, because the i16 srl
 333 ; gets selected to a v_lshrrev_b16 instruction, so the input to
 334 ; the bfe is a vector register.  To fix this we need to be able to
 335 ; optimize:
 336 ; t29: i16 = truncate t10
 337 ; t55: i16 = srl t29, Constant:i32<8>
 338 ; t63: i32 = any_extend t55
 339 ; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8
 340 define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind { ; sign-extends the four bytes of kernel argument %a to i32 and stores each one to %out
 341 ; SI-LABEL: s_sext_v4i8_to_v4i32:
 342 ; SI:       ; %bb.0:
 343 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 344 ; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
 345 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 346 ; SI-NEXT:    s_mov_b32 s6, -1
 347 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 348 ; SI-NEXT:    s_ashr_i32 s1, s0, 24
 349 ; SI-NEXT:    s_bfe_i32 s2, s0, 0x80010
 350 ; SI-NEXT:    s_bfe_i32 s3, s0, 0x80008
 351 ; SI-NEXT:    s_sext_i32_i8 s0, s0
 352 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 353 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 354 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 355 ; SI-NEXT:    v_mov_b32_e32 v0, s3
 356 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 357 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 358 ; SI-NEXT:    v_mov_b32_e32 v0, s2
 359 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 360 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 361 ; SI-NEXT:    v_mov_b32_e32 v0, s1
 362 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 363 ; SI-NEXT:    s_waitcnt vmcnt(0)
 364 ; SI-NEXT:    s_endpgm
 365 ;
 366 ; VI-LABEL: s_sext_v4i8_to_v4i32:
 367 ; VI:       ; %bb.0:
 368 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 369 ; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
 370 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 371 ; VI-NEXT:    s_mov_b32 s6, -1
 372 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 373 ; VI-NEXT:    v_lshrrev_b16_e64 v0, 8, s0
 374 ; VI-NEXT:    s_ashr_i32 s1, s0, 24
 375 ; VI-NEXT:    s_bfe_i32 s2, s0, 0x80010
 376 ; VI-NEXT:    s_sext_i32_i8 s0, s0
 377 ; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
 378 ; VI-NEXT:    v_mov_b32_e32 v1, s0
 379 ; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
 380 ; VI-NEXT:    s_waitcnt vmcnt(0)
 381 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 382 ; VI-NEXT:    s_waitcnt vmcnt(0)
 383 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 384 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 385 ; VI-NEXT:    s_waitcnt vmcnt(0)
 386 ; VI-NEXT:    v_mov_b32_e32 v0, s1
 387 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 388 ; VI-NEXT:    s_waitcnt vmcnt(0)
 389 ; VI-NEXT:    s_endpgm
 390   %cast = bitcast i32 %a to <4 x i8> ; reinterpret the scalar argument as four i8 lanes
 391   %ext = sext <4 x i8> %cast to <4 x i32> ; for the tonga run the checks still contain a v_lshrrev_b16 + v_bfe_i32 pair for one lane
 392   %elt0 = extractelement <4 x i32> %ext, i32 0
 393   %elt1 = extractelement <4 x i32> %ext, i32 1
 394   %elt2 = extractelement <4 x i32> %ext, i32 2
 395   %elt3 = extractelement <4 x i32> %ext, i32 3
 396   store volatile i32 %elt0, i32 addrspace(1)* %out ; four volatile stores to the same address; the checks expect four separate buffer_store_dword
 397   store volatile i32 %elt1, i32 addrspace(1)* %out
 398   store volatile i32 %elt2, i32 addrspace(1)* %out
 399   store volatile i32 %elt3, i32 addrspace(1)* %out
 400   ret void
 401 }
 402
 403 ; FIXME: need to optimize same sequence as above test to avoid
 404 ; this shift.
 405 define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { ; sign-extends the four bytes of an i32 loaded from %in to i32 and stores each one to %out
 406 ; SI-LABEL: v_sext_v4i8_to_v4i32:
 407 ; SI:       ; %bb.0:
 408 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 409 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 410 ; SI-NEXT:    s_mov_b32 s2, -1
 411 ; SI-NEXT:    s_mov_b32 s10, s2
 412 ; SI-NEXT:    s_mov_b32 s11, s3
 413 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 414 ; SI-NEXT:    s_mov_b32 s8, s6
 415 ; SI-NEXT:    s_mov_b32 s9, s7
 416 ; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 417 ; SI-NEXT:    s_mov_b32 s0, s4
 418 ; SI-NEXT:    s_mov_b32 s1, s5
 419 ; SI-NEXT:    s_waitcnt vmcnt(0)
 420 ; SI-NEXT:    v_ashrrev_i32_e32 v1, 24, v0
 421 ; SI-NEXT:    v_bfe_i32 v2, v0, 16, 8
 422 ; SI-NEXT:    v_bfe_i32 v3, v0, 8, 8
 423 ; SI-NEXT:    v_bfe_i32 v0, v0, 0, 8
 424 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 425 ; SI-NEXT:    s_waitcnt vmcnt(0)
 426 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], 0
 427 ; SI-NEXT:    s_waitcnt vmcnt(0)
 428 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
 429 ; SI-NEXT:    s_waitcnt vmcnt(0)
 430 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 431 ; SI-NEXT:    s_waitcnt vmcnt(0)
 432 ; SI-NEXT:    s_endpgm
 433 ;
 434 ; VI-LABEL: v_sext_v4i8_to_v4i32:
 435 ; VI:       ; %bb.0:
 436 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 437 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 438 ; VI-NEXT:    s_mov_b32 s2, -1
 439 ; VI-NEXT:    s_mov_b32 s10, s2
 440 ; VI-NEXT:    s_mov_b32 s11, s3
 441 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 442 ; VI-NEXT:    s_mov_b32 s8, s6
 443 ; VI-NEXT:    s_mov_b32 s9, s7
 444 ; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 445 ; VI-NEXT:    s_mov_b32 s0, s4
 446 ; VI-NEXT:    s_mov_b32 s1, s5
 447 ; VI-NEXT:    s_waitcnt vmcnt(0)
 448 ; VI-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
 449 ; VI-NEXT:    v_ashrrev_i32_e32 v2, 24, v0
 450 ; VI-NEXT:    v_bfe_i32 v3, v0, 16, 8
 451 ; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
 452 ; VI-NEXT:    v_bfe_i32 v1, v1, 0, 8
 453 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 454 ; VI-NEXT:    s_waitcnt vmcnt(0)
 455 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 456 ; VI-NEXT:    s_waitcnt vmcnt(0)
 457 ; VI-NEXT:    buffer_store_dword v3, off, s[0:3], 0
 458 ; VI-NEXT:    s_waitcnt vmcnt(0)
 459 ; VI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
 460 ; VI-NEXT:    s_waitcnt vmcnt(0)
 461 ; VI-NEXT:    s_endpgm
 462   %a = load i32, i32 addrspace(1)* %in ; the source value comes from memory; the checks expect a buffer_load_dword
 463   %cast = bitcast i32 %a to <4 x i8> ; reinterpret the loaded dword as four i8 lanes
 464   %ext = sext <4 x i8> %cast to <4 x i32> ; for the tonga run the checks still contain the extra v_lshrrev_b16 shift for one lane
 465   %elt0 = extractelement <4 x i32> %ext, i32 0
 466   %elt1 = extractelement <4 x i32> %ext, i32 1
 467   %elt2 = extractelement <4 x i32> %ext, i32 2
 468   %elt3 = extractelement <4 x i32> %ext, i32 3
 469   store volatile i32 %elt0, i32 addrspace(1)* %out ; four volatile stores to the same address; the checks expect four separate buffer_store_dword
 470   store volatile i32 %elt1, i32 addrspace(1)* %out
 471   store volatile i32 %elt2, i32 addrspace(1)* %out
 472   store volatile i32 %elt3, i32 addrspace(1)* %out
 473   ret void
 474 }
 475
 476 ; FIXME: s_bfe_i64, same on SI and VI
 477 define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind { ; sign-extends the four i16 lanes of kernel argument %a to i32 and stores each one to %out
 478 ; SI-LABEL: s_sext_v4i16_to_v4i32:
 479 ; SI:       ; %bb.0:
 480 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 481 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 482 ; SI-NEXT:    s_mov_b32 s2, -1
 483 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 484 ; SI-NEXT:    s_mov_b32 s0, s4
 485 ; SI-NEXT:    s_mov_b32 s1, s5
 486 ; SI-NEXT:    s_ashr_i64 s[4:5], s[6:7], 48
 487 ; SI-NEXT:    s_ashr_i32 s5, s6, 16
 488 ; SI-NEXT:    s_sext_i32_i16 s6, s6
 489 ; SI-NEXT:    v_mov_b32_e32 v0, s6
 490 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 491 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 492 ; SI-NEXT:    v_mov_b32_e32 v0, s5
 493 ; SI-NEXT:    s_sext_i32_i16 s7, s7
 494 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 495 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 496 ; SI-NEXT:    v_mov_b32_e32 v0, s7
 497 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 498 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 499 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 500 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 501 ; SI-NEXT:    s_waitcnt vmcnt(0)
 502 ; SI-NEXT:    s_endpgm
 503 ;
 504 ; VI-LABEL: s_sext_v4i16_to_v4i32:
 505 ; VI:       ; %bb.0:
 506 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 507 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 508 ; VI-NEXT:    s_mov_b32 s2, -1
 509 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 510 ; VI-NEXT:    s_mov_b32 s1, s5
 511 ; VI-NEXT:    s_ashr_i32 s5, s6, 16
 512 ; VI-NEXT:    s_sext_i32_i16 s6, s6
 513 ; VI-NEXT:    s_mov_b32 s0, s4
 514 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 515 ; VI-NEXT:    s_ashr_i32 s4, s7, 16
 516 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 517 ; VI-NEXT:    s_waitcnt vmcnt(0)
 518 ; VI-NEXT:    v_mov_b32_e32 v0, s5
 519 ; VI-NEXT:    s_sext_i32_i16 s7, s7
 520 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 521 ; VI-NEXT:    s_waitcnt vmcnt(0)
 522 ; VI-NEXT:    v_mov_b32_e32 v0, s7
 523 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 524 ; VI-NEXT:    s_waitcnt vmcnt(0)
 525 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 526 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 527 ; VI-NEXT:    s_waitcnt vmcnt(0)
 528 ; VI-NEXT:    s_endpgm
 529   %cast = bitcast i64 %a to <4 x i16> ; reinterpret the scalar argument as four i16 lanes
 530   %ext = sext <4 x i16> %cast to <4 x i32> ; the last lane differs per target in the checks, s_ashr_i64 by 48 for tahiti and s_ashr_i32 by 16 for tonga
 531   %elt0 = extractelement <4 x i32> %ext, i32 0
 532   %elt1 = extractelement <4 x i32> %ext, i32 1
 533   %elt2 = extractelement <4 x i32> %ext, i32 2
 534   %elt3 = extractelement <4 x i32> %ext, i32 3
 535   store volatile i32 %elt0, i32 addrspace(1)* %out ; four volatile stores to the same address; the checks expect four separate buffer_store_dword
 536   store volatile i32 %elt1, i32 addrspace(1)* %out
 537   store volatile i32 %elt2, i32 addrspace(1)* %out
 538   store volatile i32 %elt3, i32 addrspace(1)* %out
 539   ret void
 540 }
 541
 542 define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { ; sign-extends the four i16 lanes of an i64 loaded from %in to i32 and stores each one to %out
 543 ; SI-LABEL: v_sext_v4i16_to_v4i32:
 544 ; SI:       ; %bb.0:
 545 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 546 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 547 ; SI-NEXT:    s_mov_b32 s2, -1
 548 ; SI-NEXT:    s_mov_b32 s10, s2
 549 ; SI-NEXT:    s_mov_b32 s11, s3
 550 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 551 ; SI-NEXT:    s_mov_b32 s8, s6
 552 ; SI-NEXT:    s_mov_b32 s9, s7
 553 ; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
 554 ; SI-NEXT:    s_mov_b32 s0, s4
 555 ; SI-NEXT:    s_mov_b32 s1, s5
 556 ; SI-NEXT:    s_waitcnt vmcnt(0)
 557 ; SI-NEXT:    v_ashr_i64 v[2:3], v[0:1], 48
 558 ; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
 559 ; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
 560 ; SI-NEXT:    v_bfe_i32 v1, v1, 0, 16
 561 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 562 ; SI-NEXT:    s_waitcnt vmcnt(0)
 563 ; SI-NEXT:    buffer_store_dword v3, off, s[0:3], 0
 564 ; SI-NEXT:    s_waitcnt vmcnt(0)
 565 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 566 ; SI-NEXT:    s_waitcnt vmcnt(0)
 567 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
 568 ; SI-NEXT:    s_waitcnt vmcnt(0)
 569 ; SI-NEXT:    s_endpgm
 570 ;
 571 ; VI-LABEL: v_sext_v4i16_to_v4i32:
 572 ; VI:       ; %bb.0:
 573 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 574 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 575 ; VI-NEXT:    s_mov_b32 s2, -1
 576 ; VI-NEXT:    s_mov_b32 s10, s2
 577 ; VI-NEXT:    s_mov_b32 s11, s3
 578 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 579 ; VI-NEXT:    s_mov_b32 s8, s6
 580 ; VI-NEXT:    s_mov_b32 s9, s7
 581 ; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
 582 ; VI-NEXT:    s_mov_b32 s0, s4
 583 ; VI-NEXT:    s_mov_b32 s1, s5
 584 ; VI-NEXT:    s_waitcnt vmcnt(0)
 585 ; VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
 586 ; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
 587 ; VI-NEXT:    v_ashrrev_i32_e32 v2, 16, v1
 588 ; VI-NEXT:    v_bfe_i32 v1, v1, 0, 16
 589 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 590 ; VI-NEXT:    s_waitcnt vmcnt(0)
 591 ; VI-NEXT:    buffer_store_dword v3, off, s[0:3], 0
 592 ; VI-NEXT:    s_waitcnt vmcnt(0)
 593 ; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 594 ; VI-NEXT:    s_waitcnt vmcnt(0)
 595 ; VI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
 596 ; VI-NEXT:    s_waitcnt vmcnt(0)
 597 ; VI-NEXT:    s_endpgm
 598   %a = load i64, i64 addrspace(1)* %in ; the source value comes from memory; the checks expect a buffer_load_dwordx2
 599   %cast = bitcast i64 %a to <4 x i16> ; reinterpret the loaded value as four i16 lanes
 600   %ext = sext <4 x i16> %cast to <4 x i32> ; the last lane differs per target in the checks, v_ashr_i64 by 48 for tahiti and v_ashrrev_i32 by 16 for tonga
 601   %elt0 = extractelement <4 x i32> %ext, i32 0
 602   %elt1 = extractelement <4 x i32> %ext, i32 1
 603   %elt2 = extractelement <4 x i32> %ext, i32 2
 604   %elt3 = extractelement <4 x i32> %ext, i32 3
 605   store volatile i32 %elt0, i32 addrspace(1)* %out ; four volatile stores to the same address; the checks expect four separate buffer_store_dword
 606   store volatile i32 %elt1, i32 addrspace(1)* %out
 607   store volatile i32 %elt2, i32 addrspace(1)* %out
 608   store volatile i32 %elt3, i32 addrspace(1)* %out
 609   ret void
 610 }
 611
 612 declare i32 @llvm.amdgcn.workitem.id.x() #1 ; intrinsic called by @v_sext_i1_to_i16_with_and
 613
 614 attributes #1 = { nounwind readnone } ; attribute group used by the intrinsic declaration and its call site