; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=SI
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=VI

define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, s3
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, s3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i32
  store i32 %sext, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @test_s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: test_s_sext_i32_to_i64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dword s8, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mul_i32 s4, s6, s7
; SI-NEXT:    s_add_i32 s4, s4, s8
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_ashr_i32 s5, s4, 31
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_s_sext_i32_to_i64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dword s8, s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mul_i32 s4, s6, s7
; VI-NEXT:    s_add_i32 s4, s4, s8
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_ashr_i32 s5, s4, 31
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
entry:
  %mul = mul i32 %a, %b
  %add = add i32 %mul, %c
  %sext = sext i32 %add to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, s3
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, s3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) nounwind {
; SI-LABEL: s_sext_i32_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s5, s4, 31
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i32_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s5, s4, 31
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %sext = sext i32 %a to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: v_sext_i32_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_i32_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load i32, ptr addrspace(1) %in, align 4
  %sext = sext i32 %val to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) nounwind {
; SI-LABEL: s_sext_i16_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i16_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %sext = sext i16 %a to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, s3
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, s3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i16
  store i16 %sext, ptr addrspace(1) %out
  ret void
}

; The purpose of this test is to make sure the i16 = sign_extend i1 node
; makes it all the way through the legalizer/optimizer to make sure
; we select this correctly.  In the s_sext_i1_to_i16, the sign_extend node
; is optimized to a select very early.
define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; SI-LABEL: s_sext_i1_to_i16_with_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s4, s5
; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; SI-NEXT:    s_cmp_eq_u32 s6, s7
; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
; SI-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16_with_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s4, s5
; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; VI-NEXT:    s_cmp_eq_u32 s6, s7
; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
; VI-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %cmp0 = icmp eq i32 %a, %b
  %cmp1 = icmp eq i32 %c, %d
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: v_sext_i1_to_i16_with_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dword s8, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_cmp_eq_u32 s7, s8
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v0
; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; SI-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_i1_to_i16_with_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dword s8, s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_cmp_eq_u32 s7, s8
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v0
; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; VI-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %cmp0 = icmp eq i32 %a, %tid
  %cmp1 = icmp eq i32 %b, %c
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, ptr addrspace(1) %out
  ret void
}

; FIXME: We end up with a v_bfe instruction, because the i16 srl
; gets selected to a v_lshrrev_b16 instruction, so the input to
; the bfe is a vector register. To fix this we need to be able to
; optimize:
; t29: i16 = truncate t10
; t55: i16 = srl t29, Constant:i32<8>
; t63: i32 = any_extend t55
; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8
define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) nounwind {
; SI-LABEL: s_sext_v4i8_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s5, s4, 24
; SI-NEXT:    s_bfe_i32 s6, s4, 0x80010
; SI-NEXT:    s_bfe_i32 s7, s4, 0x80008
; SI-NEXT:    s_sext_i32_i8 s4, s4
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_v4i8_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e64 v0, 8, s4
; VI-NEXT:    s_ashr_i32 s5, s4, 24
; VI-NEXT:    s_bfe_i32 s6, s4, 0x80010
; VI-NEXT:    s_sext_i32_i8 s4, s4
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

; FIXME: need to optimize same sequence as above test to avoid
define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: v_sext_v4i8_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v1, 24, v0
; SI-NEXT:    v_bfe_i32 v2, v0, 16, 8
; SI-NEXT:    v_bfe_i32 v3, v0, 8, 8
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_v4i8_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT:    v_ashrrev_i32_e32 v2, 24, v0
; VI-NEXT:    v_bfe_i32 v3, v0, 16, 8
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; VI-NEXT:    v_bfe_i32 v1, v1, 0, 8
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %a = load i32, ptr addrspace(1) %in
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

; FIXME: s_bfe_i64, same on SI and VI
define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) nounwind {
; SI-LABEL: s_sext_v4i16_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_ashr_i64 s[0:1], s[2:3], 48
; SI-NEXT:    s_ashr_i32 s1, s2, 16
; SI-NEXT:    s_sext_i32_i16 s2, s2
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    s_sext_i32_i16 s3, s3
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_v4i16_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_ashr_i32 s1, s2, 16
; VI-NEXT:    s_sext_i32_i16 s2, s2
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    s_ashr_i32 s0, s3, 16
; VI-NEXT:    s_sext_i32_i16 s3, s3
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: v_sext_v4i16_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[2:3], v[0:1], 48
; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v1, 0, 16
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_v4i16_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_ashrrev_i32_e32 v2, 16, v1
; VI-NEXT:    v_bfe_i32 v1, v1, 0, 16
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %a = load i64, ptr addrspace(1) %in
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #1 = { nounwind readnone }