llvm/test/CodeGen/AMDGPU/bfe-patterns.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
   3 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
   4
   5 define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
   6 ; SI-LABEL: v_ubfe_sub_i32:
   7 ; SI:       ; %bb.0:
   8 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
   9 ; SI-NEXT:    s_mov_b32 s7, 0xf000
  10 ; SI-NEXT:    s_mov_b32 s6, 0
  11 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
  12 ; SI-NEXT:    v_mov_b32_e32 v1, 0
  13 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
  14 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
  15 ; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
  16 ; SI-NEXT:    s_waitcnt vmcnt(0)
  17 ; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
  18 ; SI-NEXT:    s_waitcnt vmcnt(0)
  19 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
  20 ; SI-NEXT:    v_sub_i32_e32 v3, vcc, 32, v3
  21 ; SI-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
  22 ; SI-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
  23 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
  24 ; SI-NEXT:    s_endpgm
  25 ;
  26 ; VI-LABEL: v_ubfe_sub_i32:
  27 ; VI:       ; %bb.0:
  28 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
  29 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
  30 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
  31 ; VI-NEXT:    v_mov_b32_e32 v1, s3
  32 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
  33 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
  34 ; VI-NEXT:    flat_load_dword v3, v[0:1] glc
  35 ; VI-NEXT:    s_waitcnt vmcnt(0)
  36 ; VI-NEXT:    flat_load_dword v4, v[0:1] glc
  37 ; VI-NEXT:    s_waitcnt vmcnt(0)
  38 ; VI-NEXT:    v_mov_b32_e32 v1, s1
  39 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
  40 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
  41 ; VI-NEXT:    v_sub_u32_e32 v2, vcc, 32, v4
  42 ; VI-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
  43 ; VI-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
  44 ; VI-NEXT:    flat_store_dword v[0:1], v2
  45 ; VI-NEXT:    s_endpgm
     ; Unsigned "keep the low %width bits" pattern on per-workitem values:
     ; %src is shifted left by (32 - %width) and then logically shifted right by
     ; the same amount. Both inputs are volatile loads addressed by the workitem
     ; id, and the result is stored to %out at the same index.
     ; The autogenerated checks above expect both targets to emit an explicit
     ; subtract, v_lshlrev_b32 and v_lshrrev_b32 rather than one extract op.
     ; NOTE(review): %width is loaded from %in0.gep, which leaves %in1.gep unused.
     ; This looks like it was meant to read %in1.gep -- confirm. Changing it
     ; alters the generated code, so the checks would have to be regenerated
     ; with utils/update_llc_test_checks.py.
  46   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  47   %in0.gep = getelementptr i32, ptr addrspace(1) %in0, i32 %id.x
  48   %in1.gep = getelementptr i32, ptr addrspace(1) %in1, i32 %id.x
  49   %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
  50   %src = load volatile i32, ptr addrspace(1) %in0.gep
  51   %width = load volatile i32, ptr addrspace(1) %in0.gep
  52   %sub = sub i32 32, %width
  53   %shl = shl i32 %src, %sub
  54   %bfe = lshr i32 %shl, %sub
  55   store i32 %bfe, ptr addrspace(1) %out.gep
  56   ret void
  57 }
  58
  59 define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
  60 ; SI-LABEL: v_ubfe_sub_multi_use_shl_i32:
  61 ; SI:       ; %bb.0:
  62 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
  63 ; SI-NEXT:    s_mov_b32 s6, 0
  64 ; SI-NEXT:    s_mov_b32 s7, 0xf000
  65 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
  66 ; SI-NEXT:    v_mov_b32_e32 v1, 0
  67 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
  68 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
  69 ; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
  70 ; SI-NEXT:    s_waitcnt vmcnt(0)
  71 ; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
  72 ; SI-NEXT:    s_waitcnt vmcnt(0)
  73 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
  74 ; SI-NEXT:    s_mov_b32 s6, -1
  75 ; SI-NEXT:    v_sub_i32_e32 v3, vcc, 32, v3
  76 ; SI-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
  77 ; SI-NEXT:    v_lshrrev_b32_e32 v3, v3, v2
  78 ; SI-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
  79 ; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
  80 ; SI-NEXT:    s_waitcnt vmcnt(0)
  81 ; SI-NEXT:    s_endpgm
  82 ;
  83 ; VI-LABEL: v_ubfe_sub_multi_use_shl_i32:
  84 ; VI:       ; %bb.0:
  85 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
  86 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
  87 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
  88 ; VI-NEXT:    v_mov_b32_e32 v1, s3
  89 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
  90 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
  91 ; VI-NEXT:    flat_load_dword v3, v[0:1] glc
  92 ; VI-NEXT:    s_waitcnt vmcnt(0)
  93 ; VI-NEXT:    flat_load_dword v4, v[0:1] glc
  94 ; VI-NEXT:    s_waitcnt vmcnt(0)
  95 ; VI-NEXT:    v_mov_b32_e32 v1, s1
  96 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
  97 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
  98 ; VI-NEXT:    v_sub_u32_e32 v2, vcc, 32, v4
  99 ; VI-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
 100 ; VI-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
 101 ; VI-NEXT:    flat_store_dword v[0:1], v2
 102 ; VI-NEXT:    flat_store_dword v[0:1], v3
 103 ; VI-NEXT:    s_waitcnt vmcnt(0)
 104 ; VI-NEXT:    s_endpgm
     ; Unsigned shl / lshr by (32 - %width) on per-workitem values, where the
     ; intermediate %shl has a second use: besides feeding the lshr it is also
     ; written with a volatile store to an undef address.
     ; The checks above expect the two shifts to stay separate and expect two
     ; stores, one for the lshr result and one for the shl result.
     ; NOTE(review): %width is loaded from %in0.gep, which leaves %in1.gep unused.
     ; This looks like it was meant to read %in1.gep -- confirm. Changing it
     ; alters the generated code, so the checks would have to be regenerated
     ; with utils/update_llc_test_checks.py.
 105   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
 106   %in0.gep = getelementptr i32, ptr addrspace(1) %in0, i32 %id.x
 107   %in1.gep = getelementptr i32, ptr addrspace(1) %in1, i32 %id.x
 108   %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
 109   %src = load volatile i32, ptr addrspace(1) %in0.gep
 110   %width = load volatile i32, ptr addrspace(1) %in0.gep
 111   %sub = sub i32 32, %width
 112   %shl = shl i32 %src, %sub
 113   %bfe = lshr i32 %shl, %sub
 114   store i32 %bfe, ptr addrspace(1) %out.gep
 115   store volatile i32 %shl, ptr addrspace(1) undef
 116   ret void
 117 }
 118
 119 define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
 120 ; SI-LABEL: s_ubfe_sub_i32:
 121 ; SI:       ; %bb.0:
 122 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 123 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 124 ; SI-NEXT:    s_mov_b32 s6, 0
 125 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 126 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 127 ; SI-NEXT:    s_sub_i32 s3, 32, s3
 128 ; SI-NEXT:    s_lshl_b32 s2, s2, s3
 129 ; SI-NEXT:    s_lshr_b32 s2, s2, s3
 130 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 131 ; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 132 ; SI-NEXT:    v_mov_b32_e32 v2, s2
 133 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
 134 ; SI-NEXT:    s_endpgm
 135 ;
 136 ; VI-LABEL: s_ubfe_sub_i32:
 137 ; VI:       ; %bb.0:
 138 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 139 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 140 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 141 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 142 ; VI-NEXT:    s_sub_i32 s0, 32, s3
 143 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 144 ; VI-NEXT:    s_lshl_b32 s1, s2, s0
 145 ; VI-NEXT:    s_lshr_b32 s0, s1, s0
 146 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 147 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 148 ; VI-NEXT:    flat_store_dword v[0:1], v2
 149 ; VI-NEXT:    s_endpgm
     ; Unsigned shl / lshr by (32 - %width) where %src and %width are i32 kernel
     ; arguments instead of loaded values. Only the store address depends on the
     ; workitem id.
     ; The checks above expect the arithmetic to be done with s_sub_i32,
     ; s_lshl_b32 and s_lshr_b32, and the result to be copied into a v register
     ; for the store.
 150   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
 151   %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
 152   %sub = sub i32 32, %width
 153   %shl = shl i32 %src, %sub
 154   %bfe = lshr i32 %shl, %sub
 155   store i32 %bfe, ptr addrspace(1) %out.gep
 156   ret void
 157 }
 158
 159 define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
 160 ; SI-LABEL: s_ubfe_sub_multi_use_shl_i32:
 161 ; SI:       ; %bb.0:
 162 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 163 ; SI-NEXT:    s_mov_b32 s6, 0
 164 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 165 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 166 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 167 ; SI-NEXT:    s_sub_i32 s3, 32, s3
 168 ; SI-NEXT:    s_lshl_b32 s2, s2, s3
 169 ; SI-NEXT:    s_lshr_b32 s3, s2, s3
 170 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 171 ; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 172 ; SI-NEXT:    v_mov_b32_e32 v2, s3
 173 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
 174 ; SI-NEXT:    s_mov_b32 s6, -1
 175 ; SI-NEXT:    v_mov_b32_e32 v0, s2
 176 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 177 ; SI-NEXT:    s_waitcnt vmcnt(0)
 178 ; SI-NEXT:    s_endpgm
 179 ;
 180 ; VI-LABEL: s_ubfe_sub_multi_use_shl_i32:
 181 ; VI:       ; %bb.0:
 182 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 183 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 184 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 185 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 186 ; VI-NEXT:    s_sub_i32 s0, 32, s3
 187 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 188 ; VI-NEXT:    s_lshl_b32 s1, s2, s0
 189 ; VI-NEXT:    s_lshr_b32 s0, s1, s0
 190 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 191 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 192 ; VI-NEXT:    flat_store_dword v[0:1], v2
 193 ; VI-NEXT:    v_mov_b32_e32 v0, s1
 194 ; VI-NEXT:    flat_store_dword v[0:1], v0
 195 ; VI-NEXT:    s_waitcnt vmcnt(0)
 196 ; VI-NEXT:    s_endpgm
     ; Unsigned shl / lshr by (32 - %width) on i32 kernel arguments, where the
     ; intermediate %shl has a second use: it is also written with a volatile
     ; store to an undef address.
     ; The checks above expect s_lshl_b32 and s_lshr_b32 to stay separate and
     ; expect two stores, one for the lshr result and one for the shl result.
 197   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
 198   %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
 199   %sub = sub i32 32, %width
 200   %shl = shl i32 %src, %sub
 201   %bfe = lshr i32 %shl, %sub
 202   store i32 %bfe, ptr addrspace(1) %out.gep
 203   store volatile i32 %shl, ptr addrspace(1) undef
 204   ret void
 205 }
 206
 207 define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
 208 ; SI-LABEL: v_sbfe_sub_i32:
 209 ; SI:       ; %bb.0:
 210 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 211 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 212 ; SI-NEXT:    s_mov_b32 s6, 0
 213 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 214 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 215 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 216 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 217 ; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
 218 ; SI-NEXT:    s_waitcnt vmcnt(0)
 219 ; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
 220 ; SI-NEXT:    s_waitcnt vmcnt(0)
 221 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 222 ; SI-NEXT:    v_sub_i32_e32 v3, vcc, 32, v3
 223 ; SI-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
 224 ; SI-NEXT:    v_ashrrev_i32_e32 v2, v3, v2
 225 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 226 ; SI-NEXT:    s_endpgm
 227 ;
 228 ; VI-LABEL: v_sbfe_sub_i32:
 229 ; VI:       ; %bb.0:
 230 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 231 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 232 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 233 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 234 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 235 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 236 ; VI-NEXT:    flat_load_dword v3, v[0:1] glc
 237 ; VI-NEXT:    s_waitcnt vmcnt(0)
 238 ; VI-NEXT:    flat_load_dword v4, v[0:1] glc
 239 ; VI-NEXT:    s_waitcnt vmcnt(0)
 240 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 241 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 242 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 243 ; VI-NEXT:    v_sub_u32_e32 v2, vcc, 32, v4
 244 ; VI-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
 245 ; VI-NEXT:    v_ashrrev_i32_e32 v2, v2, v3
 246 ; VI-NEXT:    flat_store_dword v[0:1], v2
 247 ; VI-NEXT:    s_endpgm
     ; Signed counterpart of the shl / lshr pattern on per-workitem values:
     ; %src is shifted left by (32 - %width) and then arithmetically shifted
     ; right by the same amount, which sign-extends the low %width bits. Both
     ; inputs are volatile loads addressed by the workitem id.
     ; The autogenerated checks above expect both targets to emit an explicit
     ; subtract, v_lshlrev_b32 and v_ashrrev_i32 rather than one extract op.
     ; NOTE(review): %width is loaded from %in0.gep, which leaves %in1.gep unused.
     ; This looks like it was meant to read %in1.gep -- confirm. Changing it
     ; alters the generated code, so the checks would have to be regenerated
     ; with utils/update_llc_test_checks.py.
 248   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
 249   %in0.gep = getelementptr i32, ptr addrspace(1) %in0, i32 %id.x
 250   %in1.gep = getelementptr i32, ptr addrspace(1) %in1, i32 %id.x
 251   %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
 252   %src = load volatile i32, ptr addrspace(1) %in0.gep
 253   %width = load volatile i32, ptr addrspace(1) %in0.gep
 254   %sub = sub i32 32, %width
 255   %shl = shl i32 %src, %sub
 256   %bfe = ashr i32 %shl, %sub
 257   store i32 %bfe, ptr addrspace(1) %out.gep
 258   ret void
 259 }
 260
 261 define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
 262 ; SI-LABEL: v_sbfe_sub_multi_use_shl_i32:
 263 ; SI:       ; %bb.0:
 264 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 265 ; SI-NEXT:    s_mov_b32 s6, 0
 266 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 267 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 268 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 269 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 270 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 271 ; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
 272 ; SI-NEXT:    s_waitcnt vmcnt(0)
 273 ; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
 274 ; SI-NEXT:    s_waitcnt vmcnt(0)
 275 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 276 ; SI-NEXT:    s_mov_b32 s6, -1
 277 ; SI-NEXT:    v_sub_i32_e32 v3, vcc, 32, v3
 278 ; SI-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
 279 ; SI-NEXT:    v_ashrrev_i32_e32 v3, v3, v2
 280 ; SI-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
 281 ; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
 282 ; SI-NEXT:    s_waitcnt vmcnt(0)
 283 ; SI-NEXT:    s_endpgm
 284 ;
 285 ; VI-LABEL: v_sbfe_sub_multi_use_shl_i32:
 286 ; VI:       ; %bb.0:
 287 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 288 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 289 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 290 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 291 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 292 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 293 ; VI-NEXT:    flat_load_dword v3, v[0:1] glc
 294 ; VI-NEXT:    s_waitcnt vmcnt(0)
 295 ; VI-NEXT:    flat_load_dword v4, v[0:1] glc
 296 ; VI-NEXT:    s_waitcnt vmcnt(0)
 297 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 298 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 299 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 300 ; VI-NEXT:    v_sub_u32_e32 v2, vcc, 32, v4
 301 ; VI-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
 302 ; VI-NEXT:    v_ashrrev_i32_e32 v2, v2, v3
 303 ; VI-NEXT:    flat_store_dword v[0:1], v2
 304 ; VI-NEXT:    flat_store_dword v[0:1], v3
 305 ; VI-NEXT:    s_waitcnt vmcnt(0)
 306 ; VI-NEXT:    s_endpgm
     ; Signed shl / ashr by (32 - %width) on per-workitem values, where the
     ; intermediate %shl has a second use: besides feeding the ashr it is also
     ; written with a volatile store to an undef address.
     ; The checks above expect the two shifts to stay separate and expect two
     ; stores, one for the ashr result and one for the shl result.
     ; NOTE(review): %width is loaded from %in0.gep, which leaves %in1.gep unused.
     ; This looks like it was meant to read %in1.gep -- confirm. Changing it
     ; alters the generated code, so the checks would have to be regenerated
     ; with utils/update_llc_test_checks.py.
 307   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
 308   %in0.gep = getelementptr i32, ptr addrspace(1) %in0, i32 %id.x
 309   %in1.gep = getelementptr i32, ptr addrspace(1) %in1, i32 %id.x
 310   %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
 311   %src = load volatile i32, ptr addrspace(1) %in0.gep
 312   %width = load volatile i32, ptr addrspace(1) %in0.gep
 313   %sub = sub i32 32, %width
 314   %shl = shl i32 %src, %sub
 315   %bfe = ashr i32 %shl, %sub
 316   store i32 %bfe, ptr addrspace(1) %out.gep
 317   store volatile i32 %shl, ptr addrspace(1) undef
 318   ret void
 319 }
 320
 321 define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
 322 ; SI-LABEL: s_sbfe_sub_i32:
 323 ; SI:       ; %bb.0:
 324 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 325 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 326 ; SI-NEXT:    s_mov_b32 s6, 0
 327 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 328 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 329 ; SI-NEXT:    s_sub_i32 s3, 32, s3
 330 ; SI-NEXT:    s_lshl_b32 s2, s2, s3
 331 ; SI-NEXT:    s_ashr_i32 s2, s2, s3
 332 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 333 ; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 334 ; SI-NEXT:    v_mov_b32_e32 v2, s2
 335 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
 336 ; SI-NEXT:    s_endpgm
 337 ;
 338 ; VI-LABEL: s_sbfe_sub_i32:
 339 ; VI:       ; %bb.0:
 340 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 341 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 342 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 343 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 344 ; VI-NEXT:    s_sub_i32 s0, 32, s3
 345 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 346 ; VI-NEXT:    s_lshl_b32 s1, s2, s0
 347 ; VI-NEXT:    s_ashr_i32 s0, s1, s0
 348 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 349 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 350 ; VI-NEXT:    flat_store_dword v[0:1], v2
 351 ; VI-NEXT:    s_endpgm
     ; Signed shl / ashr by (32 - %width) where %src and %width are i32 kernel
     ; arguments instead of loaded values. Only the store address depends on the
     ; workitem id.
     ; The checks above expect the arithmetic to be done with s_sub_i32,
     ; s_lshl_b32 and s_ashr_i32, and the result to be copied into a v register
     ; for the store.
 352   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
 353   %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
 354   %sub = sub i32 32, %width
 355   %shl = shl i32 %src, %sub
 356   %bfe = ashr i32 %shl, %sub
 357   store i32 %bfe, ptr addrspace(1) %out.gep
 358   ret void
 359 }
 360
 361 define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
 362 ; SI-LABEL: s_sbfe_sub_multi_use_shl_i32:
 363 ; SI:       ; %bb.0:
 364 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 365 ; SI-NEXT:    s_mov_b32 s6, 0
 366 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 367 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 368 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 369 ; SI-NEXT:    s_sub_i32 s3, 32, s3
 370 ; SI-NEXT:    s_lshl_b32 s2, s2, s3
 371 ; SI-NEXT:    s_ashr_i32 s3, s2, s3
 372 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 373 ; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 374 ; SI-NEXT:    v_mov_b32_e32 v2, s3
 375 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
 376 ; SI-NEXT:    s_mov_b32 s6, -1
 377 ; SI-NEXT:    v_mov_b32_e32 v0, s2
 378 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 379 ; SI-NEXT:    s_waitcnt vmcnt(0)
 380 ; SI-NEXT:    s_endpgm
 381 ;
 382 ; VI-LABEL: s_sbfe_sub_multi_use_shl_i32:
 383 ; VI:       ; %bb.0:
 384 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 385 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 386 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 387 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 388 ; VI-NEXT:    s_sub_i32 s0, 32, s3
 389 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 390 ; VI-NEXT:    s_lshl_b32 s1, s2, s0
 391 ; VI-NEXT:    s_ashr_i32 s0, s1, s0
 392 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 393 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 394 ; VI-NEXT:    flat_store_dword v[0:1], v2
 395 ; VI-NEXT:    v_mov_b32_e32 v0, s1
 396 ; VI-NEXT:    flat_store_dword v[0:1], v0
 397 ; VI-NEXT:    s_waitcnt vmcnt(0)
 398 ; VI-NEXT:    s_endpgm
     ; Signed shl / ashr by (32 - %width) on i32 kernel arguments, where the
     ; intermediate %shl has a second use: it is also written with a volatile
     ; store to an undef address.
     ; The checks above expect s_lshl_b32 and s_ashr_i32 to stay separate and
     ; expect two stores, one for the ashr result and one for the shl result.
 399   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
 400   %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
 401   %sub = sub i32 32, %width
 402   %shl = shl i32 %src, %sub
 403   %bfe = ashr i32 %shl, %sub
 404   store i32 %bfe, ptr addrspace(1) %out.gep
 405   store volatile i32 %shl, ptr addrspace(1) undef
 406   ret void
 407 }
 408
 409 define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
 410 ; SI-LABEL: s_sbfe_or_shl_shl_uniform_i32:
 411 ; SI:       ; %bb.0:
 412 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 413 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 414 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 415 ; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
 416 ; SI-NEXT:    s_load_dword s4, s[4:5], 0x0
 417 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 418 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 419 ; SI-NEXT:    s_or_b32 s2, s2, s4
 420 ; SI-NEXT:    s_bfe_i32 s4, s2, 0xf0000
 421 ; SI-NEXT:    s_mov_b32 s2, -1
 422 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 423 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 424 ; SI-NEXT:    s_endpgm
 425 ;
 426 ; VI-LABEL: s_sbfe_or_shl_shl_uniform_i32:
 427 ; VI:       ; %bb.0:
 428 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 429 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 430 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 431 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 432 ; VI-NEXT:    s_load_dword s3, s[4:5], 0x0
 433 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 434 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 435 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 436 ; VI-NEXT:    s_or_b32 s0, s2, s3
 437 ; VI-NEXT:    s_bfe_i32 s0, s0, 0xf0000
 438 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 439 ; VI-NEXT:    flat_store_dword v[0:1], v2
 440 ; VI-NEXT:    s_endpgm
     ; ashr(or(shl(a, 17), shl(b, 17)), 17) with both shl amounts equal to the
     ; ashr amount. %a0 and %b0 are plain (non-volatile) loads from %in0 and %in1.
     ; The checks above expect the three shifts to disappear: both targets emit
     ; one s_or_b32 of the loaded values followed by one s_bfe_i32.
     ; NOTE(review): the 0xf0000 operand presumably encodes a 15-bit field
     ; (32 - 17) at offset 0 -- confirm against the ISA manual.
 441   %a0 = load i32, ptr addrspace(1) %in0
 442   %b0 = load i32, ptr addrspace(1) %in1
 443   %a1 = shl i32 %a0, 17
 444   %b1 = shl i32 %b0, 17
 445   %or = or i32 %a1, %b1
 446   %result = ashr i32 %or, 17
 447   store i32 %result, ptr addrspace(1) %out
 448   ret void
 449 }
 450
 451 ; TODO ashr(or(shl(x,c1),shl(y,c2)),c1) -> sign_extend_inreg(or(x,shl(y,c2-c1))) iff c2 >= c1
 452 define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) {
 453 ; SI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
 454 ; SI:       ; %bb.0:
 455 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 456 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 457 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 458 ; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
 459 ; SI-NEXT:    s_load_dword s4, s[4:5], 0x0
 460 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 461 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 462 ; SI-NEXT:    s_lshl_b32 s2, s2, 17
 463 ; SI-NEXT:    s_lshl_b32 s4, s4, 19
 464 ; SI-NEXT:    s_or_b32 s2, s2, s4
 465 ; SI-NEXT:    s_ashr_i32 s4, s2, 17
 466 ; SI-NEXT:    s_mov_b32 s2, -1
 467 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 468 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 469 ; SI-NEXT:    s_endpgm
 470 ;
 471 ; VI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
 472 ; VI:       ; %bb.0:
 473 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 474 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 475 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 476 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 477 ; VI-NEXT:    s_load_dword s3, s[4:5], 0x0
 478 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 479 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 480 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 481 ; VI-NEXT:    s_lshl_b32 s0, s2, 17
 482 ; VI-NEXT:    s_lshl_b32 s1, s3, 19
 483 ; VI-NEXT:    s_or_b32 s0, s0, s1
 484 ; VI-NEXT:    s_ashr_i32 s0, s0, 17
 485 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 486 ; VI-NEXT:    flat_store_dword v[0:1], v2
 487 ; VI-NEXT:    s_endpgm
     ; ashr(or(shl(a, 17), shl(b, 19)), 17): the second shl amount (19) is
     ; larger than the first shl amount and the ashr amount (both 17).
     ; The checks above record that no fold happens for this form: both targets
     ; still emit two s_lshl_b32, one s_or_b32 and one s_ashr_i32.
 488   %a0 = load i32, ptr addrspace(1) %x
 489   %b0 = load i32, ptr addrspace(1) %y
 490   %a1 = shl i32 %a0, 17
 491   %b1 = shl i32 %b0, 19
 492   %or = or i32 %a1, %b1
 493   %result = ashr i32 %or, 17
 494   store i32 %result, ptr addrspace(1) %out
 495   ret void
 496 }
 497
 498 ; Don't fold as 'other shl' amount is less than the sign_extend_inreg type.
 499 define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) {
 500 ; SI-LABEL: s_sbfe_or_shl_shl_toosmall_i32:
 501 ; SI:       ; %bb.0:
 502 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 503 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 504 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 505 ; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
 506 ; SI-NEXT:    s_load_dword s4, s[4:5], 0x0
 507 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 508 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 509 ; SI-NEXT:    s_lshl_b32 s2, s2, 17
 510 ; SI-NEXT:    s_lshl_b32 s4, s4, 16
 511 ; SI-NEXT:    s_or_b32 s2, s2, s4
 512 ; SI-NEXT:    s_ashr_i32 s4, s2, 17
 513 ; SI-NEXT:    s_mov_b32 s2, -1
 514 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 515 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 516 ; SI-NEXT:    s_endpgm
 517 ;
 518 ; VI-LABEL: s_sbfe_or_shl_shl_toosmall_i32:
 519 ; VI:       ; %bb.0:
 520 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 521 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 522 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 523 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 524 ; VI-NEXT:    s_load_dword s3, s[4:5], 0x0
 525 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 526 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 527 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 528 ; VI-NEXT:    s_lshl_b32 s0, s2, 17
 529 ; VI-NEXT:    s_lshl_b32 s1, s3, 16
 530 ; VI-NEXT:    s_or_b32 s0, s0, s1
 531 ; VI-NEXT:    s_ashr_i32 s0, s0, 17
 532 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 533 ; VI-NEXT:    flat_store_dword v[0:1], v2
 534 ; VI-NEXT:    s_endpgm
     ; ashr(or(shl(a, 17), shl(b, 16)), 17): the second shl amount (16) is
     ; smaller than the first shl amount and the ashr amount (both 17).
     ; Negative test: the checks above expect the unfolded sequence of two
     ; s_lshl_b32, one s_or_b32 and one s_ashr_i32 on both targets.
 535   %a0 = load i32, ptr addrspace(1) %x
 536   %b0 = load i32, ptr addrspace(1) %y
 537   %a1 = shl i32 %a0, 17
 538   %b1 = shl i32 %b0, 16
 539   %or = or i32 %a1, %b1
 540   %result = ashr i32 %or, 17
 541   store i32 %result, ptr addrspace(1) %out
 542   ret void
 543 }
 544
 545 declare i32 @llvm.amdgcn.workitem.id.x() #0 ; called by the *_sub_* kernels above to index their pointers
 546
 547 attributes #0 = { nounwind readnone } ; attached to the intrinsic declaration
 548 attributes #1 = { nounwind } ; attached to the *_sub_* kernel definitions