test/CodeGen/AMDGPU/copy-illegal-type.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
   3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-sdwa-peephole=0 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
   4
     ; The tahiti run is matched against the SI-prefixed assertions and the tonga
     ; run against the VI-prefixed ones. Both runs also pass the GCN and FUNC
     ; prefixes to FileCheck.
     ;
     ; workitem.id.x supplies the per-lane element index for the tests that
     ; address %in through a getelementptr.
     ; NOTE(review): workitem.id.y has no use in the functions visible in this
     ; part of the file - confirm nothing else needs it before removing it.
   5 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   6 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
   7
   8 define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
     ; Copies one <4 x i8> from %in[workitem.id.x] to %out, both with align 4.
     ; The assertions expect the whole vector to move as a single dword:
     ; one buffer_load_dword (SI) / flat_load_dword (VI) followed by one
     ; buffer_store_dword, with no per-byte accesses.
   9 ; SI-LABEL: test_copy_v4i8:
  10 ; SI:       ; %bb.0:
  11 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
  12 ; SI-NEXT:    s_mov_b32 s7, 0xf000
  13 ; SI-NEXT:    s_mov_b32 s10, 0
  14 ; SI-NEXT:    s_mov_b32 s11, s7
  15 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
  16 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
  17 ; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
  18 ; SI-NEXT:    v_mov_b32_e32 v1, 0
  19 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
  20 ; SI-NEXT:    s_mov_b32 s6, -1
  21 ; SI-NEXT:    s_mov_b32 s4, s0
  22 ; SI-NEXT:    s_mov_b32 s5, s1
  23 ; SI-NEXT:    s_waitcnt vmcnt(0)
  24 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
  25 ; SI-NEXT:    s_endpgm
  26 ;
  27 ; VI-LABEL: test_copy_v4i8:
  28 ; VI:       ; %bb.0:
  29 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
  30 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
  31 ; VI-NEXT:    s_mov_b32 s7, 0xf000
  32 ; VI-NEXT:    s_mov_b32 s6, -1
  33 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
  34 ; VI-NEXT:    v_mov_b32_e32 v1, s3
  35 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
  36 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
  37 ; VI-NEXT:    flat_load_dword v0, v[0:1]
  38 ; VI-NEXT:    s_mov_b32 s4, s0
  39 ; VI-NEXT:    s_mov_b32 s5, s1
  40 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
  41 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
  42 ; VI-NEXT:    s_endpgm
  43   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  44   %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
  45   %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
  46   store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
  47   ret void
  48 }
  49
  50 define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
     ; Loads one <4 x i8> from %in[workitem.id.x] and stores the same value to
     ; both %out0 and %out1. The assertions expect a single dword load whose
     ; result register (v0) feeds two buffer_store_dword instructions, i.e. the
     ; extra user does not cause the load to be split or repeated.
  51 ; SI-LABEL: test_copy_v4i8_x2:
  52 ; SI:       ; %bb.0:
  53 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
  54 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
  55 ; SI-NEXT:    s_mov_b32 s11, 0xf000
  56 ; SI-NEXT:    s_mov_b32 s2, 0
  57 ; SI-NEXT:    s_mov_b32 s3, s11
  58 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
  59 ; SI-NEXT:    v_mov_b32_e32 v1, 0
  60 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
  61 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
  62 ; SI-NEXT:    s_mov_b32 s10, -1
  63 ; SI-NEXT:    s_mov_b32 s8, s6
  64 ; SI-NEXT:    s_mov_b32 s9, s7
  65 ; SI-NEXT:    s_mov_b32 s6, s10
  66 ; SI-NEXT:    s_mov_b32 s7, s11
  67 ; SI-NEXT:    s_waitcnt vmcnt(0)
  68 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
  69 ; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
  70 ; SI-NEXT:    s_endpgm
  71 ;
  72 ; VI-LABEL: test_copy_v4i8_x2:
  73 ; VI:       ; %bb.0:
  74 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
  75 ; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
  76 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
  77 ; VI-NEXT:    s_mov_b32 s3, 0xf000
  78 ; VI-NEXT:    s_mov_b32 s2, -1
  79 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
  80 ; VI-NEXT:    s_mov_b32 s0, s6
  81 ; VI-NEXT:    v_mov_b32_e32 v1, s9
  82 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s8, v0
  83 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
  84 ; VI-NEXT:    flat_load_dword v0, v[0:1]
  85 ; VI-NEXT:    s_mov_b32 s1, s7
  86 ; VI-NEXT:    s_mov_b32 s6, s2
  87 ; VI-NEXT:    s_mov_b32 s7, s3
  88 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
  89 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
  90 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
  91 ; VI-NEXT:    s_endpgm
  92   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  93   %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
  94   %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
  95   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
  96   store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
  97   ret void
  98 }
  99
 100 define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
     ; Same copy as the x2 variant but with three destinations (%out0..%out2).
     ; The assertions expect one dword load and three buffer_store_dword
     ; instructions that all store the loaded register v0, each through its own
     ; 4-SGPR resource descriptor.
 101 ; SI-LABEL: test_copy_v4i8_x3:
 102 ; SI:       ; %bb.0:
 103 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
 104 ; SI-NEXT:    s_mov_b32 s11, 0xf000
 105 ; SI-NEXT:    s_mov_b32 s14, 0
 106 ; SI-NEXT:    s_mov_b32 s15, s11
 107 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 108 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 109 ; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
 110 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 111 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
 112 ; SI-NEXT:    s_mov_b32 s10, -1
 113 ; SI-NEXT:    s_mov_b32 s8, s4
 114 ; SI-NEXT:    s_mov_b32 s9, s5
 115 ; SI-NEXT:    s_mov_b32 s4, s2
 116 ; SI-NEXT:    s_mov_b32 s5, s3
 117 ; SI-NEXT:    s_mov_b32 s6, s10
 118 ; SI-NEXT:    s_mov_b32 s7, s11
 119 ; SI-NEXT:    s_mov_b32 s2, s10
 120 ; SI-NEXT:    s_mov_b32 s3, s11
 121 ; SI-NEXT:    s_waitcnt vmcnt(0)
 122 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 123 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 124 ; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 125 ; SI-NEXT:    s_endpgm
 126 ;
 127 ; VI-LABEL: test_copy_v4i8_x3:
 128 ; VI:       ; %bb.0:
 129 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 130 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 131 ; VI-NEXT:    s_mov_b32 s11, 0xf000
 132 ; VI-NEXT:    s_mov_b32 s10, -1
 133 ; VI-NEXT:    s_mov_b32 s14, s10
 134 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 135 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 136 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 137 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 138 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 139 ; VI-NEXT:    s_mov_b32 s12, s2
 140 ; VI-NEXT:    s_mov_b32 s13, s3
 141 ; VI-NEXT:    s_mov_b32 s8, s4
 142 ; VI-NEXT:    s_mov_b32 s9, s5
 143 ; VI-NEXT:    s_mov_b32 s15, s11
 144 ; VI-NEXT:    s_mov_b32 s2, s10
 145 ; VI-NEXT:    s_mov_b32 s3, s11
 146 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 147 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 148 ; VI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
 149 ; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 150 ; VI-NEXT:    s_endpgm
 151   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
 152   %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
 153   %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
 154   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
 155   store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
 156   store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
 157   ret void
 158 }
 159
 160 define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
     ; Four-destination variant of the <4 x i8> copy. With five pointer
     ; arguments the expected output fetches %in with a separate
     ; s_load_dwordx2 (kernarg offset 0x11 on SI, 0x44 on VI) and the four
     ; output pointers with one s_load_dwordx8. The assertions still expect a
     ; single dword load of the data and four buffer_store_dword of v0.
 161 ; SI-LABEL: test_copy_v4i8_x4:
 162 ; SI:       ; %bb.0:
 163 ; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x11
 164 ; SI-NEXT:    s_mov_b32 s15, 0xf000
 165 ; SI-NEXT:    s_mov_b32 s10, 0
 166 ; SI-NEXT:    s_mov_b32 s11, s15
 167 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 168 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 169 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 170 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 171 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
 172 ; SI-NEXT:    s_mov_b32 s14, -1
 173 ; SI-NEXT:    s_mov_b32 s18, s14
 174 ; SI-NEXT:    s_mov_b32 s19, s15
 175 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 176 ; SI-NEXT:    s_mov_b32 s12, s6
 177 ; SI-NEXT:    s_mov_b32 s13, s7
 178 ; SI-NEXT:    s_mov_b32 s16, s2
 179 ; SI-NEXT:    s_mov_b32 s17, s3
 180 ; SI-NEXT:    s_mov_b32 s6, s14
 181 ; SI-NEXT:    s_mov_b32 s7, s15
 182 ; SI-NEXT:    s_mov_b32 s2, s14
 183 ; SI-NEXT:    s_mov_b32 s3, s15
 184 ; SI-NEXT:    s_waitcnt vmcnt(0)
 185 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 186 ; SI-NEXT:    buffer_store_dword v0, off, s[16:19], 0
 187 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 188 ; SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
 189 ; SI-NEXT:    s_endpgm
 190 ;
 191 ; VI-LABEL: test_copy_v4i8_x4:
 192 ; VI:       ; %bb.0:
 193 ; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x44
 194 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 195 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 196 ; VI-NEXT:    s_mov_b32 s11, 0xf000
 197 ; VI-NEXT:    s_mov_b32 s10, -1
 198 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 199 ; VI-NEXT:    v_mov_b32_e32 v1, s9
 200 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s8, v0
 201 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 202 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 203 ; VI-NEXT:    s_mov_b32 s8, s6
 204 ; VI-NEXT:    s_mov_b32 s9, s7
 205 ; VI-NEXT:    s_mov_b32 s12, s2
 206 ; VI-NEXT:    s_mov_b32 s13, s3
 207 ; VI-NEXT:    s_mov_b32 s6, s10
 208 ; VI-NEXT:    s_mov_b32 s7, s11
 209 ; VI-NEXT:    s_mov_b32 s14, s10
 210 ; VI-NEXT:    s_mov_b32 s15, s11
 211 ; VI-NEXT:    s_mov_b32 s2, s10
 212 ; VI-NEXT:    s_mov_b32 s3, s11
 213 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 214 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 215 ; VI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
 216 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 217 ; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 218 ; VI-NEXT:    s_endpgm
 219   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
 220   %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
 221   %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
 222   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
 223   store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
 224   store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
 225   store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4
 226   ret void
 227 }
 228
 229 define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
     ; The loaded <4 x i8> has two users: it is stored unchanged to %out0, and
     ; an element-wise "+ 9" of it is stored to %out1.
     ; The assertions expect one dword load, one dword store of the unmodified
     ; register, and the add carried out on packed bytes:
     ;   - SI uses 32-bit adds with the 0xff00 / 0xff masks, then adds 0x900
     ;     and 0x9000000 for the upper byte of each 16-bit half.
     ;   - VI uses v_add_u16 on each 16-bit half with the same masks and 0x900.
     ; The result is recombined with a 16-bit shift + or and stored as a dword.
 230 ; SI-LABEL: test_copy_v4i8_extra_use:
 231 ; SI:       ; %bb.0:
 232 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 233 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 234 ; SI-NEXT:    s_mov_b32 s11, 0xf000
 235 ; SI-NEXT:    s_mov_b32 s2, 0
 236 ; SI-NEXT:    s_mov_b32 s3, s11
 237 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 238 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 239 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 240 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
 241 ; SI-NEXT:    s_mov_b32 s10, -1
 242 ; SI-NEXT:    s_mov_b32 s0, 0xff00
 243 ; SI-NEXT:    s_mov_b32 s8, s6
 244 ; SI-NEXT:    s_mov_b32 s9, s7
 245 ; SI-NEXT:    s_mov_b32 s6, s10
 246 ; SI-NEXT:    s_mov_b32 s7, s11
 247 ; SI-NEXT:    s_movk_i32 s1, 0xff
 248 ; SI-NEXT:    s_movk_i32 s2, 0x900
 249 ; SI-NEXT:    s_waitcnt vmcnt(0)
 250 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 251 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 252 ; SI-NEXT:    v_and_b32_e32 v2, s0, v0
 253 ; SI-NEXT:    s_waitcnt expcnt(0)
 254 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 9, v0
 255 ; SI-NEXT:    v_and_b32_e32 v0, s1, v0
 256 ; SI-NEXT:    v_and_b32_e32 v3, s0, v1
 257 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
 258 ; SI-NEXT:    v_or_b32_e32 v0, v2, v0
 259 ; SI-NEXT:    v_and_b32_e32 v1, s1, v1
 260 ; SI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
 261 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 262 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 263 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 264 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 265 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x9000000, v0
 266 ; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 267 ; SI-NEXT:    s_endpgm
 268 ;
 269 ; VI-LABEL: test_copy_v4i8_extra_use:
 270 ; VI:       ; %bb.0:
 271 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 272 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 273 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 274 ; VI-NEXT:    s_movk_i32 s10, 0xff00
 275 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 276 ; VI-NEXT:    s_mov_b32 s2, -1
 277 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 278 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 279 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 280 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 281 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 282 ; VI-NEXT:    s_mov_b32 s0, s6
 283 ; VI-NEXT:    s_mov_b32 s1, s7
 284 ; VI-NEXT:    s_movk_i32 s8, 0xff
 285 ; VI-NEXT:    s_mov_b32 s6, s2
 286 ; VI-NEXT:    s_mov_b32 s7, s3
 287 ; VI-NEXT:    s_movk_i32 s9, 0x900
 288 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 289 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 290 ; VI-NEXT:    v_and_b32_e32 v3, s10, v1
 291 ; VI-NEXT:    v_add_u16_e32 v1, 9, v1
 292 ; VI-NEXT:    v_and_b32_e32 v1, s8, v1
 293 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 294 ; VI-NEXT:    v_and_b32_e32 v2, s10, v0
 295 ; VI-NEXT:    v_add_u16_e32 v0, 9, v0
 296 ; VI-NEXT:    v_and_b32_e32 v0, s8, v0
 297 ; VI-NEXT:    v_or_b32_e32 v1, v3, v1
 298 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 299 ; VI-NEXT:    v_add_u16_e32 v1, s9, v1
 300 ; VI-NEXT:    v_add_u16_e32 v0, s9, v0
 301 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 302 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 303 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 304 ; VI-NEXT:    s_endpgm
 305   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
 306   %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
 307   %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
 308   %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
 309   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
 310   store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
 311   ret void
 312 }
 313
 314 ; FIXME: Need to handle non-uniform case for function below (load without gep).
     ; NOTE(review): the function directly below loads through a
     ; workitem.id.x-indexed getelementptr (%in.ptr), so "load without gep" does
     ; not describe it as written. The FIXME may be stale or may refer to the
     ; tests that load straight from %in - confirm before acting on it.
 315 define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
     ; The loaded <4 x i8> is stored unchanged to %out0 and %out2, and an
     ; element-wise "+ 9" of it is stored to %out1.
     ; The assertions expect one dword load, the packed-byte add sequence
     ; (0xff00 / 0xff masks, 0x900 constant; 32-bit adds on SI, v_add_u16 on VI),
     ; and three dword stores in program order: v0 (original), v1 (sum), v0.
 316 ; SI-LABEL: test_copy_v4i8_x2_extra_use:
 317 ; SI:       ; %bb.0:
 318 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
 319 ; SI-NEXT:    s_mov_b32 s11, 0xf000
 320 ; SI-NEXT:    s_mov_b32 s14, 0
 321 ; SI-NEXT:    s_mov_b32 s15, s11
 322 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 323 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 324 ; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
 325 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 326 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
 327 ; SI-NEXT:    s_mov_b32 s16, 0xff00
 328 ; SI-NEXT:    s_movk_i32 s17, 0xff
 329 ; SI-NEXT:    s_movk_i32 s18, 0x900
 330 ; SI-NEXT:    s_mov_b32 s10, -1
 331 ; SI-NEXT:    s_mov_b32 s8, s4
 332 ; SI-NEXT:    s_mov_b32 s9, s5
 333 ; SI-NEXT:    s_mov_b32 s4, s2
 334 ; SI-NEXT:    s_mov_b32 s5, s3
 335 ; SI-NEXT:    s_mov_b32 s6, s10
 336 ; SI-NEXT:    s_mov_b32 s7, s11
 337 ; SI-NEXT:    s_mov_b32 s2, s10
 338 ; SI-NEXT:    s_mov_b32 s3, s11
 339 ; SI-NEXT:    s_waitcnt vmcnt(0)
 340 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
 341 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 342 ; SI-NEXT:    v_and_b32_e32 v4, s16, v1
 343 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
 344 ; SI-NEXT:    v_and_b32_e32 v2, s16, v0
 345 ; SI-NEXT:    v_and_b32_e32 v3, s17, v3
 346 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
 347 ; SI-NEXT:    v_and_b32_e32 v1, s17, v1
 348 ; SI-NEXT:    v_add_i32_e32 v2, vcc, s18, v2
 349 ; SI-NEXT:    v_or_b32_e32 v1, v4, v1
 350 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 351 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 352 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 353 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x9000000, v1
 354 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 355 ; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
 356 ; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 357 ; SI-NEXT:    s_endpgm
 358 ;
 359 ; VI-LABEL: test_copy_v4i8_x2_extra_use:
 360 ; VI:       ; %bb.0:
 361 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
 362 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 363 ; VI-NEXT:    s_movk_i32 s14, 0xff00
 364 ; VI-NEXT:    s_movk_i32 s12, 0xff
 365 ; VI-NEXT:    s_movk_i32 s13, 0x900
 366 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 367 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 368 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
 369 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 370 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 371 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 372 ; VI-NEXT:    s_mov_b32 s6, -1
 373 ; VI-NEXT:    s_mov_b32 s8, s2
 374 ; VI-NEXT:    s_mov_b32 s9, s3
 375 ; VI-NEXT:    s_mov_b32 s10, s6
 376 ; VI-NEXT:    s_mov_b32 s11, s7
 377 ; VI-NEXT:    s_mov_b32 s2, s6
 378 ; VI-NEXT:    s_mov_b32 s3, s7
 379 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 380 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 381 ; VI-NEXT:    v_and_b32_e32 v4, s14, v1
 382 ; VI-NEXT:    v_add_u16_e32 v1, 9, v1
 383 ; VI-NEXT:    v_add_u16_e32 v3, 9, v0
 384 ; VI-NEXT:    v_and_b32_e32 v1, s12, v1
 385 ; VI-NEXT:    v_or_b32_e32 v1, v4, v1
 386 ; VI-NEXT:    v_and_b32_e32 v2, s14, v0
 387 ; VI-NEXT:    v_and_b32_e32 v3, s12, v3
 388 ; VI-NEXT:    v_or_b32_e32 v2, v2, v3
 389 ; VI-NEXT:    v_add_u16_e32 v1, s13, v1
 390 ; VI-NEXT:    v_add_u16_e32 v2, s13, v2
 391 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 392 ; VI-NEXT:    v_or_b32_e32 v1, v2, v1
 393 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 394 ; VI-NEXT:    buffer_store_dword v1, off, s[8:11], 0
 395 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 396 ; VI-NEXT:    s_endpgm
 397   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
 398   %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
 399   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
 400   %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
 401   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
 402   store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
 403   store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
 404   ret void
 405 }
 406
 407 define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
     ; Copies a <3 x i8> from %in[workitem.id.x] to %out with align 4 on both
     ; accesses. The assertions expect the load to be a single dword, while the
     ; store is split into a 16-bit store of the low half plus a byte store at
     ; offset 2 (taken from the value shifted right by 16), so only three bytes
     ; are written.
 408 ; SI-LABEL: test_copy_v3i8_align4:
 409 ; SI:       ; %bb.0:
 410 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 411 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 412 ; SI-NEXT:    s_mov_b32 s10, 0
 413 ; SI-NEXT:    s_mov_b32 s11, s7
 414 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 415 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 416 ; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
 417 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 418 ; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 419 ; SI-NEXT:    s_mov_b32 s6, -1
 420 ; SI-NEXT:    s_mov_b32 s4, s0
 421 ; SI-NEXT:    s_mov_b32 s5, s1
 422 ; SI-NEXT:    s_waitcnt vmcnt(0)
 423 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 424 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 425 ; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
 426 ; SI-NEXT:    s_endpgm
 427 ;
 428 ; VI-LABEL: test_copy_v3i8_align4:
 429 ; VI:       ; %bb.0:
 430 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 431 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 432 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 433 ; VI-NEXT:    s_mov_b32 s6, -1
 434 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 435 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 436 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 437 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 438 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 439 ; VI-NEXT:    s_mov_b32 s4, s0
 440 ; VI-NEXT:    s_mov_b32 s5, s1
 441 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 442 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 443 ; VI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 444 ; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:2
 445 ; VI-NEXT:    s_endpgm
 446   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
 447   %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x
 448   %val = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
 449   store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
 450   ret void
 451 }
 452
 453 define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
     ; Copies a <3 x i8> with align 2 on both accesses, loading directly from
     ; %in (no workitem-id indexing). The assertions expect the access to be
     ; split to fit the alignment: a ushort load plus a ubyte load at offset 2,
     ; then a byte store at offset 2 and a short store at offset 0. The expected
     ; output is identical for SI and VI apart from the kernarg offset.
 454 ; SI-LABEL: test_copy_v3i8_align2:
 455 ; SI:       ; %bb.0:
 456 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 457 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 458 ; SI-NEXT:    s_mov_b32 s2, -1
 459 ; SI-NEXT:    s_mov_b32 s10, s2
 460 ; SI-NEXT:    s_mov_b32 s11, s3
 461 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 462 ; SI-NEXT:    s_mov_b32 s8, s6
 463 ; SI-NEXT:    s_mov_b32 s9, s7
 464 ; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
 465 ; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:2
 466 ; SI-NEXT:    s_mov_b32 s0, s4
 467 ; SI-NEXT:    s_mov_b32 s1, s5
 468 ; SI-NEXT:    s_waitcnt vmcnt(0)
 469 ; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
 470 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 471 ; SI-NEXT:    s_endpgm
 472 ;
 473 ; VI-LABEL: test_copy_v3i8_align2:
 474 ; VI:       ; %bb.0:
 475 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 476 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 477 ; VI-NEXT:    s_mov_b32 s2, -1
 478 ; VI-NEXT:    s_mov_b32 s10, s2
 479 ; VI-NEXT:    s_mov_b32 s11, s3
 480 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 481 ; VI-NEXT:    s_mov_b32 s8, s6
 482 ; VI-NEXT:    s_mov_b32 s9, s7
 483 ; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
 484 ; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:2
 485 ; VI-NEXT:    s_mov_b32 s0, s4
 486 ; VI-NEXT:    s_mov_b32 s1, s5
 487 ; VI-NEXT:    s_waitcnt vmcnt(0)
 488 ; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
 489 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 490 ; VI-NEXT:    s_endpgm
 491   %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2
 492   store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2
 493   ret void
 494 }
 495
 496 define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
     ; Copies a <3 x i8> with align 1 on both accesses, loading directly from
     ; %in. The assertions expect a fully byte-wise copy: three
     ; buffer_load_ubyte (offsets 0, 1, 2) and three buffer_store_byte.
     ; Each store is preceded by its own "s_waitcnt vmcnt(2)" rather than one
     ; vmcnt(0) for all three. Presumably this is because every store issued
     ; adds another outstanding vector-memory op to the counter - verify
     ; against the ISA documentation.
 497 ; SI-LABEL: test_copy_v3i8_align1:
 498 ; SI:       ; %bb.0:
 499 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 500 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 501 ; SI-NEXT:    s_mov_b32 s2, -1
 502 ; SI-NEXT:    s_mov_b32 s10, s2
 503 ; SI-NEXT:    s_mov_b32 s11, s3
 504 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 505 ; SI-NEXT:    s_mov_b32 s8, s6
 506 ; SI-NEXT:    s_mov_b32 s9, s7
 507 ; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
 508 ; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
 509 ; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
 510 ; SI-NEXT:    s_mov_b32 s0, s4
 511 ; SI-NEXT:    s_mov_b32 s1, s5
 512 ; SI-NEXT:    s_waitcnt vmcnt(2)
 513 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 514 ; SI-NEXT:    s_waitcnt vmcnt(2)
 515 ; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:1
 516 ; SI-NEXT:    s_waitcnt vmcnt(2)
 517 ; SI-NEXT:    buffer_store_byte v2, off, s[0:3], 0 offset:2
 518 ; SI-NEXT:    s_endpgm
 519 ;
 520 ; VI-LABEL: test_copy_v3i8_align1:
 521 ; VI:       ; %bb.0:
 522 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 523 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 524 ; VI-NEXT:    s_mov_b32 s2, -1
 525 ; VI-NEXT:    s_mov_b32 s10, s2
 526 ; VI-NEXT:    s_mov_b32 s11, s3
 527 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 528 ; VI-NEXT:    s_mov_b32 s8, s6
 529 ; VI-NEXT:    s_mov_b32 s9, s7
 530 ; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
 531 ; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
 532 ; VI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
 533 ; VI-NEXT:    s_mov_b32 s0, s4
 534 ; VI-NEXT:    s_mov_b32 s1, s5
 535 ; VI-NEXT:    s_waitcnt vmcnt(2)
 536 ; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 537 ; VI-NEXT:    s_waitcnt vmcnt(2)
 538 ; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:1
 539 ; VI-NEXT:    s_waitcnt vmcnt(2)
 540 ; VI-NEXT:    buffer_store_byte v2, off, s[0:3], 0 offset:2
 541 ; VI-NEXT:    s_endpgm
 542   %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1
 543   store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1
 544   ret void
 545 }
 546
 547 define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
     ; Copies a <4 x i8> where the load is volatile (align 4, read directly
     ; from %in) and the store is a normal align-4 store. The assertions expect
     ; the volatile load to stay a single buffer_load_dword and the store to be
     ; a single buffer_store_dword on both SI and VI.
 548 ; SI-LABEL: test_copy_v4i8_volatile_load:
 549 ; SI:       ; %bb.0:
 550 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 551 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 552 ; SI-NEXT:    s_mov_b32 s2, -1
 553 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 554 ; SI-NEXT:    s_mov_b32 s0, s4
 555 ; SI-NEXT:    s_mov_b32 s1, s5
 556 ; SI-NEXT:    s_mov_b32 s4, s6
 557 ; SI-NEXT:    s_mov_b32 s5, s7
 558 ; SI-NEXT:    s_mov_b32 s6, s2
 559 ; SI-NEXT:    s_mov_b32 s7, s3
 560 ; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 561 ; SI-NEXT:    s_waitcnt vmcnt(0)
 562 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 563 ; SI-NEXT:    s_endpgm
 564 ;
 565 ; VI-LABEL: test_copy_v4i8_volatile_load:
 566 ; VI:       ; %bb.0:
 567 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 568 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 569 ; VI-NEXT:    s_mov_b32 s2, -1
 570 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 571 ; VI-NEXT:    s_mov_b32 s0, s4
 572 ; VI-NEXT:    s_mov_b32 s1, s5
 573 ; VI-NEXT:    s_mov_b32 s4, s6
 574 ; VI-NEXT:    s_mov_b32 s5, s7
 575 ; VI-NEXT:    s_mov_b32 s6, s2
 576 ; VI-NEXT:    s_mov_b32 s7, s3
 577 ; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 578 ; VI-NEXT:    s_waitcnt vmcnt(0)
 579 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 580 ; VI-NEXT:    s_endpgm
 581   %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
 582   store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
 583   ret void
 584 }
 585
 586 define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
     ; Copies a <4 x i8> where the store is volatile (align 4) and the load is
     ; a normal align-4 load directly from %in. Unlike the volatile-load case,
     ; the assertions here expect a byte-wise copy: four buffer_load_ubyte
     ; (offsets 0..3) followed by four buffer_store_byte issued from offset 3
     ; down to offset 0.
     ; NOTE(review): this records current llc output; whether splitting an
     ; aligned volatile vector store is intended is not visible here - confirm.
 587 ; SI-LABEL: test_copy_v4i8_volatile_store:
 588 ; SI:       ; %bb.0:
 589 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 590 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 591 ; SI-NEXT:    s_mov_b32 s2, -1
 592 ; SI-NEXT:    s_mov_b32 s10, s2
 593 ; SI-NEXT:    s_mov_b32 s11, s3
 594 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 595 ; SI-NEXT:    s_mov_b32 s8, s6
 596 ; SI-NEXT:    s_mov_b32 s9, s7
 597 ; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
 598 ; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
 599 ; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
 600 ; SI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:3
 601 ; SI-NEXT:    s_mov_b32 s0, s4
 602 ; SI-NEXT:    s_mov_b32 s1, s5
 603 ; SI-NEXT:    s_waitcnt vmcnt(0)
 604 ; SI-NEXT:    buffer_store_byte v3, off, s[0:3], 0 offset:3
 605 ; SI-NEXT:    buffer_store_byte v2, off, s[0:3], 0 offset:2
 606 ; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:1
 607 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 608 ; SI-NEXT:    s_endpgm
 609 ;
 610 ; VI-LABEL: test_copy_v4i8_volatile_store:
 611 ; VI:       ; %bb.0:
 612 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 613 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 614 ; VI-NEXT:    s_mov_b32 s2, -1
 615 ; VI-NEXT:    s_mov_b32 s10, s2
 616 ; VI-NEXT:    s_mov_b32 s11, s3
 617 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 618 ; VI-NEXT:    s_mov_b32 s8, s6
 619 ; VI-NEXT:    s_mov_b32 s9, s7
 620 ; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
 621 ; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
 622 ; VI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
 623 ; VI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:3
 624 ; VI-NEXT:    s_mov_b32 s0, s4
 625 ; VI-NEXT:    s_mov_b32 s1, s5
 626 ; VI-NEXT:    s_waitcnt vmcnt(0)
 627 ; VI-NEXT:    buffer_store_byte v3, off, s[0:3], 0 offset:3
 628 ; VI-NEXT:    buffer_store_byte v2, off, s[0:3], 0 offset:2
 629 ; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:1
 630 ; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 631 ; VI-NEXT:    s_endpgm
 632   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
 633   store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
 634   ret void
 635 }