1 ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-sdwa-peephole=0 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
4 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
5 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
7 ; FUNC-LABEL: {{^}}test_copy_v4i8:
8 ; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
9 ; GCN: buffer_store_dword [[REG]]
; Copy one <4 x i8> (loaded via a per-thread gep) as a single dword.
define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
ret void
}
19 ; FUNC-LABEL: {{^}}test_copy_v4i8_x2:
20 ; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
21 ; GCN: buffer_store_dword [[REG]]
22 ; GCN: buffer_store_dword [[REG]]
; Copy one loaded <4 x i8> dword to two outputs.
define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
ret void
}
33 ; FUNC-LABEL: {{^}}test_copy_v4i8_x3:
34 ; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
35 ; GCN: buffer_store_dword [[REG]]
36 ; GCN: buffer_store_dword [[REG]]
37 ; GCN: buffer_store_dword [[REG]]
; Copy one loaded <4 x i8> dword to three outputs.
define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
ret void
}
49 ; FUNC-LABEL: {{^}}test_copy_v4i8_x4:
50 ; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
51 ; GCN: buffer_store_dword [[REG]]
52 ; GCN: buffer_store_dword [[REG]]
53 ; GCN: buffer_store_dword [[REG]]
54 ; GCN: buffer_store_dword [[REG]]
; Copy one loaded <4 x i8> dword to four outputs.
define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4
ret void
}
67 ; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use:
68 ; GCN: {{buffer|flat}}_load_dword
69 ; GCN-DAG: v_lshrrev_b32
72 ; GCN-DAG: buffer_store_dword
73 ; GCN-DAG: buffer_store_dword
; The loaded vector has an extra use (a per-element add), so unpacking
; of the bytes is expected in addition to the straight copy.
define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
%add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
ret void
}
86 ; FIXME: Need to handle non-uniform case for function below (load without gep).
87 ; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use:
88 ; GCN: {{buffer|flat}}_load_dword
89 ; GCN-DAG: v_lshrrev_b32
94 ; GCN-DAG: {{buffer|flat}}_store_dword
95 ; GCN: {{buffer|flat}}_store_dword
96 ; GCN: {{buffer|flat}}_store_dword
; Same as the extra-use case but the original value is stored twice.
define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
%add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
ret void
}
109 ; FUNC-LABEL: {{^}}test_copy_v3i8_align4:
110 ; GCN: {{buffer|flat}}_load_dword
111 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
112 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; <3 x i8> with 4-byte alignment: expect a dword load, then short+byte stores.
define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x
%val = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
ret void
}
122 ; FUNC-LABEL: {{^}}test_copy_v3i8_align2:
123 ; GCN-DAG: {{buffer|flat}}_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
124 ; GCN-DAG: {{buffer|flat}}_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
125 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
126 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; <3 x i8> with 2-byte alignment: expect short+byte loads and stores.
define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
%val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2
ret void
}
134 ; FUNC-LABEL: {{^}}test_copy_v3i8_align1:
135 ; GCN: {{buffer|flat}}_load_ubyte
136 ; GCN: {{buffer|flat}}_load_ubyte
137 ; GCN: {{buffer|flat}}_load_ubyte
139 ; GCN: buffer_store_byte
140 ; GCN: buffer_store_byte
141 ; GCN: buffer_store_byte
; <3 x i8> with 1-byte alignment: expect three byte loads and three byte stores.
define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
%val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1
ret void
}
149 ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load:
150 ; GCN: {{buffer|flat}}_load_dword
151 ; GCN: buffer_store_dword
; Volatile load must stay a single dword access; the copy remains one
; load/store pair.
define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
ret void
}
159 ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store:
160 ; GCN: {{buffer|flat}}_load_ubyte
161 ; GCN: {{buffer|flat}}_load_ubyte
162 ; GCN: {{buffer|flat}}_load_ubyte
163 ; GCN: {{buffer|flat}}_load_ubyte
164 ; GCN: buffer_store_byte
165 ; GCN: buffer_store_byte
166 ; GCN: buffer_store_byte
167 ; GCN: buffer_store_byte
169 define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
170 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
171 store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4