llvm/test/CodeGen/AMDGPU/inline-asm.ll

   1 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK %s
   2 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck  --check-prefix=CHECK %s
   3
   4 ; CHECK-LABEL: {{^}}inline_asm:
     ; A side-effecting inline asm "s_endpgm" placed before the return must show
     ; up in the output. Two s_endpgm matches are required; presumably one comes
     ; from the asm string and one from lowering `ret void` (TODO confirm).
   5 ; CHECK: s_endpgm
   6 ; CHECK: s_endpgm
   7 define amdgpu_kernel void @inline_asm(ptr addrspace(1) %out) {
   8 entry:
   9   store i32 5, ptr addrspace(1) %out
  10   call void asm sideeffect "s_endpgm", ""()
  11   ret void
  12 }
  13
  14 ; CHECK-LABEL: {{^}}inline_asm_shader:
     ; amdgpu_ps variant with no arguments and no store: the function body is
     ; only the inline asm "s_endpgm" followed by the return, and s_endpgm is
     ; still expected twice in the output.
  15 ; CHECK: s_endpgm
  16 ; CHECK: s_endpgm
  17 define amdgpu_ps void @inline_asm_shader() {
  18 entry:
  19   call void asm sideeffect "s_endpgm", ""()
  20   ret void
  21 }
  22
  23
  24 ; CHECK-LABEL: {{^}}branch_on_asm_vgpr:
  25 ; Make sure VGPR inline assembly is treated as divergent.
     ; The asm result uses the "=v" output constraint, so the branch on it is
     ; expected to be lowered with a vector compare followed by an exec-mask
     ; save (s_and_saveexec_b64).
  26 ; CHECK: v_mov_b32 v{{[0-9]+}}, 0
  27 ; CHECK: v_cmp_eq_u32
  28 ; CHECK: s_and_saveexec_b64
  29 define amdgpu_kernel void @branch_on_asm_vgpr(ptr addrspace(1) %out) {
  30         %zero = call i32 asm "v_mov_b32 $0, 0", "=v"()
  31         %cmp = icmp eq i32 %zero, 0
  32         br i1 %cmp, label %if, label %endif
  33
  34 if:
  35         store i32 0, ptr addrspace(1) %out
  36         br label %endif
  37
  38 endif:
  39   ret void
  40 }
  41
  42 ; CHECK-LABEL: {{^}}branch_on_asm_sgpr:
  43 ; Make sure SGPR inline assembly is treated as uniform
     ; The asm result uses the "=s" output constraint, so the branch on it is
     ; expected to be lowered with a scalar compare (s_cmp_lg_u32) and a scalar
     ; conditional branch (s_cbranch_scc0), with no exec-mask manipulation in
     ; the matched sequence.
  44 ; CHECK: s_mov_b32 s{{[0-9]+}}, 0
  45 ; CHECK: s_cmp_lg_u32
  46 ; CHECK: s_cbranch_scc0
  47 define amdgpu_kernel void @branch_on_asm_sgpr(ptr addrspace(1) %out) {
  48         %zero = call i32 asm "s_mov_b32 $0, 0", "=s"()
  49         %cmp = icmp eq i32 %zero, 0
  50         br i1 %cmp, label %if, label %endif
  51
  52 if:
  53         store i32 0, ptr addrspace(1) %out
  54         br label %endif
  55
  56 endif:
  57   ret void
  58 }
  59
  60 ; CHECK-LABEL: {{^}}v_cmp_asm:
     ; The asm produces an i64 with an SGPR output constraint ("=s") from an
     ; i32 VGPR input ("v"). Expected sequence: the scalar kernel argument is
     ; copied into a VGPR for the asm input, the compare writes an SGPR pair,
     ; and both halves of that pair are copied to VGPRs (in either order) for
     ; the 64-bit buffer store.
  61 ; CHECK: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
  62 ; CHECK: v_cmp_ne_u32_e64 s[[[MASK_LO:[0-9]+]]:[[MASK_HI:[0-9]+]]], 0, [[SRC]]
  63 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[MASK_LO]]
  64 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[MASK_HI]]
  65 ; CHECK: buffer_store_dwordx2 v[[[V_LO]]:[[V_HI]]]
  66 define amdgpu_kernel void @v_cmp_asm(ptr addrspace(1) %out, i32 %in) {
  67   %sgpr = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 %in)
  68   store i64 %sgpr, ptr addrspace(1) %out
  69   ret void
  70 }
  71
  72 ; CHECK-LABEL: {{^}}code_size_inline_asm:
     ; Code-size estimate for an asm string holding exactly one instruction.
     ; The reported length is 12 bytes; compared with the empty-asm case in
     ; this file (4 bytes) the single asm instruction accounts for 8 bytes.
  73 ; CHECK: codeLenInByte = 12
  74 define amdgpu_kernel void @code_size_inline_asm(ptr addrspace(1) %out) {
  75 entry:
  76   call void asm sideeffect "v_nop_e64", ""()
  77   ret void
  78 }
  79
  80 ; All inlineasm instructions are assumed to be the maximum size
  81 ; CHECK-LABEL: {{^}}code_size_inline_asm_small_inst:
     ; Uses the _e32 form of v_nop (the "small" instruction of the test name).
     ; The expected length is still 12, the same as for the _e64 form, because
     ; the estimate does not depend on the actual encoding of the asm text.
  82 ; CHECK: codeLenInByte = 12
  83 define amdgpu_kernel void @code_size_inline_asm_small_inst(ptr addrspace(1) %out) {
  84 entry:
  85   call void asm sideeffect "v_nop_e32", ""()
  86   ret void
  87 }
  88
  89 ; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst:
     ; One asm string containing two instructions on separate lines. The
     ; expected length grows to 20 bytes, i.e. 8 more than the one-instruction
     ; case. The leading newline and the whitespace-only last line of the
     ; string must not be counted as instructions.
  90 ; CHECK: codeLenInByte = 20
  91 define amdgpu_kernel void @code_size_inline_asm_2_inst(ptr addrspace(1) %out) {
  92 entry:
  93   call void asm sideeffect "
  94     v_nop_e64
  95     v_nop_e64
  96    ", ""()
  97   ret void
  98 }
  99
 100 ; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst_extra_newline:
     ; Same two instructions as the plain two-instruction case, but separated
     ; by an empty line inside the asm string. The expected length stays at 20,
     ; so the empty line must not be counted as an instruction.
 101 ; CHECK: codeLenInByte = 20
 102 define amdgpu_kernel void @code_size_inline_asm_2_inst_extra_newline(ptr addrspace(1) %out) {
 103 entry:
 104   call void asm sideeffect "
 105     v_nop_e64
 106
 107     v_nop_e64
 108    ", ""()
 109   ret void
 110 }
 111
 112 ; CHECK-LABEL: {{^}}code_size_inline_asm_0_inst:
     ; Baseline: an empty asm string contributes nothing to the size estimate.
     ; The 4 bytes that remain presumably belong to the instruction emitted
     ; for `ret void` (TODO confirm against the generated code).
 113 ; CHECK: codeLenInByte = 4
 114 define amdgpu_kernel void @code_size_inline_asm_0_inst(ptr addrspace(1) %out) {
 115 entry:
 116   call void asm sideeffect "", ""()
 117   ret void
 118 }
 119
 120 ; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment:
     ; An asm string that is only a ';' comment must be sized like an empty
     ; asm string: the expected length is 4, the same as the 0-instruction case.
 121 ; CHECK: codeLenInByte = 4
 122 define amdgpu_kernel void @code_size_inline_asm_1_comment(ptr addrspace(1) %out) {
 123 entry:
 124   call void asm sideeffect "; comment", ""()
 125   ret void
 126 }
 127
 128 ; CHECK-LABEL: {{^}}code_size_inline_asm_newline_1_comment:
     ; Comment-only asm string that starts with a newline before the comment.
     ; Neither the empty first line nor the comment may be counted as an
     ; instruction, so the expected length is 4.
 129 ; CHECK: codeLenInByte = 4
 130 define amdgpu_kernel void @code_size_inline_asm_newline_1_comment(ptr addrspace(1) %out) {
 131 entry:
 132   call void asm sideeffect "
 133 ; comment", ""()
 134   ret void
 135 }
 136
 137 ; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment_newline:
     ; Comment-only asm string that ends with a newline after the comment.
     ; The trailing empty line must not be counted as an instruction, so the
     ; expected length is 4.
 138 ; CHECK: codeLenInByte = 4
 139 define amdgpu_kernel void @code_size_inline_asm_1_comment_newline(ptr addrspace(1) %out) {
 140 entry:
 141   call void asm sideeffect "; comment
 142 ", ""()
 143   ret void
 144 }
 145
 146 ; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line:
     ; A single line holding two ';' markers (separated by spaces). The second
     ; ';' must not be taken as the start of a new statement: the expected
     ; length is 4, i.e. no instruction is counted.
 147 ; CHECK: codeLenInByte = 4
 148 define amdgpu_kernel void @code_size_inline_asm_2_comments_line(ptr addrspace(1) %out) {
 149 entry:
 150   call void asm sideeffect "; first comment ; second comment", ""()
 151   ret void
 152 }
 153
 154 ; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line_nospace:
     ; Like the two-comments-on-one-line case, but with no whitespace around
     ; the second ';'. The text after it must still be treated as comment: the
     ; expected length is 4.
 155 ; CHECK: codeLenInByte = 4
 156 define amdgpu_kernel void @code_size_inline_asm_2_comments_line_nospace(ptr addrspace(1) %out) {
 157 entry:
 158   call void asm sideeffect "; first comment;second comment", ""()
 159   ret void
 160 }
 161
 162 ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments0:
     ; Two real instructions mixed with a leading comment line, an inline
     ; trailing comment, a standalone comment line, an empty line and trailing
     ; comment lines. Only the two instructions may be counted, so the
     ; expected length is 20 (same as the plain two-instruction case).
 163 ; CHECK: codeLenInByte = 20
 164 define amdgpu_kernel void @code_size_inline_asm_mixed_comments0(ptr addrspace(1) %out) {
 165 entry:
 166   call void asm sideeffect "; comment
 167     v_nop_e64 ; inline comment
 168 ; separate comment
 169     v_nop_e64
 170
 171     ; trailing comment
 172     ; extra comment
 173   ", ""()
 174   ret void
 175 }
 176
 177 ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments1:
     ; Variant of the mixed-comments case in which the asm string starts
     ; directly with an instruction (followed by an inline comment) rather
     ; than with a comment line. Still two instructions, so 20 is expected.
 178 ; CHECK: codeLenInByte = 20
 179 define amdgpu_kernel void @code_size_inline_asm_mixed_comments1(ptr addrspace(1) %out) {
 180 entry:
 181   call void asm sideeffect "v_nop_e64 ; inline comment
 182 ; separate comment
 183     v_nop_e64
 184
 185     ; trailing comment
 186     ; extra comment
 187   ", ""()
 188   ret void
 189 }
 190
 191 ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments_operands:
     ; Mixed-comments layout with instructions that carry operands (commas,
     ; register names, an immediate) instead of bare v_nop. The operand text
     ; must not change the count: two instructions, expected length 20.
 192 ; CHECK: codeLenInByte = 20
 193 define amdgpu_kernel void @code_size_inline_asm_mixed_comments_operands(ptr addrspace(1) %out) {
 194 entry:
 195   call void asm sideeffect "; comment
 196     v_add_i32_e32 v0, vcc, v1, v2 ; inline comment
 197 ; separate comment
 198     v_bfrev_b32_e32 v0, 1
 199
 200     ; trailing comment
 201     ; extra comment
 202   ", ""()
 203   ret void
 204 }
 205
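 206 ; FIXME: Should not have intermediate sgprs. NOTE(review): the patterns below already expect the immediates to be moved directly into v0/v1, so this note may be stale.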
 207 ; CHECK-LABEL: {{^}}i64_imm_input_phys_vgpr:
     ; An i64 constant (123456 == 0x1e240) is bound to the physical register
     ; pair v[0:1] via the "{v[0:1]}" input constraint. The low half is
     ; expected in v0, the zero high half in v1, and the asm operand must be
     ; printed as the range v[0:1].
 208 ; CHECK: v_mov_b32_e32 v0, 0x1e240
 209 ; CHECK: v_mov_b32_e32 v1, 0
 210 ; CHECK: use v[0:1]
 211 define amdgpu_kernel void @i64_imm_input_phys_vgpr() {
 212 entry:
 213   call void asm sideeffect "; use $0 ", "{v[0:1]}"(i64 123456)
 214   ret void
 215 }
 216
 217 ; CHECK-LABEL: {{^}}i1_imm_input_phys_vgpr:
     ; An i1 constant `true` bound to physical register v0 is expected to be
     ; materialized as the value 1 (the {{$}} anchor rules out any longer
     ; immediate such as -1 being accepted by a prefix match).
 218 ; CHECK: v_mov_b32_e32 v0, 1{{$}}
 219 ; CHECK: ; use v0
 220 define amdgpu_kernel void @i1_imm_input_phys_vgpr() {
 221 entry:
 222   call void asm sideeffect "; use $0 ", "{v0}"(i1 true)
 223   ret void
 224 }
 225
 226
 227 ; FIXME: This behavior is nonsense. We should probably disallow i1 asm
 228
 229 ; CHECK-LABEL: {{^}}i1_input_phys_vgpr:
     ; A loaded i1 is passed to the asm in physical v0 and an i1 result is
     ; taken from physical v1. Expectations: the value comes from a byte load;
     ; the register captured from that load is not named again before the
     ; asm's "; use v0" text; the v1 result is masked with 1 and the masked
     ; value is what gets stored as a byte.
 230 ; CHECK: {{buffer|flat}}_load_ubyte [[LOAD:v[0-9]+]]
 231 ; CHECK-NOT: [[LOAD]]
 232 ; CHECK: ; use v0
 233 ; CHECK: v_and_b32_e32 [[STORE:v[0-9]+]], 1, v1
 234 ; CHECK: {{buffer|flat}}_store_byte [[STORE]],
 235 define amdgpu_kernel void @i1_input_phys_vgpr() {
 236 entry:
 237   %val = load i1, ptr addrspace(1) undef
 238   %cc = call i1 asm sideeffect "; use $1, def $0 ", "={v1}, {v0}"(i1 %val)
 239   store i1 %cc, ptr addrspace(1) undef
 240   ret void
 241 }
 242
 243 ; FIXME: Should probably be masking high bits of load.
 244 ; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2:
     ; Two volatile i1 loads feed physical registers v0 and v1 of one asm
     ; statement. The -NEXT patterns pin the exact sequence: each byte load is
     ; followed only by an s_waitcnt, and the asm start marker comes right
     ; after the second wait, so no extra instruction sits between the loads
     ; and the asm.
 245 ; CHECK: buffer_load_ubyte v0
 246 ; CHECK-NEXT: s_waitcnt
 247 ; CHECK-NEXT: buffer_load_ubyte v1
 248 ; CHECK-NEXT: s_waitcnt
 249 ; CHECK-NEXT: ASMSTART
 250 define amdgpu_kernel void @i1_input_phys_vgpr_x2() {
 251 entry:
 252   %val0 = load volatile i1, ptr addrspace(1) undef
 253   %val1 = load volatile i1, ptr addrspace(1) undef
 254   call void asm sideeffect "; use $0 $1 ", "{v0}, {v1}"(i1 %val0, i1 %val1)
 255   ret void
 256 }
 257
 258 ; CHECK-LABEL: {{^}}muliple_def_phys_vgpr:
     ; Two asm statements both define the same physical register v0 and both
     ; results are live afterwards. The first result has to be copied out of
     ; v0 (to v1) before the second asm overwrites it; the shift then reads
     ; the second result from v0 and the saved first result from v1.
 259 ; CHECK: ; def v0
 260 ; CHECK: v_mov_b32_e32 v1, v0
 261 ; CHECK: ; def v0
 262 ; CHECK: v_lshlrev_b32_e32 v{{[0-9]+}}, v0, v1
 263 define amdgpu_kernel void @muliple_def_phys_vgpr() {
 264 entry:
 265   %def0 = call i32 asm sideeffect "; def $0 ", "={v0}"()
 266   %def1 = call i32 asm sideeffect "; def $0 ", "={v0}"()
     ; Note: despite its name, %add is a left shift of %def0 by %def1.
 267   %add = shl i32 %def0, %def1
 268   store i32 %add, ptr addrspace(1) undef
 269   ret void
 270 }
 271
 272 ; CHECK-LABEL: {{^}}asm_constraint_c_n:
     ; Operand modifier 'c' on an immediate ("n" constraint) operand: the
     ; constant 10 is expected to be printed as a bare "10" with nothing after
     ; it on the line.
     ; NOTE(review): attribute group #1 is not defined in the visible part of
     ; this file.
 273 ; CHECK: s_trap 10{{$}}
 274 define amdgpu_kernel void @asm_constraint_c_n()  {
 275 entry:
 276   tail call void asm sideeffect "s_trap ${0:c}", "n"(i32 10) #1
 277   ret void
 278 }
 279
 280 ; CHECK-LABEL: {{^}}asm_constraint_n_n:
     ; Operand modifier 'n' on an immediate ("n" constraint) operand: the
     ; constant 10 is expected to be printed negated, as "-10".
     ; NOTE(review): attribute group #1 is not defined in the visible part of
     ; this file.
 281 ; CHECK: s_trap -10{{$}}
 282 define amdgpu_kernel void @asm_constraint_n_n()  {
 283 entry:
 284   tail call void asm sideeffect "s_trap ${0:n}", "n"(i32 10) #1
 285   ret void
 286 }
 287
 288 ; Make sure tuples of 3 SGPRs are printed with the [] syntax instead
 289 ; of the tablegen default.
 290 ; CHECK-LABEL: {{^}}sgpr96_name_format:
     ; A <3 x i32> constant passed with the "s" constraint occupies three
     ; SGPRs; when substituted into the asm text the operand must be printed
     ; as the bracketed range s[0:2].
     ; NOTE(review): attribute group #1 is not defined in the visible part of
     ; this file.
 291 ; CHECK: ; sgpr96 s[0:2]
 292 define amdgpu_kernel void @sgpr96_name_format()  {
 293 entry:
 294   tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
 295   ret void
 296 }
 297
 298 ; Check aggregate types are handled properly.
 299 ; CHECK-LABEL: {{^}}mad_u64:
     ; The label pattern is anchored to the start of the line and ends with
     ; ':' like every other label in this file, so that it can only match the
     ; function's own label and not another occurrence of the name.
     ;
     ; The asm returns an aggregate { i64, i64 } with one VGPR ("=v") and one
     ; SGPR ("=s") output. The aggregate is merged with undef through a phi and
     ; then split again with extractvalue; each element is consumed by a
     ; separate asm with a "v" input constraint. The test only requires that
     ; the v_mad_u64_u32 asm text is emitted.
 300 ; CHECK: v_mad_u64_u32
 301 define void @mad_u64(i32 %x, i1 %c0) {
 302 entry:
 303   br i1 %c0, label %exit, label %false
 304
 305 false:
 306   %s0 = tail call { i64, i64 } asm sideeffect "v_mad_u64_u32 $0, $1, $2, $3, $4", "=v,=s,v,v,v"(i32 -766435501, i32 %x, i64 0)
 307   br label %exit
 308
 309 exit:
     ; The aggregate is undef when the asm block was skipped.
 310   %s1 = phi { i64, i64} [ undef, %entry ], [ %s0, %false]
 311   %v0 = extractvalue { i64, i64 } %s1, 0
 312   %v1 = extractvalue { i64, i64 } %s1, 1
 313   tail call void asm sideeffect "; use $0", "v"(i64 %v0)
 314   tail call void asm sideeffect "; use $0", "v"(i64 %v1)
 315   ret void
 316 }
 317
 318 ; CHECK-LABEL: {{^}}scc_as_i32:
     ; The physical register scc is used as an asm output and then as an asm
     ; input with IR type i32. Both asm strings must print the operand as
     ; "scc".
 319 ; CHECK: ; def scc
 320 ; CHECK: ; use scc
 321 define void @scc_as_i32() {
 322   %scc = call i32 asm sideeffect "; def $0", "={scc}"()
 323   call void asm sideeffect "; use $0 ", "{scc}"(i32 %scc)
 324   ret void
 325 }
 326
 327 ; CHECK-LABEL: {{^}}scc_as_i1:
     ; Same def/use pair on the physical register scc, but with IR type i1
     ; instead of i32. Both asm strings must print the operand as "scc".
 328 ; CHECK: ; def scc
 329 ; CHECK: ; use scc
 330 define void @scc_as_i1() {
 331   %scc = call i1 asm sideeffect "; def $0", "={scc}"()
 332   call void asm sideeffect "; use $0 ", "{scc}"(i1 %scc)
 333   ret void
 334 }
 335
 336 ; Make sure the SGPR def is treated as a uniform value when the inline
 337 ; assembly also defines a divergent value. The add should be scalar
 338 ; and not introduce illegal vgpr to sgpr copies.
 339 ; CHECK-LABEL: {{^}}mixed_def_vgpr_sgpr_def_asm:
     ; Output order: VGPR first ("=v"), then the physical SGPR pair s[4:5].
     ; The 64-bit add of 2 to the SGPR result is expected as a scalar
     ; add / add-with-carry pair on consecutive lines, and the sum is handed
     ; back to the second asm in s[4:5].
 340 ; CHECK: ; def v0 s[4:5]
 341 ; CHECK: s_add_u32
 342 ; CHECK-NEXT: s_addc_u32
 343 ; CHECK: ; use s[4:5]
 344 define void @mixed_def_vgpr_sgpr_def_asm() {
 345   %vgpr_sgpr = call { i32, i64 } asm sideeffect "; def $0 $1 ", "=v,={s[4:5]}"()
     ; %vgpr is extracted but has no further use in this function.
 346   %vgpr = extractvalue { i32, i64 } %vgpr_sgpr, 0
 347   %sgpr = extractvalue { i32, i64 } %vgpr_sgpr, 1
 348   %sgpr.add = add i64 %sgpr, 2
 349   call void asm sideeffect "; use $0 ", "{s[4:5]}"(i64 %sgpr.add)
 350   ret void
 351 }
 352
 353 ; CHECK-LABEL: {{^}}mixed_def_sgpr_vgpr_def_asm:
     ; Mirror of the vgpr-then-sgpr case with the output order swapped: the
     ; physical SGPR pair s[4:5] comes first, the VGPR ("=v") second. The
     ; 64-bit add of 2 to the SGPR result is again expected as a scalar
     ; add / add-with-carry pair on consecutive lines, feeding s[4:5] of the
     ; second asm.
 354 ; CHECK: ; def s[4:5] v0
 355 ; CHECK: s_add_u32
 356 ; CHECK-NEXT: s_addc_u32
 357 ; CHECK: ; use s[4:5]
 358 define void @mixed_def_sgpr_vgpr_def_asm() {
 359   %sgpr_vgpr = call { i64, i32 } asm sideeffect "; def $0 $1 ", "={s[4:5]},=v"()
 360   %sgpr = extractvalue { i64, i32 } %sgpr_vgpr, 0
     ; %vgpr is extracted but has no further use in this function.
 361   %vgpr = extractvalue { i64, i32 } %sgpr_vgpr, 1
 362   %sgpr.add = add i64 %sgpr, 2
 363   call void asm sideeffect "; use $0 ", "{s[4:5]}"(i64 %sgpr.add)
 364   ret void
 365 }