1 ; RUN: llc -mtriple=amdgcn-- -mcpu=verde -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI,SIVI,MUBUF %s
2 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI,SIVI,MUBUF %s
3 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,MUBUF,GFX9-MUBUF,GFX9_10-MUBUF %s
4 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -filetype=obj -amdgpu-use-divergent-register-indexing < %s | llvm-readobj -r - | FileCheck --check-prefix=RELS %s
5 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,MUBUF,GFX10_W32-MUBUF,GFX9_10-MUBUF %s
6 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,MUBUF,GFX10_W64-MUBUF,GFX9_10-MUBUF %s
7 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX9-FLATSCR %s
8 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1030 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX10-FLATSCR %s
9 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX9-FLATSCR-PAL %s
10 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,FLATSCR,GFX10-FLATSCR-PAL %s
11 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,GFX11-FLATSCR %s
12 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,GFX11-FLATSCR %s
14 ; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD0
15 ; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD1
17 ; This used to fail due to a v_add_i32 instruction with an illegal immediate
18 ; operand that was created during Local Stack Slot Allocation. Test case derived
19 ; from https://bugs.freedesktop.org/show_bug.cgi?id=96602
21 ; GCN-LABEL: {{^}}ps_main:
23 ; GFX9-FLATSCR-DAG: s_add_u32 flat_scratch_lo, s0, s2
24 ; GFX9-FLATSCR-DAG: s_addc_u32 flat_scratch_hi, s1, 0
25 ; GFX9-FLATSCR-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
27 ; GFX10-FLATSCR: s_add_u32 s0, s0, s2
28 ; GFX10-FLATSCR: s_addc_u32 s1, s1, 0
29 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
30 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
32 ; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3]
33 ; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0
34 ; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x0
35 ; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
36 ; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
37 ; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
38 ; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
39 ; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
40 ; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
41 ; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0
42 ; GFX9-FLATSCR-PAL-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
44 ; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3]
45 ; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0
46 ; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x0
47 ; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
48 ; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 0xffff
49 ; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0
50 ; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0
51 ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
52 ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
54 ; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
55 ; MUBUF-DAG: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
56 ; MUBUF-DAG: s_mov_b32 s2, -1
57 ; SI-DAG: s_mov_b32 s3, 0xe8f000
58 ; VI-DAG: s_mov_b32 s3, 0xe80000
59 ; GFX9-MUBUF-DAG: s_mov_b32 s3, 0xe00000
60 ; GFX10_W32-MUBUF-DAG: s_mov_b32 s3, 0x31c16000
61 ; GFX10_W64-MUBUF-DAG: s_mov_b32 s3, 0x31e16000
63 ; FLATSCR-NOT: SCRATCH_RSRC_DWORD
65 ; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0
66 ; GFX9-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], [[SP]] offset:
68 ; GFX10-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], off offset:
70 ; MUBUF-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
71 ; MUBUF-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
72 ; GFX10-FLATSCR: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
73 ; GFX10-FLATSCR-PAL: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
74 ; GFX11-FLATSCR: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
76 ; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]]
77 ; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]]
78 ; FLATSCR: v_add{{_|_nc_}}{{u32|b32}}_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]]
80 ; MUBUF: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
81 ; MUBUF: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
82 ; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[LO_OFF]], off
83 ; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, [[CLAMP_IDX]], off offset:128
84 define amdgpu_ps float @ps_main(i32 %idx) {
85 %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
86 %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
87 %r = fadd float %v1, %v2
91 ; GCN-LABEL: {{^}}vs_main:
92 ; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s2
93 ; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0
95 ; GFX10-FLATSCR: s_add_u32 s0, s0, s2
96 ; GFX10-FLATSCR: s_addc_u32 s1, s1, 0
97 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
98 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
100 ; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3]
101 ; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0
102 ; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x0
103 ; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
104 ; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
105 ; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
106 ; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
107 ; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
108 ; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
109 ; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0
111 ; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3]
112 ; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0
113 ; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x0
114 ; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
115 ; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 0xffff
116 ; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0
117 ; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0
118 ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
119 ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
121 ; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
123 ; FLATSCR-NOT: SCRATCH_RSRC_DWORD
125 ; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
126 ; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
128 ; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0
129 ; GFX9-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], [[SP]] offset:
131 ; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
132 ; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
134 ; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
135 ; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
137 define amdgpu_vs float @vs_main(i32 %idx) {
138 %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
139 %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
140 %r = fadd float %v1, %v2
144 ; GCN-LABEL: {{^}}cs_main:
145 ; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s2
146 ; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0
148 ; GFX10-FLATSCR: s_add_u32 s0, s0, s2
149 ; GFX10-FLATSCR: s_addc_u32 s1, s1, 0
150 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
151 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
153 ; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3]
154 ; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0
155 ; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x10
156 ; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
157 ; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
158 ; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
159 ; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
160 ; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
161 ; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
162 ; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0
164 ; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3]
165 ; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0
166 ; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x10
167 ; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
168 ; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 0xffff
169 ; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0
170 ; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0
171 ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
172 ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
174 ; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
176 ; FLATSCR-NOT: SCRATCH_RSRC_DWORD
178 ; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
179 ; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
181 ; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
182 ; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
184 ; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
185 ; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
187 define amdgpu_cs float @cs_main(i32 %idx) {
188 %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
189 %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
190 %r = fadd float %v1, %v2
194 ; GCN-LABEL: {{^}}hs_main:
195 ; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s5
196 ; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0
198 ; GFX10-FLATSCR: s_add_u32 s0, s0, s5
199 ; GFX10-FLATSCR: s_addc_u32 s1, s1, 0
200 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
201 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
203 ; SIVI: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
204 ; SIVI-NOT: s_mov_b32 s0
205 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
206 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
208 ; GFX9_10-MUBUF: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
209 ; GFX9PLUS-NOT: s_mov_b32 s5
210 ; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
211 ; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
213 ; FLATSCR-NOT: SCRATCH_RSRC_DWORD
214 ; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
215 ; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
217 ; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
218 ; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
219 define amdgpu_hs float @hs_main(i32 %idx) {
220 %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
221 %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
222 %r = fadd float %v1, %v2
226 ; GCN-LABEL: {{^}}gs_main:
227 ; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s5
228 ; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0
230 ; GFX10-FLATSCR: s_add_u32 s0, s0, s5
231 ; GFX10-FLATSCR: s_addc_u32 s1, s1, 0
232 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
233 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
235 ; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[0:1]
236 ; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, s8
237 ; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0
238 ; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
239 ; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
240 ; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
241 ; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
242 ; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff
243 ; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5
244 ; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s1, 0
246 ; GFX10-FLATSCR-PAL: s_getpc_b64 s[0:1]
247 ; GFX10-FLATSCR-PAL: s_mov_b32 s0, s8
248 ; GFX10-FLATSCR-PAL: s_load_dwordx2 s[0:1], s[0:1], 0x0
249 ; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
250 ; GFX10-FLATSCR-PAL: s_and_b32 s1, s1, 0xffff
251 ; GFX10-FLATSCR-PAL: s_add_u32 s0, s0, s5
252 ; GFX10-FLATSCR-PAL: s_addc_u32 s1, s1, 0
253 ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
254 ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
256 ; SIVI: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
257 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
258 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
260 ; GFX9_10-MUBUF: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
261 ; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
262 ; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
264 ; FLATSCR-NOT: SCRATCH_RSRC_DWORD
265 ; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
266 ; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
268 ; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
269 ; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
270 define amdgpu_gs float @gs_main(i32 %idx) {
271 %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
272 %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
273 %r = fadd float %v1, %v2
277 ; Mesa GS and HS shaders have the preloaded scratch wave offset SGPR fixed at
278 ; SGPR5, and the inreg implementation is used to reference it in the IR. The
279 ; following tests confirm the shader and anything inserted after the return
280 ; (i.e. SI_RETURN_TO_EPILOG) can access the scratch wave offset.
282 ; GCN-LABEL: {{^}}hs_ir_uses_scratch_offset:
283 ; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s5
284 ; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0
286 ; GFX10-FLATSCR: s_add_u32 s0, s0, s5
287 ; GFX10-FLATSCR: s_addc_u32 s1, s1, 0
288 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
289 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
291 ; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[0:1]
292 ; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, s8
293 ; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0
294 ; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
295 ; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
296 ; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
297 ; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
298 ; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff
299 ; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5
300 ; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s1, 0
302 ; GFX10-FLATSCR-PAL: s_getpc_b64 s[0:1]
303 ; GFX10-FLATSCR-PAL: s_mov_b32 s0, s8
304 ; GFX10-FLATSCR-PAL: s_load_dwordx2 s[0:1], s[0:1], 0x0
305 ; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
306 ; GFX10-FLATSCR-PAL: s_and_b32 s1, s1, 0xffff
307 ; GFX10-FLATSCR-PAL: s_add_u32 s0, s0, s5
308 ; GFX10-FLATSCR-PAL: s_addc_u32 s1, s1, 0
309 ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
310 ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
312 ; MUBUF: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
313 ; FLATSCR-NOT: SCRATCH_RSRC_DWORD
315 ; SIVI-NOT: s_mov_b32 s6
316 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
317 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
319 ; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
320 ; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
322 ; MUBUF-DAG: s_mov_b32 s2, s5
324 ; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
325 ; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
327 ; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
328 ; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
329 define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) {
330 %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
331 %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
332 %f = fadd float %v1, %v2
333 %r1 = insertvalue <{i32, i32, i32, float}> undef, i32 %swo, 2
334 %r2 = insertvalue <{i32, i32, i32, float}> %r1, float %f, 3
335 ret <{i32, i32, i32, float}> %r2
338 ; GCN-LABEL: {{^}}gs_ir_uses_scratch_offset:
339 ; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s5
340 ; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0
342 ; GFX10-FLATSCR: s_add_u32 s0, s0, s5
343 ; GFX10-FLATSCR: s_addc_u32 s1, s1, 0
344 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
345 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
347 ; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[0:1]
348 ; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, s8
349 ; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0
350 ; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
351 ; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
352 ; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, 0
353 ; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
354 ; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff
355 ; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5
356 ; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s1, 0
358 ; GFX10-FLATSCR-PAL: s_getpc_b64 s[0:1]
359 ; GFX10-FLATSCR-PAL: s_mov_b32 s0, s8
360 ; GFX10-FLATSCR-PAL: s_load_dwordx2 s[0:1], s[0:1], 0x0
361 ; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
362 ; GFX10-FLATSCR-PAL: s_and_b32 s1, s1, 0xffff
363 ; GFX10-FLATSCR-PAL: s_add_u32 s0, s0, s5
364 ; GFX10-FLATSCR-PAL: s_addc_u32 s1, s1, 0
365 ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
366 ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
368 ; MUBUF: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
369 ; FLATSCR-NOT: SCRATCH_RSRC_DWORD
371 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
372 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
374 ; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
375 ; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
377 ; MUBUF-DAG: s_mov_b32 s2, s5
379 ; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
380 ; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
382 ; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
383 ; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
384 define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) {
385 %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
386 %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
387 %f = fadd float %v1, %v2
388 %r1 = insertvalue <{i32, i32, i32, float}> undef, i32 %swo, 2
389 %r2 = insertvalue <{i32, i32, i32, float}> %r1, float %f, 3
390 ret <{i32, i32, i32, float}> %r2