/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include <linux/list.h>
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
#include "amdgpu_smu.h"
#include "amdgpu_ras.h"
#include "df/df_3_6_offset.h"
static DEFINE_MUTEX(xgmi_mutex);

#define AMDGPU_MAX_XGMI_HIVE			8
#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE		4

static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE];
static unsigned hive_count = 0;
void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
{
	return &hive->device_list;
}
/**
 * DOC: AMDGPU XGMI Support
 *
 * XGMI is a high speed interconnect that joins multiple GPU cards
 * into a homogeneous memory space that is organized by a collective
 * hive ID and individual node IDs, both of which are 64-bit numbers.
 *
 * The file xgmi_device_id contains the unique per GPU device ID and
 * is stored in the /sys/class/drm/card${cardno}/device/ directory.
 *
 * Inside the device directory a sub-directory 'xgmi_hive_info' is
 * created which contains the hive ID and the list of nodes.
 *
 * The hive ID is stored in:
 *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
 *
 * The node information is stored in numbered directories:
 *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
 *
 * Each device has its own xgmi_hive_info directory with a mirror
 * set of node sub-directories.
 *
 * The XGMI memory space is built by contiguously adding the power of
 * two padded VRAM space from each node to each other.
 */
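/*
 * Example of walking the layout described above from a shell; the card
 * and node numbers here are hypothetical and depend on the system:
 *
 *   cat /sys/class/drm/card0/device/xgmi_device_id
 *   cat /sys/class/drm/card0/device/xgmi_hive_info/xgmi_hive_id
 *   cat /sys/class/drm/card0/device/xgmi_hive_info/node1/xgmi_device_id
 *
 * Each file prints a single decimal 64-bit ID.
 */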
static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_hive_info *hive =
			container_of(attr, struct amdgpu_hive_info, dev_attr);

	return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);
}
static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev,
				    struct amdgpu_hive_info *hive)
{
	int ret = 0;

	if (WARN_ON(hive->kobj))
		return -EINVAL;

	hive->kobj = kobject_create_and_add("xgmi_hive_info", &adev->dev->kobj);
	if (!hive->kobj) {
		dev_err(adev->dev, "XGMI: Failed to allocate sysfs entry!\n");
		return -EINVAL;
	}

	hive->dev_attr = (struct device_attribute) {
		.attr = {
			.name = "xgmi_hive_id",
			.mode = S_IRUGO,
		},
		.show = amdgpu_xgmi_show_hive_id,
	};

	ret = sysfs_create_file(hive->kobj, &hive->dev_attr.attr);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_hive_id\n");
		kobject_del(hive->kobj);
		kobject_put(hive->kobj);
		hive->kobj = NULL;
	}

	return ret;
}
static void amdgpu_xgmi_sysfs_destroy(struct amdgpu_device *adev,
				      struct amdgpu_hive_info *hive)
{
	sysfs_remove_file(hive->kobj, &hive->dev_attr.attr);
	kobject_del(hive->kobj);
	kobject_put(hive->kobj);
	hive->kobj = NULL;
}
static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;

	return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.xgmi.node_id);
}
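/*
 * Helper used by the error-counter query below: forms the control value
 * passed to the data fabric indirect register access hooks
 * (adev->df.funcs->get_fica()/set_fica()) for a given register offset.
 */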
#define AMDGPU_XGMI_SET_FICAA(o)	((o) | 0x456801)
static ssize_t amdgpu_xgmi_show_error(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;
	uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
	uint64_t fica_out;
	unsigned int error_count = 0;

	ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
	ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
	if (fica_out != 0x1f)
		pr_err("xGMI error counters not enabled!\n");

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);

	if ((fica_out & 0xffff) == 2)
		error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);

	adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);

	return snprintf(buf, PAGE_SIZE, "%d\n", error_count);
}
static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);
static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
					  struct amdgpu_hive_info *hive)
{
	int ret = 0;
	char node[10] = { 0 };

	/* Create xgmi device id file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
		return ret;
	}

	/* Create xgmi error file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
	if (ret)
		pr_err("failed to create xgmi_error\n");

	/* Create sysfs link to hive info folder on the first device */
	if (adev != hive->adev) {
		ret = sysfs_create_link(&adev->dev->kobj, hive->kobj,
					"xgmi_hive_info");
		if (ret) {
			dev_err(adev->dev, "XGMI: Failed to create link to hive info");
			goto remove_file;
		}
	}

	sprintf(node, "node%d", hive->number_devices);
	/* Create sysfs link from the hive folder to yourself */
	ret = sysfs_create_link(hive->kobj, &adev->dev->kobj, node);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create link from hive info");
		goto remove_link;
	}

	goto success;

remove_link:
	sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique);

remove_file:
	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);

success:
	return ret;
}
static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
					   struct amdgpu_hive_info *hive)
{
	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
	sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique);
	sysfs_remove_link(hive->kobj, adev->ddev->unique);
}
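/*
 * Look up the hive matching adev's hive ID, creating and initializing a
 * new entry on first sight. Returns with the hive's hive_lock held when
 * 'lock' is nonzero, or NULL if the device has no hive ID or the hive
 * table is exhausted.
 */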
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
{
	int i;
	struct amdgpu_hive_info *tmp;

	if (!adev->gmc.xgmi.hive_id)
		return NULL;

	mutex_lock(&xgmi_mutex);

	for (i = 0 ; i < hive_count; ++i) {
		tmp = &xgmi_hives[i];
		if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
			if (lock)
				mutex_lock(&tmp->hive_lock);
			mutex_unlock(&xgmi_mutex);
			return tmp;
		}
	}
	if (i >= AMDGPU_MAX_XGMI_HIVE) {
		mutex_unlock(&xgmi_mutex);
		return NULL;
	}

	/* initialize a new hive if it does not exist yet */
	tmp = &xgmi_hives[hive_count++];

	if (amdgpu_xgmi_sysfs_create(adev, tmp)) {
		mutex_unlock(&xgmi_mutex);
		return NULL;
	}

	tmp->adev = adev;
	tmp->hive_id = adev->gmc.xgmi.hive_id;
	INIT_LIST_HEAD(&tmp->device_list);
	mutex_init(&tmp->hive_lock);
	mutex_init(&tmp->reset_lock);
	task_barrier_init(&tmp->tb);

	if (lock)
		mutex_lock(&tmp->hive_lock);
	tmp->pstate = -1;
	mutex_unlock(&xgmi_mutex);

	return tmp;
}
int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
{
	int ret = 0;
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
	struct amdgpu_device *tmp_adev;
	bool update_hive_pstate = true;
	bool is_high_pstate = pstate && adev->asic_type == CHIP_VEGA20;

	if (!hive)
		return 0;

	mutex_lock(&hive->hive_lock);

	if (hive->pstate == pstate) {
		adev->pstate = is_high_pstate ? pstate : adev->pstate;
		goto out;
	}

	dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate);

	ret = amdgpu_dpm_set_xgmi_pstate(adev, pstate);
	if (ret) {
		dev_err(adev->dev,
			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
			adev->gmc.xgmi.node_id,
			adev->gmc.xgmi.hive_id, ret);
		goto out;
	}

	/* Update device pstate */
	adev->pstate = pstate;

	/*
	 * Update the hive pstate only when all devices of the hive
	 * are in the same pstate
	 */
	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		if (tmp_adev->pstate != adev->pstate) {
			update_hive_pstate = false;
			break;
		}
	}
	if (update_hive_pstate || is_high_pstate)
		hive->pstate = pstate;

out:
	mutex_unlock(&hive->hive_lock);

	return ret;
}
int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
{
	int ret;

	/* Each psp needs to set the latest topology */
	ret = psp_xgmi_set_topology_info(&adev->psp,
					 hive->number_devices,
					 &adev->psp.xgmi_context.top_info);
	if (ret)
		dev_err(adev->dev,
			"XGMI: Set topology failure on device %llx, hive %llx, ret %d",
			adev->gmc.xgmi.node_id,
			adev->gmc.xgmi.hive_id, ret);

	return ret;
}
int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
			       struct amdgpu_device *peer_adev)
{
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	int i;

	for (i = 0 ; i < top->num_nodes; ++i)
		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
			return top->nodes[i].num_hops;
	return -EINVAL;
}
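/*
 * Join the hive during device init: query the hive and node IDs from the
 * PSP (or derive placeholder IDs when no PSP block is present), add the
 * device to the hive's list, push the updated topology to every node and
 * publish the per-device sysfs info.
 */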
int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{
	struct psp_xgmi_topology_info *top_info;
	struct amdgpu_hive_info *hive;
	struct amdgpu_xgmi *entry;
	struct amdgpu_device *tmp_adev = NULL;

	int count = 0, ret = 0;

	if (!adev->gmc.xgmi.supported)
		return 0;

	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get hive id\n");
			return ret;
		}

		ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get node id\n");
			return ret;
		}
	} else {
		adev->gmc.xgmi.hive_id = 16;
		adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
	}

	hive = amdgpu_get_xgmi_hive(adev, 1);
	if (!hive) {
		ret = -EINVAL;
		dev_err(adev->dev,
			"XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
			adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
		goto exit;
	}

	/* Set default device pstate */
	adev->pstate = -1;

	top_info = &adev->psp.xgmi_context.top_info;

	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
	list_for_each_entry(entry, &hive->device_list, head)
		top_info->nodes[count++].node_id = entry->node_id;
	top_info->num_nodes = count;
	hive->number_devices = count;

	task_barrier_add_task(&hive->tb);

	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			/* update node list for other devices in the hive */
			if (tmp_adev != adev) {
				top_info = &tmp_adev->psp.xgmi_context.top_info;
				top_info->nodes[count - 1].node_id =
					adev->gmc.xgmi.node_id;
				top_info->num_nodes = count;
			}
			ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
			if (ret)
				goto exit;
		}

		/* get latest topology info for each device from psp */
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
					&tmp_adev->psp.xgmi_context.top_info);
			if (ret) {
				dev_err(tmp_adev->dev,
					"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
					tmp_adev->gmc.xgmi.node_id,
					tmp_adev->gmc.xgmi.hive_id, ret);
				/* To do: continue with some nodes failed or disable the whole hive */
				goto exit;
			}
		}
	}

	if (!ret)
		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);

	mutex_unlock(&hive->hive_lock);
exit:
	if (!ret)
		dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
			 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
	else
		dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
			adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
			ret);

	return ret;
}
void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive;

	if (!adev->gmc.xgmi.supported)
		return;

	hive = amdgpu_get_xgmi_hive(adev, 1);
	if (!hive)
		return;

	if (!(hive->number_devices--)) {
		amdgpu_xgmi_sysfs_destroy(adev, hive);
		mutex_destroy(&hive->hive_lock);
		mutex_destroy(&hive->reset_lock);
	} else {
		task_barrier_rem_task(&hive->tb);
		amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
		mutex_unlock(&hive->hive_lock);
	}
}
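/* Set up RAS error counting and error injection for the XGMI/WAFL block. */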
int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
{
	int r;
	struct ras_ih_if ih_info = {
		.cb = NULL,
	};
	struct ras_fs_if fs_info = {
		.sysfs_name = "xgmi_wafl_err_count",
		.debugfs_name = "xgmi_wafl_err_inject",
	};

	if (!adev->gmc.xgmi.supported ||
	    adev->gmc.xgmi.num_physical_nodes == 0)
		return 0;

	if (!adev->gmc.xgmi.ras_if) {
		adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
		if (!adev->gmc.xgmi.ras_if)
			return -ENOMEM;
		adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
		adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		adev->gmc.xgmi.ras_if->sub_block_index = 0;
		strcpy(adev->gmc.xgmi.ras_if->name, "xgmi_wafl");
	}
	ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if;
	r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if,
				 &fs_info, &ih_info);
	if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) {
		kfree(adev->gmc.xgmi.ras_if);
		adev->gmc.xgmi.ras_if = NULL;
	}

	return r;
}
void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev)
{
	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) &&
	    adev->gmc.xgmi.ras_if) {
		struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if;
		struct ras_ih_if ih_info = {
			.cb = NULL,
		};

		amdgpu_ras_late_fini(adev, ras_if, &ih_info);
		kfree(ras_if);
	}
}