/*
 * Copyright (c) 2008 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *    Keith Packard <keithp@keithp.com>
 *    Mika Kuoppala <mika.kuoppala@intel.com>
 *
 */
#include <generated/utsrelease.h>
#include <linux/stop_machine.h>
#include <linux/zlib.h>
#include <drm/drm_print.h>
#include <linux/ascii85.h>

#include "i915_gpu_error.h"
#include "i915_drv.h"
static inline const struct intel_engine_cs *
engine_lookup(const struct drm_i915_private *i915, unsigned int id)
{
	if (id >= I915_NUM_ENGINES)
		return NULL;

	return i915->engine[id];
}

static inline const char *
__engine_name(const struct intel_engine_cs *engine)
{
	return engine ? engine->name : "";
}

static const char *
engine_name(const struct drm_i915_private *i915, unsigned int id)
{
	return __engine_name(engine_lookup(i915, id));
}

static const char *tiling_flag(int tiling)
{
	switch (tiling) {
	default:
	case I915_TILING_NONE: return "";
	case I915_TILING_X: return " X";
	case I915_TILING_Y: return " Y";
	}
}

static const char *dirty_flag(int dirty)
{
	return dirty ? " dirty" : "";
}

static const char *purgeable_flag(int purgeable)
{
	return purgeable ? " purgeable" : "";
}
static bool __i915_error_ok(struct drm_i915_error_state_buf *e)
{
	/* Checking for overflows is mandatory */
	if (!e->err && WARN(e->bytes > (e->size - 1), "overflow")) {
		e->err = -ENOSPC;
		return false;
	}

	if (e->bytes == e->size - 1 || e->err)
		return false;

	return true;
}

static bool __i915_error_seek(struct drm_i915_error_state_buf *e,
			      unsigned len)
{
	if (e->pos + len <= e->start) {
		e->pos += len;
		return false;
	}

	/* First vsnprintf needs to fit in its entirety for memmove */
	if (len >= e->size) {
		e->err = -EIO;
		return false;
	}

	return true;
}

static void __i915_error_advance(struct drm_i915_error_state_buf *e,
				 unsigned len)
{
	/* If this is the first printf in this window, adjust it so that
	 * the start position matches the start of the buffer.
	 */
	if (e->pos < e->start) {
		const size_t off = e->start - e->pos;

		/* Should not happen but be paranoid */
		if (off > len || e->bytes) {
			e->err = -EIO;
			return;
		}

		memmove(e->buf, e->buf + off, len - off);
		e->bytes = len - off;
		e->pos = e->start;
		return;
	}

	e->bytes += len;
	e->pos += len;
}
static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
			       const char *f, va_list args)
{
	unsigned len;

	if (!__i915_error_ok(e))
		return;

	/* Seek the first printf which hits the start position */
	if (e->pos < e->start) {
		va_list tmp;

		va_copy(tmp, args);
		len = vsnprintf(NULL, 0, f, tmp);
		va_end(tmp);

		if (!__i915_error_seek(e, len))
			return;
	}

	len = vsnprintf(e->buf + e->bytes, e->size - e->bytes, f, args);
	if (len >= e->size - e->bytes)
		len = e->size - e->bytes - 1;

	__i915_error_advance(e, len);
}

static void i915_error_puts(struct drm_i915_error_state_buf *e,
			    const char *str)
{
	unsigned len;

	if (!__i915_error_ok(e))
		return;

	len = strlen(str);

	/* Seek the first printf which hits the start position */
	if (e->pos < e->start) {
		if (!__i915_error_seek(e, len))
			return;
	}

	if (len >= e->size - e->bytes)
		len = e->size - e->bytes - 1;
	memcpy(e->buf + e->bytes, str, len);

	__i915_error_advance(e, len);
}

#define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
#define err_puts(e, s) i915_error_puts(e, s)
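
/*
 * Editor's note (illustrative, not from the original source): the helpers
 * above implement a seek window so that a debugfs read at an arbitrary
 * offset can cheaply replay the whole dump. E.g. for a read at pos 4096,
 * e->start is 4096; each err_printf() before that point is only measured
 * via vsnprintf(NULL, 0, f, tmp) and skipped, and the first print that
 * straddles the boundary is memmove()d by __i915_error_advance() so the
 * buffer begins exactly at e->start.
 */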
static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
{
	i915_error_vprintf(p->arg, vaf->fmt, *vaf->va);
}

static inline struct drm_printer
i915_error_printer(struct drm_i915_error_state_buf *e)
{
	struct drm_printer p = {
		.printfn = __i915_printfn_error,
		.arg = e,
	};
	return p;
}
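
/*
 * Editor's note: i915_error_printer() adapts the error-state buffer to
 * the generic struct drm_printer callback interface, so shared dumpers
 * used later in this file (intel_device_info_dump_flags(),
 * i915_params_dump(), intel_uc_fw_dump(), ...) can write into the
 * capture without knowing about the seek window above.
 */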
#ifdef CONFIG_DRM_I915_COMPRESS_ERROR

struct compress {
	struct z_stream_s zstream;
	void *tmp;
};

static bool compress_init(struct compress *c)
{
	struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream));

	zstream->workspace =
		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
			GFP_ATOMIC | __GFP_NOWARN);
	if (!zstream->workspace)
		return false;

	if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
		kfree(zstream->workspace);
		return false;
	}

	c->tmp = NULL;
	if (i915_has_memcpy_from_wc())
		c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);

	return true;
}

static void *compress_next_page(struct drm_i915_error_object *dst)
{
	unsigned long page;

	if (dst->page_count >= dst->num_pages)
		return ERR_PTR(-ENOSPC);

	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
	if (!page)
		return ERR_PTR(-ENOMEM);

	return dst->pages[dst->page_count++] = (void *)page;
}
static int compress_page(struct compress *c,
			 void *src,
			 struct drm_i915_error_object *dst)
{
	struct z_stream_s *zstream = &c->zstream;

	zstream->next_in = src;
	if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
		zstream->next_in = c->tmp;
	zstream->avail_in = PAGE_SIZE;

	do {
		if (zstream->avail_out == 0) {
			zstream->next_out = compress_next_page(dst);
			if (IS_ERR(zstream->next_out))
				return PTR_ERR(zstream->next_out);

			zstream->avail_out = PAGE_SIZE;
		}

		if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
			return -EIO;
	} while (zstream->avail_in);

	/* Fallback to uncompressed if we increase size? */
	if (0 && zstream->total_out > zstream->total_in)
		return -E2BIG;

	return 0;
}
static int compress_flush(struct compress *c,
			  struct drm_i915_error_object *dst)
{
	struct z_stream_s *zstream = &c->zstream;

	do {
		switch (zlib_deflate(zstream, Z_FINISH)) {
		case Z_OK: /* more space requested */
			zstream->next_out = compress_next_page(dst);
			if (IS_ERR(zstream->next_out))
				return PTR_ERR(zstream->next_out);

			zstream->avail_out = PAGE_SIZE;
			break;

		case Z_STREAM_END:
			goto end;

		default: /* any error */
			return -EIO;
		}
	} while (1);

end:
	memset(zstream->next_out, 0, zstream->avail_out);
	dst->unused = zstream->avail_out;
	return 0;
}

static void compress_fini(struct compress *c,
			  struct drm_i915_error_object *dst)
{
	struct z_stream_s *zstream = &c->zstream;

	zlib_deflateEnd(zstream);
	kfree(zstream->workspace);

	if (c->tmp)
		free_page((unsigned long)c->tmp);
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, ":");
}
#else

struct compress {
};

static bool compress_init(struct compress *c)
{
	return true;
}

static int compress_page(struct compress *c,
			 void *src,
			 struct drm_i915_error_object *dst)
{
	unsigned long page;
	void *ptr;

	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
	if (!page)
		return -ENOMEM;

	ptr = (void *)page;
	if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
		memcpy(ptr, src, PAGE_SIZE);
	dst->pages[dst->page_count++] = ptr;

	return 0;
}

static int compress_flush(struct compress *c,
			  struct drm_i915_error_object *dst)
{
	return 0;
}

static void compress_fini(struct compress *c,
			  struct drm_i915_error_object *dst)
{
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, "~");
}

#endif
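
/*
 * Editor's note: each object dump is prefixed by a one-character
 * compression marker, ':' when CONFIG_DRM_I915_COMPRESS_ERROR deflates
 * the pages and '~' for the raw fallback above, so a decoder can choose
 * between inflating and simply reversing the ascii85 encoding emitted
 * by print_error_obj() below.
 */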
static void print_error_buffers(struct drm_i915_error_state_buf *m,
				const char *name,
				struct drm_i915_error_buffer *err,
				int count)
{
	err_printf(m, "%s [%d]:\n", name, count);

	while (count--) {
		err_printf(m, "    %08x_%08x %8u %02x %02x %02x",
			   upper_32_bits(err->gtt_offset),
			   lower_32_bits(err->gtt_offset),
			   err->size,
			   err->read_domains,
			   err->write_domain,
			   err->wseqno);
		err_puts(m, tiling_flag(err->tiling));
		err_puts(m, dirty_flag(err->dirty));
		err_puts(m, purgeable_flag(err->purgeable));
		err_puts(m, err->userptr ? " userptr" : "");
		err_puts(m, err->engine != -1 ? " " : "");
		err_puts(m, engine_name(m->i915, err->engine));
		err_puts(m, i915_cache_level_str(m->i915, err->cache_level));

		if (err->name)
			err_printf(m, " (name: %d)", err->name);
		if (err->fence_reg != I915_FENCE_REG_NONE)
			err_printf(m, " (fence: %d)", err->fence_reg);

		err_puts(m, "\n");
		err++;
	}
}
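
/*
 * Editor's note: with hypothetical values, one entry printed by the loop
 * above looks like
 *
 *   Active (rcs0) [1]:
 *       00000000_00100000     4096 01 01 00 X dirty rcs0 snooped or LLC
 *
 * i.e. gtt_offset, size, read/write domains, plus the flag strings
 * assembled from tiling/dirty/purgeable/userptr/engine/cache level.
 */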
static void error_print_instdone(struct drm_i915_error_state_buf *m,
				 const struct drm_i915_error_engine *ee)
{
	int slice;
	int subslice;

	err_printf(m, "  INSTDONE: 0x%08x\n",
		   ee->instdone.instdone);

	if (ee->engine_id != RCS || INTEL_GEN(m->i915) <= 3)
		return;

	err_printf(m, "  SC_INSTDONE: 0x%08x\n",
		   ee->instdone.slice_common);

	if (INTEL_GEN(m->i915) <= 6)
		return;

	for_each_instdone_slice_subslice(m->i915, slice, subslice)
		err_printf(m, "  SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
			   slice, subslice,
			   ee->instdone.sampler[slice][subslice]);

	for_each_instdone_slice_subslice(m->i915, slice, subslice)
		err_printf(m, "  ROW_INSTDONE[%d][%d]: 0x%08x\n",
			   slice, subslice,
			   ee->instdone.row[slice][subslice]);
}
static const char *bannable(const struct drm_i915_error_context *ctx)
{
	return ctx->bannable ? "" : " (unbannable)";
}

static void error_print_request(struct drm_i915_error_state_buf *m,
				const char *prefix,
				const struct drm_i915_error_request *erq,
				const unsigned long epoch)
{
	if (!erq->seqno)
		return;

	err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
		   prefix, erq->pid, erq->ban_score,
		   erq->context, erq->seqno, erq->sched_attr.priority,
		   jiffies_to_msecs(erq->jiffies - epoch),
		   erq->start, erq->head, erq->tail);
}

static void error_print_context(struct drm_i915_error_state_buf *m,
				const char *header,
				const struct drm_i915_error_context *ctx)
{
	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d%s guilty %d active %d\n",
		   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
		   ctx->sched_attr.priority, ctx->ban_score, bannable(ctx),
		   ctx->guilty, ctx->active);
}
static void error_print_engine(struct drm_i915_error_state_buf *m,
			       const struct drm_i915_error_engine *ee,
			       const unsigned long epoch)
{
	int n;

	err_printf(m, "%s command stream:\n",
		   engine_name(m->i915, ee->engine_id));
	err_printf(m, "  IDLE?: %s\n", yesno(ee->idle));
	err_printf(m, "  START: 0x%08x\n", ee->start);
	err_printf(m, "  HEAD:  0x%08x [0x%08x]\n", ee->head, ee->rq_head);
	err_printf(m, "  TAIL:  0x%08x [0x%08x, 0x%08x]\n",
		   ee->tail, ee->rq_post, ee->rq_tail);
	err_printf(m, "  CTL:   0x%08x\n", ee->ctl);
	err_printf(m, "  MODE:  0x%08x\n", ee->mode);
	err_printf(m, "  HWS:   0x%08x\n", ee->hws);
	err_printf(m, "  ACTHD: 0x%08x %08x\n",
		   (u32)(ee->acthd>>32), (u32)ee->acthd);
	err_printf(m, "  IPEIR: 0x%08x\n", ee->ipeir);
	err_printf(m, "  IPEHR: 0x%08x\n", ee->ipehr);

	error_print_instdone(m, ee);

	if (ee->batchbuffer) {
		u64 start = ee->batchbuffer->gtt_offset;
		u64 end = start + ee->batchbuffer->gtt_size;

		err_printf(m, "  batch: [0x%08x_%08x, 0x%08x_%08x]\n",
			   upper_32_bits(start), lower_32_bits(start),
			   upper_32_bits(end), lower_32_bits(end));
	}
	if (INTEL_GEN(m->i915) >= 4) {
		err_printf(m, "  BBADDR: 0x%08x_%08x\n",
			   (u32)(ee->bbaddr>>32), (u32)ee->bbaddr);
		err_printf(m, "  BB_STATE: 0x%08x\n", ee->bbstate);
		err_printf(m, "  INSTPS: 0x%08x\n", ee->instps);
	}
	err_printf(m, "  INSTPM: 0x%08x\n", ee->instpm);
	err_printf(m, "  FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr),
		   lower_32_bits(ee->faddr));
	if (INTEL_GEN(m->i915) >= 6) {
		err_printf(m, "  RC PSMI: 0x%08x\n", ee->rc_psmi);
		err_printf(m, "  FAULT_REG: 0x%08x\n", ee->fault_reg);
		err_printf(m, "  SYNC_0: 0x%08x\n",
			   ee->semaphore_mboxes[0]);
		err_printf(m, "  SYNC_1: 0x%08x\n",
			   ee->semaphore_mboxes[1]);
		if (HAS_VEBOX(m->i915))
			err_printf(m, "  SYNC_2: 0x%08x\n",
				   ee->semaphore_mboxes[2]);
	}
	if (USES_PPGTT(m->i915)) {
		err_printf(m, "  GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);

		if (INTEL_GEN(m->i915) >= 8) {
			int i;

			for (i = 0; i < 4; i++)
				err_printf(m, "  PDP%d: 0x%016llx\n",
					   i, ee->vm_info.pdp[i]);
		} else {
			err_printf(m, "  PP_DIR_BASE: 0x%08x\n",
				   ee->vm_info.pp_dir_base);
		}
	}
	err_printf(m, "  seqno: 0x%08x\n", ee->seqno);
	err_printf(m, "  last_seqno: 0x%08x\n", ee->last_seqno);
	err_printf(m, "  waiting: %s\n", yesno(ee->waiting));
	err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
	err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
	err_printf(m, "  hangcheck stall: %s\n", yesno(ee->hangcheck_stalled));
	err_printf(m, "  hangcheck action: %s\n",
		   hangcheck_action_to_str(ee->hangcheck_action));
	err_printf(m, "  hangcheck action timestamp: %dms (%lu%s)\n",
		   jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
		   ee->hangcheck_timestamp,
		   ee->hangcheck_timestamp == epoch ? "; epoch" : "");
	err_printf(m, "  engine reset count: %u\n", ee->reset_count);

	for (n = 0; n < ee->num_ports; n++) {
		err_printf(m, "  ELSP[%d]:", n);
		error_print_request(m, " ", &ee->execlist[n], epoch);
	}

	error_print_context(m, "  Active context: ", &ee->context);
}
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{
	va_list args;

	va_start(args, f);
	i915_error_vprintf(e, f, args);
	va_end(args);
}
static void print_error_obj(struct drm_i915_error_state_buf *m,
			    struct intel_engine_cs *engine,
			    const char *name,
			    struct drm_i915_error_object *obj)
{
	char out[ASCII85_BUFSZ];
	int page;

	if (!obj)
		return;

	if (name) {
		err_printf(m, "%s --- %s = 0x%08x %08x\n",
			   engine ? engine->name : "global", name,
			   upper_32_bits(obj->gtt_offset),
			   lower_32_bits(obj->gtt_offset));
	}

	err_compression_marker(m);
	for (page = 0; page < obj->page_count; page++) {
		int i, len;

		len = PAGE_SIZE;
		if (page == obj->page_count - 1)
			len -= obj->unused;
		len = ascii85_encode_len(len);

		for (i = 0; i < len; i++)
			err_puts(m, ascii85_encode(obj->pages[page][i], out));
	}
	err_puts(m, "\n");
}
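
/*
 * Editor's note: after the one-character compression marker, every
 * captured page is streamed as ascii85 (linux/ascii85.h folds an
 * all-zero word into a single 'z'), and on the final page the bytes
 * left over by compress_flush() (obj->unused) are trimmed before
 * encoding.
 */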
static void err_print_capabilities(struct drm_i915_error_state_buf *m,
				   const struct intel_device_info *info,
				   const struct intel_driver_caps *caps)
{
	struct drm_printer p = i915_error_printer(m);

	intel_device_info_dump_flags(info, &p);
	intel_driver_caps_print(caps, &p);
	intel_device_info_dump_topology(&info->sseu, &p);
}

static void err_print_params(struct drm_i915_error_state_buf *m,
			     const struct i915_params *params)
{
	struct drm_printer p = i915_error_printer(m);

	i915_params_dump(params, &p);
}

static void err_print_pciid(struct drm_i915_error_state_buf *m,
			    struct drm_i915_private *i915)
{
	struct pci_dev *pdev = i915->drm.pdev;

	err_printf(m, "PCI ID: 0x%04x\n", pdev->device);
	err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision);
	err_printf(m, "PCI Subsystem: %04x:%04x\n",
		   pdev->subsystem_vendor,
		   pdev->subsystem_device);
}

static void err_print_uc(struct drm_i915_error_state_buf *m,
			 const struct i915_error_uc *error_uc)
{
	struct drm_printer p = i915_error_printer(m);
	const struct i915_gpu_state *error =
		container_of(error_uc, typeof(*error), uc);

	if (!error->device_info.has_guc)
		return;

	intel_uc_fw_dump(&error_uc->guc_fw, &p);
	intel_uc_fw_dump(&error_uc->huc_fw, &p);
	print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log);
}
int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
			    const struct i915_gpu_state *error)
{
	struct drm_i915_private *dev_priv = m->i915;
	struct drm_i915_error_object *obj;
	struct timespec64 ts;
	int i, j;

	if (!error) {
		err_printf(m, "No error state collected\n");
		return 0;
	}

	if (*error->error_msg)
		err_printf(m, "%s\n", error->error_msg);
	err_printf(m, "Kernel: " UTS_RELEASE "\n");
	ts = ktime_to_timespec64(error->time);
	err_printf(m, "Time: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	ts = ktime_to_timespec64(error->boottime);
	err_printf(m, "Boottime: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	ts = ktime_to_timespec64(error->uptime);
	err_printf(m, "Uptime: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ);
	err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n",
		   error->capture,
		   jiffies_to_msecs(jiffies - error->capture),
		   jiffies_to_msecs(error->capture - error->epoch));

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		if (error->engine[i].hangcheck_stalled &&
		    error->engine[i].context.pid) {
			err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
				   engine_name(m->i915, i),
				   error->engine[i].context.comm,
				   error->engine[i].context.pid,
				   error->engine[i].context.ban_score,
				   bannable(&error->engine[i].context));
		}
	}
	err_printf(m, "Reset count: %u\n", error->reset_count);
	err_printf(m, "Suspend count: %u\n", error->suspend_count);
	err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
	err_print_pciid(m, error->i915);

	err_printf(m, "IOMMU enabled?: %d\n", error->iommu);

	if (HAS_CSR(dev_priv)) {
		struct intel_csr *csr = &dev_priv->csr;

		err_printf(m, "DMC loaded: %s\n",
			   yesno(csr->dmc_payload != NULL));
		err_printf(m, "DMC fw version: %d.%d\n",
			   CSR_VERSION_MAJOR(csr->version),
			   CSR_VERSION_MINOR(csr->version));
	}

	err_printf(m, "GT awake: %s\n", yesno(error->awake));
	err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock));
	err_printf(m, "PM suspended: %s\n", yesno(error->suspended));
	err_printf(m, "EIR: 0x%08x\n", error->eir);
	err_printf(m, "IER: 0x%08x\n", error->ier);
	for (i = 0; i < error->ngtier; i++)
		err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]);
	err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
	err_printf(m, "CCID: 0x%08x\n", error->ccid);
	err_printf(m, "Missed interrupts: 0x%08lx\n", dev_priv->gpu_error.missed_irq_rings);

	for (i = 0; i < error->nfence; i++)
		err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);

	if (INTEL_GEN(dev_priv) >= 6) {
		err_printf(m, "ERROR: 0x%08x\n", error->error);

		if (INTEL_GEN(dev_priv) >= 8)
			err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
				   error->fault_data1, error->fault_data0);

		err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg);
	}

	if (IS_GEN7(dev_priv))
		err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		if (error->engine[i].engine_id != -1)
			error_print_engine(m, &error->engine[i], error->epoch);
	}

	for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) {
		char buf[128];
		int len, first = 1;

		if (!error->active_vm[i])
			break;

		len = scnprintf(buf, sizeof(buf), "Active (");
		for (j = 0; j < ARRAY_SIZE(error->engine); j++) {
			if (error->engine[j].vm != error->active_vm[i])
				continue;

			len += scnprintf(buf + len, sizeof(buf), "%s%s",
					 first ? "" : ", ",
					 dev_priv->engine[j]->name);
			first = 0;
		}
		scnprintf(buf + len, sizeof(buf), ")");
		print_error_buffers(m, buf,
				    error->active_bo[i],
				    error->active_bo_count[i]);
	}

	if (error->pinned_bo)
		print_error_buffers(m, "Pinned (global)",
				    error->pinned_bo,
				    error->pinned_bo_count);

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		const struct drm_i915_error_engine *ee = &error->engine[i];

		obj = ee->batchbuffer;
		if (obj) {
			err_puts(m, dev_priv->engine[i]->name);
			if (ee->context.pid)
				err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)",
					   ee->context.comm,
					   ee->context.pid,
					   ee->context.handle,
					   ee->context.hw_id,
					   ee->context.ban_score,
					   bannable(&ee->context));
			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
				   upper_32_bits(obj->gtt_offset),
				   lower_32_bits(obj->gtt_offset));
			print_error_obj(m, dev_priv->engine[i], NULL, obj);
		}

		for (j = 0; j < ee->user_bo_count; j++)
			print_error_obj(m, dev_priv->engine[i],
					"user", ee->user_bo[j]);

		if (ee->num_requests) {
			err_printf(m, "%s --- %d requests\n",
				   dev_priv->engine[i]->name,
				   ee->num_requests);
			for (j = 0; j < ee->num_requests; j++)
				error_print_request(m, " ",
						    &ee->requests[j],
						    error->epoch);
		}

		if (IS_ERR(ee->waiters)) {
			err_printf(m, "%s --- ? waiters [unable to acquire spinlock]\n",
				   dev_priv->engine[i]->name);
		} else if (ee->num_waiters) {
			err_printf(m, "%s --- %d waiters\n",
				   dev_priv->engine[i]->name,
				   ee->num_waiters);
			for (j = 0; j < ee->num_waiters; j++) {
				err_printf(m, " seqno 0x%08x for %s [%d]\n",
					   ee->waiters[j].seqno,
					   ee->waiters[j].comm,
					   ee->waiters[j].pid);
			}
		}

		print_error_obj(m, dev_priv->engine[i],
				"ringbuffer", ee->ringbuffer);

		print_error_obj(m, dev_priv->engine[i],
				"HW Status", ee->hws_page);

		print_error_obj(m, dev_priv->engine[i],
				"HW context", ee->ctx);

		print_error_obj(m, dev_priv->engine[i],
				"WA context", ee->wa_ctx);

		print_error_obj(m, dev_priv->engine[i],
				"WA batchbuffer", ee->wa_batchbuffer);

		print_error_obj(m, dev_priv->engine[i],
				"NULL context", ee->default_state);
	}

	if (error->overlay)
		intel_overlay_print_error_state(m, error->overlay);

	if (error->display)
		intel_display_print_error_state(m, error->display);

	err_print_capabilities(m, &error->device_info, &error->driver_caps);
	err_print_params(m, &error->params);
	err_print_uc(m, &error->uc);

	if (m->bytes == 0 && m->err)
		return m->err;

	return 0;
}
int i915_error_state_buf_init(struct drm_i915_error_state_buf *ebuf,
			      struct drm_i915_private *i915,
			      size_t count, loff_t pos)
{
	memset(ebuf, 0, sizeof(*ebuf));
	ebuf->i915 = i915;

	/* We need to have enough room to store any i915_error_state printf
	 * so that we can move it to the start position.
	 */
	ebuf->size = count + 1 > PAGE_SIZE ? count + 1 : PAGE_SIZE;
	ebuf->buf = kmalloc(ebuf->size,
			    GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);

	if (ebuf->buf == NULL) {
		ebuf->size = PAGE_SIZE;
		ebuf->buf = kmalloc(ebuf->size, GFP_KERNEL);
	}

	if (ebuf->buf == NULL) {
		ebuf->size = 128;
		ebuf->buf = kmalloc(ebuf->size, GFP_KERNEL);
	}

	if (ebuf->buf == NULL)
		return -ENOMEM;

	ebuf->start = pos;

	return 0;
}
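
/*
 * Editor's note: the allocation above falls back from the caller's
 * requested size to PAGE_SIZE and finally to a 128-byte buffer so that
 * at least a truncated capture can be produced under memory pressure;
 * ebuf->start records the read offset that the seek window in
 * i915_error_vprintf() replays up to.
 */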
static void i915_error_object_free(struct drm_i915_error_object *obj)
{
	int page;

	if (obj == NULL)
		return;

	for (page = 0; page < obj->page_count; page++)
		free_page((unsigned long)obj->pages[page]);

	kfree(obj);
}

static __always_inline void free_param(const char *type, void *x)
{
	if (!__builtin_strcmp(type, "char *"))
		kfree(*(void **)x);
}

static void cleanup_params(struct i915_gpu_state *error)
{
#define FREE(T, x, ...) free_param(#T, &error->params.x);
	I915_PARAMS_FOR_EACH(FREE);
#undef FREE
}

static void cleanup_uc_state(struct i915_gpu_state *error)
{
	struct i915_error_uc *error_uc = &error->uc;

	kfree(error_uc->guc_fw.path);
	kfree(error_uc->huc_fw.path);
	i915_error_object_free(error_uc->guc_log);
}
void __i915_gpu_state_free(struct kref *error_ref)
{
	struct i915_gpu_state *error =
		container_of(error_ref, typeof(*error), ref);
	long i, j;

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		struct drm_i915_error_engine *ee = &error->engine[i];

		for (j = 0; j < ee->user_bo_count; j++)
			i915_error_object_free(ee->user_bo[j]);
		kfree(ee->user_bo);

		i915_error_object_free(ee->batchbuffer);
		i915_error_object_free(ee->wa_batchbuffer);
		i915_error_object_free(ee->ringbuffer);
		i915_error_object_free(ee->hws_page);
		i915_error_object_free(ee->ctx);
		i915_error_object_free(ee->wa_ctx);

		kfree(ee->requests);
		if (!IS_ERR_OR_NULL(ee->waiters))
			kfree(ee->waiters);
	}

	for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
		kfree(error->active_bo[i]);
	kfree(error->pinned_bo);

	kfree(error->overlay);
	kfree(error->display);

	cleanup_params(error);
	cleanup_uc_state(error);

	kfree(error);
}
static struct drm_i915_error_object *
i915_error_object_create(struct drm_i915_private *i915,
			 struct i915_vma *vma)
{
	struct i915_ggtt *ggtt = &i915->ggtt;
	const u64 slot = ggtt->error_capture.start;
	struct drm_i915_error_object *dst;
	struct compress compress;
	unsigned long num_pages;
	struct sgt_iter iter;
	dma_addr_t dma;
	int ret;

	if (!vma)
		return NULL;

	num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
	num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
		      GFP_ATOMIC | __GFP_NOWARN);
	if (!dst)
		return NULL;

	dst->gtt_offset = vma->node.start;
	dst->gtt_size = vma->node.size;
	dst->num_pages = num_pages;
	dst->page_count = 0;
	dst->unused = 0;

	if (!compress_init(&compress)) {
		kfree(dst);
		return NULL;
	}

	ret = -EINVAL;
	for_each_sgt_dma(dma, iter, vma->pages) {
		void __iomem *s;

		ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);

		s = io_mapping_map_atomic_wc(&ggtt->iomap, slot);
		ret = compress_page(&compress, (void  __force *)s, dst);
		io_mapping_unmap_atomic(s);
		if (ret)
			break;
	}

	if (ret || compress_flush(&compress, dst)) {
		while (dst->page_count--)
			free_page((unsigned long)dst->pages[dst->page_count]);
		kfree(dst);
		dst = NULL;
	}

	compress_fini(&compress, dst);
	ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
	return dst;
}
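
/*
 * Editor's note: rather than mapping the object's own pages while the
 * machine is stopped, each page is bounced through ggtt->error_capture,
 * a single GGTT slot reserved for exactly this purpose, which is rebound
 * with insert_page() for every page and cleared again once the copy
 * completes, so the capture path never has to allocate address space.
 */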
/* The error capture is special as it tries to run underneath the normal
 * locking rules - so we use the raw version of the i915_gem_active lookup.
 */
static inline uint32_t
__active_get_seqno(struct i915_gem_active *active)
{
	struct i915_request *request;

	request = __i915_gem_active_peek(active);
	return request ? request->global_seqno : 0;
}

static inline int
__active_get_engine_id(struct i915_gem_active *active)
{
	struct i915_request *request;

	request = __i915_gem_active_peek(active);
	return request ? request->engine->id : -1;
}
static void capture_bo(struct drm_i915_error_buffer *err,
		       struct i915_vma *vma)
{
	struct drm_i915_gem_object *obj = vma->obj;

	err->size = obj->base.size;
	err->name = obj->base.name;

	err->wseqno = __active_get_seqno(&obj->frontbuffer_write);
	err->engine = __active_get_engine_id(&obj->frontbuffer_write);

	err->gtt_offset = vma->node.start;
	err->read_domains = obj->read_domains;
	err->write_domain = obj->write_domain;
	err->fence_reg = vma->fence ? vma->fence->id : -1;
	err->tiling = i915_gem_object_get_tiling(obj);
	err->dirty = obj->mm.dirty;
	err->purgeable = obj->mm.madv != I915_MADV_WILLNEED;
	err->userptr = obj->userptr.mm != NULL;
	err->cache_level = obj->cache_level;
}

static u32 capture_error_bo(struct drm_i915_error_buffer *err,
			    int count, struct list_head *head,
			    bool pinned_only)
{
	struct i915_vma *vma;
	int i = 0;

	list_for_each_entry(vma, head, vm_link) {
		if (!vma->obj)
			continue;

		if (pinned_only && !i915_vma_is_pinned(vma))
			continue;

		capture_bo(err++, vma);
		if (++i == count)
			break;
	}

	return i;
}
/* Generate a semi-unique error code. The code is not meant to have meaning;
 * its only purpose is to try to prevent false duplicated bug reports by
 * grossly estimating a GPU error state.
 *
 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
 * the hang if we could strip the GTT offset information from it.
 *
 * It's only a small step better than a random number in its current form.
 */
static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
					 struct i915_gpu_state *error,
					 int *engine_id)
{
	uint32_t error_code = 0;
	int i;

	/* IPEHR would be an ideal way to detect errors, as it's the gross
	 * measure of "the command that hung." However, it often holds very
	 * common synchronization commands that almost always appear when
	 * the hang is strictly a client bug. Use instdone to differentiate
	 * some of those cases.
	 */
	for (i = 0; i < I915_NUM_ENGINES; i++) {
		if (error->engine[i].hangcheck_stalled) {
			if (engine_id)
				*engine_id = i;

			return error->engine[i].ipehr ^
			       error->engine[i].instdone.instdone;
		}
	}

	return error_code;
}
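
/*
 * Editor's note: with hypothetical values, the resulting identifier as
 * assembled by i915_error_capture_msg() below appears in dmesg and in
 * the error-state header as
 *
 *   GPU HANG: ecode 9:0:0x8ed9e1f2, in Xorg [1234]
 *
 * i.e. GEN : hung engine id : (ipehr ^ instdone).
 */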
static void gem_record_fences(struct i915_gpu_state *error)
{
	struct drm_i915_private *dev_priv = error->i915;
	int i;

	if (INTEL_GEN(dev_priv) >= 6) {
		for (i = 0; i < dev_priv->num_fence_regs; i++)
			error->fence[i] = I915_READ64(FENCE_REG_GEN6_LO(i));
	} else if (INTEL_GEN(dev_priv) >= 4) {
		for (i = 0; i < dev_priv->num_fence_regs; i++)
			error->fence[i] = I915_READ64(FENCE_REG_965_LO(i));
	} else {
		for (i = 0; i < dev_priv->num_fence_regs; i++)
			error->fence[i] = I915_READ(FENCE_REG(i));
	}
	error->nfence = i;
}

static void gen6_record_semaphore_state(struct intel_engine_cs *engine,
					struct drm_i915_error_engine *ee)
{
	struct drm_i915_private *dev_priv = engine->i915;

	ee->semaphore_mboxes[0] = I915_READ(RING_SYNC_0(engine->mmio_base));
	ee->semaphore_mboxes[1] = I915_READ(RING_SYNC_1(engine->mmio_base));
	if (HAS_VEBOX(dev_priv))
		ee->semaphore_mboxes[2] =
			I915_READ(RING_SYNC_2(engine->mmio_base));
}
static void error_record_engine_waiters(struct intel_engine_cs *engine,
					struct drm_i915_error_engine *ee)
{
	struct intel_breadcrumbs *b = &engine->breadcrumbs;
	struct drm_i915_error_waiter *waiter;
	struct rb_node *rb;
	int count;

	ee->num_waiters = 0;
	ee->waiters = NULL;

	if (RB_EMPTY_ROOT(&b->waiters))
		return;

	if (!spin_trylock_irq(&b->rb_lock)) {
		ee->waiters = ERR_PTR(-EDEADLK);
		return;
	}

	count = 0;
	for (rb = rb_first(&b->waiters); rb != NULL; rb = rb_next(rb))
		count++;
	spin_unlock_irq(&b->rb_lock);

	waiter = NULL;
	if (count)
		waiter = kmalloc_array(count,
				       sizeof(struct drm_i915_error_waiter),
				       GFP_ATOMIC);
	if (!waiter)
		return;

	if (!spin_trylock_irq(&b->rb_lock)) {
		kfree(waiter);
		ee->waiters = ERR_PTR(-EDEADLK);
		return;
	}

	ee->waiters = waiter;
	for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
		struct intel_wait *w = rb_entry(rb, typeof(*w), node);

		strcpy(waiter->comm, w->tsk->comm);
		waiter->pid = w->tsk->pid;
		waiter->seqno = w->seqno;
		waiter++;

		if (++ee->num_waiters == count)
			break;
	}
	spin_unlock_irq(&b->rb_lock);
}
static void error_record_engine_registers(struct i915_gpu_state *error,
					  struct intel_engine_cs *engine,
					  struct drm_i915_error_engine *ee)
{
	struct drm_i915_private *dev_priv = engine->i915;

	if (INTEL_GEN(dev_priv) >= 6) {
		ee->rc_psmi = I915_READ(RING_PSMI_CTL(engine->mmio_base));
		if (INTEL_GEN(dev_priv) >= 8) {
			ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG);
		} else {
			gen6_record_semaphore_state(engine, ee);
			ee->fault_reg = I915_READ(RING_FAULT_REG(engine));
		}
	}

	if (INTEL_GEN(dev_priv) >= 4) {
		ee->faddr = I915_READ(RING_DMA_FADD(engine->mmio_base));
		ee->ipeir = I915_READ(RING_IPEIR(engine->mmio_base));
		ee->ipehr = I915_READ(RING_IPEHR(engine->mmio_base));
		ee->instps = I915_READ(RING_INSTPS(engine->mmio_base));
		ee->bbaddr = I915_READ(RING_BBADDR(engine->mmio_base));
		if (INTEL_GEN(dev_priv) >= 8) {
			ee->faddr |= (u64) I915_READ(RING_DMA_FADD_UDW(engine->mmio_base)) << 32;
			ee->bbaddr |= (u64) I915_READ(RING_BBADDR_UDW(engine->mmio_base)) << 32;
		}
		ee->bbstate = I915_READ(RING_BBSTATE(engine->mmio_base));
	} else {
		ee->faddr = I915_READ(DMA_FADD_I8XX);
		ee->ipeir = I915_READ(IPEIR);
		ee->ipehr = I915_READ(IPEHR);
	}

	intel_engine_get_instdone(engine, &ee->instdone);

	ee->waiting = intel_engine_has_waiter(engine);
	ee->instpm = I915_READ(RING_INSTPM(engine->mmio_base));
	ee->acthd = intel_engine_get_active_head(engine);
	ee->seqno = intel_engine_get_seqno(engine);
	ee->last_seqno = intel_engine_last_submit(engine);
	ee->start = I915_READ_START(engine);
	ee->head = I915_READ_HEAD(engine);
	ee->tail = I915_READ_TAIL(engine);
	ee->ctl = I915_READ_CTL(engine);
	if (INTEL_GEN(dev_priv) > 2)
		ee->mode = I915_READ_MODE(engine);

	if (!HWS_NEEDS_PHYSICAL(dev_priv)) {
		i915_reg_t mmio;

		if (IS_GEN7(dev_priv)) {
			switch (engine->id) {
			default:
			case RCS:
				mmio = RENDER_HWS_PGA_GEN7;
				break;
			case BCS:
				mmio = BLT_HWS_PGA_GEN7;
				break;
			case VCS:
				mmio = BSD_HWS_PGA_GEN7;
				break;
			case VECS:
				mmio = VEBOX_HWS_PGA_GEN7;
				break;
			}
		} else if (IS_GEN6(engine->i915)) {
			mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
		} else {
			/* XXX: gen8 returns to sanity */
			mmio = RING_HWS_PGA(engine->mmio_base);
		}

		ee->hws = I915_READ(mmio);
	}

	ee->idle = intel_engine_is_idle(engine);
	ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
	ee->hangcheck_action = engine->hangcheck.action;
	ee->hangcheck_stalled = engine->hangcheck.stalled;
	ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
						  engine);

	if (USES_PPGTT(dev_priv)) {
		int i;

		ee->vm_info.gfx_mode = I915_READ(RING_MODE_GEN7(engine));

		if (IS_GEN6(dev_priv))
			ee->vm_info.pp_dir_base =
				I915_READ(RING_PP_DIR_BASE_READ(engine));
		else if (IS_GEN7(dev_priv))
			ee->vm_info.pp_dir_base =
				I915_READ(RING_PP_DIR_BASE(engine));
		else if (INTEL_GEN(dev_priv) >= 8)
			for (i = 0; i < 4; i++) {
				ee->vm_info.pdp[i] =
					I915_READ(GEN8_RING_PDP_UDW(engine, i));
				ee->vm_info.pdp[i] <<= 32;
				ee->vm_info.pdp[i] |=
					I915_READ(GEN8_RING_PDP_LDW(engine, i));
			}
	}
}
static void record_request(struct i915_request *request,
			   struct drm_i915_error_request *erq)
{
	struct i915_gem_context *ctx = request->gem_context;

	erq->context = ctx->hw_id;
	erq->sched_attr = request->sched.attr;
	erq->ban_score = atomic_read(&ctx->ban_score);
	erq->seqno = request->global_seqno;
	erq->jiffies = request->emitted_jiffies;
	erq->start = i915_ggtt_offset(request->ring->vma);
	erq->head = request->head;
	erq->tail = request->tail;

	rcu_read_lock();
	erq->pid = ctx->pid ? pid_nr(ctx->pid) : 0;
	rcu_read_unlock();
}
static void engine_record_requests(struct intel_engine_cs *engine,
				   struct i915_request *first,
				   struct drm_i915_error_engine *ee)
{
	struct i915_request *request;
	int count;

	count = 0;
	request = first;
	list_for_each_entry_from(request, &engine->timeline.requests, link)
		count++;
	if (!count)
		return;

	ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
	if (!ee->requests)
		return;

	ee->num_requests = count;

	count = 0;
	request = first;
	list_for_each_entry_from(request, &engine->timeline.requests, link) {
		if (count >= ee->num_requests) {
			/*
			 * If the ring request list was changed in
			 * between the point where the error request
			 * list was created and dimensioned and this
			 * point then just exit early to avoid crashes.
			 *
			 * We don't need to communicate that the
			 * request list changed state during error
			 * state capture and that the error state is
			 * slightly incorrect as a consequence since we
			 * are typically only interested in the request
			 * list state at the point of error state
			 * capture, not in any changes happening during
			 * the capture.
			 */
			break;
		}

		record_request(request, &ee->requests[count++]);
	}
	ee->num_requests = count;
}
static void error_record_engine_execlists(struct intel_engine_cs *engine,
					  struct drm_i915_error_engine *ee)
{
	const struct intel_engine_execlists * const execlists = &engine->execlists;
	unsigned int n;

	for (n = 0; n < execlists_num_ports(execlists); n++) {
		struct i915_request *rq = port_request(&execlists->port[n]);

		if (!rq)
			break;

		record_request(rq, &ee->execlist[n]);
	}

	ee->num_ports = n;
}
static void record_context(struct drm_i915_error_context *e,
			   struct i915_gem_context *ctx)
{
	if (ctx->pid) {
		struct task_struct *task;

		rcu_read_lock();
		task = pid_task(ctx->pid, PIDTYPE_PID);
		if (task) {
			strcpy(e->comm, task->comm);
			e->pid = task->pid;
		}
		rcu_read_unlock();
	}

	e->handle = ctx->user_handle;
	e->hw_id = ctx->hw_id;
	e->sched_attr = ctx->sched;
	e->ban_score = atomic_read(&ctx->ban_score);
	e->bannable = i915_gem_context_is_bannable(ctx);
	e->guilty = atomic_read(&ctx->guilty_count);
	e->active = atomic_read(&ctx->active_count);
}
static void request_record_user_bo(struct i915_request *request,
				   struct drm_i915_error_engine *ee)
{
	struct i915_capture_list *c;
	struct drm_i915_error_object **bo;
	long count;

	count = 0;
	for (c = request->capture_list; c; c = c->next)
		count++;

	bo = NULL;
	if (count)
		bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
	if (!bo)
		return;

	count = 0;
	for (c = request->capture_list; c; c = c->next) {
		bo[count] = i915_error_object_create(request->i915, c->vma);
		if (!bo[count])
			break;
		count++;
	}

	ee->user_bo = bo;
	ee->user_bo_count = count;
}
static struct drm_i915_error_object *
capture_object(struct drm_i915_private *dev_priv,
	       struct drm_i915_gem_object *obj)
{
	if (obj && i915_gem_object_has_pages(obj)) {
		struct i915_vma fake = {
			.node = { .start = U64_MAX, .size = obj->base.size },
			.size = obj->base.size,
			.pages = obj->mm.pages,
			.obj = obj,
		};

		return i915_error_object_create(dev_priv, &fake);
	} else {
		return NULL;
	}
}
static void gem_record_rings(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;
	struct i915_ggtt *ggtt = &i915->ggtt;
	int i;

	for (i = 0; i < I915_NUM_ENGINES; i++) {
		struct intel_engine_cs *engine = i915->engine[i];
		struct drm_i915_error_engine *ee = &error->engine[i];
		struct i915_request *request;

		ee->engine_id = -1;

		if (!engine)
			continue;

		ee->engine_id = i;

		error_record_engine_registers(error, engine, ee);
		error_record_engine_waiters(engine, ee);
		error_record_engine_execlists(engine, ee);

		request = i915_gem_find_active_request(engine);
		if (request) {
			struct i915_gem_context *ctx = request->gem_context;
			struct intel_ring *ring;

			ee->vm = ctx->ppgtt ? &ctx->ppgtt->vm : &ggtt->vm;

			record_context(&ee->context, ctx);

			/* We need to copy these to an anonymous buffer
			 * as the simplest method to avoid being overwritten
			 * by userspace.
			 */
			ee->batchbuffer =
				i915_error_object_create(i915, request->batch);

			if (HAS_BROKEN_CS_TLB(i915))
				ee->wa_batchbuffer =
					i915_error_object_create(i915,
								 engine->scratch);
			request_record_user_bo(request, ee);

			ee->ctx =
				i915_error_object_create(i915,
							 request->hw_context->state);

			error->simulated |=
				i915_gem_context_no_error_capture(ctx);

			ee->rq_head = request->head;
			ee->rq_post = request->postfix;
			ee->rq_tail = request->tail;

			ring = request->ring;
			ee->cpu_ring_head = ring->head;
			ee->cpu_ring_tail = ring->tail;
			ee->ringbuffer =
				i915_error_object_create(i915, ring->vma);

			engine_record_requests(engine, request, ee);
		}

		ee->hws_page =
			i915_error_object_create(i915,
						 engine->status_page.vma);

		ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma);

		ee->default_state = capture_object(i915, engine->default_state);
	}
}
static void gem_capture_vm(struct i915_gpu_state *error,
			   struct i915_address_space *vm,
			   int idx)
{
	struct drm_i915_error_buffer *active_bo;
	struct i915_vma *vma;
	int count;

	count = 0;
	list_for_each_entry(vma, &vm->active_list, vm_link)
		count++;

	active_bo = NULL;
	if (count)
		active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
	if (active_bo)
		count = capture_error_bo(active_bo, count, &vm->active_list, false);
	else
		count = 0;

	error->active_vm[idx] = vm;
	error->active_bo[idx] = active_bo;
	error->active_bo_count[idx] = count;
}

static void capture_active_buffers(struct i915_gpu_state *error)
{
	int cnt = 0, i, j;

	BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));

	/* Scan each engine looking for unique active contexts/vm */
	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		struct drm_i915_error_engine *ee = &error->engine[i];
		bool found;

		if (!ee->vm)
			continue;

		found = false;
		for (j = 0; j < i && !found; j++)
			found = error->engine[j].vm == ee->vm;
		if (!found)
			gem_capture_vm(error, ee->vm, cnt++);
	}
}
static void capture_pinned_buffers(struct i915_gpu_state *error)
{
	struct i915_address_space *vm = &error->i915->ggtt.vm;
	struct drm_i915_error_buffer *bo;
	struct i915_vma *vma;
	int count_inactive, count_active;

	count_inactive = 0;
	list_for_each_entry(vma, &vm->inactive_list, vm_link)
		count_inactive++;

	count_active = 0;
	list_for_each_entry(vma, &vm->active_list, vm_link)
		count_active++;

	bo = NULL;
	if (count_inactive + count_active)
		bo = kcalloc(count_inactive + count_active,
			     sizeof(*bo), GFP_ATOMIC);
	if (!bo)
		return;

	count_inactive = capture_error_bo(bo, count_inactive,
					  &vm->active_list, true);
	count_active = capture_error_bo(bo + count_inactive, count_active,
					&vm->inactive_list, true);
	error->pinned_bo_count = count_inactive + count_active;
	error->pinned_bo = bo;
}
static void capture_uc_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;
	struct i915_error_uc *error_uc = &error->uc;

	/* Capturing uC state won't be useful if there is no GuC */
	if (!error->device_info.has_guc)
		return;

	error_uc->guc_fw = i915->guc.fw;
	error_uc->huc_fw = i915->huc.fw;

	/* Non-default firmware paths will be specified by the modparam.
	 * As modparams are generally accessible from userspace, make
	 * explicit copies of the firmware paths.
	 */
	error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC);
	error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC);
	error_uc->guc_log = i915_error_object_create(i915, i915->guc.log.vma);
}
/* Capture all registers which don't fit into another category. */
static void capture_reg_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *dev_priv = error->i915;
	int i;

	/* General organization
	 * 1. Registers specific to a single generation
	 * 2. Registers which belong to multiple generations
	 * 3. Feature specific registers.
	 * 4. Everything else
	 * Please try to follow the order.
	 */

	/* 1: Registers specific to a single generation */
	if (IS_VALLEYVIEW(dev_priv)) {
		error->gtier[0] = I915_READ(GTIER);
		error->ier = I915_READ(VLV_IER);
		error->forcewake = I915_READ_FW(FORCEWAKE_VLV);
	}

	if (IS_GEN7(dev_priv))
		error->err_int = I915_READ(GEN7_ERR_INT);

	if (INTEL_GEN(dev_priv) >= 8) {
		error->fault_data0 = I915_READ(GEN8_FAULT_TLB_DATA0);
		error->fault_data1 = I915_READ(GEN8_FAULT_TLB_DATA1);
	}

	if (IS_GEN6(dev_priv)) {
		error->forcewake = I915_READ_FW(FORCEWAKE);
		error->gab_ctl = I915_READ(GAB_CTL);
		error->gfx_mode = I915_READ(GFX_MODE);
	}

	/* 2: Registers which belong to multiple generations */
	if (INTEL_GEN(dev_priv) >= 7)
		error->forcewake = I915_READ_FW(FORCEWAKE_MT);

	if (INTEL_GEN(dev_priv) >= 6) {
		error->derrmr = I915_READ(DERRMR);
		error->error = I915_READ(ERROR_GEN6);
		error->done_reg = I915_READ(DONE_REG);
	}

	if (INTEL_GEN(dev_priv) >= 5)
		error->ccid = I915_READ(CCID);

	/* 3: Feature specific registers */
	if (IS_GEN6(dev_priv) || IS_GEN7(dev_priv)) {
		error->gam_ecochk = I915_READ(GAM_ECOCHK);
		error->gac_eco = I915_READ(GAC_ECO_BITS);
	}

	/* 4: Everything else */
	if (INTEL_GEN(dev_priv) >= 11) {
		error->ier = I915_READ(GEN8_DE_MISC_IER);
		error->gtier[0] = I915_READ(GEN11_RENDER_COPY_INTR_ENABLE);
		error->gtier[1] = I915_READ(GEN11_VCS_VECS_INTR_ENABLE);
		error->gtier[2] = I915_READ(GEN11_GUC_SG_INTR_ENABLE);
		error->gtier[3] = I915_READ(GEN11_GPM_WGBOXPERF_INTR_ENABLE);
		error->gtier[4] = I915_READ(GEN11_CRYPTO_RSVD_INTR_ENABLE);
		error->gtier[5] = I915_READ(GEN11_GUNIT_CSME_INTR_ENABLE);
		error->ngtier = 6;
	} else if (INTEL_GEN(dev_priv) >= 8) {
		error->ier = I915_READ(GEN8_DE_MISC_IER);
		for (i = 0; i < 4; i++)
			error->gtier[i] = I915_READ(GEN8_GT_IER(i));
		error->ngtier = 4;
	} else if (HAS_PCH_SPLIT(dev_priv)) {
		error->ier = I915_READ(DEIER);
		error->gtier[0] = I915_READ(GTIER);
		error->ngtier = 1;
	} else if (IS_GEN2(dev_priv)) {
		error->ier = I915_READ16(IER);
	} else if (!IS_VALLEYVIEW(dev_priv)) {
		error->ier = I915_READ(IER);
	}
	error->eir = I915_READ(EIR);
	error->pgtbl_er = I915_READ(PGTBL_ER);
}
static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
				   struct i915_gpu_state *error,
				   u32 engine_mask,
				   const char *error_msg)
{
	u32 ecode;
	int engine_id = -1, len;

	ecode = i915_error_generate_code(dev_priv, error, &engine_id);

	len = scnprintf(error->error_msg, sizeof(error->error_msg),
			"GPU HANG: ecode %d:%d:0x%08x",
			INTEL_GEN(dev_priv), engine_id, ecode);

	if (engine_id != -1 && error->engine[engine_id].context.pid)
		len += scnprintf(error->error_msg + len,
				 sizeof(error->error_msg) - len,
				 ", in %s [%d]",
				 error->engine[engine_id].context.comm,
				 error->engine[engine_id].context.pid);

	scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
		  ", reason: %s, action: %s",
		  error_msg,
		  engine_mask ? "reset" : "continue");
}
static void capture_gen_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;

	error->awake = i915->gt.awake;
	error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);
	error->suspended = i915->runtime_pm.suspended;

	error->iommu = -1;
#ifdef CONFIG_INTEL_IOMMU
	error->iommu = intel_iommu_gfx_mapped;
#endif
	error->reset_count = i915_reset_count(&i915->gpu_error);
	error->suspend_count = i915->suspend_count;

	memcpy(&error->device_info,
	       INTEL_INFO(i915),
	       sizeof(error->device_info));
	error->driver_caps = i915->caps;
}
static __always_inline void dup_param(const char *type, void *x)
{
	if (!__builtin_strcmp(type, "char *"))
		*(void **)x = kstrdup(*(void **)x, GFP_ATOMIC);
}

static void capture_params(struct i915_gpu_state *error)
{
	error->params = i915_modparams;
#define DUP(T, x, ...) dup_param(#T, &error->params.x);
	I915_PARAMS_FOR_EACH(DUP);
#undef DUP
}
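
/*
 * Editor's note: dup_param() and its counterpart free_param() compare
 * the stringified parameter type (#T) with __builtin_strcmp(), which the
 * compiler folds at build time, so only the "char *" modparams are
 * kstrdup()ed into the capture here and kfree()d again by
 * cleanup_params().
 */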
static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
{
	unsigned long epoch = error->capture;
	int i;

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		const struct drm_i915_error_engine *ee = &error->engine[i];

		if (ee->hangcheck_stalled &&
		    time_before(ee->hangcheck_timestamp, epoch))
			epoch = ee->hangcheck_timestamp;
	}

	return epoch;
}
static int capture(void *data)
{
	struct i915_gpu_state *error = data;

	error->time = ktime_get_real();
	error->boottime = ktime_get_boottime();
	error->uptime = ktime_sub(ktime_get(),
				  error->i915->gt.last_init_time);
	error->capture = jiffies;

	capture_params(error);
	capture_gen_state(error);
	capture_uc_state(error);
	capture_reg_state(error);
	gem_record_fences(error);
	gem_record_rings(error);
	capture_active_buffers(error);
	capture_pinned_buffers(error);

	error->overlay = intel_overlay_capture_error_state(error->i915);
	error->display = intel_display_capture_error_state(error->i915);

	error->epoch = capture_find_epoch(error);

	return 0;
}
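
/*
 * Editor's note: capture() is invoked via stop_machine() (see
 * i915_capture_gpu_state() below), i.e. with all other CPUs spinning
 * and interrupts disabled, which is why the allocations throughout this
 * file use GFP_ATOMIC and why lock acquisition must not block (hence
 * the spin_trylock_irq() in error_record_engine_waiters()).
 */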
#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))

struct i915_gpu_state *
i915_capture_gpu_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	error = kzalloc(sizeof(*error), GFP_ATOMIC);
	if (!error)
		return NULL;

	kref_init(&error->ref);
	error->i915 = i915;

	stop_machine(capture, error, NULL);

	return error;
}
/**
 * i915_capture_error_state - capture an error record for later analysis
 * @i915: i915 device
 * @engine_mask: the mask of engines triggering the hang
 * @error_msg: a message to insert into the error capture header
 *
 * Should be called when an error is detected (either a hang or an error
 * interrupt) to capture error state from the time of the error. Fills
 * out a structure which becomes available in debugfs for user level tools
 * to pick up.
 */
void i915_capture_error_state(struct drm_i915_private *i915,
			      u32 engine_mask,
			      const char *error_msg)
{
	static bool warned;
	struct i915_gpu_state *error;
	unsigned long flags;

	if (!i915_modparams.error_capture)
		return;

	if (READ_ONCE(i915->gpu_error.first_error))
		return;

	error = i915_capture_gpu_state(i915);
	if (!error) {
		DRM_DEBUG_DRIVER("out of memory, not capturing error state\n");
		return;
	}

	i915_error_capture_msg(i915, error, engine_mask, error_msg);
	DRM_INFO("%s\n", error->error_msg);

	if (!error->simulated) {
		spin_lock_irqsave(&i915->gpu_error.lock, flags);
		if (!i915->gpu_error.first_error) {
			i915->gpu_error.first_error = error;
			error = NULL;
		}
		spin_unlock_irqrestore(&i915->gpu_error.lock, flags);
	}

	if (error) {
		__i915_gpu_state_free(&error->ref);
		return;
	}

	if (!warned &&
	    ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) {
		DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
		DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
		DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
		DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
		DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
			 i915->drm.primary->index);
		warned = true;
	}
}
struct i915_gpu_state *
i915_first_error_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
	if (error)
		i915_gpu_state_get(error);
	spin_unlock_irq(&i915->gpu_error.lock);

	return error;
}

void i915_reset_error_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
	i915->gpu_error.first_error = NULL;
	spin_unlock_irq(&i915->gpu_error.lock);

	i915_gpu_state_put(error);
}