/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008-2018 Intel Corporation
 */

#ifndef _I915_GPU_ERROR_H_
#define _I915_GPU_ERROR_H_

#include <linux/kref.h>
#include <linux/ktime.h>
#include <linux/sched.h>

#include <drm/drm_mm.h>

#include "intel_device_info.h"
#include "intel_ringbuffer.h"
#include "intel_uc_fw.h"

#include "i915_gem_gtt.h"
#include "i915_params.h"
#include "i915_scheduler.h"
struct drm_i915_private;
struct intel_overlay_error_state;
struct intel_display_error_state;
struct i915_gpu_state {
	struct kref ref;
	unsigned long capture;

	struct drm_i915_private *i915;

	struct intel_device_info device_info;
	struct intel_driver_caps driver_caps;
	struct i915_params params;

	struct i915_error_uc {
		struct intel_uc_fw guc_fw;
		struct intel_uc_fw huc_fw;
		struct drm_i915_error_object *guc_log;
	} uc;
	/* Generic register state */
	u32 error; /* gen6+ */
	u32 err_int; /* gen7 */
	u32 fault_data0; /* gen8, gen9 */
	u32 fault_data1; /* gen8, gen9 */

	u64 fence[I915_MAX_NUM_FENCES];
	struct intel_overlay_error_state *overlay;
	struct intel_display_error_state *display;
	struct drm_i915_error_engine {
		/* Software tracked state */
		unsigned long hangcheck_timestamp;
		bool hangcheck_stalled;
		enum intel_engine_hangcheck_action hangcheck_action;
		struct i915_address_space *vm;

		/* position of active request inside the ring */
		u32 rq_head, rq_post, rq_tail;

		/* our own tracking of ring head and tail */
		u32 cpu_ring_head;
		u32 cpu_ring_tail;

		u32 rc_psmi; /* sleep state */
		u32 semaphore_mboxes[I915_NUM_ENGINES - 1];
		struct intel_instdone instdone;
		struct drm_i915_error_context {
			char comm[TASK_COMM_LEN];
			struct i915_sched_attr sched_attr;
		} context;

		struct drm_i915_error_object {
			u64 gtt_offset;
			u64 gtt_size;
			int page_count;
			int unused;
			u32 *pages[0];
		} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
		struct drm_i915_error_object **user_bo;

		struct drm_i915_error_object *wa_ctx;
		struct drm_i915_error_object *default_state;

		struct drm_i915_error_request {
			struct i915_sched_attr sched_attr;
		} *requests, execlist[EXECLIST_MAX_PORTS];
		unsigned int num_ports;
		struct drm_i915_error_waiter {
			char comm[TASK_COMM_LEN];
		} *waiters;
	} engine[I915_NUM_ENGINES];
	struct drm_i915_error_buffer {
		s32 fence_reg:I915_MAX_NUM_FENCE_BITS;
	} *active_bo[I915_NUM_ENGINES], *pinned_bo;
	u32 active_bo_count[I915_NUM_ENGINES], pinned_bo_count;
	struct i915_address_space *active_vm[I915_NUM_ENGINES];
};
struct i915_gpu_error {
	/* For hangcheck timer */
#define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
#define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)

	struct delayed_work hangcheck_work;

	/* For reset and error_state handling. */
	spinlock_t lock;
	/* Protected by the above dev->gpu_error.lock. */
	struct i915_gpu_state *first_error;

	atomic_t pending_fb_pin;

	unsigned long missed_irq_rings;
	/**
	 * State variable controlling the reset flow and count
	 *
	 * This is a counter which gets incremented when a reset is triggered.
	 *
	 * Before the reset commences, the I915_RESET_BACKOFF bit is set
	 * meaning that any waiters holding onto the struct_mutex should
	 * relinquish the lock immediately in order for the reset to start.
	 *
	 * If the reset is not completed successfully, the I915_WEDGED bit is
	 * set meaning that the hardware is terminally sour and there is no
	 * recovery. All waiters on the reset_queue will be woken when
	 * that happens.
	 *
	 * This counter is used by the wait_seqno code to notice that a reset
	 * event happened and that it needs to restart the entire ioctl (since
	 * most likely the seqno it waited for won't ever signal anytime soon).
	 *
	 * This is important for lock-free wait paths, where no contended lock
	 * naturally enforces the correct ordering between the bail-out of the
	 * waiter and the gpu reset work code.
	 */
	unsigned long reset_count;
	/**
	 * flags: Control various stages of the GPU reset
	 *
	 * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
	 * other users from acquiring the struct_mutex. To do this we set the
	 * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
	 * and then check for that bit before acquiring the struct_mutex (in
	 * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
	 * secondary role in preventing two concurrent global reset attempts.
	 *
	 * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
	 * struct_mutex. We try to acquire the struct_mutex in the reset worker,
	 * but it may be held by some long running waiter (that we cannot
	 * interrupt without causing trouble). Once we are ready to do the GPU
	 * reset, we set the I915_RESET_HANDOFF bit and wake up any waiters. If
	 * they already hold the struct_mutex and want to participate they can
	 * inspect the bit and do the reset directly, otherwise the worker
	 * waits for the struct_mutex.
	 *
	 * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
	 * acquire the struct_mutex to reset an engine, we need an explicit
	 * flag to prevent two concurrent reset attempts on the same engine.
	 * As the number of engines continues to grow, allocate the flags from
	 * the most significant bits.
	 *
	 * #I915_WEDGED - If the reset fails and we can no longer use the GPU,
	 * we set the #I915_WEDGED bit. Prior to command submission, e.g.
	 * i915_request_alloc(), this bit is checked and the sequence
	 * aborted (with -EIO reported to userspace) if set.
	 */
	unsigned long flags;
#define I915_RESET_BACKOFF	0
#define I915_RESET_HANDOFF	1
#define I915_RESET_MODESET	2
#define I915_WEDGED		(BITS_PER_LONG - 1)
#define I915_RESET_ENGINE	(I915_WEDGED - I915_NUM_ENGINES)
	/** Number of times an engine has been reset */
	u32 reset_engine_count[I915_NUM_ENGINES];

	/** Set of stalled engines with guilty requests, in the current reset */
	u32 stalled_mask;

	/** Reason for the current *global* reset */
	const char *reason;

	/**
	 * Waitqueue to signal when a hang is detected. Used for waiters
	 * to release the struct_mutex for the reset to proceed.
	 */
	wait_queue_head_t wait_queue;

	/**
	 * Waitqueue to signal when the reset has completed. Used by clients
	 * that wait for dev_priv->mm.wedged to settle.
	 */
	wait_queue_head_t reset_queue;

	/* For missed irq/seqno simulation. */
	unsigned long test_irq_rings;
};
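/*
 * Illustrative sketch only (not part of the original header): one way the
 * reset flags and reset_count documented above are typically consumed.
 * Similar helpers live in i915_drv.h; the example_* names below are
 * hypothetical and chosen to avoid clashing with the real ones.
 */
static inline bool example_reset_backoff(const struct i915_gpu_error *error)
{
	/* A global reset has been requested; struct_mutex waiters should back off. */
	return test_bit(I915_RESET_BACKOFF, &error->flags);
}

static inline bool example_terminally_wedged(const struct i915_gpu_error *error)
{
	/* Reset failed; the GPU is unusable and submission must abort with -EIO. */
	return test_bit(I915_WEDGED, &error->flags);
}

static inline bool example_engine_reset_in_progress(const struct i915_gpu_error *error,
						    unsigned int engine_id)
{
	/* Per-engine reset bits are allocated downwards from I915_WEDGED. */
	return test_bit(I915_RESET_ENGINE + engine_id, &error->flags);
}

static inline bool example_reset_seen(const struct i915_gpu_error *error,
				      unsigned long before)
{
	/*
	 * Lock-free waiters snapshot reset_count before sleeping and compare
	 * it afterwards to notice that a reset fired while they slept.
	 */
	return READ_ONCE(error->reset_count) != before;
}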
struct drm_i915_error_state_buf {
	struct drm_i915_private *i915;
	u8 *buf;
};
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)

void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
int i915_error_state_to_str(struct drm_i915_error_state_buf *estr,
			    const struct i915_gpu_state *gpu);
int i915_error_state_buf_init(struct drm_i915_error_state_buf *eb,
			      struct drm_i915_private *i915,
			      size_t count, loff_t pos);
static inline void
i915_error_state_buf_release(struct drm_i915_error_state_buf *eb)
{
	kfree(eb->buf);
}
struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
void i915_capture_error_state(struct drm_i915_private *dev_priv,
			      const char *error_msg);
static inline struct i915_gpu_state *
i915_gpu_state_get(struct i915_gpu_state *gpu)
{
	kref_get(&gpu->ref);
	return gpu;
}

void __i915_gpu_state_free(struct kref *kref);
static inline void i915_gpu_state_put(struct i915_gpu_state *gpu)
{
	if (gpu)
		kref_put(&gpu->ref, __i915_gpu_state_free);
}
struct i915_gpu_state *i915_first_error_state(struct drm_i915_private *i915);
void i915_reset_error_state(struct drm_i915_private *i915);
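/*
 * Illustrative sketch only (not part of the original header): a hypothetical
 * consumer, e.g. a debugfs-style reader, driving the capture API declared
 * above. The function name, the 16 KiB buffer size and the zero offset are
 * assumptions made purely for this example.
 */
static inline int example_dump_gpu_state(struct drm_i915_private *i915)
{
	struct drm_i915_error_state_buf buf;
	struct i915_gpu_state *gpu;
	int ret;

	/* Snapshot the current GPU state; the caller owns the reference. */
	gpu = i915_capture_gpu_state(i915);
	if (!gpu)
		return -ENOMEM;

	ret = i915_error_state_buf_init(&buf, i915, 16384, 0);
	if (ret == 0) {
		/* Render the captured snapshot into the buffer as text. */
		ret = i915_error_state_to_str(&buf, gpu);
		i915_error_state_buf_release(&buf);
	}

	i915_gpu_state_put(gpu);
	return ret;
}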
#else

static inline void i915_capture_error_state(struct drm_i915_private *dev_priv,
					    const char *error_msg)
{
}

static inline struct i915_gpu_state *
i915_first_error_state(struct drm_i915_private *i915)
{
	return NULL;
}

static inline void i915_reset_error_state(struct drm_i915_private *i915)
{
}
#endif /* IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) */

#endif /* _I915_GPU_ERROR_H_ */