// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2016-2019 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#include "habanalabs.h"

#include <linux/slab.h>
/*
 * hl_hw_queue_add_ptr - add to pi or ci and checks if it wraps around
 *
 * @ptr: the current pi/ci value
 * @val: the amount to add
 *
 * Add val to ptr. It can go until twice the queue length.
 */
inline u32 hl_hw_queue_add_ptr(u32 ptr, u16 val)
{
	ptr += val;
	ptr &= ((HL_QUEUE_LENGTH << 1) - 1);
	return ptr;
}
static inline int queue_ci_get(atomic_t *ci, u32 queue_len)
{
	return atomic_read(ci) & ((queue_len << 1) - 1);
}
static inline int queue_free_slots(struct hl_hw_queue *q, u32 queue_len)
{
	int delta = (q->pi - queue_ci_get(&q->ci, queue_len));

	if (delta >= 0)
		return (queue_len - delta);
	else
		return (abs(delta) - queue_len);
}
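/*
 * For illustration only (hypothetical numbers, not part of the driver):
 * pi and ci run over twice the queue length so a full queue can be told
 * apart from an empty one. Assuming HL_QUEUE_LENGTH were 256:
 *   pi = 300, ci = 60  -> delta = 240  -> 256 - 240 = 16 free slots
 *   pi = 10,  ci = 300 -> delta = -290 -> 290 - 256 = 34 free slots
 */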
void hl_int_hw_queue_update_ci(struct hl_cs *cs)
{
	struct hl_device *hdev = cs->ctx->hdev;
	struct hl_hw_queue *q;
	int i;

	if (hdev->disabled)
		return;

	q = &hdev->kernel_queues[0];

	/* There are no internal queues if H/W queues are being used */
	if (!hdev->asic_prop.max_queues || q->queue_type == QUEUE_TYPE_HW)
		return;

	for (i = 0 ; i < hdev->asic_prop.max_queues ; i++, q++) {
		if (q->queue_type == QUEUE_TYPE_INT)
			atomic_add(cs->jobs_in_queue_cnt[i], &q->ci);
	}
}
/*
 * ext_and_hw_queue_submit_bd() - Submit a buffer descriptor to an external or a
 *                                H/W queue.
 *
 * @hdev: pointer to habanalabs device structure
 * @q: pointer to habanalabs queue structure
 * @ctl: BD's control word
 * @len: BD's length
 * @ptr: BD's pointer
 *
 * This function assumes there is enough space on the queue to submit a new
 * BD to it. It initializes the next BD and calls the device specific
 * function to set the pi (and doorbell)
 *
 * This function must be called when the scheduler mutex is taken
 *
 */
static void ext_and_hw_queue_submit_bd(struct hl_device *hdev,
			struct hl_hw_queue *q, u32 ctl, u32 len, u64 ptr)
{
	struct hl_bd *bd;

	bd = q->kernel_address;
	bd += hl_pi_2_offset(q->pi);
	bd->ctl = cpu_to_le32(ctl);
	bd->len = cpu_to_le32(len);
	bd->ptr = cpu_to_le64(ptr);

	q->pi = hl_queue_inc_ptr(q->pi);
	hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
}
/*
 * ext_queue_sanity_checks - perform some sanity checks on external queue
 *
 * @hdev              : pointer to hl_device structure
 * @q                 : pointer to hl_hw_queue structure
 * @num_of_entries    : how many entries to check for space
 * @reserve_cq_entry  : whether to reserve an entry in the cq
 *
 * H/W queues spinlock should be taken before calling this function
 *
 * Perform the following:
 * - Make sure we have enough space in the h/w queue
 * - Make sure we have enough space in the completion queue
 * - Reserve space in the completion queue (needs to be reversed if there
 *   is a failure down the road before the actual submission of work). Only
 *   do this action if reserve_cq_entry is true
 *
 */
static int ext_queue_sanity_checks(struct hl_device *hdev,
				struct hl_hw_queue *q, int num_of_entries,
				bool reserve_cq_entry)
{
	atomic_t *free_slots =
			&hdev->completion_queue[q->cq_id].free_slots_cnt;
	int free_slots_cnt;

	/* Check we have enough space in the queue */
	free_slots_cnt = queue_free_slots(q, HL_QUEUE_LENGTH);

	if (free_slots_cnt < num_of_entries) {
		dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n",
			q->hw_queue_id, num_of_entries);
		return -EAGAIN;
	}

	if (reserve_cq_entry) {
		/*
		 * Check we have enough space in the completion queue.
		 * Decrement the counter by the number of entries; if the
		 * result goes negative, the CQ is full so we can't submit a
		 * new CB because we won't get an ack on its completion.
		 * atomic_add_negative() returns true in that case, and the
		 * reservation is rolled back below.
		 */
		if (atomic_add_negative(num_of_entries * -1, free_slots)) {
			dev_dbg(hdev->dev, "No space for %d on CQ %d\n",
				num_of_entries, q->hw_queue_id);
			atomic_add(num_of_entries, free_slots);
			return -EAGAIN;
		}
	}

	return 0;
}
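/*
 * For illustration only (hypothetical numbers, not part of the driver):
 * the CQ reservation above is what the unroll_cq_resv path in
 * hl_hw_queue_schedule_cs() undoes on failure. If free_slots were 3 and a
 * CS needed 5 entries, atomic_add_negative(-5, free_slots) would drive the
 * counter to -2 and return true, and the atomic_add(5, free_slots) rollback
 * would restore it to 3 before -EAGAIN is returned.
 */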
/*
 * int_queue_sanity_checks - perform some sanity checks on internal queue
 *
 * @hdev              : pointer to hl_device structure
 * @q                 : pointer to hl_hw_queue structure
 * @num_of_entries    : how many entries to check for space
 *
 * H/W queues spinlock should be taken before calling this function
 *
 * Perform the following:
 * - Make sure we have enough space in the h/w queue
 *
 */
static int int_queue_sanity_checks(struct hl_device *hdev,
					struct hl_hw_queue *q,
					int num_of_entries)
{
	int free_slots_cnt;

	if (num_of_entries > q->int_queue_len) {
		dev_err(hdev->dev,
			"Cannot populate queue %u with %u jobs\n",
			q->hw_queue_id, num_of_entries);
		return -ENOMEM;
	}

	/* Check we have enough space in the queue */
	free_slots_cnt = queue_free_slots(q, q->int_queue_len);

	if (free_slots_cnt < num_of_entries) {
		dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n",
			q->hw_queue_id, num_of_entries);
		return -EAGAIN;
	}

	return 0;
}
/*
 * hw_queue_sanity_checks() - Make sure we have enough space in the h/w queue
 * @hdev: Pointer to hl_device structure.
 * @q: Pointer to hl_hw_queue structure.
 * @num_of_entries: How many entries to check for space.
 *
 * Notice: We do not reserve queue entries so this function mustn't be called
 *         more than once per CS for the same queue
 *
 */
static int hw_queue_sanity_checks(struct hl_device *hdev, struct hl_hw_queue *q,
					int num_of_entries)
{
	int free_slots_cnt;

	/* Check we have enough space in the queue */
	free_slots_cnt = queue_free_slots(q, HL_QUEUE_LENGTH);

	if (free_slots_cnt < num_of_entries) {
		dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n",
			q->hw_queue_id, num_of_entries);
		return -EAGAIN;
	}

	return 0;
}
/*
 * hl_hw_queue_send_cb_no_cmpl - send a single CB (not a JOB) without completion
 *
 * @hdev: pointer to hl_device structure
 * @hw_queue_id: Queue's ID
 * @cb_size: size of CB
 * @cb_ptr: pointer to CB location
 *
 * This function sends a single CB, that must NOT generate a completion entry
 *
 */
int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
				u32 cb_size, u64 cb_ptr)
{
	struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id];
	int rc = 0;

	/*
	 * The CPU queue is a synchronous queue with an effective depth of
	 * a single entry (although it is allocated with room for multiple
	 * entries). Therefore, there is a different lock, called
	 * send_cpu_message_lock, that serializes accesses to the CPU queue.
	 * As a result, we don't need to lock the access to the entire H/W
	 * queues module when submitting a JOB to the CPU queue
	 */
	if (q->queue_type != QUEUE_TYPE_CPU)
		hdev->asic_funcs->hw_queues_lock(hdev);

	if (hdev->disabled) {
		rc = -EPERM;
		goto out;
	}

	/*
	 * hl_hw_queue_send_cb_no_cmpl() is called for queues of a H/W queue
	 * type only on init phase, when the queues are empty and being tested,
	 * so there is no need for sanity checks.
	 */
	if (q->queue_type != QUEUE_TYPE_HW) {
		rc = ext_queue_sanity_checks(hdev, q, 1, false);
		if (rc)
			goto out;
	}

	ext_and_hw_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr);

out:
	if (q->queue_type != QUEUE_TYPE_CPU)
		hdev->asic_funcs->hw_queues_unlock(hdev);

	return rc;
}
/*
 * ext_queue_schedule_job - submit a JOB to an external queue
 *
 * @job: pointer to the job that needs to be submitted to the queue
 *
 * This function must be called when the scheduler mutex is taken
 *
 */
static void ext_queue_schedule_job(struct hl_cs_job *job)
{
	struct hl_device *hdev = job->cs->ctx->hdev;
	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
	struct hl_cq_entry cq_pkt;
	struct hl_cq *cq;
	u64 cq_addr;
	struct hl_cb *cb;
	u32 ctl;
	u32 len;
	u64 ptr;

	/*
	 * Update the JOB ID inside the BD CTL so the device would know what
	 * to write in the completion queue
	 */
	ctl = ((q->pi << BD_CTL_SHADOW_INDEX_SHIFT) & BD_CTL_SHADOW_INDEX_MASK);

	cb = job->patched_cb;
	len = job->job_cb_size;
	ptr = cb->bus_address;

	cq_pkt.data = cpu_to_le32(
			((q->pi << CQ_ENTRY_SHADOW_INDEX_SHIFT)
				& CQ_ENTRY_SHADOW_INDEX_MASK) |
			FIELD_PREP(CQ_ENTRY_SHADOW_INDEX_VALID_MASK, 1) |
			FIELD_PREP(CQ_ENTRY_READY_MASK, 1));

	/*
	 * No need to protect pi_offset because scheduling to the
	 * H/W queues is done under the scheduler mutex
	 *
	 * No need to check if CQ is full because it was already
	 * checked in ext_queue_sanity_checks
	 */
	cq = &hdev->completion_queue[q->cq_id];
	cq_addr = cq->bus_address + cq->pi * sizeof(struct hl_cq_entry);

	hdev->asic_funcs->add_end_of_cb_packets(hdev, cb->kernel_address, len,
						cq_addr,
						le32_to_cpu(cq_pkt.data),
						q->msi_vec,
						job->contains_dma_pkt);

	q->shadow_queue[hl_pi_2_offset(q->pi)] = job;

	cq->pi = hl_cq_inc_ptr(cq->pi);

	ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
}
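/*
 * Editorial note (assumption, not taken from this file): the shadow index
 * packed into the BD control word and the CQ entry data is presumably what
 * lets the completion interrupt handler map a CQ entry back to its job via
 * q->shadow_queue[hl_pi_2_offset(shadow_index)] without extra bookkeeping.
 */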
/*
 * int_queue_schedule_job - submit a JOB to an internal queue
 *
 * @job: pointer to the job that needs to be submitted to the queue
 *
 * This function must be called when the scheduler mutex is taken
 *
 */
static void int_queue_schedule_job(struct hl_cs_job *job)
{
	struct hl_device *hdev = job->cs->ctx->hdev;
	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
	struct hl_bd bd;
	__le64 *pi;

	bd.ctl = 0;
	bd.len = cpu_to_le32(job->job_cb_size);

	if (job->is_kernel_allocated_cb)
		/* bus_address is actually a mmu mapped address
		 * allocated from an internal pool
		 */
		bd.ptr = cpu_to_le64(job->user_cb->bus_address);
	else
		bd.ptr = cpu_to_le64((u64) (uintptr_t) job->user_cb);

	pi = q->kernel_address + (q->pi & (q->int_queue_len - 1)) * sizeof(bd);

	q->pi++;
	q->pi &= ((q->int_queue_len << 1) - 1);

	hdev->asic_funcs->pqe_write(hdev, pi, &bd);

	hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
}
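/*
 * For illustration only (hypothetical numbers, not part of the driver):
 * internal queues keep q->pi in [0, 2 * int_queue_len) while the slot index
 * uses only the low bits. With int_queue_len = 128 and q->pi = 130, the BD
 * above is written to slot 130 & 127 = 2 and the doorbell is rung with the
 * post-increment value 131.
 */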
/*
 * hw_queue_schedule_job - submit a JOB to a H/W queue
 *
 * @job: pointer to the job that needs to be submitted to the queue
 *
 * This function must be called when the scheduler mutex is taken
 *
 */
static void hw_queue_schedule_job(struct hl_cs_job *job)
{
	struct hl_device *hdev = job->cs->ctx->hdev;
	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
	u64 ptr;
	u32 offset, ctl, len;

	/*
	 * Upon PQE completion, COMP_DATA is used as the write data to the
	 * completion queue (QMAN HBW message), and COMP_OFFSET is used as the
	 * write address offset in the SM block (QMAN LBW message).
	 * The write address offset is calculated as "COMP_OFFSET << 2".
	 */
	offset = job->cs->sequence & (hdev->asic_prop.max_pending_cs - 1);
	ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) |
		((q->pi << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK);

	len = job->job_cb_size;

	/*
	 * A patched CB is created only if a user CB was allocated by driver and
	 * MMU is disabled. If MMU is enabled, the user CB should be used
	 * instead. If the user CB wasn't allocated by driver, assume that it
	 * wasn't mapped by the MMU.
	 */
	if (job->patched_cb)
		ptr = job->patched_cb->bus_address;
	else if (job->is_kernel_allocated_cb)
		ptr = job->user_cb->bus_address;
	else
		ptr = (u64) (uintptr_t) job->user_cb;

	ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
}
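/*
 * For illustration only (hypothetical numbers, not part of the driver):
 * with max_pending_cs = 64 and cs->sequence = 200, COMP_OFFSET is
 * 200 & 63 = 8, so the completion write lands at byte offset 8 << 2 = 32
 * inside the SM block, while COMP_DATA carries the queue's pi at submission
 * time.
 */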
static void init_signal_cs(struct hl_device *hdev,
		struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl)
{
	struct hl_sync_stream_properties *prop;
	struct hl_hw_sob *hw_sob;
	u32 q_idx;

	q_idx = job->hw_queue_id;
	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
	hw_sob = &prop->hw_sob[prop->curr_sob_offset];

	cs_cmpl->hw_sob = hw_sob;
	cs_cmpl->sob_val = prop->next_sob_val++;

	dev_dbg(hdev->dev,
		"generate signal CB, sob_id: %d, sob val: 0x%x, q_idx: %d\n",
		cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, q_idx);

	/* we set an EB since we must make sure all operations are done
	 * when sending the signal
	 */
	hdev->asic_funcs->gen_signal_cb(hdev, job->patched_cb,
				cs_cmpl->hw_sob->sob_id, 0, true);

	kref_get(&hw_sob->kref);

	/* check for wraparound */
	if (prop->next_sob_val == HL_MAX_SOB_VAL) {
		/*
		 * Decrement as we reached the max value.
		 * The release function won't be called here as we've
		 * just incremented the refcount.
		 */
		kref_put(&hw_sob->kref, hl_sob_reset_error);
		prop->next_sob_val = 1;
		/* only two SOBs are currently in use */
		prop->curr_sob_offset =
			(prop->curr_sob_offset + 1) % HL_RSVD_SOBS;

		dev_dbg(hdev->dev, "switched to SOB %d, q_idx: %d\n",
				prop->curr_sob_offset, q_idx);
	}
}
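/*
 * Editorial note (assumption based on the code above): the signal stream
 * rotates through the HL_RSVD_SOBS sync objects reserved for this queue.
 * When next_sob_val reaches HL_MAX_SOB_VAL, the stream switches to the next
 * reserved SOB, restarts the value at 1, and drops the extra reference so
 * the previous SOB can be reset once its last user releases it.
 */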
static void init_wait_cs(struct hl_device *hdev, struct hl_cs *cs,
		struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl)
{
	struct hl_cs_compl *signal_cs_cmpl;
	struct hl_sync_stream_properties *prop;
	struct hl_gen_wait_properties wait_prop;
	u32 q_idx;

	q_idx = job->hw_queue_id;
	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;

	signal_cs_cmpl = container_of(cs->signal_fence,
					struct hl_cs_compl,
					base_fence);

	/* copy the SOB id and value of the signal CS */
	cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
	cs_cmpl->sob_val = signal_cs_cmpl->sob_val;

	dev_dbg(hdev->dev,
		"generate wait CB, sob_id: %d, sob_val: 0x%x, mon_id: %d, q_idx: %d\n",
		cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val,
		prop->base_mon_id, q_idx);

	wait_prop.data = (void *) job->patched_cb;
	wait_prop.sob_base = cs_cmpl->hw_sob->sob_id;
	wait_prop.sob_mask = 0x1;
	wait_prop.sob_val = cs_cmpl->sob_val;
	wait_prop.mon_id = prop->base_mon_id;
	wait_prop.q_idx = q_idx;

	hdev->asic_funcs->gen_wait_cb(hdev, &wait_prop);

	kref_get(&cs_cmpl->hw_sob->kref);
	/*
	 * Must put the signal fence after the SOB refcnt increment so
	 * the SOB refcnt won't turn 0 and reset the SOB before the
	 * wait CS was submitted.
	 */
	mb();
	hl_fence_put(cs->signal_fence);
	cs->signal_fence = NULL;
}
/*
 * init_signal_wait_cs - initialize a signal/wait CS
 * @cs: pointer to the signal/wait CS
 *
 * H/W queues spinlock should be taken before calling this function
 */
static void init_signal_wait_cs(struct hl_cs *cs)
{
	struct hl_ctx *ctx = cs->ctx;
	struct hl_device *hdev = ctx->hdev;
	struct hl_cs_job *job;
	struct hl_cs_compl *cs_cmpl =
		container_of(cs->fence, struct hl_cs_compl, base_fence);

	/* There is only one job in a signal/wait CS */
	job = list_first_entry(&cs->job_list, struct hl_cs_job,
				cs_node);

	if (cs->type & CS_TYPE_SIGNAL)
		init_signal_cs(hdev, job, cs_cmpl);
	else if (cs->type & CS_TYPE_WAIT)
		init_wait_cs(hdev, cs, job, cs_cmpl);
}
/*
 * hl_hw_queue_schedule_cs - schedule a command submission
 * @cs: pointer to the CS
 */
int hl_hw_queue_schedule_cs(struct hl_cs *cs)
{
	enum hl_device_status status;
	struct hl_cs_counters_atomic *cntr;
	struct hl_ctx *ctx = cs->ctx;
	struct hl_device *hdev = ctx->hdev;
	struct hl_cs_job *job, *tmp;
	struct hl_hw_queue *q;
	int rc = 0, i, cq_cnt;
	u32 max_queues;

	cntr = &hdev->aggregated_cs_counters;

	hdev->asic_funcs->hw_queues_lock(hdev);

	if (!hl_device_operational(hdev, &status)) {
		atomic64_inc(&cntr->device_in_reset_drop_cnt);
		atomic64_inc(&ctx->cs_counters.device_in_reset_drop_cnt);
		dev_err(hdev->dev,
			"device is %s, CS rejected!\n", hdev->status[status]);
		rc = -EPERM;
		goto out;
	}

	max_queues = hdev->asic_prop.max_queues;

	q = &hdev->kernel_queues[0];
	for (i = 0, cq_cnt = 0 ; i < max_queues ; i++, q++) {
		if (cs->jobs_in_queue_cnt[i]) {
			switch (q->queue_type) {
			case QUEUE_TYPE_EXT:
				rc = ext_queue_sanity_checks(hdev, q,
						cs->jobs_in_queue_cnt[i], true);
				break;
			case QUEUE_TYPE_INT:
				rc = int_queue_sanity_checks(hdev, q,
						cs->jobs_in_queue_cnt[i]);
				break;
			case QUEUE_TYPE_HW:
				rc = hw_queue_sanity_checks(hdev, q,
						cs->jobs_in_queue_cnt[i]);
				break;
			default:
				dev_err(hdev->dev, "Queue type %d is invalid\n",
					q->queue_type);
				rc = -EINVAL;
				break;
			}

			if (rc) {
				atomic64_inc(
					&ctx->cs_counters.queue_full_drop_cnt);
				atomic64_inc(&cntr->queue_full_drop_cnt);
				goto unroll_cq_resv;
			}

			if (q->queue_type == QUEUE_TYPE_EXT)
				cq_cnt++;
		}
	}

	if ((cs->type == CS_TYPE_SIGNAL) || (cs->type == CS_TYPE_WAIT))
		init_signal_wait_cs(cs);
	else if (cs->type == CS_TYPE_COLLECTIVE_WAIT)
		hdev->asic_funcs->collective_wait_init_cs(cs);

	spin_lock(&hdev->cs_mirror_lock);
	list_add_tail(&cs->mirror_node, &hdev->cs_mirror_list);

	/* Queue TDR if the CS is the first entry and if timeout is wanted */
	if ((hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT) &&
			(list_first_entry(&hdev->cs_mirror_list,
					struct hl_cs, mirror_node) == cs)) {
		cs->tdr_active = true;
		schedule_delayed_work(&cs->work_tdr, hdev->timeout_jiffies);
	}

	spin_unlock(&hdev->cs_mirror_lock);

	if (!hdev->cs_active_cnt++) {
		struct hl_device_idle_busy_ts *ts;

		ts = &hdev->idle_busy_ts_arr[hdev->idle_busy_ts_idx];
		ts->busy_to_idle_ts = ktime_set(0, 0);
		ts->idle_to_busy_ts = ktime_get();
	}

	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
		switch (job->queue_type) {
		case QUEUE_TYPE_EXT:
			ext_queue_schedule_job(job);
			break;
		case QUEUE_TYPE_INT:
			int_queue_schedule_job(job);
			break;
		case QUEUE_TYPE_HW:
			hw_queue_schedule_job(job);
			break;
		default:
			break;
		}

	cs->submitted = true;

	goto out;

unroll_cq_resv:
	q = &hdev->kernel_queues[0];
	for (i = 0 ; (i < max_queues) && (cq_cnt > 0) ; i++, q++) {
		if ((q->queue_type == QUEUE_TYPE_EXT) &&
				(cs->jobs_in_queue_cnt[i])) {
			atomic_t *free_slots =
				&hdev->completion_queue[i].free_slots_cnt;
			atomic_add(cs->jobs_in_queue_cnt[i], free_slots);
			cq_cnt--;
		}
	}

out:
	hdev->asic_funcs->hw_queues_unlock(hdev);

	return rc;
}
/*
 * hl_hw_queue_inc_ci_kernel - increment ci for kernel's queue
 *
 * @hdev: pointer to hl_device structure
 * @hw_queue_id: which queue to increment its ci
 */
void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id)
{
	struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id];

	atomic_inc(&q->ci);
}
static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
					bool is_cpu_queue)
{
	void *p;
	int rc;

	if (is_cpu_queue)
		p = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev,
							HL_QUEUE_SIZE_IN_BYTES,
							&q->bus_address);
	else
		p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev,
						HL_QUEUE_SIZE_IN_BYTES,
						&q->bus_address,
						GFP_KERNEL | __GFP_ZERO);
	if (!p)
		return -ENOMEM;

	q->kernel_address = p;

	q->shadow_queue = kmalloc_array(HL_QUEUE_LENGTH,
					sizeof(*q->shadow_queue),
					GFP_KERNEL);
	if (!q->shadow_queue) {
		dev_err(hdev->dev,
			"Failed to allocate shadow queue for H/W queue %d\n",
			q->hw_queue_id);
		rc = -ENOMEM;
		goto free_queue;
	}

	/* Make sure read/write pointers are initialized to start of queue */
	atomic_set(&q->ci, 0);
	q->pi = 0;

	return 0;

free_queue:
	if (is_cpu_queue)
		hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
					HL_QUEUE_SIZE_IN_BYTES,
					q->kernel_address);
	else
		hdev->asic_funcs->asic_dma_free_coherent(hdev,
					HL_QUEUE_SIZE_IN_BYTES,
					q->kernel_address,
					q->bus_address);

	return rc;
}
static int int_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{
	void *p;

	p = hdev->asic_funcs->get_int_queue_base(hdev, q->hw_queue_id,
					&q->bus_address, &q->int_queue_len);
	if (!p) {
		dev_err(hdev->dev,
			"Failed to get base address for internal queue %d\n",
			q->hw_queue_id);
		return -EFAULT;
	}

	q->kernel_address = p;
	q->pi = 0;
	atomic_set(&q->ci, 0);

	return 0;
}
static int cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{
	return ext_and_cpu_queue_init(hdev, q, true);
}

static int ext_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{
	return ext_and_cpu_queue_init(hdev, q, false);
}
static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{
	void *p;

	p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev,
						HL_QUEUE_SIZE_IN_BYTES,
						&q->bus_address,
						GFP_KERNEL | __GFP_ZERO);
	if (!p)
		return -ENOMEM;

	q->kernel_address = p;

	/* Make sure read/write pointers are initialized to start of queue */
	atomic_set(&q->ci, 0);
	q->pi = 0;

	return 0;
}
static void sync_stream_queue_init(struct hl_device *hdev, u32 q_idx)
{
	struct hl_sync_stream_properties *sync_stream_prop;
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	struct hl_hw_sob *hw_sob;
	int sob, reserved_mon_idx, queue_idx;

	sync_stream_prop = &hdev->kernel_queues[q_idx].sync_stream_prop;

	/* We use 'collective_mon_idx' as a running index in order to reserve
	 * monitors for collective master/slave queues.
	 * collective master queue gets 2 reserved monitors
	 * collective slave queue gets 1 reserved monitor
	 */
	if (hdev->kernel_queues[q_idx].collective_mode ==
			HL_COLLECTIVE_MASTER) {
		reserved_mon_idx = hdev->collective_mon_idx;

		/* reserve the first monitor for collective master queue */
		sync_stream_prop->collective_mstr_mon_id[0] =
			prop->collective_first_mon + reserved_mon_idx;

		/* reserve the second monitor for collective master queue */
		sync_stream_prop->collective_mstr_mon_id[1] =
			prop->collective_first_mon + reserved_mon_idx + 1;

		hdev->collective_mon_idx += HL_COLLECTIVE_RSVD_MSTR_MONS;
	} else if (hdev->kernel_queues[q_idx].collective_mode ==
			HL_COLLECTIVE_SLAVE) {
		reserved_mon_idx = hdev->collective_mon_idx++;

		/* reserve a monitor for collective slave queue */
		sync_stream_prop->collective_slave_mon_id =
			prop->collective_first_mon + reserved_mon_idx;
	}

	if (!hdev->kernel_queues[q_idx].supports_sync_stream)
		return;

	queue_idx = hdev->sync_stream_queue_idx++;

	sync_stream_prop->base_sob_id = prop->sync_stream_first_sob +
			(queue_idx * HL_RSVD_SOBS);
	sync_stream_prop->base_mon_id = prop->sync_stream_first_mon +
			(queue_idx * HL_RSVD_MONS);
	sync_stream_prop->next_sob_val = 1;
	sync_stream_prop->curr_sob_offset = 0;

	for (sob = 0 ; sob < HL_RSVD_SOBS ; sob++) {
		hw_sob = &sync_stream_prop->hw_sob[sob];
		hw_sob->hdev = hdev;
		hw_sob->sob_id = sync_stream_prop->base_sob_id + sob;
		hw_sob->q_idx = q_idx;
		kref_init(&hw_sob->kref);
	}
}
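/*
 * For illustration only (hypothetical numbers, not part of the driver):
 * each sync stream queue gets a contiguous block of HL_RSVD_SOBS sync
 * objects and HL_RSVD_MONS monitors. If sync_stream_first_sob were 32,
 * HL_RSVD_SOBS were 2 and queue_idx were 3, base_sob_id would be
 * 32 + 3 * 2 = 38.
 */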
static void sync_stream_queue_reset(struct hl_device *hdev, u32 q_idx)
{
	struct hl_sync_stream_properties *prop =
			&hdev->kernel_queues[q_idx].sync_stream_prop;

	/*
	 * In case we got here due to a stuck CS, the refcnt might be bigger
	 * than 1 and therefore we reset it.
	 */
	kref_init(&prop->hw_sob[prop->curr_sob_offset].kref);
	prop->curr_sob_offset = 0;
	prop->next_sob_val = 1;
}
/*
 * queue_init - main initialization function for H/W queue object
 *
 * @hdev: pointer to hl_device device structure
 * @q: pointer to hl_hw_queue queue structure
 * @hw_queue_id: The id of the H/W queue
 *
 * Allocate dma-able memory for the queue and initialize fields
 * Returns 0 on success
 */
static int queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
			u32 hw_queue_id)
{
	int rc;

	q->hw_queue_id = hw_queue_id;

	switch (q->queue_type) {
	case QUEUE_TYPE_EXT:
		rc = ext_queue_init(hdev, q);
		break;
	case QUEUE_TYPE_INT:
		rc = int_queue_init(hdev, q);
		break;
	case QUEUE_TYPE_CPU:
		rc = cpu_queue_init(hdev, q);
		break;
	case QUEUE_TYPE_HW:
		rc = hw_queue_init(hdev, q);
		break;
	case QUEUE_TYPE_NA:
		q->valid = 0;
		return 0;
	default:
		dev_crit(hdev->dev, "wrong queue type %d during init\n",
			q->queue_type);
		rc = -EINVAL;
		break;
	}

	sync_stream_queue_init(hdev, q->hw_queue_id);

	if (rc)
		return rc;

	q->valid = 1;

	return 0;
}
/*
 * queue_fini - destroy queue
 *
 * @hdev: pointer to hl_device device structure
 * @q: pointer to hl_hw_queue queue structure
 *
 * Free the queue memory
 */
static void queue_fini(struct hl_device *hdev, struct hl_hw_queue *q)
{
	if (!q->valid)
		return;

	/*
	 * If we arrived here, there are no jobs waiting on this queue
	 * so we can safely remove it.
	 * This is because this function can only be called when:
	 * 1. Either a context is deleted, which only can occur if all its
	 *    jobs were finished
	 * 2. A context wasn't able to be created due to failure or timeout,
	 *    which means there are no jobs on the queue yet
	 *
	 * The only exceptions are the queues of the kernel context, but
	 * if they are being destroyed, it means that the entire module is
	 * being removed. If the module is removed, it means there is no open
	 * user context. It also means that if a job was submitted by
	 * the kernel driver (e.g. context creation), the job itself was
	 * released by the kernel driver when a timeout occurred on its
	 * Completion. Thus, we don't need to release it again.
	 */

	if (q->queue_type == QUEUE_TYPE_INT)
		return;

	kfree(q->shadow_queue);

	if (q->queue_type == QUEUE_TYPE_CPU)
		hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
					HL_QUEUE_SIZE_IN_BYTES,
					q->kernel_address);
	else
		hdev->asic_funcs->asic_dma_free_coherent(hdev,
					HL_QUEUE_SIZE_IN_BYTES,
					q->kernel_address,
					q->bus_address);
}
int hl_hw_queues_create(struct hl_device *hdev)
{
	struct asic_fixed_properties *asic = &hdev->asic_prop;
	struct hl_hw_queue *q;
	int i, rc, q_ready_cnt;

	hdev->kernel_queues = kcalloc(asic->max_queues,
				sizeof(*hdev->kernel_queues), GFP_KERNEL);

	if (!hdev->kernel_queues) {
		dev_err(hdev->dev, "Not enough memory for H/W queues\n");
		return -ENOMEM;
	}

	/* Initialize the H/W queues */
	for (i = 0, q_ready_cnt = 0, q = hdev->kernel_queues;
			i < asic->max_queues ; i++, q_ready_cnt++, q++) {

		q->queue_type = asic->hw_queues_props[i].type;
		q->supports_sync_stream =
				asic->hw_queues_props[i].supports_sync_stream;
		q->collective_mode = asic->hw_queues_props[i].collective_mode;
		rc = queue_init(hdev, q, i);
		if (rc) {
			dev_err(hdev->dev,
				"failed to initialize queue %d\n", i);
			goto release_queues;
		}
	}

	return 0;

release_queues:
	for (i = 0, q = hdev->kernel_queues ; i < q_ready_cnt ; i++, q++)
		queue_fini(hdev, q);

	kfree(hdev->kernel_queues);

	return rc;
}
void hl_hw_queues_destroy(struct hl_device *hdev)
{
	struct hl_hw_queue *q;
	u32 max_queues = hdev->asic_prop.max_queues;
	int i;

	for (i = 0, q = hdev->kernel_queues ; i < max_queues ; i++, q++)
		queue_fini(hdev, q);

	kfree(hdev->kernel_queues);
}
void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset)
{
	struct hl_hw_queue *q;
	u32 max_queues = hdev->asic_prop.max_queues;
	int i;

	for (i = 0, q = hdev->kernel_queues ; i < max_queues ; i++, q++) {
		if ((!q->valid) ||
			((!hard_reset) && (q->queue_type == QUEUE_TYPE_CPU)))
			continue;
		q->pi = 0;
		atomic_set(&q->ci, 0);

		if (q->supports_sync_stream)
			sync_stream_queue_reset(hdev, q->hw_queue_id);
	}
}