1 // SPDX-License-Identifier: GPL-2.0
4 * Copyright 2016-2019 HabanaLabs, Ltd.
8 #define pr_fmt(fmt) "habanalabs: " fmt
10 #include "habanalabs.h"
12 #include <linux/pci.h>
13 #include <linux/sched/signal.h>
14 #include <linux/hwmon.h>
15 #include <uapi/misc/habanalabs.h>
17 #define HL_PLDM_PENDING_RESET_PER_SEC (HL_PENDING_RESET_PER_SEC * 10)
19 bool hl_device_disabled_or_in_reset(struct hl_device
*hdev
)
21 if ((hdev
->disabled
) || (atomic_read(&hdev
->in_reset
)))
27 enum hl_device_status
hl_device_status(struct hl_device
*hdev
)
29 enum hl_device_status status
;
32 status
= HL_DEVICE_STATUS_MALFUNCTION
;
33 else if (atomic_read(&hdev
->in_reset
))
34 status
= HL_DEVICE_STATUS_IN_RESET
;
36 status
= HL_DEVICE_STATUS_OPERATIONAL
;
41 static void hpriv_release(struct kref
*ref
)
43 struct hl_fpriv
*hpriv
;
44 struct hl_device
*hdev
;
46 hpriv
= container_of(ref
, struct hl_fpriv
, refcount
);
50 put_pid(hpriv
->taskpid
);
52 hl_debugfs_remove_file(hpriv
);
54 mutex_destroy(&hpriv
->restore_phase_mutex
);
58 /* Now the FD is really closed */
59 atomic_dec(&hdev
->fd_open_cnt
);
61 /* This allows a new user context to open the device */
62 hdev
->user_ctx
= NULL
;
65 void hl_hpriv_get(struct hl_fpriv
*hpriv
)
67 kref_get(&hpriv
->refcount
);
70 void hl_hpriv_put(struct hl_fpriv
*hpriv
)
72 kref_put(&hpriv
->refcount
, hpriv_release
);
76 * hl_device_release - release function for habanalabs device
78 * @inode: pointer to inode structure
79 * @filp: pointer to file structure
81 * Called when process closes an habanalabs device
83 static int hl_device_release(struct inode
*inode
, struct file
*filp
)
85 struct hl_fpriv
*hpriv
= filp
->private_data
;
87 hl_cb_mgr_fini(hpriv
->hdev
, &hpriv
->cb_mgr
);
88 hl_ctx_mgr_fini(hpriv
->hdev
, &hpriv
->ctx_mgr
);
90 filp
->private_data
= NULL
;
98 * hl_mmap - mmap function for habanalabs device
100 * @*filp: pointer to file structure
101 * @*vma: pointer to vm_area_struct of the process
103 * Called when process does an mmap on habanalabs device. Call the device's mmap
104 * function at the end of the common code.
106 static int hl_mmap(struct file
*filp
, struct vm_area_struct
*vma
)
108 struct hl_fpriv
*hpriv
= filp
->private_data
;
110 if ((vma
->vm_pgoff
& HL_MMAP_CB_MASK
) == HL_MMAP_CB_MASK
) {
111 vma
->vm_pgoff
^= HL_MMAP_CB_MASK
;
112 return hl_cb_mmap(hpriv
, vma
);
118 static const struct file_operations hl_ops
= {
119 .owner
= THIS_MODULE
,
120 .open
= hl_device_open
,
121 .release
= hl_device_release
,
123 .unlocked_ioctl
= hl_ioctl
,
124 .compat_ioctl
= hl_ioctl
128 * device_setup_cdev - setup cdev and device for habanalabs device
130 * @hdev: pointer to habanalabs device structure
131 * @hclass: pointer to the class object of the device
132 * @minor: minor number of the specific device
133 * @fpos : file operations to install for this device
135 * Create a cdev and a Linux device for habanalabs's device. Need to be
136 * called at the end of the habanalabs device initialization process,
137 * because this function exposes the device to the user
139 static int device_setup_cdev(struct hl_device
*hdev
, struct class *hclass
,
140 int minor
, const struct file_operations
*fops
)
142 int err
, devno
= MKDEV(hdev
->major
, minor
);
143 struct cdev
*hdev_cdev
= &hdev
->cdev
;
146 name
= kasprintf(GFP_KERNEL
, "hl%d", hdev
->id
);
150 cdev_init(hdev_cdev
, fops
);
151 hdev_cdev
->owner
= THIS_MODULE
;
152 err
= cdev_add(hdev_cdev
, devno
, 1);
154 pr_err("Failed to add char device %s\n", name
);
158 hdev
->dev
= device_create(hclass
, NULL
, devno
, NULL
, "%s", name
);
159 if (IS_ERR(hdev
->dev
)) {
160 pr_err("Failed to create device %s\n", name
);
161 err
= PTR_ERR(hdev
->dev
);
162 goto err_device_create
;
165 dev_set_drvdata(hdev
->dev
, hdev
);
179 * device_early_init - do some early initialization for the habanalabs device
181 * @hdev: pointer to habanalabs device structure
183 * Install the relevant function pointers and call the early_init function,
184 * if such a function exists
186 static int device_early_init(struct hl_device
*hdev
)
190 switch (hdev
->asic_type
) {
192 goya_set_asic_funcs(hdev
);
193 strlcpy(hdev
->asic_name
, "GOYA", sizeof(hdev
->asic_name
));
196 dev_err(hdev
->dev
, "Unrecognized ASIC type %d\n",
201 rc
= hdev
->asic_funcs
->early_init(hdev
);
205 rc
= hl_asid_init(hdev
);
209 hdev
->cq_wq
= alloc_workqueue("hl-free-jobs", WQ_UNBOUND
, 0);
210 if (hdev
->cq_wq
== NULL
) {
211 dev_err(hdev
->dev
, "Failed to allocate CQ workqueue\n");
216 hdev
->eq_wq
= alloc_workqueue("hl-events", WQ_UNBOUND
, 0);
217 if (hdev
->eq_wq
== NULL
) {
218 dev_err(hdev
->dev
, "Failed to allocate EQ workqueue\n");
223 hdev
->hl_chip_info
= kzalloc(sizeof(struct hwmon_chip_info
),
225 if (!hdev
->hl_chip_info
) {
230 hl_cb_mgr_init(&hdev
->kernel_cb_mgr
);
232 mutex_init(&hdev
->fd_open_cnt_lock
);
233 mutex_init(&hdev
->send_cpu_message_lock
);
234 mutex_init(&hdev
->mmu_cache_lock
);
235 INIT_LIST_HEAD(&hdev
->hw_queues_mirror_list
);
236 spin_lock_init(&hdev
->hw_queues_mirror_lock
);
237 atomic_set(&hdev
->in_reset
, 0);
238 atomic_set(&hdev
->fd_open_cnt
, 0);
239 atomic_set(&hdev
->cs_active_cnt
, 0);
244 destroy_workqueue(hdev
->eq_wq
);
246 destroy_workqueue(hdev
->cq_wq
);
250 if (hdev
->asic_funcs
->early_fini
)
251 hdev
->asic_funcs
->early_fini(hdev
);
257 * device_early_fini - finalize all that was done in device_early_init
259 * @hdev: pointer to habanalabs device structure
262 static void device_early_fini(struct hl_device
*hdev
)
264 mutex_destroy(&hdev
->mmu_cache_lock
);
265 mutex_destroy(&hdev
->send_cpu_message_lock
);
267 hl_cb_mgr_fini(hdev
, &hdev
->kernel_cb_mgr
);
269 kfree(hdev
->hl_chip_info
);
271 destroy_workqueue(hdev
->eq_wq
);
272 destroy_workqueue(hdev
->cq_wq
);
276 if (hdev
->asic_funcs
->early_fini
)
277 hdev
->asic_funcs
->early_fini(hdev
);
279 mutex_destroy(&hdev
->fd_open_cnt_lock
);
282 static void set_freq_to_low_job(struct work_struct
*work
)
284 struct hl_device
*hdev
= container_of(work
, struct hl_device
,
287 if (atomic_read(&hdev
->fd_open_cnt
) == 0)
288 hl_device_set_frequency(hdev
, PLL_LOW
);
290 schedule_delayed_work(&hdev
->work_freq
,
291 usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC
));
294 static void hl_device_heartbeat(struct work_struct
*work
)
296 struct hl_device
*hdev
= container_of(work
, struct hl_device
,
297 work_heartbeat
.work
);
299 if (hl_device_disabled_or_in_reset(hdev
))
302 if (!hdev
->asic_funcs
->send_heartbeat(hdev
))
305 dev_err(hdev
->dev
, "Device heartbeat failed!\n");
306 hl_device_reset(hdev
, true, false);
311 schedule_delayed_work(&hdev
->work_heartbeat
,
312 usecs_to_jiffies(HL_HEARTBEAT_PER_USEC
));
316 * device_late_init - do late stuff initialization for the habanalabs device
318 * @hdev: pointer to habanalabs device structure
320 * Do stuff that either needs the device H/W queues to be active or needs
321 * to happen after all the rest of the initialization is finished
323 static int device_late_init(struct hl_device
*hdev
)
327 INIT_DELAYED_WORK(&hdev
->work_freq
, set_freq_to_low_job
);
328 hdev
->high_pll
= hdev
->asic_prop
.high_pll
;
330 /* force setting to low frequency */
331 atomic_set(&hdev
->curr_pll_profile
, PLL_LOW
);
333 if (hdev
->pm_mng_profile
== PM_AUTO
)
334 hdev
->asic_funcs
->set_pll_profile(hdev
, PLL_LOW
);
336 hdev
->asic_funcs
->set_pll_profile(hdev
, PLL_LAST
);
338 if (hdev
->asic_funcs
->late_init
) {
339 rc
= hdev
->asic_funcs
->late_init(hdev
);
342 "failed late initialization for the H/W\n");
347 schedule_delayed_work(&hdev
->work_freq
,
348 usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC
));
350 if (hdev
->heartbeat
) {
351 INIT_DELAYED_WORK(&hdev
->work_heartbeat
, hl_device_heartbeat
);
352 schedule_delayed_work(&hdev
->work_heartbeat
,
353 usecs_to_jiffies(HL_HEARTBEAT_PER_USEC
));
356 hdev
->late_init_done
= true;
362 * device_late_fini - finalize all that was done in device_late_init
364 * @hdev: pointer to habanalabs device structure
367 static void device_late_fini(struct hl_device
*hdev
)
369 if (!hdev
->late_init_done
)
372 cancel_delayed_work_sync(&hdev
->work_freq
);
374 cancel_delayed_work_sync(&hdev
->work_heartbeat
);
376 if (hdev
->asic_funcs
->late_fini
)
377 hdev
->asic_funcs
->late_fini(hdev
);
379 hdev
->late_init_done
= false;
383 * hl_device_set_frequency - set the frequency of the device
385 * @hdev: pointer to habanalabs device structure
386 * @freq: the new frequency value
388 * Change the frequency if needed.
389 * We allow setting PLL to low only if there is no user process
390 * Returns 0 if no change was done, otherwise returns 1;
392 int hl_device_set_frequency(struct hl_device
*hdev
, enum hl_pll_frequency freq
)
394 enum hl_pll_frequency old_freq
=
395 (freq
== PLL_HIGH
) ? PLL_LOW
: PLL_HIGH
;
398 if (hdev
->pm_mng_profile
== PM_MANUAL
)
401 ret
= atomic_cmpxchg(&hdev
->curr_pll_profile
, old_freq
, freq
);
406 * in case we want to lower frequency, check if device is not
407 * opened. We must have a check here to workaround race condition with
410 if ((freq
== PLL_LOW
) && (atomic_read(&hdev
->fd_open_cnt
) > 0)) {
411 atomic_set(&hdev
->curr_pll_profile
, PLL_HIGH
);
415 dev_dbg(hdev
->dev
, "Changing device frequency to %s\n",
416 freq
== PLL_HIGH
? "high" : "low");
418 hdev
->asic_funcs
->set_pll_profile(hdev
, freq
);
424 * hl_device_suspend - initiate device suspend
426 * @hdev: pointer to habanalabs device structure
428 * Puts the hw in the suspend state (all asics).
429 * Returns 0 for success or an error on failure.
430 * Called at driver suspend.
432 int hl_device_suspend(struct hl_device
*hdev
)
436 pci_save_state(hdev
->pdev
);
438 /* Block future CS/VM/JOB completion operations */
439 rc
= atomic_cmpxchg(&hdev
->in_reset
, 0, 1);
441 dev_err(hdev
->dev
, "Can't suspend while in reset\n");
445 /* This blocks all other stuff that is not blocked by in_reset */
446 hdev
->disabled
= true;
449 * Flush anyone that is inside the critical section of enqueue
452 hdev
->asic_funcs
->hw_queues_lock(hdev
);
453 hdev
->asic_funcs
->hw_queues_unlock(hdev
);
455 /* Flush processes that are sending message to CPU */
456 mutex_lock(&hdev
->send_cpu_message_lock
);
457 mutex_unlock(&hdev
->send_cpu_message_lock
);
459 rc
= hdev
->asic_funcs
->suspend(hdev
);
462 "Failed to disable PCI access of device CPU\n");
464 /* Shut down the device */
465 pci_disable_device(hdev
->pdev
);
466 pci_set_power_state(hdev
->pdev
, PCI_D3hot
);
472 * hl_device_resume - initiate device resume
474 * @hdev: pointer to habanalabs device structure
476 * Bring the hw back to operating state (all asics).
477 * Returns 0 for success or an error on failure.
478 * Called at driver resume.
480 int hl_device_resume(struct hl_device
*hdev
)
484 pci_set_power_state(hdev
->pdev
, PCI_D0
);
485 pci_restore_state(hdev
->pdev
);
486 rc
= pci_enable_device_mem(hdev
->pdev
);
489 "Failed to enable PCI device in resume\n");
493 pci_set_master(hdev
->pdev
);
495 rc
= hdev
->asic_funcs
->resume(hdev
);
497 dev_err(hdev
->dev
, "Failed to resume device after suspend\n");
502 hdev
->disabled
= false;
503 atomic_set(&hdev
->in_reset
, 0);
505 rc
= hl_device_reset(hdev
, true, false);
507 dev_err(hdev
->dev
, "Failed to reset device during resume\n");
514 pci_clear_master(hdev
->pdev
);
515 pci_disable_device(hdev
->pdev
);
520 static void device_kill_open_processes(struct hl_device
*hdev
)
522 u16 pending_total
, pending_cnt
;
523 struct task_struct
*task
= NULL
;
526 pending_total
= HL_PLDM_PENDING_RESET_PER_SEC
;
528 pending_total
= HL_PENDING_RESET_PER_SEC
;
530 pending_cnt
= pending_total
;
532 /* Flush all processes that are inside hl_open */
533 mutex_lock(&hdev
->fd_open_cnt_lock
);
535 while ((atomic_read(&hdev
->fd_open_cnt
)) && (pending_cnt
)) {
540 "Can't HARD reset, waiting for user to close FD\n");
544 if (atomic_read(&hdev
->fd_open_cnt
)) {
545 task
= get_pid_task(hdev
->user_ctx
->hpriv
->taskpid
,
548 dev_info(hdev
->dev
, "Killing user processes\n");
549 send_sig(SIGKILL
, task
, 1);
552 put_task_struct(task
);
556 /* We killed the open users, but because the driver cleans up after the
557 * user contexts are closed (e.g. mmu mappings), we need to wait again
558 * to make sure the cleaning phase is finished before continuing with
562 pending_cnt
= pending_total
;
564 while ((atomic_read(&hdev
->fd_open_cnt
)) && (pending_cnt
)) {
571 if (atomic_read(&hdev
->fd_open_cnt
))
573 "Going to hard reset with open user contexts\n");
575 mutex_unlock(&hdev
->fd_open_cnt_lock
);
579 static void device_hard_reset_pending(struct work_struct
*work
)
581 struct hl_device_reset_work
*device_reset_work
=
582 container_of(work
, struct hl_device_reset_work
, reset_work
);
583 struct hl_device
*hdev
= device_reset_work
->hdev
;
585 device_kill_open_processes(hdev
);
587 hl_device_reset(hdev
, true, true);
589 kfree(device_reset_work
);
593 * hl_device_reset - reset the device
595 * @hdev: pointer to habanalabs device structure
596 * @hard_reset: should we do hard reset to all engines or just reset the
597 * compute/dma engines
599 * Block future CS and wait for pending CS to be enqueued
601 * Flush all completions
602 * Re-initialize all internal data structures
603 * Call ASIC H/W init, late_init
607 * Returns 0 for success or an error on failure.
609 int hl_device_reset(struct hl_device
*hdev
, bool hard_reset
,
610 bool from_hard_reset_thread
)
614 if (!hdev
->init_done
) {
616 "Can't reset before initialization is done\n");
621 * Prevent concurrency in this function - only one reset should be
622 * done at any given time. Only need to perform this if we didn't
623 * get from the dedicated hard reset thread
625 if (!from_hard_reset_thread
) {
626 /* Block future CS/VM/JOB completion operations */
627 rc
= atomic_cmpxchg(&hdev
->in_reset
, 0, 1);
631 /* This also blocks future CS/VM/JOB completion operations */
632 hdev
->disabled
= true;
635 * Flush anyone that is inside the critical section of enqueue
638 hdev
->asic_funcs
->hw_queues_lock(hdev
);
639 hdev
->asic_funcs
->hw_queues_unlock(hdev
);
641 dev_err(hdev
->dev
, "Going to RESET device!\n");
645 if ((hard_reset
) && (!from_hard_reset_thread
)) {
646 struct hl_device_reset_work
*device_reset_work
;
648 hdev
->hard_reset_pending
= true;
652 "Reset action is NOT supported in simulator\n");
657 device_reset_work
= kzalloc(sizeof(*device_reset_work
),
659 if (!device_reset_work
) {
665 * Because the reset function can't run from interrupt or
666 * from heartbeat work, we need to call the reset function
667 * from a dedicated work
669 INIT_WORK(&device_reset_work
->reset_work
,
670 device_hard_reset_pending
);
671 device_reset_work
->hdev
= hdev
;
672 schedule_work(&device_reset_work
->reset_work
);
678 device_late_fini(hdev
);
681 * Now that the heartbeat thread is closed, flush processes
682 * which are sending messages to CPU
684 mutex_lock(&hdev
->send_cpu_message_lock
);
685 mutex_unlock(&hdev
->send_cpu_message_lock
);
689 * Halt the engines and disable interrupts so we won't get any more
690 * completions from H/W and we won't have any accesses from the
691 * H/W to the host machine
693 hdev
->asic_funcs
->halt_engines(hdev
, hard_reset
);
695 /* Go over all the queues, release all CS and their jobs */
696 hl_cs_rollback_all(hdev
);
698 /* Release kernel context */
699 if ((hard_reset
) && (hl_ctx_put(hdev
->kernel_ctx
) == 1))
700 hdev
->kernel_ctx
= NULL
;
702 /* Reset the H/W. It will be in idle state after this returns */
703 hdev
->asic_funcs
->hw_fini(hdev
, hard_reset
);
707 hl_eq_reset(hdev
, &hdev
->event_queue
);
710 /* Re-initialize PI,CI to 0 in all queues (hw queue, cq) */
711 hl_hw_queue_reset(hdev
, hard_reset
);
712 for (i
= 0 ; i
< hdev
->asic_prop
.completion_queues_count
; i
++)
713 hl_cq_reset(hdev
, &hdev
->completion_queue
[i
]);
715 /* Make sure the context switch phase will run again */
716 if (hdev
->user_ctx
) {
717 atomic_set(&hdev
->user_ctx
->thread_ctx_switch_token
, 1);
718 hdev
->user_ctx
->thread_ctx_switch_wait_token
= 0;
721 /* Finished tear-down, starting to re-initialize */
724 hdev
->device_cpu_disabled
= false;
725 hdev
->hard_reset_pending
= false;
727 if (hdev
->kernel_ctx
) {
729 "kernel ctx was alive during hard reset, something is terribly wrong\n");
734 /* Allocate the kernel context */
735 hdev
->kernel_ctx
= kzalloc(sizeof(*hdev
->kernel_ctx
),
737 if (!hdev
->kernel_ctx
) {
742 hdev
->user_ctx
= NULL
;
744 rc
= hl_ctx_init(hdev
, hdev
->kernel_ctx
, true);
747 "failed to init kernel ctx in hard reset\n");
748 kfree(hdev
->kernel_ctx
);
749 hdev
->kernel_ctx
= NULL
;
754 rc
= hdev
->asic_funcs
->hw_init(hdev
);
757 "failed to initialize the H/W after reset\n");
761 hdev
->disabled
= false;
763 /* Check that the communication with the device is working */
764 rc
= hdev
->asic_funcs
->test_queues(hdev
);
767 "Failed to detect if device is alive after reset\n");
772 rc
= device_late_init(hdev
);
775 "Failed late init after hard reset\n");
779 rc
= hl_vm_init(hdev
);
782 "Failed to init memory module after hard reset\n");
786 hl_set_max_power(hdev
, hdev
->max_power
);
788 rc
= hdev
->asic_funcs
->soft_reset_late_init(hdev
);
791 "Failed late init after soft reset\n");
796 atomic_set(&hdev
->in_reset
, 0);
799 hdev
->hard_reset_cnt
++;
801 hdev
->soft_reset_cnt
++;
806 hdev
->disabled
= true;
810 "Failed to reset! Device is NOT usable\n");
811 hdev
->hard_reset_cnt
++;
814 "Failed to do soft-reset, trying hard reset\n");
815 hdev
->soft_reset_cnt
++;
820 atomic_set(&hdev
->in_reset
, 0);
826 * hl_device_init - main initialization function for habanalabs device
828 * @hdev: pointer to habanalabs device structure
830 * Allocate an id for the device, do early initialization and then call the
831 * ASIC specific initialization functions. Finally, create the cdev and the
832 * Linux device to expose it to the user
834 int hl_device_init(struct hl_device
*hdev
, struct class *hclass
)
836 int i
, rc
, cq_ready_cnt
;
839 rc
= device_setup_cdev(hdev
, hclass
, hdev
->id
, &hl_ops
);
844 /* Initialize ASIC function pointers and perform early init */
845 rc
= device_early_init(hdev
);
850 * Start calling ASIC initialization. First S/W then H/W and finally
853 rc
= hdev
->asic_funcs
->sw_init(hdev
);
858 * Initialize the H/W queues. Must be done before hw_init, because
859 * there the addresses of the kernel queue are being written to the
860 * registers of the device
862 rc
= hl_hw_queues_create(hdev
);
864 dev_err(hdev
->dev
, "failed to initialize kernel queues\n");
869 * Initialize the completion queues. Must be done before hw_init,
870 * because there the addresses of the completion queues are being
871 * passed as arguments to request_irq
873 hdev
->completion_queue
=
874 kcalloc(hdev
->asic_prop
.completion_queues_count
,
875 sizeof(*hdev
->completion_queue
), GFP_KERNEL
);
877 if (!hdev
->completion_queue
) {
878 dev_err(hdev
->dev
, "failed to allocate completion queues\n");
880 goto hw_queues_destroy
;
883 for (i
= 0, cq_ready_cnt
= 0;
884 i
< hdev
->asic_prop
.completion_queues_count
;
885 i
++, cq_ready_cnt
++) {
886 rc
= hl_cq_init(hdev
, &hdev
->completion_queue
[i
], i
);
889 "failed to initialize completion queue\n");
895 * Initialize the event queue. Must be done before hw_init,
896 * because there the address of the event queue is being
897 * passed as argument to request_irq
899 rc
= hl_eq_init(hdev
, &hdev
->event_queue
);
901 dev_err(hdev
->dev
, "failed to initialize event queue\n");
905 /* Allocate the kernel context */
906 hdev
->kernel_ctx
= kzalloc(sizeof(*hdev
->kernel_ctx
), GFP_KERNEL
);
907 if (!hdev
->kernel_ctx
) {
912 hdev
->user_ctx
= NULL
;
914 rc
= hl_ctx_init(hdev
, hdev
->kernel_ctx
, true);
916 dev_err(hdev
->dev
, "failed to initialize kernel context\n");
920 rc
= hl_cb_pool_init(hdev
);
922 dev_err(hdev
->dev
, "failed to initialize CB pool\n");
926 rc
= hl_sysfs_init(hdev
);
928 dev_err(hdev
->dev
, "failed to initialize sysfs\n");
932 hl_debugfs_add_device(hdev
);
934 if (hdev
->asic_funcs
->get_hw_state(hdev
) == HL_DEVICE_HW_STATE_DIRTY
) {
936 "H/W state is dirty, must reset before initializing\n");
937 hdev
->asic_funcs
->hw_fini(hdev
, true);
940 rc
= hdev
->asic_funcs
->hw_init(hdev
);
942 dev_err(hdev
->dev
, "failed to initialize the H/W\n");
947 hdev
->disabled
= false;
949 /* Check that the communication with the device is working */
950 rc
= hdev
->asic_funcs
->test_queues(hdev
);
952 dev_err(hdev
->dev
, "Failed to detect if device is alive\n");
957 /* After test_queues, KMD can start sending messages to device CPU */
959 rc
= device_late_init(hdev
);
961 dev_err(hdev
->dev
, "Failed late initialization\n");
966 dev_info(hdev
->dev
, "Found %s device with %lluGB DRAM\n",
968 hdev
->asic_prop
.dram_size
/ 1024 / 1024 / 1024);
970 rc
= hl_vm_init(hdev
);
972 dev_err(hdev
->dev
, "Failed to initialize memory module\n");
978 * hl_hwmon_init must be called after device_late_init, because only
979 * there we get the information from the device about which
980 * hwmon-related sensors the device supports
982 rc
= hl_hwmon_init(hdev
);
984 dev_err(hdev
->dev
, "Failed to initialize hwmon\n");
989 dev_notice(hdev
->dev
,
990 "Successfully added device to habanalabs driver\n");
992 hdev
->init_done
= true;
997 hl_cb_pool_fini(hdev
);
999 if (hl_ctx_put(hdev
->kernel_ctx
) != 1)
1001 "kernel ctx is still alive on initialization failure\n");
1003 kfree(hdev
->kernel_ctx
);
1005 hl_eq_fini(hdev
, &hdev
->event_queue
);
1007 for (i
= 0 ; i
< cq_ready_cnt
; i
++)
1008 hl_cq_fini(hdev
, &hdev
->completion_queue
[i
]);
1009 kfree(hdev
->completion_queue
);
1011 hl_hw_queues_destroy(hdev
);
1013 hdev
->asic_funcs
->sw_fini(hdev
);
1015 device_early_fini(hdev
);
1017 device_destroy(hclass
, hdev
->dev
->devt
);
1018 cdev_del(&hdev
->cdev
);
1020 hdev
->disabled
= true;
1022 dev_err(&hdev
->pdev
->dev
,
1023 "Failed to initialize hl%d. Device is NOT usable !\n",
1026 pr_err("Failed to initialize hl%d. Device is NOT usable !\n",
1033 * hl_device_fini - main tear-down function for habanalabs device
1035 * @hdev: pointer to habanalabs device structure
1037 * Destroy the device, call ASIC fini functions and release the id
1039 void hl_device_fini(struct hl_device
*hdev
)
1044 dev_info(hdev
->dev
, "Removing device\n");
1047 * This function is competing with the reset function, so try to
1048 * take the reset atomic and if we are already in middle of reset,
1049 * wait until reset function is finished. Reset function is designed
1050 * to always finish (could take up to a few seconds in worst case).
1053 timeout
= ktime_add_us(ktime_get(),
1054 HL_PENDING_RESET_PER_SEC
* 1000 * 1000 * 4);
1055 rc
= atomic_cmpxchg(&hdev
->in_reset
, 0, 1);
1057 usleep_range(50, 200);
1058 rc
= atomic_cmpxchg(&hdev
->in_reset
, 0, 1);
1059 if (ktime_compare(ktime_get(), timeout
) > 0) {
1060 WARN(1, "Failed to remove device because reset function did not finish\n");
1065 /* Mark device as disabled */
1066 hdev
->disabled
= true;
1069 * Flush anyone that is inside the critical section of enqueue
1072 hdev
->asic_funcs
->hw_queues_lock(hdev
);
1073 hdev
->asic_funcs
->hw_queues_unlock(hdev
);
1075 hdev
->hard_reset_pending
= true;
1077 device_kill_open_processes(hdev
);
1079 hl_hwmon_fini(hdev
);
1081 device_late_fini(hdev
);
1083 hl_debugfs_remove_device(hdev
);
1085 hl_sysfs_fini(hdev
);
1088 * Halt the engines and disable interrupts so we won't get any more
1089 * completions from H/W and we won't have any accesses from the
1090 * H/W to the host machine
1092 hdev
->asic_funcs
->halt_engines(hdev
, true);
1094 /* Go over all the queues, release all CS and their jobs */
1095 hl_cs_rollback_all(hdev
);
1097 hl_cb_pool_fini(hdev
);
1099 /* Release kernel context */
1100 if ((hdev
->kernel_ctx
) && (hl_ctx_put(hdev
->kernel_ctx
) != 1))
1101 dev_err(hdev
->dev
, "kernel ctx is still alive\n");
1103 /* Reset the H/W. It will be in idle state after this returns */
1104 hdev
->asic_funcs
->hw_fini(hdev
, true);
1108 hl_eq_fini(hdev
, &hdev
->event_queue
);
1110 for (i
= 0 ; i
< hdev
->asic_prop
.completion_queues_count
; i
++)
1111 hl_cq_fini(hdev
, &hdev
->completion_queue
[i
]);
1112 kfree(hdev
->completion_queue
);
1114 hl_hw_queues_destroy(hdev
);
1116 /* Call ASIC S/W finalize function */
1117 hdev
->asic_funcs
->sw_fini(hdev
);
1119 device_early_fini(hdev
);
1121 /* Hide device from user */
1122 device_destroy(hdev
->dev
->class, hdev
->dev
->devt
);
1123 cdev_del(&hdev
->cdev
);
1125 pr_info("removed device successfully\n");
1129 * hl_poll_timeout_memory - Periodically poll a host memory address
1130 * until it is not zero or a timeout occurs
1131 * @hdev: pointer to habanalabs device structure
1132 * @addr: Address to poll
1133 * @timeout_us: timeout in us
1134 * @val: Variable to read the value into
1136 * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
1137 * case, the last read value at @addr is stored in @val. Must not
1138 * be called from atomic context if sleep_us or timeout_us are used.
1140 * The function sleeps for 100us with timeout value of
1143 int hl_poll_timeout_memory(struct hl_device
*hdev
, u64 addr
,
1144 u32 timeout_us
, u32
*val
)
1147 * address in this function points always to a memory location in the
1148 * host's (server's) memory. That location is updated asynchronously
1149 * either by the direct access of the device or by another core
1151 u32
*paddr
= (u32
*) (uintptr_t) addr
;
1154 /* timeout should be longer when working with simulator */
1158 timeout
= ktime_add_us(ktime_get(), timeout_us
);
1164 * Flush CPU read/write buffers to make sure we read updates
1165 * done by other cores or by the device
1171 if (ktime_compare(ktime_get(), timeout
) > 0) {
1175 usleep_range((100 >> 2) + 1, 100);
1178 return *val
? 0 : -ETIMEDOUT
;
1182 * hl_poll_timeout_devicememory - Periodically poll a device memory address
1183 * until it is not zero or a timeout occurs
1184 * @hdev: pointer to habanalabs device structure
1185 * @addr: Device address to poll
1186 * @timeout_us: timeout in us
1187 * @val: Variable to read the value into
1189 * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
1190 * case, the last read value at @addr is stored in @val. Must not
1191 * be called from atomic context if sleep_us or timeout_us are used.
1193 * The function sleeps for 100us with timeout value of
1196 int hl_poll_timeout_device_memory(struct hl_device
*hdev
, void __iomem
*addr
,
1197 u32 timeout_us
, u32
*val
)
1199 ktime_t timeout
= ktime_add_us(ktime_get(), timeout_us
);
1207 if (ktime_compare(ktime_get(), timeout
) > 0) {
1211 usleep_range((100 >> 2) + 1, 100);
1214 return *val
? 0 : -ETIMEDOUT
;
1218 * MMIO register access helper functions.
1222 * hl_rreg - Read an MMIO register
1224 * @hdev: pointer to habanalabs device structure
1225 * @reg: MMIO register offset (in bytes)
1227 * Returns the value of the MMIO register we are asked to read
1230 inline u32
hl_rreg(struct hl_device
*hdev
, u32 reg
)
1232 return readl(hdev
->rmmio
+ reg
);
1236 * hl_wreg - Write to an MMIO register
1238 * @hdev: pointer to habanalabs device structure
1239 * @reg: MMIO register offset (in bytes)
1240 * @val: 32-bit value
1242 * Writes the 32-bit value into the MMIO register
1245 inline void hl_wreg(struct hl_device
*hdev
, u32 reg
, u32 val
)
1247 writel(val
, hdev
->rmmio
+ reg
);