From e00dac3daaa75fc91ec67f656ca56859075059c8 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Wed, 10 Apr 2019 15:18:46 +0300 Subject: [PATCH 01/15] habanalabs: Cancel pr_fmt() definition dependency on includes order pr_fmt() should be defined before including linux/printk.h, either directly or indirectly, in order to avoid redefinition of the macro. Currently the macro definition is in habanalabs.h, which is included in many files, and that makes adding or reordering includes prone to compilation errors. This patch cancels this dependency by defining the macro only in the few source files that use it. Signed-off-by: Tomer Tayar Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/device.c | 2 ++ drivers/misc/habanalabs/habanalabs.h | 2 -- drivers/misc/habanalabs/habanalabs_drv.c | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c index 25bfb093ff26..a88f7be23c7f 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/device.c @@ -5,6 +5,8 @@ * All Rights Reserved. 
*/ +#define pr_fmt(fmt) "habanalabs: " fmt + #include "habanalabs.h" #include diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 2f02bb55f66a..86bd5298efd6 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -11,8 +11,6 @@ #include "include/armcp_if.h" #include "include/qman_if.h" -#define pr_fmt(fmt) "habanalabs: " fmt - #include #include #include diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/habanalabs_drv.c index 1667df7ca64c..5f4d155be767 100644 --- a/drivers/misc/habanalabs/habanalabs_drv.c +++ b/drivers/misc/habanalabs/habanalabs_drv.c @@ -6,6 +6,8 @@ * */ +#define pr_fmt(fmt) "habanalabs: " fmt + #include "habanalabs.h" #include From 883c2459a57d93efbcb974bea1d2d506db3ae9ab Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sun, 21 Apr 2019 10:48:41 +0300 Subject: [PATCH 02/15] habanalabs: re-factor goya_parse_cb_no_ext_queue() This patch re-factors goya_parse_cb_no_ext_queue() to make it more readable by inverting the check inside the first if statement so the bulk of the function won't be inside an if statement. The patch also fixes a spelling error in the name of the function. 
Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/goya/goya.c | 43 ++++++++++++++--------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index bde11fc2c251..3f707e8c408a 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -3858,36 +3858,35 @@ free_userptr: return rc; } -static int goya_parse_cb_no_ext_quque(struct hl_device *hdev, +static int goya_parse_cb_no_ext_queue(struct hl_device *hdev, struct hl_cs_parser *parser) { struct asic_fixed_properties *asic_prop = &hdev->asic_prop; struct goya_device *goya = hdev->asic_specific; - if (!(goya->hw_cap_initialized & HW_CAP_MMU)) { - /* For internal queue jobs, just check if cb address is valid */ - if (hl_mem_area_inside_range( - (u64) (uintptr_t) parser->user_cb, - parser->user_cb_size, - asic_prop->sram_user_base_address, - asic_prop->sram_end_address)) - return 0; + if (goya->hw_cap_initialized & HW_CAP_MMU) + return 0; - if (hl_mem_area_inside_range( - (u64) (uintptr_t) parser->user_cb, - parser->user_cb_size, - asic_prop->dram_user_base_address, - asic_prop->dram_end_address)) - return 0; + /* For internal queue jobs, just check if CB address is valid */ + if (hl_mem_area_inside_range( + (u64) (uintptr_t) parser->user_cb, + parser->user_cb_size, + asic_prop->sram_user_base_address, + asic_prop->sram_end_address)) + return 0; - dev_err(hdev->dev, - "Internal CB address %px + 0x%x is not in SRAM nor in DRAM\n", - parser->user_cb, parser->user_cb_size); + if (hl_mem_area_inside_range( + (u64) (uintptr_t) parser->user_cb, + parser->user_cb_size, + asic_prop->dram_user_base_address, + asic_prop->dram_end_address)) + return 0; - return -EFAULT; - } + dev_err(hdev->dev, + "Internal CB address %px + 0x%x is not in SRAM nor in DRAM\n", + parser->user_cb, parser->user_cb_size); - return 0; + return -EFAULT; } int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser) 
@@ -3895,7 +3894,7 @@ int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser) struct goya_device *goya = hdev->asic_specific; if (!parser->ext_queue) - return goya_parse_cb_no_ext_quque(hdev, parser); + return goya_parse_cb_no_ext_queue(hdev, parser); if ((goya->hw_cap_initialized & HW_CAP_MMU) && parser->use_virt_addr) return goya_parse_cb_mmu(hdev, parser); From d691171d61b635fa36860ca65c4f8fde718abd09 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sun, 21 Apr 2019 16:20:46 +0300 Subject: [PATCH 03/15] uapi/habanalabs: add missing fields in bmon params This patch adds missing fields of start address 0 and 1 in the bmon parameter structure that is received from the user in the debug IOCTL. Without these fields, the functionality of the bmon trace is broken, because there is no configuration of the base address of the filter of the bus monitor. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/goya/goya_coresight.c | 16 ++++++++++++---- include/uapi/misc/habanalabs.h | 9 ++++++--- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/misc/habanalabs/goya/goya_coresight.c b/drivers/misc/habanalabs/goya/goya_coresight.c index 68726fb4c56a..1ac951f52d1e 100644 --- a/drivers/misc/habanalabs/goya/goya_coresight.c +++ b/drivers/misc/habanalabs/goya/goya_coresight.c @@ -459,10 +459,14 @@ static int goya_config_bmon(struct hl_device *hdev, if (!input) return -EINVAL; - WREG32(base_reg + 0x208, lower_32_bits(input->addr_range0)); - WREG32(base_reg + 0x20C, upper_32_bits(input->addr_range0)); - WREG32(base_reg + 0x248, lower_32_bits(input->addr_range1)); - WREG32(base_reg + 0x24C, upper_32_bits(input->addr_range1)); + WREG32(base_reg + 0x200, lower_32_bits(input->start_addr0)); + WREG32(base_reg + 0x204, upper_32_bits(input->start_addr0)); + WREG32(base_reg + 0x208, lower_32_bits(input->addr_mask0)); + WREG32(base_reg + 0x20C, upper_32_bits(input->addr_mask0)); + WREG32(base_reg + 0x240, lower_32_bits(input->start_addr1)); + 
WREG32(base_reg + 0x244, upper_32_bits(input->start_addr1)); + WREG32(base_reg + 0x248, lower_32_bits(input->addr_mask1)); + WREG32(base_reg + 0x24C, upper_32_bits(input->addr_mask1)); WREG32(base_reg + 0x224, 0); WREG32(base_reg + 0x234, 0); WREG32(base_reg + 0x30C, input->bw_win); @@ -482,8 +486,12 @@ static int goya_config_bmon(struct hl_device *hdev, WREG32(base_reg + 0x100, 0x11); WREG32(base_reg + 0x304, 0x1); } else { + WREG32(base_reg + 0x200, 0); + WREG32(base_reg + 0x204, 0); WREG32(base_reg + 0x208, 0xFFFFFFFF); WREG32(base_reg + 0x20C, 0xFFFFFFFF); + WREG32(base_reg + 0x240, 0); + WREG32(base_reg + 0x244, 0); WREG32(base_reg + 0x248, 0xFFFFFFFF); WREG32(base_reg + 0x24C, 0xFFFFFFFF); WREG32(base_reg + 0x224, 0xFFFFFFFF); diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 613d431da783..8ac292cf4d00 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -374,9 +374,12 @@ struct hl_debug_params_stm { }; struct hl_debug_params_bmon { - /* Transaction address filter */ - __u64 addr_range0; - __u64 addr_range1; + /* Two address ranges that the user can request to filter */ + __u64 start_addr0; + __u64 addr_mask0; + + __u64 start_addr1; + __u64 addr_mask1; /* Capture window configuration */ __u32 bw_win; From b2377e032f17c3dd87739a97699f144ed00edf05 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 22 Apr 2019 11:49:06 +0300 Subject: [PATCH 04/15] habanalabs: use ASIC functions interface for rreg/wreg This patch slightly changes the macros of RREG32 and WREG32, which are used when reading or writing from registers. Instead of directly calling a function in the common code from these macros, the new code calls a function from the ASIC functions interface. This change allows us to share much more code between real ASICs and simulators, which in turn reduces the maintenance burden and the chances for forgetting to port code between the ASIC files. 
The patch also implements the hl_poll_timeout macro, instead of calling the generic readl_poll_timeout macro. This is required to allow use of this macro in the simulator files. As a result of this change, more functions in goya.c are shared with the simulator and therefore should not be defined as static. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/goya/goya.c | 43 +++++++++++----------------- drivers/misc/habanalabs/goya/goyaP.h | 40 ++++++++++++++++++-------- drivers/misc/habanalabs/habanalabs.h | 32 +++++++++++++++++---- 3 files changed, 71 insertions(+), 44 deletions(-) diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 3f707e8c408a..8ee3b00b0fab 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -297,7 +297,7 @@ static u32 goya_all_events[] = { GOYA_ASYNC_EVENT_ID_DMA_BM_CH4 }; -static void goya_get_fixed_properties(struct hl_device *hdev) +void goya_get_fixed_properties(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; int i; @@ -542,14 +542,7 @@ static void goya_fetch_psoc_frequency(struct hl_device *hdev) prop->psoc_pci_pll_div_factor = RREG32(mmPSOC_PCI_PLL_DIV_FACTOR_1); } -/* - * goya_late_init - GOYA late initialization code - * - * @hdev: pointer to hl_device structure - * - * Get ArmCP info and send message to CPU to enable PCI access - */ -static int goya_late_init(struct hl_device *hdev) +int goya_late_init(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; int rc; @@ -648,9 +641,6 @@ static int goya_sw_init(struct hl_device *hdev) goya->tpc_clk = GOYA_PLL_FREQ_LOW; goya->ic_clk = GOYA_PLL_FREQ_LOW; - goya->mmu_prepare_reg = goya_mmu_prepare_reg; - goya->qman0_set_security = goya_qman0_set_security; - hdev->asic_specific = goya; /* Create DMA pool for small allocations */ @@ -815,7 +805,7 @@ static void goya_init_dma_ch(struct hl_device *hdev, int dma_id) * Initialize the H/W registers of 
the QMAN DMA channels * */ -static void goya_init_dma_qmans(struct hl_device *hdev) +void goya_init_dma_qmans(struct hl_device *hdev) { struct goya_device *goya = hdev->asic_specific; struct hl_hw_queue *q; @@ -968,7 +958,7 @@ static int goya_stop_external_queues(struct hl_device *hdev) * Returns 0 on success * */ -static int goya_init_cpu_queues(struct hl_device *hdev) +int goya_init_cpu_queues(struct hl_device *hdev) { struct goya_device *goya = hdev->asic_specific; struct hl_eq *eq; @@ -1549,7 +1539,7 @@ static void goya_init_mme_cmdq(struct hl_device *hdev) WREG32(mmMME_CMDQ_GLBL_CFG0, CMDQ_MME_ENABLE); } -static void goya_init_mme_qmans(struct hl_device *hdev) +void goya_init_mme_qmans(struct hl_device *hdev) { struct goya_device *goya = hdev->asic_specific; u32 so_base_lo, so_base_hi; @@ -1656,7 +1646,7 @@ static void goya_init_tpc_cmdq(struct hl_device *hdev, int tpc_id) WREG32(mmTPC0_CMDQ_GLBL_CFG0 + reg_off, CMDQ_TPC_ENABLE); } -static void goya_init_tpc_qmans(struct hl_device *hdev) +void goya_init_tpc_qmans(struct hl_device *hdev) { struct goya_device *goya = hdev->asic_specific; u32 so_base_lo, so_base_hi; @@ -2373,7 +2363,7 @@ static int goya_mmu_update_asid_hop0_addr(struct hl_device *hdev, u32 asid, return 0; } -static int goya_mmu_init(struct hl_device *hdev) +int goya_mmu_init(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; struct goya_device *goya = hdev->asic_specific; @@ -2649,7 +2639,7 @@ static int goya_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma, return rc; } -static void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi) +void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi) { u32 db_reg_offset, db_value; bool invalid_queue = false; @@ -2816,7 +2806,6 @@ void *goya_get_int_queue_base(struct hl_device *hdev, u32 queue_id, static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job) { - struct goya_device *goya = hdev->asic_specific; struct 
packet_msg_prot *fence_pkt; u32 *fence_ptr; dma_addr_t fence_dma_addr; @@ -2847,7 +2836,7 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job) *fence_ptr = 0; - goya->qman0_set_security(hdev, true); + goya_qman0_set_security(hdev, true); /* * goya cs parser saves space for 2xpacket_msg_prot at end of CB. For @@ -2889,7 +2878,7 @@ free_fence_ptr: hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_ptr, fence_dma_addr); - goya->qman0_set_security(hdev, false); + goya_qman0_set_security(hdev, false); return rc; } @@ -3927,12 +3916,12 @@ void goya_add_end_of_cb_packets(u64 kernel_address, u32 len, u64 cq_addr, cq_pkt->addr = cpu_to_le64(CFG_BASE + mmPCIE_DBI_MSIX_DOORBELL_OFF); } -static void goya_update_eq_ci(struct hl_device *hdev, u32 val) +void goya_update_eq_ci(struct hl_device *hdev, u32 val) { WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_6, val); } -static void goya_restore_phase_topology(struct hl_device *hdev) +void goya_restore_phase_topology(struct hl_device *hdev) { int i, num_of_sob_in_longs, num_of_mon_in_longs; @@ -4494,7 +4483,7 @@ release_cb: return rc; } -static int goya_context_switch(struct hl_device *hdev, u32 asid) +int goya_context_switch(struct hl_device *hdev, u32 asid) { struct asic_fixed_properties *prop = &hdev->asic_prop; u64 addr = prop->sram_base_address; @@ -4556,7 +4545,7 @@ void goya_mmu_prepare(struct hl_device *hdev, u32 asid) /* zero the MMBP and ASID bits and then set the ASID */ for (i = 0 ; i < GOYA_MMU_REGS_NUM ; i++) - goya->mmu_prepare_reg(hdev, goya_mmu_regs[i], asid); + goya_mmu_prepare_reg(hdev, goya_mmu_regs[i], asid); } static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard) @@ -4829,7 +4818,9 @@ static const struct hl_asic_funcs goya_funcs = { .get_hw_state = goya_get_hw_state, .pci_bars_map = goya_pci_bars_map, .set_dram_bar_base = goya_set_ddr_bar_base, - .init_iatu = goya_init_iatu + .init_iatu = goya_init_iatu, + .rreg = hl_rreg, + .wreg = hl_wreg }; /* diff --git 
a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h index b572e0263ac5..14e216cb3668 100644 --- a/drivers/misc/habanalabs/goya/goyaP.h +++ b/drivers/misc/habanalabs/goya/goyaP.h @@ -147,9 +147,6 @@ enum goya_fw_component { }; struct goya_device { - void (*mmu_prepare_reg)(struct hl_device *hdev, u64 reg, u32 asid); - void (*qman0_set_security)(struct hl_device *hdev, bool secure); - /* TODO: remove hw_queues_lock after moving to scheduler code */ spinlock_t hw_queues_lock; @@ -162,13 +159,34 @@ struct goya_device { u32 hw_cap_initialized; }; +void goya_get_fixed_properties(struct hl_device *hdev); +int goya_mmu_init(struct hl_device *hdev); +void goya_init_dma_qmans(struct hl_device *hdev); +void goya_init_mme_qmans(struct hl_device *hdev); +void goya_init_tpc_qmans(struct hl_device *hdev); +int goya_init_cpu_queues(struct hl_device *hdev); +void goya_init_security(struct hl_device *hdev); +int goya_late_init(struct hl_device *hdev); +void goya_late_fini(struct hl_device *hdev); + +void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi); +void goya_flush_pq_write(struct hl_device *hdev, u64 *pq, u64 exp_val); +void goya_update_eq_ci(struct hl_device *hdev, u32 val); +void goya_restore_phase_topology(struct hl_device *hdev); +int goya_context_switch(struct hl_device *hdev, u32 asid); + int goya_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr, u8 i2c_reg, u32 *val); int goya_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr, u8 i2c_reg, u32 val); +void goya_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state); + +int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id); +int goya_test_queues(struct hl_device *hdev); int goya_test_cpu_queue(struct hl_device *hdev); int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len, u32 timeout, long *result); + long goya_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr); long goya_get_voltage(struct hl_device *hdev, 
int sensor_index, u32 attr); long goya_get_current(struct hl_device *hdev, int sensor_index, u32 attr); @@ -176,33 +194,31 @@ long goya_get_fan_speed(struct hl_device *hdev, int sensor_index, u32 attr); long goya_get_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr); void goya_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr, long value); -void goya_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state); +u64 goya_get_max_power(struct hl_device *hdev); +void goya_set_max_power(struct hl_device *hdev, u64 value); + void goya_set_pll_profile(struct hl_device *hdev, enum hl_pll_frequency freq); void goya_add_device_attr(struct hl_device *hdev, struct attribute_group *dev_attr_grp); int goya_armcp_info_get(struct hl_device *hdev); -void goya_init_security(struct hl_device *hdev); int goya_debug_coresight(struct hl_device *hdev, void *data); -u64 goya_get_max_power(struct hl_device *hdev); -void goya_set_max_power(struct hl_device *hdev, u64 value); -int goya_test_queues(struct hl_device *hdev); + void goya_mmu_prepare(struct hl_device *hdev, u32 asid); int goya_mmu_clear_pgt_range(struct hl_device *hdev); int goya_mmu_set_dram_default_page(struct hl_device *hdev); -void goya_late_fini(struct hl_device *hdev); int goya_suspend(struct hl_device *hdev); int goya_resume(struct hl_device *hdev); -void goya_flush_pq_write(struct hl_device *hdev, u64 *pq, u64 exp_val); + void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry); void *goya_get_events_stat(struct hl_device *hdev, u32 *size); + void goya_add_end_of_cb_packets(u64 kernel_address, u32 len, u64 cq_addr, u32 cq_val, u32 msix_vec); int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser); void *goya_get_int_queue_base(struct hl_device *hdev, u32 queue_id, - dma_addr_t *dma_handle, u16 *queue_len); + dma_addr_t *dma_handle, u16 *queue_len); u32 goya_get_dma_desc_list_size(struct hl_device *hdev, struct sg_table *sgt); -int goya_test_queue(struct hl_device 
*hdev, u32 hw_queue_id); int goya_send_heartbeat(struct hl_device *hdev); void *goya_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle); diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 86bd5298efd6..e8bbaf0f26c1 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -489,6 +489,8 @@ enum hl_pll_frequency { * @pci_bars_map: Map PCI BARs. * @set_dram_bar_base: Set DRAM BAR to map specific device address. * @init_iatu: Initialize the iATU unit inside the PCI controller. + * @rreg: Read a register. Needed for simulator support. + * @wreg: Write a register. Needed for simulator support. */ struct hl_asic_funcs { int (*early_init)(struct hl_device *hdev); @@ -564,6 +566,8 @@ struct hl_asic_funcs { int (*pci_bars_map)(struct hl_device *hdev); int (*set_dram_bar_base)(struct hl_device *hdev, u64 addr); int (*init_iatu)(struct hl_device *hdev); + u32 (*rreg)(struct hl_device *hdev, u32 reg); + void (*wreg)(struct hl_device *hdev, u32 reg, u32 val); }; @@ -1007,13 +1011,10 @@ struct hl_dbg_device_entry { u32 hl_rreg(struct hl_device *hdev, u32 reg); void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); -#define hl_poll_timeout(hdev, addr, val, cond, sleep_us, timeout_us) \ - readl_poll_timeout(hdev->rmmio + addr, val, cond, sleep_us, timeout_us) - -#define RREG32(reg) hl_rreg(hdev, (reg)) -#define WREG32(reg, v) hl_wreg(hdev, (reg), (v)) +#define RREG32(reg) hdev->asic_funcs->rreg(hdev, (reg)) +#define WREG32(reg, v) hdev->asic_funcs->wreg(hdev, (reg), (v)) #define DREG32(reg) pr_info("REGISTER: " #reg " : 0x%08X\n", \ - hl_rreg(hdev, (reg))) + hdev->asic_funcs->rreg(hdev, (reg))) #define WREG32_P(reg, val, mask) \ do { \ @@ -1031,6 +1032,25 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); WREG32(mm##reg, (RREG32(mm##reg) & ~REG_FIELD_MASK(reg, field)) | \ (val) << REG_FIELD_SHIFT(reg, field)) +#define hl_poll_timeout(hdev, addr, val, 
cond, sleep_us, timeout_us) \ +({ \ + ktime_t __timeout = ktime_add_us(ktime_get(), timeout_us); \ + might_sleep_if(sleep_us); \ + for (;;) { \ + (val) = RREG32(addr); \ + if (cond) \ + break; \ + if (timeout_us && ktime_compare(ktime_get(), __timeout) > 0) { \ + (val) = RREG32(addr); \ + break; \ + } \ + if (sleep_us) \ + usleep_range((sleep_us >> 2) + 1, sleep_us); \ + } \ + (cond) ? 0 : -ETIMEDOUT; \ +}) + + #define HL_ENG_BUSY(buf, size, fmt, ...) ({ \ if (buf) \ snprintf(buf, size, fmt, ##__VA_ARGS__); \ From 027d35d0b6999c02de4c1ef86d0df4b5f4119167 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Thu, 25 Apr 2019 20:15:42 +0300 Subject: [PATCH 05/15] habanalabs: rename restore to ctx_switch when appropriate This patch only does renaming of certain variables and structure members, and their accompanied comments. This is done to better reflect the actions these variables and members represent. There is no functional change in this patch. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/command_submission.c | 16 ++++++++-------- drivers/misc/habanalabs/context.c | 4 ++-- drivers/misc/habanalabs/device.c | 6 +++--- drivers/misc/habanalabs/habanalabs.h | 17 +++++++++-------- 4 files changed, 22 insertions(+), 21 deletions(-) diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c index 02c48da0b645..c4ab694b51b5 100644 --- a/drivers/misc/habanalabs/command_submission.c +++ b/drivers/misc/habanalabs/command_submission.c @@ -601,7 +601,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) void __user *chunks; u32 num_chunks; u64 cs_seq = ULONG_MAX; - int rc, do_restore; + int rc, do_ctx_switch; bool need_soft_reset = false; if (hl_device_disabled_or_in_reset(hdev)) { @@ -612,9 +612,9 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) goto out; } - do_restore = atomic_cmpxchg(&ctx->thread_restore_token, 1, 0); + do_ctx_switch = atomic_cmpxchg(&ctx->thread_ctx_switch_token, 1, 0); - if (do_restore || 
(args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) { + if (do_ctx_switch || (args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) { long ret; chunks = (void __user *)(uintptr_t)args->in.chunks_restore; @@ -622,7 +622,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) mutex_lock(&hpriv->restore_phase_mutex); - if (do_restore) { + if (do_ctx_switch) { rc = hdev->asic_funcs->context_switch(hdev, ctx->asid); if (rc) { dev_err_ratelimited(hdev->dev, @@ -678,18 +678,18 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) } } - ctx->thread_restore_wait_token = 1; - } else if (!ctx->thread_restore_wait_token) { + ctx->thread_ctx_switch_wait_token = 1; + } else if (!ctx->thread_ctx_switch_wait_token) { u32 tmp; rc = hl_poll_timeout_memory(hdev, - (u64) (uintptr_t) &ctx->thread_restore_wait_token, + (u64) (uintptr_t) &ctx->thread_ctx_switch_wait_token, jiffies_to_usecs(hdev->timeout_jiffies), &tmp); if (rc || !tmp) { dev_err(hdev->dev, - "restore phase hasn't finished in time\n"); + "context switch phase didn't finish in time\n"); rc = -ETIMEDOUT; goto out; } diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/context.c index 619ace1c4ef7..4804cdcf4c48 100644 --- a/drivers/misc/habanalabs/context.c +++ b/drivers/misc/habanalabs/context.c @@ -106,8 +106,8 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) ctx->cs_sequence = 1; spin_lock_init(&ctx->cs_lock); - atomic_set(&ctx->thread_restore_token, 1); - ctx->thread_restore_wait_token = 0; + atomic_set(&ctx->thread_ctx_switch_token, 1); + ctx->thread_ctx_switch_wait_token = 0; if (is_kernel_ctx) { ctx->asid = HL_KERNEL_ASID_ID; /* KMD gets ASID 0 */ diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c index a88f7be23c7f..0e0b9ec71c80 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/device.c @@ -710,10 +710,10 @@ again: for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) hl_cq_reset(hdev, 
&hdev->completion_queue[i]); - /* Make sure the setup phase for the user context will run again */ + /* Make sure the context switch phase will run again */ if (hdev->user_ctx) { - atomic_set(&hdev->user_ctx->thread_restore_token, 1); - hdev->user_ctx->thread_restore_wait_token = 0; + atomic_set(&hdev->user_ctx->thread_ctx_switch_token, 1); + hdev->user_ctx->thread_ctx_switch_wait_token = 0; } /* Finished tear-down, starting to re-initialize */ diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index e8bbaf0f26c1..a624d1e1e1e5 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -615,12 +615,13 @@ struct hl_va_range { * DRAM mapping. * @cs_lock: spinlock to protect cs_sequence. * @dram_phys_mem: amount of used physical DRAM memory by this context. - * @thread_restore_token: token to prevent multiple threads of the same context - * from running the restore phase. Only one thread - * should run it. - * @thread_restore_wait_token: token to prevent the threads that didn't run - * the restore phase from moving to their execution - * phase before the restore phase has finished. + * @thread_ctx_switch_token: token to prevent multiple threads of the same + * context from running the context switch phase. + * Only a single thread should run it. + * @thread_ctx_switch_wait_token: token to prevent the threads that didn't run + * the context switch phase from moving to their + * execution phase before the context switch phase + * has finished. * @asid: context's unique address space ID in the device's MMU. 
*/ struct hl_ctx { @@ -640,8 +641,8 @@ struct hl_ctx { u64 *dram_default_hops; spinlock_t cs_lock; atomic64_t dram_phys_mem; - atomic_t thread_restore_token; - u32 thread_restore_wait_token; + atomic_t thread_ctx_switch_token; + u32 thread_ctx_switch_wait_token; u32 asid; }; From a38693d77576145673a7b3d4d771d992282ad13b Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sun, 28 Apr 2019 10:18:35 +0300 Subject: [PATCH 06/15] habanalabs: return old dram bar address upon change This patch changes the ASIC interface function that changes the DRAM bar window. The change is to return the old address that the DRAM bar pointed to instead of an error code. This simplifies the code that uses this function (mainly in debugfs) to restore the bar to the old setting. This is also needed for easier support in future ASICs. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/goya/goya.c | 60 +++++++++++++--------------- drivers/misc/habanalabs/habanalabs.h | 5 ++- drivers/misc/habanalabs/pci.c | 5 ++- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 8ee3b00b0fab..04e4ed8a0be6 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -389,33 +389,26 @@ static int goya_pci_bars_map(struct hl_device *hdev) return 0; } -/* - * goya_set_ddr_bar_base - set DDR bar to map specific device address - * - * @hdev: pointer to hl_device structure - * @addr: address in DDR. Must be aligned to DDR bar size - * - * This function configures the iATU so that the DDR bar will start at the - * specified addr. 
- * - */ -static int goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr) +static u64 goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr) { struct goya_device *goya = hdev->asic_specific; + u64 old_addr = addr; int rc; if ((goya) && (goya->ddr_bar_cur_addr == addr)) - return 0; + return old_addr; /* Inbound Region 1 - Bar 4 - Point to DDR */ rc = hl_pci_set_dram_bar_base(hdev, 1, 4, addr); if (rc) - return rc; + return U64_MAX; - if (goya) + if (goya) { + old_addr = goya->ddr_bar_cur_addr; goya->ddr_bar_cur_addr = addr; + } - return 0; + return old_addr; } /* @@ -2215,11 +2208,10 @@ static int goya_init_cpu(struct hl_device *hdev, u32 cpu_timeout) * Before pushing u-boot/linux to device, need to set the ddr bar to * base address of dram */ - rc = goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE); - if (rc) { + if (goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE) == U64_MAX) { dev_err(hdev->dev, "failed to map DDR bar to DRAM base address\n"); - return rc; + return -EIO; } if (hdev->pldm) { @@ -2454,12 +2446,12 @@ static int goya_hw_init(struct hl_device *hdev) * After CPU initialization is finished, change DDR bar mapping inside * iATU to point to the start address of the MMU page tables */ - rc = goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE + - (MMU_PAGE_TABLES_ADDR & ~(prop->dram_pci_bar_size - 0x1ull))); - if (rc) { + if (goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE + + (MMU_PAGE_TABLES_ADDR & + ~(prop->dram_pci_bar_size - 0x1ull))) == U64_MAX) { dev_err(hdev->dev, "failed to map DDR bar to MMU page tables\n"); - return rc; + return -EIO; } rc = goya_mmu_init(hdev); @@ -3958,6 +3950,7 @@ void goya_restore_phase_topology(struct hl_device *hdev) static int goya_debugfs_read32(struct hl_device *hdev, u64 addr, u32 *val) { struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 ddr_bar_addr; int rc = 0; if ((addr >= CFG_BASE) && (addr < CFG_BASE + CFG_SIZE)) { @@ -3975,15 +3968,16 @@ static int goya_debugfs_read32(struct hl_device *hdev, u64 addr, u32 *val) u64 
bar_base_addr = DRAM_PHYS_BASE + (addr & ~(prop->dram_pci_bar_size - 0x1ull)); - rc = goya_set_ddr_bar_base(hdev, bar_base_addr); - if (!rc) { + ddr_bar_addr = goya_set_ddr_bar_base(hdev, bar_base_addr); + if (ddr_bar_addr != U64_MAX) { *val = readl(hdev->pcie_bar[DDR_BAR_ID] + (addr - bar_base_addr)); - rc = goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE + - (MMU_PAGE_TABLES_ADDR & - ~(prop->dram_pci_bar_size - 0x1ull))); + ddr_bar_addr = goya_set_ddr_bar_base(hdev, + ddr_bar_addr); } + if (ddr_bar_addr == U64_MAX) + rc = -EIO; } else { rc = -EFAULT; } @@ -4008,6 +4002,7 @@ static int goya_debugfs_read32(struct hl_device *hdev, u64 addr, u32 *val) static int goya_debugfs_write32(struct hl_device *hdev, u64 addr, u32 val) { struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 ddr_bar_addr; int rc = 0; if ((addr >= CFG_BASE) && (addr < CFG_BASE + CFG_SIZE)) { @@ -4025,15 +4020,16 @@ static int goya_debugfs_write32(struct hl_device *hdev, u64 addr, u32 val) u64 bar_base_addr = DRAM_PHYS_BASE + (addr & ~(prop->dram_pci_bar_size - 0x1ull)); - rc = goya_set_ddr_bar_base(hdev, bar_base_addr); - if (!rc) { + ddr_bar_addr = goya_set_ddr_bar_base(hdev, bar_base_addr); + if (ddr_bar_addr != U64_MAX) { writel(val, hdev->pcie_bar[DDR_BAR_ID] + (addr - bar_base_addr)); - rc = goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE + - (MMU_PAGE_TABLES_ADDR & - ~(prop->dram_pci_bar_size - 0x1ull))); + ddr_bar_addr = goya_set_ddr_bar_base(hdev, + ddr_bar_addr); } + if (ddr_bar_addr == U64_MAX) + rc = -EIO; } else { rc = -EFAULT; } diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index a624d1e1e1e5..65717e4055da 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -487,7 +487,8 @@ enum hl_pll_frequency { * @send_cpu_message: send buffer to ArmCP. * @get_hw_state: retrieve the H/W state * @pci_bars_map: Map PCI BARs. - * @set_dram_bar_base: Set DRAM BAR to map specific device address. 
+ * @set_dram_bar_base: Set DRAM BAR to map specific device address. Returns + * old address the bar pointed to or U64_MAX for failure * @init_iatu: Initialize the iATU unit inside the PCI controller. * @rreg: Read a register. Needed for simulator support. * @wreg: Write a register. Needed for simulator support. @@ -564,7 +565,7 @@ struct hl_asic_funcs { u16 len, u32 timeout, long *result); enum hl_device_hw_state (*get_hw_state)(struct hl_device *hdev); int (*pci_bars_map)(struct hl_device *hdev); - int (*set_dram_bar_base)(struct hl_device *hdev, u64 addr); + u64 (*set_dram_bar_base)(struct hl_device *hdev, u64 addr); int (*init_iatu)(struct hl_device *hdev); u32 (*rreg)(struct hl_device *hdev, u32 reg); void (*wreg)(struct hl_device *hdev, u32 reg, u32 val); diff --git a/drivers/misc/habanalabs/pci.c b/drivers/misc/habanalabs/pci.c index d472d02a8e6e..5278f086d65d 100644 --- a/drivers/misc/habanalabs/pci.c +++ b/drivers/misc/habanalabs/pci.c @@ -259,7 +259,10 @@ int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address, /* Point to DRAM */ if (!hdev->asic_funcs->set_dram_bar_base) return -EINVAL; - rc |= hdev->asic_funcs->set_dram_bar_base(hdev, dram_base_address); + if (hdev->asic_funcs->set_dram_bar_base(hdev, dram_base_address) == + U64_MAX) + return -EIO; + /* Outbound Region 0 - Point to Host */ host_phys_end_addr = prop->host_phys_base_address + host_phys_size - 1; From 03d5f641dc711eb93145ded91ed68b4be729be4d Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Sun, 28 Apr 2019 19:17:38 +0300 Subject: [PATCH 07/15] habanalabs: Use single pool for CPU accessible host memory The device's CPU accessible memory on host is managed in a dedicated pool, except for 2 regions - Primary Queue (PQ) and Event Queue (EQ) - which are allocated from generic DMA pools. Due to address length limitations of the CPU, the addresses of all these memory regions must have the same MSBs starting at bit 40. 
This patch modifies the allocation of the PQ and EQ to be also from the dedicated pool, to ensure compliance with the limitation. Signed-off-by: Tomer Tayar Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/habanalabs.h | 12 +++++++ drivers/misc/habanalabs/hw_queue.c | 40 ++++++++++++++++------ drivers/misc/habanalabs/include/armcp_if.h | 8 ----- drivers/misc/habanalabs/irq.c | 10 +++--- 4 files changed, 48 insertions(+), 22 deletions(-) diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 65717e4055da..687651db614c 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -321,6 +321,18 @@ struct hl_cs_job; #define HL_EQ_LENGTH 64 #define HL_EQ_SIZE_IN_BYTES (HL_EQ_LENGTH * HL_EQ_ENTRY_SIZE) +#define HL_CPU_PKT_SHIFT 5 +#define HL_CPU_PKT_SIZE (1 << HL_CPU_PKT_SHIFT) +#define HL_CPU_PKT_MASK (~((1 << HL_CPU_PKT_SHIFT) - 1)) +#define HL_CPU_MAX_PKTS_IN_CB 32 +#define HL_CPU_CB_SIZE (HL_CPU_PKT_SIZE * \ + HL_CPU_MAX_PKTS_IN_CB) +#define HL_CPU_CB_QUEUE_SIZE (HL_QUEUE_LENGTH * HL_CPU_CB_SIZE) + +/* KMD <-> ArmCP shared memory size (EQ + PQ + CPU CB queue) */ +#define HL_CPU_ACCESSIBLE_MEM_SIZE (HL_EQ_SIZE_IN_BYTES + \ + HL_QUEUE_SIZE_IN_BYTES + \ + HL_CPU_CB_QUEUE_SIZE) /** * struct hl_hw_queue - describes a H/W transport queue. 
diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c index ef3bb6951360..a1ee52cfd505 100644 --- a/drivers/misc/habanalabs/hw_queue.c +++ b/drivers/misc/habanalabs/hw_queue.c @@ -415,14 +415,20 @@ void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id) } static int ext_and_cpu_hw_queue_init(struct hl_device *hdev, - struct hl_hw_queue *q) + struct hl_hw_queue *q, bool is_cpu_queue) { void *p; int rc; - p = hdev->asic_funcs->dma_alloc_coherent(hdev, - HL_QUEUE_SIZE_IN_BYTES, - &q->bus_address, GFP_KERNEL | __GFP_ZERO); + if (is_cpu_queue) + p = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, + HL_QUEUE_SIZE_IN_BYTES, + &q->bus_address); + else + p = hdev->asic_funcs->dma_alloc_coherent(hdev, + HL_QUEUE_SIZE_IN_BYTES, + &q->bus_address, + GFP_KERNEL | __GFP_ZERO); if (!p) return -ENOMEM; @@ -446,8 +452,15 @@ static int ext_and_cpu_hw_queue_init(struct hl_device *hdev, return 0; free_queue: - hdev->asic_funcs->dma_free_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, - (void *) (uintptr_t) q->kernel_address, q->bus_address); + if (is_cpu_queue) + hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, + HL_QUEUE_SIZE_IN_BYTES, + (void *) (uintptr_t) q->kernel_address); + else + hdev->asic_funcs->dma_free_coherent(hdev, + HL_QUEUE_SIZE_IN_BYTES, + (void *) (uintptr_t) q->kernel_address, + q->bus_address); return rc; } @@ -474,12 +487,12 @@ static int int_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) static int cpu_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) { - return ext_and_cpu_hw_queue_init(hdev, q); + return ext_and_cpu_hw_queue_init(hdev, q, true); } static int ext_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) { - return ext_and_cpu_hw_queue_init(hdev, q); + return ext_and_cpu_hw_queue_init(hdev, q, false); } /* @@ -569,8 +582,15 @@ static void hw_queue_fini(struct hl_device *hdev, struct hl_hw_queue *q) kfree(q->shadow_queue); - hdev->asic_funcs->dma_free_coherent(hdev, 
HL_QUEUE_SIZE_IN_BYTES, - (void *) (uintptr_t) q->kernel_address, q->bus_address); + if (q->queue_type == QUEUE_TYPE_CPU) + hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, + HL_QUEUE_SIZE_IN_BYTES, + (void *) (uintptr_t) q->kernel_address); + else + hdev->asic_funcs->dma_free_coherent(hdev, + HL_QUEUE_SIZE_IN_BYTES, + (void *) (uintptr_t) q->kernel_address, + q->bus_address); } int hl_hw_queues_create(struct hl_device *hdev) diff --git a/drivers/misc/habanalabs/include/armcp_if.h b/drivers/misc/habanalabs/include/armcp_if.h index c8f28cadc335..1f1e35e86d84 100644 --- a/drivers/misc/habanalabs/include/armcp_if.h +++ b/drivers/misc/habanalabs/include/armcp_if.h @@ -300,14 +300,6 @@ enum armcp_pwm_attributes { armcp_pwm_enable }; -#define HL_CPU_PKT_SHIFT 5 -#define HL_CPU_PKT_SIZE (1 << HL_CPU_PKT_SHIFT) -#define HL_CPU_PKT_MASK (~((1 << HL_CPU_PKT_SHIFT) - 1)) -#define HL_CPU_MAX_PKTS_IN_CB 32 -#define HL_CPU_CB_SIZE (HL_CPU_PKT_SIZE * \ - HL_CPU_MAX_PKTS_IN_CB) -#define HL_CPU_ACCESSIBLE_MEM_SIZE (HL_QUEUE_LENGTH * HL_CPU_CB_SIZE) - /* Event Queue Packets */ struct eq_generic_event { diff --git a/drivers/misc/habanalabs/irq.c b/drivers/misc/habanalabs/irq.c index e69a09c10e3f..86a8ad57f1ca 100644 --- a/drivers/misc/habanalabs/irq.c +++ b/drivers/misc/habanalabs/irq.c @@ -284,8 +284,9 @@ int hl_eq_init(struct hl_device *hdev, struct hl_eq *q) BUILD_BUG_ON(HL_EQ_SIZE_IN_BYTES > HL_PAGE_SIZE); - p = hdev->asic_funcs->dma_alloc_coherent(hdev, HL_EQ_SIZE_IN_BYTES, - &q->bus_address, GFP_KERNEL | __GFP_ZERO); + p = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, + HL_EQ_SIZE_IN_BYTES, + &q->bus_address); if (!p) return -ENOMEM; @@ -308,8 +309,9 @@ void hl_eq_fini(struct hl_device *hdev, struct hl_eq *q) { flush_workqueue(hdev->eq_wq); - hdev->asic_funcs->dma_free_coherent(hdev, HL_EQ_SIZE_IN_BYTES, - (void *) (uintptr_t) q->kernel_address, q->bus_address); + hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, + HL_EQ_SIZE_IN_BYTES, + (void *) (uintptr_t) 
q->kernel_address); } void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q) From 3706b47006a9c8aa867d314f913d3b1310163d63 Mon Sep 17 00:00:00 2001 From: Dalit Ben Zoor Date: Tue, 30 Apr 2019 15:22:14 +0300 Subject: [PATCH 08/15] habanalabs: remove call to cs_parser() There is no need to parse the command submission when doing memset of the device memory using the DMA engine because only the driver calls the memset function and therefore, the CS is trusted and doesn't require validation and patching. Signed-off-by: Dalit Ben Zoor Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/goya/goya.c | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 04e4ed8a0be6..9fc8b6e1369d 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -4398,7 +4398,6 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u32 size, u64 val, bool is_dram) { struct packet_lin_dma *lin_dma_pkt; - struct hl_cs_parser parser; struct hl_cs_job *job; u32 cb_size, ctl; struct hl_cb *cb; @@ -4438,36 +4437,16 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u32 size, job->user_cb->cs_cnt++; job->user_cb_size = cb_size; job->hw_queue_id = GOYA_QUEUE_ID_DMA_0; + job->patched_cb = job->user_cb; + job->job_cb_size = job->user_cb_size + + sizeof(struct packet_msg_prot) * 2; hl_debugfs_add_job(hdev, job); - parser.ctx_id = HL_KERNEL_ASID_ID; - parser.cs_sequence = 0; - parser.job_id = job->id; - parser.hw_queue_id = job->hw_queue_id; - parser.job_userptr_list = &job->userptr_list; - parser.user_cb = job->user_cb; - parser.user_cb_size = job->user_cb_size; - parser.ext_queue = job->ext_queue; - parser.use_virt_addr = hdev->mmu_enable; - - rc = hdev->asic_funcs->cs_parser(hdev, &parser); - if (rc) { - dev_err(hdev->dev, "Failed to parse kernel CB\n"); - goto free_job; - } - - job->patched_cb = 
parser.patched_cb; - job->job_cb_size = parser.patched_cb_size; - job->patched_cb->cs_cnt++; - rc = goya_send_job_on_qman0(hdev, job); - job->patched_cb->cs_cnt--; hl_cb_put(job->patched_cb); -free_job: - hl_userptr_delete_list(hdev, &job->userptr_list); hl_debugfs_remove_job(hdev, job); kfree(job); cb->cs_cnt--; From d9c3aa8038c391f38a391289989ca0ac356a9501 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Wed, 1 May 2019 11:47:04 +0300 Subject: [PATCH 09/15] habanalabs: rename functions to improve code readability This patch renames four functions in the ASIC-specific functions section, so it will be easier to differentiate them from the generic kernel functions with the same name. This will help in future code reviews, to make sure we don't use the kernel functions directly. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/command_buffer.c | 6 ++--- drivers/misc/habanalabs/goya/goya.c | 28 ++++++++++++---------- drivers/misc/habanalabs/habanalabs.h | 30 ++++++++++++------------ drivers/misc/habanalabs/hw_queue.c | 6 ++--- drivers/misc/habanalabs/irq.c | 4 ++-- 5 files changed, 38 insertions(+), 36 deletions(-) diff --git a/drivers/misc/habanalabs/command_buffer.c b/drivers/misc/habanalabs/command_buffer.c index b1ffca47d748..e495f44064fa 100644 --- a/drivers/misc/habanalabs/command_buffer.c +++ b/drivers/misc/habanalabs/command_buffer.c @@ -13,7 +13,7 @@ static void cb_fini(struct hl_device *hdev, struct hl_cb *cb) { - hdev->asic_funcs->dma_free_coherent(hdev, cb->size, + hdev->asic_funcs->asic_dma_free_coherent(hdev, cb->size, (void *) (uintptr_t) cb->kernel_address, cb->bus_address); kfree(cb); @@ -66,10 +66,10 @@ static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size, return NULL; if (ctx_id == HL_KERNEL_ASID_ID) - p = hdev->asic_funcs->dma_alloc_coherent(hdev, cb_size, + p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, cb_size, &cb->bus_address, GFP_ATOMIC); else - p = hdev->asic_funcs->dma_alloc_coherent(hdev, cb_size, + p = 
hdev->asic_funcs->asic_dma_alloc_coherent(hdev, cb_size, &cb->bus_address, GFP_USER | __GFP_ZERO); if (!p) { diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 9fc8b6e1369d..8e18c80a22e7 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -646,7 +646,7 @@ static int goya_sw_init(struct hl_device *hdev) } hdev->cpu_accessible_dma_mem = - hdev->asic_funcs->dma_alloc_coherent(hdev, + hdev->asic_funcs->asic_dma_alloc_coherent(hdev, HL_CPU_ACCESSIBLE_MEM_SIZE, &hdev->cpu_accessible_dma_address, GFP_KERNEL | __GFP_ZERO); @@ -681,7 +681,8 @@ static int goya_sw_init(struct hl_device *hdev) free_cpu_pq_pool: gen_pool_destroy(hdev->cpu_accessible_dma_pool); free_cpu_pq_dma_mem: - hdev->asic_funcs->dma_free_coherent(hdev, HL_CPU_ACCESSIBLE_MEM_SIZE, + hdev->asic_funcs->asic_dma_free_coherent(hdev, + HL_CPU_ACCESSIBLE_MEM_SIZE, hdev->cpu_accessible_dma_mem, hdev->cpu_accessible_dma_address); free_dma_pool: @@ -704,7 +705,8 @@ static int goya_sw_fini(struct hl_device *hdev) gen_pool_destroy(hdev->cpu_accessible_dma_pool); - hdev->asic_funcs->dma_free_coherent(hdev, HL_CPU_ACCESSIBLE_MEM_SIZE, + hdev->asic_funcs->asic_dma_free_coherent(hdev, + HL_CPU_ACCESSIBLE_MEM_SIZE, hdev->cpu_accessible_dma_mem, hdev->cpu_accessible_dma_address); @@ -2818,7 +2820,7 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job) return -EBUSY; } - fence_ptr = hdev->asic_funcs->dma_pool_zalloc(hdev, 4, GFP_KERNEL, + fence_ptr = hdev->asic_funcs->asic_dma_pool_zalloc(hdev, 4, GFP_KERNEL, &fence_dma_addr); if (!fence_ptr) { dev_err(hdev->dev, @@ -2867,7 +2869,7 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job) } free_fence_ptr: - hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_ptr, + hdev->asic_funcs->asic_dma_pool_free(hdev, (void *) fence_ptr, fence_dma_addr); goya_qman0_set_security(hdev, false); @@ -2901,7 +2903,7 @@ int goya_test_queue(struct 
hl_device *hdev, u32 hw_queue_id) fence_val = GOYA_QMAN0_FENCE_VAL; - fence_ptr = hdev->asic_funcs->dma_pool_zalloc(hdev, 4, GFP_KERNEL, + fence_ptr = hdev->asic_funcs->asic_dma_pool_zalloc(hdev, 4, GFP_KERNEL, &fence_dma_addr); if (!fence_ptr) { dev_err(hdev->dev, @@ -2911,7 +2913,7 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id) *fence_ptr = 0; - fence_pkt = hdev->asic_funcs->dma_pool_zalloc(hdev, + fence_pkt = hdev->asic_funcs->asic_dma_pool_zalloc(hdev, sizeof(struct packet_msg_prot), GFP_KERNEL, &pkt_dma_addr); if (!fence_pkt) { @@ -2955,10 +2957,10 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id) } free_pkt: - hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_pkt, + hdev->asic_funcs->asic_dma_pool_free(hdev, (void *) fence_pkt, pkt_dma_addr); free_fence_ptr: - hdev->asic_funcs->dma_pool_free(hdev, (void *) fence_ptr, + hdev->asic_funcs->asic_dma_pool_free(hdev, (void *) fence_ptr, fence_dma_addr); return rc; } @@ -4755,12 +4757,12 @@ static const struct hl_asic_funcs goya_funcs = { .cb_mmap = goya_cb_mmap, .ring_doorbell = goya_ring_doorbell, .flush_pq_write = goya_flush_pq_write, - .dma_alloc_coherent = goya_dma_alloc_coherent, - .dma_free_coherent = goya_dma_free_coherent, + .asic_dma_alloc_coherent = goya_dma_alloc_coherent, + .asic_dma_free_coherent = goya_dma_free_coherent, .get_int_queue_base = goya_get_int_queue_base, .test_queues = goya_test_queues, - .dma_pool_zalloc = goya_dma_pool_zalloc, - .dma_pool_free = goya_dma_pool_free, + .asic_dma_pool_zalloc = goya_dma_pool_zalloc, + .asic_dma_pool_free = goya_dma_pool_free, .cpu_accessible_dma_pool_alloc = goya_cpu_accessible_dma_pool_alloc, .cpu_accessible_dma_pool_free = goya_cpu_accessible_dma_pool_free, .hl_dma_unmap_sg = goya_dma_unmap_sg, diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 687651db614c..b64594be6dbd 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -453,19 
+453,19 @@ enum hl_pll_frequency { * @cb_mmap: maps a CB. * @ring_doorbell: increment PI on a given QMAN. * @flush_pq_write: flush PQ entry write if necessary, WARN if flushing failed. - * @dma_alloc_coherent: Allocate coherent DMA memory by calling - * dma_alloc_coherent(). This is ASIC function because its - * implementation is not trivial when the driver is loaded - * in simulation mode (not upstreamed). - * @dma_free_coherent: Free coherent DMA memory by calling dma_free_coherent(). - * This is ASIC function because its implementation is not - * trivial when the driver is loaded in simulation mode - * (not upstreamed). + * @asic_dma_alloc_coherent: Allocate coherent DMA memory by calling + * dma_alloc_coherent(). This is ASIC function because + * its implementation is not trivial when the driver + * is loaded in simulation mode (not upstreamed). + * @asic_dma_free_coherent: Free coherent DMA memory by calling + * dma_free_coherent(). This is ASIC function because + * its implementation is not trivial when the driver + * is loaded in simulation mode (not upstreamed). * @get_int_queue_base: get the internal queue base address. * @test_queues: run simple test on all queues for sanity check. - * @dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool. - * size of allocation is HL_DMA_POOL_BLK_SIZE. - * @dma_pool_free: free small DMA allocation from pool. + * @asic_dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool. + * size of allocation is HL_DMA_POOL_BLK_SIZE. + * @asic_dma_pool_free: free small DMA allocation from pool. * @cpu_accessible_dma_pool_alloc: allocate CPU PQ packet from DMA pool. * @cpu_accessible_dma_pool_free: free CPU PQ packet from DMA pool. * @hl_dma_unmap_sg: DMA unmap scatter-gather list. 
@@ -521,16 +521,16 @@ struct hl_asic_funcs { u64 kaddress, phys_addr_t paddress, u32 size); void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi); void (*flush_pq_write)(struct hl_device *hdev, u64 *pq, u64 exp_val); - void* (*dma_alloc_coherent)(struct hl_device *hdev, size_t size, + void* (*asic_dma_alloc_coherent)(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, gfp_t flag); - void (*dma_free_coherent)(struct hl_device *hdev, size_t size, + void (*asic_dma_free_coherent)(struct hl_device *hdev, size_t size, void *cpu_addr, dma_addr_t dma_handle); void* (*get_int_queue_base)(struct hl_device *hdev, u32 queue_id, dma_addr_t *dma_handle, u16 *queue_len); int (*test_queues)(struct hl_device *hdev); - void* (*dma_pool_zalloc)(struct hl_device *hdev, size_t size, + void* (*asic_dma_pool_zalloc)(struct hl_device *hdev, size_t size, gfp_t mem_flags, dma_addr_t *dma_handle); - void (*dma_pool_free)(struct hl_device *hdev, void *vaddr, + void (*asic_dma_pool_free)(struct hl_device *hdev, void *vaddr, dma_addr_t dma_addr); void* (*cpu_accessible_dma_pool_alloc)(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle); diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c index a1ee52cfd505..6cdaa117fc40 100644 --- a/drivers/misc/habanalabs/hw_queue.c +++ b/drivers/misc/habanalabs/hw_queue.c @@ -425,7 +425,7 @@ static int ext_and_cpu_hw_queue_init(struct hl_device *hdev, HL_QUEUE_SIZE_IN_BYTES, &q->bus_address); else - p = hdev->asic_funcs->dma_alloc_coherent(hdev, + p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, &q->bus_address, GFP_KERNEL | __GFP_ZERO); @@ -457,7 +457,7 @@ free_queue: HL_QUEUE_SIZE_IN_BYTES, (void *) (uintptr_t) q->kernel_address); else - hdev->asic_funcs->dma_free_coherent(hdev, + hdev->asic_funcs->asic_dma_free_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, (void *) (uintptr_t) q->kernel_address, q->bus_address); @@ -587,7 +587,7 @@ static void hw_queue_fini(struct 
hl_device *hdev, struct hl_hw_queue *q) HL_QUEUE_SIZE_IN_BYTES, (void *) (uintptr_t) q->kernel_address); else - hdev->asic_funcs->dma_free_coherent(hdev, + hdev->asic_funcs->asic_dma_free_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, (void *) (uintptr_t) q->kernel_address, q->bus_address); diff --git a/drivers/misc/habanalabs/irq.c b/drivers/misc/habanalabs/irq.c index 86a8ad57f1ca..ea9f72ff456c 100644 --- a/drivers/misc/habanalabs/irq.c +++ b/drivers/misc/habanalabs/irq.c @@ -222,7 +222,7 @@ int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id) BUILD_BUG_ON(HL_CQ_SIZE_IN_BYTES > HL_PAGE_SIZE); - p = hdev->asic_funcs->dma_alloc_coherent(hdev, HL_CQ_SIZE_IN_BYTES, + p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, HL_CQ_SIZE_IN_BYTES, &q->bus_address, GFP_KERNEL | __GFP_ZERO); if (!p) return -ENOMEM; @@ -248,7 +248,7 @@ int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id) */ void hl_cq_fini(struct hl_device *hdev, struct hl_cq *q) { - hdev->asic_funcs->dma_free_coherent(hdev, HL_CQ_SIZE_IN_BYTES, + hdev->asic_funcs->asic_dma_free_coherent(hdev, HL_CQ_SIZE_IN_BYTES, (void *) (uintptr_t) q->kernel_address, q->bus_address); } From 94cb669ceb0589f24ee91e3a8ae8ed3013fd6904 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Wed, 1 May 2019 11:28:15 +0300 Subject: [PATCH 10/15] habanalabs: Manipulate DMA addresses in ASIC functions Routing device accesses to the host memory requires the usage of a base offset, which is canceled by the iATU just before leaving the device. The value of the base offset might differ between different ASIC types. The manipulation of the addresses is currently used throughout the driver code, and one should be aware of it whenever providing a host memory address to the device. This patch removes this manipulation from the driver common code, and moves it to the ASIC specific functions that are responsible for host memory allocation/mapping.
Signed-off-by: Tomer Tayar Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/firmware_if.c | 7 +-- drivers/misc/habanalabs/goya/goya.c | 90 +++++++++++++++++---------- drivers/misc/habanalabs/habanalabs.h | 10 ++- drivers/misc/habanalabs/hw_queue.c | 6 +- drivers/misc/habanalabs/memory.c | 4 -- drivers/misc/habanalabs/pci.c | 11 ++-- 6 files changed, 72 insertions(+), 56 deletions(-) diff --git a/drivers/misc/habanalabs/firmware_if.c b/drivers/misc/habanalabs/firmware_if.c index 1acf82650b20..eda5d7fcb79f 100644 --- a/drivers/misc/habanalabs/firmware_if.c +++ b/drivers/misc/habanalabs/firmware_if.c @@ -249,8 +249,7 @@ int hl_fw_armcp_info_get(struct hl_device *hdev) pkt.ctl = cpu_to_le32(ARMCP_PACKET_INFO_GET << ARMCP_PKT_CTL_OPCODE_SHIFT); - pkt.addr = cpu_to_le64(armcp_info_dma_addr + - prop->host_phys_base_address); + pkt.addr = cpu_to_le64(armcp_info_dma_addr); pkt.data_max_size = cpu_to_le32(sizeof(struct armcp_info)); rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), @@ -281,7 +280,6 @@ out: int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size) { - struct asic_fixed_properties *prop = &hdev->asic_prop; struct armcp_packet pkt = {}; void *eeprom_info_cpu_addr; dma_addr_t eeprom_info_dma_addr; @@ -301,8 +299,7 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size) pkt.ctl = cpu_to_le32(ARMCP_PACKET_EEPROM_DATA_GET << ARMCP_PKT_CTL_OPCODE_SHIFT); - pkt.addr = cpu_to_le64(eeprom_info_dma_addr + - prop->host_phys_base_address); + pkt.addr = cpu_to_le64(eeprom_info_dma_addr); pkt.data_max_size = cpu_to_le32(max_size); rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 8e18c80a22e7..31dc3b872f9e 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -345,7 +345,6 @@ void goya_get_fixed_properties(struct hl_device *hdev) 
prop->mmu_hop0_tables_total_size = HOP0_TABLES_TOTAL_SIZE; prop->dram_page_size = PAGE_SIZE_2MB; - prop->host_phys_base_address = HOST_PHYS_BASE; prop->va_space_host_start_address = VA_HOST_SPACE_START; prop->va_space_host_end_address = VA_HOST_SPACE_END; prop->va_space_dram_start_address = VA_DDR_SPACE_START; @@ -422,7 +421,7 @@ static u64 goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr) static int goya_init_iatu(struct hl_device *hdev) { return hl_pci_init_iatu(hdev, SRAM_BASE_ADDR, DRAM_PHYS_BASE, - HOST_PHYS_SIZE); + HOST_PHYS_BASE, HOST_PHYS_SIZE); } /* @@ -804,7 +803,6 @@ void goya_init_dma_qmans(struct hl_device *hdev) { struct goya_device *goya = hdev->asic_specific; struct hl_hw_queue *q; - dma_addr_t bus_address; int i; if (goya->hw_cap_initialized & HW_CAP_DMA) @@ -813,10 +811,7 @@ void goya_init_dma_qmans(struct hl_device *hdev) q = &hdev->kernel_queues[0]; for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++, q++) { - bus_address = q->bus_address + - hdev->asic_prop.host_phys_base_address; - - goya_init_dma_qman(hdev, i, bus_address); + goya_init_dma_qman(hdev, i, q->bus_address); goya_init_dma_ch(hdev, i); } @@ -957,7 +952,6 @@ int goya_init_cpu_queues(struct hl_device *hdev) { struct goya_device *goya = hdev->asic_specific; struct hl_eq *eq; - dma_addr_t bus_address; u32 status; struct hl_hw_queue *cpu_pq = &hdev->kernel_queues[GOYA_QUEUE_ID_CPU_PQ]; int err; @@ -970,19 +964,18 @@ int goya_init_cpu_queues(struct hl_device *hdev) eq = &hdev->event_queue; - bus_address = cpu_pq->bus_address + - hdev->asic_prop.host_phys_base_address; - WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_0, lower_32_bits(bus_address)); - WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_1, upper_32_bits(bus_address)); + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_0, + lower_32_bits(cpu_pq->bus_address)); + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_1, + upper_32_bits(cpu_pq->bus_address)); - bus_address = eq->bus_address + hdev->asic_prop.host_phys_base_address; - WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_2, 
lower_32_bits(bus_address)); - WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_3, upper_32_bits(bus_address)); + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_2, lower_32_bits(eq->bus_address)); + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_3, upper_32_bits(eq->bus_address)); - bus_address = hdev->cpu_accessible_dma_address + - hdev->asic_prop.host_phys_base_address; - WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_8, lower_32_bits(bus_address)); - WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_9, upper_32_bits(bus_address)); + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_8, + lower_32_bits(hdev->cpu_accessible_dma_address)); + WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_9, + upper_32_bits(hdev->cpu_accessible_dma_address)); WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_5, HL_QUEUE_SIZE_IN_BYTES); WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_4, HL_EQ_SIZE_IN_BYTES); @@ -2731,13 +2724,23 @@ void goya_flush_pq_write(struct hl_device *hdev, u64 *pq, u64 exp_val) static void *goya_dma_alloc_coherent(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, gfp_t flags) { - return dma_alloc_coherent(&hdev->pdev->dev, size, dma_handle, flags); + void *kernel_addr = dma_alloc_coherent(&hdev->pdev->dev, size, + dma_handle, flags); + + /* Shift to the device's base physical address of host memory */ + if (kernel_addr) + *dma_handle += HOST_PHYS_BASE; + + return kernel_addr; } static void goya_dma_free_coherent(struct hl_device *hdev, size_t size, void *cpu_addr, dma_addr_t dma_handle) { - dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, dma_handle); + /* Cancel the device's base physical address of host memory */ + dma_addr_t fixed_dma_handle = dma_handle - HOST_PHYS_BASE; + + dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, fixed_dma_handle); } void *goya_get_int_queue_base(struct hl_device *hdev, u32 queue_id, @@ -2848,8 +2851,7 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job) (1 << GOYA_PKT_CTL_MB_SHIFT); fence_pkt->ctl = cpu_to_le32(tmp); fence_pkt->value = cpu_to_le32(GOYA_QMAN0_FENCE_VAL); - 
fence_pkt->addr = cpu_to_le64(fence_dma_addr + - hdev->asic_prop.host_phys_base_address); + fence_pkt->addr = cpu_to_le64(fence_dma_addr); rc = hl_hw_queue_send_cb_no_cmpl(hdev, GOYA_QUEUE_ID_DMA_0, job->job_cb_size, cb->bus_address); @@ -2928,8 +2930,7 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id) (1 << GOYA_PKT_CTL_MB_SHIFT); fence_pkt->ctl = cpu_to_le32(tmp); fence_pkt->value = cpu_to_le32(fence_val); - fence_pkt->addr = cpu_to_le64(fence_dma_addr + - hdev->asic_prop.host_phys_base_address); + fence_pkt->addr = cpu_to_le64(fence_dma_addr); rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, sizeof(struct packet_msg_prot), @@ -3001,16 +3002,27 @@ int goya_test_queues(struct hl_device *hdev) static void *goya_dma_pool_zalloc(struct hl_device *hdev, size_t size, gfp_t mem_flags, dma_addr_t *dma_handle) { + void *kernel_addr; + if (size > GOYA_DMA_POOL_BLK_SIZE) return NULL; - return dma_pool_zalloc(hdev->dma_pool, mem_flags, dma_handle); + kernel_addr = dma_pool_zalloc(hdev->dma_pool, mem_flags, dma_handle); + + /* Shift to the device's base physical address of host memory */ + if (kernel_addr) + *dma_handle += HOST_PHYS_BASE; + + return kernel_addr; } static void goya_dma_pool_free(struct hl_device *hdev, void *vaddr, dma_addr_t dma_addr) { - dma_pool_free(hdev->dma_pool, vaddr, dma_addr); + /* Cancel the device's base physical address of host memory */ + dma_addr_t fixed_dma_addr = dma_addr - HOST_PHYS_BASE; + + dma_pool_free(hdev->dma_pool, vaddr, fixed_dma_addr); } void *goya_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size, @@ -3025,19 +3037,33 @@ void goya_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, hl_fw_cpu_accessible_dma_pool_free(hdev, size, vaddr); } -static int goya_dma_map_sg(struct hl_device *hdev, struct scatterlist *sg, +static int goya_dma_map_sg(struct hl_device *hdev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { - if (!dma_map_sg(&hdev->pdev->dev, sg, nents, dir)) + 
struct scatterlist *sg; + int i; + + if (!dma_map_sg(&hdev->pdev->dev, sgl, nents, dir)) return -ENOMEM; + /* Shift to the device's base physical address of host memory */ + for_each_sg(sgl, sg, nents, i) + sg->dma_address += HOST_PHYS_BASE; + return 0; } -static void goya_dma_unmap_sg(struct hl_device *hdev, struct scatterlist *sg, +static void goya_dma_unmap_sg(struct hl_device *hdev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { - dma_unmap_sg(&hdev->pdev->dev, sg, nents, dir); + struct scatterlist *sg; + int i; + + /* Cancel the device's base physical address of host memory */ + for_each_sg(sgl, sg, nents, i) + sg->dma_address -= HOST_PHYS_BASE; + + dma_unmap_sg(&hdev->pdev->dev, sgl, nents, dir); } u32 goya_get_dma_desc_list_size(struct hl_device *hdev, struct sg_table *sgt) @@ -3589,8 +3615,6 @@ static int goya_patch_dma_packet(struct hl_device *hdev, new_dma_pkt->ctl = cpu_to_le32(ctl); new_dma_pkt->tsize = cpu_to_le32((u32) len); - dma_addr += hdev->asic_prop.host_phys_base_address; - if (dir == DMA_TO_DEVICE) { new_dma_pkt->src_addr = cpu_to_le64(dma_addr); new_dma_pkt->dst_addr = cpu_to_le64(device_memory_addr); diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index b64594be6dbd..f08f71982585 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -135,8 +135,6 @@ enum hl_device_hw_state { * @dram_user_base_address: DRAM physical start address for user access. * @dram_size: DRAM total size. * @dram_pci_bar_size: size of PCI bar towards DRAM. - * @host_phys_base_address: base physical address of host memory for - * transactions that the device generates. * @max_power_default: max power of the device after reset * @va_space_host_start_address: base address of virtual memory range for * mapping host memory. 
@@ -184,7 +182,6 @@ struct asic_fixed_properties { u64 dram_user_base_address; u64 dram_size; u64 dram_pci_bar_size; - u64 host_phys_base_address; u64 max_power_default; u64 va_space_host_start_address; u64 va_space_host_end_address; @@ -537,11 +534,11 @@ struct hl_asic_funcs { void (*cpu_accessible_dma_pool_free)(struct hl_device *hdev, size_t size, void *vaddr); void (*hl_dma_unmap_sg)(struct hl_device *hdev, - struct scatterlist *sg, int nents, + struct scatterlist *sgl, int nents, enum dma_data_direction dir); int (*cs_parser)(struct hl_device *hdev, struct hl_cs_parser *parser); int (*asic_dma_map_sg)(struct hl_device *hdev, - struct scatterlist *sg, int nents, + struct scatterlist *sgl, int nents, enum dma_data_direction dir); u32 (*get_dma_desc_list_size)(struct hl_device *hdev, struct sg_table *sgt); @@ -1450,7 +1447,8 @@ int hl_pci_iatu_write(struct hl_device *hdev, u32 addr, u32 data); int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar, u64 addr); int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address, - u64 dram_base_address, u64 host_phys_size); + u64 dram_base_address, u64 host_phys_base_address, + u64 host_phys_size); int hl_pci_init(struct hl_device *hdev, u8 dma_mask); void hl_pci_fini(struct hl_device *hdev); int hl_pci_set_dma_mask(struct hl_device *hdev, u8 dma_mask); diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c index 6cdaa117fc40..2894d8975933 100644 --- a/drivers/misc/habanalabs/hw_queue.c +++ b/drivers/misc/habanalabs/hw_queue.c @@ -82,7 +82,7 @@ static void ext_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q, bd += hl_pi_2_offset(q->pi); bd->ctl = __cpu_to_le32(ctl); bd->len = __cpu_to_le32(len); - bd->ptr = __cpu_to_le64(ptr + hdev->asic_prop.host_phys_base_address); + bd->ptr = __cpu_to_le64(ptr); q->pi = hl_queue_inc_ptr(q->pi); hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi); @@ -263,9 +263,7 @@ static void 
ext_hw_queue_schedule_job(struct hl_cs_job *job) * checked in hl_queue_sanity_checks */ cq = &hdev->completion_queue[q->hw_queue_id]; - cq_addr = cq->bus_address + - hdev->asic_prop.host_phys_base_address; - cq_addr += cq->pi * sizeof(struct hl_cq_entry); + cq_addr = cq->bus_address + cq->pi * sizeof(struct hl_cq_entry); hdev->asic_funcs->add_end_of_cb_packets(cb->kernel_address, len, cq_addr, diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c index 43ef3ad8438a..d67d24c13efd 100644 --- a/drivers/misc/habanalabs/memory.c +++ b/drivers/misc/habanalabs/memory.c @@ -759,10 +759,6 @@ static int map_phys_page_pack(struct hl_ctx *ctx, u64 vaddr, for (i = 0 ; i < phys_pg_pack->npages ; i++) { paddr = phys_pg_pack->pages[i]; - /* For accessing the host we need to turn on bit 39 */ - if (phys_pg_pack->created_from_userptr) - paddr += hdev->asic_prop.host_phys_base_address; - rc = hl_mmu_map(ctx, next_vaddr, paddr, page_size); if (rc) { dev_err(hdev->dev, diff --git a/drivers/misc/habanalabs/pci.c b/drivers/misc/habanalabs/pci.c index 5278f086d65d..0e78a04d63f4 100644 --- a/drivers/misc/habanalabs/pci.c +++ b/drivers/misc/habanalabs/pci.c @@ -236,6 +236,8 @@ int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar, * @hdev: Pointer to hl_device structure. * @sram_base_address: SRAM base address. * @dram_base_address: DRAM base address. + * @host_phys_base_address: Base physical address of host memory for device + * transactions. * @host_phys_size: Size of host memory for device transactions. * * This is needed in case the firmware doesn't initialize the iATU. @@ -243,7 +245,8 @@ int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar, * Return: 0 on success, negative value for failure. 
*/ int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address, - u64 dram_base_address, u64 host_phys_size) + u64 dram_base_address, u64 host_phys_base_address, + u64 host_phys_size) { struct asic_fixed_properties *prop = &hdev->asic_prop; u64 host_phys_end_addr; @@ -265,11 +268,11 @@ int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address, /* Outbound Region 0 - Point to Host */ - host_phys_end_addr = prop->host_phys_base_address + host_phys_size - 1; + host_phys_end_addr = host_phys_base_address + host_phys_size - 1; rc |= hl_pci_iatu_write(hdev, 0x008, - lower_32_bits(prop->host_phys_base_address)); + lower_32_bits(host_phys_base_address)); rc |= hl_pci_iatu_write(hdev, 0x00C, - upper_32_bits(prop->host_phys_base_address)); + upper_32_bits(host_phys_base_address)); rc |= hl_pci_iatu_write(hdev, 0x010, lower_32_bits(host_phys_end_addr)); rc |= hl_pci_iatu_write(hdev, 0x014, 0); rc |= hl_pci_iatu_write(hdev, 0x018, 0); From 5809e18e028218c006011dbbfe30429eaf4bb29b Mon Sep 17 00:00:00 2001 From: Dalit Ben Zoor Date: Wed, 1 May 2019 13:16:18 +0300 Subject: [PATCH 11/15] habanalabs: remove redundant member from parser struct use_virt_addr member was used for telling whether to treat the addresses in the CB as virtual during parsing. We disabled it only when calling the parser from the driver memset device function, and since this call had been removed, it should always be enabled. 
Signed-off-by: Dalit Ben Zoor Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/command_submission.c | 1 - drivers/misc/habanalabs/goya/goya.c | 2 +- drivers/misc/habanalabs/habanalabs.h | 3 --- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c index c4ab694b51b5..6fe785e26859 100644 --- a/drivers/misc/habanalabs/command_submission.c +++ b/drivers/misc/habanalabs/command_submission.c @@ -93,7 +93,6 @@ static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job) parser.user_cb_size = job->user_cb_size; parser.ext_queue = job->ext_queue; job->patched_cb = NULL; - parser.use_virt_addr = hdev->mmu_enable; rc = hdev->asic_funcs->cs_parser(hdev, &parser); if (job->ext_queue) { diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 31dc3b872f9e..ba6790f9ec6b 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -3903,7 +3903,7 @@ int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser) if (!parser->ext_queue) return goya_parse_cb_no_ext_queue(hdev, parser); - if ((goya->hw_cap_initialized & HW_CAP_MMU) && parser->use_virt_addr) + if (goya->hw_cap_initialized & HW_CAP_MMU) return goya_parse_cb_mmu(hdev, parser); else return goya_parse_cb_no_mmu(hdev, parser); diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index f08f71982585..0da80e8eab42 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -779,8 +779,6 @@ struct hl_cs_job { * @patched_cb_size: the size of the CB after parsing. * @ext_queue: whether the job is for external queue or internal queue. * @job_id: the id of the related job inside the related CS. - * @use_virt_addr: whether to treat the addresses in the CB as virtual during - * parsing. 
*/ struct hl_cs_parser { struct hl_cb *user_cb; @@ -793,7 +791,6 @@ struct hl_cs_parser { u32 patched_cb_size; u8 ext_queue; u8 job_id; - u8 use_virt_addr; }; From f0539fb0fb5fcb595bbb46dfe88c97e825f29f1f Mon Sep 17 00:00:00 2001 From: Dalit Ben Zoor Date: Wed, 1 May 2019 13:24:58 +0300 Subject: [PATCH 12/15] habanalabs: remove condition that is always true After removing the parsing of the command submission when doing memset of the device memory, goya_validate_dma_pkt_host is never called by the kernel, so there is no need to check context id. Signed-off-by: Dalit Ben Zoor Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/goya/goya.c | 44 ++++++++++++++--------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index ba6790f9ec6b..9bf572a2d292 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -3213,31 +3213,29 @@ static int goya_validate_dma_pkt_host(struct hl_device *hdev, return -EFAULT; } - if (parser->ctx_id != HL_KERNEL_ASID_ID) { - if (sram_addr) { - if (!hl_mem_area_inside_range(device_memory_addr, - le32_to_cpu(user_dma_pkt->tsize), - hdev->asic_prop.sram_user_base_address, - hdev->asic_prop.sram_end_address)) { + if (sram_addr) { + if (!hl_mem_area_inside_range(device_memory_addr, + le32_to_cpu(user_dma_pkt->tsize), + hdev->asic_prop.sram_user_base_address, + hdev->asic_prop.sram_end_address)) { - dev_err(hdev->dev, - "SRAM address 0x%llx + 0x%x is invalid\n", - device_memory_addr, - user_dma_pkt->tsize); - return -EFAULT; - } - } else { - if (!hl_mem_area_inside_range(device_memory_addr, - le32_to_cpu(user_dma_pkt->tsize), - hdev->asic_prop.dram_user_base_address, - hdev->asic_prop.dram_end_address)) { + dev_err(hdev->dev, + "SRAM address 0x%llx + 0x%x is invalid\n", + device_memory_addr, + user_dma_pkt->tsize); + return -EFAULT; + } + } else { + if (!hl_mem_area_inside_range(device_memory_addr, + 
le32_to_cpu(user_dma_pkt->tsize), + hdev->asic_prop.dram_user_base_address, + hdev->asic_prop.dram_end_address)) { - dev_err(hdev->dev, - "DRAM address 0x%llx + 0x%x is invalid\n", - device_memory_addr, - user_dma_pkt->tsize); - return -EFAULT; - } + dev_err(hdev->dev, + "DRAM address 0x%llx + 0x%x is invalid\n", + device_memory_addr, + user_dma_pkt->tsize); + return -EFAULT; } } From b1b537713eb1a63a2ecc3547693b3eef7dfb9281 Mon Sep 17 00:00:00 2001 From: Dalit Ben Zoor Date: Tue, 30 Apr 2019 17:18:51 +0300 Subject: [PATCH 13/15] habanalabs: increase timeout if working with simulator When there is a spike in the CPU consumption, it may cause random failures in the C/I since the KMD timeout for CPU and/or QMAN0 jobs expires and it stops communicating with the simulator. This commit fixes it by increasing the timeout of the polling functions when working with the simulator. Signed-off-by: Dalit Ben Zoor Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/device.c | 8 +++++++- drivers/misc/habanalabs/habanalabs.h | 7 ++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c index 0e0b9ec71c80..91a9e47a3482 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/device.c @@ -1147,7 +1147,13 @@ int hl_poll_timeout_memory(struct hl_device *hdev, u64 addr, * either by the direct access of the device or by another core */ u32 *paddr = (u32 *) (uintptr_t) addr; - ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); + ktime_t timeout; + + /* timeout should be longer when working with simulator */ + if (!hdev->pdev) + timeout_us *= 10; + + timeout = ktime_add_us(ktime_get(), timeout_us); might_sleep(); diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 0da80e8eab42..71243b319920 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -1042,7 +1042,12 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32
val); #define hl_poll_timeout(hdev, addr, val, cond, sleep_us, timeout_us) \ ({ \ - ktime_t __timeout = ktime_add_us(ktime_get(), timeout_us); \ + ktime_t __timeout; \ + /* timeout should be longer when working with simulator */ \ + if (hdev->pdev) \ + __timeout = ktime_add_us(ktime_get(), timeout_us); \ + else \ + __timeout = ktime_add_us(ktime_get(), (timeout_us * 10)); \ might_sleep_if(sleep_us); \ for (;;) { \ (val) = RREG32(addr); \ From ba209e1587227f8f5a86bb4b040547cf9a79d4cc Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Thu, 2 May 2019 11:33:12 +0300 Subject: [PATCH 14/15] habanalabs: Update CPU DMA pool label name The CPU accessible DMA pool is general and not used only for PQ. Accordingly, this patch renames the "free_cpu_pq_pool" label with "free_cpu_accessible_dma_pool". Signed-off-by: Tomer Tayar Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/goya/goya.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 9bf572a2d292..0fa0bdd7c852 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -670,14 +670,14 @@ static int goya_sw_init(struct hl_device *hdev) dev_err(hdev->dev, "Failed to add memory to CPU accessible DMA pool\n"); rc = -EFAULT; - goto free_cpu_pq_pool; + goto free_cpu_accessible_dma_pool; } spin_lock_init(&goya->hw_queues_lock); return 0; -free_cpu_pq_pool: +free_cpu_accessible_dma_pool: gen_pool_destroy(hdev->cpu_accessible_dma_pool); free_cpu_pq_dma_mem: hdev->asic_funcs->asic_dma_free_coherent(hdev, From 9f832fda79eb6e4f8ebde8d77eb442b95bf6b08a Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Thu, 2 May 2019 15:37:19 +0300 Subject: [PATCH 15/15] habanalabs: Update CPU DMA memory label name The CPU accessible DMA memory is general and not used only for PQ. Accordingly, this patch renames the "free_cpu_pq_dma_mem" label with "free_cpu_dma_mem".
Signed-off-by: Tomer Tayar Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/goya/goya.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 0fa0bdd7c852..a582e29c1ee4 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -660,7 +660,7 @@ static int goya_sw_init(struct hl_device *hdev) dev_err(hdev->dev, "Failed to create CPU accessible DMA pool\n"); rc = -ENOMEM; - goto free_cpu_pq_dma_mem; + goto free_cpu_dma_mem; } rc = gen_pool_add(hdev->cpu_accessible_dma_pool, @@ -679,7 +679,7 @@ static int goya_sw_init(struct hl_device *hdev) free_cpu_accessible_dma_pool: gen_pool_destroy(hdev->cpu_accessible_dma_pool); -free_cpu_pq_dma_mem: +free_cpu_dma_mem: hdev->asic_funcs->asic_dma_free_coherent(hdev, HL_CPU_ACCESSIBLE_MEM_SIZE, hdev->cpu_accessible_dma_mem,