| /* |
| * Copyright © 2014-2017 Broadcom |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| /** |
| * @file v3dx_simulator.c |
| * |
| * Implements the actual HW interaction between the GL driver's V3D simulator and the simulator. |
| * |
| * The register headers between V3D versions will have conflicting defines, so |
| * all register interactions appear in this file and are compiled per V3D version |
| * we support. |
| */ |
| |
| #ifdef USE_V3D_SIMULATOR |
| |
| #include <assert.h> |
| #include <stdbool.h> |
| #include <stdio.h> |
| |
| #include "v3d_simulator.h" |
| #include "v3d_simulator_wrapper.h" |
| |
| #include "util/macros.h" |
| #include "util/bitscan.h" |
| #include "drm-uapi/v3d_drm.h" |
| |
| #define HW_REGISTER_RO(x) (x) |
| #define HW_REGISTER_RW(x) (x) |
| #if V3D_VERSION >= 41 |
| #include "libs/core/v3d/registers/4.1.35.0/v3d.h" |
| #else |
| #include "libs/core/v3d/registers/3.3.0.0/v3d.h" |
| #endif |
| |
/* Register access shorthands.  Both expand to a call that names a variable
 * called `v3d`, so they can only be used where such a variable is in scope.
 */
#define V3D_READ(reg) v3d_hw_read_reg(v3d, reg)
#define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d, reg, val)
| |
| static void |
| v3d_invalidate_l3(struct v3d_hw *v3d) |
| { |
| #if V3D_VERSION < 40 |
| uint32_t gca_ctrl = V3D_READ(V3D_GCA_CACHE_CTRL); |
| |
| V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl | V3D_GCA_CACHE_CTRL_FLUSH_SET); |
| V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl & ~V3D_GCA_CACHE_CTRL_FLUSH_SET); |
| #endif |
| } |
| |
| /* Invalidates the L2C cache. This is a read-only cache for uniforms and instructions. */ |
| static void |
| v3d_invalidate_l2c(struct v3d_hw *v3d) |
| { |
| if (V3D_VERSION >= 33) |
| return; |
| |
| V3D_WRITE(V3D_CTL_0_L2CACTL, |
| V3D_CTL_0_L2CACTL_L2CCLR_SET | |
| V3D_CTL_0_L2CACTL_L2CENA_SET); |
| } |
| |
/* Values placed in the L2TFLM field of V3D_CTL_0_L2TCACTL.  In this file
 * FLUSH is used when invalidating the L2T and CLEAN when flushing it.
 */
enum v3d_l2t_cache_flush_mode {
        V3D_CACHE_FLUSH_MODE_FLUSH = 0,
        V3D_CACHE_FLUSH_MODE_CLEAR = 1,
        V3D_CACHE_FLUSH_MODE_CLEAN = 2,
};
| |
| /* Invalidates texture L2 cachelines */ |
| static void |
| v3d_invalidate_l2t(struct v3d_hw *v3d) |
| { |
| V3D_WRITE(V3D_CTL_0_L2TFLSTA, 0); |
| V3D_WRITE(V3D_CTL_0_L2TFLEND, ~0); |
| V3D_WRITE(V3D_CTL_0_L2TCACTL, |
| V3D_CTL_0_L2TCACTL_L2TFLS_SET | |
| (V3D_CACHE_FLUSH_MODE_FLUSH << V3D_CTL_0_L2TCACTL_L2TFLM_LSB)); |
| } |
| |
| /* |
| * Wait for l2tcactl, used for flushes. |
| * |
| * FIXME: for a multicore scenario we should pass here the core. All wrapper |
| * assumes just one core, so would be better to handle that on that case. |
| */ |
| static UNUSED void v3d_core_wait_l2tcactl(struct v3d_hw *v3d, |
| uint32_t ctrl) |
| { |
| assert(!(ctrl & ~(V3D_CTL_0_L2TCACTL_TMUWCF_SET | V3D_CTL_0_L2TCACTL_L2TFLS_SET))); |
| |
| while (V3D_READ(V3D_CTL_0_L2TCACTL) & ctrl) { |
| v3d_hw_tick(v3d); |
| } |
| } |
| |
| /* Flushes dirty texture cachelines from the L1 write combiner */ |
| static void |
| v3d_flush_l1td(struct v3d_hw *v3d) |
| { |
| V3D_WRITE(V3D_CTL_0_L2TCACTL, |
| V3D_CTL_0_L2TCACTL_TMUWCF_SET); |
| |
| /* Note: here the kernel (and previous versions of the simulator |
| * wrapper) is using V3D_CTL_0_L2TCACTL_L2TFLS_SET, as with l2t. We |
| * understand that it makes more sense to do like this. We need to |
| * confirm which one is doing it correctly. So far things work fine on |
| * the simulator this way. |
| */ |
| v3d_core_wait_l2tcactl(v3d, V3D_CTL_0_L2TCACTL_TMUWCF_SET); |
| } |
| |
| /* Flushes dirty texture L2 cachelines */ |
| static void |
| v3d_flush_l2t(struct v3d_hw *v3d) |
| { |
| V3D_WRITE(V3D_CTL_0_L2TFLSTA, 0); |
| V3D_WRITE(V3D_CTL_0_L2TFLEND, ~0); |
| V3D_WRITE(V3D_CTL_0_L2TCACTL, |
| V3D_CTL_0_L2TCACTL_L2TFLS_SET | |
| (V3D_CACHE_FLUSH_MODE_CLEAN << V3D_CTL_0_L2TCACTL_L2TFLM_LSB)); |
| |
| v3d_core_wait_l2tcactl(v3d, V3D_CTL_0_L2TCACTL_L2TFLS_SET); |
| } |
| |
| /* Invalidates the slice caches. These are read-only caches. */ |
| static void |
| v3d_invalidate_slices(struct v3d_hw *v3d) |
| { |
| V3D_WRITE(V3D_CTL_0_SLCACTL, ~0); |
| } |
| |
/* Invalidates every cache this file knows about, in order: L3, L2C, L2T and
 * finally the slice caches.
 */
static void v3d_invalidate_caches(struct v3d_hw *v3d)
{
        v3d_invalidate_l3(v3d);
        v3d_invalidate_l2c(v3d);
        v3d_invalidate_l2t(v3d);
        v3d_invalidate_slices(v3d);
}
| |
| static uint32_t g_gmp_ofs; |
| static void |
| v3d_reload_gmp(struct v3d_hw *v3d) |
| { |
| /* Completely reset the GMP. */ |
| V3D_WRITE(V3D_GMP_CFG, |
| V3D_GMP_CFG_PROTENABLE_SET); |
| V3D_WRITE(V3D_GMP_TABLE_ADDR, g_gmp_ofs); |
| V3D_WRITE(V3D_GMP_CLEAR_LOAD, ~0); |
| while (V3D_READ(V3D_GMP_STATUS) & |
| V3D_GMP_STATUS_CFG_BUSY_SET) { |
| ; |
| } |
| } |
| |
| static UNUSED void |
| v3d_flush_caches(struct v3d_hw *v3d) |
| { |
| v3d_flush_l1td(v3d); |
| v3d_flush_l2t(v3d); |
| } |
| |
| int |
| v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d, |
| struct drm_v3d_submit_tfu *args) |
| { |
| int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET; |
| |
| V3D_WRITE(V3D_TFU_IIA, args->iia); |
| V3D_WRITE(V3D_TFU_IIS, args->iis); |
| V3D_WRITE(V3D_TFU_ICA, args->ica); |
| V3D_WRITE(V3D_TFU_IUA, args->iua); |
| V3D_WRITE(V3D_TFU_IOA, args->ioa); |
| V3D_WRITE(V3D_TFU_IOS, args->ios); |
| V3D_WRITE(V3D_TFU_COEF0, args->coef[0]); |
| V3D_WRITE(V3D_TFU_COEF1, args->coef[1]); |
| V3D_WRITE(V3D_TFU_COEF2, args->coef[2]); |
| V3D_WRITE(V3D_TFU_COEF3, args->coef[3]); |
| |
| V3D_WRITE(V3D_TFU_ICFG, args->icfg); |
| |
| while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) { |
| v3d_hw_tick(v3d); |
| } |
| |
| return 0; |
| } |
| |
#if V3D_VERSION >= 41
/* Runs one compute (CSD) dispatch on the simulator and blocks until it is
 * done.
 *
 * Reloads the GMP using @gmp_ofs, invalidates the caches, queues
 * args->cfg[1..6] and finally cfg[0], then ticks the hardware until the
 * NUM_COMPLETED_JOBS field of CSD_0_STATUS differs from its value on entry.
 * Texture caches are flushed before returning.
 *
 * Always returns 0.
 */
int
v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
                                 struct drm_v3d_submit_csd *args,
                                 uint32_t gmp_ofs)
{
        /* Queue registers for cfg[1]..cfg[6], in write order. */
        static const uint32_t queued_cfg_regs[] = {
                V3D_CSD_0_QUEUED_CFG1,
                V3D_CSD_0_QUEUED_CFG2,
                V3D_CSD_0_QUEUED_CFG3,
                V3D_CSD_0_QUEUED_CFG4,
                V3D_CSD_0_QUEUED_CFG5,
                V3D_CSD_0_QUEUED_CFG6,
        };

        const int completed_before =
                (V3D_READ(V3D_CSD_0_STATUS) &
                 V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET);

        g_gmp_ofs = gmp_ofs;
        v3d_reload_gmp(v3d);

        v3d_invalidate_caches(v3d);

        for (unsigned i = 0; i < ARRAY_SIZE(queued_cfg_regs); i++)
                V3D_WRITE(queued_cfg_regs[i], args->cfg[i + 1]);

        /* CFG0 kicks off the job */
        V3D_WRITE(V3D_CSD_0_QUEUED_CFG0, args->cfg[0]);

        /* Now we wait for the dispatch to finish. The safest way is to check
         * if NUM_COMPLETED_JOBS has increased. Note that in spite of that
         * name that register field is about the number of completed
         * dispatches.
         */
        for (;;) {
                const int completed_now =
                        (V3D_READ(V3D_CSD_0_STATUS) &
                         V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET);

                if (completed_now != completed_before)
                        break;

                v3d_hw_tick(v3d);
        }

        v3d_flush_caches(v3d);

        return 0;
}
#endif
| |
| int |
| v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d, |
| struct drm_v3d_get_param *args) |
| { |
| static const uint32_t reg_map[] = { |
| [DRM_V3D_PARAM_V3D_UIFCFG] = V3D_HUB_CTL_UIFCFG, |
| [DRM_V3D_PARAM_V3D_HUB_IDENT1] = V3D_HUB_CTL_IDENT1, |
| [DRM_V3D_PARAM_V3D_HUB_IDENT2] = V3D_HUB_CTL_IDENT2, |
| [DRM_V3D_PARAM_V3D_HUB_IDENT3] = V3D_HUB_CTL_IDENT3, |
| [DRM_V3D_PARAM_V3D_CORE0_IDENT0] = V3D_CTL_0_IDENT0, |
| [DRM_V3D_PARAM_V3D_CORE0_IDENT1] = V3D_CTL_0_IDENT1, |
| [DRM_V3D_PARAM_V3D_CORE0_IDENT2] = V3D_CTL_0_IDENT2, |
| }; |
| |
| switch (args->param) { |
| case DRM_V3D_PARAM_SUPPORTS_TFU: |
| args->value = 1; |
| return 0; |
| case DRM_V3D_PARAM_SUPPORTS_CSD: |
| args->value = V3D_VERSION >= 41; |
| return 0; |
| case DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH: |
| args->value = 1; |
| return 0; |
| case DRM_V3D_PARAM_SUPPORTS_PERFMON: |
| args->value = V3D_VERSION >= 41; |
| return 0; |
| case DRM_V3D_PARAM_SUPPORTS_MULTISYNC_EXT: |
| args->value = 1; |
| return 0; |
| } |
| |
| if (args->param < ARRAY_SIZE(reg_map) && reg_map[args->param]) { |
| args->value = V3D_READ(reg_map[args->param]); |
| return 0; |
| } |
| |
| fprintf(stderr, "Unknown DRM_IOCTL_V3D_GET_PARAM(%lld)\n", |
| (long long)args->value); |
| abort(); |
| } |
| |
| static struct v3d_hw *v3d_isr_hw; |
| |
| |
| static void |
| v3d_isr_core(struct v3d_hw *v3d, |
| unsigned core) |
| { |
| /* FIXME: so far we are assuming just one core, and using only the _0_ |
| * registers. If we add multiple-core on the simulator, we would need |
| * to pass core as a parameter, and chose the proper registers. |
| */ |
| assert(core == 0); |
| uint32_t core_status = V3D_READ(V3D_CTL_0_INT_STS); |
| V3D_WRITE(V3D_CTL_0_INT_CLR, core_status); |
| |
| if (core_status & V3D_CTL_0_INT_STS_INT_OUTOMEM_SET) { |
| uint32_t size = 256 * 1024; |
| uint32_t offset = v3d_simulator_get_spill(size); |
| |
| v3d_reload_gmp(v3d); |
| |
| V3D_WRITE(V3D_PTB_0_BPOA, offset); |
| V3D_WRITE(V3D_PTB_0_BPOS, size); |
| return; |
| } |
| |
| if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) { |
| fprintf(stderr, "GMP violation at 0x%08x\n", |
| V3D_READ(V3D_GMP_VIO_ADDR)); |
| abort(); |
| } else { |
| fprintf(stderr, |
| "Unexpected ISR with core status 0x%08x\n", |
| core_status); |
| } |
| abort(); |
| } |
| |
| static void |
| handle_mmu_interruptions(struct v3d_hw *v3d, |
| uint32_t hub_status) |
| { |
| bool wrv = hub_status & V3D_HUB_CTL_INT_STS_INT_MMU_WRV_SET; |
| bool pti = hub_status & V3D_HUB_CTL_INT_STS_INT_MMU_PTI_SET; |
| bool cap = hub_status & V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET; |
| |
| if (!(pti || cap || wrv)) |
| return; |
| |
| const char *client = "?"; |
| uint32_t axi_id = V3D_READ(V3D_MMU_VIO_ID); |
| uint32_t va_width = 30; |
| |
| #if V3D_VERSION >= 41 |
| static const char *const v3d41_axi_ids[] = { |
| "L2T", |
| "PTB", |
| "PSE", |
| "TLB", |
| "CLE", |
| "TFU", |
| "MMU", |
| "GMP", |
| }; |
| |
| axi_id = axi_id >> 5; |
| if (axi_id < ARRAY_SIZE(v3d41_axi_ids)) |
| client = v3d41_axi_ids[axi_id]; |
| |
| uint32_t mmu_debug = V3D_READ(V3D_MMU_DEBUG_INFO); |
| |
| va_width += ((mmu_debug & V3D_MMU_DEBUG_INFO_VA_WIDTH_SET) |
| >> V3D_MMU_DEBUG_INFO_VA_WIDTH_LSB); |
| #endif |
| /* Only the top bits (final number depends on the gen) of the virtual |
| * address are reported in the MMU VIO_ADDR register. |
| */ |
| uint64_t vio_addr = ((uint64_t)V3D_READ(V3D_MMU_VIO_ADDR) << |
| (va_width - 32)); |
| |
| /* Difference with the kernal: here were are going to abort after |
| * logging, so we don't bother with some stuff that the kernel does, |
| * like restoring the MMU ctrl bits |
| */ |
| |
| fprintf(stderr, "MMU error from client %s (%d) at 0x%llx%s%s%s\n", |
| client, axi_id, (long long) vio_addr, |
| wrv ? ", write violation" : "", |
| pti ? ", pte invalid" : "", |
| cap ? ", cap exceeded" : ""); |
| |
| abort(); |
| } |
| |
| static void |
| v3d_isr_hub(struct v3d_hw *v3d) |
| { |
| uint32_t hub_status = V3D_READ(V3D_HUB_CTL_INT_STS); |
| |
| /* Acknowledge the interrupts we're handling here */ |
| V3D_WRITE(V3D_HUB_CTL_INT_CLR, hub_status); |
| |
| if (hub_status & V3D_HUB_CTL_INT_STS_INT_TFUC_SET) { |
| /* FIXME: we were not able to raise this exception. We let the |
| * unreachable here, so we could get one if it is raised on |
| * the future. In any case, note that for this case we would |
| * only be doing debugging log. |
| */ |
| unreachable("TFU Conversion Complete interrupt not handled"); |
| } |
| |
| handle_mmu_interruptions(v3d, hub_status); |
| } |
| |
| static void |
| v3d_isr(uint32_t hub_status) |
| { |
| struct v3d_hw *v3d = v3d_isr_hw; |
| uint32_t mask = hub_status; |
| |
| /* Check the hub_status bits */ |
| while (mask) { |
| unsigned core = u_bit_scan(&mask); |
| |
| if (core == v3d_hw_get_hub_core()) |
| v3d_isr_hub(v3d); |
| else |
| v3d_isr_core(v3d, core); |
| } |
| |
| return; |
| } |
| |
| void |
| v3dX(simulator_init_regs)(struct v3d_hw *v3d) |
| { |
| #if V3D_VERSION == 33 |
| /* Set OVRTMUOUT to match kernel behavior. |
| * |
| * This means that the texture sampler uniform configuration's tmu |
| * output type field is used, instead of using the hardware default |
| * behavior based on the texture type. If you want the default |
| * behavior, you can still put "2" in the indirect texture state's |
| * output_type field. |
| */ |
| V3D_WRITE(V3D_CTL_0_MISCCFG, V3D_CTL_1_MISCCFG_OVRTMUOUT_SET); |
| #endif |
| |
| /* FIXME: the kernel captures some additional core interrupts here, |
| * for tracing. Perhaps we should evaluate to do the same here and add |
| * some debug options. |
| */ |
| uint32_t core_interrupts = (V3D_CTL_0_INT_STS_INT_GMPV_SET | |
| V3D_CTL_0_INT_STS_INT_OUTOMEM_SET); |
| V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts); |
| V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts); |
| |
| uint32_t hub_interrupts = |
| (V3D_HUB_CTL_INT_STS_INT_MMU_WRV_SET | /* write violation */ |
| V3D_HUB_CTL_INT_STS_INT_MMU_PTI_SET | /* page table invalid */ |
| V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET | /* CAP exceeded */ |
| V3D_HUB_CTL_INT_STS_INT_TFUC_SET); /* TFU conversion */ |
| |
| V3D_WRITE(V3D_HUB_CTL_INT_MSK_SET, ~hub_interrupts); |
| V3D_WRITE(V3D_HUB_CTL_INT_MSK_CLR, hub_interrupts); |
| |
| v3d_isr_hw = v3d; |
| v3d_hw_set_isr(v3d, v3d_isr); |
| } |
| |
| void |
| v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d, |
| struct drm_v3d_submit_cl *submit, |
| uint32_t gmp_ofs) |
| { |
| int last_bfc = (V3D_READ(V3D_CLE_0_BFC) & |
| V3D_CLE_0_BFC_BMFCT_SET); |
| |
| int last_rfc = (V3D_READ(V3D_CLE_0_RFC) & |
| V3D_CLE_0_RFC_RMFCT_SET); |
| |
| g_gmp_ofs = gmp_ofs; |
| v3d_reload_gmp(v3d); |
| |
| v3d_invalidate_caches(v3d); |
| |
| if (submit->qma) { |
| V3D_WRITE(V3D_CLE_0_CT0QMA, submit->qma); |
| V3D_WRITE(V3D_CLE_0_CT0QMS, submit->qms); |
| } |
| #if V3D_VERSION >= 41 |
| if (submit->qts) { |
| V3D_WRITE(V3D_CLE_0_CT0QTS, |
| V3D_CLE_0_CT0QTS_CTQTSEN_SET | |
| submit->qts); |
| } |
| #endif |
| V3D_WRITE(V3D_CLE_0_CT0QBA, submit->bcl_start); |
| V3D_WRITE(V3D_CLE_0_CT0QEA, submit->bcl_end); |
| |
| /* Wait for bin to complete before firing render. The kernel's |
| * scheduler implements this using the GPU scheduler blocking on the |
| * bin fence completing. (We don't use HW semaphores). |
| */ |
| while ((V3D_READ(V3D_CLE_0_BFC) & |
| V3D_CLE_0_BFC_BMFCT_SET) == last_bfc) { |
| v3d_hw_tick(v3d); |
| } |
| |
| v3d_invalidate_caches(v3d); |
| |
| V3D_WRITE(V3D_CLE_0_CT1QBA, submit->rcl_start); |
| V3D_WRITE(V3D_CLE_0_CT1QEA, submit->rcl_end); |
| |
| while ((V3D_READ(V3D_CLE_0_RFC) & |
| V3D_CLE_0_RFC_RMFCT_SET) == last_rfc) { |
| v3d_hw_tick(v3d); |
| } |
| } |
| |
#if V3D_VERSION >= 41
/* Performance counter register helpers.
 *
 * PCTR_N(x) is the value register of counter x; the registers are 4 bytes
 * apart.  SRC_N(x) is the x-th source-select register; each one configures
 * four counters, one per byte.  SRC_N_SHIFT(x) is the bit position of slot x
 * (0..3) inside a source-select register, and SRC_N_MASK(x) is the mask of
 * that slot's event field.
 */
#define V3D_PCTR_0_PCTR_N(x) (V3D_PCTR_0_PCTR0 + 4 * (x))
#define V3D_PCTR_0_SRC_N(x) (V3D_PCTR_0_SRC_0_3 + 4 * (x))
#define V3D_PCTR_0_SRC_N_SHIFT(x) ((x) * 8)
/* Number of bits in one event-select field (bits shift+6 .. shift). */
#define V3D_PCTR_0_SRC_N_WIDTH 7
/* BITFIELD_RANGE() takes (first bit, number of bits), not (low, high), so the
 * field width is passed here rather than the index of the top bit.
 */
#define V3D_PCTR_0_SRC_N_MASK(x) (BITFIELD_RANGE(V3D_PCTR_0_SRC_N_SHIFT(x), \
                                                 V3D_PCTR_0_SRC_N_WIDTH))
#endif
| |
| void |
| v3dX(simulator_perfmon_start)(struct v3d_hw *v3d, |
| uint32_t ncounters, |
| uint8_t *events) |
| { |
| #if V3D_VERSION >= 41 |
| int i, j; |
| uint32_t source; |
| uint32_t mask = BITFIELD_RANGE(0, ncounters); |
| |
| for (i = 0; i < ncounters; i+=4) { |
| source = i / 4; |
| uint32_t channels = 0; |
| for (j = 0; j < 4 && (i + j) < ncounters; j++) |
| channels |= events[i + j] << V3D_PCTR_0_SRC_N_SHIFT(j); |
| V3D_WRITE(V3D_PCTR_0_SRC_N(source), channels); |
| } |
| V3D_WRITE(V3D_PCTR_0_CLR, mask); |
| V3D_WRITE(V3D_PCTR_0_OVERFLOW, mask); |
| V3D_WRITE(V3D_PCTR_0_EN, mask); |
| #endif |
| } |
| |
| void v3dX(simulator_perfmon_stop)(struct v3d_hw *v3d, |
| uint32_t ncounters, |
| uint64_t *values) |
| { |
| #if V3D_VERSION >= 41 |
| int i; |
| |
| for (i = 0; i < ncounters; i++) |
| values[i] += V3D_READ(V3D_PCTR_0_PCTR_N(i)); |
| |
| V3D_WRITE(V3D_PCTR_0_EN, 0); |
| #endif |
| } |
| |
| #endif /* USE_V3D_SIMULATOR */ |