| /* |
| * Copyright © 2016 Red Hat. |
| * Copyright © 2016 Bas Nieuwenhuizen |
| * |
| * based in part on anv driver which is: |
| * Copyright © 2015 Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
| * DEALINGS IN THE SOFTWARE. |
| */ |
| |
| #ifndef TU_PRIVATE_H |
| #define TU_PRIVATE_H |
| |
| #include <assert.h> |
| #include <pthread.h> |
| #include <stdbool.h> |
| #include <stdint.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #ifdef HAVE_VALGRIND |
| #include <memcheck.h> |
| #include <valgrind.h> |
| #define VG(x) x |
| #else |
| #define VG(x) ((void)0) |
| #endif |
| |
| #define MESA_LOG_TAG "TU" |
| |
| #include "c11/threads.h" |
| #include "util/rounding.h" |
| #include "util/bitscan.h" |
| #include "util/list.h" |
| #include "util/log.h" |
| #include "util/macros.h" |
| #include "util/u_atomic.h" |
| #include "util/u_dynarray.h" |
| #include "util/xmlconfig.h" |
| #include "util/perf/u_trace.h" |
| #include "vk_alloc.h" |
| #include "vk_debug_report.h" |
| #include "vk_device.h" |
| #include "vk_dispatch_table.h" |
| #include "vk_extensions.h" |
| #include "vk_instance.h" |
| #include "vk_log.h" |
| #include "vk_physical_device.h" |
| #include "vk_shader_module.h" |
| #include "wsi_common.h" |
| |
| #include "ir3/ir3_compiler.h" |
| #include "ir3/ir3_shader.h" |
| |
| #include "adreno_common.xml.h" |
| #include "adreno_pm4.xml.h" |
| #include "a6xx.xml.h" |
| #include "fdl/freedreno_layout.h" |
| #include "common/freedreno_dev_info.h" |
| #include "perfcntrs/freedreno_perfcntr.h" |
| |
| #include "tu_descriptor_set.h" |
| #include "tu_autotune.h" |
| #include "tu_util.h" |
| #include "tu_perfetto.h" |
| |
| /* Pre-declarations needed for WSI entrypoints */ |
| struct wl_surface; |
| struct wl_display; |
| typedef struct xcb_connection_t xcb_connection_t; |
| typedef uint32_t xcb_visualid_t; |
| typedef uint32_t xcb_window_t; |
| |
| #include <vulkan/vk_android_native_buffer.h> |
| #include <vulkan/vk_icd.h> |
| #include <vulkan/vulkan.h> |
| |
| #include "tu_entrypoints.h" |
| |
| #include "vk_format.h" |
| #include "vk_image.h" |
| #include "vk_command_buffer.h" |
| #include "vk_queue.h" |
| #include "vk_object.h" |
| #include "vk_sync.h" |
| #include "vk_fence.h" |
| #include "vk_semaphore.h" |
| #include "vk_drm_syncobj.h" |
| #include "vk_sync_timeline.h" |
| |
| #define MAX_VBS 32 |
| #define MAX_VERTEX_ATTRIBS 32 |
| #define MAX_RTS 8 |
| #define MAX_VSC_PIPES 32 |
| #define MAX_VIEWPORTS 16 |
| #define MAX_VIEWPORT_SIZE (1 << 14) |
| #define MAX_SCISSORS 16 |
| #define MAX_DISCARD_RECTANGLES 4 |
| #define MAX_PUSH_CONSTANTS_SIZE 128 |
| #define MAX_PUSH_DESCRIPTORS 32 |
| #define MAX_DYNAMIC_UNIFORM_BUFFERS 16 |
| #define MAX_DYNAMIC_STORAGE_BUFFERS 8 |
| #define MAX_DYNAMIC_BUFFERS \ |
| (MAX_DYNAMIC_UNIFORM_BUFFERS + MAX_DYNAMIC_STORAGE_BUFFERS) |
| #define TU_MAX_DRM_DEVICES 8 |
| #define MAX_VIEWS 16 |
| #define MAX_BIND_POINTS 2 /* compute + graphics */ |
| /* The Qualcomm driver exposes 0x20000058 */ |
| #define MAX_STORAGE_BUFFER_RANGE 0x20000000 |
| /* We use ldc for uniform buffer loads, just like the Qualcomm driver, so |
| * expose the same maximum range. |
| * TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual |
| * range might be higher. |
| */ |
| #define MAX_UNIFORM_BUFFER_RANGE 0x10000 |
| |
| #define A6XX_TEX_CONST_DWORDS 16 |
| #define A6XX_TEX_SAMP_DWORDS 4 |
| |
| #define COND(bool, val) ((bool) ? (val) : 0) |
| #define BIT(bit) (1u << (bit)) |
| |
| /* Whenever we generate an error, pass it through this function. Useful for |
| * debugging, where we can break on it. Only call at error site, not when |
| * propagating errors. Might be useful to plug in a stack trace here. |
| */ |
| |
| struct tu_instance; |
| |
| VkResult |
| __vk_startup_errorf(struct tu_instance *instance, |
| VkResult error, |
| bool force_print, |
| const char *file, |
| int line, |
| const char *format, |
| ...) PRINTFLIKE(6, 7); |
| |
| /* Prints startup errors if TU_DEBUG=startup is set or on a debug driver |
| * build. |
| */ |
| #define vk_startup_errorf(instance, error, format, ...) \ |
| __vk_startup_errorf(instance, error, \ |
| instance->debug_flags & TU_DEBUG_STARTUP, \ |
| __FILE__, __LINE__, format, ##__VA_ARGS__) |
| |
| void |
| __tu_finishme(const char *file, int line, const char *format, ...) |
| PRINTFLIKE(3, 4); |
| |
| /** |
| * Print a FINISHME message, including its source location. |
| */ |
| #define tu_finishme(format, ...) \ |
| do { \ |
| static bool reported = false; \ |
| if (!reported) { \ |
| __tu_finishme(__FILE__, __LINE__, format, ##__VA_ARGS__); \ |
| reported = true; \ |
| } \ |
| } while (0) |
| |
| #define tu_stub() \ |
| do { \ |
| tu_finishme("stub %s", __func__); \ |
| } while (0) |
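| |
| /* A minimal usage sketch (illustrative only; the flag check and message |
| * are hypothetical): tu_finishme() warns once per call site, and tu_stub() |
| * flags an unimplemented entrypoint by name. |
| * |
| *    if (pCreateInfo->flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT) |
| *       tu_finishme("sparse binding"); |
| */ |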
| |
| struct tu_memory_heap { |
| /* Standard bits passed on to the client */ |
| VkDeviceSize size; |
| VkMemoryHeapFlags flags; |
| |
| /** Copied from ANV: |
| * |
| * Driver-internal book-keeping. |
| * |
| * Align it to 64 bits to make atomic operations faster on 32 bit platforms. |
| */ |
| VkDeviceSize used __attribute__ ((aligned (8))); |
| }; |
| |
| uint64_t |
| tu_get_system_heap_size(void); |
| |
| struct tu_physical_device |
| { |
| struct vk_physical_device vk; |
| |
| struct tu_instance *instance; |
| |
| const char *name; |
| uint8_t driver_uuid[VK_UUID_SIZE]; |
| uint8_t device_uuid[VK_UUID_SIZE]; |
| uint8_t cache_uuid[VK_UUID_SIZE]; |
| |
| struct wsi_device wsi_device; |
| |
| int local_fd; |
| int master_fd; |
| |
| uint32_t gmem_size; |
| uint64_t gmem_base; |
| uint32_t ccu_offset_gmem; |
| uint32_t ccu_offset_bypass; |
| |
| struct fd_dev_id dev_id; |
| const struct fd_dev_info *info; |
| |
| int msm_major_version; |
| int msm_minor_version; |
| |
| /* This is the driver's on-disk cache, used as a fallback as opposed to |
| * the pipeline cache defined by apps. |
| */ |
| struct disk_cache *disk_cache; |
| |
| struct tu_memory_heap heap; |
| |
| struct vk_sync_type syncobj_type; |
| struct vk_sync_timeline_type timeline_type; |
| const struct vk_sync_type *sync_types[3]; |
| }; |
| |
| enum tu_debug_flags |
| { |
| TU_DEBUG_STARTUP = 1 << 0, |
| TU_DEBUG_NIR = 1 << 1, |
| TU_DEBUG_NOBIN = 1 << 3, |
| TU_DEBUG_SYSMEM = 1 << 4, |
| TU_DEBUG_FORCEBIN = 1 << 5, |
| TU_DEBUG_NOUBWC = 1 << 6, |
| TU_DEBUG_NOMULTIPOS = 1 << 7, |
| TU_DEBUG_NOLRZ = 1 << 8, |
| TU_DEBUG_PERFC = 1 << 9, |
| TU_DEBUG_FLUSHALL = 1 << 10, |
| TU_DEBUG_SYNCDRAW = 1 << 11, |
| TU_DEBUG_DONT_CARE_AS_LOAD = 1 << 12, |
| TU_DEBUG_GMEM = 1 << 13, |
| }; |
| |
| struct tu_instance |
| { |
| struct vk_instance vk; |
| |
| uint32_t api_version; |
| int physical_device_count; |
| struct tu_physical_device physical_devices[TU_MAX_DRM_DEVICES]; |
| |
| struct driOptionCache dri_options; |
| struct driOptionCache available_dri_options; |
| |
| enum tu_debug_flags debug_flags; |
| }; |
| |
| VkResult |
| tu_wsi_init(struct tu_physical_device *physical_device); |
| void |
| tu_wsi_finish(struct tu_physical_device *physical_device); |
| |
| bool |
| tu_instance_extension_supported(const char *name); |
| uint32_t |
| tu_physical_device_api_version(struct tu_physical_device *dev); |
| bool |
| tu_physical_device_extension_supported(struct tu_physical_device *dev, |
| const char *name); |
| |
| struct cache_entry; |
| |
| struct tu_pipeline_cache |
| { |
| struct vk_object_base base; |
| |
| struct tu_device *device; |
| pthread_mutex_t mutex; |
| |
| uint32_t total_size; |
| uint32_t table_size; |
| uint32_t kernel_count; |
| struct cache_entry **hash_table; |
| bool modified; |
| |
| VkAllocationCallbacks alloc; |
| }; |
| |
| struct tu_pipeline_key |
| { |
| }; |
| |
| |
| /* queue types */ |
| #define TU_QUEUE_GENERAL 0 |
| |
| #define TU_MAX_QUEUE_FAMILIES 1 |
| |
| /* Keep tu_syncobj until porting to common code for kgsl too */ |
| #ifdef TU_USE_KGSL |
| struct tu_syncobj; |
| #endif |
| struct tu_u_trace_syncobj; |
| |
| /* tu_timeline_sync is a point type for vk_sync_timeline based on a drm |
| * syncobj. The handling logic is mostly copied from anv_bo_sync, since it |
| * seems it can be used here in much the same way as in anv. |
| */ |
| enum tu_timeline_sync_state { |
| /** Indicates that this is a new (or newly reset) fence */ |
| TU_TIMELINE_SYNC_STATE_RESET, |
| |
| /** Indicates that this fence has been submitted to the GPU but is still |
| * (as far as we know) in use by the GPU. |
| */ |
| TU_TIMELINE_SYNC_STATE_SUBMITTED, |
| |
| TU_TIMELINE_SYNC_STATE_SIGNALED, |
| }; |
| |
| struct tu_timeline_sync { |
| struct vk_sync base; |
| |
| enum tu_timeline_sync_state state; |
| uint32_t syncobj; |
| }; |
| |
| struct tu_queue |
| { |
| struct vk_queue vk; |
| |
| struct tu_device *device; |
| |
| uint32_t msm_queue_id; |
| int fence; |
| }; |
| |
| struct tu_bo |
| { |
| uint32_t gem_handle; |
| uint64_t size; |
| uint64_t iova; |
| void *map; |
| }; |
| |
| enum global_shader { |
| GLOBAL_SH_VS_BLIT, |
| GLOBAL_SH_VS_CLEAR, |
| GLOBAL_SH_FS_BLIT, |
| GLOBAL_SH_FS_BLIT_ZSCALE, |
| GLOBAL_SH_FS_COPY_MS, |
| GLOBAL_SH_FS_CLEAR0, |
| GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS, |
| GLOBAL_SH_COUNT, |
| }; |
| |
| #define TU_BORDER_COLOR_COUNT 4096 |
| #define TU_BORDER_COLOR_BUILTIN 6 |
| |
| #define TU_BLIT_SHADER_SIZE 1024 |
| |
| /* This struct defines the layout of the global_bo */ |
| struct tu6_global |
| { |
| /* clear/blit shaders */ |
| uint32_t shaders[TU_BLIT_SHADER_SIZE]; |
| |
| uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */ |
| uint32_t _pad0; |
| volatile uint32_t vsc_draw_overflow; |
| uint32_t _pad1; |
| volatile uint32_t vsc_prim_overflow; |
| uint32_t _pad2; |
| uint64_t predicate; |
| |
| /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, starts on a 32-byte boundary. */ |
| struct { |
| uint32_t offset; |
| uint32_t pad[7]; |
| } flush_base[4]; |
| |
| ALIGN16 uint32_t cs_indirect_xyz[3]; |
| |
| /* note: larger global bo will be used for customBorderColors */ |
| struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[]; |
| }; |
| #define gb_offset(member) offsetof(struct tu6_global, member) |
| #define global_iova(cmd, member) ((cmd)->device->global_bo.iova + gb_offset(member)) |
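| |
| /* Example (illustrative): the GPU address of the vsc_draw_overflow counter |
| * for a command buffer cmd is |
| * |
| *    uint64_t iova = global_iova(cmd, vsc_draw_overflow); |
| * |
| * which expands to |
| * cmd->device->global_bo.iova + offsetof(struct tu6_global, vsc_draw_overflow). |
| */ |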
| |
| /* extra space in vsc draw/prim streams */ |
| #define VSC_PAD 0x40 |
| |
| struct tu_device |
| { |
| struct vk_device vk; |
| struct tu_instance *instance; |
| |
| struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES]; |
| int queue_count[TU_MAX_QUEUE_FAMILIES]; |
| |
| struct tu_physical_device *physical_device; |
| int fd; |
| |
| struct ir3_compiler *compiler; |
| |
| /* Backup in-memory cache to be used if the app doesn't provide one */ |
| struct tu_pipeline_cache *mem_cache; |
| |
| #define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */ |
| |
| /* Currently the kernel driver uses a 32-bit GPU address space, but it |
| * should be impossible to go beyond 48 bits. |
| */ |
| struct { |
| struct tu_bo bo; |
| mtx_t construct_mtx; |
| bool initialized; |
| } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2]; |
| |
| struct tu_bo global_bo; |
| |
| /* the blob always seems to use 8K factor and 128K param sizes, so copy them */ |
| #define TU_TESS_FACTOR_SIZE (8 * 1024) |
| #define TU_TESS_PARAM_SIZE (128 * 1024) |
| #define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE) |
| /* Lazily allocated, protected by the device mutex. */ |
| struct tu_bo tess_bo; |
| |
| struct ir3_shader_variant *global_shaders[GLOBAL_SH_COUNT]; |
| uint64_t global_shader_va[GLOBAL_SH_COUNT]; |
| |
| uint32_t vsc_draw_strm_pitch; |
| uint32_t vsc_prim_strm_pitch; |
| BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT); |
| mtx_t mutex; |
| |
| /* bo list for submits: */ |
| struct drm_msm_gem_submit_bo *bo_list; |
| /* map bo handles to bo list index: */ |
| uint32_t *bo_idx; |
| uint32_t bo_count, bo_list_size, bo_idx_size; |
| mtx_t bo_mutex; |
| |
| /* Command streams to set pass index to a scratch reg */ |
| struct tu_cs *perfcntrs_pass_cs; |
| struct tu_cs_entry *perfcntrs_pass_cs_entries; |
| |
| /* Condition variable for timeline semaphore to notify waiters when a |
| * new submit is executed. */ |
| pthread_cond_t timeline_cond; |
| pthread_mutex_t submit_mutex; |
| |
| struct tu_autotune autotune; |
| |
| #ifdef ANDROID |
| const void *gralloc; |
| enum { |
| TU_GRALLOC_UNKNOWN, |
| TU_GRALLOC_CROS, |
| TU_GRALLOC_OTHER, |
| } gralloc_type; |
| #endif |
| |
| uint32_t submit_count; |
| |
| struct u_trace_context trace_context; |
| |
| #ifdef HAVE_PERFETTO |
| struct tu_perfetto_state perfetto; |
| #endif |
| }; |
| |
| void tu_init_clear_blit_shaders(struct tu_device *dev); |
| |
| void tu_destroy_clear_blit_shaders(struct tu_device *dev); |
| |
| VkResult |
| tu_device_submit_deferred_locked(struct tu_device *dev); |
| |
| VkResult |
| tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj); |
| |
| uint64_t |
| tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts); |
| |
| enum tu_bo_alloc_flags |
| { |
| TU_BO_ALLOC_NO_FLAGS = 0, |
| TU_BO_ALLOC_ALLOW_DUMP = 1 << 0, |
| TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1, |
| }; |
| |
| VkResult |
| tu_bo_init_new(struct tu_device *dev, struct tu_bo *bo, uint64_t size, |
| enum tu_bo_alloc_flags flags); |
| VkResult |
| tu_bo_init_dmabuf(struct tu_device *dev, |
| struct tu_bo *bo, |
| uint64_t size, |
| int fd); |
| int |
| tu_bo_export_dmabuf(struct tu_device *dev, struct tu_bo *bo); |
| void |
| tu_bo_finish(struct tu_device *dev, struct tu_bo *bo); |
| VkResult |
| tu_bo_map(struct tu_device *dev, struct tu_bo *bo); |
| |
| /* Get a scratch bo for use inside a command buffer. This will always return |
| * the same bo given the same size or similar sizes, so only one scratch bo |
| * can be used at the same time. It's meant for short-lived things where we |
| * need to write to some piece of memory, read from it, and then immediately |
| * discard it. |
| */ |
| VkResult |
| tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo); |
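| |
| /* A minimal usage sketch, assuming the caller checks the result and does |
| * its own synchronization against other users of the scratch bo: |
| * |
| *    struct tu_bo *scratch; |
| *    VkResult result = tu_get_scratch_bo(dev, 64, &scratch); |
| *    if (result != VK_SUCCESS) |
| *       return result; |
| *    // emit packets that write to, then read from, scratch->iova |
| */ |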
| |
| struct tu_cs_entry |
| { |
| /* No ownership */ |
| const struct tu_bo *bo; |
| |
| uint32_t size; |
| uint32_t offset; |
| }; |
| |
| struct tu_cs_memory { |
| uint32_t *map; |
| uint64_t iova; |
| }; |
| |
| struct tu_draw_state { |
| uint64_t iova : 48; |
| uint32_t size : 16; |
| }; |
| |
| enum tu_dynamic_state |
| { |
| /* re-use VK_DYNAMIC_STATE_ enums for non-extended dynamic states */ |
| TU_DYNAMIC_STATE_SAMPLE_LOCATIONS = VK_DYNAMIC_STATE_STENCIL_REFERENCE + 1, |
| TU_DYNAMIC_STATE_RB_DEPTH_CNTL, |
| TU_DYNAMIC_STATE_RB_STENCIL_CNTL, |
| TU_DYNAMIC_STATE_VB_STRIDE, |
| TU_DYNAMIC_STATE_RASTERIZER_DISCARD, |
| TU_DYNAMIC_STATE_COUNT, |
| /* no associated draw state: */ |
| TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY = TU_DYNAMIC_STATE_COUNT, |
| TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE, |
| /* re-use the line width enum as it uses GRAS_SU_CNTL: */ |
| TU_DYNAMIC_STATE_GRAS_SU_CNTL = VK_DYNAMIC_STATE_LINE_WIDTH, |
| }; |
| |
| enum tu_draw_state_group_id |
| { |
| TU_DRAW_STATE_PROGRAM_CONFIG, |
| TU_DRAW_STATE_PROGRAM, |
| TU_DRAW_STATE_PROGRAM_BINNING, |
| TU_DRAW_STATE_VB, |
| TU_DRAW_STATE_VI, |
| TU_DRAW_STATE_VI_BINNING, |
| TU_DRAW_STATE_RAST, |
| TU_DRAW_STATE_BLEND, |
| TU_DRAW_STATE_SHADER_GEOM_CONST, |
| TU_DRAW_STATE_FS_CONST, |
| TU_DRAW_STATE_DESC_SETS, |
| TU_DRAW_STATE_DESC_SETS_LOAD, |
| TU_DRAW_STATE_VS_PARAMS, |
| TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM, |
| TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM, |
| TU_DRAW_STATE_LRZ, |
| TU_DRAW_STATE_DEPTH_PLANE, |
| |
| /* dynamic state related draw states */ |
| TU_DRAW_STATE_DYNAMIC, |
| TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT, |
| }; |
| |
| enum tu_cs_mode |
| { |
| |
| /* |
| * A command stream in TU_CS_MODE_GROW mode grows automatically whenever it |
| * is full. tu_cs_begin must be called before command packet emission and |
| * tu_cs_end must be called after. |
| * |
| * This mode may create multiple entries internally. The entries must be |
| * submitted together. |
| */ |
| TU_CS_MODE_GROW, |
| |
| /* |
| * A command stream in TU_CS_MODE_EXTERNAL mode wraps an external, |
| * fixed-size buffer. tu_cs_begin and tu_cs_end are optional and have no |
| * effect on it. |
| * |
| * This mode does not create any entry or any BO. |
| */ |
| TU_CS_MODE_EXTERNAL, |
| |
| /* |
| * A command stream in TU_CS_MODE_SUB_STREAM mode does not support direct |
| * command packet emission. tu_cs_begin_sub_stream must be called to get a |
| * sub-stream to emit command packets to. When done with the sub-stream, |
| * tu_cs_end_sub_stream must be called. |
| * |
| * This mode does not create any entry internally. |
| */ |
| TU_CS_MODE_SUB_STREAM, |
| }; |
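| |
| /* A sketch of the TU_CS_MODE_SUB_STREAM protocol described above, assuming |
| * the tu_cs_begin_sub_stream()/tu_cs_end_sub_stream() helpers declared in |
| * tu_cs.h: |
| * |
| *    struct tu_cs sub; |
| *    VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 64, &sub); |
| *    if (result != VK_SUCCESS) |
| *       return result; |
| *    // ... emit at most 64 dwords into sub ... |
| *    struct tu_cs_entry entry = tu_cs_end_sub_stream(&cmd->sub_cs, &sub); |
| */ |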
| |
| struct tu_cs |
| { |
| uint32_t *start; |
| uint32_t *cur; |
| uint32_t *reserved_end; |
| uint32_t *end; |
| |
| struct tu_device *device; |
| enum tu_cs_mode mode; |
| uint32_t next_bo_size; |
| |
| struct tu_cs_entry *entries; |
| uint32_t entry_count; |
| uint32_t entry_capacity; |
| |
| struct tu_bo **bos; |
| uint32_t bo_count; |
| uint32_t bo_capacity; |
| |
| /* state for cond_exec_start/cond_exec_end */ |
| uint32_t cond_flags; |
| uint32_t *cond_dwords; |
| }; |
| |
| struct tu_device_memory |
| { |
| struct vk_object_base base; |
| |
| struct tu_bo bo; |
| }; |
| |
| struct tu_descriptor_range |
| { |
| uint64_t va; |
| uint32_t size; |
| }; |
| |
| struct tu_descriptor_set |
| { |
| struct vk_object_base base; |
| |
| /* Link to descriptor pool's desc_sets list. */ |
| struct list_head pool_link; |
| |
| struct tu_descriptor_set_layout *layout; |
| struct tu_descriptor_pool *pool; |
| uint32_t size; |
| |
| uint64_t va; |
| uint32_t *mapped_ptr; |
| |
| uint32_t *dynamic_descriptors; |
| }; |
| |
| struct tu_descriptor_pool_entry |
| { |
| uint32_t offset; |
| uint32_t size; |
| struct tu_descriptor_set *set; |
| }; |
| |
| struct tu_descriptor_pool |
| { |
| struct vk_object_base base; |
| |
| struct tu_bo bo; |
| uint64_t current_offset; |
| uint64_t size; |
| |
| uint8_t *host_memory_base; |
| uint8_t *host_memory_ptr; |
| uint8_t *host_memory_end; |
| uint8_t *host_bo; |
| |
| struct list_head desc_sets; |
| |
| uint32_t entry_count; |
| uint32_t max_entry_count; |
| struct tu_descriptor_pool_entry entries[0]; |
| }; |
| |
| struct tu_descriptor_update_template_entry |
| { |
| VkDescriptorType descriptor_type; |
| |
| /* The number of descriptors to update */ |
| uint32_t descriptor_count; |
| |
| /* Into mapped_ptr or dynamic_descriptors, in units of the respective array |
| */ |
| uint32_t dst_offset; |
| |
| /* In dwords. Not valid/used for dynamic descriptors */ |
| uint32_t dst_stride; |
| |
| uint32_t buffer_offset; |
| |
| /* Only valid for combined image samplers and samplers */ |
| uint16_t has_sampler; |
| |
| /* In bytes */ |
| size_t src_offset; |
| size_t src_stride; |
| |
| /* For push descriptors */ |
| const struct tu_sampler *immutable_samplers; |
| }; |
| |
| struct tu_descriptor_update_template |
| { |
| struct vk_object_base base; |
| |
| uint32_t entry_count; |
| VkPipelineBindPoint bind_point; |
| struct tu_descriptor_update_template_entry entry[0]; |
| }; |
| |
| struct tu_buffer |
| { |
| struct vk_object_base base; |
| |
| VkDeviceSize size; |
| |
| VkBufferUsageFlags usage; |
| VkBufferCreateFlags flags; |
| |
| struct tu_bo *bo; |
| uint64_t iova; |
| }; |
| |
| const char * |
| tu_get_debug_option_name(int id); |
| |
| const char * |
| tu_get_perftest_option_name(int id); |
| |
| struct tu_descriptor_state |
| { |
| struct tu_descriptor_set *sets[MAX_SETS]; |
| struct tu_descriptor_set push_set; |
| uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS * A6XX_TEX_CONST_DWORDS]; |
| }; |
| |
| enum tu_cmd_dirty_bits |
| { |
| TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0), |
| TU_CMD_DIRTY_VB_STRIDE = BIT(1), |
| TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2), |
| TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3), |
| TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4), |
| TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5), |
| TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6), |
| TU_CMD_DIRTY_SHADER_CONSTS = BIT(7), |
| TU_CMD_DIRTY_LRZ = BIT(8), |
| TU_CMD_DIRTY_VS_PARAMS = BIT(9), |
| TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10), |
| /* all draw states were disabled and need to be re-enabled: */ |
| TU_CMD_DIRTY_DRAW_STATE = BIT(11) |
| }; |
| |
| /* There are only three cache domains we have to care about: the CCU, or |
| * color cache unit, which is used for color and depth/stencil attachments |
| * and copy/blit destinations and is conceptually split into color and depth |
| * domains, and the universal cache, or UCHE, which is used for pretty much |
| * everything else, except for the CP (uncached) and the host. We need to |
| * flush whenever data crosses these boundaries. |
| */ |
| |
| enum tu_cmd_access_mask { |
| TU_ACCESS_UCHE_READ = 1 << 0, |
| TU_ACCESS_UCHE_WRITE = 1 << 1, |
| TU_ACCESS_CCU_COLOR_READ = 1 << 2, |
| TU_ACCESS_CCU_COLOR_WRITE = 1 << 3, |
| TU_ACCESS_CCU_DEPTH_READ = 1 << 4, |
| TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5, |
| |
| /* Experiments have shown that while it's safe to avoid flushing the CCU |
| * after each blit/renderpass, it's not safe to assume that subsequent |
| * lookups with a different attachment state will hit unflushed cache |
| * entries. That is, the CCU needs to be flushed and possibly invalidated |
| * when accessing memory with a different attachment state. Writing to an |
| * attachment under the following conditions after clearing using the |
| * normal 2d engine path is known to have issues: |
| * |
| * - It isn't the 0'th layer. |
| * - There is more than one attachment, and this isn't the 0'th attachment |
| * (this seems to also depend on the cpp of the attachments). |
| * |
| * Our best guess is that the layer/MRT state is used when computing |
| * the location of a cache entry in CCU, to avoid conflicts. We assume that |
| * any access in a renderpass after or before an access by a transfer needs |
| * a flush/invalidate, and use the _INCOHERENT variants to represent access |
| * by a renderpass. |
| */ |
| TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6, |
| TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7, |
| TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8, |
| TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9, |
| |
| /* Accesses which bypass any cache, e.g. writes via the host, |
| * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE. |
| */ |
| TU_ACCESS_SYSMEM_READ = 1 << 10, |
| TU_ACCESS_SYSMEM_WRITE = 1 << 11, |
| |
| /* Memory writes from the CP start in-order with draws and event writes, |
| * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read. |
| */ |
| TU_ACCESS_CP_WRITE = 1 << 12, |
| |
| TU_ACCESS_READ = |
| TU_ACCESS_UCHE_READ | |
| TU_ACCESS_CCU_COLOR_READ | |
| TU_ACCESS_CCU_DEPTH_READ | |
| TU_ACCESS_CCU_COLOR_INCOHERENT_READ | |
| TU_ACCESS_CCU_DEPTH_INCOHERENT_READ | |
| TU_ACCESS_SYSMEM_READ, |
| |
| TU_ACCESS_WRITE = |
| TU_ACCESS_UCHE_WRITE | |
| TU_ACCESS_CCU_COLOR_WRITE | |
| TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE | |
| TU_ACCESS_CCU_DEPTH_WRITE | |
| TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE | |
| TU_ACCESS_SYSMEM_WRITE | |
| TU_ACCESS_CP_WRITE, |
| |
| TU_ACCESS_ALL = |
| TU_ACCESS_READ | |
| TU_ACCESS_WRITE, |
| }; |
| |
| /* Starting with a6xx, the pipeline is split into several "clusters" (really |
| * pipeline stages). Each stage has its own pair of register banks and can |
| * switch them independently, so that earlier stages can run ahead of later |
| * ones; e.g. the FS of draw N and the VS of draw N + 1 can be executing at |
| * the same time. |
| * |
| * As a result of this, we need to insert a WFI when an earlier stage depends |
| * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any |
| * pending WFI's to complete before starting, and usually even before |
| * reading indirect params, so a WFI also acts as a full "pipeline stall". |
| * |
| * Note, the names of the stages come from CLUSTER_* in devcoredump. We |
| * include all the stages for completeness, even ones which do not read/write |
| * anything. |
| */ |
| |
| enum tu_stage { |
| /* This doesn't correspond to a cluster, but we need it for tracking |
| * indirect draw parameter reads etc. |
| */ |
| TU_STAGE_CP, |
| |
| /* - Fetch index buffer |
| * - Fetch vertex attributes, dispatch VS |
| */ |
| TU_STAGE_FE, |
| |
| /* Execute all geometry stages (VS thru GS) */ |
| TU_STAGE_SP_VS, |
| |
| /* Write to VPC, do primitive assembly. */ |
| TU_STAGE_PC_VS, |
| |
| /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according |
| * to devcoredump, so presumably this stage stalls on TU_STAGE_PS before |
| * dispatching fragments when early depth testing is enabled? However, |
| * GRAS reads and writes LRZ directly. |
| */ |
| TU_STAGE_GRAS, |
| |
| /* Execute FS */ |
| TU_STAGE_SP_PS, |
| |
| /* - Fragment tests |
| * - Write color/depth |
| * - Streamout writes (???) |
| * - Varying interpolation (???) |
| */ |
| TU_STAGE_PS, |
| }; |
| |
| enum tu_cmd_flush_bits { |
| TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0, |
| TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1, |
| TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2, |
| TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3, |
| TU_CMD_FLAG_CACHE_FLUSH = 1 << 4, |
| TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5, |
| TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6, |
| TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7, |
| TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8, |
| |
| TU_CMD_FLAG_ALL_FLUSH = |
| TU_CMD_FLAG_CCU_FLUSH_DEPTH | |
| TU_CMD_FLAG_CCU_FLUSH_COLOR | |
| TU_CMD_FLAG_CACHE_FLUSH | |
| /* Treat the CP as a sort of "cache" which may need to be "flushed" via |
| * waiting for writes to land with WAIT_FOR_MEM_WRITES. |
| */ |
| TU_CMD_FLAG_WAIT_MEM_WRITES, |
| |
| TU_CMD_FLAG_ALL_INVALIDATE = |
| TU_CMD_FLAG_CCU_INVALIDATE_DEPTH | |
| TU_CMD_FLAG_CCU_INVALIDATE_COLOR | |
| TU_CMD_FLAG_CACHE_INVALIDATE, |
| }; |
| |
| /* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty |
| * heavy, involving a CCU cache flush/invalidate and a WFI in order to change |
| * which part of the gmem is used by the CCU. Here we keep track of the |
| * current state of the CCU. |
| */ |
| enum tu_cmd_ccu_state { |
| TU_CMD_CCU_SYSMEM, |
| TU_CMD_CCU_GMEM, |
| TU_CMD_CCU_UNKNOWN, |
| }; |
| |
| struct tu_cache_state { |
| /* Caches which must be made available (flushed) eventually if there are |
| * any users outside that cache domain, and caches which must be |
| * invalidated eventually if there are any reads. |
| */ |
| enum tu_cmd_flush_bits pending_flush_bits; |
| /* Pending flushes */ |
| enum tu_cmd_flush_bits flush_bits; |
| }; |
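| |
| /* Worked example (illustrative): a transfer-path blit writes through the |
| * CCU color cache, so TU_CMD_FLAG_CCU_FLUSH_COLOR is added to |
| * pending_flush_bits. When a later access reads that data through a |
| * different domain, e.g. a UCHE texture fetch, the pending flush is |
| * promoted to flush_bits and TU_CMD_FLAG_CACHE_INVALIDATE is added so the |
| * UCHE doesn't return stale data. |
| */ |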
| |
| enum tu_lrz_force_disable_mask { |
| TU_LRZ_FORCE_DISABLE_LRZ = 1 << 0, |
| TU_LRZ_FORCE_DISABLE_WRITE = 1 << 1, |
| }; |
| |
| enum tu_lrz_direction { |
| TU_LRZ_UNKNOWN, |
| /* Depth func less/less-than: */ |
| TU_LRZ_LESS, |
| /* Depth func greater/greater-than: */ |
| TU_LRZ_GREATER, |
| }; |
| |
| struct tu_lrz_pipeline |
| { |
| uint32_t force_disable_mask; |
| bool fs_has_kill; |
| bool force_late_z; |
| bool early_fragment_tests; |
| }; |
| |
| struct tu_lrz_state |
| { |
| /* Depth/stencil image currently in use for LRZ */ |
| struct tu_image *image; |
| bool valid : 1; |
| struct tu_draw_state state; |
| enum tu_lrz_direction prev_direction; |
| }; |
| |
| struct tu_vs_params { |
| uint32_t vertex_offset; |
| uint32_t first_instance; |
| }; |
| |
| struct tu_cmd_state |
| { |
| uint32_t dirty; |
| |
| struct tu_pipeline *pipeline; |
| struct tu_pipeline *compute_pipeline; |
| |
| /* Vertex buffers, viewports, and scissors: |
| * the state for these can be updated partially, so we need to save it |
| * to be able to emit a complete draw state |
| */ |
| struct { |
| uint64_t base; |
| uint32_t size; |
| uint32_t stride; |
| } vb[MAX_VBS]; |
| VkViewport viewport[MAX_VIEWPORTS]; |
| VkRect2D scissor[MAX_SCISSORS]; |
| uint32_t max_viewport, max_scissor; |
| |
| /* for dynamic states that can't be emitted directly */ |
| uint32_t dynamic_stencil_mask; |
| uint32_t dynamic_stencil_wrmask; |
| uint32_t dynamic_stencil_ref; |
| |
| uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl; |
| uint32_t pc_raster_cntl, vpc_unknown_9107; |
| enum pc_di_primtype primtype; |
| bool primitive_restart_enable; |
| |
| /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */ |
| struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT]; |
| struct tu_draw_state vertex_buffers; |
| struct tu_draw_state shader_const[2]; |
| struct tu_draw_state desc_sets; |
| |
| struct tu_draw_state vs_params; |
| |
| /* Index buffer */ |
| uint64_t index_va; |
| uint32_t max_index_count; |
| uint8_t index_size; |
| |
| /* Because the streamout base has to be 32-byte aligned, |
| * there is an extra offset to deal with when it is |
| * unaligned |
| */ |
| uint8_t streamout_offset[IR3_MAX_SO_BUFFERS]; |
| |
| /* Renderpasses are tricky, because we may need to flush differently if |
| * using sysmem vs. gmem and therefore we have to delay any flushing that |
| * happens before a renderpass. So we have to have two copies of the flush |
| * state, one for intra-renderpass flushes (i.e. renderpass dependencies) |
| * and one for outside a renderpass. |
| */ |
| struct tu_cache_state cache; |
| struct tu_cache_state renderpass_cache; |
| |
| enum tu_cmd_ccu_state ccu_state; |
| |
| const struct tu_render_pass *pass; |
| const struct tu_subpass *subpass; |
| const struct tu_framebuffer *framebuffer; |
| VkRect2D render_area; |
| |
| const struct tu_image_view **attachments; |
| |
| bool xfb_used; |
| bool has_tess; |
| bool tessfactor_addr_set; |
| bool has_subpass_predication; |
| bool predication_active; |
| bool disable_gmem; |
| enum a5xx_line_mode line_mode; |
| |
| uint32_t drawcall_count; |
| |
| /* A calculated "draw cost" value for renderpass, which tries to |
| * estimate the bandwidth-per-sample of all the draws according |
| * to: |
| * |
| * foreach_draw (...) { |
| * cost += num_frag_outputs; |
| * if (blend_enabled) |
| * cost += num_blend_enabled; |
| * if (depth_test_enabled) |
| * cost++; |
| * if (depth_write_enabled) |
| * cost++; |
| * } |
| * |
| * The idea is that each sample-passed minimally does one write |
| * per MRT. If blend is enabled, the hw will additionally do |
| * a framebuffer read per sample-passed (for each MRT with blend |
| * enabled). If depth-test is enabled, the hw will additionally do |
| * a depth buffer read. If depth-write is enabled, the hw will |
| * additionally do a depth buffer write. |
| * |
| * This does ignore depth buffer traffic for samples which do not |
| * pass due to depth-test fail, and some other details. But it is |
| * just intended to be a rough estimate that is easy to calculate. |
| */ |
| uint32_t total_drawcalls_cost; |
| |
| struct tu_lrz_state lrz; |
| |
| struct tu_draw_state depth_plane_state; |
| |
| struct tu_vs_params last_vs_params; |
| }; |
| |
| struct tu_cmd_pool |
| { |
| struct vk_object_base base; |
| |
| VkAllocationCallbacks alloc; |
| struct list_head cmd_buffers; |
| struct list_head free_cmd_buffers; |
| uint32_t queue_family_index; |
| }; |
| |
| enum tu_cmd_buffer_status |
| { |
| TU_CMD_BUFFER_STATUS_INVALID, |
| TU_CMD_BUFFER_STATUS_INITIAL, |
| TU_CMD_BUFFER_STATUS_RECORDING, |
| TU_CMD_BUFFER_STATUS_EXECUTABLE, |
| TU_CMD_BUFFER_STATUS_PENDING, |
| }; |
| |
| struct tu_cmd_buffer |
| { |
| struct vk_command_buffer vk; |
| |
| struct tu_device *device; |
| |
| struct tu_cmd_pool *pool; |
| struct list_head pool_link; |
| |
| struct u_trace trace; |
| struct u_trace_iterator trace_renderpass_start; |
| struct u_trace_iterator trace_renderpass_end; |
| |
| struct list_head renderpass_autotune_results; |
| |
| VkCommandBufferUsageFlags usage_flags; |
| VkCommandBufferLevel level; |
| enum tu_cmd_buffer_status status; |
| |
| struct tu_cmd_state state; |
| uint32_t queue_family_index; |
| |
| uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4]; |
| VkShaderStageFlags push_constant_stages; |
| struct tu_descriptor_set meta_push_descriptors; |
| |
| struct tu_descriptor_state descriptors[MAX_BIND_POINTS]; |
| |
| VkResult record_result; |
| |
| struct tu_cs cs; |
| struct tu_cs draw_cs; |
| struct tu_cs tile_store_cs; |
| struct tu_cs draw_epilogue_cs; |
| struct tu_cs sub_cs; |
| |
| uint32_t vsc_draw_strm_pitch; |
| uint32_t vsc_prim_strm_pitch; |
| }; |
| |
| /* Temporary struct for tracking a register state to be written, used by |
| * a6xx-pack.h and tu_cs_emit_regs() |
| */ |
| struct tu_reg_value { |
| uint32_t reg; |
| uint64_t value; |
| bool is_address; |
| struct tu_bo *bo; |
| bool bo_write; |
| uint32_t bo_offset; |
| uint32_t bo_shift; |
| }; |
| |
| |
| void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer, |
| struct tu_cs *cs); |
| |
| void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer, |
| struct tu_cs *cs, |
| enum tu_cmd_ccu_state ccu_state); |
| |
| void |
| tu6_emit_event_write(struct tu_cmd_buffer *cmd, |
| struct tu_cs *cs, |
| enum vgt_event_type event); |
| |
| static inline struct tu_descriptor_state * |
| tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer, |
| VkPipelineBindPoint bind_point) |
| { |
| return &cmd_buffer->descriptors[bind_point]; |
| } |
| |
| struct tu_event |
| { |
| struct vk_object_base base; |
| struct tu_bo bo; |
| }; |
| |
| struct tu_push_constant_range |
| { |
| uint32_t lo; |
| uint32_t count; |
| }; |
| |
| struct tu_shader |
| { |
| struct ir3_shader *ir3_shader; |
| |
| struct tu_push_constant_range push_consts; |
| uint8_t active_desc_sets; |
| bool multi_pos_output; |
| }; |
| |
| bool |
| tu_nir_lower_multiview(nir_shader *nir, uint32_t mask, bool *multi_pos_output, |
| struct tu_device *dev); |
| |
| nir_shader * |
| tu_spirv_to_nir(struct tu_device *dev, |
| const VkPipelineShaderStageCreateInfo *stage_info, |
| gl_shader_stage stage); |
| |
| struct tu_shader * |
| tu_shader_create(struct tu_device *dev, |
| nir_shader *nir, |
| const VkPipelineShaderStageCreateInfo *stage_info, |
| unsigned multiview_mask, |
| struct tu_pipeline_layout *layout, |
| const VkAllocationCallbacks *alloc); |
| |
| void |
| tu_shader_destroy(struct tu_device *dev, |
| struct tu_shader *shader, |
| const VkAllocationCallbacks *alloc); |
| |
| struct tu_program_descriptor_linkage |
| { |
| struct ir3_const_state const_state; |
| |
| uint32_t constlen; |
| |
| struct tu_push_constant_range push_consts; |
| }; |
| |
| struct tu_pipeline_executable { |
| gl_shader_stage stage; |
| |
| struct ir3_info stats; |
| bool is_binning; |
| |
| char *nir_from_spirv; |
| char *nir_final; |
| char *disasm; |
| }; |
| |
| struct tu_pipeline |
| { |
| struct vk_object_base base; |
| |
| struct tu_cs cs; |
| |
| /* Separate BO for private memory since it should be GPU-writable */ |
| struct tu_bo pvtmem_bo; |
| |
| struct tu_pipeline_layout *layout; |
| |
| bool need_indirect_descriptor_sets; |
| VkShaderStageFlags active_stages; |
| uint32_t active_desc_sets; |
| |
| /* mask of enabled dynamic states |
| * if BIT(i) is set, pipeline->dynamic_state[i] is *NOT* used |
| */ |
| uint32_t dynamic_state_mask; |
| struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT]; |
| |
| /* for dynamic states which use the same register: */ |
| uint32_t gras_su_cntl, gras_su_cntl_mask; |
| uint32_t rb_depth_cntl, rb_depth_cntl_mask; |
| uint32_t rb_stencil_cntl, rb_stencil_cntl_mask; |
| uint32_t pc_raster_cntl, pc_raster_cntl_mask; |
| uint32_t vpc_unknown_9107, vpc_unknown_9107_mask; |
| uint32_t stencil_wrmask; |
| |
| bool rb_depth_cntl_disable; |
| |
| enum a5xx_line_mode line_mode; |
| |
| /* draw states for the pipeline */ |
| struct tu_draw_state load_state, rast_state, blend_state; |
| |
| /* for vertex buffers state */ |
| uint32_t num_vbs; |
| |
| struct |
| { |
| struct tu_draw_state config_state; |
| struct tu_draw_state state; |
| struct tu_draw_state binning_state; |
| |
| struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES]; |
| } program; |
| |
| struct |
| { |
| struct tu_draw_state state; |
| struct tu_draw_state binning_state; |
| } vi; |
| |
| struct |
| { |
| enum pc_di_primtype primtype; |
| bool primitive_restart; |
| } ia; |
| |
| struct |
| { |
| uint32_t patch_type; |
| uint32_t param_stride; |
| bool upper_left_domain_origin; |
| } tess; |
| |
| struct |
| { |
| uint32_t local_size[3]; |
| uint32_t subgroup_size; |
| } compute; |
| |
| bool provoking_vertex_last; |
| |
| struct tu_lrz_pipeline lrz; |
| |
| /* Base drawcall cost for sysmem vs gmem autotuner */ |
| uint8_t drawcall_base_cost; |
| |
| void *executables_mem_ctx; |
| /* tu_pipeline_executable */ |
| struct util_dynarray executables; |
| }; |
| |
| void |
| tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_viewport); |
| |
| void |
| tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scs, uint32_t scissor_count); |
| |
| void |
| tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image, const VkClearValue *value); |
| |
| void |
| tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc); |
| |
| void |
| tu6_emit_depth_bias(struct tu_cs *cs, |
| float constant_factor, |
| float clamp, |
| float slope_factor); |
| |
| void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples, |
| enum a5xx_line_mode line_mode); |
| |
| void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2); |
| |
| void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1); |
| |
| void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs); |
| |
| void tu6_apply_depth_bounds_workaround(struct tu_device *device, |
| uint32_t *rb_depth_cntl); |
| |
| struct tu_pvtmem_config { |
| uint64_t iova; |
| uint32_t per_fiber_size; |
| uint32_t per_sp_size; |
| bool per_wave; |
| }; |
| |
| void |
| tu6_emit_xs_config(struct tu_cs *cs, |
| gl_shader_stage stage, |
| const struct ir3_shader_variant *xs); |
| |
| void |
| tu6_emit_xs(struct tu_cs *cs, |
| gl_shader_stage stage, |
| const struct ir3_shader_variant *xs, |
| const struct tu_pvtmem_config *pvtmem, |
| uint64_t binary_iova); |
| |
| void |
| tu6_emit_vpc(struct tu_cs *cs, |
| const struct ir3_shader_variant *vs, |
| const struct ir3_shader_variant *hs, |
| const struct ir3_shader_variant *ds, |
| const struct ir3_shader_variant *gs, |
| const struct ir3_shader_variant *fs, |
| uint32_t patch_control_points); |
| |
| void |
| tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs); |
| |
| struct tu_image_view; |
| |
| void |
| tu_resolve_sysmem(struct tu_cmd_buffer *cmd, |
| struct tu_cs *cs, |
| const struct tu_image_view *src, |
| const struct tu_image_view *dst, |
| uint32_t layer_mask, |
| uint32_t layers, |
| const VkRect2D *rect); |
| |
| void |
| tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, |
| struct tu_cs *cs, |
| uint32_t a, |
| const VkRenderPassBeginInfo *info); |
| |
| void |
| tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd, |
| struct tu_cs *cs, |
| uint32_t a, |
| const VkRenderPassBeginInfo *info); |
| |
| void |
| tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, |
| struct tu_cs *cs, |
| uint32_t a, |
| bool force_load); |
| |
| /* expose this function to be able to emit a load without checking LOAD_OP */ |
| void |
| tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a); |
| |
| /* note: gmem store can also resolve */ |
| void |
| tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, |
| struct tu_cs *cs, |
| uint32_t a, |
| uint32_t gmem_a); |
| |
| struct tu_native_format |
| { |
| enum a6xx_format fmt : 8; |
| enum a3xx_color_swap swap : 8; |
| enum a6xx_tile_mode tile_mode : 8; |
| }; |
| |
| enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format); |
| bool tu6_format_vtx_supported(VkFormat format); |
| struct tu_native_format tu6_format_vtx(VkFormat format); |
| bool tu6_format_color_supported(enum pipe_format format); |
| struct tu_native_format tu6_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode); |
| bool tu6_format_texture_supported(enum pipe_format format); |
| struct tu_native_format tu6_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode); |
| |
| static inline enum a6xx_format |
| tu6_base_format(enum pipe_format format) |
| { |
| /* note: tu6_format_color doesn't care about tiling for .fmt field */ |
| return tu6_format_color(format, TILE6_LINEAR).fmt; |
| } |
| |
| struct tu_image |
| { |
| struct vk_object_base base; |
| |
| /* The original VkFormat provided by the client. This may not match any |
| * of the actual surface formats. |
| */ |
| VkFormat vk_format; |
| uint32_t level_count; |
| uint32_t layer_count; |
| |
| struct fdl_layout layout[3]; |
| uint32_t total_size; |
| |
| #ifdef ANDROID |
| /* For VK_ANDROID_native_buffer, the WSI image owns the memory. */ |
| VkDeviceMemory owned_memory; |
| #endif |
| |
| /* Set when bound */ |
| struct tu_bo *bo; |
| uint64_t iova; |
| |
| uint32_t lrz_height; |
| uint32_t lrz_pitch; |
| uint32_t lrz_offset; |
| |
| bool shareable; |
| }; |
| |
| static inline uint32_t |
| tu_get_layerCount(const struct tu_image *image, |
| const VkImageSubresourceRange *range) |
| { |
| return range->layerCount == VK_REMAINING_ARRAY_LAYERS |
| ? image->layer_count - range->baseArrayLayer |
| : range->layerCount; |
| } |
| |
| static inline uint32_t |
| tu_get_levelCount(const struct tu_image *image, |
| const VkImageSubresourceRange *range) |
| { |
| return range->levelCount == VK_REMAINING_MIP_LEVELS |
| ? image->level_count - range->baseMipLevel |
| : range->levelCount; |
| } |
| |
| enum pipe_format tu6_plane_format(VkFormat format, uint32_t plane); |
| |
| uint32_t tu6_plane_index(VkFormat format, VkImageAspectFlags aspect_mask); |
| |
| enum pipe_format tu_format_for_aspect(enum pipe_format format, |
| VkImageAspectFlags aspect_mask); |
| |
| struct tu_image_view |
| { |
| struct vk_object_base base; |
| |
| struct tu_image *image; /**< VkImageViewCreateInfo::image */ |
| |
| struct fdl6_view view; |
| |
| /* for d32s8 separate stencil */ |
| uint64_t stencil_base_addr; |
| uint32_t stencil_layer_size; |
| uint32_t stencil_PITCH; |
| }; |
| |
| struct tu_sampler_ycbcr_conversion { |
| struct vk_object_base base; |
| |
| VkFormat format; |
| VkSamplerYcbcrModelConversion ycbcr_model; |
| VkSamplerYcbcrRange ycbcr_range; |
| VkComponentMapping components; |
| VkChromaLocation chroma_offsets[2]; |
| VkFilter chroma_filter; |
| }; |
| |
| struct tu_sampler { |
| struct vk_object_base base; |
| |
| uint32_t descriptor[A6XX_TEX_SAMP_DWORDS]; |
| struct tu_sampler_ycbcr_conversion *ycbcr_sampler; |
| }; |
| |
| void |
| tu_cs_image_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer); |
| |
| void |
| tu_cs_image_ref_2d(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, bool src); |
| |
| void |
| tu_cs_image_flag_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer); |
| |
| void |
| tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); |
| |
| #define tu_image_view_stencil(iview, x) \ |
| ((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_8_UINT)) |
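| |
| /* For example (illustrative), tu_image_view_stencil(iview, RB_MRT_BUF_INFO) |
| * yields iview->view.RB_MRT_BUF_INFO with its COLOR_FORMAT field overridden |
| * to FMT6_8_UINT, i.e. the d32s8 stencil plane treated as an 8-bit color |
| * attachment. |
| */ |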
| |
| VkResult |
| tu_gralloc_info(struct tu_device *device, |
| const VkNativeBufferANDROID *gralloc_info, |
| int *dma_buf, |
| uint64_t *modifier); |
| |
| VkResult |
| tu_import_memory_from_gralloc_handle(VkDevice device_h, |
| int dma_buf, |
| const VkAllocationCallbacks *alloc, |
| VkImage image_h); |
| |
| void |
| tu_image_view_init(struct tu_image_view *iview, |
| const VkImageViewCreateInfo *pCreateInfo, |
| bool limited_z24s8); |
| |
| bool |
| tiling_possible(VkFormat format); |
| |
| bool |
| ubwc_possible(VkFormat format, VkImageType type, VkImageUsageFlags usage, VkImageUsageFlags stencil_usage, |
| const struct fd_dev_info *info, VkSampleCountFlagBits samples); |
| |
| struct tu_buffer_view |
| { |
| struct vk_object_base base; |
| |
| uint32_t descriptor[A6XX_TEX_CONST_DWORDS]; |
| |
| struct tu_buffer *buffer; |
| }; |
| void |
| tu_buffer_view_init(struct tu_buffer_view *view, |
| struct tu_device *device, |
| const VkBufferViewCreateInfo *pCreateInfo); |
| |
| struct tu_attachment_info |
| { |
| struct tu_image_view *attachment; |
| }; |
| |
| struct tu_framebuffer |
| { |
| struct vk_object_base base; |
| |
| uint32_t width; |
| uint32_t height; |
| uint32_t layers; |
| |
| /* size of the first tile */ |
| VkExtent2D tile0; |
| /* number of tiles */ |
| VkExtent2D tile_count; |
| |
| /* size of the first VSC pipe */ |
| VkExtent2D pipe0; |
| /* number of VSC pipes */ |
| VkExtent2D pipe_count; |
| |
| /* pipe register values */ |
| uint32_t pipe_config[MAX_VSC_PIPES]; |
| uint32_t pipe_sizes[MAX_VSC_PIPES]; |
| |
| uint32_t attachment_count; |
| struct tu_attachment_info attachments[0]; |
| }; |
| |
| void |
| tu_framebuffer_tiling_config(struct tu_framebuffer *fb, |
| const struct tu_device *device, |
| const struct tu_render_pass *pass); |
| |
| struct tu_subpass_barrier { |
| VkPipelineStageFlags src_stage_mask; |
| VkPipelineStageFlags dst_stage_mask; |
| VkAccessFlags src_access_mask; |
| VkAccessFlags dst_access_mask; |
| bool incoherent_ccu_color, incoherent_ccu_depth; |
| }; |
| |
| struct tu_subpass_attachment |
| { |
| uint32_t attachment; |
| |
| /* For input attachments, true if it needs to be patched to refer to GMEM |
| * in GMEM mode. This is false if it hasn't already been written as an |
| * attachment. |
| */ |
| bool patch_input_gmem; |
| }; |
| |
| struct tu_subpass |
| { |
| uint32_t input_count; |
| uint32_t color_count; |
| uint32_t resolve_count; |
| bool resolve_depth_stencil; |
| |
| /* True if there is any feedback loop at all. */ |
| bool feedback; |
| |
| /* True if we must invalidate UCHE thanks to a feedback loop. */ |
| bool feedback_invalidate; |
| |
| struct tu_subpass_attachment *input_attachments; |
| struct tu_subpass_attachment *color_attachments; |
| struct tu_subpass_attachment *resolve_attachments; |
| struct tu_subpass_attachment depth_stencil_attachment; |
| |
| VkSampleCountFlagBits samples; |
| |
| uint32_t srgb_cntl; |
| uint32_t multiview_mask; |
| |
| struct tu_subpass_barrier start_barrier; |
| }; |
| |
| struct tu_render_pass_attachment |
| { |
| VkFormat format; |
| uint32_t samples; |
| uint32_t cpp; |
| VkImageAspectFlags clear_mask; |
| uint32_t clear_views; |
| bool load; |
| bool store; |
| int32_t gmem_offset; |
| /* for D32S8 separate stencil: */ |
| bool load_stencil; |
| bool store_stencil; |
| int32_t gmem_offset_stencil; |
| }; |
| |
| struct tu_render_pass |
| { |
| struct vk_object_base base; |
| |
| uint32_t attachment_count; |
| uint32_t subpass_count; |
| uint32_t gmem_pixels; |
| uint32_t tile_align_w; |
| struct tu_subpass_attachment *subpass_attachments; |
| struct tu_render_pass_attachment *attachments; |
| struct tu_subpass_barrier end_barrier; |
| struct tu_subpass subpasses[0]; |
| }; |
| |
| #define PERF_CNTRS_REG 4 |
| |
| struct tu_perf_query_data |
| { |
| uint32_t gid; /* group-id */ |
| uint32_t cid; /* countable-id within the group */ |
| uint32_t cntr_reg; /* counter register within the group */ |
| uint32_t pass; /* pass index in which the countables can be requested */ |
| uint32_t app_idx; /* index provided by apps */ |
| }; |
| |
| struct tu_query_pool |
| { |
| struct vk_object_base base; |
| |
| VkQueryType type; |
| uint32_t stride; |
| uint64_t size; |
| uint32_t pipeline_statistics; |
| struct tu_bo bo; |
| |
| /* For performance query */ |
| const struct fd_perfcntr_group *perf_group; |
| uint32_t perf_group_count; |
| uint32_t counter_index_count; |
| struct tu_perf_query_data perf_query_data[0]; |
| }; |
| |
| uint32_t |
| tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index); |
| |
| void |
| tu_update_descriptor_sets(const struct tu_device *device, |
| VkDescriptorSet overrideSet, |
| uint32_t descriptorWriteCount, |
| const VkWriteDescriptorSet *pDescriptorWrites, |
| uint32_t descriptorCopyCount, |
| const VkCopyDescriptorSet *pDescriptorCopies); |
| |
| void |
| tu_update_descriptor_set_with_template( |
| const struct tu_device *device, |
| struct tu_descriptor_set *set, |
| VkDescriptorUpdateTemplate descriptorUpdateTemplate, |
| const void *pData); |
| |
| VkResult |
| tu_physical_device_init(struct tu_physical_device *device, |
| struct tu_instance *instance); |
| VkResult |
| tu_enumerate_devices(struct tu_instance *instance); |
| |
| int |
| tu_device_get_gpu_timestamp(struct tu_device *dev, |
| uint64_t *ts); |
| |
| int |
| tu_device_get_suspend_count(struct tu_device *dev, |
| uint64_t *suspend_count); |
| |
| int |
| tu_drm_submitqueue_new(const struct tu_device *dev, |
| int priority, |
| uint32_t *queue_id); |
| |
| void |
| tu_drm_submitqueue_close(const struct tu_device *dev, uint32_t queue_id); |
| |
| int |
| tu_signal_syncs(struct tu_device *device, struct vk_sync *sync1, struct vk_sync *sync2); |
| |
| int |
| tu_syncobj_to_fd(struct tu_device *device, struct vk_sync *sync); |
| |
| VkResult |
| tu_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit); |
| |
| void |
| tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream, |
| void *ts_from, uint32_t from_offset, |
| void *ts_to, uint32_t to_offset, |
| uint32_t count); |
| |
| |
| VkResult |
| tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs, |
| struct u_trace **trace_copy); |
| |
| /* If we copy the trace and timestamps, we will have to free them. */ |
| struct tu_u_trace_cmd_data |
| { |
| struct tu_cs *timestamp_copy_cs; |
| struct u_trace *trace; |
| }; |
| |
| /* Data necessary to retrieve timestamps and clean up all |
| * associated resources afterwards. |
| */ |
| struct tu_u_trace_submission_data |
| { |
| uint32_t submission_id; |
| /* We have to know when timestamps are available; |
| * this sync object indicates it. |
| */ |
| struct tu_u_trace_syncobj *syncobj; |
| |
| uint32_t cmd_buffer_count; |
| uint32_t last_buffer_with_tracepoints; |
| struct tu_u_trace_cmd_data *cmd_trace_data; |
| }; |
| |
| VkResult |
| tu_u_trace_submission_data_create( |
| struct tu_device *device, |
| struct tu_cmd_buffer **cmd_buffers, |
| uint32_t cmd_buffer_count, |
| struct tu_u_trace_submission_data **submission_data); |
| |
| void |
| tu_u_trace_submission_data_finish( |
| struct tu_device *device, |
| struct tu_u_trace_submission_data *submission_data); |
| |
| #define TU_FROM_HANDLE(__tu_type, __name, __handle) \ |
| VK_FROM_HANDLE(__tu_type, __name, __handle) |
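| |
| /* Usage sketch: at the top of an entrypoint, |
| * |
| *    TU_FROM_HANDLE(tu_device, device, _device); |
| * |
| * declares a local `struct tu_device *device` resolved from the VkDevice |
| * handle via the casts defined below. |
| */ |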
| |
| VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer, |
| VK_OBJECT_TYPE_COMMAND_BUFFER) |
| VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE) |
| VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance, |
| VK_OBJECT_TYPE_INSTANCE) |
| VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice, |
| VK_OBJECT_TYPE_PHYSICAL_DEVICE) |
| VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE) |
| |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, base, VkCommandPool, |
| VK_OBJECT_TYPE_COMMAND_POOL) |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, base, VkBuffer, |
| VK_OBJECT_TYPE_BUFFER) |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, base, VkBufferView, |
| VK_OBJECT_TYPE_BUFFER_VIEW) |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, base, VkDescriptorPool, |
| VK_OBJECT_TYPE_DESCRIPTOR_POOL) |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, base, VkDescriptorSet, |
| VK_OBJECT_TYPE_DESCRIPTOR_SET) |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout, base, |
| VkDescriptorSetLayout, |
| VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT) |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template, base, |
| VkDescriptorUpdateTemplate, |
| VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE) |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory, |
| VK_OBJECT_TYPE_DEVICE_MEMORY) |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT) |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer, |
| VK_OBJECT_TYPE_FRAMEBUFFER) |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image, base, VkImage, VK_OBJECT_TYPE_IMAGE) |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, base, VkImageView, |
| VK_OBJECT_TYPE_IMAGE_VIEW) |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, base, VkPipelineCache, |
| VK_OBJECT_TYPE_PIPELINE_CACHE) |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, base, VkPipeline, |
| VK_OBJECT_TYPE_PIPELINE) |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, base, VkPipelineLayout, |
| VK_OBJECT_TYPE_PIPELINE_LAYOUT) |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, base, VkQueryPool, |
| VK_OBJECT_TYPE_QUERY_POOL) |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, base, VkRenderPass, |
| VK_OBJECT_TYPE_RENDER_PASS) |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler, |
| VK_OBJECT_TYPE_SAMPLER) |
| VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler_ycbcr_conversion, base, VkSamplerYcbcrConversion, |
| VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION) |
| |
| /* for TU_FROM_HANDLE with both VkFence and VkSemaphore: */ |
| #define tu_syncobj_from_handle(x) ((struct tu_syncobj*) (uintptr_t) (x)) |
| |
| void |
| update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask); |
| |
| #endif /* TU_PRIVATE_H */ |