/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_device.h"

#include "nvk_cmd_buffer.h"
#include "nvk_entrypoints.h"
#include "nvk_instance.h"
#include "nvk_physical_device.h"
#include "nvk_sampler.h"
#include "nvk_shader.h"
#include "layers/nvk_app_workarounds.h"
#include "nvkmd/nvkmd.h"

#include "vk_common_entrypoints.h"
#include "vk_drm_syncobj.h"
#include "vk_pipeline_cache.h"
#include "vk_debug_utils.h"
#include "util/u_printf.h"
#include "vulkan/wsi/wsi_common.h"

#include "cl9097.h"
#include "clb097.h"
#include "clb197.h"
#include "clc397.h"

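/* Shader local memory (SLM) is a single device-wide allocation that backs
 * per-lane local memory and the call/return stack for every warp that can
 * be resident at once.  It starts out empty and only ever grows; see
 * nvk_slm_area_ensure() below.
 */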
static void
nvk_slm_area_init(struct nvk_slm_area *area)
{
   memset(area, 0, sizeof(*area));
   simple_mtx_init(&area->mutex, mtx_plain);
}

static void
nvk_slm_area_finish(struct nvk_slm_area *area)
{
   simple_mtx_destroy(&area->mutex);
   if (area->mem)
      nvkmd_mem_unref(area->mem);
}

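/* Returns a new reference to the current SLM memory allocation (or NULL if
 * none has been allocated yet) along with the per-warp and per-TPC sizes it
 * was sized for.  The caller is responsible for dropping the reference with
 * nvkmd_mem_unref().
 */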
struct nvkmd_mem *
nvk_slm_area_get_mem_ref(struct nvk_slm_area *area,
                         uint32_t *bytes_per_warp_out,
                         uint32_t *bytes_per_tpc_out)
{
   simple_mtx_lock(&area->mutex);
   struct nvkmd_mem *mem = area->mem;
   if (mem)
      nvkmd_mem_ref(mem);
   *bytes_per_warp_out = area->bytes_per_warp;
   *bytes_per_tpc_out = area->bytes_per_tpc;
   simple_mtx_unlock(&area->mutex);

   return mem;
}

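/* Grows the SLM area, if needed, so that it can satisfy the given per-lane
 * local memory and per-warp call/return stack (CRS) requirements.  If the
 * current allocation is already large enough, this returns immediately
 * without taking the lock.
 */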
static VkResult
nvk_slm_area_ensure(struct nvk_device *dev,
                    struct nvk_slm_area *area,
                    uint32_t slm_bytes_per_lane,
                    uint32_t crs_bytes_per_warp)
{
   const struct nvk_physical_device *pdev = nvk_device_physical(dev);
   VkResult result;

   assert(slm_bytes_per_lane < (1 << 24));
   assert(crs_bytes_per_warp <= (1 << 20));
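   /* Each warp needs SLM for all 32 lanes plus space for its call/return
    * stack.
    */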
   uint64_t bytes_per_warp = slm_bytes_per_lane * 32 + crs_bytes_per_warp;

   /* The hardware seems to require this alignment for
    * NV9097_SET_SHADER_LOCAL_MEMORY_E_DEFAULT_SIZE_PER_WARP.
    */
   bytes_per_warp = align64(bytes_per_warp, 0x200);

   uint64_t bytes_per_mp = bytes_per_warp * pdev->info.max_warps_per_mp;
   uint64_t bytes_per_tpc = bytes_per_mp * pdev->info.mp_per_tpc;

   /* The hardware seems to require this alignment for
    * NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A_SIZE_LOWER.
    */
   bytes_per_tpc = align64(bytes_per_tpc, 0x8000);

   /* nvk_slm_area::bytes_per_tpc only ever increases so we can check this
    * outside the lock and exit early in the common case.  We only need to
    * take the lock if we're actually going to resize.
    *
    * Also, we only need to check bytes_per_tpc and not bytes_per_warp
    * because bytes_per_tpc scales monotonically with bytes_per_warp.
    */
   if (likely(bytes_per_tpc <= area->bytes_per_tpc))
      return VK_SUCCESS;

   uint64_t size = bytes_per_tpc * pdev->info.tpc_count;

   /* The hardware seems to require this alignment for
    * NV9097_SET_SHADER_LOCAL_MEMORY_D_SIZE_LOWER.
    */
   size = align64(size, 0x20000);

   struct nvkmd_mem *mem;
   result = nvkmd_dev_alloc_mem(dev->nvkmd, &dev->vk.base, size, 0,
                                NVKMD_MEM_LOCAL, &mem);
   if (result != VK_SUCCESS)
      return result;

   struct nvkmd_mem *unref_mem;
   simple_mtx_lock(&area->mutex);
   if (bytes_per_tpc <= area->bytes_per_tpc) {
      /* We lost the race; throw away our new allocation */
      assert(area->bytes_per_warp >= bytes_per_warp);
      unref_mem = mem;
   } else {
      unref_mem = area->mem;
      area->mem = mem;
      area->bytes_per_warp = bytes_per_warp;
      area->bytes_per_tpc = bytes_per_tpc;
   }
   simple_mtx_unlock(&area->mutex);

   if (unref_mem)
      nvkmd_mem_unref(unref_mem);

   return VK_SUCCESS;
}

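/* Allocates the host-visible buffer that NAK-compiled shaders write printf
 * output into and registers it with the common u_printf machinery.
 */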
static VkResult
nvk_init_printf(struct nvk_device *dev)
{
   VkResult result;
   struct nvkmd_mem *mem;
   const uint64_t mem_size = NAK_PRINTF_BUFFER_SIZE;

   result = nvkmd_dev_alloc_mapped_mem(dev->nvkmd, &dev->vk.base,
                                       mem_size, 0 /* align_B */,
                                       NVKMD_MEM_GART | NVKMD_MEM_COHERENT,
                                       NVKMD_MEM_MAP_RDWR,
                                       &mem);
   if (result != VK_SUCCESS)
      return result;

   u_printf_init(&dev->printf, mem, mem->map);

   return VK_SUCCESS;
}

static void
nvk_destroy_printf(struct nvk_device *dev)
{
   struct nvkmd_mem *mem = dev->printf.bo;
   u_printf_destroy(&dev->printf);
   nvkmd_mem_unref(mem);
}

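/* vk_device::check_status hook.  When NAK printf support is compiled in,
 * polling the device status also gives the common code a chance to process
 * any pending shader printf output.
 */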
static VkResult
nvk_device_check_status(struct vk_device *vk_dev)
{
   VkResult status = VK_SUCCESS;
   struct nvk_device *dev = container_of(vk_dev, struct nvk_device, vk);

   if (NAK_CAN_PRINTF)
      status = vk_check_printf_status(&dev->vk, &dev->printf);

   return status;
}

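/* vk_device::get_timestamp hook; returns the current GPU timestamp from the
 * kernel interface.
 */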
static VkResult
nvk_device_get_timestamp(struct vk_device *vk_dev, uint64_t *timestamp)
{
   struct nvk_device *dev = container_of(vk_dev, struct nvk_device, vk);
   *timestamp = nvkmd_dev_get_gpu_timestamp(dev->nvkmd);
   return VK_SUCCESS;
}

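/* Helper for building the layered dispatch tables.  Entrypoint sets are
 * added in priority order (app workarounds first, then nvk, WSI, and the
 * common framework).  Each call fills the base device table and every layer
 * table added before it, only filling entries that are still unset, so
 * higher-priority entrypoints win in the device table while each layer
 * table still gets the lower-priority entrypoints it needs to chain into.
 */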
struct dispatch_table_builder {
   struct vk_device_dispatch_table *tables[NVK_DISPATCH_TABLE_COUNT];
   bool used[NVK_DISPATCH_TABLE_COUNT];
   bool initialized[NVK_DISPATCH_TABLE_COUNT];
};

static void
add_entrypoints(struct dispatch_table_builder *b,
                const struct vk_device_entrypoint_table *entrypoints,
                enum nvk_dispatch_table table)
{
   for (int32_t i = table - 1; i >= NVK_DEVICE_DISPATCH_TABLE; i--) {
      if (i == NVK_DEVICE_DISPATCH_TABLE || b->used[i]) {
         vk_device_dispatch_table_from_entrypoints(b->tables[i], entrypoints,
                                                   !b->initialized[i]);
         b->initialized[i] = true;
      }
   }

   if (table < NVK_DISPATCH_TABLE_COUNT)
      b->used[table] = true;
}

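/* Installs per-application workaround entrypoints based on the app layer
 * key detected at instance creation.  Currently only Metro Exodus needs an
 * override (for GetSemaphoreCounterValue).
 */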
static void
init_app_workarounds_entrypoints(struct nvk_device *device,
                                 struct dispatch_table_builder *b)
{
   const struct nvk_physical_device *pdev = nvk_device_physical(device);
   const struct nvk_instance *instance = nvk_physical_device_instance(pdev);
   struct vk_device_entrypoint_table table = {0};

#define SET_ENTRYPOINT(app_layer, entrypoint) \
   table.entrypoint = app_layer##_##entrypoint;
   if (!strcmp(instance->app_layer, "metroexodus")) {
      SET_ENTRYPOINT(metro_exodus, GetSemaphoreCounterValue);
   }
#undef SET_ENTRYPOINT

   add_entrypoints(b, &table, NVK_APP_DISPATCH_TABLE);
}

static void
init_dispatch_tables(struct nvk_device *dev)
{
   struct dispatch_table_builder b = {0};
   b.tables[NVK_DEVICE_DISPATCH_TABLE] = &dev->vk.dispatch_table;
   b.tables[NVK_APP_DISPATCH_TABLE] = &dev->layer_dispatch.app;

   init_app_workarounds_entrypoints(dev, &b);

   add_entrypoints(&b, &nvk_device_entrypoints, NVK_DISPATCH_TABLE_COUNT);
   add_entrypoints(&b, &wsi_device_entrypoints, NVK_DISPATCH_TABLE_COUNT);
   add_entrypoints(&b, &vk_common_device_entrypoints, NVK_DISPATCH_TABLE_COUNT);
}

VKAPI_ATTR VkResult VKAPI_CALL
nvk_CreateDevice(VkPhysicalDevice physicalDevice,
                 const VkDeviceCreateInfo *pCreateInfo,
                 const VkAllocationCallbacks *pAllocator,
                 VkDevice *pDevice)
{
   VK_FROM_HANDLE(nvk_physical_device, pdev, physicalDevice);
   VkResult result = VK_ERROR_OUT_OF_HOST_MEMORY;
   struct nvk_device *dev;

   dev = vk_zalloc2(&pdev->vk.instance->alloc, pAllocator,
                    sizeof(*dev), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!dev)
      return vk_error(pdev, VK_ERROR_OUT_OF_HOST_MEMORY);

   result = vk_device_init(&dev->vk, &pdev->vk, NULL, pCreateInfo, pAllocator);
   if (result != VK_SUCCESS)
      goto fail_alloc;

   init_dispatch_tables(dev);

   dev->vk.shader_ops = &nvk_device_shader_ops;
   dev->vk.check_status = &nvk_device_check_status;

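   /* A device created with zero queues can never touch the GPU, so all of
    * the kernel device and GPU memory setup below is skipped in that case.
    */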
   uint32_t queue_count = 0;
   for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++)
      queue_count += pCreateInfo->pQueueCreateInfos[i].queueCount;

   if (queue_count > 0) {
      result = nvkmd_pdev_create_dev(pdev->nvkmd, &pdev->vk.base, &dev->nvkmd);
      if (result != VK_SUCCESS)
         goto fail_init;

      vk_device_set_drm_fd(&dev->vk, nvkmd_dev_get_drm_fd(dev->nvkmd));
      dev->vk.command_buffer_ops = &nvk_cmd_buffer_ops;

      dev->vk.get_timestamp = nvk_device_get_timestamp;
      dev->vk.copy_sync_payloads = vk_drm_syncobj_copy_payloads;

      result = nvk_upload_queue_init(dev, &dev->upload);
      if (result != VK_SUCCESS)
         goto fail_nvkmd;

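      /* Allocate a single page of VRAM, zero it through the CPU mapping,
       * and flush it to the GPU.  Its address is used below as the backing
       * store for the null image descriptor.
       */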
      result = nvkmd_dev_alloc_mapped_mem(dev->nvkmd, &pdev->vk.base,
                                          0x1000, 0, NVKMD_MEM_LOCAL,
                                          NVKMD_MEM_MAP_WR, &dev->zero_page);
      if (result != VK_SUCCESS)
         goto fail_upload;

      memset(dev->zero_page->map, 0, 0x1000);
      nvkmd_mem_sync_map_to_gpu(dev->zero_page, 0, 0x1000);
      nvkmd_mem_unmap(dev->zero_page, 0);

      result = nvk_descriptor_table_init(dev, &dev->images,
                                         sizeof(struct nil_descriptor),
                                         1024, 1024 * 1024);
      if (result != VK_SUCCESS)
         goto fail_zero_page;

      /* Reserve the descriptor at offset 0 to be the null descriptor */
      const struct nil_descriptor null_desc =
         nil_null_descriptor(&pdev->info, dev->zero_page->va->addr);

      ASSERTED uint32_t null_image_index;
      result = nvk_descriptor_table_add(dev, &dev->images,
                                        &null_desc, sizeof(null_desc),
                                        &null_image_index);
      assert(result == VK_SUCCESS);
      assert(null_image_index == 0);

      result = nvk_descriptor_table_init(dev, &dev->samplers,
                                         8 * 4 /* tsc entry size */,
                                         4096, 4096);
      if (result != VK_SUCCESS)
         goto fail_images;

      /* On Kepler and earlier, TXF takes a sampler but SPIR-V defines it as
       * not taking one so we need to reserve one at device create time.  If
       * we do so now then it will always have sampler index 0 so we can rely
       * on that in the compiler lowering code (similar to null descriptors).
       */
      if (pdev->info.cls_eng3d < MAXWELL_A) {
         const struct nvk_sampler_header txf_sampler =
            nvk_txf_sampler_header(pdev);

         ASSERTED uint32_t txf_sampler_index;
         result = nvk_descriptor_table_add(dev, &dev->samplers,
                                           &txf_sampler, sizeof(txf_sampler),
                                           &txf_sampler_index);
         assert(result == VK_SUCCESS);
         assert(txf_sampler_index == 0);
      }

      if (dev->vk.enabled_features.descriptorBuffer ||
          nvk_use_edb_buffer_views(pdev)) {
         result = nvk_edb_bview_cache_init(dev, &dev->edb_bview_cache);
         if (result != VK_SUCCESS)
            goto fail_samplers;
      }

      /* If we have a full BAR, go ahead and do shader uploads on the CPU.
       * Otherwise, we fall back to doing shader uploads via the upload queue.
       *
       * Also, the I-cache pre-fetches, and NVIDIA has informed us that
       * over-allocating shader BOs by 2K is sufficient.
       */
      enum nvkmd_mem_map_flags shader_map_flags = 0;
      if (pdev->info.bar_size_B >= pdev->info.vram_size_B)
         shader_map_flags = NVKMD_MEM_MAP_WR;
      result = nvk_heap_init(dev, &dev->shader_heap,
                             NVKMD_MEM_LOCAL, shader_map_flags,
                             2048 /* overalloc */,
                             pdev->info.cls_eng3d < VOLTA_A);
      if (result != VK_SUCCESS)
         goto fail_edb_bview_cache;

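      /* Heap backing VkEvent allocations.  It is CPU-mapped and coherent so
       * that the host can access event status directly.
       */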
      result = nvk_heap_init(dev, &dev->event_heap,
                             NVKMD_MEM_LOCAL | NVKMD_MEM_COHERENT,
                             NVKMD_MEM_MAP_WR,
                             0 /* overalloc */, false /* contiguous */);
      if (result != VK_SUCCESS)
         goto fail_shader_heap;

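      /* Pre-Maxwell B hardware needs compute dispatch descriptors (QMDs)
       * allocated from a dedicated CPU-mapped heap.
       */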
      if (pdev->info.cls_eng3d < MAXWELL_B) {
         result = nvk_heap_init(dev, &dev->qmd_heap,
                                NVKMD_MEM_LOCAL, NVKMD_MEM_MAP_WR,
                                0 /* overalloc */, false /* contiguous */);
         if (result != VK_SUCCESS)
            goto fail_event_heap;
      }

      nvk_slm_area_init(&dev->slm);

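      /* Fermi through Kepler class hardware needs a driver-allocated vertex
       * attribute buffer (VAB).
       */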
      if (pdev->info.cls_eng3d >= FERMI_A &&
          pdev->info.cls_eng3d < MAXWELL_A) {
         /* max size is 256k */
         result = nvkmd_dev_alloc_mem(dev->nvkmd, &pdev->vk.base,
                                      256 * 1024, 0, NVKMD_MEM_LOCAL,
                                      &dev->vab_memory);
         if (result != VK_SUCCESS)
            goto fail_slm;
      }

      for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
         for (unsigned q = 0; q < pCreateInfo->pQueueCreateInfos[i].queueCount; q++) {
            result = nvk_queue_create(dev, &pCreateInfo->pQueueCreateInfos[i], q);
            if (result != VK_SUCCESS)
               goto fail_queues;
         }
      }
   }

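   /* Device-wide shader/pipeline cache used by the common runtime.  It is
    * created with weak references so that it does not keep cached objects
    * alive on its own.
    */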
   struct vk_pipeline_cache_create_info cache_info = {
      .weak_ref = true,
   };
   dev->vk.mem_cache = vk_pipeline_cache_create(&dev->vk, &cache_info, NULL);
   if (dev->vk.mem_cache == NULL) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail_queues;
   }

   if (queue_count > 0) {
      result = nvk_device_init_meta(dev);
      if (result != VK_SUCCESS)
         goto fail_mem_cache;
   }

   if (queue_count > 0 && NAK_CAN_PRINTF) {
      result = nvk_init_printf(dev);
      if (result != VK_SUCCESS)
         goto fail_mem_cache;
   }

   *pDevice = nvk_device_to_handle(dev);

   return VK_SUCCESS;

fail_mem_cache:
   vk_pipeline_cache_destroy(dev->vk.mem_cache, NULL);
fail_queues:
   vk_foreach_queue_safe(iter, &dev->vk) {
      struct nvk_queue *queue = container_of(iter, struct nvk_queue, vk);
      nvk_queue_destroy(dev, queue);
   }
   if (dev->vab_memory)
      nvkmd_mem_unref(dev->vab_memory);
fail_slm:
   nvk_slm_area_finish(&dev->slm);
   if (pdev->info.cls_eng3d < MAXWELL_B)
      nvk_heap_finish(dev, &dev->qmd_heap);
fail_event_heap:
   nvk_heap_finish(dev, &dev->event_heap);
fail_shader_heap:
   nvk_heap_finish(dev, &dev->shader_heap);
fail_edb_bview_cache:
   nvk_edb_bview_cache_finish(dev, &dev->edb_bview_cache);
fail_samplers:
   nvk_descriptor_table_finish(dev, &dev->samplers);
fail_images:
   nvk_descriptor_table_finish(dev, &dev->images);
fail_zero_page:
   nvkmd_mem_unref(dev->zero_page);
fail_upload:
   nvk_upload_queue_finish(dev, &dev->upload);
fail_nvkmd:
   nvkmd_dev_destroy(dev->nvkmd);
fail_init:
   vk_device_finish(&dev->vk);
fail_alloc:
   vk_free(&dev->vk.alloc, dev);
   return result;
}

VKAPI_ATTR void VKAPI_CALL
nvk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);

   if (!dev)
      return;

   const struct nvk_physical_device *pdev = nvk_device_physical(dev);

   if (dev->nvkmd && NAK_CAN_PRINTF)
      nvk_destroy_printf(dev);

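   /* The internal query-copy shader is created lazily, so it may not exist. */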
   if (dev->copy_queries)
      vk_shader_destroy(&dev->vk, &dev->copy_queries->vk, &dev->vk.alloc);

   if (dev->nvkmd)
      nvk_device_finish_meta(dev);

   vk_pipeline_cache_destroy(dev->vk.mem_cache, NULL);

   vk_foreach_queue_safe(iter, &dev->vk) {
      struct nvk_queue *queue = container_of(iter, struct nvk_queue, vk);
      nvk_queue_destroy(dev, queue);
   }

   if (dev->vab_memory)
      nvkmd_mem_unref(dev->vab_memory);

   if (dev->nvkmd) {
      /* Idle the upload queue before we tear down heaps */
      nvk_upload_queue_sync(dev, &dev->upload);

      nvk_slm_area_finish(&dev->slm);
      if (pdev->info.cls_eng3d < MAXWELL_B)
         nvk_heap_finish(dev, &dev->qmd_heap);
      nvk_heap_finish(dev, &dev->event_heap);
      nvk_heap_finish(dev, &dev->shader_heap);
      nvk_edb_bview_cache_finish(dev, &dev->edb_bview_cache);
      nvk_descriptor_table_finish(dev, &dev->samplers);
      nvk_descriptor_table_finish(dev, &dev->images);
      nvkmd_mem_unref(dev->zero_page);
      nvk_upload_queue_finish(dev, &dev->upload);
      nvkmd_dev_destroy(dev->nvkmd);
   }

   vk_device_finish(&dev->vk);
   vk_free(&dev->vk.alloc, dev);
}

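/* Grows the device-wide SLM area, if needed, to cover the given per-lane
 * local memory and per-warp call/return stack requirements.
 */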
VkResult
nvk_device_ensure_slm(struct nvk_device *dev,
                      uint32_t slm_bytes_per_lane,
                      uint32_t crs_bytes_per_warp)
{
   return nvk_slm_area_ensure(dev, &dev->slm,
                              slm_bytes_per_lane,
                              crs_bytes_per_warp);
}