/*
* Copyright 2024 Valve Corporation
* Copyright 2024 Alyssa Rosenzweig
* Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
* SPDX-License-Identifier: MIT
*/
#include "hk_device.h"
#include "agx_bg_eot.h"
#include "agx_helpers.h"
#include "agx_opcodes.h"
#include "agx_scratch.h"
#include "hk_cmd_buffer.h"
#include "hk_descriptor_table.h"
#include "hk_entrypoints.h"
#include "hk_instance.h"
#include "hk_physical_device.h"
#include "hk_shader.h"
#include "asahi/genxml/agx_pack.h"
#include "asahi/lib/agx_bo.h"
#include "asahi/lib/agx_device.h"
#include "asahi/lib/shaders/geometry.h"
#include "util/hash_table.h"
#include "util/os_file.h"
#include "util/ralloc.h"
#include "util/simple_mtx.h"
#include "vulkan/vulkan_core.h"
#include "vulkan/wsi/wsi_common.h"
#include "vk_cmd_enqueue_entrypoints.h"
#include "vk_common_entrypoints.h"
#include "vk_pipeline_cache.h"
#include <fcntl.h>
#include <xf86drm.h>
/*
 * We preupload some constants so we can cheaply reference them later without
 * extra allocation and copying.
*
* TODO: This is small, don't waste a whole BO.
*/
static VkResult
hk_upload_rodata(struct hk_device *dev)
{
dev->rodata.bo =
agx_bo_create(&dev->dev, AGX_SAMPLER_LENGTH, 0, 0, "Read only data");
if (!dev->rodata.bo)
return VK_ERROR_OUT_OF_HOST_MEMORY;
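   /* The layout packed below overruns AGX_SAMPLER_LENGTH; it relies on the
    * allocation being rounded up to at least a whole page (see the TODO
    * above).
    */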
uint8_t *map = dev->rodata.bo->map;
uint32_t offs = 0;
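   /* Prepack a USC word binding a single sampler used for texel fetch (txf)
    * paths, which still need a sampler descriptor bound; pointing it at
    * rodata avoids a per-draw upload.
    */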
offs = align(offs, 8);
agx_pack(&dev->rodata.txf_sampler, USC_SAMPLER, cfg) {
cfg.start = 0;
cfg.count = 1;
cfg.buffer = dev->rodata.bo->va->addr + offs;
}
agx_pack_txf_sampler((struct agx_sampler_packed *)(map + offs));
offs += AGX_SAMPLER_LENGTH;
/* The image heap is allocated on the device prior to the rodata. The heap
* lives as long as the device does and has a stable address (requiring
* sparse binding to grow dynamically). That means its address is effectively
* rodata and can be uploaded now. agx_usc_uniform requires an indirection to
* push the heap address, so this takes care of that indirection up front to
* cut an alloc/upload at draw time.
*/
offs = align(offs, sizeof(uint64_t));
agx_pack(&dev->rodata.image_heap, USC_UNIFORM, cfg) {
cfg.start_halfs = HK_IMAGE_HEAP_UNIFORM;
cfg.size_halfs = 4;
cfg.buffer = dev->rodata.bo->va->addr + offs;
}
uint64_t *image_heap_ptr = dev->rodata.bo->map + offs;
*image_heap_ptr = dev->images.bo->va->addr;
offs += sizeof(uint64_t);
/* The geometry state buffer isn't strictly readonly data, but we only have a
 * single instance of it device-wide and -- after being initialized at heap
 * allocation time -- it is read-only from the CPU perspective. The GPU uses it
* for scratch, but is required to reset it after use to ensure resubmitting
* the same command buffer works.
*
* So, we allocate it here for convenience.
*/
offs = align(offs, sizeof(uint64_t));
dev->rodata.geometry_state = dev->rodata.bo->va->addr + offs;
offs += sizeof(struct agx_geometry_state);
/* For null readonly buffers, we need to allocate 16 bytes of zeroes for
* robustness2 semantics on read.
*/
offs = align(offs, 16);
dev->rodata.zero_sink = dev->rodata.bo->va->addr + offs;
memset(dev->rodata.bo->map + offs, 0, 16);
offs += 16;
/* For null storage descriptors, we need to reserve 16 bytes to catch writes.
* No particular content is required; we cannot get robustness2 semantics
* without more work.
*/
offs = align(offs, 16);
dev->rodata.null_sink = dev->rodata.bo->va->addr + offs;
offs += 16;
return VK_SUCCESS;
}
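
/*
 * Driver-internal shaders (prolog/epilog parts and internal kernels) are
 * cached in hash tables keyed on the builder function pointer plus its
 * variable-length key blob, so each variant only needs to be built once.
 */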
static uint32_t
internal_key_hash(const void *key_)
{
const struct hk_internal_key *key = key_;
return _mesa_hash_data(key, sizeof(struct hk_internal_key) + key->key_size);
}
static bool
internal_key_equal(const void *a_, const void *b_)
{
const struct hk_internal_key *a = a_;
const struct hk_internal_key *b = b_;
return a->builder == b->builder && a->key_size == b->key_size &&
memcmp(a->key, b->key, a->key_size) == 0;
}
static VkResult
hk_init_internal_shaders(struct hk_internal_shaders *s)
{
s->ht = _mesa_hash_table_create(NULL, internal_key_hash, internal_key_equal);
if (!s->ht)
return VK_ERROR_OUT_OF_HOST_MEMORY;
simple_mtx_init(&s->lock, mtx_plain);
return VK_SUCCESS;
}
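
/*
 * Destroy an internal shader cache. `part` selects the entry type: true for
 * caches of agx_shader_part (prologs/epilogs), false for full hk_api_shader
 * objects.
 */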
static void
hk_destroy_internal_shaders(struct hk_device *dev,
struct hk_internal_shaders *s, bool part)
{
hash_table_foreach(s->ht, ent) {
if (part) {
         struct agx_shader_part *shader_part = ent->data;
         free(shader_part->binary);
/* The agx_shader_part itself is ralloc'd against the hash table so
* will be freed.
*/
} else {
struct hk_api_shader *obj = ent->data;
hk_api_shader_destroy(&dev->vk, &obj->vk, NULL);
}
}
_mesa_hash_table_destroy(s->ht, NULL);
simple_mtx_destroy(&s->lock);
}
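
/*
 * Samplers are stored in a single device-wide heap and referenced by index.
 * Identical descriptors are deduplicated through a hash table and reference
 * counted so that heap slots can be reclaimed once their last user is gone.
 */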
DERIVE_HASH_TABLE(agx_sampler_packed);
static VkResult
hk_init_sampler_heap(struct hk_device *dev, struct hk_sampler_heap *h)
{
h->ht = agx_sampler_packed_table_create(NULL);
if (!h->ht)
return VK_ERROR_OUT_OF_HOST_MEMORY;
VkResult result =
hk_descriptor_table_init(dev, &h->table, AGX_SAMPLER_LENGTH, 1024, 1024);
if (result != VK_SUCCESS) {
ralloc_free(h->ht);
return result;
}
simple_mtx_init(&h->lock, mtx_plain);
return VK_SUCCESS;
}
static void
hk_destroy_sampler_heap(struct hk_device *dev, struct hk_sampler_heap *h)
{
hk_descriptor_table_finish(dev, &h->table);
ralloc_free(h->ht);
simple_mtx_destroy(&h->lock);
}
static VkResult
hk_sampler_heap_add_locked(struct hk_device *dev, struct hk_sampler_heap *h,
struct agx_sampler_packed desc,
struct hk_rc_sampler **out)
{
struct hash_entry *ent = _mesa_hash_table_search(h->ht, &desc);
if (ent != NULL) {
*out = ent->data;
assert((*out)->refcount != 0);
(*out)->refcount++;
return VK_SUCCESS;
}
struct hk_rc_sampler *rc = ralloc(h->ht, struct hk_rc_sampler);
if (!rc)
return VK_ERROR_OUT_OF_HOST_MEMORY;
uint32_t index;
VkResult result =
hk_descriptor_table_add(dev, &h->table, &desc, sizeof(desc), &index);
if (result != VK_SUCCESS) {
ralloc_free(rc);
return result;
}
*rc = (struct hk_rc_sampler){
.key = desc,
.refcount = 1,
.index = index,
};
_mesa_hash_table_insert(h->ht, &rc->key, rc);
*out = rc;
return VK_SUCCESS;
}
VkResult
hk_sampler_heap_add(struct hk_device *dev, struct agx_sampler_packed desc,
struct hk_rc_sampler **out)
{
struct hk_sampler_heap *h = &dev->samplers;
simple_mtx_lock(&h->lock);
VkResult result = hk_sampler_heap_add_locked(dev, h, desc, out);
simple_mtx_unlock(&h->lock);
return result;
}
static void
hk_sampler_heap_remove_locked(struct hk_device *dev, struct hk_sampler_heap *h,
struct hk_rc_sampler *rc)
{
assert(rc->refcount != 0);
rc->refcount--;
if (rc->refcount == 0) {
hk_descriptor_table_remove(dev, &h->table, rc->index);
_mesa_hash_table_remove_key(h->ht, &rc->key);
ralloc_free(rc);
}
}
void
hk_sampler_heap_remove(struct hk_device *dev, struct hk_rc_sampler *rc)
{
struct hk_sampler_heap *h = &dev->samplers;
simple_mtx_lock(&h->lock);
hk_sampler_heap_remove_locked(dev, h, rc);
simple_mtx_unlock(&h->lock);
}
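
/*
 * Illustrative usage sketch (not driver code): callers pair add/remove around
 * the lifetime of whatever owns the packed descriptor:
 *
 *    struct hk_rc_sampler *rc;
 *    VkResult result = hk_sampler_heap_add(dev, desc, &rc);
 *    if (result == VK_SUCCESS) {
 *       ... use rc->index while the owner lives ...
 *       hk_sampler_heap_remove(dev, rc);
 *    }
 */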
/*
* To implement nullDescriptor, the descriptor set code will reference
* preuploaded null descriptors at fixed offsets in the image heap. Here we
* upload those descriptors, initializing the image heap.
*/
static void
hk_upload_null_descriptors(struct hk_device *dev)
{
struct agx_texture_packed null_tex;
struct agx_pbe_packed null_pbe;
uint32_t offset_tex, offset_pbe;
agx_set_null_texture(&null_tex, dev->rodata.null_sink);
agx_set_null_pbe(&null_pbe, dev->rodata.null_sink);
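   /* These must occupy the first two slots of the image heap so that
    * HK_NULL_TEX_OFFSET and HK_NULL_PBE_OFFSET can be compile-time constants;
    * the asserts below check exactly that.
    */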
hk_descriptor_table_add(dev, &dev->images, &null_tex, sizeof(null_tex),
&offset_tex);
hk_descriptor_table_add(dev, &dev->images, &null_pbe, sizeof(null_pbe),
&offset_pbe);
assert((offset_tex * HK_IMAGE_STRIDE) == HK_NULL_TEX_OFFSET && "static");
assert((offset_pbe * HK_IMAGE_STRIDE) == HK_NULL_PBE_OFFSET && "static");
}
VKAPI_ATTR VkResult VKAPI_CALL
hk_CreateDevice(VkPhysicalDevice physicalDevice,
const VkDeviceCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator, VkDevice *pDevice)
{
VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice);
VkResult result = VK_ERROR_OUT_OF_HOST_MEMORY;
struct hk_device *dev;
dev = vk_zalloc2(&pdev->vk.instance->alloc, pAllocator, sizeof(*dev), 8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!dev)
return vk_error(pdev, VK_ERROR_OUT_OF_HOST_MEMORY);
struct vk_device_dispatch_table dispatch_table;
/* For secondary command buffer support, overwrite any command entrypoints
* in the main device-level dispatch table with
* vk_cmd_enqueue_unless_primary_Cmd*.
*/
vk_device_dispatch_table_from_entrypoints(
&dispatch_table, &vk_cmd_enqueue_unless_primary_device_entrypoints, true);
vk_device_dispatch_table_from_entrypoints(&dispatch_table,
&hk_device_entrypoints, false);
vk_device_dispatch_table_from_entrypoints(&dispatch_table,
&wsi_device_entrypoints, false);
/* Populate primary cmd_dispatch table */
vk_device_dispatch_table_from_entrypoints(&dev->cmd_dispatch,
&hk_device_entrypoints, true);
vk_device_dispatch_table_from_entrypoints(&dev->cmd_dispatch,
&wsi_device_entrypoints, false);
vk_device_dispatch_table_from_entrypoints(
&dev->cmd_dispatch, &vk_common_device_entrypoints, false);
result = vk_device_init(&dev->vk, &pdev->vk, &dispatch_table, pCreateInfo,
pAllocator);
if (result != VK_SUCCESS)
goto fail_alloc;
dev->vk.shader_ops = &hk_device_shader_ops;
dev->vk.command_dispatch_table = &dev->cmd_dispatch;
drmDevicePtr drm_device = NULL;
int ret = drmGetDeviceFromDevId(pdev->render_dev, 0, &drm_device);
if (ret != 0) {
result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
"Failed to get DRM device: %m");
goto fail_init;
}
const char *path = drm_device->nodes[DRM_NODE_RENDER];
dev->dev.fd = open(path, O_RDWR | O_CLOEXEC);
if (dev->dev.fd < 0) {
drmFreeDevice(&drm_device);
result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
"failed to open device %s", path);
goto fail_init;
}
bool succ = agx_open_device(NULL, &dev->dev);
drmFreeDevice(&drm_device);
if (!succ) {
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "Failed to initialize AGX device");
goto fail_fd;
}
vk_device_set_drm_fd(&dev->vk, dev->dev.fd);
dev->vk.command_buffer_ops = &hk_cmd_buffer_ops;
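   /* Device-global heap of texture/PBE descriptors, starting at 1024 entries
    * and able to grow to 1024 * 1024.
    */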
result = hk_descriptor_table_init(dev, &dev->images, AGX_TEXTURE_LENGTH,
1024, 1024 * 1024);
if (result != VK_SUCCESS)
goto fail_dev;
result = hk_init_sampler_heap(dev, &dev->samplers);
if (result != VK_SUCCESS)
goto fail_images;
result = hk_descriptor_table_init(
dev, &dev->occlusion_queries, sizeof(uint64_t), AGX_MAX_OCCLUSION_QUERIES,
AGX_MAX_OCCLUSION_QUERIES);
if (result != VK_SUCCESS)
goto fail_samplers;
result = hk_upload_rodata(dev);
if (result != VK_SUCCESS)
goto fail_queries;
/* Depends on rodata */
hk_upload_null_descriptors(dev);
/* XXX: error handling, and should this even go on the device? */
agx_bg_eot_init(&dev->bg_eot, &dev->dev);
if (!dev->bg_eot.ht) {
result = VK_ERROR_OUT_OF_HOST_MEMORY;
goto fail_rodata;
}
result = hk_init_internal_shaders(&dev->prolog_epilog);
if (result != VK_SUCCESS)
goto fail_bg_eot;
result = hk_init_internal_shaders(&dev->kernels);
if (result != VK_SUCCESS)
goto fail_internal_shaders;
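   /* hk exposes a single queue, so only the first queue create info is
    * consumed.
    */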
result =
hk_queue_init(dev, &dev->queue, &pCreateInfo->pQueueCreateInfos[0], 0);
if (result != VK_SUCCESS)
goto fail_internal_shaders_2;
struct vk_pipeline_cache_create_info cache_info = {
.weak_ref = true,
};
dev->mem_cache = vk_pipeline_cache_create(&dev->vk, &cache_info, NULL);
if (dev->mem_cache == NULL) {
result = VK_ERROR_OUT_OF_HOST_MEMORY;
goto fail_queue;
}
result = hk_device_init_meta(dev);
if (result != VK_SUCCESS)
goto fail_mem_cache;
*pDevice = hk_device_to_handle(dev);
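   /* Per-stage scratch allocators (VS/FS/CS), guarded by a single lock. */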
simple_mtx_init(&dev->scratch.lock, mtx_plain);
agx_scratch_init(&dev->dev, &dev->scratch.vs);
agx_scratch_init(&dev->dev, &dev->scratch.fs);
agx_scratch_init(&dev->dev, &dev->scratch.cs);
return VK_SUCCESS;
fail_mem_cache:
vk_pipeline_cache_destroy(dev->mem_cache, NULL);
fail_queue:
hk_queue_finish(dev, &dev->queue);
fail_internal_shaders_2:
   hk_destroy_internal_shaders(dev, &dev->kernels, false);
fail_internal_shaders:
   hk_destroy_internal_shaders(dev, &dev->prolog_epilog, true);
fail_bg_eot:
   agx_bg_eot_cleanup(&dev->bg_eot);
fail_rodata:
   agx_bo_unreference(&dev->dev, dev->rodata.bo);
fail_queries:
hk_descriptor_table_finish(dev, &dev->occlusion_queries);
fail_samplers:
hk_destroy_sampler_heap(dev, &dev->samplers);
fail_images:
hk_descriptor_table_finish(dev, &dev->images);
fail_dev:
agx_close_device(&dev->dev);
fail_fd:
close(dev->dev.fd);
fail_init:
vk_device_finish(&dev->vk);
fail_alloc:
vk_free(&dev->vk.alloc, dev);
return result;
}
VKAPI_ATTR void VKAPI_CALL
hk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
{
VK_FROM_HANDLE(hk_device, dev, _device);
if (!dev)
return;
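   /* Teardown runs in roughly the reverse order of hk_CreateDevice. */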
hk_device_finish_meta(dev);
hk_destroy_internal_shaders(dev, &dev->kernels, false);
hk_destroy_internal_shaders(dev, &dev->prolog_epilog, true);
vk_pipeline_cache_destroy(dev->mem_cache, NULL);
hk_queue_finish(dev, &dev->queue);
vk_device_finish(&dev->vk);
agx_scratch_fini(&dev->scratch.vs);
agx_scratch_fini(&dev->scratch.fs);
agx_scratch_fini(&dev->scratch.cs);
simple_mtx_destroy(&dev->scratch.lock);
hk_destroy_sampler_heap(dev, &dev->samplers);
hk_descriptor_table_finish(dev, &dev->images);
hk_descriptor_table_finish(dev, &dev->occlusion_queries);
agx_bo_unreference(&dev->dev, dev->rodata.bo);
agx_bo_unreference(&dev->dev, dev->heap);
agx_bg_eot_cleanup(&dev->bg_eot);
agx_close_device(&dev->dev);
vk_free(&dev->vk.alloc, dev);
}
VKAPI_ATTR VkResult VKAPI_CALL
hk_GetCalibratedTimestampsKHR(
VkDevice _device, uint32_t timestampCount,
const VkCalibratedTimestampInfoKHR *pTimestampInfos, uint64_t *pTimestamps,
uint64_t *pMaxDeviation)
{
// VK_FROM_HANDLE(hk_device, dev, _device);
// struct hk_physical_device *pdev = hk_device_physical(dev);
uint64_t max_clock_period = 0;
uint64_t begin, end;
int d;
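   /* Bracket the domain reads with host clock samples; the elapsed time
    * between them, padded by the largest clock period seen, bounds the
    * reported maximum deviation.
    */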
#ifdef CLOCK_MONOTONIC_RAW
begin = vk_clock_gettime(CLOCK_MONOTONIC_RAW);
#else
begin = vk_clock_gettime(CLOCK_MONOTONIC);
#endif
for (d = 0; d < timestampCount; d++) {
switch (pTimestampInfos[d].timeDomain) {
case VK_TIME_DOMAIN_DEVICE_KHR:
unreachable("todo");
// pTimestamps[d] = agx_get_gpu_timestamp(&pdev->dev);
max_clock_period = MAX2(
max_clock_period, 1); /* FIXME: Is timestamp period actually 1? */
break;
case VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR:
pTimestamps[d] = vk_clock_gettime(CLOCK_MONOTONIC);
max_clock_period = MAX2(max_clock_period, 1);
break;
#ifdef CLOCK_MONOTONIC_RAW
case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR:
pTimestamps[d] = begin;
break;
#endif
default:
pTimestamps[d] = 0;
break;
}
}
#ifdef CLOCK_MONOTONIC_RAW
end = vk_clock_gettime(CLOCK_MONOTONIC_RAW);
#else
end = vk_clock_gettime(CLOCK_MONOTONIC);
#endif
*pMaxDeviation = vk_time_max_deviation(begin, end, max_clock_period);
return VK_SUCCESS;
}