/*
* Copyright © 2021 Igalia S.L.
* SPDX-License-Identifier: MIT
*/
#ifndef TU_AUTOTUNE_H
#define TU_AUTOTUNE_H

#include "tu_common.h"

#include "util/hash_table.h"
#include "util/rwlock.h"

#include "tu_suballoc.h"

struct tu_renderpass_history;

/**
* "autotune" our decisions about bypass vs GMEM rendering, based on historical
* data about a given render target.
*
 * In deciding which path to take there are tradeoffs, including some that
 * are not reasonably estimable without some additional information:
*
 * (1) If you know you are touching every pixel (i.e. there is a clear),
 *     then the GMEM path will at least not cost more memory bandwidth than
 *     sysmem[1]
*
 * (2) If there is no clear, GMEM could potentially cost *more* bandwidth
 *     if there is a sysmem->GMEM restore pass.
*
 * (3) If you see a high draw count, that is an indication that there will be
 *     enough pixels accessed multiple times to benefit from the reduced
 *     memory bandwidth that GMEM brings
*
 * (4) But a high draw count where there is not much overdraw can actually
 *     be faster in bypass mode if it is pushing a lot of state change, due
 *     to not having to go through the state changes per-tile[1]
*
 * The approach taken is to measure the samples-passed for the batch, to
 * estimate the amount of overdraw and detect cases where the number of
 * pixels touched is low (an illustrative sketch follows struct tu_autotune
 * below).
*
* [1] ignoring early-tile-exit optimizations, but any draw that touches all/
* most of the tiles late in the tile-pass can defeat that
*/
struct tu_autotune {
   /* We may have to disable the autotuner if there are too many
    * renderpasses in-flight.
    */
   bool enabled;

   struct tu_device *device;

   /**
    * Cache to map renderpass key to historical information about
    * rendering to that particular render target.
    */
   struct hash_table *ht;
   struct u_rwlock ht_lock;

   /**
    * List of per-renderpass results that we are waiting for the GPU
    * to finish with before reading back the results.
    */
   struct list_head pending_results;

   /**
    * List of per-submission data that we may want to free after we
    * have processed the submission results. This can happen after the
    * command buffers which were in the submission are destroyed.
    */
   struct list_head pending_submission_data;

   /**
    * List of per-submission data that has been finished and can be reused.
    */
   struct list_head submission_data_pool;

   uint32_t fence_counter;
   uint32_t idx_counter;
};
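/*
 * A minimal illustrative sketch (not the driver's actual heuristic; the
 * helper name and the threshold are assumptions) of how the samples-passed
 * measurement described above can feed the bypass decision: estimate the
 * average overdraw by dividing the accumulated sample count by the render
 * area, and prefer bypass when each pixel is touched roughly once or less.
 */
static inline bool
tu_autotune_sketch_prefer_bypass(uint64_t avg_samples_passed,
                                 uint32_t rp_width, uint32_t rp_height)
{
   const uint64_t render_area = (uint64_t)rp_width * rp_height;

   /* Few samples per pixel means little overdraw, so the bandwidth saved
    * by GMEM is unlikely to outweigh the per-tile overhead.
    */
   return avg_samples_passed < render_area;
}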
/**
 * The samples-passed counter values captured from the cmdstream, recorded
 * at the start and at the end of the batch.
 *
 * Note that we do the math on the CPU to avoid a WFI. But pre-emption
 * may force us to revisit that.
 */
struct tu_renderpass_samples {
   uint64_t samples_start;
   /* HW requires the sample start/stop locations to be 128b aligned. */
   uint64_t __pad0;
   uint64_t samples_end;
   uint64_t __pad1;
};
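/* A small sketch of the CPU-side math mentioned above (the helper name is
 * an assumption, not part of the API): once the GPU has written both
 * counters, the samples passed within the renderpass is their difference.
 */
static inline uint64_t
tu_renderpass_samples_delta(const struct tu_renderpass_samples *samples)
{
   return samples->samples_end - samples->samples_start;
}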
/**
* Tracks the results from an individual renderpass. Initially created
* per renderpass, and appended to the tail of at->pending_results. At a later
* time, when the GPU has finished writing the results, we fill samples_passed.
*/
struct tu_renderpass_result {
   /* Points into GPU memory */
   struct tu_renderpass_samples *samples;

   struct tu_suballoc_bo bo;

   /*
    * Below here, only used internally within autotune
    */
   uint64_t rp_key;
   struct tu_renderpass_history *history;
   struct list_head node;
   uint32_t fence;
   uint64_t samples_passed;
};
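/* A hypothetical sketch (the helper and its parameter are assumptions) of
 * how a pending result could be checked before reading samples back: the
 * fence recorded in the result is compared against the last fence value
 * the GPU is known to have completed.
 */
static inline bool
tu_renderpass_result_is_ready(const struct tu_renderpass_result *result,
                              uint32_t last_completed_fence)
{
   return last_completed_fence >= result->fence;
}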
VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);

/**
 * A magic 8-ball that tells the gmem code whether we should do bypass mode
 * for moar fps.
 */
bool tu_autotune_use_bypass(struct tu_autotune *at,
                            struct tu_cmd_buffer *cmd_buffer,
                            struct tu_renderpass_result **autotune_result);

void tu_autotune_free_results(struct tu_device *dev, struct list_head *results);

bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
                                       uint32_t cmd_buffer_count);
struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
                                    struct tu_autotune *at,
                                    struct tu_cmd_buffer **cmd_buffers,
                                    uint32_t cmd_buffer_count);
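/* An illustrative sketch of the submit-time flow implied by the two
 * declarations above (the wrapper itself is an assumption, not part of
 * this header's API): the extra cs is only needed when some command
 * buffer in the submission carries autotune results.
 */
static inline struct tu_cs *
tu_autotune_sketch_submit_cs(struct tu_device *dev,
                             struct tu_autotune *at,
                             struct tu_cmd_buffer **cmd_buffers,
                             uint32_t cmd_buffer_count)
{
   if (!tu_autotune_submit_requires_fence(cmd_buffers, cmd_buffer_count))
      return NULL;

   /* The returned cs is expected to be executed as part of this
    * submission.
    */
   return tu_autotune_on_submit(dev, at, cmd_buffers, cmd_buffer_count);
}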
struct tu_autotune_results_buffer;
void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                                  struct tu_cs *cs,
                                  struct tu_renderpass_result *autotune_result);

void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
                                struct tu_cs *cs,
                                struct tu_renderpass_result *autotune_result);
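/* A sketch of the expected pairing (illustrative only; emit_draws is a
 * hypothetical callback standing in for the renderpass contents): the
 * begin/end hooks bracket the renderpass so that the captured sample
 * counters cover exactly its draws.
 */
static inline void
tu_autotune_sketch_wrap_renderpass(struct tu_cmd_buffer *cmd,
                                   struct tu_cs *cs,
                                   struct tu_renderpass_result *result,
                                   void (*emit_draws)(struct tu_cmd_buffer *,
                                                      struct tu_cs *))
{
   tu_autotune_begin_renderpass(cmd, cs, result);
   emit_draws(cmd, cs);
   tu_autotune_end_renderpass(cmd, cs, result);
}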
#endif /* TU_AUTOTUNE_H */