| /* |
| * Copyright © 2021 Valve Corporation |
| * |
| * SPDX-License-Identifier: MIT |
| */ |
| |
| |
| #ifndef AC_NIR_H |
| #define AC_NIR_H |
| |
| #include "ac_hw_stage.h" |
| #include "ac_shader_args.h" |
| #include "ac_shader_util.h" |
| #include "nir_defines.h" |
| |
| #ifdef __cplusplus |
| extern "C" { |
| #endif |
| |
/* Encodings for a PS input's parameter source, as programmed into
 * SPI_PS_INPUT_CNTL_i: either the index of an exported param (0..31),
 * a hardware-provided constant default (64..67), or "undefined" (255)
 * when the previous stage doesn't write the input at all.
 */
enum
{
   /* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */
   AC_EXP_PARAM_OFFSET_0 = 0,
   AC_EXP_PARAM_OFFSET_31 = 31,
   /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL[0:1]: the digits name the (x,y,z,w)
    * constant the hardware substitutes, e.g. _0001 is (0, 0, 0, 1).
    */
   AC_EXP_PARAM_DEFAULT_VAL_0000 = 64,
   AC_EXP_PARAM_DEFAULT_VAL_0001,
   AC_EXP_PARAM_DEFAULT_VAL_1110,
   AC_EXP_PARAM_DEFAULT_VAL_1111,
   AC_EXP_PARAM_UNDEFINED = 255,
};
| |
/* Flag bits for AMD export operations. The names mirror the hardware EXP
 * instruction fields (COMPR / DONE / VM) — see the GPU ISA reference for
 * their exact semantics.
 */
enum {
   AC_EXP_FLAG_COMPRESSED = (1 << 0),
   AC_EXP_FLAG_DONE = (1 << 1),
   AC_EXP_FLAG_VALID_MASK = (1 << 2),
};
| |
/* Global configuration shared by the ac_nir_* lowering passes. */
struct ac_nir_config {
   enum amd_gfx_level gfx_level; /* target GPU generation */
   bool uses_aco;                /* ACO backend in use (as opposed to LLVM) */
};
| |
/* Maps I/O semantics (a gl_varying_slot-style value) to the actual driver
 * location used by the lowering pass.
 */
typedef unsigned (*ac_nir_map_io_driver_location)(unsigned semantic);

/* Forward declaration of nir_builder so we don't have to include nir_builder.h here */
struct nir_builder;
typedef struct nir_builder nir_builder;

/* Forward declaration of the transform-feedback (streamout) info struct. */
struct nir_xfb_info;
typedef struct nir_xfb_info nir_xfb_info;

/* Executed by ac_nir_cull when the current primitive is accepted. */
typedef void (*ac_nir_cull_accepted)(nir_builder *b, void *state);
| |
/* Fills *options with the NIR compiler options for the given GPU and
 * backend (LLVM vs. ACO).
 */
void
ac_nir_set_options(struct radeon_info *info, bool use_llvm,
                   nir_shader_compiler_options *options);

/* Builds a load of the shader argument located "relative_index" slots after
 * "arg" in the argument layout.
 */
nir_def *
ac_nir_load_arg_at_offset(nir_builder *b, const struct ac_shader_args *ac_args,
                          struct ac_arg arg, unsigned relative_index);

/* Builds a load of the given shader argument (SGPR/VGPR input). */
nir_def *
ac_nir_load_arg(nir_builder *b, const struct ac_shader_args *ac_args, struct ac_arg arg);

/* Like ac_nir_load_arg, but additionally declares that the loaded value never
 * exceeds "upper_bound" — presumably to enable range-based optimizations;
 * confirm against the implementation.
 */
nir_def *
ac_nir_load_arg_upper_bound(nir_builder *b, const struct ac_shader_args *ac_args, struct ac_arg arg,
                            unsigned upper_bound);

/* Replaces the value of the given shader argument, so that subsequent
 * ac_nir_load_arg calls see "val".
 */
void ac_nir_store_arg(nir_builder *b, const struct ac_shader_args *ac_args, struct ac_arg arg,
                      nir_def *val);

/* Loads a bitfield of a shader argument: conceptually (arg >> rshift)
 * keeping "bitwidth" bits.
 */
nir_def *
ac_nir_unpack_arg(nir_builder *b, const struct ac_shader_args *ac_args, struct ac_arg arg,
                  unsigned rshift, unsigned bitwidth);

/* Lowers sin/cos for AMD hardware. Like other NIR passes, the bool return
 * presumably reports whether the pass made progress; this convention applies
 * to the other bool-returning passes in this header as well.
 */
bool ac_nir_lower_sin_cos(nir_shader *shader);

/* Lowers system-value-style intrinsics to loads of the shader arguments
 * described by ac_args, for the given HW stage / wave size.
 */
bool ac_nir_lower_intrinsics_to_args(nir_shader *shader, const enum amd_gfx_level gfx_level,
                                     bool has_ls_vgpr_init_bug, const enum ac_hw_stage hw_stage,
                                     unsigned wave_size, unsigned workgroup_size,
                                     const struct ac_shader_args *ac_args);

/* Returns the shader's transform-feedback info in sorted order (sort key
 * defined by the implementation). Ownership of the returned memory is not
 * visible here — check the implementation before freeing.
 */
nir_xfb_info *ac_nir_get_sorted_xfb_info(const nir_shader *nir);

/* Optimizes shader outputs; on return, slot_remap and param_export_index
 * describe the resulting varying-slot remapping / param export assignment.
 */
bool ac_nir_optimize_outputs(nir_shader *nir, bool sprite_tex_disallowed,
                             int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS],
                             uint8_t param_export_index[NUM_TOTAL_VARYING_SLOTS]);
| |
/* Describes where TCS outputs live (VRAM, LDS, or kept in VGPRs), computed by
 * ac_nir_get_tess_io_info and consumed by the tess lowering passes below.
 */
typedef struct {
   /* Per-vertex slots and tess levels. */
   uint64_t vram_output_mask;
   uint64_t lds_output_mask;
   uint64_t vgpr_output_mask; /* Hold the output values in VGPRs until the end. */
   /* Generic per-patch slots. */
   uint32_t vram_patch_output_mask;
   uint32_t lds_patch_output_mask;
   uint32_t vgpr_patch_output_mask; /* Hold the output values in VGPRs until the end. */

   /* The highest index returned by map_io + 1. */
   uint8_t highest_remapped_vram_output;
   uint8_t highest_remapped_vram_patch_output;
} ac_nir_tess_io_info;
| |
/* Computes which TCS outputs should live in VRAM, LDS, or VGPRs, based on
 * what the TCS writes and what the TES reads. Results are stored in *io_info.
 */
void
ac_nir_get_tess_io_info(const nir_shader *tcs, const nir_tcs_info *tcs_info, uint64_t tes_inputs_read,
                        uint32_t tes_patch_inputs_read, ac_nir_map_io_driver_location map_io,
                        bool remapped_outputs_include_tess_levels, ac_nir_tess_io_info *io_info);

/* Lowers LS (VS-before-TCS) output stores to memory/temporaries for the TCS
 * to read. The tcs_inputs_via_* masks select the path per input slot.
 */
bool
ac_nir_lower_ls_outputs_to_mem(nir_shader *ls,
                               ac_nir_map_io_driver_location map,
                               enum amd_gfx_level gfx_level,
                               bool tcs_in_out_eq,
                               uint64_t tcs_inputs_via_temp,
                               uint64_t tcs_inputs_via_lds);

/* Lowers TCS input loads to reads from where ac_nir_lower_ls_outputs_to_mem
 * put them (note the matching tcs_in_out_eq / tcs_inputs_via_* parameters).
 */
bool
ac_nir_lower_hs_inputs_to_mem(nir_shader *shader,
                              ac_nir_map_io_driver_location map,
                              enum amd_gfx_level gfx_level,
                              bool tcs_in_out_eq,
                              uint64_t tcs_inputs_via_temp,
                              uint64_t tcs_inputs_via_lds);

/* Lowers TCS output stores to memory according to *io_info. */
bool
ac_nir_lower_hs_outputs_to_mem(nir_shader *shader, const nir_tcs_info *info,
                               const ac_nir_tess_io_info *io_info,
                               ac_nir_map_io_driver_location map,
                               enum amd_gfx_level gfx_level,
                               unsigned wave_size);

/* Lowers TES input loads to reads from memory written by the TCS. */
bool
ac_nir_lower_tes_inputs_to_mem(nir_shader *shader,
                               ac_nir_map_io_driver_location map);

/* Computes the tessellation workgroup configuration: the number of patches
 * per workgroup and the HW LDS size.
 */
void
ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io_info *io_info,
                            unsigned tcs_vertices_out, unsigned wave_size, bool tess_uses_primid,
                            unsigned num_tcs_input_cp, unsigned lds_input_vertex_size,
                            unsigned num_remapped_tess_level_outputs, unsigned *num_patches_per_wg,
                            unsigned *hw_lds_size);

/* Lowers ES (VS/TES-before-GS) output stores to ESGS memory, keeping only
 * outputs that the GS reads (gs_inputs_read).
 */
bool
ac_nir_lower_es_outputs_to_mem(nir_shader *shader,
                               ac_nir_map_io_driver_location map,
                               enum amd_gfx_level gfx_level,
                               unsigned esgs_itemsize,
                               uint64_t gs_inputs_read);

/* Lowers GS input loads to reads from ESGS memory. */
bool
ac_nir_lower_gs_inputs_to_mem(nir_shader *shader,
                              ac_nir_map_io_driver_location map,
                              enum amd_gfx_level gfx_level,
                              bool triangle_strip_adjacency_fix);

/* Lowers indirectly-indexed derefs for the given GPU generation. */
bool
ac_nir_lower_indirect_derefs(nir_shader *shader,
                             enum amd_gfx_level gfx_level);
| |
/* Options for the NGG lowering passes (ac_nir_lower_ngg_nogs / _gs). */
typedef struct {
   const struct radeon_info *hw_info;

   unsigned max_workgroup_size;
   unsigned wave_size;
   /* The mask of clip and cull distances that the shader should export.
    *
    * Clip/cull distance components that are missing in export_clipdist_mask are removed, improving
    * throughput by up to 50% (3 pos exports -> 2 pos exports). The caller shouldn't set no-op
    * components (>= 0) in export_clipdist_mask to remove those completely. No-op components
    * should be determined by nir_opt_clip_cull_const before this.
    *
    * If can_cull is true, the shader culls cull distances and they are not exported to increase
    * throughput by reducing the number of pos exports. cull_clipdist_mask must be set to include
    * all cull distances that are < 0. The best case scenario is 100% increase in throughput from
    * not exporting any cull distances (2 pos exports -> 1 pos export).
    */
   uint8_t export_clipdist_mask;
   /* The mask of clip and cull distances that the shader should cull against.
    * If no clip and cull distance outputs are present, it will load clip planes and cull
    * either against CLIP_VERTEX or POS.
    */
   uint8_t cull_clipdist_mask;
   bool write_pos_to_clipvertex;
   const uint8_t *vs_output_param_offset; /* GFX11+ */
   bool has_param_exports;
   bool can_cull; /* enable NGG culling; see the export_clipdist_mask comment above */
   bool disable_streamout;
   bool has_gen_prim_query;
   bool has_xfb_prim_query;
   bool use_gfx12_xfb_intrinsic;
   bool has_gs_invocations_query;
   bool has_gs_primitives_query;
   bool force_vrs;
   bool compact_primitives;
   /* Skip culling dependent on the viewport state, which is frustum culling and small prim
    * culling. Set this when the shader writes the viewport index.
    */
   bool skip_viewport_state_culling;
   /* Use the point-triangle intersection to cull small triangles. */
   bool use_point_tri_intersection;

   /* VS (fields below only apply to the non-GS variant) */
   unsigned num_vertices_per_primitive;
   bool early_prim_export;
   bool passthrough;
   bool use_edgeflags;
   bool export_primitive_id;
   bool export_primitive_id_per_prim;
   uint32_t instance_rate_inputs;
} ac_nir_lower_ngg_options;
| |
/* Lowers a non-GS (VS/TES) shader to the NGG HW stage. Returns the LDS sizes
 * required by the lowered shader through the out parameters.
 */
bool
ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *options,
                      uint32_t *out_lds_vertex_size, uint8_t *out_lds_scratch_size);

/* Lowers a geometry shader to the NGG HW stage. */
bool
ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options,
                    uint32_t *out_lds_vertex_size, uint8_t *out_lds_scratch_size);

/* Lowers a mesh shader to the NGG HW stage. *out_needs_scratch_ring reports
 * whether the lowered shader requires the scratch ring.
 */
bool
ac_nir_lower_ngg_mesh(nir_shader *shader,
                      const struct radeon_info *hw_info,
                      uint32_t clipdist_enable_mask,
                      const uint8_t *vs_output_param_offset,
                      bool has_param_exports,
                      bool *out_needs_scratch_ring,
                      unsigned wave_size,
                      unsigned workgroup_size,
                      bool multiview,
                      bool has_query);

/* Lowers task shader payload output stores to the task payload ring
 * (entry size / count describe the ring layout).
 */
bool
ac_nir_lower_task_outputs_to_mem(nir_shader *shader,
                                 unsigned task_payload_entry_bytes,
                                 unsigned task_num_entries,
                                 bool has_query);

/* Lowers mesh shader task-payload input loads to reads from the task payload
 * ring (must match the layout used by ac_nir_lower_task_outputs_to_mem).
 */
bool
ac_nir_lower_mesh_inputs_to_mem(nir_shader *shader,
                                unsigned task_payload_entry_bytes,
                                unsigned task_num_entries);

/* Lowers global memory access intrinsics for AMD hardware. */
bool
ac_nir_lower_global_access(nir_shader *shader);

/* Lowers image resinfo (resource-info / size) queries for the given GPU. */
bool ac_nir_lower_resinfo(nir_shader *nir, enum amd_gfx_level gfx_level);
/* Lowers image opcodes not supported natively by the hardware. */
bool ac_nir_lower_image_opcodes(nir_shader *nir);

/* Lowers a vertex shader for the legacy (non-NGG) HW VS stage, emitting
 * position/param exports directly.
 */
bool
ac_nir_lower_legacy_vs(nir_shader *nir,
                       enum amd_gfx_level gfx_level,
                       uint32_t export_clipdist_mask,
                       bool write_pos_to_clipvertex,
                       const uint8_t *param_offsets,
                       bool has_param_exports,
                       bool export_primitive_id,
                       bool disable_streamout,
                       bool force_vrs);
| |
/* Options for ac_nir_lower_legacy_gs (the non-NGG GS HW stage). */
typedef struct {
   bool has_gen_prim_query;
   bool has_pipeline_stats_query;

   /* Options for the GS copy shader (mirrors ac_nir_lower_legacy_vs params). */
   enum amd_gfx_level gfx_level;
   uint32_t export_clipdist_mask;
   bool write_pos_to_clipvertex;
   const uint8_t *param_offsets;
   bool has_param_exports;
   bool disable_streamout;
   bool force_vrs;
} ac_nir_lower_legacy_gs_options;

/* Info gathered about a lowered legacy GS. The array is indexed by the GS
 * vertex stream (4 streams on this hardware).
 */
typedef struct {
   uint8_t num_components_per_stream[4];
} ac_nir_legacy_gs_info;

/* Lowers a geometry shader for the legacy (non-NGG) GS HW stage. On success,
 * *gs_copy_shader receives the generated GS copy shader and *out_info the
 * gathered per-stream info.
 */
bool
ac_nir_lower_legacy_gs(nir_shader *nir, ac_nir_lower_legacy_gs_options *options,
                       nir_shader **gs_copy_shader, ac_nir_legacy_gs_info *out_info);
| |
/* This is a pre-link pass. It should only eliminate code and do lowering that mostly doesn't
 * generate AMD-specific intrinsics.
 */
typedef struct {
   /* System values. */
   bool msaa_disabled; /* true if MSAA is disabled, false may mean that the state is unknown */
   bool uses_vrs_coarse_shading;
   bool load_sample_positions_always_loads_current_ones;
   bool dynamic_rasterization_samples;
   int force_front_face; /* 0 -> keep, 1 -> set to true, -1 -> set to false */
   bool optimize_frag_coord; /* TODO: remove this after RADV can handle it */
   bool frag_coord_is_center; /* GL requirement for sample shading */

   /* frag_coord/pixel_coord:
    *    allow_pixel_coord && (frag_coord_is_center || ps_iter_samples == 1 || msaa_disabled ||
    *    the fractional part of frag_coord.xy isn't used):
    *       * frag_coord.xy is replaced by u2f(pixel_coord) + 0.5.
    *    else:
    *       * pixel_coord is replaced by f2u16(frag_coord.xy)
    *    * ps_iter_samples == 0 means the state is unknown.
    *
    * barycentrics:
    *    msaa_disabled:
    *       * All barycentrics including at_sample but excluding at_offset are changed to
    *         barycentric_pixel
    *    ps_iter_samples >= 2:
    *       * All barycentrics are changed to per-sample interpolation except at_offset/at_sample.
    *       * barycentric_at_sample(sample_id) is replaced by barycentric_sample.
    *
    * sample_mask_in:
    *    msaa_disabled && !uses_vrs_coarse_shading:
    *       * sample_mask_in is replaced by b2i32(!helper_invocation)
    *    ps_iter_samples == 2, 4:
    *       * sample_mask_in is changed to (sample_mask_in & (ps_iter_mask << sample_id))
    *    ps_iter_samples == 8:
    *       * sample_mask_in is replaced by 1 << sample_id.
    *
    * When ps_iter_samples is equal to rasterization samples, set ps_iter_samples = 8 for this pass.
    */
   unsigned ps_iter_samples;

   /* fbfetch_output */
   bool fbfetch_is_1D;
   bool fbfetch_layered;
   bool fbfetch_msaa;
   bool fbfetch_apply_fmask;

   /* Outputs. */
   bool clamp_color; /* GL only */
   bool alpha_test_alpha_to_one; /* GL only, this only affects alpha test */
   enum compare_func alpha_func; /* GL only */
   bool keep_alpha_for_mrtz; /* this prevents killing alpha based on spi_shader_col_format_hint */
   unsigned spi_shader_col_format_hint; /* this only shrinks and eliminates output stores */
   bool kill_z;
   bool kill_stencil;
   bool kill_samplemask;
} ac_nir_lower_ps_early_options;

/* Runs the pre-link PS lowering described by the options struct above. */
bool
ac_nir_lower_ps_early(nir_shader *nir, const ac_nir_lower_ps_early_options *options);
| |
/* This is a post-link pass. It shouldn't eliminate any code and it shouldn't affect shader_info
 * (those should be done in the early pass).
 */
typedef struct {
   enum amd_gfx_level gfx_level;
   enum radeon_family family;
   bool use_aco;

   /* System values. */
   bool bc_optimize_for_persp;
   bool bc_optimize_for_linear;

   /* Exports. */
   bool uses_discard;
   bool dcc_decompress_gfx11;
   bool alpha_to_coverage_via_mrtz;
   bool dual_src_blend_swizzle;
   unsigned spi_shader_col_format;
   unsigned color_is_int8;  /* per-MRT bitmask — presumably marks int8 color buffers; confirm */
   unsigned color_is_int10; /* per-MRT bitmask — presumably marks int10 color buffers; confirm */
   bool alpha_to_one;

   /* Vulkan only */
   unsigned enable_mrt_output_nan_fixup;
   bool no_color_export;
   bool no_depth_export;
} ac_nir_lower_ps_late_options;

/* Runs the post-link PS lowering (color/depth export generation etc.)
 * described by the options struct above.
 */
bool
ac_nir_lower_ps_late(nir_shader *nir, const ac_nir_lower_ps_late_options *options);
| |
/* Options for ac_nir_lower_tex. */
typedef struct {
   enum amd_gfx_level gfx_level;

   /* If true, round the layer component of the coordinates source to the nearest
    * integer for all array ops. This is always done for cube array ops.
    */
   bool lower_array_layer_round_even;

   /* Fix derivatives of constants and FS inputs in control flow.
    *
    * Ignores interpolateAtSample()/interpolateAtOffset(), dynamically indexed input loads,
    * pervertexEXT input loads, textureGather() with implicit LOD and 16-bit derivatives and
    * texture samples with nir_tex_src_min_lod.
    *
    * The layer must also be a constant or FS input.
    */
   bool fix_derivs_in_divergent_cf;
   unsigned max_wqm_vgprs;
} ac_nir_lower_tex_options;

/* Lowers texture operations for AMD hardware per the options above. */
bool
ac_nir_lower_tex(nir_shader *nir, const ac_nir_lower_tex_options *options);
| |
/* Emits a store of a uvec4 to the AMD debug log — NOTE(review): exact ring/
 * buffer semantics are defined by the implementation; confirm there.
 */
void
ac_nir_store_debug_log_amd(nir_builder *b, nir_def *uvec4);

/* Optimization pass using pack-half conversions where profitable for the
 * given GPU generation.
 */
bool
ac_nir_opt_pack_half(nir_shader *shader, enum amd_gfx_level gfx_level);

/* Returns the maximum instruction cost allowed when moving varying
 * expressions between the producer and consumer shaders (presumably for use
 * with NIR's inter-stage varying optimizations).
 */
unsigned
ac_nir_varying_expression_max_cost(nir_shader *producer, nir_shader *consumer);

/* Optimizes append/consume-style shared-memory (LDS) operations. */
bool
ac_nir_opt_shared_append(nir_shader *shader);

/* Flags loads that can use scalar memory (SMEM). "after_lowering" selects
 * whether the pass runs before or after memory-access lowering.
 */
bool
ac_nir_flag_smem_for_loads(nir_shader *shader, enum amd_gfx_level gfx_level, bool use_llvm, bool after_lowering);

/* Splits/widens memory accesses to bit sizes the hardware supports. */
bool
ac_nir_lower_mem_access_bit_sizes(nir_shader *shader, enum amd_gfx_level gfx_level, bool use_llvm);

/* Optimizes atomics whose operands are uniform across the wave. */
bool
ac_nir_optimize_uniform_atomics(nir_shader *nir);

/* Callback for nir_lower_bit_size: returns the bit size an instruction should
 * be lowered to, or 0 to leave it alone.
 */
unsigned
ac_nir_lower_bit_size_callback(const nir_instr *instr, void *data);

/* Returns whether ac_nir_lower_bit_size_callback could affect this shader
 * (cheap pre-check to skip the full pass).
 */
bool
ac_nir_might_lower_bit_size(const nir_shader *shader);

/* Callback for NIR's load/store vectorizer: decides whether two accesses may
 * be combined given their alignment, sizes, and the hole between them.
 */
bool
ac_nir_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size,
                              unsigned num_components, int64_t hole_size,
                              nir_intrinsic_instr *low, nir_intrinsic_instr *high, void *data);

/* Callback selecting loads that fetch more components than used and should
 * therefore be scalarized.
 */
bool
ac_nir_scalarize_overfetching_loads_callback(const nir_instr *instr, const void *data);

/* Returns the memory access qualifiers (coherent/volatile/etc.) for a memory
 * intrinsic.
 */
enum gl_access_qualifier
ac_nir_get_mem_access_flags(const nir_intrinsic_instr *instr);

/* Callback for nir_lower_phis_to_scalar: returns which phi components to
 * scalarize.
 */
uint8_t
ac_nir_lower_phis_to_scalar_cb(const nir_instr *instr, const void *_);
| |
| #ifdef __cplusplus |
| } |
| #endif |
| |
| #endif /* AC_NIR_H */ |