/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "ac_nir.h"
#include "nir_builder.h"

/*
 * Lower NIR cross-stage I/O intrinsics into the memory accesses that actually happen on the HW.
 *
 * These HW stages are used only when a Geometry Shader is used.
 * The Export Shader (ES) is the HW stage that runs the SW stage before the GS;
 * it can be either a VS or a TES.
 *
 * * GFX6-8:
 *   ES and GS are separate HW stages.
 *   I/O is passed between them through VRAM.
 * * GFX9+:
 *   ES and GS are merged into a single HW stage.
 *   I/O is passed between them through LDS.
 */

typedef struct {
   /* Which hardware generation we're dealing with */
   enum chip_class chip_class;

   /* Number of ES outputs for which memory should be reserved.
    * When compacted, this should be the number of linked ES outputs.
    */
   unsigned num_reserved_es_outputs;
} lower_esgs_io_state;

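/* Split a vector load into a series of 32-bit (dword) buffer loads, plus at most
 * one smaller load for the remaining bytes, then reassemble the requested
 * components with nir_extract_bits. component_stride is the distance in bytes
 * between two consecutive dwords of the same value in the buffer.
 */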
static nir_ssa_def *
emit_split_buffer_load(nir_builder *b, nir_ssa_def *desc, nir_ssa_def *v_off, nir_ssa_def *s_off,
                       unsigned component_stride, unsigned num_components, unsigned bit_size)
{
   unsigned total_bytes = num_components * bit_size / 8u;
   unsigned full_dwords = total_bytes / 4u;
   unsigned remaining_bytes = total_bytes - full_dwords * 4u;

   /* Accommodate max number of split 64-bit loads */
   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS * 2u];

   /* Assume that 1x32-bit load is better than 1x16-bit + 1x8-bit */
   if (remaining_bytes == 3) {
      remaining_bytes = 0;
      full_dwords++;
   }

   for (unsigned i = 0; i < full_dwords; ++i)
      comps[i] = nir_build_load_buffer_amd(b, 1, 32, desc, v_off, s_off,
                                           .base = component_stride * i, .memory_modes = nir_var_shader_in);

   if (remaining_bytes)
      comps[full_dwords] = nir_build_load_buffer_amd(b, 1, remaining_bytes * 8, desc, v_off, s_off,
                                                     .base = component_stride * full_dwords, .memory_modes = nir_var_shader_in);

   return nir_extract_bits(b, comps, full_dwords + !!remaining_bytes, 0, num_components, bit_size);
}

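/* Store each consecutive range of enabled writemask components as a series of
 * buffer stores of at most 4 bytes each, sized so that no single store crosses
 * a 4-byte (dword) boundary.
 */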
static void
emit_split_buffer_store(nir_builder *b, nir_ssa_def *d, nir_ssa_def *desc, nir_ssa_def *v_off, nir_ssa_def *s_off,
                        unsigned component_stride, unsigned num_components, unsigned bit_size,
                        unsigned writemask, bool swizzled, bool slc)
{
   while (writemask) {
      int start, count;
      u_bit_scan_consecutive_range(&writemask, &start, &count);
      assert(start >= 0 && count >= 0);

      unsigned bytes = count * bit_size / 8u;
      unsigned start_byte = start * bit_size / 8u;

      while (bytes) {
         unsigned store_bytes = MIN2(bytes, 4u);
         if ((start_byte % 4) == 1 || (start_byte % 4) == 3)
            store_bytes = MIN2(store_bytes, 1);
         else if ((start_byte % 4) == 2)
            store_bytes = MIN2(store_bytes, 2);

         nir_ssa_def *store_val = nir_extract_bits(b, &d, 1, start_byte * 8u, 1, store_bytes * 8u);
         nir_build_store_buffer_amd(b, store_val, desc, v_off, s_off, .is_swizzled = swizzled, .slc_amd = slc,
                                    .base = start_byte, .write_mask = 1u, .memory_modes = nir_var_shader_out);

         start_byte += store_bytes;
         bytes -= store_bytes;
      }
   }
}

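/* Lower an ES output store intrinsic to a store to the ESGS ring buffer (GFX6-8)
 * or to LDS (GFX9+), from where the GS will read it.
 */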
static bool
lower_es_output_store(nir_builder *b,
                      nir_instr *instr,
                      void *state)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   if (intrin->intrinsic != nir_intrinsic_store_output)
      return false;

   lower_esgs_io_state *st = (lower_esgs_io_state *) state;
   unsigned write_mask = nir_intrinsic_write_mask(intrin);

   b->cursor = nir_before_instr(instr);
   nir_ssa_def *io_off = nir_build_calc_io_offset(b, intrin, nir_imm_int(b, 16u), 4u);

   if (st->chip_class <= GFX8) {
      /* GFX6-8: ES is a separate HW stage, data is passed from ES to GS in VRAM. */
      nir_ssa_def *ring = nir_build_load_ring_esgs_amd(b);
      nir_ssa_def *es2gs_off = nir_build_load_ring_es2gs_offset_amd(b);
      emit_split_buffer_store(b, intrin->src[0].ssa, ring, io_off, es2gs_off, 4u,
                              intrin->src[0].ssa->num_components, intrin->src[0].ssa->bit_size,
                              write_mask, true, true);
   } else {
      /* GFX9+: ES is merged into GS, data is passed through LDS. */
      unsigned esgs_itemsize = st->num_reserved_es_outputs * 16u;
      nir_ssa_def *vertex_idx = nir_build_load_local_invocation_index(b);
      nir_ssa_def *off = nir_iadd(b, nir_imul_imm(b, vertex_idx, esgs_itemsize), io_off);
      nir_build_store_shared(b, intrin->src[0].ssa, off, .write_mask = write_mask,
                             .align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
   }

   nir_instr_remove(instr);
   return true;
}

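/* GFX6-8: each GS vertex offset is a separate input (one load_gs_vertex_offset_amd
 * per vertex). For a non-constant vertex index, select the right one with bcsel.
 */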
static nir_ssa_def *
gs_per_vertex_input_vertex_offset_gfx6(nir_builder *b, nir_src *vertex_src)
{
   if (nir_src_is_const(*vertex_src))
      return nir_build_load_gs_vertex_offset_amd(b, .base = nir_src_as_uint(*vertex_src));

   nir_ssa_def *vertex_offset = nir_build_load_gs_vertex_offset_amd(b, .base = 0);

   for (unsigned i = 1; i < b->shader->info.gs.vertices_in; ++i) {
      nir_ssa_def *cond = nir_ieq_imm(b, vertex_src->ssa, i);
      nir_ssa_def *elem = nir_build_load_gs_vertex_offset_amd(b, .base = i);
      vertex_offset = nir_bcsel(b, cond, elem, vertex_offset);
   }

   return vertex_offset;
}

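/* GFX9+: the GS vertex offsets are packed as 2 x 16 bits per 32-bit input,
 * so extract the 16-bit half that belongs to the requested vertex.
 */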
static nir_ssa_def *
gs_per_vertex_input_vertex_offset_gfx9(nir_builder *b, nir_src *vertex_src)
{
   if (nir_src_is_const(*vertex_src)) {
      unsigned vertex = nir_src_as_uint(*vertex_src);
      return nir_ubfe(b, nir_build_load_gs_vertex_offset_amd(b, .base = vertex / 2u * 2u),
                      nir_imm_int(b, (vertex % 2u) * 16u), nir_imm_int(b, 16u));
   }

   nir_ssa_def *vertex_offset = nir_build_load_gs_vertex_offset_amd(b, .base = 0);

   for (unsigned i = 1; i < b->shader->info.gs.vertices_in; i++) {
      nir_ssa_def *cond = nir_ieq_imm(b, vertex_src->ssa, i);
      nir_ssa_def *elem = nir_build_load_gs_vertex_offset_amd(b, .base = i / 2u * 2u);
      if (i % 2u)
         elem = nir_ishr_imm(b, elem, 16u);

      vertex_offset = nir_bcsel(b, cond, elem, vertex_offset);
   }

   return nir_iand_imm(b, vertex_offset, 0xffffu);
}

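/* Calculate the offset of a GS per-vertex input: the per-vertex offset (from the
 * gs_vertex_offset helpers above) plus the offset of the accessed slot/component,
 * converted to bytes.
 */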
static nir_ssa_def *
gs_per_vertex_input_offset(nir_builder *b,
                           lower_esgs_io_state *st,
                           nir_intrinsic_instr *instr)
{
   nir_src *vertex_src = nir_get_io_vertex_index_src(instr);
   nir_ssa_def *vertex_offset = st->chip_class >= GFX9
      ? gs_per_vertex_input_vertex_offset_gfx9(b, vertex_src)
      : gs_per_vertex_input_vertex_offset_gfx6(b, vertex_src);

   unsigned base_stride = st->chip_class >= GFX9 ? 1 : 64 /* Wave size on GFX6-8 */;
   nir_ssa_def *io_off = nir_build_calc_io_offset(b, instr, nir_imm_int(b, base_stride * 4u), base_stride);
   nir_ssa_def *off = nir_iadd(b, io_off, vertex_offset);
   return nir_imul_imm(b, off, 4u);
}

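/* Lower a GS per-vertex input load to an LDS load (GFX9+) or an ESGS ring
 * buffer load (GFX6-8).
 */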
static nir_ssa_def *
lower_gs_per_vertex_input_load(nir_builder *b,
                               nir_instr *instr,
                               void *state)
{
   lower_esgs_io_state *st = (lower_esgs_io_state *) state;
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   nir_ssa_def *off = gs_per_vertex_input_offset(b, st, intrin);

   if (st->chip_class >= GFX9)
      return nir_build_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off,
                                   .align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);

   unsigned wave_size = 64u; /* GFX6-8 only support wave64 */
   nir_ssa_def *ring = nir_build_load_ring_esgs_amd(b);
   return emit_split_buffer_load(b, ring, off, nir_imm_zero(b, 1, 32), 4u * wave_size,
                                 intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size);
}

static bool
filter_load_per_vertex_input(const nir_instr *instr, UNUSED const void *state)
{
   return instr->type == nir_instr_type_intrinsic &&
          nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_per_vertex_input;
}

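/* Lower ES output stores to the memory accesses (ESGS ring buffer or LDS) that
 * pass the outputs to the GS. Run this pass on the ES (VS or TES before GS).
 */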
void
ac_nir_lower_es_outputs_to_mem(nir_shader *shader,
                               enum chip_class chip_class,
                               unsigned num_reserved_es_outputs)
{
   lower_esgs_io_state state = {
      .chip_class = chip_class,
      .num_reserved_es_outputs = num_reserved_es_outputs,
   };

   nir_shader_instructions_pass(shader,
                                lower_es_output_store,
                                nir_metadata_block_index | nir_metadata_dominance,
                                &state);
}

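/* Lower GS per-vertex input loads to the corresponding memory reads (ESGS ring
 * buffer or LDS). Run this pass on the GS.
 */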
void
ac_nir_lower_gs_inputs_to_mem(nir_shader *shader,
                              enum chip_class chip_class,
                              unsigned num_reserved_es_outputs)
{
   lower_esgs_io_state state = {
      .chip_class = chip_class,
      .num_reserved_es_outputs = num_reserved_es_outputs,
   };

   nir_shader_lower_instructions(shader,
                                 filter_load_per_vertex_input,
                                 lower_gs_per_vertex_input_load,
                                 &state);
}