| /* |
| * Copyright © 2021 Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| #include "brw_compiler.h" |
| #include "brw_fs.h" |
| #include "brw_nir.h" |
| #include "brw_private.h" |
| #include "compiler/nir/nir_builder.h" |
| #include "dev/intel_debug.h" |
| |
| using namespace brw; |
| |
| static bool |
| brw_nir_lower_load_uniforms_filter(const nir_instr *instr, |
| UNUSED const void *data) |
| { |
| if (instr->type != nir_instr_type_intrinsic) |
| return false; |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| return intrin->intrinsic == nir_intrinsic_load_uniform; |
| } |
| |
| static nir_ssa_def * |
| brw_nir_lower_load_uniforms_impl(nir_builder *b, nir_instr *instr, |
| UNUSED void *data) |
| { |
| assert(instr->type == nir_instr_type_intrinsic); |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| assert(intrin->intrinsic == nir_intrinsic_load_uniform); |
| |
| /* Read the first few 32-bit scalars from InlineData. */ |
| if (nir_src_is_const(intrin->src[0]) && |
| nir_dest_bit_size(intrin->dest) == 32 && |
| nir_dest_num_components(intrin->dest) == 1) { |
| unsigned off = nir_intrinsic_base(intrin) + nir_src_as_uint(intrin->src[0]); |
| unsigned off_dw = off / 4; |
| if (off % 4 == 0 && off_dw < BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW) { |
| off_dw += BRW_TASK_MESH_PUSH_CONSTANTS_START_DW; |
| return nir_load_mesh_inline_data_intel(b, 32, off_dw); |
| } |
| } |
| |
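   /* Fall back to a memory load: the first 64 bits of inline data hold the
    * address the remaining uniform data is loaded from.
    */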
| return brw_nir_load_global_const(b, intrin, |
| nir_load_mesh_inline_data_intel(b, 64, 0), 0); |
| } |
| |
| static bool |
| brw_nir_lower_load_uniforms(nir_shader *nir) |
| { |
| return nir_shader_lower_instructions(nir, brw_nir_lower_load_uniforms_filter, |
| brw_nir_lower_load_uniforms_impl, NULL); |
| } |
| |
| static inline int |
| type_size_scalar_dwords(const struct glsl_type *type, bool bindless) |
| { |
| return glsl_count_dword_slots(type, bindless); |
| } |
| |
| /* TODO(mesh): Make this a common function. */ |
| static void |
| shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align) |
| { |
| assert(glsl_type_is_vector_or_scalar(type)); |
| |
| uint32_t comp_size = glsl_type_is_boolean(type) |
| ? 4 : glsl_get_bit_size(type) / 8; |
| unsigned length = glsl_get_vector_elements(type); |
   *size = comp_size * length;
   *align = comp_size * (length == 3 ? 4 : length);
| } |
| |
| static void |
| brw_nir_lower_tue_outputs(nir_shader *nir, brw_tue_map *map) |
| { |
| memset(map, 0, sizeof(*map)); |
| |
   /* The TUE header contains 4 dwords:
    *
    * - Dword 0 for Task Count.
    *
    * - Dwords 1-3 used for the "Dispatch Dimensions" feature, to allow
    *   mapping a 3D dispatch into the 1D dispatch supported by HW.
    *   Currently not used.
    */
| nir_foreach_shader_out_variable(var, nir) { |
| assert(var->data.location == VARYING_SLOT_TASK_COUNT); |
| var->data.driver_location = 0; |
| } |
| |
| NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out, |
| type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32); |
| |
| /* From bspec: "It is suggested that SW reserve the 16 bytes following the |
| * TUE Header, and therefore start the SW-defined data structure at 32B |
| * alignment. This allows the TUE Header to always be written as 32 bytes |
| * with 32B alignment, the most optimal write performance case." |
| */ |
| map->per_task_data_start_dw = 8; |
| |
| /* Lowering to explicit types will start offsets from task_payload_size, so |
| * set it to start after the header. |
| */ |
| nir->info.task_payload_size = map->per_task_data_start_dw * 4; |
| NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, |
| nir_var_mem_task_payload, shared_type_info); |
| NIR_PASS(_, nir, nir_lower_explicit_io, |
| nir_var_mem_task_payload, nir_address_format_32bit_offset); |
| |
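   /* Total TUE size in dwords: payload bytes rounded up to dwords, then
    * padded to a multiple of 8 dwords (32B).
    */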
| map->size_dw = ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8); |
| } |
| |
| static void |
| brw_print_tue_map(FILE *fp, const struct brw_tue_map *map) |
| { |
| fprintf(fp, "TUE (%d dwords)\n\n", map->size_dw); |
| } |
| |
| static bool |
| brw_nir_adjust_task_payload_offsets_instr(struct nir_builder *b, |
| nir_instr *instr, void *data) |
| { |
| if (instr->type != nir_instr_type_intrinsic) |
| return false; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| switch (intrin->intrinsic) { |
| case nir_intrinsic_store_task_payload: |
| case nir_intrinsic_load_task_payload: { |
| nir_src *offset_src = nir_get_io_offset_src(intrin); |
| |
| if (nir_src_is_const(*offset_src)) |
| assert(nir_src_as_uint(*offset_src) % 4 == 0); |
| |
| b->cursor = nir_before_instr(&intrin->instr); |
| |
      /* Regular I/O uses dword offsets while the explicit I/O used for the
       * task payload uses byte offsets, so normalize to dwords.
       *
       * TODO(mesh): Figure out how to handle 8-bit, 16-bit.
       */
| |
| assert(offset_src->is_ssa); |
| nir_ssa_def *offset = nir_ishr_imm(b, offset_src->ssa, 2); |
| nir_instr_rewrite_src(&intrin->instr, offset_src, nir_src_for_ssa(offset)); |
| |
| return true; |
| } |
| |
| default: |
| return false; |
| } |
| } |
| |
| static bool |
| brw_nir_adjust_task_payload_offsets(nir_shader *nir) |
| { |
| return nir_shader_instructions_pass(nir, |
| brw_nir_adjust_task_payload_offsets_instr, |
| nir_metadata_block_index | |
| nir_metadata_dominance, |
| NULL); |
| } |
| |
| static void |
| brw_nir_adjust_payload(nir_shader *shader, const struct brw_compiler *compiler) |
| { |
   /* Adjustment of task payload offsets must be performed *after* the last
    * pass that interprets them as bytes, because the adjustment changes
    * their unit.
    */
| bool adjusted = false; |
| NIR_PASS(adjusted, shader, brw_nir_adjust_task_payload_offsets); |
| if (adjusted) /* clean up the mess created by offset adjustments */ |
| NIR_PASS(_, shader, nir_opt_constant_folding); |
| } |
| |
| const unsigned * |
| brw_compile_task(const struct brw_compiler *compiler, |
| void *mem_ctx, |
| struct brw_compile_task_params *params) |
| { |
| struct nir_shader *nir = params->nir; |
| const struct brw_task_prog_key *key = params->key; |
| struct brw_task_prog_data *prog_data = params->prog_data; |
| const bool debug_enabled = INTEL_DEBUG(DEBUG_TASK); |
| |
| prog_data->base.base.stage = MESA_SHADER_TASK; |
| prog_data->base.base.total_shared = nir->info.shared_size; |
| prog_data->base.base.total_scratch = 0; |
| |
| prog_data->base.local_size[0] = nir->info.workgroup_size[0]; |
| prog_data->base.local_size[1] = nir->info.workgroup_size[1]; |
| prog_data->base.local_size[2] = nir->info.workgroup_size[2]; |
| |
| prog_data->uses_drawid = |
| BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID); |
| |
| brw_nir_lower_tue_outputs(nir, &prog_data->map); |
| |
| const unsigned required_dispatch_width = |
| brw_required_dispatch_width(&nir->info); |
| |
| fs_visitor *v[3] = {0}; |
| const char *error[3] = {0}; |
| |
| for (unsigned simd = 0; simd < 3; simd++) { |
| if (!brw_simd_should_compile(mem_ctx, simd, compiler->devinfo, &prog_data->base, |
| required_dispatch_width, &error[simd])) |
| continue; |
| |
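      /* simd values 0/1/2 correspond to SIMD8/SIMD16/SIMD32. */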
| const unsigned dispatch_width = 8 << simd; |
| |
| nir_shader *shader = nir_shader_clone(mem_ctx, nir); |
| brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true /* is_scalar */); |
| |
| NIR_PASS(_, shader, brw_nir_lower_load_uniforms); |
| NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width); |
| |
| brw_postprocess_nir(shader, compiler, true /* is_scalar */, debug_enabled, |
| key->base.robust_buffer_access); |
| |
| brw_nir_adjust_payload(shader, compiler); |
| |
| v[simd] = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base, |
| &prog_data->base.base, shader, dispatch_width, |
| debug_enabled); |
| |
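   /* If an earlier SIMD variant already compiled, reuse its uniform layout
    * so all variants agree on uniform locations.
    */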
| if (prog_data->base.prog_mask) { |
| unsigned first = ffs(prog_data->base.prog_mask) - 1; |
| v[simd]->import_uniforms(v[first]); |
| } |
| |
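   /* Only allow spilling while no variant has compiled yet: a wider variant
    * is only worth keeping if it avoids spills.
    */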
| const bool allow_spilling = !prog_data->base.prog_mask; |
| |
| if (v[simd]->run_task(allow_spilling)) |
| brw_simd_mark_compiled(simd, &prog_data->base, v[simd]->spilled_any_registers); |
| else |
| error[simd] = ralloc_strdup(mem_ctx, v[simd]->fail_msg); |
| } |
| |
| int selected_simd = brw_simd_select(&prog_data->base); |
| if (selected_simd < 0) { |
| params->error_str = ralloc_asprintf(mem_ctx, "Can't compile shader: %s, %s and %s.\n", |
                                          error[0], error[1], error[2]);
| return NULL; |
| } |
| |
| fs_visitor *selected = v[selected_simd]; |
| prog_data->base.prog_mask = 1 << selected_simd; |
| |
| if (unlikely(debug_enabled)) { |
| fprintf(stderr, "Task Output "); |
| brw_print_tue_map(stderr, &prog_data->map); |
| } |
| |
| fs_generator g(compiler, params->log_data, mem_ctx, |
| &prog_data->base.base, false, MESA_SHADER_TASK); |
| if (unlikely(debug_enabled)) { |
| g.enable_debug(ralloc_asprintf(mem_ctx, |
| "%s task shader %s", |
| nir->info.label ? nir->info.label |
| : "unnamed", |
| nir->info.name)); |
| } |
| |
| g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats, |
| selected->performance_analysis.require(), params->stats); |
| |
| delete v[0]; |
| delete v[1]; |
| delete v[2]; |
| |
| return g.get_assembly(); |
| } |
| |
| static void |
| brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map) |
| { |
| if (!map) |
| return; |
| |
| nir->info.task_payload_size = map->per_task_data_start_dw * 4; |
| |
| bool progress = false; |
| |
| NIR_PASS(progress, nir, nir_lower_vars_to_explicit_types, |
| nir_var_mem_task_payload, shared_type_info); |
| |
| if (progress) { |
| /* The types for Task Output and Mesh Input should match, so their sizes |
| * should also match. |
| */ |
| assert(map->size_dw == ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8)); |
| } else { |
      /* The mesh shader doesn't read any input; to make that clear, set
       * task_payload_size to zero instead of keeping an incomplete size
       * that only covers the header.
       */
| nir->info.task_payload_size = 0; |
| } |
| |
| NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_task_payload, |
| nir_address_format_32bit_offset); |
| } |
| |
| /* Mesh URB Entry consists of an initial section |
| * |
| * - Primitive Count |
| * - Primitive Indices (from 0 to Max-1) |
| * - Padding to 32B if needed |
| * |
| * optionally followed by a section for per-primitive data, |
| * in which each primitive (from 0 to Max-1) gets |
| * |
| * - Primitive Header (e.g. ViewportIndex) |
| * - Primitive Custom Attributes |
| * |
| * then followed by a section for per-vertex data |
| * |
| * - Vertex Header (e.g. Position) |
| * - Vertex Custom Attributes |
| * |
 * Each per-element section has a pitch and a starting offset. All the
 * individual attribute offsets in start_dw are relative to the first entry
 * of the section (i.e. where the Position of the first vertex or the
 * ViewportIndex of the first primitive lives). Attributes for other
 * elements are calculated using the pitch.
| */ |
| static void |
| brw_compute_mue_map(struct nir_shader *nir, struct brw_mue_map *map) |
| { |
| memset(map, 0, sizeof(*map)); |
| |
| for (int i = 0; i < VARYING_SLOT_MAX; i++) |
| map->start_dw[i] = -1; |
| |
| unsigned vertices_per_primitive = |
| num_mesh_vertices_per_primitive(nir->info.mesh.primitive_type); |
| |
| map->max_primitives = nir->info.mesh.max_primitives_out; |
| map->max_vertices = nir->info.mesh.max_vertices_out; |
| |
| uint64_t outputs_written = nir->info.outputs_written; |
| |
| /* Assign initial section. */ |
| if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT) & outputs_written) { |
| map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT] = 0; |
| outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_COUNT); |
| } |
| if (BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES) & outputs_written) { |
| map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES] = 1; |
| outputs_written &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES); |
| } |
| |
   /* One dword for the primitive count, then vertices_per_primitive dwords
    * for each primitive. Note this should change when we implement other
    * index types.
    */
| const unsigned primitive_list_size_dw = 1 + vertices_per_primitive * map->max_primitives; |
| |
| /* TODO(mesh): Multiview. */ |
| map->per_primitive_header_size_dw = |
| (nir->info.outputs_written & (BITFIELD64_BIT(VARYING_SLOT_VIEWPORT) | |
| BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE) | |
| BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_SHADING_RATE) | |
| BITFIELD64_BIT(VARYING_SLOT_LAYER))) ? 8 : 0; |
| |
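   /* The per-primitive section starts at the next 32B (8 dword) boundary
    * after the primitive index list.
    */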
| map->per_primitive_start_dw = ALIGN(primitive_list_size_dw, 8); |
| |
| map->per_primitive_data_size_dw = 0; |
| u_foreach_bit64(location, outputs_written & nir->info.per_primitive_outputs) { |
| assert(map->start_dw[location] == -1); |
| |
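      /* Built-ins with a fixed slot in the Primitive Header: dword 0 holds
       * the shading rate, 1 the RTAIndex, 2 the ViewportIndex and 3 the
       * CullPrimitiveMask. Everything else is packed as 4-dword attributes
       * after the header.
       */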
| unsigned start; |
| switch (location) { |
| case VARYING_SLOT_PRIMITIVE_SHADING_RATE: |
| start = map->per_primitive_start_dw + 0; |
| break; |
| case VARYING_SLOT_LAYER: |
| start = map->per_primitive_start_dw + 1; /* RTAIndex */ |
| break; |
| case VARYING_SLOT_VIEWPORT: |
| start = map->per_primitive_start_dw + 2; |
| break; |
| case VARYING_SLOT_CULL_PRIMITIVE: |
| start = map->per_primitive_start_dw + 3; |
| break; |
| default: |
| assert(location == VARYING_SLOT_PRIMITIVE_ID || |
| location >= VARYING_SLOT_VAR0); |
| start = map->per_primitive_start_dw + |
| map->per_primitive_header_size_dw + |
| map->per_primitive_data_size_dw; |
| map->per_primitive_data_size_dw += 4; |
| break; |
| } |
| |
| map->start_dw[location] = start; |
| } |
| |
| map->per_primitive_pitch_dw = ALIGN(map->per_primitive_header_size_dw + |
| map->per_primitive_data_size_dw, 8); |
| |
| map->per_vertex_start_dw = ALIGN(map->per_primitive_start_dw + |
| map->per_primitive_pitch_dw * map->max_primitives, 8); |
| |
| /* TODO(mesh): Multiview. */ |
| unsigned fixed_header_size = 8; |
| map->per_vertex_header_size_dw = ALIGN(fixed_header_size + |
| nir->info.clip_distance_array_size + |
| nir->info.cull_distance_array_size, 8); |
| map->per_vertex_data_size_dw = 0; |
| u_foreach_bit64(location, outputs_written & ~nir->info.per_primitive_outputs) { |
| assert(map->start_dw[location] == -1); |
| |
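      /* Built-ins with a fixed slot in the Vertex Header: PointSize in
       * dword 3, Position in dwords 4-7, and clip distances right after the
       * 8-dword fixed header. User varyings are packed as 4-dword
       * attributes after the full header.
       */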
| unsigned start; |
| switch (location) { |
| case VARYING_SLOT_PSIZ: |
| start = map->per_vertex_start_dw + 3; |
| break; |
| case VARYING_SLOT_POS: |
| start = map->per_vertex_start_dw + 4; |
| break; |
| case VARYING_SLOT_CLIP_DIST0: |
| start = map->per_vertex_start_dw + fixed_header_size + 0; |
| break; |
| case VARYING_SLOT_CLIP_DIST1: |
| start = map->per_vertex_start_dw + fixed_header_size + 4; |
| break; |
| case VARYING_SLOT_CULL_DIST0: |
| case VARYING_SLOT_CULL_DIST1: |
| unreachable("cull distances should be lowered earlier"); |
| break; |
| default: |
| assert(location >= VARYING_SLOT_VAR0); |
| start = map->per_vertex_start_dw + |
| map->per_vertex_header_size_dw + |
| map->per_vertex_data_size_dw; |
| map->per_vertex_data_size_dw += 4; |
| break; |
| } |
| map->start_dw[location] = start; |
| } |
| |
| map->per_vertex_pitch_dw = ALIGN(map->per_vertex_header_size_dw + |
| map->per_vertex_data_size_dw, 8); |
| |
| map->size_dw = |
| map->per_vertex_start_dw + map->per_vertex_pitch_dw * map->max_vertices; |
| |
| assert(map->size_dw % 8 == 0); |
| } |
| |
| static void |
| brw_print_mue_map(FILE *fp, const struct brw_mue_map *map) |
| { |
| fprintf(fp, "MUE map (%d dwords, %d primitives, %d vertices)\n", |
| map->size_dw, map->max_primitives, map->max_vertices); |
| fprintf(fp, " %4d: VARYING_SLOT_PRIMITIVE_COUNT\n", |
| map->start_dw[VARYING_SLOT_PRIMITIVE_COUNT]); |
| fprintf(fp, " %4d: VARYING_SLOT_PRIMITIVE_INDICES\n", |
| map->start_dw[VARYING_SLOT_PRIMITIVE_INDICES]); |
| |
| fprintf(fp, " ----- per primitive (start %d, header_size %d, data_size %d, pitch %d)\n", |
| map->per_primitive_start_dw, |
| map->per_primitive_header_size_dw, |
| map->per_primitive_data_size_dw, |
| map->per_primitive_pitch_dw); |
| |
| for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) { |
| if (map->start_dw[i] < 0) |
| continue; |
| const unsigned offset = map->start_dw[i]; |
| if (offset >= map->per_primitive_start_dw && |
| offset < map->per_primitive_start_dw + map->per_primitive_pitch_dw) { |
| fprintf(fp, " %4d: %s\n", offset, |
| gl_varying_slot_name_for_stage((gl_varying_slot)i, |
| MESA_SHADER_MESH)); |
| } |
| } |
| |
| fprintf(fp, " ----- per vertex (start %d, header_size %d, data_size %d, pitch %d)\n", |
| map->per_vertex_start_dw, |
| map->per_vertex_header_size_dw, |
| map->per_vertex_data_size_dw, |
| map->per_vertex_pitch_dw); |
| |
| for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) { |
| if (map->start_dw[i] < 0) |
| continue; |
| const unsigned offset = map->start_dw[i]; |
| if (offset >= map->per_vertex_start_dw && |
| offset < map->per_vertex_start_dw + map->per_vertex_pitch_dw) { |
| fprintf(fp, " %4d: %s\n", offset, |
| gl_varying_slot_name_for_stage((gl_varying_slot)i, |
| MESA_SHADER_MESH)); |
| } |
| } |
| |
| fprintf(fp, "\n"); |
| } |
| |
| static void |
| brw_nir_lower_mue_outputs(nir_shader *nir, const struct brw_mue_map *map) |
| { |
| nir_foreach_shader_out_variable(var, nir) { |
| int location = var->data.location; |
| assert(location >= 0); |
| assert(map->start_dw[location] != -1); |
| var->data.driver_location = map->start_dw[location]; |
| } |
| |
| NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out, |
| type_size_scalar_dwords, nir_lower_io_lower_64bit_to_32); |
| |
| NIR_PASS(_, nir, brw_nir_lower_shading_rate_output); |
| } |
| |
| static void |
| brw_nir_initialize_mue(nir_shader *nir, |
| const struct brw_mue_map *map, |
| unsigned dispatch_width) |
| { |
| assert(map->per_primitive_header_size_dw > 0); |
| |
| nir_builder b; |
| nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir); |
| nir_builder_init(&b, entrypoint); |
| b.cursor = nir_before_block(nir_start_block(entrypoint)); |
| |
| nir_ssa_def *dw_off = nir_imm_int(&b, 0); |
| nir_ssa_def *zerovec = nir_imm_vec4(&b, 0, 0, 0, 0); |
| |
| /* TODO(mesh): can we write in bigger batches, generating fewer SENDs? */ |
| |
| assert(!nir->info.workgroup_size_variable); |
| const unsigned workgroup_size = nir->info.workgroup_size[0] * |
| nir->info.workgroup_size[1] * |
| nir->info.workgroup_size[2]; |
| |
   /* Invocations from a single workgroup cooperate in zeroing the MUE. */

   /* Number of primitives each invocation zeroes without needing to check
    * its index.
    */
| unsigned prims_per_inv = map->max_primitives / workgroup_size; |
| |
| /* Zero first 4 dwords of MUE Primitive Header: |
| * Reserved, RTAIndex, ViewportIndex, CullPrimitiveMask. |
| */ |
| |
| nir_ssa_def *local_invocation_index = nir_load_local_invocation_index(&b); |
| |
   /* Zero primitive headers spaced workgroup_size apart, starting at the
    * invocation index.
    */
| for (unsigned prim_in_inv = 0; prim_in_inv < prims_per_inv; ++prim_in_inv) { |
| nir_ssa_def *prim = nir_iadd_imm(&b, local_invocation_index, |
| prim_in_inv * workgroup_size); |
| |
| nir_store_per_primitive_output(&b, zerovec, prim, dw_off, |
| .base = (int)map->per_primitive_start_dw, |
| .write_mask = WRITEMASK_XYZW, |
| .component = 0, |
| .src_type = nir_type_uint32); |
| } |
| |
| /* How many prims are left? */ |
| unsigned remaining = map->max_primitives % workgroup_size; |
| |
| if (remaining) { |
| /* Zero "remaining" primitive headers starting from the last one covered |
| * by the loop above + workgroup_size. |
| */ |
| nir_ssa_def *cmp = nir_ilt(&b, local_invocation_index, |
| nir_imm_int(&b, remaining)); |
| nir_if *if_stmt = nir_push_if(&b, cmp); |
| { |
| nir_ssa_def *prim = nir_iadd_imm(&b, local_invocation_index, |
| prims_per_inv * workgroup_size); |
| |
| nir_store_per_primitive_output(&b, zerovec, prim, dw_off, |
| .base = (int)map->per_primitive_start_dw, |
| .write_mask = WRITEMASK_XYZW, |
| .component = 0, |
| .src_type = nir_type_uint32); |
| } |
| nir_pop_if(&b, if_stmt); |
| } |
| |
   /* If there's more than one subgroup, we need to wait for all of them to
    * finish initialization before we can proceed. Otherwise some subgroups
    * may start filling the MUE before others have finished initializing.
    */
| if (workgroup_size > dispatch_width) { |
| nir_scoped_barrier(&b, NIR_SCOPE_WORKGROUP, NIR_SCOPE_WORKGROUP, |
| NIR_MEMORY_ACQ_REL, nir_var_shader_out); |
| } |
| |
| if (remaining) { |
| nir_metadata_preserve(entrypoint, nir_metadata_none); |
| } else { |
| nir_metadata_preserve(entrypoint, nir_metadata_block_index | |
| nir_metadata_dominance); |
| } |
| } |
| |
| static bool |
| brw_nir_adjust_offset_for_arrayed_indices_instr(nir_builder *b, nir_instr *instr, void *data) |
| { |
| if (instr->type != nir_instr_type_intrinsic) |
| return false; |
| |
| nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); |
| |
| const struct brw_mue_map *map = (const struct brw_mue_map *) data; |
| |
| /* Remap per_vertex and per_primitive offsets using the extra source and |
| * the pitch. |
| */ |
| switch (intrin->intrinsic) { |
| case nir_intrinsic_load_per_vertex_output: |
| case nir_intrinsic_store_per_vertex_output: { |
| const bool is_load = intrin->intrinsic == nir_intrinsic_load_per_vertex_output; |
| nir_src *index_src = &intrin->src[is_load ? 0 : 1]; |
| nir_src *offset_src = &intrin->src[is_load ? 1 : 2]; |
| |
| assert(index_src->is_ssa); |
| b->cursor = nir_before_instr(&intrin->instr); |
| nir_ssa_def *offset = |
| nir_iadd(b, |
| offset_src->ssa, |
| nir_imul_imm(b, index_src->ssa, map->per_vertex_pitch_dw)); |
| nir_instr_rewrite_src(&intrin->instr, offset_src, nir_src_for_ssa(offset)); |
| return true; |
| } |
| |
| case nir_intrinsic_load_per_primitive_output: |
| case nir_intrinsic_store_per_primitive_output: { |
| const bool is_load = intrin->intrinsic == nir_intrinsic_load_per_primitive_output; |
| nir_src *index_src = &intrin->src[is_load ? 0 : 1]; |
| nir_src *offset_src = &intrin->src[is_load ? 1 : 2]; |
| |
| assert(index_src->is_ssa); |
| b->cursor = nir_before_instr(&intrin->instr); |
| |
| nir_ssa_def *offset = |
| nir_iadd(b, |
| offset_src->ssa, |
| nir_imul_imm(b, index_src->ssa, map->per_primitive_pitch_dw)); |
| nir_instr_rewrite_src(&intrin->instr, offset_src, nir_src_for_ssa(offset)); |
| return true; |
| } |
| |
| default: |
| return false; |
| } |
| } |
| |
| static bool |
| brw_nir_adjust_offset_for_arrayed_indices(nir_shader *nir, const struct brw_mue_map *map) |
| { |
| return nir_shader_instructions_pass(nir, |
| brw_nir_adjust_offset_for_arrayed_indices_instr, |
| nir_metadata_block_index | |
| nir_metadata_dominance, |
| (void *)map); |
| } |
| |
| const unsigned * |
| brw_compile_mesh(const struct brw_compiler *compiler, |
| void *mem_ctx, |
| struct brw_compile_mesh_params *params) |
| { |
| struct nir_shader *nir = params->nir; |
| const struct brw_mesh_prog_key *key = params->key; |
| struct brw_mesh_prog_data *prog_data = params->prog_data; |
| const bool debug_enabled = INTEL_DEBUG(DEBUG_MESH); |
| |
| prog_data->base.base.stage = MESA_SHADER_MESH; |
| prog_data->base.base.total_shared = nir->info.shared_size; |
| prog_data->base.base.total_scratch = 0; |
| |
| prog_data->base.local_size[0] = nir->info.workgroup_size[0]; |
| prog_data->base.local_size[1] = nir->info.workgroup_size[1]; |
| prog_data->base.local_size[2] = nir->info.workgroup_size[2]; |
| |
| prog_data->clip_distance_mask = (1 << nir->info.clip_distance_array_size) - 1; |
| prog_data->cull_distance_mask = |
| ((1 << nir->info.cull_distance_array_size) - 1) << |
| nir->info.clip_distance_array_size; |
| prog_data->primitive_type = nir->info.mesh.primitive_type; |
| |
| /* TODO(mesh): Use other index formats (that are more compact) for optimization. */ |
| prog_data->index_format = BRW_INDEX_FORMAT_U32; |
| |
| prog_data->uses_drawid = |
| BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID); |
| |
| brw_nir_lower_tue_inputs(nir, params->tue_map); |
| |
| brw_compute_mue_map(nir, &prog_data->map); |
| brw_nir_lower_mue_outputs(nir, &prog_data->map); |
| |
| const unsigned required_dispatch_width = |
| brw_required_dispatch_width(&nir->info); |
| |
| fs_visitor *v[3] = {0}; |
| const char *error[3] = {0}; |
| |
| for (int simd = 0; simd < 3; simd++) { |
| if (!brw_simd_should_compile(mem_ctx, simd, compiler->devinfo, &prog_data->base, |
| required_dispatch_width, &error[simd])) |
| continue; |
| |
| const unsigned dispatch_width = 8 << simd; |
| |
| nir_shader *shader = nir_shader_clone(mem_ctx, nir); |
| |
| /* |
| * When Primitive Header is enabled, we may not generates writes to all |
| * fields, so let's initialize everything. |
| */ |
| if (prog_data->map.per_primitive_header_size_dw > 0) |
| NIR_PASS_V(shader, brw_nir_initialize_mue, &prog_data->map, dispatch_width); |
| |
| brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true /* is_scalar */); |
| |
| NIR_PASS(_, shader, brw_nir_adjust_offset_for_arrayed_indices, &prog_data->map); |
| /* Load uniforms can do a better job for constants, so fold before it. */ |
| NIR_PASS(_, shader, nir_opt_constant_folding); |
| NIR_PASS(_, shader, brw_nir_lower_load_uniforms); |
| |
| NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width); |
| |
| brw_postprocess_nir(shader, compiler, true /* is_scalar */, debug_enabled, |
| key->base.robust_buffer_access); |
| |
| brw_nir_adjust_payload(shader, compiler); |
| |
| v[simd] = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base, |
| &prog_data->base.base, shader, dispatch_width, |
| debug_enabled); |
| |
| if (prog_data->base.prog_mask) { |
| unsigned first = ffs(prog_data->base.prog_mask) - 1; |
| v[simd]->import_uniforms(v[first]); |
| } |
| |
| const bool allow_spilling = !prog_data->base.prog_mask; |
| |
| if (v[simd]->run_mesh(allow_spilling)) |
| brw_simd_mark_compiled(simd, &prog_data->base, v[simd]->spilled_any_registers); |
| else |
| error[simd] = ralloc_strdup(mem_ctx, v[simd]->fail_msg); |
| } |
| |
| int selected_simd = brw_simd_select(&prog_data->base); |
| if (selected_simd < 0) { |
| params->error_str = ralloc_asprintf(mem_ctx, "Can't compile shader: %s, %s and %s.\n", |
                                          error[0], error[1], error[2]);
| return NULL; |
| } |
| |
| fs_visitor *selected = v[selected_simd]; |
| prog_data->base.prog_mask = 1 << selected_simd; |
| |
| if (unlikely(debug_enabled)) { |
| if (params->tue_map) { |
| fprintf(stderr, "Mesh Input "); |
| brw_print_tue_map(stderr, params->tue_map); |
| } |
| fprintf(stderr, "Mesh Output "); |
| brw_print_mue_map(stderr, &prog_data->map); |
| } |
| |
| fs_generator g(compiler, params->log_data, mem_ctx, |
| &prog_data->base.base, false, MESA_SHADER_MESH); |
| if (unlikely(debug_enabled)) { |
| g.enable_debug(ralloc_asprintf(mem_ctx, |
| "%s mesh shader %s", |
| nir->info.label ? nir->info.label |
| : "unnamed", |
| nir->info.name)); |
| } |
| |
| g.generate_code(selected->cfg, selected->dispatch_width, selected->shader_stats, |
| selected->performance_analysis.require(), params->stats); |
| |
| delete v[0]; |
| delete v[1]; |
| delete v[2]; |
| |
| return g.get_assembly(); |
| } |
| |
| static fs_reg |
| get_mesh_urb_handle(const fs_builder &bld, nir_intrinsic_op op) |
| { |
| unsigned subreg; |
| if (bld.shader->stage == MESA_SHADER_TASK) { |
| subreg = 6; |
| } else { |
| assert(bld.shader->stage == MESA_SHADER_MESH); |
| subreg = op == nir_intrinsic_load_task_payload ? 7 : 6; |
| } |
| |
| fs_builder ubld8 = bld.group(8, 0).exec_all(); |
| |
| fs_reg h = ubld8.vgrf(BRW_REGISTER_TYPE_UD, 1); |
| ubld8.MOV(h, retype(brw_vec1_grf(0, subreg), BRW_REGISTER_TYPE_UD)); |
| ubld8.AND(h, h, brw_imm_ud(0xFFFF)); |
| |
| return h; |
| } |
| |
| static unsigned |
| component_from_intrinsic(nir_intrinsic_instr *instr) |
| { |
| if (nir_intrinsic_has_component(instr)) |
| return nir_intrinsic_component(instr); |
| else |
| return 0; |
| } |
| |
| static void |
| adjust_handle_and_offset(const fs_builder &bld, |
| fs_reg &urb_handle, |
| unsigned &urb_global_offset) |
| { |
   /* Make sure the URB global offset stays below 2048 (2^11), the maximum
    * value that can be encoded in the Message Descriptor; any excess is
    * folded into the URB handle instead.
    */
| unsigned adjustment = (urb_global_offset >> 11) << 11; |
| |
| if (adjustment) { |
| fs_builder ubld8 = bld.group(8, 0).exec_all(); |
| ubld8.ADD(urb_handle, urb_handle, brw_imm_ud(adjustment)); |
| urb_global_offset -= adjustment; |
| } |
| } |
| |
| static void |
| emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr, |
| const fs_reg &src) |
| { |
| assert(nir_src_bit_size(instr->src[0]) == 32); |
| |
| nir_src *offset_nir_src = nir_get_io_offset_src(instr); |
| assert(nir_src_is_const(*offset_nir_src)); |
| |
| fs_reg urb_handle = get_mesh_urb_handle(bld, instr->intrinsic); |
| |
| const unsigned comps = nir_src_num_components(instr->src[0]); |
| assert(comps <= 4); |
| |
| const unsigned mask = nir_intrinsic_write_mask(instr); |
| const unsigned offset_in_dwords = nir_intrinsic_base(instr) + |
| nir_src_as_uint(*offset_nir_src) + |
| component_from_intrinsic(instr); |
| |
   /* URB writes are vec4-aligned but the intrinsic offsets are in dwords.
    * With a max of 4 components, an intrinsic can require up to two writes.
    *
    * The first URB write will be shifted by comp_shift. If there are other
    * components left, dispatch a second write. In addition, take the write
    * mask into account to decide whether each write is actually needed.
    */
| const unsigned comp_shift = offset_in_dwords % 4; |
| const unsigned first_comps = MIN2(comps, 4 - comp_shift); |
| const unsigned second_comps = comps - first_comps; |
| const unsigned first_mask = (mask << comp_shift) & 0xF; |
| const unsigned second_mask = (mask >> (4 - comp_shift)) & 0xF; |
| |
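   /* For example, with comps == 3 and offset_in_dwords % 4 == 3, the first
    * write covers component 0 in the last dword of one vec4 and the second
    * write covers components 1-2 at the start of the next vec4.
    */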
| unsigned urb_global_offset = offset_in_dwords / 4; |
| adjust_handle_and_offset(bld, urb_handle, urb_global_offset); |
| |
| if (first_mask > 0) { |
| for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) { |
| fs_builder bld8 = bld.group(8, q); |
| |
| fs_reg payload_srcs[4]; |
| unsigned length = 0; |
| |
| for (unsigned i = 0; i < comp_shift; i++) |
| payload_srcs[length++] = reg_undef; |
| |
| for (unsigned c = 0; c < first_comps; c++) |
| payload_srcs[length++] = quarter(offset(src, bld, c), q); |
| |
| fs_reg srcs[URB_LOGICAL_NUM_SRCS]; |
| srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; |
| srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(first_mask << 16); |
| srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length), |
| BRW_REGISTER_TYPE_F); |
| bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0); |
| |
| fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, |
| reg_undef, srcs, ARRAY_SIZE(srcs)); |
| inst->mlen = 2 + length; |
| inst->offset = urb_global_offset; |
| assert(inst->offset < 2048); |
| } |
| } |
| |
| if (second_mask > 0) { |
| urb_global_offset++; |
| adjust_handle_and_offset(bld, urb_handle, urb_global_offset); |
| |
| for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) { |
| fs_builder bld8 = bld.group(8, q); |
| |
| fs_reg payload_srcs[4]; |
| unsigned length = 0; |
| |
| for (unsigned c = 0; c < second_comps; c++) |
| payload_srcs[length++] = quarter(offset(src, bld, c + first_comps), q); |
| |
| fs_reg srcs[URB_LOGICAL_NUM_SRCS]; |
| srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; |
| srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(second_mask << 16); |
| srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length), |
| BRW_REGISTER_TYPE_F); |
| bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0); |
| |
| fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, |
| reg_undef, srcs, ARRAY_SIZE(srcs)); |
| inst->mlen = 2 + length; |
| inst->offset = urb_global_offset; |
| assert(inst->offset < 2048); |
| } |
| } |
| } |
| |
| static void |
| emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr, |
| const fs_reg &src, const fs_reg &offset_src) |
| { |
| assert(nir_src_bit_size(instr->src[0]) == 32); |
| |
| const unsigned comps = nir_src_num_components(instr->src[0]); |
| assert(comps <= 4); |
| |
| fs_reg urb_handle = get_mesh_urb_handle(bld, instr->intrinsic); |
| |
| const unsigned base_in_dwords = nir_intrinsic_base(instr) + |
| component_from_intrinsic(instr); |
| |
   /* Use the URB write message that allows different offsets per slot. The
    * offset is in units of vec4s (128 bits), so we use one write per
    * component, replicating it in the sources and applying the appropriate
    * mask based on the dword offset.
    */
| |
| for (unsigned c = 0; c < comps; c++) { |
| if (((1 << c) & nir_intrinsic_write_mask(instr)) == 0) |
| continue; |
| |
| fs_reg src_comp = offset(src, bld, c); |
| |
| for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) { |
| fs_builder bld8 = bld.group(8, q); |
| |
| fs_reg off = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1); |
| bld8.MOV(off, quarter(offset_src, q)); |
| bld8.ADD(off, off, brw_imm_ud(c + base_in_dwords)); |
| |
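         /* Build the channel mask from the dword-within-vec4 index,
          * 1 << (off & 3), shifted into bits 16..19 as the message expects,
          * then convert the dword offset into a vec4 slot offset.
          */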
| fs_reg mask = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1); |
| bld8.AND(mask, off, brw_imm_ud(0x3)); |
| |
| fs_reg one = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1); |
| bld8.MOV(one, brw_imm_ud(1)); |
| bld8.SHL(mask, one, mask); |
| bld8.SHL(mask, mask, brw_imm_ud(16)); |
| |
| bld8.SHR(off, off, brw_imm_ud(2)); |
| |
| fs_reg payload_srcs[4]; |
| unsigned length = 0; |
| |
| for (unsigned j = 0; j < 4; j++) |
| payload_srcs[length++] = quarter(src_comp, q); |
| |
| fs_reg srcs[URB_LOGICAL_NUM_SRCS]; |
| srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; |
| srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off; |
| srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask; |
| srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length), |
| BRW_REGISTER_TYPE_F); |
| bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0); |
| |
| fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, |
| reg_undef, srcs, ARRAY_SIZE(srcs)); |
| inst->mlen = 3 + length; |
| inst->offset = 0; |
| } |
| } |
| } |
| |
| static void |
| emit_urb_direct_reads(const fs_builder &bld, nir_intrinsic_instr *instr, |
| const fs_reg &dest) |
| { |
| assert(nir_dest_bit_size(instr->dest) == 32); |
| |
| unsigned comps = nir_dest_num_components(instr->dest); |
| if (comps == 0) |
| return; |
| |
| nir_src *offset_nir_src = nir_get_io_offset_src(instr); |
| assert(nir_src_is_const(*offset_nir_src)); |
| |
| fs_reg urb_handle = get_mesh_urb_handle(bld, instr->intrinsic); |
| |
| const unsigned offset_in_dwords = nir_intrinsic_base(instr) + |
| nir_src_as_uint(*offset_nir_src) + |
| component_from_intrinsic(instr); |
| |
| unsigned urb_global_offset = offset_in_dwords / 4; |
| adjust_handle_and_offset(bld, urb_handle, urb_global_offset); |
| |
| const unsigned comp_offset = offset_in_dwords % 4; |
| const unsigned num_regs = comp_offset + comps; |
| |
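   /* The read returns whole vec4 slots, so also fetch the dwords that
    * precede the first component we need within its slot.
    */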
| fs_builder ubld8 = bld.group(8, 0).exec_all(); |
| fs_reg data = ubld8.vgrf(BRW_REGISTER_TYPE_UD, num_regs); |
| fs_reg srcs[URB_LOGICAL_NUM_SRCS]; |
| srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; |
| |
| fs_inst *inst = ubld8.emit(SHADER_OPCODE_URB_READ_LOGICAL, data, |
| srcs, ARRAY_SIZE(srcs)); |
| inst->mlen = 1; |
| inst->offset = urb_global_offset; |
| assert(inst->offset < 2048); |
| inst->size_written = num_regs * REG_SIZE; |
| |
| for (unsigned c = 0; c < comps; c++) { |
| fs_reg dest_comp = offset(dest, bld, c); |
| fs_reg data_comp = horiz_stride(offset(data, ubld8, comp_offset + c), 0); |
| bld.MOV(retype(dest_comp, BRW_REGISTER_TYPE_UD), data_comp); |
| } |
| } |
| |
| static void |
| emit_urb_indirect_reads(const fs_builder &bld, nir_intrinsic_instr *instr, |
| const fs_reg &dest, const fs_reg &offset_src) |
| { |
| assert(nir_dest_bit_size(instr->dest) == 32); |
| |
| unsigned comps = nir_dest_num_components(instr->dest); |
| if (comps == 0) |
| return; |
| |
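   /* Build, once, the per-channel byte sequence 0, 4, 8, ..., 28, used
    * below to pick the right dword out of each 4-register read.
    */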
| fs_reg seq_ud; |
| { |
| fs_builder ubld8 = bld.group(8, 0).exec_all(); |
| seq_ud = ubld8.vgrf(BRW_REGISTER_TYPE_UD, 1); |
| fs_reg seq_uw = ubld8.vgrf(BRW_REGISTER_TYPE_UW, 1); |
| ubld8.MOV(seq_uw, fs_reg(brw_imm_v(0x76543210))); |
| ubld8.MOV(seq_ud, seq_uw); |
| ubld8.SHL(seq_ud, seq_ud, brw_imm_ud(2)); |
| } |
| |
| fs_reg urb_handle = get_mesh_urb_handle(bld, instr->intrinsic); |
| |
| const unsigned base_in_dwords = nir_intrinsic_base(instr) + |
| component_from_intrinsic(instr); |
| |
| for (unsigned c = 0; c < comps; c++) { |
| for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) { |
| fs_builder bld8 = bld.group(8, q); |
| |
| fs_reg off = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1); |
| bld8.MOV(off, quarter(offset_src, q)); |
| bld8.ADD(off, off, brw_imm_ud(base_in_dwords + c)); |
| |
| STATIC_ASSERT(IS_POT(REG_SIZE) && REG_SIZE > 1); |
| |
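         /* comp = (off & 3) * REG_SIZE + seq_ud: a per-channel byte offset
          * into the 4-register read result selecting the wanted dword.
          */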
| fs_reg comp = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1); |
| bld8.AND(comp, off, brw_imm_ud(0x3)); |
| bld8.SHL(comp, comp, brw_imm_ud(ffs(REG_SIZE) - 1)); |
| bld8.ADD(comp, comp, seq_ud); |
| |
| bld8.SHR(off, off, brw_imm_ud(2)); |
| |
| fs_reg srcs[URB_LOGICAL_NUM_SRCS]; |
| srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle; |
| srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off; |
| |
| fs_reg data = bld8.vgrf(BRW_REGISTER_TYPE_UD, 4); |
| |
| fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_READ_LOGICAL, |
| data, srcs, ARRAY_SIZE(srcs)); |
| inst->mlen = 2; |
| inst->offset = 0; |
| inst->size_written = 4 * REG_SIZE; |
| |
| fs_reg dest_comp = offset(dest, bld, c); |
| bld8.emit(SHADER_OPCODE_MOV_INDIRECT, |
| retype(quarter(dest_comp, q), BRW_REGISTER_TYPE_UD), |
| data, |
| comp, |
| brw_imm_ud(4)); |
| } |
| } |
| } |
| |
| void |
| fs_visitor::emit_task_mesh_store(const fs_builder &bld, nir_intrinsic_instr *instr) |
| { |
| fs_reg src = get_nir_src(instr->src[0]); |
| nir_src *offset_nir_src = nir_get_io_offset_src(instr); |
| |
   /* TODO(mesh): for per_vertex and per_primitive, if we could keep around
    * the non-array-index offset, we could use it to decide whether we can
    * perform either one or (at most) two writes instead of one per
    * component.
    */
| |
| if (nir_src_is_const(*offset_nir_src)) |
| emit_urb_direct_writes(bld, instr, src); |
| else |
| emit_urb_indirect_writes(bld, instr, src, get_nir_src(*offset_nir_src)); |
| } |
| |
| void |
| fs_visitor::emit_task_mesh_load(const fs_builder &bld, nir_intrinsic_instr *instr) |
| { |
| fs_reg dest = get_nir_dest(instr->dest); |
| nir_src *offset_nir_src = nir_get_io_offset_src(instr); |
| |
   /* TODO(mesh): for per_vertex and per_primitive, if we could keep around
    * the non-array-index offset, we could use it to decide whether we can
    * perform a single large aligned read instead of one per component.
    */
| |
| if (nir_src_is_const(*offset_nir_src)) |
| emit_urb_direct_reads(bld, instr, dest); |
| else |
| emit_urb_indirect_reads(bld, instr, dest, get_nir_src(*offset_nir_src)); |
| } |
| |
| void |
| fs_visitor::nir_emit_task_intrinsic(const fs_builder &bld, |
| nir_intrinsic_instr *instr) |
| { |
| assert(stage == MESA_SHADER_TASK); |
| |
| switch (instr->intrinsic) { |
| case nir_intrinsic_store_output: |
| case nir_intrinsic_store_task_payload: |
| emit_task_mesh_store(bld, instr); |
| break; |
| |
| case nir_intrinsic_load_output: |
| case nir_intrinsic_load_task_payload: |
| emit_task_mesh_load(bld, instr); |
| break; |
| |
| default: |
| nir_emit_task_mesh_intrinsic(bld, instr); |
| break; |
| } |
| } |
| |
| void |
| fs_visitor::nir_emit_mesh_intrinsic(const fs_builder &bld, |
| nir_intrinsic_instr *instr) |
| { |
| assert(stage == MESA_SHADER_MESH); |
| |
| switch (instr->intrinsic) { |
| case nir_intrinsic_store_per_primitive_output: |
| case nir_intrinsic_store_per_vertex_output: |
| case nir_intrinsic_store_output: |
| emit_task_mesh_store(bld, instr); |
| break; |
| |
| case nir_intrinsic_load_per_vertex_output: |
| case nir_intrinsic_load_per_primitive_output: |
| case nir_intrinsic_load_output: |
| case nir_intrinsic_load_task_payload: |
| emit_task_mesh_load(bld, instr); |
| break; |
| |
| default: |
| nir_emit_task_mesh_intrinsic(bld, instr); |
| break; |
| } |
| } |
| |
| void |
| fs_visitor::nir_emit_task_mesh_intrinsic(const fs_builder &bld, |
| nir_intrinsic_instr *instr) |
| { |
| assert(stage == MESA_SHADER_MESH || stage == MESA_SHADER_TASK); |
| |
| fs_reg dest; |
| if (nir_intrinsic_infos[instr->intrinsic].has_dest) |
| dest = get_nir_dest(instr->dest); |
| |
| switch (instr->intrinsic) { |
| case nir_intrinsic_load_mesh_inline_data_intel: |
| assert(payload.num_regs == 3 || payload.num_regs == 4); |
| /* Inline Parameter is the last element of the payload. */ |
| bld.MOV(dest, retype(brw_vec1_grf(payload.num_regs - 1, |
| nir_intrinsic_align_offset(instr)), |
| dest.type)); |
| break; |
| |
| case nir_intrinsic_load_draw_id: |
| /* DrawID comes from Extended Parameter 0 (XP0). */ |
| bld.MOV(dest, brw_vec1_grf(0, 3)); |
| break; |
| |
| case nir_intrinsic_load_local_invocation_index: |
| case nir_intrinsic_load_local_invocation_id: |
| /* Local_ID.X is given by the HW in the shader payload. */ |
| dest = retype(dest, BRW_REGISTER_TYPE_UD); |
| bld.MOV(dest, retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW)); |
| /* Task/Mesh only use one dimension. */ |
| if (instr->intrinsic == nir_intrinsic_load_local_invocation_id) { |
| bld.MOV(offset(dest, bld, 1), brw_imm_uw(0)); |
| bld.MOV(offset(dest, bld, 2), brw_imm_uw(0)); |
| } |
| break; |
| |
| default: |
| nir_emit_cs_intrinsic(bld, instr); |
| break; |
| } |
| } |